{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8055493398970687, "eval_steps": 500, "global_step": 1800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 354.1875, "completions/mean_terminated_length": 354.1875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.00044752741105392703, "frac_reward_zero_std": 0.0, "grad_norm": 0.05193624645471573, "learning_rate": 0.0, "loss": -0.0477, "num_tokens": 136396.0, "reward": 2.5913524627685547, "reward_std": 2.2160112857818604, "rewards/accuracy_reward/mean": 2.125, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.46635231375694275, "rewards/ngram_similarity_reward/std": 0.33532705903053284, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 896.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 553.640625, "completions/mean_terminated_length": 553.640625, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.0008950548221078541, "frac_reward_zero_std": 0.0, "grad_norm": 0.038587283343076706, "learning_rate": 2.2321428571428572e-08, "loss": -0.0007, "num_tokens": 294677.0, "reward": 1.472002387046814, "reward_std": 1.188935399055481, "rewards/accuracy_reward/mean": 0.984375, "rewards/accuracy_reward/std": 2.6306629180908203, "rewards/ngram_similarity_reward/mean": 0.4876273274421692, "rewards/ngram_similarity_reward/std": 0.3976302742958069, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 555.84375, "completions/mean_terminated_length": 555.84375, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.0013425822331617813, "frac_reward_zero_std": 0.0, "grad_norm": 0.03477500006556511, "learning_rate": 4.4642857142857145e-08, "loss": -0.0399, "num_tokens": 432059.0, "reward": 3.8383781909942627, "reward_std": 2.0193111896514893, "rewards/accuracy_reward/mean": 3.453125, "rewards/accuracy_reward/std": 3.0728185176849365, "rewards/ngram_similarity_reward/mean": 0.38525325059890747, "rewards/ngram_similarity_reward/std": 0.3316202759742737, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 501.40625, "completions/mean_terminated_length": 501.40625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.0017901096442157081, "frac_reward_zero_std": 0.0, "grad_norm": 0.04869001358747482, "learning_rate": 6.696428571428571e-08, "loss": -0.0012, "num_tokens": 574293.0, "reward": 0.7406258583068848, "reward_std": 2.232048511505127, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 2.312781572341919, "rewards/ngram_similarity_reward/mean": 0.25625085830688477, "rewards/ngram_similarity_reward/std": 0.1979217827320099, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 415.828125, "completions/mean_terminated_length": 415.828125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.0022376370552696354, "frac_reward_zero_std": 0.0, "grad_norm": 0.048768848180770874, "learning_rate": 8.928571428571429e-08, "loss": -0.0033, "num_tokens": 763018.0, "reward": 1.6235284805297852, "reward_std": 0.8000533580780029, "rewards/accuracy_reward/mean": 1.25, "rewards/accuracy_reward/std": 2.7888667583465576, "rewards/ngram_similarity_reward/mean": 0.3735284209251404, "rewards/ngram_similarity_reward/std": 0.3147267699241638, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 417.78125, "completions/mean_terminated_length": 417.78125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.0026851644663235625, "frac_reward_zero_std": 0.0, "grad_norm": 0.0469495914876461, "learning_rate": 1.1160714285714287e-07, "loss": 0.0593, "num_tokens": 914380.0, "reward": 4.5329790115356445, "reward_std": 0.535412609577179, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.6267289519309998, "rewards/ngram_similarity_reward/std": 0.3272629976272583, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 458.0, "completions/mean_terminated_length": 458.0, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.003132691877377489, "frac_reward_zero_std": 0.0, "grad_norm": 0.04102824255824089, "learning_rate": 1.3392857142857142e-07, "loss": -0.0378, "num_tokens": 1121724.0, "reward": 1.5913887023925781, "reward_std": 1.4958816766738892, "rewards/accuracy_reward/mean": 1.203125, "rewards/accuracy_reward/std": 3.0533857345581055, "rewards/ngram_similarity_reward/mean": 0.3882637619972229, "rewards/ngram_similarity_reward/std": 0.3548280894756317, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 566.21875, "completions/mean_terminated_length": 566.21875, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.0035802192884314163, "frac_reward_zero_std": 0.0, "grad_norm": 0.035449933260679245, "learning_rate": 1.5625e-07, "loss": -0.0124, "num_tokens": 1278298.0, "reward": 6.122851371765137, "reward_std": 0.2047199010848999, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.6228512525558472, "rewards/ngram_similarity_reward/std": 0.23033379018306732, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 504.25, "completions/mean_terminated_length": 504.25, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.004027746699485343, "frac_reward_zero_std": 0.0, "grad_norm": 0.04072337597608566, "learning_rate": 1.7857142857142858e-07, "loss": 0.026, "num_tokens": 1476954.0, "reward": 5.461928844451904, "reward_std": 1.5869636535644531, "rewards/accuracy_reward/mean": 4.90625, "rewards/accuracy_reward/std": 1.8663159608840942, "rewards/ngram_similarity_reward/mean": 0.5556788444519043, "rewards/ngram_similarity_reward/std": 0.33379217982292175, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 373.90625, "completions/mean_terminated_length": 373.90625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.004475274110539271, "frac_reward_zero_std": 0.0, "grad_norm": 0.05392281338572502, "learning_rate": 2.0089285714285717e-07, "loss": -0.0074, "num_tokens": 1613524.0, "reward": 1.703417181968689, "reward_std": 0.7789101600646973, "rewards/accuracy_reward/mean": 1.359375, "rewards/accuracy_reward/std": 2.816432476043701, "rewards/ngram_similarity_reward/mean": 0.3440423011779785, "rewards/ngram_similarity_reward/std": 0.31959813833236694, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 379.625, "completions/mean_terminated_length": 379.625, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.0049228015215931975, "frac_reward_zero_std": 0.0, "grad_norm": 0.050036825239658356, "learning_rate": 2.2321428571428574e-07, "loss": 0.0064, "num_tokens": 1747084.0, "reward": 3.507603168487549, "reward_std": 1.5738906860351562, "rewards/accuracy_reward/mean": 2.875, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.6326034069061279, "rewards/ngram_similarity_reward/std": 0.43446189165115356, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 924.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 574.3125, "completions/mean_terminated_length": 574.3125, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.005370328932647125, "frac_reward_zero_std": 0.0, "grad_norm": 0.0350542776286602, "learning_rate": 2.455357142857143e-07, "loss": 0.0035, "num_tokens": 1903600.0, "reward": 2.881260871887207, "reward_std": 0.5595024228096008, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.47501087188720703, "rewards/ngram_similarity_reward/std": 0.3289196491241455, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 890.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 590.140625, "completions/mean_terminated_length": 590.140625, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.005817856343701052, "frac_reward_zero_std": 0.0, "grad_norm": 0.040751758962869644, "learning_rate": 2.6785714285714284e-07, "loss": 0.009, "num_tokens": 2084681.0, "reward": 2.4814798831939697, "reward_std": 0.8326557874679565, "rewards/accuracy_reward/mean": 2.03125, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.45022982358932495, "rewards/ngram_similarity_reward/std": 0.3732970952987671, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 497.515625, "completions/mean_terminated_length": 497.515625, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.006265383754754978, "frac_reward_zero_std": 0.0, "grad_norm": 0.04194331914186478, "learning_rate": 2.901785714285715e-07, "loss": 0.0094, "num_tokens": 2263066.0, "reward": 5.9365949630737305, "reward_std": 0.4958219528198242, "rewards/accuracy_reward/mean": 5.40625, "rewards/accuracy_reward/std": 0.7500000596046448, "rewards/ngram_similarity_reward/mean": 0.53034508228302, "rewards/ngram_similarity_reward/std": 0.1774548590183258, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 525.234375, "completions/mean_terminated_length": 525.234375, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.006712911165808906, "frac_reward_zero_std": 0.0, "grad_norm": 0.04093608260154724, "learning_rate": 3.125e-07, "loss": -0.0241, "num_tokens": 2399881.0, "reward": 4.266244888305664, "reward_std": 0.7034568786621094, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.5474951267242432, "rewards/ngram_similarity_reward/std": 0.23368707299232483, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 459.015625, "completions/mean_terminated_length": 459.015625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.0071604385768628325, "frac_reward_zero_std": 0.0, "grad_norm": 0.05097994580864906, "learning_rate": 3.3482142857142856e-07, "loss": 0.0236, "num_tokens": 2542938.0, "reward": 4.596446990966797, "reward_std": 0.706952691078186, "rewards/accuracy_reward/mean": 4.109375, "rewards/accuracy_reward/std": 2.354443311691284, "rewards/ngram_similarity_reward/mean": 0.4870717227458954, "rewards/ngram_similarity_reward/std": 0.3474036753177643, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 448.453125, "completions/mean_terminated_length": 448.453125, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.00760796598791676, "frac_reward_zero_std": 0.0, "grad_norm": 0.04329349100589752, "learning_rate": 3.5714285714285716e-07, "loss": 0.0215, "num_tokens": 2690647.0, "reward": 3.7567265033721924, "reward_std": 1.6267927885055542, "rewards/accuracy_reward/mean": 3.421875, "rewards/accuracy_reward/std": 2.896657705307007, "rewards/ngram_similarity_reward/mean": 0.3348517119884491, "rewards/ngram_similarity_reward/std": 0.31598660349845886, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 542.546875, "completions/mean_terminated_length": 542.546875, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.008055493398970687, "frac_reward_zero_std": 0.0, "grad_norm": 0.03838830068707466, "learning_rate": 3.794642857142857e-07, "loss": 0.0247, "num_tokens": 2852970.0, "reward": 2.5155956745147705, "reward_std": 1.9023000001907349, "rewards/accuracy_reward/mean": 2.203125, "rewards/accuracy_reward/std": 3.1354587078094482, "rewards/ngram_similarity_reward/mean": 0.31247082352638245, "rewards/ngram_similarity_reward/std": 0.21052129566669464, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 558.28125, "completions/mean_terminated_length": 558.28125, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.008503020810024613, "frac_reward_zero_std": 0.0, "grad_norm": 0.040122196078300476, "learning_rate": 4.0178571428571434e-07, "loss": -0.0191, "num_tokens": 2975660.0, "reward": 5.218594074249268, "reward_std": 0.8617122173309326, "rewards/accuracy_reward/mean": 4.84375, "rewards/accuracy_reward/std": 1.8874586820602417, "rewards/ngram_similarity_reward/mean": 0.3748440444469452, "rewards/ngram_similarity_reward/std": 0.21162128448486328, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 431.421875, "completions/mean_terminated_length": 431.421875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.008950548221078542, "frac_reward_zero_std": 0.0, "grad_norm": 0.05076023191213608, "learning_rate": 4.2410714285714293e-07, "loss": 0.0282, "num_tokens": 3114839.0, "reward": 4.0693511962890625, "reward_std": 1.58225417137146, "rewards/accuracy_reward/mean": 3.3125, "rewards/accuracy_reward/std": 2.948634386062622, "rewards/ngram_similarity_reward/mean": 0.756851077079773, "rewards/ngram_similarity_reward/std": 0.36218729615211487, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 430.359375, "completions/mean_terminated_length": 430.359375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.009398075632132468, "frac_reward_zero_std": 0.0, "grad_norm": 0.04346788302063942, "learning_rate": 4.4642857142857147e-07, "loss": -0.0295, "num_tokens": 3248126.0, "reward": 3.656329393386841, "reward_std": 1.5899829864501953, "rewards/accuracy_reward/mean": 3.140625, "rewards/accuracy_reward/std": 2.9727182388305664, "rewards/ngram_similarity_reward/mean": 0.5157045125961304, "rewards/ngram_similarity_reward/std": 0.39607036113739014, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 426.421875, "completions/mean_terminated_length": 426.421875, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.009845603043186395, "frac_reward_zero_std": 0.0, "grad_norm": 0.04499637708067894, "learning_rate": 4.6875000000000006e-07, "loss": -0.0182, "num_tokens": 3437609.0, "reward": 2.1935207843780518, "reward_std": 2.1468324661254883, "rewards/accuracy_reward/mean": 1.875, "rewards/accuracy_reward/std": 3.0315799713134766, "rewards/ngram_similarity_reward/mean": 0.3185208737850189, "rewards/ngram_similarity_reward/std": 0.2438775599002838, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 482.25, "completions/mean_terminated_length": 482.25, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.010293130454240322, "frac_reward_zero_std": 0.0, "grad_norm": 0.0404851995408535, "learning_rate": 4.910714285714286e-07, "loss": 0.0027, "num_tokens": 3609705.0, "reward": 5.2805070877075195, "reward_std": 1.4755187034606934, "rewards/accuracy_reward/mean": 4.734375, "rewards/accuracy_reward/std": 2.04506516456604, "rewards/ngram_similarity_reward/mean": 0.5461318492889404, "rewards/ngram_similarity_reward/std": 0.40060216188430786, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 954.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 496.84375, "completions/mean_terminated_length": 496.84375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.01074065786529425, "frac_reward_zero_std": 0.0, "grad_norm": 0.05109725892543793, "learning_rate": 5.133928571428571e-07, "loss": 0.0107, "num_tokens": 3773583.0, "reward": 2.40609073638916, "reward_std": 0.0649171993136406, "rewards/accuracy_reward/mean": 2.0, "rewards/accuracy_reward/std": 3.5276684761047363, "rewards/ngram_similarity_reward/mean": 0.4060908854007721, "rewards/ngram_similarity_reward/std": 0.24911099672317505, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 437.015625, "completions/mean_terminated_length": 437.015625, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.011188185276348177, "frac_reward_zero_std": 0.0, "grad_norm": 0.049206919968128204, "learning_rate": 5.357142857142857e-07, "loss": 0.0513, "num_tokens": 3998208.0, "reward": 3.4450340270996094, "reward_std": 2.016695261001587, "rewards/accuracy_reward/mean": 2.96875, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.47628408670425415, "rewards/ngram_similarity_reward/std": 0.4277351498603821, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 863.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 540.015625, "completions/mean_terminated_length": 540.015625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.011635712687402103, "frac_reward_zero_std": 0.0, "grad_norm": 0.0415828563272953, "learning_rate": 5.580357142857143e-07, "loss": 0.022, "num_tokens": 4127169.0, "reward": 2.0314695835113525, "reward_std": 2.002382755279541, "rewards/accuracy_reward/mean": 1.625, "rewards/accuracy_reward/std": 2.9304099082946777, "rewards/ngram_similarity_reward/mean": 0.40646952390670776, "rewards/ngram_similarity_reward/std": 0.2828986942768097, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 374.234375, "completions/mean_terminated_length": 374.234375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.01208324009845603, "frac_reward_zero_std": 0.0, "grad_norm": 0.04801315814256668, "learning_rate": 5.80357142857143e-07, "loss": 0.0078, "num_tokens": 4280480.0, "reward": 1.0853207111358643, "reward_std": 0.8151639699935913, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 2.465988874435425, "rewards/ngram_similarity_reward/mean": 0.41344574093818665, "rewards/ngram_similarity_reward/std": 0.3735671937465668, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 401.21875, "completions/mean_terminated_length": 401.21875, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.012530767509509957, "frac_reward_zero_std": 0.0, "grad_norm": 0.05504123494029045, "learning_rate": 6.026785714285715e-07, "loss": 0.0103, "num_tokens": 4488494.0, "reward": 4.440656661987305, "reward_std": 2.232184410095215, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.6281571388244629, "rewards/ngram_similarity_reward/std": 0.4001638889312744, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 451.03125, "completions/mean_terminated_length": 451.03125, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.012978294920563885, "frac_reward_zero_std": 0.0, "grad_norm": 0.0563817173242569, "learning_rate": 6.25e-07, "loss": 0.0162, "num_tokens": 4694608.0, "reward": 2.5744268894195557, "reward_std": 1.4449257850646973, "rewards/accuracy_reward/mean": 2.25, "rewards/accuracy_reward/std": 3.0860671997070312, "rewards/ngram_similarity_reward/mean": 0.3244269788265228, "rewards/ngram_similarity_reward/std": 0.31681835651397705, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 712.0, "completions/max_terminated_length": 712.0, "completions/mean_length": 494.5625, "completions/mean_terminated_length": 494.5625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.013425822331617812, "frac_reward_zero_std": 0.0, "grad_norm": 0.04479817673563957, "learning_rate": 6.473214285714287e-07, "loss": -0.0052, "num_tokens": 4860052.0, "reward": 3.1864285469055176, "reward_std": 0.5244491100311279, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.5926785469055176, "rewards/ngram_similarity_reward/std": 0.21132132411003113, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 464.265625, "completions/mean_terminated_length": 464.265625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.013873349742671738, "frac_reward_zero_std": 0.0, "grad_norm": 0.04182331636548042, "learning_rate": 6.696428571428571e-07, "loss": 0.029, "num_tokens": 5026789.0, "reward": 2.2157487869262695, "reward_std": 1.5576858520507812, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 3.077979803085327, "rewards/ngram_similarity_reward/mean": 0.7626237869262695, "rewards/ngram_similarity_reward/std": 0.3413919508457184, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1115.0, "completions/max_terminated_length": 1115.0, "completions/mean_length": 603.671875, "completions/mean_terminated_length": 603.671875, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.014320877153725665, "frac_reward_zero_std": 0.0, "grad_norm": 0.03902869299054146, "learning_rate": 6.919642857142858e-07, "loss": 0.0338, "num_tokens": 5196720.0, "reward": 5.464887619018555, "reward_std": 1.6251399517059326, "rewards/accuracy_reward/mean": 4.984375, "rewards/accuracy_reward/std": 1.790558934211731, "rewards/ngram_similarity_reward/mean": 0.4805128276348114, "rewards/ngram_similarity_reward/std": 0.18633846938610077, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 440.859375, "completions/mean_terminated_length": 440.859375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.014768404564779593, "frac_reward_zero_std": 0.0, "grad_norm": 0.04514501616358757, "learning_rate": 7.142857142857143e-07, "loss": 0.0242, "num_tokens": 5358903.0, "reward": 2.4213662147521973, "reward_std": 1.0139738321304321, "rewards/accuracy_reward/mean": 2.203125, "rewards/accuracy_reward/std": 3.0272817611694336, "rewards/ngram_similarity_reward/mean": 0.21824108064174652, "rewards/ngram_similarity_reward/std": 0.17951112985610962, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 479.546875, "completions/mean_terminated_length": 479.546875, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.01521593197583352, "frac_reward_zero_std": 0.0, "grad_norm": 0.03740592673420906, "learning_rate": 7.36607142857143e-07, "loss": 0.0389, "num_tokens": 5491690.0, "reward": 5.158116340637207, "reward_std": 1.3710438013076782, "rewards/accuracy_reward/mean": 4.75, "rewards/accuracy_reward/std": 2.0, "rewards/ngram_similarity_reward/mean": 0.4081161618232727, "rewards/ngram_similarity_reward/std": 0.32177120447158813, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 602.734375, "completions/mean_terminated_length": 602.734375, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.015663459386887447, "frac_reward_zero_std": 0.0, "grad_norm": 0.03585360199213028, "learning_rate": 7.589285714285714e-07, "loss": 0.0072, "num_tokens": 5624489.0, "reward": 5.171844482421875, "reward_std": 1.2208667993545532, "rewards/accuracy_reward/mean": 4.640625, "rewards/accuracy_reward/std": 2.1445181369781494, "rewards/ngram_similarity_reward/mean": 0.5312194228172302, "rewards/ngram_similarity_reward/std": 0.28985437750816345, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 383.734375, "completions/mean_terminated_length": 383.734375, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.016110986797941373, "frac_reward_zero_std": 0.0, "grad_norm": 0.054027266800403595, "learning_rate": 7.8125e-07, "loss": 0.002, "num_tokens": 5789336.0, "reward": 3.743284225463867, "reward_std": 1.6936142444610596, "rewards/accuracy_reward/mean": 3.21875, "rewards/accuracy_reward/std": 2.9732606410980225, "rewards/ngram_similarity_reward/mean": 0.5245344042778015, "rewards/ngram_similarity_reward/std": 0.34867286682128906, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 896.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 456.9375, "completions/mean_terminated_length": 456.9375, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.0165585142089953, "frac_reward_zero_std": 0.0, "grad_norm": 0.04130946472287178, "learning_rate": 8.035714285714287e-07, "loss": -0.0255, "num_tokens": 5944020.0, "reward": 3.0221548080444336, "reward_std": 0.6041401624679565, "rewards/accuracy_reward/mean": 2.375, "rewards/accuracy_reward/std": 3.057647228240967, "rewards/ngram_similarity_reward/mean": 0.6471550464630127, "rewards/ngram_similarity_reward/std": 0.3582594692707062, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 543.265625, "completions/mean_terminated_length": 543.265625, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.017006041620049227, "frac_reward_zero_std": 0.0, "grad_norm": 0.039212699979543686, "learning_rate": 8.258928571428572e-07, "loss": -0.0365, "num_tokens": 6109381.0, "reward": 3.2279555797576904, "reward_std": 1.5816094875335693, "rewards/accuracy_reward/mean": 2.828125, "rewards/accuracy_reward/std": 3.167567253112793, "rewards/ngram_similarity_reward/mean": 0.3998306691646576, "rewards/ngram_similarity_reward/std": 0.2724950611591339, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1093.0, "completions/max_terminated_length": 1093.0, "completions/mean_length": 486.671875, "completions/mean_terminated_length": 486.671875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.017453569031103153, "frac_reward_zero_std": 0.0, "grad_norm": 0.04461957886815071, "learning_rate": 8.482142857142859e-07, "loss": -0.0291, "num_tokens": 6312912.0, "reward": 2.9794769287109375, "reward_std": 1.6418548822402954, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.2353148460388184, "rewards/ngram_similarity_reward/mean": 0.3857269585132599, "rewards/ngram_similarity_reward/std": 0.2621348798274994, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 839.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 600.125, "completions/mean_terminated_length": 600.125, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.017901096442157084, "frac_reward_zero_std": 0.0, "grad_norm": 0.03619004786014557, "learning_rate": 8.705357142857143e-07, "loss": 0.0238, "num_tokens": 6480600.0, "reward": 3.216808557510376, "reward_std": 1.4437744617462158, "rewards/accuracy_reward/mean": 2.734375, "rewards/accuracy_reward/std": 3.0692648887634277, "rewards/ngram_similarity_reward/mean": 0.4824334979057312, "rewards/ngram_similarity_reward/std": 0.3833947777748108, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 459.5, "completions/mean_terminated_length": 459.5, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.01834862385321101, "frac_reward_zero_std": 0.0, "grad_norm": 0.04745332896709442, "learning_rate": 8.928571428571429e-07, "loss": 0.0291, "num_tokens": 6705192.0, "reward": 3.351127862930298, "reward_std": 0.8508948087692261, "rewards/accuracy_reward/mean": 2.859375, "rewards/accuracy_reward/std": 3.0203921794891357, "rewards/ngram_similarity_reward/mean": 0.4917528033256531, "rewards/ngram_similarity_reward/std": 0.4018055200576782, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 487.71875, "completions/mean_terminated_length": 487.71875, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.018796151264264937, "frac_reward_zero_std": 0.0, "grad_norm": 0.03918340429663658, "learning_rate": 9.151785714285715e-07, "loss": -0.0052, "num_tokens": 6857878.0, "reward": 5.294514179229736, "reward_std": 1.5053304433822632, "rewards/accuracy_reward/mean": 4.9375, "rewards/accuracy_reward/std": 1.7627090215682983, "rewards/ngram_similarity_reward/mean": 0.3570142984390259, "rewards/ngram_similarity_reward/std": 0.3143937885761261, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 802.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 504.78125, "completions/mean_terminated_length": 504.78125, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.019243678675318864, "frac_reward_zero_std": 0.0, "grad_norm": 0.04142072796821594, "learning_rate": 9.375000000000001e-07, "loss": -0.0023, "num_tokens": 7040488.0, "reward": 2.6512510776519775, "reward_std": 2.6500251293182373, "rewards/accuracy_reward/mean": 2.109375, "rewards/accuracy_reward/std": 3.0164480209350586, "rewards/ngram_similarity_reward/mean": 0.5418761968612671, "rewards/ngram_similarity_reward/std": 0.3100045323371887, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 548.046875, "completions/mean_terminated_length": 548.046875, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.01969120608637279, "frac_reward_zero_std": 0.0, "grad_norm": 0.038975950330495834, "learning_rate": 9.598214285714287e-07, "loss": -0.0069, "num_tokens": 7197131.0, "reward": 5.58192253112793, "reward_std": 1.3043636083602905, "rewards/accuracy_reward/mean": 4.9375, "rewards/accuracy_reward/std": 1.7627090215682983, "rewards/ngram_similarity_reward/mean": 0.6444226503372192, "rewards/ngram_similarity_reward/std": 0.36534932255744934, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 483.625, "completions/mean_terminated_length": 483.625, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.020138733497426717, "frac_reward_zero_std": 0.0, "grad_norm": 0.04290686175227165, "learning_rate": 9.821428571428572e-07, "loss": 0.0074, "num_tokens": 7329955.0, "reward": 0.9148364067077637, "reward_std": 2.1159703731536865, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 2.607795238494873, "rewards/ngram_similarity_reward/mean": 0.25858643651008606, "rewards/ngram_similarity_reward/std": 0.15243861079216003, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 470.265625, "completions/mean_terminated_length": 470.265625, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.020586260908480643, "frac_reward_zero_std": 0.0, "grad_norm": 0.043191708624362946, "learning_rate": 1.0044642857142857e-06, "loss": 0.0283, "num_tokens": 7490068.0, "reward": 3.445145606994629, "reward_std": 2.0403270721435547, "rewards/accuracy_reward/mean": 3.03125, "rewards/accuracy_reward/std": 3.0130341053009033, "rewards/ngram_similarity_reward/mean": 0.4138953983783722, "rewards/ngram_similarity_reward/std": 0.31189674139022827, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 516.578125, "completions/mean_terminated_length": 516.578125, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.02103378831953457, "frac_reward_zero_std": 0.0, "grad_norm": 0.03917146101593971, "learning_rate": 1.0267857142857143e-06, "loss": 0.0058, "num_tokens": 7677833.0, "reward": 3.6610493659973145, "reward_std": 0.9401412606239319, "rewards/accuracy_reward/mean": 2.96875, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.6922991275787354, "rewards/ngram_similarity_reward/std": 0.3831275999546051, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 425.296875, "completions/mean_terminated_length": 425.296875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.0214813157305885, "frac_reward_zero_std": 0.0, "grad_norm": 0.05108068510890007, "learning_rate": 1.049107142857143e-06, "loss": -0.0085, "num_tokens": 7783212.0, "reward": 3.6683921813964844, "reward_std": 0.9822139739990234, "rewards/accuracy_reward/mean": 3.234375, "rewards/accuracy_reward/std": 2.950610399246216, "rewards/ngram_similarity_reward/mean": 0.43401747941970825, "rewards/ngram_similarity_reward/std": 0.2134656310081482, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 446.359375, "completions/mean_terminated_length": 446.359375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.021928843141642427, "frac_reward_zero_std": 0.0, "grad_norm": 0.04882439225912094, "learning_rate": 1.0714285714285714e-06, "loss": -0.0035, "num_tokens": 7927283.0, "reward": 0.4699714779853821, "reward_std": 1.4633207321166992, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 1.9338643550872803, "rewards/ngram_similarity_reward/mean": 0.39184650778770447, "rewards/ngram_similarity_reward/std": 0.2500672936439514, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 523.703125, "completions/mean_terminated_length": 523.703125, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.022376370552696354, "frac_reward_zero_std": 0.0, "grad_norm": 0.050660137087106705, "learning_rate": 1.0937500000000001e-06, "loss": -0.0242, "num_tokens": 8075872.0, "reward": 4.185028076171875, "reward_std": 1.3079955577850342, "rewards/accuracy_reward/mean": 3.875, "rewards/accuracy_reward/std": 2.7284510135650635, "rewards/ngram_similarity_reward/mean": 0.31002795696258545, "rewards/ngram_similarity_reward/std": 0.29505354166030884, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 457.828125, "completions/mean_terminated_length": 457.828125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.02282389796375028, "frac_reward_zero_std": 0.0, "grad_norm": 0.04532083496451378, "learning_rate": 1.1160714285714287e-06, "loss": 0.0207, "num_tokens": 8224117.0, "reward": 3.2468814849853516, "reward_std": 1.64667809009552, "rewards/accuracy_reward/mean": 2.859375, "rewards/accuracy_reward/std": 3.0203921794891357, "rewards/ngram_similarity_reward/mean": 0.38750651478767395, "rewards/ngram_similarity_reward/std": 0.2459414303302765, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 874.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 538.1875, "completions/mean_terminated_length": 538.1875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.023271425374804207, "frac_reward_zero_std": 0.0, "grad_norm": 0.038736362010240555, "learning_rate": 1.1383928571428572e-06, "loss": -0.0134, "num_tokens": 8356273.0, "reward": 3.7373809814453125, "reward_std": 0.941716194152832, "rewards/accuracy_reward/mean": 3.3125, "rewards/accuracy_reward/std": 2.948634386062622, "rewards/ngram_similarity_reward/mean": 0.4248809814453125, "rewards/ngram_similarity_reward/std": 0.2687967121601105, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 492.953125, "completions/mean_terminated_length": 492.953125, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.023718952785858134, "frac_reward_zero_std": 0.0, "grad_norm": 0.03777806833386421, "learning_rate": 1.160714285714286e-06, "loss": -0.0303, "num_tokens": 8515358.0, "reward": 4.260929584503174, "reward_std": 1.2759349346160889, "rewards/accuracy_reward/mean": 3.65625, "rewards/accuracy_reward/std": 2.868652582168579, "rewards/ngram_similarity_reward/mean": 0.6046797037124634, "rewards/ngram_similarity_reward/std": 0.4571002721786499, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 935.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 542.90625, "completions/mean_terminated_length": 542.90625, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.02416648019691206, "frac_reward_zero_std": 0.0, "grad_norm": 0.037424709647893906, "learning_rate": 1.1830357142857143e-06, "loss": 0.0158, "num_tokens": 8662824.0, "reward": 3.1465628147125244, "reward_std": 1.4686698913574219, "rewards/accuracy_reward/mean": 2.734375, "rewards/accuracy_reward/std": 3.0692648887634277, "rewards/ngram_similarity_reward/mean": 0.41218769550323486, "rewards/ngram_similarity_reward/std": 0.20500494539737701, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 468.71875, "completions/mean_terminated_length": 468.71875, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.024614007607965987, "frac_reward_zero_std": 0.0, "grad_norm": 0.041173093020915985, "learning_rate": 1.205357142857143e-06, "loss": 0.0014, "num_tokens": 8831638.0, "reward": 4.659518241882324, "reward_std": 0.2648826539516449, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6595180034637451, "rewards/ngram_similarity_reward/std": 0.39609453082084656, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 944.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 557.984375, "completions/mean_terminated_length": 557.984375, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.025061535019019913, "frac_reward_zero_std": 0.0, "grad_norm": 0.04032100737094879, "learning_rate": 1.2276785714285716e-06, "loss": 0.0007, "num_tokens": 8978853.0, "reward": 3.653855323791504, "reward_std": 1.3852059841156006, "rewards/accuracy_reward/mean": 3.203125, "rewards/accuracy_reward/std": 2.995656728744507, "rewards/ngram_similarity_reward/mean": 0.4507303237915039, "rewards/ngram_similarity_reward/std": 0.30365705490112305, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 447.5625, "completions/mean_terminated_length": 447.5625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.025509062430073844, "frac_reward_zero_std": 0.0, "grad_norm": 0.04351978376507759, "learning_rate": 1.25e-06, "loss": 0.0547, "num_tokens": 9109401.0, "reward": 2.6265063285827637, "reward_std": 1.2239631414413452, "rewards/accuracy_reward/mean": 2.03125, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.5952565670013428, "rewards/ngram_similarity_reward/std": 0.3527531027793884, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 413.140625, "completions/mean_terminated_length": 413.140625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.02595658984112777, "frac_reward_zero_std": 0.25, "grad_norm": 0.03853273391723633, "learning_rate": 1.2723214285714286e-06, "loss": -0.0464, "num_tokens": 9255714.0, "reward": 2.893002510070801, "reward_std": 0.7541555166244507, "rewards/accuracy_reward/mean": 2.125, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.7680025100708008, "rewards/ngram_similarity_reward/std": 0.3234589397907257, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 604.34375, "completions/mean_terminated_length": 604.34375, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.026404117252181697, "frac_reward_zero_std": 0.0, "grad_norm": 0.03918275237083435, "learning_rate": 1.2946428571428574e-06, "loss": 0.0445, "num_tokens": 9396520.0, "reward": 2.26601505279541, "reward_std": 1.9519855976104736, "rewards/accuracy_reward/mean": 1.890625, "rewards/accuracy_reward/std": 3.0164480209350586, "rewards/ngram_similarity_reward/mean": 0.37539005279541016, "rewards/ngram_similarity_reward/std": 0.1937084197998047, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 444.015625, "completions/mean_terminated_length": 444.015625, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.026851644663235624, "frac_reward_zero_std": 0.0, "grad_norm": 0.044681839644908905, "learning_rate": 1.316964285714286e-06, "loss": -0.0, "num_tokens": 9521689.0, "reward": 5.670622825622559, "reward_std": 0.9016618132591248, "rewards/accuracy_reward/mean": 5.03125, "rewards/accuracy_reward/std": 1.6229382753372192, "rewards/ngram_similarity_reward/mean": 0.6393731236457825, "rewards/ngram_similarity_reward/std": 0.3744213879108429, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1087.0, "completions/max_terminated_length": 1087.0, "completions/mean_length": 503.84375, "completions/mean_terminated_length": 503.84375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.02729917207428955, "frac_reward_zero_std": 0.0, "grad_norm": 0.04541458934545517, "learning_rate": 1.3392857142857143e-06, "loss": 0.0333, "num_tokens": 9663695.0, "reward": 4.416263580322266, "reward_std": 0.6310166120529175, "rewards/accuracy_reward/mean": 3.890625, "rewards/accuracy_reward/std": 2.6998953819274902, "rewards/ngram_similarity_reward/mean": 0.5256385803222656, "rewards/ngram_similarity_reward/std": 0.3763166069984436, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1038.0, "completions/max_terminated_length": 1038.0, "completions/mean_length": 490.609375, "completions/mean_terminated_length": 490.609375, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.027746699485343477, "frac_reward_zero_std": 0.0, "grad_norm": 0.04283396899700165, "learning_rate": 1.3616071428571428e-06, "loss": 0.0117, "num_tokens": 9825190.0, "reward": 2.2028346061706543, "reward_std": 1.3296781778335571, "rewards/accuracy_reward/mean": 1.921875, "rewards/accuracy_reward/std": 2.9857051372528076, "rewards/ngram_similarity_reward/mean": 0.2809595465660095, "rewards/ngram_similarity_reward/std": 0.18962761759757996, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1120.0, "completions/max_terminated_length": 1120.0, "completions/mean_length": 557.828125, "completions/mean_terminated_length": 557.828125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.028194226896397404, "frac_reward_zero_std": 0.0, "grad_norm": 0.03895300626754761, "learning_rate": 1.3839285714285715e-06, "loss": 0.0294, "num_tokens": 9960459.0, "reward": 4.8852128982543945, "reward_std": 2.1436607837677, "rewards/accuracy_reward/mean": 4.53125, "rewards/accuracy_reward/std": 2.27455735206604, "rewards/ngram_similarity_reward/mean": 0.3539627194404602, "rewards/ngram_similarity_reward/std": 0.22679011523723602, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 471.390625, "completions/mean_terminated_length": 471.390625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.02864175430745133, "frac_reward_zero_std": 0.0, "grad_norm": 0.04387656971812248, "learning_rate": 1.40625e-06, "loss": 0.0381, "num_tokens": 10086228.0, "reward": 3.403301477432251, "reward_std": 1.543312430381775, "rewards/accuracy_reward/mean": 3.0625, "rewards/accuracy_reward/std": 2.9700891971588135, "rewards/ngram_similarity_reward/mean": 0.340801477432251, "rewards/ngram_similarity_reward/std": 0.1824437826871872, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 477.84375, "completions/mean_terminated_length": 477.84375, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.029089281718505257, "frac_reward_zero_std": 0.0, "grad_norm": 0.04593445733189583, "learning_rate": 1.4285714285714286e-06, "loss": 0.0418, "num_tokens": 10272906.0, "reward": 2.3133389949798584, "reward_std": 2.1918067932128906, "rewards/accuracy_reward/mean": 1.984375, "rewards/accuracy_reward/std": 3.03415584564209, "rewards/ngram_similarity_reward/mean": 0.328963965177536, "rewards/ngram_similarity_reward/std": 0.2558513283729553, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 445.28125, "completions/mean_terminated_length": 445.28125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.029536809129559187, "frac_reward_zero_std": 0.0, "grad_norm": 0.061927419155836105, "learning_rate": 1.4508928571428574e-06, "loss": -0.0232, "num_tokens": 10470828.0, "reward": 1.4907304048538208, "reward_std": 0.48672711849212646, "rewards/accuracy_reward/mean": 1.09375, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.39698031544685364, "rewards/ngram_similarity_reward/std": 0.23842564225196838, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 752.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 485.640625, "completions/mean_terminated_length": 485.640625, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.029984336540613114, "frac_reward_zero_std": 0.0, "grad_norm": 0.037593647837638855, "learning_rate": 1.473214285714286e-06, "loss": 0.0226, "num_tokens": 10637333.0, "reward": 1.6215100288391113, "reward_std": 0.4261550009250641, "rewards/accuracy_reward/mean": 1.09375, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.5277600288391113, "rewards/ngram_similarity_reward/std": 0.392575740814209, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 833.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 558.578125, "completions/mean_terminated_length": 558.578125, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.03043186395166704, "frac_reward_zero_std": 0.0, "grad_norm": 0.038532938808202744, "learning_rate": 1.4955357142857145e-06, "loss": 0.0055, "num_tokens": 10768250.0, "reward": 4.58320426940918, "reward_std": 0.2183823436498642, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5832041501998901, "rewards/ngram_similarity_reward/std": 0.2772037982940674, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 508.21875, "completions/mean_terminated_length": 508.21875, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.030879391362720967, "frac_reward_zero_std": 0.0, "grad_norm": 0.041135646402835846, "learning_rate": 1.5178571428571428e-06, "loss": 0.0032, "num_tokens": 10935176.0, "reward": 2.627410888671875, "reward_std": 2.070981979370117, "rewards/accuracy_reward/mean": 2.109375, "rewards/accuracy_reward/std": 3.0164480209350586, "rewards/ngram_similarity_reward/mean": 0.5180359482765198, "rewards/ngram_similarity_reward/std": 0.2647475302219391, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 444.890625, "completions/mean_terminated_length": 444.890625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.031326918773774894, "frac_reward_zero_std": 0.0, "grad_norm": 0.05290055274963379, "learning_rate": 1.5401785714285715e-06, "loss": -0.0024, "num_tokens": 11132913.0, "reward": 3.530862331390381, "reward_std": 1.4581063985824585, "rewards/accuracy_reward/mean": 3.21875, "rewards/accuracy_reward/std": 2.9732606410980225, "rewards/ngram_similarity_reward/mean": 0.31211215257644653, "rewards/ngram_similarity_reward/std": 0.2300308793783188, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 474.890625, "completions/mean_terminated_length": 474.890625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.031774446184828824, "frac_reward_zero_std": 0.0, "grad_norm": 0.046519357711076736, "learning_rate": 1.5625e-06, "loss": -0.015, "num_tokens": 11288058.0, "reward": 2.81695556640625, "reward_std": 0.12096526473760605, "rewards/accuracy_reward/mean": 2.46875, "rewards/accuracy_reward/std": 3.06007981300354, "rewards/ngram_similarity_reward/mean": 0.3482056260108948, "rewards/ngram_similarity_reward/std": 0.271560937166214, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 497.9375, "completions/mean_terminated_length": 497.9375, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.03222197359588275, "frac_reward_zero_std": 0.0, "grad_norm": 0.04361455515027046, "learning_rate": 1.5848214285714286e-06, "loss": -0.0223, "num_tokens": 11440454.0, "reward": 1.5024499893188477, "reward_std": 2.089134454727173, "rewards/accuracy_reward/mean": 0.984375, "rewards/accuracy_reward/std": 2.7544608116149902, "rewards/ngram_similarity_reward/mean": 0.5180749893188477, "rewards/ngram_similarity_reward/std": 0.35097816586494446, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 509.234375, "completions/mean_terminated_length": 509.234375, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.03266950100693668, "frac_reward_zero_std": 0.0, "grad_norm": 0.03942706063389778, "learning_rate": 1.6071428571428574e-06, "loss": 0.0095, "num_tokens": 11607461.0, "reward": 1.3038195371627808, "reward_std": 0.1510372906923294, "rewards/accuracy_reward/mean": 0.984375, "rewards/accuracy_reward/std": 2.6306629180908203, "rewards/ngram_similarity_reward/mean": 0.31944456696510315, "rewards/ngram_similarity_reward/std": 0.29977452754974365, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1047.0, "completions/max_terminated_length": 1047.0, "completions/mean_length": 640.359375, "completions/mean_terminated_length": 640.359375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.0331170284179906, "frac_reward_zero_std": 0.0, "grad_norm": 0.04091178998351097, "learning_rate": 1.629464285714286e-06, "loss": 0.0386, "num_tokens": 11837804.0, "reward": 1.7923917770385742, "reward_std": 0.8479336500167847, "rewards/accuracy_reward/mean": 1.421875, "rewards/accuracy_reward/std": 2.880171298980713, "rewards/ngram_similarity_reward/mean": 0.37051689624786377, "rewards/ngram_similarity_reward/std": 0.3080310523509979, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 878.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 566.265625, "completions/mean_terminated_length": 566.265625, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.03356455582904453, "frac_reward_zero_std": 0.0, "grad_norm": 0.03632889688014984, "learning_rate": 1.6517857142857144e-06, "loss": 0.0286, "num_tokens": 12010413.0, "reward": 2.0851569175720215, "reward_std": 1.3150734901428223, "rewards/accuracy_reward/mean": 1.640625, "rewards/accuracy_reward/std": 3.2409443855285645, "rewards/ngram_similarity_reward/mean": 0.44453203678131104, "rewards/ngram_similarity_reward/std": 0.2968069314956665, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 485.34375, "completions/mean_terminated_length": 485.34375, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.034012083240098454, "frac_reward_zero_std": 0.0, "grad_norm": 0.04526903107762337, "learning_rate": 1.6741071428571428e-06, "loss": 0.0471, "num_tokens": 12245683.0, "reward": -0.17060258984565735, "reward_std": 0.41853880882263184, "rewards/accuracy_reward/mean": -0.40625, "rewards/accuracy_reward/std": 0.7500000596046448, "rewards/ngram_similarity_reward/mean": 0.23564741015434265, "rewards/ngram_similarity_reward/std": 0.21910040080547333, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 430.859375, "completions/mean_terminated_length": 430.859375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.034459610651152384, "frac_reward_zero_std": 0.0, "grad_norm": 0.042782995849847794, "learning_rate": 1.6964285714285717e-06, "loss": -0.0128, "num_tokens": 12423866.0, "reward": 1.6177589893341064, "reward_std": 0.5983878374099731, "rewards/accuracy_reward/mean": 1.1875, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.43025898933410645, "rewards/ngram_similarity_reward/std": 0.3256424367427826, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 480.953125, "completions/mean_terminated_length": 480.953125, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.03490713806220631, "frac_reward_zero_std": 0.0, "grad_norm": 0.038841940462589264, "learning_rate": 1.71875e-06, "loss": 0.0114, "num_tokens": 12587143.0, "reward": 4.244690895080566, "reward_std": 0.8090140223503113, "rewards/accuracy_reward/mean": 3.703125, "rewards/accuracy_reward/std": 2.789889335632324, "rewards/ngram_similarity_reward/mean": 0.5415658950805664, "rewards/ngram_similarity_reward/std": 0.3842836916446686, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 544.9375, "completions/mean_terminated_length": 544.9375, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.03535466547326024, "frac_reward_zero_std": 0.0, "grad_norm": 0.03895031660795212, "learning_rate": 1.7410714285714286e-06, "loss": -0.0014, "num_tokens": 12752179.0, "reward": 4.22926664352417, "reward_std": 1.0198653936386108, "rewards/accuracy_reward/mean": 3.75, "rewards/accuracy_reward/std": 2.8284270763397217, "rewards/ngram_similarity_reward/mean": 0.47926658391952515, "rewards/ngram_similarity_reward/std": 0.40398722887039185, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 524.953125, "completions/mean_terminated_length": 524.953125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.03580219288431417, "frac_reward_zero_std": 0.0, "grad_norm": 0.04183723405003548, "learning_rate": 1.7633928571428574e-06, "loss": 0.002, "num_tokens": 12924576.0, "reward": 4.149851322174072, "reward_std": 0.7294169068336487, "rewards/accuracy_reward/mean": 3.78125, "rewards/accuracy_reward/std": 2.7744226455688477, "rewards/ngram_similarity_reward/mean": 0.36860156059265137, "rewards/ngram_similarity_reward/std": 0.30380597710609436, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 436.1875, "completions/mean_terminated_length": 436.1875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.03624972029536809, "frac_reward_zero_std": 0.0, "grad_norm": 0.048419512808322906, "learning_rate": 1.7857142857142859e-06, "loss": -0.004, "num_tokens": 13157100.0, "reward": 1.6001965999603271, "reward_std": 0.6172314286231995, "rewards/accuracy_reward/mean": 1.1875, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.4126965403556824, "rewards/ngram_similarity_reward/std": 0.3676692247390747, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 872.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 552.828125, "completions/mean_terminated_length": 552.828125, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.03669724770642202, "frac_reward_zero_std": 0.0, "grad_norm": 0.04128267988562584, "learning_rate": 1.8080357142857144e-06, "loss": -0.0078, "num_tokens": 13299041.0, "reward": 2.3026301860809326, "reward_std": 1.387294888496399, "rewards/accuracy_reward/mean": 1.890625, "rewards/accuracy_reward/std": 3.0164480209350586, "rewards/ngram_similarity_reward/mean": 0.4120052754878998, "rewards/ngram_similarity_reward/std": 0.34123870730400085, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 553.34375, "completions/mean_terminated_length": 553.34375, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.037144775117475944, "frac_reward_zero_std": 0.0, "grad_norm": 0.03644965589046478, "learning_rate": 1.830357142857143e-06, "loss": 0.0274, "num_tokens": 13436519.0, "reward": 5.4892730712890625, "reward_std": 0.8663454055786133, "rewards/accuracy_reward/mean": 5.03125, "rewards/accuracy_reward/std": 1.6229382753372192, "rewards/ngram_similarity_reward/mean": 0.4580225944519043, "rewards/ngram_similarity_reward/std": 0.2673526406288147, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 460.859375, "completions/mean_terminated_length": 460.859375, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.037592302528529874, "frac_reward_zero_std": 0.0, "grad_norm": 0.04325474426150322, "learning_rate": 1.8526785714285717e-06, "loss": 0.0131, "num_tokens": 13568558.0, "reward": 4.265529632568359, "reward_std": 0.8148656487464905, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.6405298709869385, "rewards/ngram_similarity_reward/std": 0.2982690930366516, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 959.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 544.1875, "completions/mean_terminated_length": 544.1875, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.0380398299395838, "frac_reward_zero_std": 0.0, "grad_norm": 0.039485905319452286, "learning_rate": 1.8750000000000003e-06, "loss": 0.0067, "num_tokens": 13701690.0, "reward": 2.838294744491577, "reward_std": 0.8348063230514526, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.33829477429389954, "rewards/ngram_similarity_reward/std": 0.1729307323694229, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 467.390625, "completions/mean_terminated_length": 442.3016052246094, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.03848735735063773, "frac_reward_zero_std": 0.0, "grad_norm": 0.051283568143844604, "learning_rate": 1.8973214285714286e-06, "loss": -0.0116, "num_tokens": 13856371.0, "reward": 2.866548776626587, "reward_std": 0.18996483087539673, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.3665488660335541, "rewards/ngram_similarity_reward/std": 0.2918168306350708, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 775.0, "completions/max_terminated_length": 775.0, "completions/mean_length": 512.171875, "completions/mean_terminated_length": 512.171875, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.03893488476169165, "frac_reward_zero_std": 0.0, "grad_norm": 0.04058018699288368, "learning_rate": 1.9196428571428573e-06, "loss": -0.0175, "num_tokens": 14019662.0, "reward": 4.449865341186523, "reward_std": 0.2415446937084198, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.4498656094074249, "rewards/ngram_similarity_reward/std": 0.2835099697113037, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 494.015625, "completions/mean_terminated_length": 494.015625, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.03938241217274558, "frac_reward_zero_std": 0.0, "grad_norm": 0.04097066447138786, "learning_rate": 1.941964285714286e-06, "loss": -0.0135, "num_tokens": 14143855.0, "reward": 0.696371853351593, "reward_std": 0.8611838221549988, "rewards/accuracy_reward/mean": 0.34375, "rewards/accuracy_reward/std": 2.102294683456421, "rewards/ngram_similarity_reward/mean": 0.352621853351593, "rewards/ngram_similarity_reward/std": 0.2985036075115204, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 477.4375, "completions/mean_terminated_length": 477.4375, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.03982993958379951, "frac_reward_zero_std": 0.0, "grad_norm": 0.04545062407851219, "learning_rate": 1.9642857142857144e-06, "loss": -0.0054, "num_tokens": 14315243.0, "reward": 0.8684969544410706, "reward_std": 1.7307615280151367, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 2.566380739212036, "rewards/ngram_similarity_reward/mean": 0.14974701404571533, "rewards/ngram_similarity_reward/std": 0.08504395186901093, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 413.359375, "completions/mean_terminated_length": 413.359375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.040277466994853434, "frac_reward_zero_std": 0.0, "grad_norm": 0.046176016330718994, "learning_rate": 1.9866071428571427e-06, "loss": -0.0133, "num_tokens": 14486578.0, "reward": 1.0136975049972534, "reward_std": 1.921843409538269, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 2.5, "rewards/ngram_similarity_reward/mean": 0.201197549700737, "rewards/ngram_similarity_reward/std": 0.16361328959465027, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 475.71875, "completions/mean_terminated_length": 450.7619323730469, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.040724994405907364, "frac_reward_zero_std": 0.0, "grad_norm": 0.04870830848813057, "learning_rate": 2.0089285714285715e-06, "loss": 0.0291, "num_tokens": 14603360.0, "reward": 1.7914204597473145, "reward_std": 0.9845656156539917, "rewards/accuracy_reward/mean": 1.28125, "rewards/accuracy_reward/std": 2.9945387840270996, "rewards/ngram_similarity_reward/mean": 0.510170578956604, "rewards/ngram_similarity_reward/std": 0.4287711977958679, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 446.5625, "completions/mean_terminated_length": 446.5625, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.04117252181696129, "frac_reward_zero_std": 0.0, "grad_norm": 0.05217056721448898, "learning_rate": 2.0312500000000002e-06, "loss": -0.0099, "num_tokens": 14773364.0, "reward": 4.463947772979736, "reward_std": 0.512155294418335, "rewards/accuracy_reward/mean": 4.09375, "rewards/accuracy_reward/std": 2.5617377758026123, "rewards/ngram_similarity_reward/mean": 0.3701978325843811, "rewards/ngram_similarity_reward/std": 0.3230494260787964, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 407.0, "completions/mean_terminated_length": 407.0, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.04162004922801522, "frac_reward_zero_std": 0.0, "grad_norm": 0.055510032922029495, "learning_rate": 2.0535714285714286e-06, "loss": -0.0025, "num_tokens": 14983252.0, "reward": 2.310368061065674, "reward_std": 1.5189405679702759, "rewards/accuracy_reward/mean": 2.03125, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.27911806106567383, "rewards/ngram_similarity_reward/std": 0.20924191176891327, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 491.546875, "completions/mean_terminated_length": 491.546875, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.04206757663906914, "frac_reward_zero_std": 0.0, "grad_norm": 0.04173356667160988, "learning_rate": 2.0758928571428573e-06, "loss": 0.0024, "num_tokens": 15147351.0, "reward": 3.0661611557006836, "reward_std": 0.45635727047920227, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.47241121530532837, "rewards/ngram_similarity_reward/std": 0.2998490035533905, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 463.4375, "completions/mean_terminated_length": 463.4375, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.04251510405012307, "frac_reward_zero_std": 0.0, "grad_norm": 0.04815267398953438, "learning_rate": 2.098214285714286e-06, "loss": 0.0653, "num_tokens": 15272083.0, "reward": 3.49208664894104, "reward_std": 2.500577688217163, "rewards/accuracy_reward/mean": 3.140625, "rewards/accuracy_reward/std": 2.9727182388305664, "rewards/ngram_similarity_reward/mean": 0.3514615297317505, "rewards/ngram_similarity_reward/std": 0.3659766614437103, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 379.0625, "completions/mean_terminated_length": 379.0625, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.042962631461177, "frac_reward_zero_std": 0.0, "grad_norm": 0.04753585159778595, "learning_rate": 2.1205357142857144e-06, "loss": 0.0354, "num_tokens": 15390535.0, "reward": 6.03642463684082, "reward_std": 0.17955493927001953, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.5364243984222412, "rewards/ngram_similarity_reward/std": 0.34952878952026367, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1160.0, "completions/max_terminated_length": 1160.0, "completions/mean_length": 636.890625, "completions/mean_terminated_length": 636.890625, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.043410158872230924, "frac_reward_zero_std": 0.0, "grad_norm": 0.03281034156680107, "learning_rate": 2.1428571428571427e-06, "loss": -0.0114, "num_tokens": 15560800.0, "reward": 3.7070133686065674, "reward_std": 1.9805774688720703, "rewards/accuracy_reward/mean": 3.328125, "rewards/accuracy_reward/std": 2.9252848625183105, "rewards/ngram_similarity_reward/mean": 0.37888818979263306, "rewards/ngram_similarity_reward/std": 0.2659132778644562, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 419.578125, "completions/mean_terminated_length": 419.578125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.043857686283284854, "frac_reward_zero_std": 0.0, "grad_norm": 0.04952317103743553, "learning_rate": 2.1651785714285715e-06, "loss": 0.0254, "num_tokens": 15709669.0, "reward": 4.6569719314575195, "reward_std": 1.7644875049591064, "rewards/accuracy_reward/mean": 4.375, "rewards/accuracy_reward/std": 2.3603873252868652, "rewards/ngram_similarity_reward/mean": 0.2819717824459076, "rewards/ngram_similarity_reward/std": 0.20795418322086334, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 828.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 591.234375, "completions/mean_terminated_length": 591.234375, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.04430521369433878, "frac_reward_zero_std": 0.0, "grad_norm": 0.03588514402508736, "learning_rate": 2.1875000000000002e-06, "loss": 0.01, "num_tokens": 15931524.0, "reward": 3.773355484008789, "reward_std": 1.285116195678711, "rewards/accuracy_reward/mean": 3.375, "rewards/accuracy_reward/std": 3.0783421993255615, "rewards/ngram_similarity_reward/mean": 0.3983556032180786, "rewards/ngram_similarity_reward/std": 0.2645743191242218, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 493.515625, "completions/mean_terminated_length": 493.515625, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.04475274110539271, "frac_reward_zero_std": 0.0, "grad_norm": 0.04176740348339081, "learning_rate": 2.2098214285714286e-06, "loss": -0.0062, "num_tokens": 16140549.0, "reward": 4.266956329345703, "reward_std": 0.763481855392456, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.5482061505317688, "rewards/ngram_similarity_reward/std": 0.4332646429538727, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 792.0, "completions/max_terminated_length": 792.0, "completions/mean_length": 518.03125, "completions/mean_terminated_length": 518.03125, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.04520026851644663, "frac_reward_zero_std": 0.0, "grad_norm": 0.04275078326463699, "learning_rate": 2.2321428571428573e-06, "loss": 0.0128, "num_tokens": 16286791.0, "reward": 4.03216552734375, "reward_std": 0.9512901306152344, "rewards/accuracy_reward/mean": 3.40625, "rewards/accuracy_reward/std": 2.920745372772217, "rewards/ngram_similarity_reward/mean": 0.62591552734375, "rewards/ngram_similarity_reward/std": 0.2111629694700241, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 483.484375, "completions/mean_terminated_length": 483.484375, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.04564779592750056, "frac_reward_zero_std": 0.0, "grad_norm": 0.03929956257343292, "learning_rate": 2.254464285714286e-06, "loss": -0.0066, "num_tokens": 16411382.0, "reward": 3.690983533859253, "reward_std": 0.8513113260269165, "rewards/accuracy_reward/mean": 3.15625, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.5347336530685425, "rewards/ngram_similarity_reward/std": 0.31054314970970154, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 387.15625, "completions/mean_terminated_length": 387.15625, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.046095323338554484, "frac_reward_zero_std": 0.0, "grad_norm": 0.048255544155836105, "learning_rate": 2.2767857142857144e-06, "loss": 0.0101, "num_tokens": 16553152.0, "reward": 5.592085361480713, "reward_std": 1.3386536836624146, "rewards/accuracy_reward/mean": 5.03125, "rewards/accuracy_reward/std": 1.6229382753372192, "rewards/ngram_similarity_reward/mean": 0.5608352422714233, "rewards/ngram_similarity_reward/std": 0.3552662134170532, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 431.828125, "completions/mean_terminated_length": 431.828125, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.046542850749608414, "frac_reward_zero_std": 0.0, "grad_norm": 0.04789068549871445, "learning_rate": 2.2991071428571427e-06, "loss": -0.0112, "num_tokens": 16694837.0, "reward": 1.405816912651062, "reward_std": 1.6296827793121338, "rewards/accuracy_reward/mean": 1.09375, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.31206685304641724, "rewards/ngram_similarity_reward/std": 0.17024105787277222, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 564.84375, "completions/mean_terminated_length": 564.84375, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.046990378160662344, "frac_reward_zero_std": 0.0, "grad_norm": 0.04270428791642189, "learning_rate": 2.321428571428572e-06, "loss": 0.0652, "num_tokens": 16856475.0, "reward": 4.657665729522705, "reward_std": 0.23186588287353516, "rewards/accuracy_reward/mean": 3.984375, "rewards/accuracy_reward/std": 2.648702621459961, "rewards/ngram_similarity_reward/mean": 0.6732906103134155, "rewards/ngram_similarity_reward/std": 0.34633952379226685, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 563.3125, "completions/mean_terminated_length": 563.3125, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.04743790557171627, "frac_reward_zero_std": 0.0, "grad_norm": 0.039831411093473434, "learning_rate": 2.3437500000000002e-06, "loss": -0.0179, "num_tokens": 17034127.0, "reward": 2.423933982849121, "reward_std": 1.1356269121170044, "rewards/accuracy_reward/mean": 2.109375, "rewards/accuracy_reward/std": 3.0164480209350586, "rewards/ngram_similarity_reward/mean": 0.31455907225608826, "rewards/ngram_similarity_reward/std": 0.2853781580924988, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 323.40625, "completions/mean_terminated_length": 323.40625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.0478854329827702, "frac_reward_zero_std": 0.0, "grad_norm": 0.059055786579847336, "learning_rate": 2.3660714285714285e-06, "loss": -0.0032, "num_tokens": 17159337.0, "reward": 4.369568824768066, "reward_std": 1.6163207292556763, "rewards/accuracy_reward/mean": 3.53125, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.8383191227912903, "rewards/ngram_similarity_reward/std": 0.43380168080329895, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1108.0, "completions/max_terminated_length": 1108.0, "completions/mean_length": 609.796875, "completions/mean_terminated_length": 609.796875, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.04833296039382412, "frac_reward_zero_std": 0.0, "grad_norm": 0.03806445747613907, "learning_rate": 2.3883928571428573e-06, "loss": 0.004, "num_tokens": 17346972.0, "reward": 0.2939775288105011, "reward_std": 1.3833703994750977, "rewards/accuracy_reward/mean": -0.0625, "rewards/accuracy_reward/std": 2.006932497024536, "rewards/ngram_similarity_reward/mean": 0.3564775288105011, "rewards/ngram_similarity_reward/std": 0.18514876067638397, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 836.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 585.234375, "completions/mean_terminated_length": 585.234375, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.04878048780487805, "frac_reward_zero_std": 0.0, "grad_norm": 0.03510132431983948, "learning_rate": 2.410714285714286e-06, "loss": 0.0063, "num_tokens": 17471019.0, "reward": 4.088577747344971, "reward_std": 1.2437450885772705, "rewards/accuracy_reward/mean": 3.578125, "rewards/accuracy_reward/std": 2.880171298980713, "rewards/ngram_similarity_reward/mean": 0.5104526281356812, "rewards/ngram_similarity_reward/std": 0.3364333212375641, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 351.109375, "completions/mean_terminated_length": 351.109375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.049228015215931974, "frac_reward_zero_std": 0.0, "grad_norm": 0.05111013725399971, "learning_rate": 2.4330357142857144e-06, "loss": -0.0383, "num_tokens": 17628610.0, "reward": 4.171026229858398, "reward_std": 2.411790370941162, "rewards/accuracy_reward/mean": 3.703125, "rewards/accuracy_reward/std": 2.789889335632324, "rewards/ngram_similarity_reward/mean": 0.46790117025375366, "rewards/ngram_similarity_reward/std": 0.3062148094177246, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 503.578125, "completions/mean_terminated_length": 503.578125, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.049675542626985904, "frac_reward_zero_std": 0.0, "grad_norm": 0.04645279049873352, "learning_rate": 2.455357142857143e-06, "loss": -0.0245, "num_tokens": 17800039.0, "reward": 2.558358669281006, "reward_std": 0.6171290278434753, "rewards/accuracy_reward/mean": 2.203125, "rewards/accuracy_reward/std": 3.2400259971618652, "rewards/ngram_similarity_reward/mean": 0.3552337884902954, "rewards/ngram_similarity_reward/std": 0.31524649262428284, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 581.890625, "completions/mean_terminated_length": 581.890625, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.05012307003803983, "frac_reward_zero_std": 0.0, "grad_norm": 0.033150479197502136, "learning_rate": 2.477678571428572e-06, "loss": 0.0029, "num_tokens": 17942640.0, "reward": 4.360316753387451, "reward_std": 0.6434515118598938, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.5478165745735168, "rewards/ngram_similarity_reward/std": 0.34956124424934387, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 496.65625, "completions/mean_terminated_length": 496.65625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.05057059744909376, "frac_reward_zero_std": 0.0, "grad_norm": 0.0438249446451664, "learning_rate": 2.5e-06, "loss": -0.0134, "num_tokens": 18105098.0, "reward": 0.2818566560745239, "reward_std": 1.0355392694473267, "rewards/accuracy_reward/mean": -0.078125, "rewards/accuracy_reward/std": 2.3217720985412598, "rewards/ngram_similarity_reward/mean": 0.35998162627220154, "rewards/ngram_similarity_reward/std": 0.3143450915813446, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 898.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 474.4375, "completions/mean_terminated_length": 474.4375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.05101812486014769, "frac_reward_zero_std": 0.25, "grad_norm": 0.03859485685825348, "learning_rate": 2.5223214285714285e-06, "loss": 0.0201, "num_tokens": 18257814.0, "reward": 2.7156527042388916, "reward_std": 0.7003411650657654, "rewards/accuracy_reward/mean": 2.265625, "rewards/accuracy_reward/std": 3.0692648887634277, "rewards/ngram_similarity_reward/mean": 0.4500276446342468, "rewards/ngram_similarity_reward/std": 0.3689570426940918, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 458.734375, "completions/mean_terminated_length": 458.734375, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.05146565227120161, "frac_reward_zero_std": 0.0, "grad_norm": 0.04315647855401039, "learning_rate": 2.5446428571428573e-06, "loss": -0.0008, "num_tokens": 18409813.0, "reward": 2.2372071743011475, "reward_std": 2.4554202556610107, "rewards/accuracy_reward/mean": 1.734375, "rewards/accuracy_reward/std": 2.9425299167633057, "rewards/ngram_similarity_reward/mean": 0.5028321743011475, "rewards/ngram_similarity_reward/std": 0.3332710266113281, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 553.0, "completions/mean_terminated_length": 553.0, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.05191317968225554, "frac_reward_zero_std": 0.0, "grad_norm": 0.03877819702029228, "learning_rate": 2.5669642857142856e-06, "loss": 0.0226, "num_tokens": 18551605.0, "reward": 2.798994779586792, "reward_std": 0.607537031173706, "rewards/accuracy_reward/mean": 2.359375, "rewards/accuracy_reward/std": 3.075077533721924, "rewards/ngram_similarity_reward/mean": 0.439619779586792, "rewards/ngram_similarity_reward/std": 0.34050148725509644, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 520.25, "completions/mean_terminated_length": 520.25, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.052360707093309464, "frac_reward_zero_std": 0.0, "grad_norm": 0.039993252605199814, "learning_rate": 2.5892857142857148e-06, "loss": 0.0302, "num_tokens": 18704229.0, "reward": 2.8932952880859375, "reward_std": 0.6330355405807495, "rewards/accuracy_reward/mean": 2.4375, "rewards/accuracy_reward/std": 3.3040380477905273, "rewards/ngram_similarity_reward/mean": 0.4557953178882599, "rewards/ngram_similarity_reward/std": 0.2842147946357727, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 528.09375, "completions/mean_terminated_length": 528.09375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.052808234504363394, "frac_reward_zero_std": 0.0, "grad_norm": 0.03796974569559097, "learning_rate": 2.611607142857143e-06, "loss": 0.0116, "num_tokens": 18892827.0, "reward": 3.090029239654541, "reward_std": 1.1082619428634644, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.5900291204452515, "rewards/ngram_similarity_reward/std": 0.3288757801055908, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 935.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 515.0625, "completions/mean_terminated_length": 515.0625, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.05325576191541732, "frac_reward_zero_std": 0.0, "grad_norm": 0.04664303734898567, "learning_rate": 2.633928571428572e-06, "loss": -0.0034, "num_tokens": 19058383.0, "reward": 1.6570677757263184, "reward_std": 1.6934335231781006, "rewards/accuracy_reward/mean": 1.21875, "rewards/accuracy_reward/std": 2.9302406311035156, "rewards/ngram_similarity_reward/mean": 0.43831780552864075, "rewards/ngram_similarity_reward/std": 0.25029969215393066, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1335.0, "completions/max_terminated_length": 1335.0, "completions/mean_length": 571.40625, "completions/mean_terminated_length": 571.40625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.05370328932647125, "frac_reward_zero_std": 0.0, "grad_norm": 0.039782412350177765, "learning_rate": 2.65625e-06, "loss": 0.0124, "num_tokens": 19215817.0, "reward": 1.7129923105239868, "reward_std": 1.179494023323059, "rewards/accuracy_reward/mean": 1.28125, "rewards/accuracy_reward/std": 2.881075382232666, "rewards/ngram_similarity_reward/mean": 0.43174242973327637, "rewards/ngram_similarity_reward/std": 0.22701890766620636, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 522.953125, "completions/mean_terminated_length": 522.953125, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.05415081673752517, "frac_reward_zero_std": 0.0, "grad_norm": 0.04186190664768219, "learning_rate": 2.6785714285714285e-06, "loss": 0.0102, "num_tokens": 19349942.0, "reward": 4.323519706726074, "reward_std": 0.27767765522003174, "rewards/accuracy_reward/mean": 3.890625, "rewards/accuracy_reward/std": 2.8206562995910645, "rewards/ngram_similarity_reward/mean": 0.4328947067260742, "rewards/ngram_similarity_reward/std": 0.38358616828918457, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 456.78125, "completions/mean_terminated_length": 456.78125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.0545983441485791, "frac_reward_zero_std": 0.0, "grad_norm": 0.040842872112989426, "learning_rate": 2.7008928571428573e-06, "loss": 0.0099, "num_tokens": 19477352.0, "reward": 2.548858642578125, "reward_std": 1.7461631298065186, "rewards/accuracy_reward/mean": 2.078125, "rewards/accuracy_reward/std": 3.0488338470458984, "rewards/ngram_similarity_reward/mean": 0.4707334041595459, "rewards/ngram_similarity_reward/std": 0.36607521772384644, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 526.71875, "completions/mean_terminated_length": 526.71875, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.05504587155963303, "frac_reward_zero_std": 0.0, "grad_norm": 0.03907957300543785, "learning_rate": 2.7232142857142856e-06, "loss": 0.0137, "num_tokens": 19622662.0, "reward": 3.366270065307617, "reward_std": 1.1402533054351807, "rewards/accuracy_reward/mean": 2.953125, "rewards/accuracy_reward/std": 3.0075550079345703, "rewards/ngram_similarity_reward/mean": 0.4131450653076172, "rewards/ngram_similarity_reward/std": 0.33299681544303894, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1155.0, "completions/max_terminated_length": 1155.0, "completions/mean_length": 539.546875, "completions/mean_terminated_length": 539.546875, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.055493398970686954, "frac_reward_zero_std": 0.0, "grad_norm": 0.038894135504961014, "learning_rate": 2.7455357142857148e-06, "loss": -0.0165, "num_tokens": 19796617.0, "reward": 3.563744068145752, "reward_std": 1.986689805984497, "rewards/accuracy_reward/mean": 3.25, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.3137441873550415, "rewards/ngram_similarity_reward/std": 0.23322314023971558, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 405.703125, "completions/mean_terminated_length": 405.703125, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.055940926381740884, "frac_reward_zero_std": 0.0, "grad_norm": 0.051937028765678406, "learning_rate": 2.767857142857143e-06, "loss": 0.0385, "num_tokens": 19931254.0, "reward": 2.6729683876037598, "reward_std": 1.4942256212234497, "rewards/accuracy_reward/mean": 2.3125, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.36046838760375977, "rewards/ngram_similarity_reward/std": 0.23005659878253937, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 598.828125, "completions/mean_terminated_length": 575.825439453125, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.05638845379279481, "frac_reward_zero_std": 0.0, "grad_norm": 0.032587017863988876, "learning_rate": 2.790178571428572e-06, "loss": -0.0526, "num_tokens": 20112907.0, "reward": 5.738628387451172, "reward_std": 1.0721367597579956, "rewards/accuracy_reward/mean": 5.203125, "rewards/accuracy_reward/std": 1.3531819581985474, "rewards/ngram_similarity_reward/mean": 0.5355039834976196, "rewards/ngram_similarity_reward/std": 0.25528010725975037, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 359.921875, "completions/mean_terminated_length": 359.921875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.05683598120384874, "frac_reward_zero_std": 0.0, "grad_norm": 0.05148518830537796, "learning_rate": 2.8125e-06, "loss": 0.0046, "num_tokens": 20219878.0, "reward": 4.365318775177002, "reward_std": 0.6934431195259094, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.5528188943862915, "rewards/ngram_similarity_reward/std": 0.33181366324424744, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 522.25, "completions/mean_terminated_length": 522.25, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.05728350861490266, "frac_reward_zero_std": 0.0, "grad_norm": 0.04125802963972092, "learning_rate": 2.834821428571429e-06, "loss": 0.0127, "num_tokens": 20376422.0, "reward": 3.3187947273254395, "reward_std": 0.7205219268798828, "rewards/accuracy_reward/mean": 2.78125, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.537544846534729, "rewards/ngram_similarity_reward/std": 0.26360082626342773, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 441.53125, "completions/mean_terminated_length": 441.53125, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.05773103602595659, "frac_reward_zero_std": 0.0, "grad_norm": 0.04405700042843819, "learning_rate": 2.8571428571428573e-06, "loss": -0.0094, "num_tokens": 20533656.0, "reward": 4.606054782867432, "reward_std": 0.45147010684013367, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.6998050212860107, "rewards/ngram_similarity_reward/std": 0.36088237166404724, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 519.921875, "completions/mean_terminated_length": 519.921875, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.058178563437010514, "frac_reward_zero_std": 0.0, "grad_norm": 0.05712265148758888, "learning_rate": 2.8794642857142856e-06, "loss": -0.0075, "num_tokens": 20753891.0, "reward": 2.823075532913208, "reward_std": 0.505908727645874, "rewards/accuracy_reward/mean": 2.375, "rewards/accuracy_reward/std": 3.057647228240967, "rewards/ngram_similarity_reward/mean": 0.44807544350624084, "rewards/ngram_similarity_reward/std": 0.31414222717285156, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 547.8125, "completions/mean_terminated_length": 547.8125, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.058626090848064444, "frac_reward_zero_std": 0.0, "grad_norm": 0.038814567029476166, "learning_rate": 2.9017857142857148e-06, "loss": 0.0243, "num_tokens": 20905799.0, "reward": 6.176115989685059, "reward_std": 0.22915717959403992, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.6761159896850586, "rewards/ngram_similarity_reward/std": 0.29612261056900024, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 897.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 549.984375, "completions/mean_terminated_length": 549.984375, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.059073618259118374, "frac_reward_zero_std": 0.0, "grad_norm": 0.04424475133419037, "learning_rate": 2.924107142857143e-06, "loss": 0.0323, "num_tokens": 21116870.0, "reward": 2.8654801845550537, "reward_std": 0.6325905323028564, "rewards/accuracy_reward/mean": 2.390625, "rewards/accuracy_reward/std": 3.0400354862213135, "rewards/ngram_similarity_reward/mean": 0.4748552441596985, "rewards/ngram_similarity_reward/std": 0.3601526618003845, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 425.671875, "completions/mean_terminated_length": 425.671875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.0595211456701723, "frac_reward_zero_std": 0.0, "grad_norm": 0.048342108726501465, "learning_rate": 2.946428571428572e-06, "loss": -0.0011, "num_tokens": 21273953.0, "reward": 3.6667661666870117, "reward_std": 1.964839220046997, "rewards/accuracy_reward/mean": 3.15625, "rewards/accuracy_reward/std": 3.0613763332366943, "rewards/ngram_similarity_reward/mean": 0.5105161666870117, "rewards/ngram_similarity_reward/std": 0.3453754186630249, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 495.8125, "completions/mean_terminated_length": 495.8125, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.05996867308122623, "frac_reward_zero_std": 0.0, "grad_norm": 0.04297291487455368, "learning_rate": 2.96875e-06, "loss": -0.0019, "num_tokens": 21426885.0, "reward": 1.790950059890747, "reward_std": 2.361938714981079, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 3.0315799713134766, "rewards/ngram_similarity_reward/mean": 0.41595014929771423, "rewards/ngram_similarity_reward/std": 0.2542986571788788, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 752.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 538.9375, "completions/mean_terminated_length": 538.9375, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.06041620049228015, "frac_reward_zero_std": 0.0, "grad_norm": 0.03412027657032013, "learning_rate": 2.991071428571429e-06, "loss": 0.0113, "num_tokens": 21567537.0, "reward": 3.170560598373413, "reward_std": 0.8621605634689331, "rewards/accuracy_reward/mean": 2.8125, "rewards/accuracy_reward/std": 3.080275297164917, "rewards/ngram_similarity_reward/mean": 0.3580605685710907, "rewards/ngram_similarity_reward/std": 0.26210764050483704, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 828.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 499.171875, "completions/mean_terminated_length": 499.171875, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.06086372790333408, "frac_reward_zero_std": 0.0, "grad_norm": 0.04292212426662445, "learning_rate": 3.0133928571428572e-06, "loss": 0.0119, "num_tokens": 21743868.0, "reward": 0.061600834131240845, "reward_std": 1.426113486289978, "rewards/accuracy_reward/mean": -0.15625, "rewards/accuracy_reward/std": 1.8790301084518433, "rewards/ngram_similarity_reward/mean": 0.21785084903240204, "rewards/ngram_similarity_reward/std": 0.15603578090667725, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 532.078125, "completions/mean_terminated_length": 532.078125, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.061311255314388004, "frac_reward_zero_std": 0.0, "grad_norm": 0.038399964570999146, "learning_rate": 3.0357142857142856e-06, "loss": 0.0026, "num_tokens": 21879521.0, "reward": 4.752626419067383, "reward_std": 1.480465292930603, "rewards/accuracy_reward/mean": 4.359375, "rewards/accuracy_reward/std": 2.3962087631225586, "rewards/ngram_similarity_reward/mean": 0.3932513892650604, "rewards/ngram_similarity_reward/std": 0.2955648601055145, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 467.15625, "completions/mean_terminated_length": 467.15625, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.061758782725441934, "frac_reward_zero_std": 0.0, "grad_norm": 0.04397137835621834, "learning_rate": 3.0580357142857147e-06, "loss": -0.0153, "num_tokens": 22029819.0, "reward": 3.7678921222686768, "reward_std": 1.2001043558120728, "rewards/accuracy_reward/mean": 3.34375, "rewards/accuracy_reward/std": 2.9016621112823486, "rewards/ngram_similarity_reward/mean": 0.42414209246635437, "rewards/ngram_similarity_reward/std": 0.2783641219139099, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 502.25, "completions/mean_terminated_length": 502.25, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.06220631013649586, "frac_reward_zero_std": 0.0, "grad_norm": 0.03933661803603172, "learning_rate": 3.080357142857143e-06, "loss": 0.0225, "num_tokens": 22192283.0, "reward": 1.9660686254501343, "reward_std": 0.9389387965202332, "rewards/accuracy_reward/mean": 1.53125, "rewards/accuracy_reward/std": 2.900294303894043, "rewards/ngram_similarity_reward/mean": 0.43481865525245667, "rewards/ngram_similarity_reward/std": 0.3456280529499054, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 413.78125, "completions/mean_terminated_length": 413.78125, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.06265383754754979, "frac_reward_zero_std": 0.0, "grad_norm": 0.04940592870116234, "learning_rate": 3.102678571428572e-06, "loss": -0.0052, "num_tokens": 22412765.0, "reward": 2.9114465713500977, "reward_std": 0.5219171047210693, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.5051964521408081, "rewards/ngram_similarity_reward/std": 0.3867158889770508, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 563.984375, "completions/mean_terminated_length": 563.984375, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.06310136495860372, "frac_reward_zero_std": 0.0, "grad_norm": 0.03664904087781906, "learning_rate": 3.125e-06, "loss": 0.0047, "num_tokens": 22587036.0, "reward": 5.278068542480469, "reward_std": 1.061753273010254, "rewards/accuracy_reward/mean": 4.75, "rewards/accuracy_reward/std": 2.1602470874786377, "rewards/ngram_similarity_reward/mean": 0.5280686020851135, "rewards/ngram_similarity_reward/std": 0.248811274766922, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 398.796875, "completions/mean_terminated_length": 398.796875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.06354889236965765, "frac_reward_zero_std": 0.0, "grad_norm": 0.04452500119805336, "learning_rate": 3.147321428571429e-06, "loss": 0.0195, "num_tokens": 22730191.0, "reward": 3.75919771194458, "reward_std": 2.232400417327881, "rewards/accuracy_reward/mean": 3.21875, "rewards/accuracy_reward/std": 2.9732606410980225, "rewards/ngram_similarity_reward/mean": 0.5404477715492249, "rewards/ngram_similarity_reward/std": 0.45260751247406006, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 414.890625, "completions/mean_terminated_length": 414.890625, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.06399641978071156, "frac_reward_zero_std": 0.0, "grad_norm": 0.047404687851667404, "learning_rate": 3.1696428571428572e-06, "loss": -0.0106, "num_tokens": 22862792.0, "reward": 1.301940679550171, "reward_std": 0.5988505482673645, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 2.5850608348846436, "rewards/ngram_similarity_reward/mean": 0.4269407093524933, "rewards/ngram_similarity_reward/std": 0.3728107511997223, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 516.234375, "completions/mean_terminated_length": 516.234375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.0644439471917655, "frac_reward_zero_std": 0.0, "grad_norm": 0.03927481546998024, "learning_rate": 3.1919642857142856e-06, "loss": -0.0034, "num_tokens": 22998087.0, "reward": 2.7710466384887695, "reward_std": 1.9648394584655762, "rewards/accuracy_reward/mean": 2.484375, "rewards/accuracy_reward/std": 3.2537753582000732, "rewards/ngram_similarity_reward/mean": 0.28667137026786804, "rewards/ngram_similarity_reward/std": 0.20069169998168945, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 507.140625, "completions/mean_terminated_length": 507.140625, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.06489147460281942, "frac_reward_zero_std": 0.0, "grad_norm": 0.043464187532663345, "learning_rate": 3.2142857142857147e-06, "loss": -0.0193, "num_tokens": 23129872.0, "reward": 0.631460964679718, "reward_std": 1.3900837898254395, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 2.122255563735962, "rewards/ngram_similarity_reward/mean": 0.3189609944820404, "rewards/ngram_similarity_reward/std": 0.18644043803215027, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 430.859375, "completions/mean_terminated_length": 430.859375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.06533900201387335, "frac_reward_zero_std": 0.0, "grad_norm": 0.04917949065566063, "learning_rate": 3.2366071428571435e-06, "loss": 0.0218, "num_tokens": 23257031.0, "reward": 2.4517362117767334, "reward_std": 0.8715531229972839, "rewards/accuracy_reward/mean": 2.109375, "rewards/accuracy_reward/std": 3.0164480209350586, "rewards/ngram_similarity_reward/mean": 0.34236112236976624, "rewards/ngram_similarity_reward/std": 0.3702693283557892, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 435.46875, "completions/mean_terminated_length": 435.46875, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.06578652942492727, "frac_reward_zero_std": 0.0, "grad_norm": 0.045783959329128265, "learning_rate": 3.258928571428572e-06, "loss": 0.0036, "num_tokens": 23395637.0, "reward": 4.754144668579102, "reward_std": 1.722858190536499, "rewards/accuracy_reward/mean": 4.28125, "rewards/accuracy_reward/std": 2.4330317974090576, "rewards/ngram_similarity_reward/mean": 0.4728948473930359, "rewards/ngram_similarity_reward/std": 0.3473533093929291, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1246.0, "completions/max_terminated_length": 1246.0, "completions/mean_length": 513.96875, "completions/mean_terminated_length": 513.96875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.0662340568359812, "frac_reward_zero_std": 0.0, "grad_norm": 0.047589968889951706, "learning_rate": 3.28125e-06, "loss": 0.006, "num_tokens": 23523539.0, "reward": 4.243943214416504, "reward_std": 0.95088791847229, "rewards/accuracy_reward/mean": 3.796875, "rewards/accuracy_reward/std": 2.746886730194092, "rewards/ngram_similarity_reward/mean": 0.4470682442188263, "rewards/ngram_similarity_reward/std": 0.33129891753196716, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 448.765625, "completions/mean_terminated_length": 448.765625, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.06668158424703513, "frac_reward_zero_std": 0.0, "grad_norm": 0.048214901238679886, "learning_rate": 3.303571428571429e-06, "loss": -0.0314, "num_tokens": 23715156.0, "reward": 2.7262778282165527, "reward_std": 2.69027042388916, "rewards/accuracy_reward/mean": 2.375, "rewards/accuracy_reward/std": 3.057647228240967, "rewards/ngram_similarity_reward/mean": 0.35127800703048706, "rewards/ngram_similarity_reward/std": 0.27484095096588135, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 459.375, "completions/mean_terminated_length": 459.375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.06712911165808906, "frac_reward_zero_std": 0.0, "grad_norm": 0.04197873920202255, "learning_rate": 3.3258928571428572e-06, "loss": 0.0316, "num_tokens": 23860924.0, "reward": 3.065863847732544, "reward_std": 0.19159740209579468, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.565863847732544, "rewards/ngram_similarity_reward/std": 0.30514025688171387, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 907.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 548.921875, "completions/mean_terminated_length": 548.921875, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.06757663906914299, "frac_reward_zero_std": 0.0, "grad_norm": 0.032930515706539154, "learning_rate": 3.3482142857142855e-06, "loss": 0.0249, "num_tokens": 24041575.0, "reward": 1.9457918405532837, "reward_std": 2.021454095840454, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 2.872281312942505, "rewards/ngram_similarity_reward/mean": 0.38329190015792847, "rewards/ngram_similarity_reward/std": 0.2622060477733612, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 391.921875, "completions/mean_terminated_length": 391.921875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.06802416648019691, "frac_reward_zero_std": 0.25, "grad_norm": 0.04094477370381355, "learning_rate": 3.3705357142857147e-06, "loss": 0.0036, "num_tokens": 24213394.0, "reward": 3.6835200786590576, "reward_std": 1.5544017553329468, "rewards/accuracy_reward/mean": 2.828125, "rewards/accuracy_reward/std": 3.060525417327881, "rewards/ngram_similarity_reward/mean": 0.8553951978683472, "rewards/ngram_similarity_reward/std": 0.3850794732570648, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 465.234375, "completions/mean_terminated_length": 465.234375, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.06847169389125084, "frac_reward_zero_std": 0.0, "grad_norm": 0.04359624162316322, "learning_rate": 3.3928571428571435e-06, "loss": -0.0134, "num_tokens": 24358353.0, "reward": 3.7651684284210205, "reward_std": 2.4716694355010986, "rewards/accuracy_reward/mean": 3.203125, "rewards/accuracy_reward/std": 3.104935646057129, "rewards/ngram_similarity_reward/mean": 0.5620434284210205, "rewards/ngram_similarity_reward/std": 0.42700931429862976, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 411.640625, "completions/mean_terminated_length": 411.640625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.06891922130230477, "frac_reward_zero_std": 0.0, "grad_norm": 0.04364282637834549, "learning_rate": 3.415178571428572e-06, "loss": 0.0097, "num_tokens": 24512362.0, "reward": 4.9647016525268555, "reward_std": 0.9715432524681091, "rewards/accuracy_reward/mean": 4.640625, "rewards/accuracy_reward/std": 2.1445181369781494, "rewards/ngram_similarity_reward/mean": 0.3240765333175659, "rewards/ngram_similarity_reward/std": 0.24237005412578583, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 513.984375, "completions/mean_terminated_length": 513.984375, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.0693667487133587, "frac_reward_zero_std": 0.0, "grad_norm": 0.03732859343290329, "learning_rate": 3.4375e-06, "loss": 0.0164, "num_tokens": 24673833.0, "reward": 3.5389277935028076, "reward_std": 0.9162474274635315, "rewards/accuracy_reward/mean": 2.90625, "rewards/accuracy_reward/std": 3.2791731357574463, "rewards/ngram_similarity_reward/mean": 0.6326779127120972, "rewards/ngram_similarity_reward/std": 0.33283960819244385, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 505.875, "completions/mean_terminated_length": 456.1290283203125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.06981427612441261, "frac_reward_zero_std": 0.0, "grad_norm": 0.05703406408429146, "learning_rate": 3.459821428571429e-06, "loss": 0.0632, "num_tokens": 24883121.0, "reward": 1.2255878448486328, "reward_std": 1.6483601331710815, "rewards/accuracy_reward/mean": 0.890625, "rewards/accuracy_reward/std": 3.0478575229644775, "rewards/ngram_similarity_reward/mean": 0.3349628448486328, "rewards/ngram_similarity_reward/std": 0.29052525758743286, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 389.375, "completions/mean_terminated_length": 389.375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.07026180353546654, "frac_reward_zero_std": 0.0, "grad_norm": 0.05271059647202492, "learning_rate": 3.482142857142857e-06, "loss": -0.015, "num_tokens": 25023321.0, "reward": 1.4020805358886719, "reward_std": 0.50262451171875, "rewards/accuracy_reward/mean": 1.09375, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.30833062529563904, "rewards/ngram_similarity_reward/std": 0.2317408323287964, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 423.234375, "completions/mean_terminated_length": 423.234375, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.07070933094652047, "frac_reward_zero_std": 0.0, "grad_norm": 0.05880529060959816, "learning_rate": 3.504464285714286e-06, "loss": -0.0119, "num_tokens": 25222936.0, "reward": 3.830268383026123, "reward_std": 2.3002877235412598, "rewards/accuracy_reward/mean": 3.25, "rewards/accuracy_reward/std": 3.039423704147339, "rewards/ngram_similarity_reward/mean": 0.5802686214447021, "rewards/ngram_similarity_reward/std": 0.3722204566001892, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 518.96875, "completions/mean_terminated_length": 518.96875, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.0711568583575744, "frac_reward_zero_std": 0.0, "grad_norm": 0.040393684059381485, "learning_rate": 3.5267857142857147e-06, "loss": -0.0017, "num_tokens": 25403078.0, "reward": 6.280098915100098, "reward_std": 0.16385754942893982, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.7800989151000977, "rewards/ngram_similarity_reward/std": 0.33270302414894104, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 472.09375, "completions/mean_terminated_length": 472.09375, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.07160438576862833, "frac_reward_zero_std": 0.0, "grad_norm": 0.04423805698752403, "learning_rate": 3.5491071428571435e-06, "loss": 0.0217, "num_tokens": 25550540.0, "reward": 4.868206977844238, "reward_std": 2.067688465118408, "rewards/accuracy_reward/mean": 4.375, "rewards/accuracy_reward/std": 2.3603873252868652, "rewards/ngram_similarity_reward/mean": 0.4932067394256592, "rewards/ngram_similarity_reward/std": 0.2828584611415863, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 417.328125, "completions/mean_terminated_length": 417.328125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.07205191317968225, "frac_reward_zero_std": 0.0, "grad_norm": 0.04653473198413849, "learning_rate": 3.5714285714285718e-06, "loss": 0.0195, "num_tokens": 25798705.0, "reward": 4.217301845550537, "reward_std": 1.03849458694458, "rewards/accuracy_reward/mean": 3.703125, "rewards/accuracy_reward/std": 2.789889335632324, "rewards/ngram_similarity_reward/mean": 0.5141770243644714, "rewards/ngram_similarity_reward/std": 0.34160828590393066, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 444.296875, "completions/mean_terminated_length": 444.296875, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.07249944059073618, "frac_reward_zero_std": 0.0, "grad_norm": 0.04895284026861191, "learning_rate": 3.59375e-06, "loss": -0.028, "num_tokens": 25952308.0, "reward": 3.087433338165283, "reward_std": 0.8225682973861694, "rewards/accuracy_reward/mean": 2.640625, "rewards/accuracy_reward/std": 3.1816298961639404, "rewards/ngram_similarity_reward/mean": 0.44680821895599365, "rewards/ngram_similarity_reward/std": 0.2572721242904663, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 716.0, "completions/max_terminated_length": 716.0, "completions/mean_length": 486.390625, "completions/mean_terminated_length": 486.390625, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.07294696800179011, "frac_reward_zero_std": 0.0, "grad_norm": 0.03934817016124725, "learning_rate": 3.616071428571429e-06, "loss": 0.0129, "num_tokens": 26090429.0, "reward": 3.4666459560394287, "reward_std": 1.3379069566726685, "rewards/accuracy_reward/mean": 3.1875, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.2791461646556854, "rewards/ngram_similarity_reward/std": 0.29793688654899597, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 469.078125, "completions/mean_terminated_length": 469.078125, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.07339449541284404, "frac_reward_zero_std": 0.0, "grad_norm": 0.041254252195358276, "learning_rate": 3.638392857142857e-06, "loss": 0.0119, "num_tokens": 26243394.0, "reward": 2.9599480628967285, "reward_std": 0.6078701615333557, "rewards/accuracy_reward/mean": 2.46875, "rewards/accuracy_reward/std": 3.1671366691589355, "rewards/ngram_similarity_reward/mean": 0.49119803309440613, "rewards/ngram_similarity_reward/std": 0.3554515540599823, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 441.734375, "completions/mean_terminated_length": 441.734375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.07384202282389796, "frac_reward_zero_std": 0.0, "grad_norm": 0.04607221856713295, "learning_rate": 3.660714285714286e-06, "loss": 0.0285, "num_tokens": 26401953.0, "reward": 5.063331604003906, "reward_std": 1.823121428489685, "rewards/accuracy_reward/mean": 4.65625, "rewards/accuracy_reward/std": 2.102294683456421, "rewards/ngram_similarity_reward/mean": 0.4070812463760376, "rewards/ngram_similarity_reward/std": 0.3174854516983032, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 438.09375, "completions/mean_terminated_length": 438.09375, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.07428955023495189, "frac_reward_zero_std": 0.0, "grad_norm": 0.04993997514247894, "learning_rate": 3.6830357142857147e-06, "loss": 0.0015, "num_tokens": 26650343.0, "reward": 2.793623447418213, "reward_std": 0.7833993434906006, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.29362359642982483, "rewards/ngram_similarity_reward/std": 0.15472392737865448, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 845.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 511.984375, "completions/mean_terminated_length": 511.984375, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.07473707764600582, "frac_reward_zero_std": 0.0, "grad_norm": 0.04063967615365982, "learning_rate": 3.7053571428571434e-06, "loss": 0.0231, "num_tokens": 26812294.0, "reward": 3.3472909927368164, "reward_std": 0.7561337351799011, "rewards/accuracy_reward/mean": 2.78125, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.5660411715507507, "rewards/ngram_similarity_reward/std": 0.21464978158473969, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 817.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 569.171875, "completions/mean_terminated_length": 569.171875, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.07518460505705975, "frac_reward_zero_std": 0.0, "grad_norm": 0.04079766198992729, "learning_rate": 3.7276785714285718e-06, "loss": 0.0022, "num_tokens": 27019057.0, "reward": 2.664179801940918, "reward_std": 2.0348520278930664, "rewards/accuracy_reward/mean": 2.28125, "rewards/accuracy_reward/std": 3.0522892475128174, "rewards/ngram_similarity_reward/mean": 0.38292986154556274, "rewards/ngram_similarity_reward/std": 0.3396172821521759, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 947.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 523.265625, "completions/mean_terminated_length": 523.265625, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.07563213246811368, "frac_reward_zero_std": 0.0, "grad_norm": 0.051377370953559875, "learning_rate": 3.7500000000000005e-06, "loss": 0.0057, "num_tokens": 27158818.0, "reward": 2.863034963607788, "reward_std": 0.5506872534751892, "rewards/accuracy_reward/mean": 2.390625, "rewards/accuracy_reward/std": 3.0400354862213135, "rewards/ngram_similarity_reward/mean": 0.4724099636077881, "rewards/ngram_similarity_reward/std": 0.32486027479171753, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 408.765625, "completions/mean_terminated_length": 408.765625, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.0760796598791676, "frac_reward_zero_std": 0.0, "grad_norm": 0.047743599861860275, "learning_rate": 3.772321428571429e-06, "loss": 0.0161, "num_tokens": 27296803.0, "reward": 4.988167762756348, "reward_std": 2.043489456176758, "rewards/accuracy_reward/mean": 4.359375, "rewards/accuracy_reward/std": 2.3962087631225586, "rewards/ngram_similarity_reward/mean": 0.6287930607795715, "rewards/ngram_similarity_reward/std": 0.36715027689933777, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1090.0, "completions/max_terminated_length": 1090.0, "completions/mean_length": 661.5, "completions/mean_terminated_length": 661.5, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 0.07652718729022152, "frac_reward_zero_std": 0.0, "grad_norm": 0.03597584366798401, "learning_rate": 3.794642857142857e-06, "loss": 0.0039, "num_tokens": 27466067.0, "reward": 2.084123373031616, "reward_std": 2.295442819595337, "rewards/accuracy_reward/mean": 1.671875, "rewards/accuracy_reward/std": 3.0002894401550293, "rewards/ngram_similarity_reward/mean": 0.4122483730316162, "rewards/ngram_similarity_reward/std": 0.2563944458961487, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 976.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 516.0, "completions/mean_terminated_length": 516.0, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.07697471470127545, "frac_reward_zero_std": 0.0, "grad_norm": 0.040315892547369, "learning_rate": 3.816964285714286e-06, "loss": -0.0344, "num_tokens": 27624883.0, "reward": 3.6607158184051514, "reward_std": 1.7966370582580566, "rewards/accuracy_reward/mean": 2.953125, "rewards/accuracy_reward/std": 3.0075550079345703, "rewards/ngram_similarity_reward/mean": 0.7075908184051514, "rewards/ngram_similarity_reward/std": 0.3306605815887451, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 902.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 552.21875, "completions/mean_terminated_length": 552.21875, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.07742224211232938, "frac_reward_zero_std": 0.0, "grad_norm": 0.0378226637840271, "learning_rate": 3.839285714285715e-06, "loss": -0.03, "num_tokens": 27786833.0, "reward": 4.285251140594482, "reward_std": 0.4423610270023346, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.3790009617805481, "rewards/ngram_similarity_reward/std": 0.2819773256778717, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 511.34375, "completions/mean_terminated_length": 511.34375, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.0778697695233833, "frac_reward_zero_std": 0.0, "grad_norm": 0.03857111185789108, "learning_rate": 3.8616071428571434e-06, "loss": -0.0041, "num_tokens": 27967703.0, "reward": 3.4933242797851562, "reward_std": 2.03798508644104, "rewards/accuracy_reward/mean": 2.921875, "rewards/accuracy_reward/std": 3.1562721729278564, "rewards/ngram_similarity_reward/mean": 0.5714495182037354, "rewards/ngram_similarity_reward/std": 0.2844545841217041, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 355.453125, "completions/mean_terminated_length": 355.453125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.07831729693443723, "frac_reward_zero_std": 0.0, "grad_norm": 0.06271856278181076, "learning_rate": 3.883928571428572e-06, "loss": -0.0087, "num_tokens": 28067812.0, "reward": 2.6634888648986816, "reward_std": 0.49967533349990845, "rewards/accuracy_reward/mean": 2.171875, "rewards/accuracy_reward/std": 3.271108388900757, "rewards/ngram_similarity_reward/mean": 0.49161386489868164, "rewards/ngram_similarity_reward/std": 0.3125540614128113, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1105.0, "completions/max_terminated_length": 1105.0, "completions/mean_length": 601.296875, "completions/mean_terminated_length": 601.296875, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.07876482434549116, "frac_reward_zero_std": 0.0, "grad_norm": 0.0370563268661499, "learning_rate": 3.90625e-06, "loss": -0.0243, "num_tokens": 28227879.0, "reward": 3.002413034439087, "reward_std": 0.2257809340953827, "rewards/accuracy_reward/mean": 2.421875, "rewards/accuracy_reward/std": 3.113231897354126, "rewards/ngram_similarity_reward/mean": 0.5805378556251526, "rewards/ngram_similarity_reward/std": 0.343523770570755, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 438.6875, "completions/mean_terminated_length": 438.6875, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.07921235175654509, "frac_reward_zero_std": 0.0, "grad_norm": 0.04832509532570839, "learning_rate": 3.928571428571429e-06, "loss": 0.0243, "num_tokens": 28378259.0, "reward": -0.3152479827404022, "reward_std": 0.05669836327433586, "rewards/accuracy_reward/mean": -0.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.18475203216075897, "rewards/ngram_similarity_reward/std": 0.07380875945091248, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 423.09375, "completions/mean_terminated_length": 423.09375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.07965987916759902, "frac_reward_zero_std": 0.0, "grad_norm": 0.046438515186309814, "learning_rate": 3.950892857142858e-06, "loss": 0.0181, "num_tokens": 28553577.0, "reward": 1.5509165525436401, "reward_std": 2.002094030380249, "rewards/accuracy_reward/mean": 1.125, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.4259165823459625, "rewards/ngram_similarity_reward/std": 0.3136703372001648, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 403.0625, "completions/mean_terminated_length": 403.0625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.08010740657865294, "frac_reward_zero_std": 0.0, "grad_norm": 0.04984445497393608, "learning_rate": 3.9732142857142855e-06, "loss": -0.0088, "num_tokens": 28694173.0, "reward": 1.3050878047943115, "reward_std": 2.3252506256103516, "rewards/accuracy_reward/mean": 0.953125, "rewards/accuracy_reward/std": 2.6543147563934326, "rewards/ngram_similarity_reward/mean": 0.35196271538734436, "rewards/ngram_similarity_reward/std": 0.29783403873443604, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 515.40625, "completions/mean_terminated_length": 515.40625, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.08055493398970687, "frac_reward_zero_std": 0.0, "grad_norm": 0.04675843566656113, "learning_rate": 3.995535714285715e-06, "loss": 0.0441, "num_tokens": 28901319.0, "reward": 3.3299739360809326, "reward_std": 1.4819881916046143, "rewards/accuracy_reward/mean": 2.5625, "rewards/accuracy_reward/std": 3.059593439102173, "rewards/ngram_similarity_reward/mean": 0.7674739360809326, "rewards/ngram_similarity_reward/std": 0.2389378547668457, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 536.984375, "completions/mean_terminated_length": 536.984375, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.0810024614007608, "frac_reward_zero_std": 0.0, "grad_norm": 0.036061979830265045, "learning_rate": 4.017857142857143e-06, "loss": 0.0345, "num_tokens": 29065846.0, "reward": 4.682528972625732, "reward_std": 1.3342541456222534, "rewards/accuracy_reward/mean": 4.1875, "rewards/accuracy_reward/std": 2.5, "rewards/ngram_similarity_reward/mean": 0.4950290620326996, "rewards/ngram_similarity_reward/std": 0.3298114836215973, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 769.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 465.796875, "completions/mean_terminated_length": 465.796875, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.08144998881181473, "frac_reward_zero_std": 0.0, "grad_norm": 0.0419074110686779, "learning_rate": 4.040178571428572e-06, "loss": -0.0185, "num_tokens": 29235481.0, "reward": 4.840533256530762, "reward_std": 1.6569175720214844, "rewards/accuracy_reward/mean": 4.5, "rewards/accuracy_reward/std": 2.350278615951538, "rewards/ngram_similarity_reward/mean": 0.3405328094959259, "rewards/ngram_similarity_reward/std": 0.31163933873176575, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 897.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 498.21875, "completions/mean_terminated_length": 498.21875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.08189751622286866, "frac_reward_zero_std": 0.0, "grad_norm": 0.04102945700287819, "learning_rate": 4.0625000000000005e-06, "loss": 0.0178, "num_tokens": 29347175.0, "reward": 2.4598028659820557, "reward_std": 0.7249634265899658, "rewards/accuracy_reward/mean": 2.125, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.3348028361797333, "rewards/ngram_similarity_reward/std": 0.3524908423423767, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 518.234375, "completions/mean_terminated_length": 518.234375, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.08234504363392257, "frac_reward_zero_std": 0.0, "grad_norm": 0.03980467468500137, "learning_rate": 4.084821428571429e-06, "loss": 0.0118, "num_tokens": 29515670.0, "reward": 2.591090440750122, "reward_std": 1.683458924293518, "rewards/accuracy_reward/mean": 2.34375, "rewards/accuracy_reward/std": 3.1983067989349365, "rewards/ngram_similarity_reward/mean": 0.24734047055244446, "rewards/ngram_similarity_reward/std": 0.12441064417362213, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 441.90625, "completions/mean_terminated_length": 441.90625, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.0827925710449765, "frac_reward_zero_std": 0.0, "grad_norm": 0.046047866344451904, "learning_rate": 4.107142857142857e-06, "loss": -0.0203, "num_tokens": 29651072.0, "reward": 2.0882480144500732, "reward_std": 1.8428391218185425, "rewards/accuracy_reward/mean": 1.65625, "rewards/accuracy_reward/std": 2.9016621112823486, "rewards/ngram_similarity_reward/mean": 0.43199795484542847, "rewards/ngram_similarity_reward/std": 0.2237108051776886, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 800.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 525.78125, "completions/mean_terminated_length": 525.78125, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.08324009845603043, "frac_reward_zero_std": 0.0, "grad_norm": 0.043506283313035965, "learning_rate": 4.129464285714286e-06, "loss": 0.0176, "num_tokens": 29813602.0, "reward": 1.362823486328125, "reward_std": 0.18018248677253723, "rewards/accuracy_reward/mean": 0.96875, "rewards/accuracy_reward/std": 2.6425621509552, "rewards/ngram_similarity_reward/mean": 0.39407360553741455, "rewards/ngram_similarity_reward/std": 0.4013862907886505, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 507.265625, "completions/mean_terminated_length": 507.265625, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.08368762586708436, "frac_reward_zero_std": 0.0, "grad_norm": 0.03793657198548317, "learning_rate": 4.151785714285715e-06, "loss": -0.0095, "num_tokens": 30014931.0, "reward": 2.19004225730896, "reward_std": 0.8865464925765991, "rewards/accuracy_reward/mean": 1.75, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.4400422275066376, "rewards/ngram_similarity_reward/std": 0.27053165435791016, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 503.515625, "completions/mean_terminated_length": 503.515625, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.08413515327813828, "frac_reward_zero_std": 0.0, "grad_norm": 0.03992355987429619, "learning_rate": 4.174107142857143e-06, "loss": -0.01, "num_tokens": 30162724.0, "reward": 4.297519683837891, "reward_std": 1.1939719915390015, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.5787696838378906, "rewards/ngram_similarity_reward/std": 0.39754706621170044, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 517.125, "completions/mean_terminated_length": 517.125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.08458268068919221, "frac_reward_zero_std": 0.0, "grad_norm": 0.04642122983932495, "learning_rate": 4.196428571428572e-06, "loss": 0.0242, "num_tokens": 30309724.0, "reward": 1.3255119323730469, "reward_std": 0.6579290628433228, "rewards/accuracy_reward/mean": 0.84375, "rewards/accuracy_reward/std": 2.607795238494873, "rewards/ngram_similarity_reward/mean": 0.4817619323730469, "rewards/ngram_similarity_reward/std": 0.3493627607822418, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 486.3125, "completions/mean_terminated_length": 486.3125, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.08503020810024614, "frac_reward_zero_std": 0.0, "grad_norm": 0.042718902230262756, "learning_rate": 4.21875e-06, "loss": 0.012, "num_tokens": 30467344.0, "reward": 1.8112475872039795, "reward_std": 0.8907710313796997, "rewards/accuracy_reward/mean": 1.21875, "rewards/accuracy_reward/std": 3.041870594024658, "rewards/ngram_similarity_reward/mean": 0.5924974679946899, "rewards/ngram_similarity_reward/std": 0.3628630042076111, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 524.953125, "completions/mean_terminated_length": 524.953125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.08547773551130007, "frac_reward_zero_std": 0.0, "grad_norm": 0.0406482107937336, "learning_rate": 4.241071428571429e-06, "loss": -0.0003, "num_tokens": 30606797.0, "reward": -0.15879753232002258, "reward_std": 0.19097502529621124, "rewards/accuracy_reward/mean": -0.53125, "rewards/accuracy_reward/std": 0.17536810040473938, "rewards/ngram_similarity_reward/mean": 0.3724524974822998, "rewards/ngram_similarity_reward/std": 0.1509428173303604, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 413.75, "completions/mean_terminated_length": 413.75, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.085925262922354, "frac_reward_zero_std": 0.0, "grad_norm": 0.04431195929646492, "learning_rate": 4.2633928571428576e-06, "loss": 0.0018, "num_tokens": 30774717.0, "reward": 3.4124279022216797, "reward_std": 2.0548229217529297, "rewards/accuracy_reward/mean": 2.96875, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.44367799162864685, "rewards/ngram_similarity_reward/std": 0.3273553252220154, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 441.890625, "completions/mean_terminated_length": 441.890625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.08637279033340792, "frac_reward_zero_std": 0.0, "grad_norm": 0.044728197157382965, "learning_rate": 4.2857142857142855e-06, "loss": -0.0185, "num_tokens": 30937206.0, "reward": 4.989612579345703, "reward_std": 0.7707114815711975, "rewards/accuracy_reward/mean": 4.28125, "rewards/accuracy_reward/std": 2.4330317974090576, "rewards/ngram_similarity_reward/mean": 0.7083626985549927, "rewards/ngram_similarity_reward/std": 0.34404411911964417, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 453.703125, "completions/mean_terminated_length": 453.703125, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.08682031774446185, "frac_reward_zero_std": 0.0, "grad_norm": 0.044507987797260284, "learning_rate": 4.308035714285715e-06, "loss": 0.0035, "num_tokens": 31065091.0, "reward": 2.8430004119873047, "reward_std": 0.5047019720077515, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.2492506206035614, "rewards/ngram_similarity_reward/std": 0.21281088888645172, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 387.84375, "completions/mean_terminated_length": 387.84375, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.08726784515551578, "frac_reward_zero_std": 0.0, "grad_norm": 0.04783143848180771, "learning_rate": 4.330357142857143e-06, "loss": -0.0006, "num_tokens": 31185113.0, "reward": 3.92765212059021, "reward_std": 0.851255476474762, "rewards/accuracy_reward/mean": 3.4375, "rewards/accuracy_reward/std": 2.872281312942505, "rewards/ngram_similarity_reward/mean": 0.4901522696018219, "rewards/ngram_similarity_reward/std": 0.4444206655025482, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 494.5, "completions/mean_terminated_length": 494.5, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.08771537256656971, "frac_reward_zero_std": 0.0, "grad_norm": 0.044673748314380646, "learning_rate": 4.352678571428572e-06, "loss": -0.0218, "num_tokens": 31339833.0, "reward": 2.257666826248169, "reward_std": 2.731825828552246, "rewards/accuracy_reward/mean": 1.734375, "rewards/accuracy_reward/std": 3.264733076095581, "rewards/ngram_similarity_reward/mean": 0.5232917666435242, "rewards/ngram_similarity_reward/std": 0.2643654942512512, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 464.875, "completions/mean_terminated_length": 464.875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.08816289997762362, "frac_reward_zero_std": 0.0, "grad_norm": 0.04198407754302025, "learning_rate": 4.3750000000000005e-06, "loss": -0.0096, "num_tokens": 31491393.0, "reward": 2.045973300933838, "reward_std": 1.1696488857269287, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 2.872281312942505, "rewards/ngram_similarity_reward/mean": 0.4834733009338379, "rewards/ngram_similarity_reward/std": 0.34941643476486206, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 470.328125, "completions/mean_terminated_length": 470.328125, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.08861042738867755, "frac_reward_zero_std": 0.0, "grad_norm": 0.04621649533510208, "learning_rate": 4.397321428571429e-06, "loss": 0.0005, "num_tokens": 31621558.0, "reward": 3.066758632659912, "reward_std": 2.3714590072631836, "rewards/accuracy_reward/mean": 2.75, "rewards/accuracy_reward/std": 3.0498504638671875, "rewards/ngram_similarity_reward/mean": 0.3167587220668793, "rewards/ngram_similarity_reward/std": 0.28607502579689026, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 510.5, "completions/mean_terminated_length": 510.5, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.08905795479973148, "frac_reward_zero_std": 0.0, "grad_norm": 0.036292556673288345, "learning_rate": 4.419642857142857e-06, "loss": -0.0053, "num_tokens": 31739878.0, "reward": 4.306193828582764, "reward_std": 0.6418611407279968, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.4936937987804413, "rewards/ngram_similarity_reward/std": 0.41456860303878784, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1107.0, "completions/max_terminated_length": 1107.0, "completions/mean_length": 559.65625, "completions/mean_terminated_length": 559.65625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.08950548221078541, "frac_reward_zero_std": 0.0, "grad_norm": 0.042119771242141724, "learning_rate": 4.441964285714286e-06, "loss": 0.0317, "num_tokens": 31892224.0, "reward": 5.85109806060791, "reward_std": 1.0437448024749756, "rewards/accuracy_reward/mean": 5.28125, "rewards/accuracy_reward/std": 1.227576732635498, "rewards/ngram_similarity_reward/mean": 0.569847583770752, "rewards/ngram_similarity_reward/std": 0.36416593194007874, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1417.0, "completions/max_terminated_length": 1417.0, "completions/mean_length": 559.0625, "completions/mean_terminated_length": 559.0625, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.08995300962183934, "frac_reward_zero_std": 0.0, "grad_norm": 0.042125869542360306, "learning_rate": 4.464285714285715e-06, "loss": 0.0102, "num_tokens": 32027972.0, "reward": 4.4564409255981445, "reward_std": 1.9459412097930908, "rewards/accuracy_reward/mean": 3.984375, "rewards/accuracy_reward/std": 2.648702621459961, "rewards/ngram_similarity_reward/mean": 0.47206610441207886, "rewards/ngram_similarity_reward/std": 0.2877153158187866, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 488.859375, "completions/mean_terminated_length": 488.859375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.09040053703289326, "frac_reward_zero_std": 0.0, "grad_norm": 0.04488145560026169, "learning_rate": 4.486607142857143e-06, "loss": -0.0141, "num_tokens": 32181947.0, "reward": 3.771817207336426, "reward_std": 1.2006248235702515, "rewards/accuracy_reward/mean": 3.53125, "rewards/accuracy_reward/std": 2.9545164108276367, "rewards/ngram_similarity_reward/mean": 0.24056729674339294, "rewards/ngram_similarity_reward/std": 0.1451207399368286, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 863.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 529.515625, "completions/mean_terminated_length": 529.515625, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.09084806444394719, "frac_reward_zero_std": 0.0, "grad_norm": 0.034034159034490585, "learning_rate": 4.508928571428572e-06, "loss": 0.0283, "num_tokens": 32329980.0, "reward": 4.561273097991943, "reward_std": 1.712855577468872, "rewards/accuracy_reward/mean": 4.28125, "rewards/accuracy_reward/std": 2.4330317974090576, "rewards/ngram_similarity_reward/mean": 0.28002309799194336, "rewards/ngram_similarity_reward/std": 0.19624720513820648, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 423.28125, "completions/mean_terminated_length": 423.28125, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.09129559185500112, "frac_reward_zero_std": 0.25, "grad_norm": 0.04006795212626457, "learning_rate": 4.53125e-06, "loss": -0.027, "num_tokens": 32454158.0, "reward": 5.31807279586792, "reward_std": 1.2390682697296143, "rewards/accuracy_reward/mean": 4.71875, "rewards/accuracy_reward/std": 2.0890398025512695, "rewards/ngram_similarity_reward/mean": 0.599323034286499, "rewards/ngram_similarity_reward/std": 0.47235941886901855, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 513.28125, "completions/mean_terminated_length": 513.28125, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.09174311926605505, "frac_reward_zero_std": 0.0, "grad_norm": 0.039949022233486176, "learning_rate": 4.553571428571429e-06, "loss": -0.0302, "num_tokens": 32580416.0, "reward": 5.481403827667236, "reward_std": 1.2307987213134766, "rewards/accuracy_reward/mean": 4.9375, "rewards/accuracy_reward/std": 1.7627090215682983, "rewards/ngram_similarity_reward/mean": 0.5439037084579468, "rewards/ngram_similarity_reward/std": 0.31527870893478394, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 464.25, "completions/mean_terminated_length": 464.25, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.09219064667710897, "frac_reward_zero_std": 0.0, "grad_norm": 0.05170599743723869, "learning_rate": 4.5758928571428575e-06, "loss": 0.0335, "num_tokens": 32767136.0, "reward": -0.4308336675167084, "reward_std": 0.19196897745132446, "rewards/accuracy_reward/mean": -0.765625, "rewards/accuracy_reward/std": 0.44515693187713623, "rewards/ngram_similarity_reward/mean": 0.334791362285614, "rewards/ngram_similarity_reward/std": 0.3124901056289673, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 457.359375, "completions/mean_terminated_length": 457.359375, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.0926381740881629, "frac_reward_zero_std": 0.0, "grad_norm": 0.0499611496925354, "learning_rate": 4.5982142857142854e-06, "loss": 0.0122, "num_tokens": 32950759.0, "reward": 2.8177924156188965, "reward_std": 1.109889268875122, "rewards/accuracy_reward/mean": 2.28125, "rewards/accuracy_reward/std": 3.0522892475128174, "rewards/ngram_similarity_reward/mean": 0.5365424156188965, "rewards/ngram_similarity_reward/std": 0.3115372061729431, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 446.234375, "completions/mean_terminated_length": 446.234375, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.09308570149921683, "frac_reward_zero_std": 0.0, "grad_norm": 0.043924376368522644, "learning_rate": 4.620535714285715e-06, "loss": 0.0304, "num_tokens": 33076294.0, "reward": 3.9390711784362793, "reward_std": 1.7528796195983887, "rewards/accuracy_reward/mean": 3.5, "rewards/accuracy_reward/std": 2.8894994258880615, "rewards/ngram_similarity_reward/mean": 0.43907126784324646, "rewards/ngram_similarity_reward/std": 0.3475610017776489, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 469.359375, "completions/mean_terminated_length": 469.359375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.09353322891027076, "frac_reward_zero_std": 0.0, "grad_norm": 0.037999190390110016, "learning_rate": 4.642857142857144e-06, "loss": -0.0256, "num_tokens": 33247661.0, "reward": 3.102001905441284, "reward_std": 1.3292903900146484, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6020020246505737, "rewards/ngram_similarity_reward/std": 0.3597540855407715, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1063.0, "completions/max_terminated_length": 1063.0, "completions/mean_length": 459.5625, "completions/mean_terminated_length": 459.5625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.09398075632132469, "frac_reward_zero_std": 0.0, "grad_norm": 0.0505104660987854, "learning_rate": 4.665178571428572e-06, "loss": 0.0413, "num_tokens": 33433393.0, "reward": 1.5454846620559692, "reward_std": 0.5603064894676208, "rewards/accuracy_reward/mean": 1.09375, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.4517347812652588, "rewards/ngram_similarity_reward/std": 0.4080568552017212, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 465.703125, "completions/mean_terminated_length": 465.703125, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.0944282837323786, "frac_reward_zero_std": 0.0, "grad_norm": 0.04667956009507179, "learning_rate": 4.6875000000000004e-06, "loss": -0.0065, "num_tokens": 33562830.0, "reward": 3.4275259971618652, "reward_std": 1.291181206703186, "rewards/accuracy_reward/mean": 2.84375, "rewards/accuracy_reward/std": 3.0405657291412354, "rewards/ngram_similarity_reward/mean": 0.5837761163711548, "rewards/ngram_similarity_reward/std": 0.2460503727197647, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1027.0, "completions/max_terminated_length": 1027.0, "completions/mean_length": 626.828125, "completions/mean_terminated_length": 626.828125, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.09487581114343253, "frac_reward_zero_std": 0.0, "grad_norm": 0.04065338522195816, "learning_rate": 4.709821428571429e-06, "loss": -0.0215, "num_tokens": 33746627.0, "reward": 1.388837218284607, "reward_std": 0.7045068740844727, "rewards/accuracy_reward/mean": 0.921875, "rewards/accuracy_reward/std": 2.915773868560791, "rewards/ngram_similarity_reward/mean": 0.46696218848228455, "rewards/ngram_similarity_reward/std": 0.3326452374458313, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 505.671875, "completions/mean_terminated_length": 505.671875, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.09532333855448646, "frac_reward_zero_std": 0.0, "grad_norm": 0.04317639395594597, "learning_rate": 4.732142857142857e-06, "loss": -0.0012, "num_tokens": 33925358.0, "reward": 1.106865406036377, "reward_std": 0.2781585454940796, "rewards/accuracy_reward/mean": 0.90625, "rewards/accuracy_reward/std": 2.688710927963257, "rewards/ngram_similarity_reward/mean": 0.2006155252456665, "rewards/ngram_similarity_reward/std": 0.17418237030506134, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 519.125, "completions/mean_terminated_length": 519.125, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.0957708659655404, "frac_reward_zero_std": 0.0, "grad_norm": 0.038894299417734146, "learning_rate": 4.754464285714286e-06, "loss": 0.0314, "num_tokens": 34096694.0, "reward": 2.903021812438965, "reward_std": 0.17010778188705444, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.40302157402038574, "rewards/ngram_similarity_reward/std": 0.1969473510980606, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1753.0, "completions/max_terminated_length": 1753.0, "completions/mean_length": 649.09375, "completions/mean_terminated_length": 649.09375, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.09621839337659431, "frac_reward_zero_std": 0.0, "grad_norm": 0.0412026047706604, "learning_rate": 4.776785714285715e-06, "loss": 0.0123, "num_tokens": 34303068.0, "reward": 0.10277429223060608, "reward_std": 0.8195408582687378, "rewards/accuracy_reward/mean": -0.171875, "rewards/accuracy_reward/std": 1.491294264793396, "rewards/ngram_similarity_reward/mean": 0.27464932203292847, "rewards/ngram_similarity_reward/std": 0.15213480591773987, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 535.015625, "completions/mean_terminated_length": 535.015625, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.09666592078764824, "frac_reward_zero_std": 0.0, "grad_norm": 0.03897232189774513, "learning_rate": 4.799107142857143e-06, "loss": 0.0136, "num_tokens": 34470301.0, "reward": 3.1363539695739746, "reward_std": 1.6776666641235352, "rewards/accuracy_reward/mean": 2.796875, "rewards/accuracy_reward/std": 3.0998191833496094, "rewards/ngram_similarity_reward/mean": 0.3394790291786194, "rewards/ngram_similarity_reward/std": 0.2934320271015167, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1653.0, "completions/max_terminated_length": 1653.0, "completions/mean_length": 538.1875, "completions/mean_terminated_length": 538.1875, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.09711344819870217, "frac_reward_zero_std": 0.0, "grad_norm": 0.03999040275812149, "learning_rate": 4.821428571428572e-06, "loss": 0.0359, "num_tokens": 34629321.0, "reward": 3.709991693496704, "reward_std": 1.4276716709136963, "rewards/accuracy_reward/mean": 3.375, "rewards/accuracy_reward/std": 2.9680843353271484, "rewards/ngram_similarity_reward/mean": 0.33499157428741455, "rewards/ngram_similarity_reward/std": 0.27073392271995544, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 418.40625, "completions/mean_terminated_length": 418.40625, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.0975609756097561, "frac_reward_zero_std": 0.0, "grad_norm": 0.05568109452724457, "learning_rate": 4.84375e-06, "loss": 0.0013, "num_tokens": 34798147.0, "reward": 2.7140893936157227, "reward_std": 1.6045554876327515, "rewards/accuracy_reward/mean": 1.921875, "rewards/accuracy_reward/std": 3.0953354835510254, "rewards/ngram_similarity_reward/mean": 0.7922143936157227, "rewards/ngram_similarity_reward/std": 0.2839650809764862, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 734.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 512.375, "completions/mean_terminated_length": 512.375, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.09800850302081003, "frac_reward_zero_std": 0.0, "grad_norm": 0.03910877928137779, "learning_rate": 4.866071428571429e-06, "loss": -0.0238, "num_tokens": 34947019.0, "reward": 4.046869277954102, "reward_std": 0.9176902770996094, "rewards/accuracy_reward/mean": 3.515625, "rewards/accuracy_reward/std": 2.8646292686462402, "rewards/ngram_similarity_reward/mean": 0.5312443971633911, "rewards/ngram_similarity_reward/std": 0.34237968921661377, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 463.609375, "completions/mean_terminated_length": 438.4603576660156, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.09845603043186395, "frac_reward_zero_std": 0.0, "grad_norm": 0.053166162222623825, "learning_rate": 4.8883928571428575e-06, "loss": 0.0166, "num_tokens": 35168546.0, "reward": 0.9042137265205383, "reward_std": 1.8351986408233643, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 2.607795238494873, "rewards/ngram_similarity_reward/mean": 0.2479637861251831, "rewards/ngram_similarity_reward/std": 0.16935303807258606, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 565.671875, "completions/mean_terminated_length": 565.671875, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.09890355784291788, "frac_reward_zero_std": 0.0, "grad_norm": 0.048766668885946274, "learning_rate": 4.910714285714286e-06, "loss": -0.0135, "num_tokens": 35410541.0, "reward": 4.157958984375, "reward_std": 0.9653618931770325, "rewards/accuracy_reward/mean": 3.78125, "rewards/accuracy_reward/std": 2.7744226455688477, "rewards/ngram_similarity_reward/mean": 0.3767091631889343, "rewards/ngram_similarity_reward/std": 0.25629374384880066, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 419.78125, "completions/mean_terminated_length": 419.78125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.09935108525397181, "frac_reward_zero_std": 0.0, "grad_norm": 0.04921666532754898, "learning_rate": 4.933035714285715e-06, "loss": -0.0085, "num_tokens": 35553471.0, "reward": 5.763852119445801, "reward_std": 0.57377028465271, "rewards/accuracy_reward/mean": 5.3125, "rewards/accuracy_reward/std": 1.0522085428237915, "rewards/ngram_similarity_reward/mean": 0.4513515830039978, "rewards/ngram_similarity_reward/std": 0.20995618402957916, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 418.765625, "completions/mean_terminated_length": 418.765625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.09979861266502574, "frac_reward_zero_std": 0.0, "grad_norm": 0.05048259720206261, "learning_rate": 4.955357142857144e-06, "loss": 0.0148, "num_tokens": 35726384.0, "reward": 1.6927458047866821, "reward_std": 0.9506723284721375, "rewards/accuracy_reward/mean": 1.34375, "rewards/accuracy_reward/std": 2.9450995922088623, "rewards/ngram_similarity_reward/mean": 0.3489959239959717, "rewards/ngram_similarity_reward/std": 0.31981024146080017, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 399.0, "completions/mean_terminated_length": 399.0, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.10024614007607965, "frac_reward_zero_std": 0.0, "grad_norm": 0.0539417527616024, "learning_rate": 4.977678571428572e-06, "loss": 0.0187, "num_tokens": 35859280.0, "reward": 4.095028877258301, "reward_std": 1.9678847789764404, "rewards/accuracy_reward/mean": 3.890625, "rewards/accuracy_reward/std": 2.6998953819274902, "rewards/ngram_similarity_reward/mean": 0.20440366864204407, "rewards/ngram_similarity_reward/std": 0.10812616348266602, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 497.859375, "completions/mean_terminated_length": 497.859375, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.10069366748713358, "frac_reward_zero_std": 0.0, "grad_norm": 0.03823324292898178, "learning_rate": 5e-06, "loss": 0.0313, "num_tokens": 35986151.0, "reward": 3.178346633911133, "reward_std": 1.6972424983978271, "rewards/accuracy_reward/mean": 2.765625, "rewards/accuracy_reward/std": 3.0302298069000244, "rewards/ngram_similarity_reward/mean": 0.41272154450416565, "rewards/ngram_similarity_reward/std": 0.2593502402305603, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 691.0, "completions/max_terminated_length": 691.0, "completions/mean_length": 430.3125, "completions/mean_terminated_length": 430.3125, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.10114119489818751, "frac_reward_zero_std": 0.0, "grad_norm": 0.05047139897942543, "learning_rate": 4.999999384125436e-06, "loss": -0.0278, "num_tokens": 36197019.0, "reward": 3.2247214317321777, "reward_std": 1.6524477005004883, "rewards/accuracy_reward/mean": 2.75, "rewards/accuracy_reward/std": 3.0498504638671875, "rewards/ngram_similarity_reward/mean": 0.4747212529182434, "rewards/ngram_similarity_reward/std": 0.32453668117523193, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 541.609375, "completions/mean_terminated_length": 541.609375, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.10158872230924144, "frac_reward_zero_std": 0.0, "grad_norm": 0.042722173035144806, "learning_rate": 4.999997536502079e-06, "loss": 0.0054, "num_tokens": 36336962.0, "reward": 1.242129921913147, "reward_std": 1.0416669845581055, "rewards/accuracy_reward/mean": 0.828125, "rewards/accuracy_reward/std": 2.6189463138580322, "rewards/ngram_similarity_reward/mean": 0.41400498151779175, "rewards/ngram_similarity_reward/std": 0.20894940197467804, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 487.015625, "completions/mean_terminated_length": 487.015625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.10203624972029537, "frac_reward_zero_std": 0.0, "grad_norm": 0.047618549317121506, "learning_rate": 4.99999445713094e-06, "loss": 0.0304, "num_tokens": 36472643.0, "reward": 4.7300238609313965, "reward_std": 1.587036371231079, "rewards/accuracy_reward/mean": 4.28125, "rewards/accuracy_reward/std": 2.4330317974090576, "rewards/ngram_similarity_reward/mean": 0.44877392053604126, "rewards/ngram_similarity_reward/std": 0.2516137957572937, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 757.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 490.90625, "completions/mean_terminated_length": 490.90625, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.10248377713134929, "frac_reward_zero_std": 0.0, "grad_norm": 0.04048790782690048, "learning_rate": 4.9999901460137076e-06, "loss": -0.0135, "num_tokens": 36616349.0, "reward": 3.358356475830078, "reward_std": 2.4169936180114746, "rewards/accuracy_reward/mean": 3.0625, "rewards/accuracy_reward/std": 2.9700891971588135, "rewards/ngram_similarity_reward/mean": 0.2958567142486572, "rewards/ngram_similarity_reward/std": 0.25693514943122864, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 844.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 431.453125, "completions/mean_terminated_length": 431.453125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.10293130454240322, "frac_reward_zero_std": 0.25, "grad_norm": 0.051024310290813446, "learning_rate": 4.999984603152739e-06, "loss": -0.0329, "num_tokens": 36745498.0, "reward": 3.4914653301239014, "reward_std": 0.9072697758674622, "rewards/accuracy_reward/mean": 2.796875, "rewards/accuracy_reward/std": 3.2055492401123047, "rewards/ngram_similarity_reward/mean": 0.6945902109146118, "rewards/ngram_similarity_reward/std": 0.3811330497264862, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 496.0625, "completions/mean_terminated_length": 496.0625, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.10337883195345715, "frac_reward_zero_std": 0.0, "grad_norm": 0.043420251458883286, "learning_rate": 4.999977828551071e-06, "loss": 0.028, "num_tokens": 36900382.0, "reward": 5.204824447631836, "reward_std": 1.7110671997070312, "rewards/accuracy_reward/mean": 4.734375, "rewards/accuracy_reward/std": 2.04506516456604, "rewards/ngram_similarity_reward/mean": 0.4704493582248688, "rewards/ngram_similarity_reward/std": 0.3356776833534241, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 431.296875, "completions/mean_terminated_length": 431.296875, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.10382635936451108, "frac_reward_zero_std": 0.0, "grad_norm": 0.046472541987895966, "learning_rate": 4.99996982221241e-06, "loss": 0.0075, "num_tokens": 37060545.0, "reward": 3.730433702468872, "reward_std": 2.3555819988250732, "rewards/accuracy_reward/mean": 3.359375, "rewards/accuracy_reward/std": 3.1007792949676514, "rewards/ngram_similarity_reward/mean": 0.37105870246887207, "rewards/ngram_similarity_reward/std": 0.29995277523994446, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 473.1875, "completions/mean_terminated_length": 473.1875, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.104273886775565, "frac_reward_zero_std": 0.0, "grad_norm": 0.04544816538691521, "learning_rate": 4.999960584141141e-06, "loss": 0.011, "num_tokens": 37187629.0, "reward": 2.8511152267456055, "reward_std": 0.46699586510658264, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.44486522674560547, "rewards/ngram_similarity_reward/std": 0.3931611478328705, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 444.375, "completions/mean_terminated_length": 444.375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.10472141418661893, "frac_reward_zero_std": 0.0, "grad_norm": 0.038571011275053024, "learning_rate": 4.9999501143423195e-06, "loss": -0.007, "num_tokens": 37348581.0, "reward": 3.952913761138916, "reward_std": 1.3186864852905273, "rewards/accuracy_reward/mean": 3.328125, "rewards/accuracy_reward/std": 2.9252848625183105, "rewards/ngram_similarity_reward/mean": 0.6247888803482056, "rewards/ngram_similarity_reward/std": 0.35860466957092285, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 497.359375, "completions/mean_terminated_length": 497.359375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.10516894159767286, "frac_reward_zero_std": 0.0, "grad_norm": 0.05110171064734459, "learning_rate": 4.999938412821679e-06, "loss": 0.0098, "num_tokens": 37513564.0, "reward": 3.3419313430786133, "reward_std": 0.8716861009597778, "rewards/accuracy_reward/mean": 2.875, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.4669312834739685, "rewards/ngram_similarity_reward/std": 0.30982959270477295, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 516.1875, "completions/mean_terminated_length": 516.1875, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.10561646900872679, "frac_reward_zero_std": 0.0, "grad_norm": 0.03886553272604942, "learning_rate": 4.999925479585624e-06, "loss": 0.0077, "num_tokens": 37677224.0, "reward": 5.317903518676758, "reward_std": 1.877345323562622, "rewards/accuracy_reward/mean": 4.65625, "rewards/accuracy_reward/std": 2.102294683456421, "rewards/ngram_similarity_reward/mean": 0.6616535782814026, "rewards/ngram_similarity_reward/std": 0.3614196181297302, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 472.0625, "completions/mean_terminated_length": 472.0625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.10606399641978072, "frac_reward_zero_std": 0.0, "grad_norm": 0.04324449226260185, "learning_rate": 4.999911314641236e-06, "loss": -0.0293, "num_tokens": 37799580.0, "reward": 6.1421403884887695, "reward_std": 0.5494071841239929, "rewards/accuracy_reward/mean": 5.40625, "rewards/accuracy_reward/std": 0.7500000596046448, "rewards/ngram_similarity_reward/mean": 0.7358900308609009, "rewards/ngram_similarity_reward/std": 0.3383578658103943, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 422.125, "completions/mean_terminated_length": 422.125, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.10651152383083463, "frac_reward_zero_std": 0.0, "grad_norm": 0.046920593827962875, "learning_rate": 4.999895917996267e-06, "loss": -0.014, "num_tokens": 37940724.0, "reward": 4.145472526550293, "reward_std": 0.9606025218963623, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.4267226457595825, "rewards/ngram_similarity_reward/std": 0.318371057510376, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 455.734375, "completions/mean_terminated_length": 455.734375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.10695905124188856, "frac_reward_zero_std": 0.0, "grad_norm": 0.04800686240196228, "learning_rate": 4.99987928965915e-06, "loss": 0.0262, "num_tokens": 38096931.0, "reward": 2.8433384895324707, "reward_std": 2.219339370727539, "rewards/accuracy_reward/mean": 2.359375, "rewards/accuracy_reward/std": 3.384686231613159, "rewards/ngram_similarity_reward/mean": 0.4839634597301483, "rewards/ngram_similarity_reward/std": 0.35715481638908386, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 471.59375, "completions/mean_terminated_length": 471.59375, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.1074065786529425, "frac_reward_zero_std": 0.0, "grad_norm": 0.04236859083175659, "learning_rate": 4.999861429638984e-06, "loss": -0.0202, "num_tokens": 38262169.0, "reward": 3.3624043464660645, "reward_std": 2.224200487136841, "rewards/accuracy_reward/mean": 3.0, "rewards/accuracy_reward/std": 3.1622776985168457, "rewards/ngram_similarity_reward/mean": 0.36240440607070923, "rewards/ngram_similarity_reward/std": 0.258233904838562, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 479.578125, "completions/mean_terminated_length": 479.578125, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.10785410606399642, "frac_reward_zero_std": 0.0, "grad_norm": 0.045299142599105835, "learning_rate": 4.999842337945548e-06, "loss": 0.0048, "num_tokens": 38404846.0, "reward": 2.6669921875, "reward_std": 1.0915460586547852, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.16699212789535522, "rewards/ngram_similarity_reward/std": 0.13970988988876343, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 443.15625, "completions/mean_terminated_length": 443.15625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.10830163347505034, "frac_reward_zero_std": 0.0, "grad_norm": 0.05201602727174759, "learning_rate": 4.999822014589293e-06, "loss": 0.0129, "num_tokens": 38569688.0, "reward": 0.9952648878097534, "reward_std": 0.18254108726978302, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 2.8044307231903076, "rewards/ngram_similarity_reward/mean": 0.2608899772167206, "rewards/ngram_similarity_reward/std": 0.2795363664627075, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 503.484375, "completions/mean_terminated_length": 503.484375, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.10874916088610427, "frac_reward_zero_std": 0.0, "grad_norm": 0.04663022607564926, "learning_rate": 4.9998004595813476e-06, "loss": 0.0253, "num_tokens": 38735095.0, "reward": 4.354253768920898, "reward_std": 0.46198853850364685, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.44800350069999695, "rewards/ngram_similarity_reward/std": 0.3534550070762634, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 483.046875, "completions/mean_terminated_length": 483.046875, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.1091966882971582, "frac_reward_zero_std": 0.0, "grad_norm": 0.046560950577259064, "learning_rate": 4.9997776729335085e-06, "loss": -0.0048, "num_tokens": 38898442.0, "reward": 3.153549909591675, "reward_std": 1.9385340213775635, "rewards/accuracy_reward/mean": 2.671875, "rewards/accuracy_reward/std": 3.1449365615844727, "rewards/ngram_similarity_reward/mean": 0.48167479038238525, "rewards/ngram_similarity_reward/std": 0.41635817289352417, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1286.0, "completions/max_terminated_length": 1286.0, "completions/mean_length": 508.78125, "completions/mean_terminated_length": 508.78125, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.10964421570821213, "frac_reward_zero_std": 0.0, "grad_norm": 0.0464753620326519, "learning_rate": 4.999753654658252e-06, "loss": -0.0335, "num_tokens": 39044524.0, "reward": 2.106961727142334, "reward_std": 2.0896620750427246, "rewards/accuracy_reward/mean": 1.75, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.356961727142334, "rewards/ngram_similarity_reward/std": 0.2141096293926239, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 463.84375, "completions/mean_terminated_length": 463.84375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.11009174311926606, "frac_reward_zero_std": 0.0, "grad_norm": 0.045450758188962936, "learning_rate": 4.999728404768726e-06, "loss": 0.0302, "num_tokens": 39231858.0, "reward": 2.8807318210601807, "reward_std": 1.6852710247039795, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.4744817614555359, "rewards/ngram_similarity_reward/std": 0.29741978645324707, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 537.125, "completions/mean_terminated_length": 537.125, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.11053927053031998, "frac_reward_zero_std": 0.0, "grad_norm": 0.043713342398405075, "learning_rate": 4.999701923278754e-06, "loss": 0.0225, "num_tokens": 39396474.0, "reward": 2.1098146438598633, "reward_std": 0.8655683398246765, "rewards/accuracy_reward/mean": 1.90625, "rewards/accuracy_reward/std": 3.001157283782959, "rewards/ngram_similarity_reward/mean": 0.20356449484825134, "rewards/ngram_similarity_reward/std": 0.10082338750362396, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 553.703125, "completions/mean_terminated_length": 553.703125, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.11098679794137391, "frac_reward_zero_std": 0.0, "grad_norm": 0.04395830258727074, "learning_rate": 4.999674210202833e-06, "loss": 0.0229, "num_tokens": 39523719.0, "reward": 2.5181853771209717, "reward_std": 0.8695477247238159, "rewards/accuracy_reward/mean": 2.140625, "rewards/accuracy_reward/std": 3.0930912494659424, "rewards/ngram_similarity_reward/mean": 0.3775605261325836, "rewards/ngram_similarity_reward/std": 0.3463709056377411, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 513.375, "completions/mean_terminated_length": 513.375, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.11143432535242784, "frac_reward_zero_std": 0.0, "grad_norm": 0.040253348648548126, "learning_rate": 4.999645265556134e-06, "loss": -0.013, "num_tokens": 39664863.0, "reward": 3.704132080078125, "reward_std": 1.0076619386672974, "rewards/accuracy_reward/mean": 2.96875, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.7353817820549011, "rewards/ngram_similarity_reward/std": 0.3593013882637024, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 511.96875, "completions/mean_terminated_length": 511.96875, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.11188185276348177, "frac_reward_zero_std": 0.0, "grad_norm": 0.045767977833747864, "learning_rate": 4.999615089354502e-06, "loss": 0.0125, "num_tokens": 39805069.0, "reward": 3.003373384475708, "reward_std": 0.2290632426738739, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.1305904388427734, "rewards/ngram_similarity_reward/mean": 0.5971232652664185, "rewards/ngram_similarity_reward/std": 0.33951181173324585, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 429.40625, "completions/mean_terminated_length": 429.40625, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.11232938017453568, "frac_reward_zero_std": 0.0, "grad_norm": 0.04780712351202965, "learning_rate": 4.999583681614458e-06, "loss": 0.0247, "num_tokens": 39941287.0, "reward": 2.729985237121582, "reward_std": 0.19259922206401825, "rewards/accuracy_reward/mean": 2.484375, "rewards/accuracy_reward/std": 3.0419929027557373, "rewards/ngram_similarity_reward/mean": 0.24561025202274323, "rewards/ngram_similarity_reward/std": 0.21396705508232117, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 800.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 445.53125, "completions/mean_terminated_length": 445.53125, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.11277690758558961, "frac_reward_zero_std": 0.0, "grad_norm": 0.04843899607658386, "learning_rate": 4.999551042353196e-06, "loss": -0.0417, "num_tokens": 40124777.0, "reward": 5.458471775054932, "reward_std": 1.1269618272781372, "rewards/accuracy_reward/mean": 4.9375, "rewards/accuracy_reward/std": 1.7627090215682983, "rewards/ngram_similarity_reward/mean": 0.5209718942642212, "rewards/ngram_similarity_reward/std": 0.35723310708999634, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 527.28125, "completions/mean_terminated_length": 527.28125, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.11322443499664354, "frac_reward_zero_std": 0.0, "grad_norm": 0.04037504270672798, "learning_rate": 4.999517171588584e-06, "loss": 0.0072, "num_tokens": 40279451.0, "reward": 4.945751190185547, "reward_std": 1.7158468961715698, "rewards/accuracy_reward/mean": 4.546875, "rewards/accuracy_reward/std": 2.2355687618255615, "rewards/ngram_similarity_reward/mean": 0.3988759219646454, "rewards/ngram_similarity_reward/std": 0.3014982044696808, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 462.125, "completions/mean_terminated_length": 462.125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.11367196240769747, "frac_reward_zero_std": 0.0, "grad_norm": 0.04890478029847145, "learning_rate": 4.999482069339163e-06, "loss": -0.0018, "num_tokens": 40435987.0, "reward": 3.4920666217803955, "reward_std": 0.8878836035728455, "rewards/accuracy_reward/mean": 3.0625, "rewards/accuracy_reward/std": 2.9700891971588135, "rewards/ngram_similarity_reward/mean": 0.42956671118736267, "rewards/ngram_similarity_reward/std": 0.2548423409461975, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 735.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 542.578125, "completions/mean_terminated_length": 542.578125, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.1141194898187514, "frac_reward_zero_std": 0.0, "grad_norm": 0.03973216935992241, "learning_rate": 4.99944573562415e-06, "loss": 0.0244, "num_tokens": 40635496.0, "reward": 4.6910176277160645, "reward_std": 1.8697986602783203, "rewards/accuracy_reward/mean": 4.1875, "rewards/accuracy_reward/std": 2.5, "rewards/ngram_similarity_reward/mean": 0.5035178065299988, "rewards/ngram_similarity_reward/std": 0.29467612504959106, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 883.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 562.03125, "completions/mean_terminated_length": 562.03125, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.11456701722980532, "frac_reward_zero_std": 0.0, "grad_norm": 0.04116257652640343, "learning_rate": 4.999408170463438e-06, "loss": 0.0027, "num_tokens": 40805418.0, "reward": 2.6413564682006836, "reward_std": 1.014211654663086, "rewards/accuracy_reward/mean": 2.25, "rewards/accuracy_reward/std": 3.1922526359558105, "rewards/ngram_similarity_reward/mean": 0.39135655760765076, "rewards/ngram_similarity_reward/std": 0.23213790357112885, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 855.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 513.046875, "completions/mean_terminated_length": 513.046875, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.11501454464085925, "frac_reward_zero_std": 0.0, "grad_norm": 0.05036421865224838, "learning_rate": 4.999369373877589e-06, "loss": 0.0415, "num_tokens": 40960749.0, "reward": 1.2556395530700684, "reward_std": 0.6643199920654297, "rewards/accuracy_reward/mean": 0.96875, "rewards/accuracy_reward/std": 2.7658276557922363, "rewards/ngram_similarity_reward/mean": 0.28688937425613403, "rewards/ngram_similarity_reward/std": 0.24688217043876648, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1882.0, "completions/max_terminated_length": 1882.0, "completions/mean_length": 529.859375, "completions/mean_terminated_length": 529.859375, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.11546207205191318, "frac_reward_zero_std": 0.0, "grad_norm": 0.040541063994169235, "learning_rate": 4.999329345887844e-06, "loss": 0.0607, "num_tokens": 41230180.0, "reward": -0.11133871972560883, "reward_std": 1.5756843090057373, "rewards/accuracy_reward/mean": -0.3125, "rewards/accuracy_reward/std": 1.7627090215682983, "rewards/ngram_similarity_reward/mean": 0.20116126537322998, "rewards/ngram_similarity_reward/std": 0.22335268557071686, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 568.6875, "completions/mean_terminated_length": 568.6875, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.11590959946296711, "frac_reward_zero_std": 0.0, "grad_norm": 0.0399027056992054, "learning_rate": 4.999288086516114e-06, "loss": 0.0382, "num_tokens": 41373776.0, "reward": 3.2704312801361084, "reward_std": 0.8012506365776062, "rewards/accuracy_reward/mean": 2.859375, "rewards/accuracy_reward/std": 3.0203921794891357, "rewards/ngram_similarity_reward/mean": 0.4110559821128845, "rewards/ngram_similarity_reward/std": 0.3374841809272766, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 494.078125, "completions/mean_terminated_length": 494.078125, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.11635712687402103, "frac_reward_zero_std": 0.0, "grad_norm": 0.04895956814289093, "learning_rate": 4.999245595784988e-06, "loss": 0.018, "num_tokens": 41518421.0, "reward": 1.0387669801712036, "reward_std": 1.3193118572235107, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 2.6931350231170654, "rewards/ngram_similarity_reward/mean": 0.3200168013572693, "rewards/ngram_similarity_reward/std": 0.2850259840488434, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 423.140625, "completions/mean_terminated_length": 423.140625, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.11680465428507496, "frac_reward_zero_std": 0.0, "grad_norm": 0.04695745185017586, "learning_rate": 4.999201873717725e-06, "loss": 0.0225, "num_tokens": 41688078.0, "reward": 3.881317615509033, "reward_std": 1.5000773668289185, "rewards/accuracy_reward/mean": 3.25, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.6313177943229675, "rewards/ngram_similarity_reward/std": 0.30789679288864136, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 874.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 525.03125, "completions/mean_terminated_length": 525.03125, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.11725218169612889, "frac_reward_zero_std": 0.0, "grad_norm": 0.041961099952459335, "learning_rate": 4.999156920338263e-06, "loss": 0.0157, "num_tokens": 41825072.0, "reward": 3.311887741088867, "reward_std": 1.407604694366455, "rewards/accuracy_reward/mean": 2.578125, "rewards/accuracy_reward/std": 3.0410144329071045, "rewards/ngram_similarity_reward/mean": 0.7337629795074463, "rewards/ngram_similarity_reward/std": 0.28070977330207825, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 465.703125, "completions/mean_terminated_length": 465.703125, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.11769970910718282, "frac_reward_zero_std": 0.0, "grad_norm": 0.05207325145602226, "learning_rate": 4.9991107356712116e-06, "loss": -0.0008, "num_tokens": 41965293.0, "reward": 3.4179024696350098, "reward_std": 1.8766613006591797, "rewards/accuracy_reward/mean": 2.96875, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.44915229082107544, "rewards/ngram_similarity_reward/std": 0.32722440361976624, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 438.234375, "completions/mean_terminated_length": 438.234375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.11814723651823675, "frac_reward_zero_std": 0.0, "grad_norm": 0.058121174573898315, "learning_rate": 4.9990633197418515e-06, "loss": 0.0237, "num_tokens": 42121772.0, "reward": 1.065765380859375, "reward_std": 1.3512228727340698, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 2.7945525646209717, "rewards/ngram_similarity_reward/mean": 0.3157654106616974, "rewards/ngram_similarity_reward/std": 0.28690239787101746, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 734.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 587.21875, "completions/mean_terminated_length": 587.21875, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.11859476392929066, "frac_reward_zero_std": 0.0, "grad_norm": 0.03617122396826744, "learning_rate": 4.999014672576143e-06, "loss": -0.0155, "num_tokens": 42274778.0, "reward": 3.6269023418426514, "reward_std": 2.9205880165100098, "rewards/accuracy_reward/mean": 3.125, "rewards/accuracy_reward/std": 2.994704246520996, "rewards/ngram_similarity_reward/mean": 0.5019023418426514, "rewards/ngram_similarity_reward/std": 0.27822884917259216, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 478.28125, "completions/mean_terminated_length": 478.28125, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.1190422913403446, "frac_reward_zero_std": 0.0, "grad_norm": 0.04338669404387474, "learning_rate": 4.998964794200716e-06, "loss": 0.0158, "num_tokens": 42423340.0, "reward": 2.7534642219543457, "reward_std": 0.5386490821838379, "rewards/accuracy_reward/mean": 2.375, "rewards/accuracy_reward/std": 3.057647228240967, "rewards/ngram_similarity_reward/mean": 0.37846437096595764, "rewards/ngram_similarity_reward/std": 0.32297518849372864, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 499.78125, "completions/mean_terminated_length": 499.78125, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.11948981875139852, "frac_reward_zero_std": 0.0, "grad_norm": 0.042514264583587646, "learning_rate": 4.9989136846428775e-06, "loss": 0.0057, "num_tokens": 42586094.0, "reward": 2.722477674484253, "reward_std": 0.7117666602134705, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.1305904388427734, "rewards/ngram_similarity_reward/mean": 0.12872779369354248, "rewards/ngram_similarity_reward/std": 0.17331068217754364, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 454.4375, "completions/mean_terminated_length": 454.4375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.11993734616245245, "frac_reward_zero_std": 0.0, "grad_norm": 0.04440493881702423, "learning_rate": 4.998861343930605e-06, "loss": -0.0051, "num_tokens": 42709482.0, "reward": 5.638522148132324, "reward_std": 0.634451150894165, "rewards/accuracy_reward/mean": 5.390625, "rewards/accuracy_reward/std": 0.8750000596046448, "rewards/ngram_similarity_reward/mean": 0.24789699912071228, "rewards/ngram_similarity_reward/std": 0.2870684862136841, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 443.53125, "completions/mean_terminated_length": 443.53125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.12038487357350637, "frac_reward_zero_std": 0.0, "grad_norm": 0.04513474926352501, "learning_rate": 4.998807772092555e-06, "loss": 0.0183, "num_tokens": 42844220.0, "reward": 5.522170066833496, "reward_std": 0.832690954208374, "rewards/accuracy_reward/mean": 5.03125, "rewards/accuracy_reward/std": 1.6229382753372192, "rewards/ngram_similarity_reward/mean": 0.49091994762420654, "rewards/ngram_similarity_reward/std": 0.31039613485336304, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 390.71875, "completions/mean_terminated_length": 390.71875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.1208324009845603, "frac_reward_zero_std": 0.0, "grad_norm": 0.05287130922079086, "learning_rate": 4.998752969158052e-06, "loss": -0.0091, "num_tokens": 42986714.0, "reward": 1.4076976776123047, "reward_std": 1.0002360343933105, "rewards/accuracy_reward/mean": 0.90625, "rewards/accuracy_reward/std": 2.5617377758026123, "rewards/ngram_similarity_reward/mean": 0.5014476776123047, "rewards/ngram_similarity_reward/std": 0.31911924481391907, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 351.875, "completions/mean_terminated_length": 351.875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.12127992839561423, "frac_reward_zero_std": 0.0, "grad_norm": 0.06454234570264816, "learning_rate": 4.9986969351571006e-06, "loss": -0.0137, "num_tokens": 43177970.0, "reward": 4.2275071144104, "reward_std": 0.9109856486320496, "rewards/accuracy_reward/mean": 3.546875, "rewards/accuracy_reward/std": 2.9300289154052734, "rewards/ngram_similarity_reward/mean": 0.6806321740150452, "rewards/ngram_similarity_reward/std": 0.34795522689819336, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1038.0, "completions/max_terminated_length": 1038.0, "completions/mean_length": 544.921875, "completions/mean_terminated_length": 544.921875, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.12172745580666816, "frac_reward_zero_std": 0.0, "grad_norm": 0.050481028854846954, "learning_rate": 4.998639670120374e-06, "loss": 0.0122, "num_tokens": 43335037.0, "reward": 2.832256555557251, "reward_std": 0.6346484422683716, "rewards/accuracy_reward/mean": 2.375, "rewards/accuracy_reward/std": 3.057647228240967, "rewards/ngram_similarity_reward/mean": 0.4572564959526062, "rewards/ngram_similarity_reward/std": 0.2750431001186371, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1403.0, "completions/max_terminated_length": 1403.0, "completions/mean_length": 528.875, "completions/mean_terminated_length": 528.875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.12217498321772209, "frac_reward_zero_std": 0.25, "grad_norm": 0.03463369607925415, "learning_rate": 4.9985811740792226e-06, "loss": 0.0546, "num_tokens": 43462565.0, "reward": 0.013282734900712967, "reward_std": 0.1335204541683197, "rewards/accuracy_reward/mean": -0.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.5132827758789062, "rewards/ngram_similarity_reward/std": 0.32260528206825256, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 508.9375, "completions/mean_terminated_length": 508.9375, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.12262251062877601, "frac_reward_zero_std": 0.0, "grad_norm": 0.04936639964580536, "learning_rate": 4.9985214470656705e-06, "loss": 0.0042, "num_tokens": 43670353.0, "reward": 2.0254318714141846, "reward_std": 1.2559350728988647, "rewards/accuracy_reward/mean": 1.53125, "rewards/accuracy_reward/std": 2.900294303894043, "rewards/ngram_similarity_reward/mean": 0.49418196082115173, "rewards/ngram_similarity_reward/std": 0.33127668499946594, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 397.828125, "completions/mean_terminated_length": 397.828125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.12307003803982994, "frac_reward_zero_std": 0.0, "grad_norm": 0.04740218073129654, "learning_rate": 4.998460489112412e-06, "loss": -0.0289, "num_tokens": 43845318.0, "reward": 5.616533279418945, "reward_std": 1.3415679931640625, "rewards/accuracy_reward/mean": 5.0, "rewards/accuracy_reward/std": 1.7366269826889038, "rewards/ngram_similarity_reward/mean": 0.6165330410003662, "rewards/ngram_similarity_reward/std": 0.46372175216674805, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 382.734375, "completions/mean_terminated_length": 382.734375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.12351756545088387, "frac_reward_zero_std": 0.25, "grad_norm": 0.045659687370061874, "learning_rate": 4.998398300252821e-06, "loss": 0.0129, "num_tokens": 43985925.0, "reward": 2.289724826812744, "reward_std": 0.761822521686554, "rewards/accuracy_reward/mean": 2.03125, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.2584747076034546, "rewards/ngram_similarity_reward/std": 0.21390005946159363, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 402.25, "completions/mean_terminated_length": 402.25, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.1239650928619378, "frac_reward_zero_std": 0.0, "grad_norm": 0.0604717992246151, "learning_rate": 4.998334880520942e-06, "loss": 0.0173, "num_tokens": 44192549.0, "reward": 1.8768291473388672, "reward_std": 2.8309593200683594, "rewards/accuracy_reward/mean": 1.671875, "rewards/accuracy_reward/std": 3.0002894401550293, "rewards/ngram_similarity_reward/mean": 0.20495399832725525, "rewards/ngram_similarity_reward/std": 0.11042793095111847, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 515.6875, "completions/mean_terminated_length": 515.6875, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.12441262027299171, "frac_reward_zero_std": 0.0, "grad_norm": 0.048352211713790894, "learning_rate": 4.998270229951493e-06, "loss": -0.0155, "num_tokens": 44332465.0, "reward": 4.659803867340088, "reward_std": 1.2321932315826416, "rewards/accuracy_reward/mean": 4.109375, "rewards/accuracy_reward/std": 2.6584229469299316, "rewards/ngram_similarity_reward/mean": 0.550429105758667, "rewards/ngram_similarity_reward/std": 0.4145485460758209, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 482.296875, "completions/mean_terminated_length": 482.296875, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.12486014768404564, "frac_reward_zero_std": 0.0, "grad_norm": 0.04150489717721939, "learning_rate": 4.998204348579867e-06, "loss": 0.0014, "num_tokens": 44462228.0, "reward": 5.0958943367004395, "reward_std": 0.978097677230835, "rewards/accuracy_reward/mean": 4.453125, "rewards/accuracy_reward/std": 2.4587368965148926, "rewards/ngram_similarity_reward/mean": 0.6427693963050842, "rewards/ngram_similarity_reward/std": 0.24158473312854767, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 821.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 514.75, "completions/mean_terminated_length": 514.75, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.12530767509509957, "frac_reward_zero_std": 0.0, "grad_norm": 0.0430060438811779, "learning_rate": 4.99813723644213e-06, "loss": 0.0225, "num_tokens": 44624964.0, "reward": 4.896323204040527, "reward_std": 1.478863000869751, "rewards/accuracy_reward/mean": 4.34375, "rewards/accuracy_reward/std": 2.4314002990722656, "rewards/ngram_similarity_reward/mean": 0.5525734424591064, "rewards/ngram_similarity_reward/std": 0.38259801268577576, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 471.265625, "completions/mean_terminated_length": 471.265625, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.1257552025061535, "frac_reward_zero_std": 0.0, "grad_norm": 0.049225497990846634, "learning_rate": 4.998068893575021e-06, "loss": 0.0518, "num_tokens": 44814821.0, "reward": 2.4747767448425293, "reward_std": 0.9369537830352783, "rewards/accuracy_reward/mean": 2.078125, "rewards/accuracy_reward/std": 3.0488338470458984, "rewards/ngram_similarity_reward/mean": 0.39665159583091736, "rewards/ngram_similarity_reward/std": 0.2629046142101288, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1187.0, "completions/max_terminated_length": 1187.0, "completions/mean_length": 514.40625, "completions/mean_terminated_length": 514.40625, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.12620272991720743, "frac_reward_zero_std": 0.0, "grad_norm": 0.04439893737435341, "learning_rate": 4.997999320015956e-06, "loss": 0.0095, "num_tokens": 44957471.0, "reward": 2.500685691833496, "reward_std": 1.8301470279693604, "rewards/accuracy_reward/mean": 2.015625, "rewards/accuracy_reward/std": 3.00260329246521, "rewards/ngram_similarity_reward/mean": 0.48506051301956177, "rewards/ngram_similarity_reward/std": 0.2847696840763092, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 450.1875, "completions/mean_terminated_length": 450.1875, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.12665025732826135, "frac_reward_zero_std": 0.0, "grad_norm": 0.051984407007694244, "learning_rate": 4.997928515803023e-06, "loss": 0.0022, "num_tokens": 45095595.0, "reward": 5.386553764343262, "reward_std": 1.2933610677719116, "rewards/accuracy_reward/mean": 4.890625, "rewards/accuracy_reward/std": 1.915825366973877, "rewards/ngram_similarity_reward/mean": 0.4959290623664856, "rewards/ngram_similarity_reward/std": 0.31944161653518677, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 503.84375, "completions/mean_terminated_length": 503.84375, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.1270977847393153, "frac_reward_zero_std": 0.0, "grad_norm": 0.04425594583153725, "learning_rate": 4.99785648097498e-06, "loss": -0.0495, "num_tokens": 45318497.0, "reward": 5.2758564949035645, "reward_std": 1.8868292570114136, "rewards/accuracy_reward/mean": 4.78125, "rewards/accuracy_reward/std": 2.0737876892089844, "rewards/ngram_similarity_reward/mean": 0.4946065843105316, "rewards/ngram_similarity_reward/std": 0.3254554569721222, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 477.578125, "completions/mean_terminated_length": 477.578125, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.1275453121503692, "frac_reward_zero_std": 0.0, "grad_norm": 0.04923122376203537, "learning_rate": 4.9977832155712666e-06, "loss": 0.0074, "num_tokens": 45497046.0, "reward": 1.7728796005249023, "reward_std": 1.6681783199310303, "rewards/accuracy_reward/mean": 1.21875, "rewards/accuracy_reward/std": 3.1495466232299805, "rewards/ngram_similarity_reward/mean": 0.5541296005249023, "rewards/ngram_similarity_reward/std": 0.48000162839889526, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 853.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 487.578125, "completions/mean_terminated_length": 487.578125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.12799283956142313, "frac_reward_zero_std": 0.0, "grad_norm": 0.04564898833632469, "learning_rate": 4.997708719631989e-06, "loss": 0.0231, "num_tokens": 45648443.0, "reward": 3.957265615463257, "reward_std": 0.9509031176567078, "rewards/accuracy_reward/mean": 3.421875, "rewards/accuracy_reward/std": 2.896657705307007, "rewards/ngram_similarity_reward/mean": 0.5353904962539673, "rewards/ngram_similarity_reward/std": 0.38713234663009644, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 374.453125, "completions/mean_terminated_length": 374.453125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.12844036697247707, "frac_reward_zero_std": 0.0, "grad_norm": 0.05063045024871826, "learning_rate": 4.997632993197929e-06, "loss": 0.0039, "num_tokens": 45787080.0, "reward": 4.43035888671875, "reward_std": 0.6818827390670776, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.6178590655326843, "rewards/ngram_similarity_reward/std": 0.33551886677742004, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 863.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 425.578125, "completions/mean_terminated_length": 425.578125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.128887894383531, "frac_reward_zero_std": 0.0, "grad_norm": 0.04895696043968201, "learning_rate": 4.997556036310543e-06, "loss": 0.017, "num_tokens": 45971293.0, "reward": 4.363377571105957, "reward_std": 1.7207295894622803, "rewards/accuracy_reward/mean": 3.890625, "rewards/accuracy_reward/std": 2.6998953819274902, "rewards/ngram_similarity_reward/mean": 0.4727526605129242, "rewards/ngram_similarity_reward/std": 0.45147505402565, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 754.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 483.375, "completions/mean_terminated_length": 483.375, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.12933542179458493, "frac_reward_zero_std": 0.0, "grad_norm": 0.04794878885149956, "learning_rate": 4.9974778490119605e-06, "loss": -0.0003, "num_tokens": 46140261.0, "reward": 3.348125457763672, "reward_std": 2.0464982986450195, "rewards/accuracy_reward/mean": 2.796875, "rewards/accuracy_reward/std": 3.0998191833496094, "rewards/ngram_similarity_reward/mean": 0.5512505769729614, "rewards/ngram_similarity_reward/std": 0.33140721917152405, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 506.90625, "completions/mean_terminated_length": 506.90625, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.12978294920563885, "frac_reward_zero_std": 0.0, "grad_norm": 0.039918627589941025, "learning_rate": 4.997398431344985e-06, "loss": -0.0138, "num_tokens": 46271311.0, "reward": 5.304322242736816, "reward_std": 1.2509558200836182, "rewards/accuracy_reward/mean": 4.921875, "rewards/accuracy_reward/std": 1.8153201341629028, "rewards/ngram_similarity_reward/mean": 0.3824467360973358, "rewards/ngram_similarity_reward/std": 0.2415844053030014, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 454.546875, "completions/mean_terminated_length": 454.546875, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.13023047661669276, "frac_reward_zero_std": 0.0, "grad_norm": 0.05253060534596443, "learning_rate": 4.997317783353095e-06, "loss": 0.0051, "num_tokens": 46435554.0, "reward": 3.7336955070495605, "reward_std": 1.3570480346679688, "rewards/accuracy_reward/mean": 3.0625, "rewards/accuracy_reward/std": 2.9700891971588135, "rewards/ngram_similarity_reward/mean": 0.6711956262588501, "rewards/ngram_similarity_reward/std": 0.31571659445762634, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 361.5, "completions/mean_terminated_length": 361.5, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.1306780040277467, "frac_reward_zero_std": 0.0, "grad_norm": 0.06143470108509064, "learning_rate": 4.997235905080438e-06, "loss": 0.0071, "num_tokens": 46609250.0, "reward": 2.271378517150879, "reward_std": 0.8853912353515625, "rewards/accuracy_reward/mean": 2.0, "rewards/accuracy_reward/std": 3.3333334922790527, "rewards/ngram_similarity_reward/mean": 0.2713784873485565, "rewards/ngram_similarity_reward/std": 0.23282523453235626, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 466.484375, "completions/mean_terminated_length": 466.484375, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.13112553143880062, "frac_reward_zero_std": 0.0, "grad_norm": 0.04411265626549721, "learning_rate": 4.997152796571838e-06, "loss": -0.0272, "num_tokens": 46754321.0, "reward": 3.6616954803466797, "reward_std": 1.9821219444274902, "rewards/accuracy_reward/mean": 3.09375, "rewards/accuracy_reward/std": 3.037954330444336, "rewards/ngram_similarity_reward/mean": 0.5679454207420349, "rewards/ngram_similarity_reward/std": 0.3640801012516022, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1074.0, "completions/max_terminated_length": 1074.0, "completions/mean_length": 647.34375, "completions/mean_terminated_length": 647.34375, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.13157305884985454, "frac_reward_zero_std": 0.0, "grad_norm": 0.03811763972043991, "learning_rate": 4.997068457872794e-06, "loss": 0.0037, "num_tokens": 46910871.0, "reward": 2.387169599533081, "reward_std": 1.858168125152588, "rewards/accuracy_reward/mean": 1.921875, "rewards/accuracy_reward/std": 2.9857051372528076, "rewards/ngram_similarity_reward/mean": 0.4652945399284363, "rewards/ngram_similarity_reward/std": 0.28880149126052856, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 837.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 447.625, "completions/mean_terminated_length": 447.625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.13202058626090848, "frac_reward_zero_std": 0.0, "grad_norm": 0.04353957623243332, "learning_rate": 4.996982889029477e-06, "loss": 0.0123, "num_tokens": 47060175.0, "reward": 4.953624725341797, "reward_std": 1.593971610069275, "rewards/accuracy_reward/mean": 4.578125, "rewards/accuracy_reward/std": 2.304617166519165, "rewards/ngram_similarity_reward/mean": 0.37549978494644165, "rewards/ngram_similarity_reward/std": 0.2559112310409546, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 845.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 590.28125, "completions/mean_terminated_length": 590.28125, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.1324681136719624, "frac_reward_zero_std": 0.0, "grad_norm": 0.04188549146056175, "learning_rate": 4.996896090088728e-06, "loss": 0.0031, "num_tokens": 47221473.0, "reward": 3.032853841781616, "reward_std": 1.2330999374389648, "rewards/accuracy_reward/mean": 2.796875, "rewards/accuracy_reward/std": 3.0998191833496094, "rewards/ngram_similarity_reward/mean": 0.23597882688045502, "rewards/ngram_similarity_reward/std": 0.1108793169260025, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 909.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 517.171875, "completions/mean_terminated_length": 517.171875, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.13291564108301634, "frac_reward_zero_std": 0.0, "grad_norm": 0.05190104618668556, "learning_rate": 4.996808061098067e-06, "loss": -0.0203, "num_tokens": 47398780.0, "reward": 3.1222596168518066, "reward_std": 1.8057923316955566, "rewards/accuracy_reward/mean": 2.6875, "rewards/accuracy_reward/std": 3.126309394836426, "rewards/ngram_similarity_reward/mean": 0.4347594976425171, "rewards/ngram_similarity_reward/std": 0.27208247780799866, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 545.484375, "completions/mean_terminated_length": 545.484375, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.13336316849407026, "frac_reward_zero_std": 0.0, "grad_norm": 0.04385865479707718, "learning_rate": 4.9967188021056845e-06, "loss": 0.0138, "num_tokens": 47560507.0, "reward": 4.006990432739258, "reward_std": 0.9415175914764404, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.2882406711578369, "rewards/ngram_similarity_reward/std": 0.23498745262622833, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 405.171875, "completions/mean_terminated_length": 405.171875, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.13381069590512418, "frac_reward_zero_std": 0.0, "grad_norm": 0.05112559720873833, "learning_rate": 4.996628313160445e-06, "loss": 0.0161, "num_tokens": 47743270.0, "reward": 2.28450870513916, "reward_std": 2.212578773498535, "rewards/accuracy_reward/mean": 1.765625, "rewards/accuracy_reward/std": 3.1332433223724365, "rewards/ngram_similarity_reward/mean": 0.5188837051391602, "rewards/ngram_similarity_reward/std": 0.3469817042350769, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 632.78125, "completions/mean_terminated_length": 632.78125, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.13425822331617812, "frac_reward_zero_std": 0.0, "grad_norm": 0.038688741624355316, "learning_rate": 4.996536594311886e-06, "loss": -0.0127, "num_tokens": 47918376.0, "reward": 2.733279228210449, "reward_std": 2.08182430267334, "rewards/accuracy_reward/mean": 2.3125, "rewards/accuracy_reward/std": 3.231172561645508, "rewards/ngram_similarity_reward/mean": 0.42077934741973877, "rewards/ngram_similarity_reward/std": 0.22763550281524658, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 481.1875, "completions/mean_terminated_length": 481.1875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.13470575072723204, "frac_reward_zero_std": 0.0, "grad_norm": 0.0463065467774868, "learning_rate": 4.996443645610218e-06, "loss": 0.0123, "num_tokens": 48076724.0, "reward": 3.142300844192505, "reward_std": 2.0055160522460938, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.5485509634017944, "rewards/ngram_similarity_reward/std": 0.22397854924201965, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 417.640625, "completions/mean_terminated_length": 417.640625, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.13515327813828598, "frac_reward_zero_std": 0.0, "grad_norm": 0.05372007563710213, "learning_rate": 4.996349467106325e-06, "loss": 0.0212, "num_tokens": 48215021.0, "reward": 2.7351503372192383, "reward_std": 0.22711166739463806, "rewards/accuracy_reward/mean": 2.25, "rewards/accuracy_reward/std": 3.295017957687378, "rewards/ngram_similarity_reward/mean": 0.48515036702156067, "rewards/ngram_similarity_reward/std": 0.35704848170280457, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 854.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 537.46875, "completions/mean_terminated_length": 537.46875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.1356008055493399, "frac_reward_zero_std": 0.0, "grad_norm": 0.04387521743774414, "learning_rate": 4.996254058851767e-06, "loss": 0.003, "num_tokens": 48369211.0, "reward": 4.248077392578125, "reward_std": 0.7423344850540161, "rewards/accuracy_reward/mean": 3.78125, "rewards/accuracy_reward/std": 2.7744226455688477, "rewards/ngram_similarity_reward/mean": 0.4668273627758026, "rewards/ngram_similarity_reward/std": 0.244145005941391, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 439.765625, "completions/mean_terminated_length": 439.765625, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.13604833296039381, "frac_reward_zero_std": 0.0, "grad_norm": 0.04732627049088478, "learning_rate": 4.996157420898771e-06, "loss": 0.049, "num_tokens": 48509756.0, "reward": 3.4477286338806152, "reward_std": 1.2740354537963867, "rewards/accuracy_reward/mean": 2.921875, "rewards/accuracy_reward/std": 3.0488338470458984, "rewards/ngram_similarity_reward/mean": 0.5258538722991943, "rewards/ngram_similarity_reward/std": 0.37970679998397827, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 508.96875, "completions/mean_terminated_length": 508.96875, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.13649586037144776, "frac_reward_zero_std": 0.0, "grad_norm": 0.03984339162707329, "learning_rate": 4.996059553300243e-06, "loss": 0.0369, "num_tokens": 48651274.0, "reward": 4.591022968292236, "reward_std": 0.9147764444351196, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5910229682922363, "rewards/ngram_similarity_reward/std": 0.23684833943843842, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 455.03125, "completions/mean_terminated_length": 455.03125, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.13694338778250167, "frac_reward_zero_std": 0.0, "grad_norm": 0.046683188527822495, "learning_rate": 4.99596045610976e-06, "loss": 0.0198, "num_tokens": 48823820.0, "reward": 3.555692195892334, "reward_std": 1.0254743099212646, "rewards/accuracy_reward/mean": 3.140625, "rewards/accuracy_reward/std": 3.082810640335083, "rewards/ngram_similarity_reward/mean": 0.4150674343109131, "rewards/ngram_similarity_reward/std": 0.2890641987323761, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 430.6875, "completions/mean_terminated_length": 430.6875, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.13739091519355562, "frac_reward_zero_std": 0.0, "grad_norm": 0.051073748618364334, "learning_rate": 4.995860129381572e-06, "loss": 0.016, "num_tokens": 48977368.0, "reward": 3.4029407501220703, "reward_std": 0.8974111080169678, "rewards/accuracy_reward/mean": 3.0625, "rewards/accuracy_reward/std": 2.9700891971588135, "rewards/ngram_similarity_reward/mean": 0.3404407501220703, "rewards/ngram_similarity_reward/std": 0.2654664218425751, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 414.984375, "completions/mean_terminated_length": 414.984375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.13783844260460953, "frac_reward_zero_std": 0.25, "grad_norm": 0.041511911898851395, "learning_rate": 4.995758573170601e-06, "loss": -0.0017, "num_tokens": 49123271.0, "reward": 1.4354006052017212, "reward_std": 0.1506062150001526, "rewards/accuracy_reward/mean": 0.984375, "rewards/accuracy_reward/std": 2.6306629180908203, "rewards/ngram_similarity_reward/mean": 0.45102566480636597, "rewards/ngram_similarity_reward/std": 0.2433464378118515, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 435.53125, "completions/mean_terminated_length": 435.53125, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.13828597001566345, "frac_reward_zero_std": 0.0, "grad_norm": 0.05282020941376686, "learning_rate": 4.995655787532445e-06, "loss": 0.0198, "num_tokens": 49279321.0, "reward": 3.551271915435791, "reward_std": 1.4317071437835693, "rewards/accuracy_reward/mean": 3.015625, "rewards/accuracy_reward/std": 3.2464497089385986, "rewards/ngram_similarity_reward/mean": 0.5356469750404358, "rewards/ngram_similarity_reward/std": 0.31283071637153625, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 486.984375, "completions/mean_terminated_length": 486.984375, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.1387334974267174, "frac_reward_zero_std": 0.0, "grad_norm": 0.04417431727051735, "learning_rate": 4.995551772523372e-06, "loss": -0.0243, "num_tokens": 49497016.0, "reward": 5.310014724731445, "reward_std": 0.978313684463501, "rewards/accuracy_reward/mean": 4.65625, "rewards/accuracy_reward/std": 2.102294683456421, "rewards/ngram_similarity_reward/mean": 0.6537647247314453, "rewards/ngram_similarity_reward/std": 0.39969301223754883, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 396.265625, "completions/mean_terminated_length": 396.265625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.1391810248377713, "frac_reward_zero_std": 0.25, "grad_norm": 0.04919736087322235, "learning_rate": 4.9954465282003265e-06, "loss": 0.0251, "num_tokens": 49650041.0, "reward": 4.4382853507995605, "reward_std": 1.0855292081832886, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.8132855892181396, "rewards/ngram_similarity_reward/std": 0.44261741638183594, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 974.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 504.578125, "completions/mean_terminated_length": 504.578125, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.13962855224882523, "frac_reward_zero_std": 0.0, "grad_norm": 0.0423838347196579, "learning_rate": 4.995340054620922e-06, "loss": 0.0222, "num_tokens": 49805038.0, "reward": 2.7991769313812256, "reward_std": 1.8164341449737549, "rewards/accuracy_reward/mean": 2.25, "rewards/accuracy_reward/std": 3.0860671997070312, "rewards/ngram_similarity_reward/mean": 0.5491769909858704, "rewards/ngram_similarity_reward/std": 0.3414195477962494, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 833.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 409.4375, "completions/mean_terminated_length": 409.4375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.14007607965987917, "frac_reward_zero_std": 0.25, "grad_norm": 0.05049237236380577, "learning_rate": 4.995232351843448e-06, "loss": 0.0289, "num_tokens": 50079466.0, "reward": 4.114768028259277, "reward_std": 1.654599666595459, "rewards/accuracy_reward/mean": 3.40625, "rewards/accuracy_reward/std": 3.0327250957489014, "rewards/ngram_similarity_reward/mean": 0.7085182666778564, "rewards/ngram_similarity_reward/std": 0.29894617199897766, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 313.671875, "completions/mean_terminated_length": 313.671875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.1405236070709331, "frac_reward_zero_std": 0.25, "grad_norm": 0.0518927238881588, "learning_rate": 4.995123419926864e-06, "loss": -0.0175, "num_tokens": 50258693.0, "reward": 3.5665626525878906, "reward_std": 1.0320234298706055, "rewards/accuracy_reward/mean": 2.890625, "rewards/accuracy_reward/std": 3.195319890975952, "rewards/ngram_similarity_reward/mean": 0.6759374737739563, "rewards/ngram_similarity_reward/std": 0.2605065703392029, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 434.40625, "completions/mean_terminated_length": 434.40625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.14097113448198703, "frac_reward_zero_std": 0.0, "grad_norm": 0.051625993102788925, "learning_rate": 4.995013258930806e-06, "loss": -0.0135, "num_tokens": 50402687.0, "reward": 4.174743175506592, "reward_std": 0.5071485042572021, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.2684934139251709, "rewards/ngram_similarity_reward/std": 0.19142936170101166, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 417.5625, "completions/mean_terminated_length": 417.5625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.14141866189304095, "frac_reward_zero_std": 0.0, "grad_norm": 0.04863320663571358, "learning_rate": 4.994901868915581e-06, "loss": 0.0242, "num_tokens": 50554563.0, "reward": 1.499826192855835, "reward_std": 0.551353931427002, "rewards/accuracy_reward/mean": 1.0625, "rewards/accuracy_reward/std": 2.695528507232666, "rewards/ngram_similarity_reward/mean": 0.4373261630535126, "rewards/ngram_similarity_reward/std": 0.35376015305519104, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 403.828125, "completions/mean_terminated_length": 403.828125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.14186618930409486, "frac_reward_zero_std": 0.0, "grad_norm": 0.04763497784733772, "learning_rate": 4.994789249942166e-06, "loss": 0.0255, "num_tokens": 50678952.0, "reward": 4.207784652709961, "reward_std": 0.6976755857467651, "rewards/accuracy_reward/mean": 3.78125, "rewards/accuracy_reward/std": 2.7744226455688477, "rewards/ngram_similarity_reward/mean": 0.4265344738960266, "rewards/ngram_similarity_reward/std": 0.2707797884941101, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 842.0, "completions/max_terminated_length": 842.0, "completions/mean_length": 519.578125, "completions/mean_terminated_length": 519.578125, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.1423137167151488, "frac_reward_zero_std": 0.0, "grad_norm": 0.04119405895471573, "learning_rate": 4.994675402072217e-06, "loss": 0.0025, "num_tokens": 50848589.0, "reward": 5.666694164276123, "reward_std": 0.502032995223999, "rewards/accuracy_reward/mean": 5.390625, "rewards/accuracy_reward/std": 0.8750000596046448, "rewards/ngram_similarity_reward/mean": 0.2760693430900574, "rewards/ngram_similarity_reward/std": 0.15548977255821228, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 476.90625, "completions/mean_terminated_length": 476.90625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.14276124412620272, "frac_reward_zero_std": 0.0, "grad_norm": 0.05252527445554733, "learning_rate": 4.994560325368057e-06, "loss": 0.0324, "num_tokens": 50998887.0, "reward": 3.12988018989563, "reward_std": 1.5426826477050781, "rewards/accuracy_reward/mean": 2.765625, "rewards/accuracy_reward/std": 3.0302298069000244, "rewards/ngram_similarity_reward/mean": 0.364255428314209, "rewards/ngram_similarity_reward/std": 0.2845660150051117, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 428.46875, "completions/mean_terminated_length": 428.46875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.14320877153725667, "frac_reward_zero_std": 0.0, "grad_norm": 0.05109865590929985, "learning_rate": 4.994444019892687e-06, "loss": 0.0118, "num_tokens": 51160677.0, "reward": 4.334773063659668, "reward_std": 0.6890419721603394, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.5222730040550232, "rewards/ngram_similarity_reward/std": 0.28128302097320557, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 499.296875, "completions/mean_terminated_length": 499.296875, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.14365629894831058, "frac_reward_zero_std": 0.0, "grad_norm": 0.04184623435139656, "learning_rate": 4.994326485709774e-06, "loss": 0.0489, "num_tokens": 51305976.0, "reward": 2.736367702484131, "reward_std": 0.8544173836708069, "rewards/accuracy_reward/mean": 2.1875, "rewards/accuracy_reward/std": 3.043989896774292, "rewards/ngram_similarity_reward/mean": 0.5488678216934204, "rewards/ngram_similarity_reward/std": 0.3216118812561035, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 544.1875, "completions/mean_terminated_length": 544.1875, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.1441038263593645, "frac_reward_zero_std": 0.0, "grad_norm": 0.04549555107951164, "learning_rate": 4.994207722883664e-06, "loss": -0.0169, "num_tokens": 51472868.0, "reward": 2.5141773223876953, "reward_std": 2.492871046066284, "rewards/accuracy_reward/mean": 1.96875, "rewards/accuracy_reward/std": 3.0496878623962402, "rewards/ngram_similarity_reward/mean": 0.5454275012016296, "rewards/ngram_similarity_reward/std": 0.35773760080337524, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 864.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 510.359375, "completions/mean_terminated_length": 510.359375, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.14455135377041844, "frac_reward_zero_std": 0.0, "grad_norm": 0.04742863029241562, "learning_rate": 4.994087731479371e-06, "loss": -0.0141, "num_tokens": 51641211.0, "reward": 1.274566650390625, "reward_std": 0.5307009220123291, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 2.8312318325042725, "rewards/ngram_similarity_reward/mean": 0.399566650390625, "rewards/ngram_similarity_reward/std": 0.23947805166244507, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 911.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 551.15625, "completions/mean_terminated_length": 551.15625, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.14499888118147236, "frac_reward_zero_std": 0.0, "grad_norm": 0.042440809309482574, "learning_rate": 4.993966511562586e-06, "loss": -0.0165, "num_tokens": 51799509.0, "reward": 2.732563018798828, "reward_std": 0.9093632698059082, "rewards/accuracy_reward/mean": 2.28125, "rewards/accuracy_reward/std": 3.2634034156799316, "rewards/ngram_similarity_reward/mean": 0.4513130187988281, "rewards/ngram_similarity_reward/std": 0.3170211613178253, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 916.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 478.03125, "completions/mean_terminated_length": 478.03125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.1454464085925263, "frac_reward_zero_std": 0.0, "grad_norm": 0.04281028360128403, "learning_rate": 4.993844063199668e-06, "loss": -0.0039, "num_tokens": 51955303.0, "reward": 3.9197871685028076, "reward_std": 0.9051376581192017, "rewards/accuracy_reward/mean": 3.53125, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.38853707909584045, "rewards/ngram_similarity_reward/std": 0.26418963074684143, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 367.3125, "completions/mean_terminated_length": 367.3125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.14589393600358022, "frac_reward_zero_std": 0.0, "grad_norm": 0.05799470096826553, "learning_rate": 4.993720386457653e-06, "loss": 0.0324, "num_tokens": 52132203.0, "reward": 3.663940906524658, "reward_std": 1.499124526977539, "rewards/accuracy_reward/mean": 3.25, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.4139409363269806, "rewards/ngram_similarity_reward/std": 0.3726455271244049, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 389.421875, "completions/mean_terminated_length": 389.421875, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.14634146341463414, "frac_reward_zero_std": 0.0, "grad_norm": 0.05426111817359924, "learning_rate": 4.993595481404245e-06, "loss": 0.0035, "num_tokens": 52287798.0, "reward": 3.8550524711608887, "reward_std": 1.761357069015503, "rewards/accuracy_reward/mean": 3.25, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.6050525307655334, "rewards/ngram_similarity_reward/std": 0.3760647475719452, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1060.0, "completions/max_terminated_length": 1060.0, "completions/mean_length": 574.796875, "completions/mean_terminated_length": 574.796875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.14678899082568808, "frac_reward_zero_std": 0.0, "grad_norm": 0.043712202459573746, "learning_rate": 4.993469348107822e-06, "loss": -0.0342, "num_tokens": 52441769.0, "reward": 3.295194625854492, "reward_std": 1.5132980346679688, "rewards/accuracy_reward/mean": 2.921875, "rewards/accuracy_reward/std": 3.0488338470458984, "rewards/ngram_similarity_reward/mean": 0.3733198344707489, "rewards/ngram_similarity_reward/std": 0.3137679696083069, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 481.90625, "completions/mean_terminated_length": 481.90625, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.147236518236742, "frac_reward_zero_std": 0.0, "grad_norm": 0.05058812350034714, "learning_rate": 4.993341986637437e-06, "loss": -0.0285, "num_tokens": 52588659.0, "reward": 2.0073139667510986, "reward_std": 0.9403772950172424, "rewards/accuracy_reward/mean": 1.828125, "rewards/accuracy_reward/std": 3.076045274734497, "rewards/ngram_similarity_reward/mean": 0.17918887734413147, "rewards/ngram_similarity_reward/std": 0.08094964176416397, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 505.15625, "completions/mean_terminated_length": 505.15625, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.14768404564779591, "frac_reward_zero_std": 0.0, "grad_norm": 0.04791771620512009, "learning_rate": 4.993213397062812e-06, "loss": 0.04, "num_tokens": 52768349.0, "reward": 2.805192232131958, "reward_std": 0.7437160015106201, "rewards/accuracy_reward/mean": 2.265625, "rewards/accuracy_reward/std": 3.0692648887634277, "rewards/ngram_similarity_reward/mean": 0.5395671725273132, "rewards/ngram_similarity_reward/std": 0.3949489891529083, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 518.828125, "completions/mean_terminated_length": 518.828125, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.14813157305884986, "frac_reward_zero_std": 0.0, "grad_norm": 0.042132019996643066, "learning_rate": 4.993083579454345e-06, "loss": 0.0154, "num_tokens": 52953938.0, "reward": 2.460780620574951, "reward_std": 0.8049179911613464, "rewards/accuracy_reward/mean": 2.03125, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.42953070998191833, "rewards/ngram_similarity_reward/std": 0.24472883343696594, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 499.40625, "completions/mean_terminated_length": 499.40625, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.14857910046990377, "frac_reward_zero_std": 0.0, "grad_norm": 0.04983328655362129, "learning_rate": 4.992952533883099e-06, "loss": -0.01, "num_tokens": 53159516.0, "reward": 1.8761520385742188, "reward_std": 1.512256145477295, "rewards/accuracy_reward/mean": 1.4375, "rewards/accuracy_reward/std": 2.9807584285736084, "rewards/ngram_similarity_reward/mean": 0.43865206837654114, "rewards/ngram_similarity_reward/std": 0.384187251329422, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 473.125, "completions/mean_terminated_length": 473.125, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.14902662788095772, "frac_reward_zero_std": 0.0, "grad_norm": 0.04624255374073982, "learning_rate": 4.992820260420817e-06, "loss": -0.0091, "num_tokens": 53309988.0, "reward": 4.088277816772461, "reward_std": 0.7323791980743408, "rewards/accuracy_reward/mean": 3.640625, "rewards/accuracy_reward/std": 3.007225275039673, "rewards/ngram_similarity_reward/mean": 0.44765257835388184, "rewards/ngram_similarity_reward/std": 0.2967788875102997, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 470.296875, "completions/mean_terminated_length": 470.296875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.14947415529201163, "frac_reward_zero_std": 0.0, "grad_norm": 0.05054234340786934, "learning_rate": 4.9926867591399125e-06, "loss": 0.0328, "num_tokens": 53433335.0, "reward": 1.209140419960022, "reward_std": 0.9032200574874878, "rewards/accuracy_reward/mean": 0.9375, "rewards/accuracy_reward/std": 2.6659226417541504, "rewards/ngram_similarity_reward/mean": 0.2716403603553772, "rewards/ngram_similarity_reward/std": 0.20133136212825775, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 428.984375, "completions/mean_terminated_length": 428.984375, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.14992168270306555, "frac_reward_zero_std": 0.0, "grad_norm": 0.04859912022948265, "learning_rate": 4.992552030113469e-06, "loss": 0.0081, "num_tokens": 53569542.0, "reward": 1.492279291152954, "reward_std": 0.8460592031478882, "rewards/accuracy_reward/mean": 1.234375, "rewards/accuracy_reward/std": 2.8015992641448975, "rewards/ngram_similarity_reward/mean": 0.25790441036224365, "rewards/ngram_similarity_reward/std": 0.18795253336429596, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 800.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 531.578125, "completions/mean_terminated_length": 531.578125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.1503692101141195, "frac_reward_zero_std": 0.0, "grad_norm": 0.04315594211220741, "learning_rate": 4.992416073415242e-06, "loss": 0.0014, "num_tokens": 53731147.0, "reward": 3.3873965740203857, "reward_std": 1.5658332109451294, "rewards/accuracy_reward/mean": 2.75, "rewards/accuracy_reward/std": 3.261122703552246, "rewards/ngram_similarity_reward/mean": 0.6373966932296753, "rewards/ngram_similarity_reward/std": 0.3304082155227661, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 417.234375, "completions/mean_terminated_length": 417.234375, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.1508167375251734, "frac_reward_zero_std": 0.0, "grad_norm": 0.055264078080654144, "learning_rate": 4.992278889119661e-06, "loss": 0.0051, "num_tokens": 53894890.0, "reward": 1.4579107761383057, "reward_std": 0.48005035519599915, "rewards/accuracy_reward/mean": 1.09375, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.3641607165336609, "rewards/ngram_similarity_reward/std": 0.346956342458725, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 976.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 614.21875, "completions/mean_terminated_length": 614.21875, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.15126426493622736, "frac_reward_zero_std": 0.0, "grad_norm": 0.03559010848402977, "learning_rate": 4.992140477301827e-06, "loss": -0.0071, "num_tokens": 54065272.0, "reward": 2.9556291103363037, "reward_std": 1.1817878484725952, "rewards/accuracy_reward/mean": 2.453125, "rewards/accuracy_reward/std": 3.077979803085327, "rewards/ngram_similarity_reward/mean": 0.5025041103363037, "rewards/ngram_similarity_reward/std": 0.2608015537261963, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 503.125, "completions/mean_terminated_length": 503.125, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.15171179234728127, "frac_reward_zero_std": 0.0, "grad_norm": 0.04555033519864082, "learning_rate": 4.992000838037512e-06, "loss": 0.0097, "num_tokens": 54187600.0, "reward": 3.294914722442627, "reward_std": 0.7349349856376648, "rewards/accuracy_reward/mean": 2.78125, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.5136648416519165, "rewards/ngram_similarity_reward/std": 0.34142234921455383, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 474.40625, "completions/mean_terminated_length": 474.40625, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.1521593197583352, "frac_reward_zero_std": 0.0, "grad_norm": 0.04681862145662308, "learning_rate": 4.9918599714031625e-06, "loss": -0.0015, "num_tokens": 54336842.0, "reward": 1.9993470907211304, "reward_std": 1.2931156158447266, "rewards/accuracy_reward/mean": 1.5, "rewards/accuracy_reward/std": 3.039423704147339, "rewards/ngram_similarity_reward/mean": 0.4993470013141632, "rewards/ngram_similarity_reward/std": 0.2642381191253662, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 439.796875, "completions/mean_terminated_length": 439.796875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.15260684716938913, "frac_reward_zero_std": 0.0, "grad_norm": 0.045354828238487244, "learning_rate": 4.991717877475893e-06, "loss": -0.0309, "num_tokens": 54471741.0, "reward": 4.207108497619629, "reward_std": 1.2334591150283813, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.20710866153240204, "rewards/ngram_similarity_reward/std": 0.11993768811225891, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 477.921875, "completions/mean_terminated_length": 477.921875, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.15305437458044305, "frac_reward_zero_std": 0.0, "grad_norm": 0.04923330619931221, "learning_rate": 4.991574556333492e-06, "loss": 0.0041, "num_tokens": 54613128.0, "reward": 3.2489047050476074, "reward_std": 1.4373531341552734, "rewards/accuracy_reward/mean": 3.046875, "rewards/accuracy_reward/std": 2.991680145263672, "rewards/ngram_similarity_reward/mean": 0.20202943682670593, "rewards/ngram_similarity_reward/std": 0.15467119216918945, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 797.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 476.03125, "completions/mean_terminated_length": 476.03125, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.153501901991497, "frac_reward_zero_std": 0.0, "grad_norm": 0.04791298136115074, "learning_rate": 4.991430008054422e-06, "loss": 0.0334, "num_tokens": 54802138.0, "reward": 5.949857711791992, "reward_std": 0.7111374139785767, "rewards/accuracy_reward/mean": 5.296875, "rewards/accuracy_reward/std": 1.1433686017990112, "rewards/ngram_similarity_reward/mean": 0.6529824733734131, "rewards/ngram_similarity_reward/std": 0.31718015670776367, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 939.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 564.125, "completions/mean_terminated_length": 564.125, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.1539494294025509, "frac_reward_zero_std": 0.0, "grad_norm": 0.04487808421254158, "learning_rate": 4.9912842327178125e-06, "loss": 0.0061, "num_tokens": 54978162.0, "reward": 3.2296833992004395, "reward_std": 1.9352624416351318, "rewards/accuracy_reward/mean": 2.75, "rewards/accuracy_reward/std": 3.0498504638671875, "rewards/ngram_similarity_reward/mean": 0.47968345880508423, "rewards/ngram_similarity_reward/std": 0.36490052938461304, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 469.765625, "completions/mean_terminated_length": 469.765625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.15439695681360482, "frac_reward_zero_std": 0.0, "grad_norm": 0.050543803721666336, "learning_rate": 4.991137230403469e-06, "loss": 0.0272, "num_tokens": 55135347.0, "reward": 4.8638715744018555, "reward_std": 1.9317225217819214, "rewards/accuracy_reward/mean": 4.28125, "rewards/accuracy_reward/std": 2.4330317974090576, "rewards/ngram_similarity_reward/mean": 0.5826215744018555, "rewards/ngram_similarity_reward/std": 0.31985583901405334, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 469.375, "completions/mean_terminated_length": 469.375, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.15484448422465877, "frac_reward_zero_std": 0.0, "grad_norm": 0.04796312749385834, "learning_rate": 4.990989001191866e-06, "loss": -0.0286, "num_tokens": 55284347.0, "reward": 0.8435247540473938, "reward_std": 1.1935805082321167, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 2.56652569770813, "rewards/ngram_similarity_reward/mean": 0.32789987325668335, "rewards/ngram_similarity_reward/std": 0.3246069550514221, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 428.953125, "completions/mean_terminated_length": 428.953125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.15529201163571268, "frac_reward_zero_std": 0.0, "grad_norm": 0.05468254163861275, "learning_rate": 4.990839545164152e-06, "loss": -0.014, "num_tokens": 55443336.0, "reward": 2.6985256671905518, "reward_std": 0.7273306250572205, "rewards/accuracy_reward/mean": 2.25, "rewards/accuracy_reward/std": 3.0860671997070312, "rewards/ngram_similarity_reward/mean": 0.44852563738822937, "rewards/ngram_similarity_reward/std": 0.2961733043193817, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 429.59375, "completions/mean_terminated_length": 429.59375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.1557395390467666, "frac_reward_zero_std": 0.0, "grad_norm": 0.052619464695453644, "learning_rate": 4.990688862402145e-06, "loss": -0.0551, "num_tokens": 55608798.0, "reward": 2.735952138900757, "reward_std": 1.4438188076019287, "rewards/accuracy_reward/mean": 2.21875, "rewards/accuracy_reward/std": 3.119161367416382, "rewards/ngram_similarity_reward/mean": 0.5172022581100464, "rewards/ngram_similarity_reward/std": 0.3857964873313904, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 844.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 465.421875, "completions/mean_terminated_length": 465.421875, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.15618706645782054, "frac_reward_zero_std": 0.0, "grad_norm": 0.05453364923596382, "learning_rate": 4.990536952988335e-06, "loss": -0.0107, "num_tokens": 55755113.0, "reward": 2.8205485343933105, "reward_std": 0.9628164768218994, "rewards/accuracy_reward/mean": 2.46875, "rewards/accuracy_reward/std": 3.06007981300354, "rewards/ngram_similarity_reward/mean": 0.35179853439331055, "rewards/ngram_similarity_reward/std": 0.2584626376628876, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 432.296875, "completions/mean_terminated_length": 432.296875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.15663459386887446, "frac_reward_zero_std": 0.0, "grad_norm": 0.06002373620867729, "learning_rate": 4.990383817005885e-06, "loss": 0.0612, "num_tokens": 55985580.0, "reward": 4.117739677429199, "reward_std": 0.9441288709640503, "rewards/accuracy_reward/mean": 3.578125, "rewards/accuracy_reward/std": 2.6083180904388428, "rewards/ngram_similarity_reward/mean": 0.5396143794059753, "rewards/ngram_similarity_reward/std": 0.40891844034194946, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 469.953125, "completions/mean_terminated_length": 469.953125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.1570821212799284, "frac_reward_zero_std": 0.0, "grad_norm": 0.04758965224027634, "learning_rate": 4.990229454538626e-06, "loss": -0.0109, "num_tokens": 56161241.0, "reward": 4.118064880371094, "reward_std": 1.6833152770996094, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.4930647909641266, "rewards/ngram_similarity_reward/std": 0.3592028319835663, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 475.71875, "completions/mean_terminated_length": 475.71875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.15752964869098232, "frac_reward_zero_std": 0.0, "grad_norm": 0.05025568604469299, "learning_rate": 4.990073865671067e-06, "loss": 0.0062, "num_tokens": 56406215.0, "reward": 4.09308385848999, "reward_std": 1.8486790657043457, "rewards/accuracy_reward/mean": 3.78125, "rewards/accuracy_reward/std": 2.7744226455688477, "rewards/ngram_similarity_reward/mean": 0.3118337392807007, "rewards/ngram_similarity_reward/std": 0.18081018328666687, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 848.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 634.390625, "completions/mean_terminated_length": 634.390625, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 0.15797717610203624, "frac_reward_zero_std": 0.0, "grad_norm": 0.03872627764940262, "learning_rate": 4.989917050488381e-06, "loss": 0.0052, "num_tokens": 56531936.0, "reward": 2.959627628326416, "reward_std": 0.1740277260541916, "rewards/accuracy_reward/mean": 2.46875, "rewards/accuracy_reward/std": 3.06007981300354, "rewards/ngram_similarity_reward/mean": 0.4908774793148041, "rewards/ngram_similarity_reward/std": 0.3493154048919678, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 453.09375, "completions/mean_terminated_length": 453.09375, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.15842470351309018, "frac_reward_zero_std": 0.0, "grad_norm": 0.05340864509344101, "learning_rate": 4.989759009076415e-06, "loss": 0.0086, "num_tokens": 56691830.0, "reward": 4.392290115356445, "reward_std": 0.4906888008117676, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.4860401153564453, "rewards/ngram_similarity_reward/std": 0.38415634632110596, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1085.0, "completions/max_terminated_length": 1085.0, "completions/mean_length": 537.015625, "completions/mean_terminated_length": 537.015625, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.1588722309241441, "frac_reward_zero_std": 0.0, "grad_norm": 0.04268348962068558, "learning_rate": 4.98959974152169e-06, "loss": 0.0337, "num_tokens": 56853303.0, "reward": 4.8049211502075195, "reward_std": 1.7453646659851074, "rewards/accuracy_reward/mean": 4.15625, "rewards/accuracy_reward/std": 2.564833879470825, "rewards/ngram_similarity_reward/mean": 0.6486713886260986, "rewards/ngram_similarity_reward/std": 0.3948211073875427, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 466.03125, "completions/mean_terminated_length": 466.03125, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.15931975833519804, "frac_reward_zero_std": 0.0, "grad_norm": 0.043552812188863754, "learning_rate": 4.9894392479113945e-06, "loss": 0.0055, "num_tokens": 57025721.0, "reward": 3.4975011348724365, "reward_std": 2.0453078746795654, "rewards/accuracy_reward/mean": 2.953125, "rewards/accuracy_reward/std": 3.0075550079345703, "rewards/ngram_similarity_reward/mean": 0.5443758964538574, "rewards/ngram_similarity_reward/std": 0.3401655852794647, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 406.34375, "completions/mean_terminated_length": 406.34375, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.15976728574625196, "frac_reward_zero_std": 0.0, "grad_norm": 0.0548367016017437, "learning_rate": 4.989277528333392e-06, "loss": 0.0039, "num_tokens": 57154383.0, "reward": 3.869058609008789, "reward_std": 2.3097217082977295, "rewards/accuracy_reward/mean": 3.421875, "rewards/accuracy_reward/std": 2.896657705307007, "rewards/ngram_similarity_reward/mean": 0.4471837282180786, "rewards/ngram_similarity_reward/std": 0.3531920909881592, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 499.203125, "completions/mean_terminated_length": 474.61907958984375, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.16021481315730587, "frac_reward_zero_std": 0.0, "grad_norm": 0.060244787484407425, "learning_rate": 4.989114582876212e-06, "loss": 0.0195, "num_tokens": 57300668.0, "reward": 3.5350584983825684, "reward_std": 1.4591851234436035, "rewards/accuracy_reward/mean": 3.0625, "rewards/accuracy_reward/std": 3.080275297164917, "rewards/ngram_similarity_reward/mean": 0.47255846858024597, "rewards/ngram_similarity_reward/std": 0.33570200204849243, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 500.859375, "completions/mean_terminated_length": 500.859375, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.16066234056835982, "frac_reward_zero_std": 0.0, "grad_norm": 0.05069715902209282, "learning_rate": 4.98895041162906e-06, "loss": 0.0158, "num_tokens": 57512851.0, "reward": 4.281793594360352, "reward_std": 0.7293245792388916, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.5630437731742859, "rewards/ngram_similarity_reward/std": 0.38598915934562683, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 431.0, "completions/mean_terminated_length": 431.0, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.16110986797941373, "frac_reward_zero_std": 0.0, "grad_norm": 0.04762047156691551, "learning_rate": 4.9887850146818095e-06, "loss": 0.0561, "num_tokens": 57691923.0, "reward": 4.7552032470703125, "reward_std": 2.4422922134399414, "rewards/accuracy_reward/mean": 3.96875, "rewards/accuracy_reward/std": 2.6783599853515625, "rewards/ngram_similarity_reward/mean": 0.7864532470703125, "rewards/ngram_similarity_reward/std": 0.36044836044311523, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 414.375, "completions/mean_terminated_length": 414.375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.16155739539046768, "frac_reward_zero_std": 0.0, "grad_norm": 0.057258009910583496, "learning_rate": 4.988618392125007e-06, "loss": -0.0112, "num_tokens": 57855067.0, "reward": 2.3432412147521973, "reward_std": 1.5988247394561768, "rewards/accuracy_reward/mean": 1.78125, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.5619911551475525, "rewards/ngram_similarity_reward/std": 0.34821024537086487, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 477.6875, "completions/mean_terminated_length": 477.6875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.1620049228015216, "frac_reward_zero_std": 0.0, "grad_norm": 0.04392610117793083, "learning_rate": 4.988450544049869e-06, "loss": 0.0179, "num_tokens": 57988775.0, "reward": 2.244896650314331, "reward_std": 1.298945426940918, "rewards/accuracy_reward/mean": 1.71875, "rewards/accuracy_reward/std": 2.9572014808654785, "rewards/ngram_similarity_reward/mean": 0.5261467695236206, "rewards/ngram_similarity_reward/std": 0.33598366379737854, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 752.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 495.40625, "completions/mean_terminated_length": 495.40625, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.1624524502125755, "frac_reward_zero_std": 0.0, "grad_norm": 0.048559024930000305, "learning_rate": 4.988281470548282e-06, "loss": 0.0189, "num_tokens": 58112769.0, "reward": 4.195694923400879, "reward_std": 0.6702969670295715, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.38319528102874756, "rewards/ngram_similarity_reward/std": 0.34498733282089233, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 375.625, "completions/mean_terminated_length": 375.625, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.16289997762362945, "frac_reward_zero_std": 0.0, "grad_norm": 0.06379535794258118, "learning_rate": 4.988111171712804e-06, "loss": -0.0161, "num_tokens": 58332233.0, "reward": 2.30958890914917, "reward_std": 2.4288597106933594, "rewards/accuracy_reward/mean": 2.0, "rewards/accuracy_reward/std": 3.0184617042541504, "rewards/ngram_similarity_reward/mean": 0.30958884954452515, "rewards/ngram_similarity_reward/std": 0.25692975521087646, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 418.890625, "completions/mean_terminated_length": 418.890625, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.16334750503468337, "frac_reward_zero_std": 0.0, "grad_norm": 0.054858047515153885, "learning_rate": 4.987939647636666e-06, "loss": -0.0046, "num_tokens": 58467634.0, "reward": 4.41879415512085, "reward_std": 0.28737539052963257, "rewards/accuracy_reward/mean": 3.9375, "rewards/accuracy_reward/std": 2.736438512802124, "rewards/ngram_similarity_reward/mean": 0.48129430413246155, "rewards/ngram_similarity_reward/std": 0.3488374948501587, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 507.265625, "completions/mean_terminated_length": 507.265625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.16379503244573732, "frac_reward_zero_std": 0.0, "grad_norm": 0.05520766228437424, "learning_rate": 4.987766898413766e-06, "loss": 0.0402, "num_tokens": 58667411.0, "reward": 3.7660741806030273, "reward_std": 2.487922430038452, "rewards/accuracy_reward/mean": 3.15625, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.6098242402076721, "rewards/ngram_similarity_reward/std": 0.3422655463218689, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 491.25, "completions/mean_terminated_length": 491.25, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.16424255985679123, "frac_reward_zero_std": 0.0, "grad_norm": 0.04897642135620117, "learning_rate": 4.987592924138676e-06, "loss": 0.0341, "num_tokens": 58819587.0, "reward": 1.3902175426483154, "reward_std": 0.9953027963638306, "rewards/accuracy_reward/mean": 0.890625, "rewards/accuracy_reward/std": 2.6998953819274902, "rewards/ngram_similarity_reward/mean": 0.4995926022529602, "rewards/ngram_similarity_reward/std": 0.3933001160621643, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 458.4375, "completions/mean_terminated_length": 458.4375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.16469008726784515, "frac_reward_zero_std": 0.0, "grad_norm": 0.047864362597465515, "learning_rate": 4.987417724906636e-06, "loss": 0.0007, "num_tokens": 58973983.0, "reward": 2.1851534843444824, "reward_std": 1.8688929080963135, "rewards/accuracy_reward/mean": 1.640625, "rewards/accuracy_reward/std": 2.91611385345459, "rewards/ngram_similarity_reward/mean": 0.5445283055305481, "rewards/ngram_similarity_reward/std": 0.33005306124687195, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 529.015625, "completions/mean_terminated_length": 504.90478515625, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.1651376146788991, "frac_reward_zero_std": 0.0, "grad_norm": 0.05091036856174469, "learning_rate": 4.987241300813559e-06, "loss": -0.0226, "num_tokens": 59123152.0, "reward": 1.76096510887146, "reward_std": 1.6029728651046753, "rewards/accuracy_reward/mean": 1.40625, "rewards/accuracy_reward/std": 3.006441593170166, "rewards/ngram_similarity_reward/mean": 0.3547152280807495, "rewards/ngram_similarity_reward/std": 0.21333430707454681, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 485.15625, "completions/mean_terminated_length": 485.15625, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.165585142089953, "frac_reward_zero_std": 0.0, "grad_norm": 0.046371713280677795, "learning_rate": 4.987063651956025e-06, "loss": -0.0032, "num_tokens": 59267834.0, "reward": 2.320300579071045, "reward_std": 1.143520474433899, "rewards/accuracy_reward/mean": 2.03125, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.2890505790710449, "rewards/ngram_similarity_reward/std": 0.3301846385002136, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 385.5, "completions/mean_terminated_length": 385.5, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.16603266950100692, "frac_reward_zero_std": 0.0, "grad_norm": 0.059546373784542084, "learning_rate": 4.98688477843129e-06, "loss": 0.016, "num_tokens": 59411946.0, "reward": 3.2313122749328613, "reward_std": 1.873366355895996, "rewards/accuracy_reward/mean": 2.734375, "rewards/accuracy_reward/std": 3.1760122776031494, "rewards/ngram_similarity_reward/mean": 0.4969370663166046, "rewards/ngram_similarity_reward/std": 0.2913515865802765, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 894.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 505.6875, "completions/mean_terminated_length": 505.6875, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.16648019691206087, "frac_reward_zero_std": 0.0, "grad_norm": 0.045408546924591064, "learning_rate": 4.986704680337274e-06, "loss": -0.0131, "num_tokens": 59589190.0, "reward": 1.599818229675293, "reward_std": 0.12945838272571564, "rewards/accuracy_reward/mean": 0.984375, "rewards/accuracy_reward/std": 2.6306629180908203, "rewards/ngram_similarity_reward/mean": 0.6154431104660034, "rewards/ngram_similarity_reward/std": 0.3580954074859619, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 748.0, "completions/max_terminated_length": 748.0, "completions/mean_length": 529.296875, "completions/mean_terminated_length": 529.296875, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.16692772432311478, "frac_reward_zero_std": 0.0, "grad_norm": 0.04404851794242859, "learning_rate": 4.986523357772573e-06, "loss": -0.0016, "num_tokens": 59736329.0, "reward": 5.319042205810547, "reward_std": 0.9707709550857544, "rewards/accuracy_reward/mean": 4.953125, "rewards/accuracy_reward/std": 1.8934279680252075, "rewards/ngram_similarity_reward/mean": 0.36591705679893494, "rewards/ngram_similarity_reward/std": 0.2848031520843506, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 450.40625, "completions/mean_terminated_length": 450.40625, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.16737525173416873, "frac_reward_zero_std": 0.0, "grad_norm": 0.05363212525844574, "learning_rate": 4.9863408108364506e-06, "loss": 0.0066, "num_tokens": 59854755.0, "reward": 2.6478450298309326, "reward_std": 0.7685288190841675, "rewards/accuracy_reward/mean": 2.125, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.5228448510169983, "rewards/ngram_similarity_reward/std": 0.3450181782245636, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 462.640625, "completions/mean_terminated_length": 462.640625, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.16782277914522264, "frac_reward_zero_std": 0.0, "grad_norm": 0.04987025633454323, "learning_rate": 4.986157039628841e-06, "loss": 0.0063, "num_tokens": 60053868.0, "reward": 2.8956449031829834, "reward_std": 1.9078601598739624, "rewards/accuracy_reward/mean": 2.484375, "rewards/accuracy_reward/std": 3.0419929027557373, "rewards/ngram_similarity_reward/mean": 0.4112697243690491, "rewards/ngram_similarity_reward/std": 0.206552654504776, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 867.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 444.359375, "completions/mean_terminated_length": 444.359375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.16827030655627656, "frac_reward_zero_std": 0.25, "grad_norm": 0.04339223727583885, "learning_rate": 4.9859720442503465e-06, "loss": -0.0199, "num_tokens": 60211587.0, "reward": 2.1971282958984375, "reward_std": 1.369988203048706, "rewards/accuracy_reward/mean": 1.625, "rewards/accuracy_reward/std": 3.0420336723327637, "rewards/ngram_similarity_reward/mean": 0.572128415107727, "rewards/ngram_similarity_reward/std": 0.443522572517395, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 451.453125, "completions/mean_terminated_length": 451.453125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.1687178339673305, "frac_reward_zero_std": 0.0, "grad_norm": 0.049212515354156494, "learning_rate": 4.985785824802244e-06, "loss": -0.0071, "num_tokens": 60375920.0, "reward": 2.760453701019287, "reward_std": 1.8617186546325684, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.26045358180999756, "rewards/ngram_similarity_reward/std": 0.17604276537895203, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 588.421875, "completions/mean_terminated_length": 588.421875, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.16916536137838442, "frac_reward_zero_std": 0.0, "grad_norm": 0.04697156697511673, "learning_rate": 4.985598381386479e-06, "loss": 0.0357, "num_tokens": 60516331.0, "reward": 4.39042854309082, "reward_std": 0.49993008375167847, "rewards/accuracy_reward/mean": 3.890625, "rewards/accuracy_reward/std": 2.6998953819274902, "rewards/ngram_similarity_reward/mean": 0.49980348348617554, "rewards/ngram_similarity_reward/std": 0.2500842213630676, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 433.890625, "completions/mean_terminated_length": 433.890625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.16961288878943837, "frac_reward_zero_std": 0.0, "grad_norm": 0.05765402689576149, "learning_rate": 4.985409714105665e-06, "loss": -0.0063, "num_tokens": 60666036.0, "reward": 4.322384834289551, "reward_std": 1.326669454574585, "rewards/accuracy_reward/mean": 3.859375, "rewards/accuracy_reward/std": 2.7566208839416504, "rewards/ngram_similarity_reward/mean": 0.46300965547561646, "rewards/ngram_similarity_reward/std": 0.18290044367313385, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 867.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 538.5, "completions/mean_terminated_length": 538.5, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.17006041620049228, "frac_reward_zero_std": 0.0, "grad_norm": 0.04762609675526619, "learning_rate": 4.985219823063086e-06, "loss": -0.0093, "num_tokens": 60784148.0, "reward": 1.2124340534210205, "reward_std": 1.3166241645812988, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 2.5, "rewards/ngram_similarity_reward/mean": 0.3999340534210205, "rewards/ngram_similarity_reward/std": 0.2689370810985565, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 446.0625, "completions/mean_terminated_length": 446.0625, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.1705079436115462, "frac_reward_zero_std": 0.0, "grad_norm": 0.05093666911125183, "learning_rate": 4.985028708362697e-06, "loss": -0.005, "num_tokens": 60923352.0, "reward": 4.361514091491699, "reward_std": 0.5552492737770081, "rewards/accuracy_reward/mean": 3.859375, "rewards/accuracy_reward/std": 2.7566208839416504, "rewards/ngram_similarity_reward/mean": 0.5021390318870544, "rewards/ngram_similarity_reward/std": 0.35762351751327515, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 460.96875, "completions/mean_terminated_length": 460.96875, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.17095547102260014, "frac_reward_zero_std": 0.0, "grad_norm": 0.050654876977205276, "learning_rate": 4.984836370109124e-06, "loss": 0.0272, "num_tokens": 61069670.0, "reward": 3.0200891494750977, "reward_std": 0.8383454084396362, "rewards/accuracy_reward/mean": 2.421875, "rewards/accuracy_reward/std": 3.113231897354126, "rewards/ngram_similarity_reward/mean": 0.5982141494750977, "rewards/ngram_similarity_reward/std": 0.4427201449871063, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 472.90625, "completions/mean_terminated_length": 472.90625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.17140299843365406, "frac_reward_zero_std": 0.0, "grad_norm": 0.05232694745063782, "learning_rate": 4.98464280840766e-06, "loss": -0.0054, "num_tokens": 61196592.0, "reward": 4.779727935791016, "reward_std": 0.6334294080734253, "rewards/accuracy_reward/mean": 4.1875, "rewards/accuracy_reward/std": 2.5, "rewards/ngram_similarity_reward/mean": 0.5922282934188843, "rewards/ngram_similarity_reward/std": 0.34544986486434937, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 408.15625, "completions/mean_terminated_length": 408.15625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.171850525844708, "frac_reward_zero_std": 0.0, "grad_norm": 0.06515532732009888, "learning_rate": 4.98444802336427e-06, "loss": 0.0095, "num_tokens": 61394442.0, "reward": 3.603017807006836, "reward_std": 1.3876564502716064, "rewards/accuracy_reward/mean": 3.328125, "rewards/accuracy_reward/std": 2.9252848625183105, "rewards/ngram_similarity_reward/mean": 0.274892657995224, "rewards/ngram_similarity_reward/std": 0.20359322428703308, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 446.3125, "completions/mean_terminated_length": 446.3125, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.17229805325576192, "frac_reward_zero_std": 0.0, "grad_norm": 0.05600956082344055, "learning_rate": 4.984252015085588e-06, "loss": 0.0006, "num_tokens": 61581278.0, "reward": 1.1497323513031006, "reward_std": 2.092327356338501, "rewards/accuracy_reward/mean": 0.90625, "rewards/accuracy_reward/std": 2.688710927963257, "rewards/ngram_similarity_reward/mean": 0.24348226189613342, "rewards/ngram_similarity_reward/std": 0.1237434595823288, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 478.921875, "completions/mean_terminated_length": 478.921875, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.17274558066681583, "frac_reward_zero_std": 0.0, "grad_norm": 0.04640120267868042, "learning_rate": 4.9840547836789175e-06, "loss": -0.0179, "num_tokens": 61737433.0, "reward": 6.133259296417236, "reward_std": 0.5081021189689636, "rewards/accuracy_reward/mean": 5.40625, "rewards/accuracy_reward/std": 0.7500000596046448, "rewards/ngram_similarity_reward/mean": 0.7270093560218811, "rewards/ngram_similarity_reward/std": 0.27775880694389343, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 521.71875, "completions/mean_terminated_length": 521.71875, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.17319310807786978, "frac_reward_zero_std": 0.0, "grad_norm": 0.044407736510038376, "learning_rate": 4.9838563292522304e-06, "loss": 0.0146, "num_tokens": 61899751.0, "reward": 3.686185359954834, "reward_std": 2.0083577632904053, "rewards/accuracy_reward/mean": 3.046875, "rewards/accuracy_reward/std": 2.991680145263672, "rewards/ngram_similarity_reward/mean": 0.6393104791641235, "rewards/ngram_similarity_reward/std": 0.3743637204170227, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 691.0, "completions/max_terminated_length": 691.0, "completions/mean_length": 469.390625, "completions/mean_terminated_length": 469.390625, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.1736406354889237, "frac_reward_zero_std": 0.0, "grad_norm": 0.045903291553258896, "learning_rate": 4.983656651914172e-06, "loss": -0.0037, "num_tokens": 62064528.0, "reward": 3.4533281326293945, "reward_std": 2.146829843521118, "rewards/accuracy_reward/mean": 2.921875, "rewards/accuracy_reward/std": 3.0488338470458984, "rewards/ngram_similarity_reward/mean": 0.5314529538154602, "rewards/ngram_similarity_reward/std": 0.30242854356765747, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 518.015625, "completions/mean_terminated_length": 518.015625, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.1740881628999776, "frac_reward_zero_std": 0.0, "grad_norm": 0.046720489859580994, "learning_rate": 4.983455751774051e-06, "loss": -0.008, "num_tokens": 62232193.0, "reward": 2.3623266220092773, "reward_std": 1.4893290996551514, "rewards/accuracy_reward/mean": 1.890625, "rewards/accuracy_reward/std": 3.0164480209350586, "rewards/ngram_similarity_reward/mean": 0.47170156240463257, "rewards/ngram_similarity_reward/std": 0.19742360711097717, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 616.84375, "completions/mean_terminated_length": 594.1270141601562, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.17453569031103155, "frac_reward_zero_std": 0.0, "grad_norm": 0.041487254202365875, "learning_rate": 4.983253628941852e-06, "loss": 0.0107, "num_tokens": 62382647.0, "reward": 0.36744025349617004, "reward_std": 1.2747513055801392, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 1.790558934211731, "rewards/ngram_similarity_reward/mean": 0.35181528329849243, "rewards/ngram_similarity_reward/std": 0.24288035929203033, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 403.078125, "completions/mean_terminated_length": 403.078125, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.17498321772208547, "frac_reward_zero_std": 0.0, "grad_norm": 0.05197535827755928, "learning_rate": 4.983050283528224e-06, "loss": -0.0025, "num_tokens": 62521292.0, "reward": 5.143540382385254, "reward_std": 1.4719245433807373, "rewards/accuracy_reward/mean": 4.5625, "rewards/accuracy_reward/std": 2.195775270462036, "rewards/ngram_similarity_reward/mean": 0.5810403823852539, "rewards/ngram_similarity_reward/std": 0.38260239362716675, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 339.984375, "completions/mean_terminated_length": 339.984375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.17543074513313942, "frac_reward_zero_std": 0.0, "grad_norm": 0.07508906722068787, "learning_rate": 4.982845715644489e-06, "loss": -0.008, "num_tokens": 62662379.0, "reward": 2.900442600250244, "reward_std": 0.5704430341720581, "rewards/accuracy_reward/mean": 2.578125, "rewards/accuracy_reward/std": 3.0410144329071045, "rewards/ngram_similarity_reward/mean": 0.32231757044792175, "rewards/ngram_similarity_reward/std": 0.35339492559432983, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 451.125, "completions/mean_terminated_length": 451.125, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.17587827254419333, "frac_reward_zero_std": 0.0, "grad_norm": 0.057737164199352264, "learning_rate": 4.982639925402636e-06, "loss": -0.0154, "num_tokens": 62806643.0, "reward": 4.112510681152344, "reward_std": 0.8752514123916626, "rewards/accuracy_reward/mean": 3.78125, "rewards/accuracy_reward/std": 2.7744226455688477, "rewards/ngram_similarity_reward/mean": 0.3312610387802124, "rewards/ngram_similarity_reward/std": 0.19397112727165222, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 377.046875, "completions/mean_terminated_length": 377.046875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.17632579995524725, "frac_reward_zero_std": 0.25, "grad_norm": 0.0579199455678463, "learning_rate": 4.982432912915321e-06, "loss": 0.0082, "num_tokens": 62972886.0, "reward": 3.8911335468292236, "reward_std": 1.3057481050491333, "rewards/accuracy_reward/mean": 3.40625, "rewards/accuracy_reward/std": 2.920745372772217, "rewards/ngram_similarity_reward/mean": 0.4848836660385132, "rewards/ngram_similarity_reward/std": 0.46673062443733215, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 411.46875, "completions/mean_terminated_length": 411.46875, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.1767733273663012, "frac_reward_zero_std": 0.0, "grad_norm": 0.06127110496163368, "learning_rate": 4.982224678295876e-06, "loss": 0.0187, "num_tokens": 63124420.0, "reward": 1.1061592102050781, "reward_std": 0.47718849778175354, "rewards/accuracy_reward/mean": 0.859375, "rewards/accuracy_reward/std": 2.8416807651519775, "rewards/ngram_similarity_reward/mean": 0.24678421020507812, "rewards/ngram_similarity_reward/std": 0.08838558197021484, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 373.90625, "completions/mean_terminated_length": 373.90625, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.1772208547773551, "frac_reward_zero_std": 0.0, "grad_norm": 0.05738307535648346, "learning_rate": 4.982015221658294e-06, "loss": 0.0137, "num_tokens": 63244862.0, "reward": 4.085636138916016, "reward_std": 0.895679235458374, "rewards/accuracy_reward/mean": 3.671875, "rewards/accuracy_reward/std": 2.8427278995513916, "rewards/ngram_similarity_reward/mean": 0.4137610197067261, "rewards/ngram_similarity_reward/std": 0.353694349527359, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 462.734375, "completions/mean_terminated_length": 462.734375, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.17766838218840905, "frac_reward_zero_std": 0.0, "grad_norm": 0.05412351340055466, "learning_rate": 4.981804543117243e-06, "loss": 0.0322, "num_tokens": 63466141.0, "reward": 2.1925134658813477, "reward_std": 1.0302202701568604, "rewards/accuracy_reward/mean": 1.671875, "rewards/accuracy_reward/std": 3.0002894401550293, "rewards/ngram_similarity_reward/mean": 0.5206387042999268, "rewards/ngram_similarity_reward/std": 0.35803091526031494, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 448.421875, "completions/mean_terminated_length": 448.421875, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.17811590959946297, "frac_reward_zero_std": 0.0, "grad_norm": 0.05046705901622772, "learning_rate": 4.9815926427880575e-06, "loss": 0.0316, "num_tokens": 63606904.0, "reward": 1.6938735246658325, "reward_std": 1.4424854516983032, "rewards/accuracy_reward/mean": 1.28125, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.41262343525886536, "rewards/ngram_similarity_reward/std": 0.31365764141082764, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 449.6875, "completions/mean_terminated_length": 424.3174743652344, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.17856343701051688, "frac_reward_zero_std": 0.0, "grad_norm": 0.0631108209490776, "learning_rate": 4.981379520786742e-06, "loss": -0.0484, "num_tokens": 63857620.0, "reward": 1.294722080230713, "reward_std": 1.6329095363616943, "rewards/accuracy_reward/mean": 0.921875, "rewards/accuracy_reward/std": 2.6773874759674072, "rewards/ngram_similarity_reward/mean": 0.3728471100330353, "rewards/ngram_similarity_reward/std": 0.2743929624557495, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 319.984375, "completions/mean_terminated_length": 319.984375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.17901096442157083, "frac_reward_zero_std": 0.0, "grad_norm": 0.07036460191011429, "learning_rate": 4.981165177229967e-06, "loss": -0.0157, "num_tokens": 64122227.0, "reward": 4.307033538818359, "reward_std": 0.4799138009548187, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.4007837772369385, "rewards/ngram_similarity_reward/std": 0.2723061740398407, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 864.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 531.5625, "completions/mean_terminated_length": 531.5625, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.17945849183262474, "frac_reward_zero_std": 0.0, "grad_norm": 0.04598090052604675, "learning_rate": 4.980949612235073e-06, "loss": -0.0366, "num_tokens": 64286343.0, "reward": 6.062546730041504, "reward_std": 0.4906230568885803, "rewards/accuracy_reward/mean": 5.40625, "rewards/accuracy_reward/std": 0.7500000596046448, "rewards/ngram_similarity_reward/mean": 0.6562970876693726, "rewards/ngram_similarity_reward/std": 0.25769153237342834, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 446.140625, "completions/mean_terminated_length": 446.140625, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.1799060192436787, "frac_reward_zero_std": 0.0, "grad_norm": 0.05102456733584404, "learning_rate": 4.980732825920072e-06, "loss": 0.0096, "num_tokens": 64425040.0, "reward": 4.750601768493652, "reward_std": 1.248483419418335, "rewards/accuracy_reward/mean": 4.359375, "rewards/accuracy_reward/std": 2.3962087631225586, "rewards/ngram_similarity_reward/mean": 0.39122653007507324, "rewards/ngram_similarity_reward/std": 0.261292427778244, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 522.59375, "completions/mean_terminated_length": 522.59375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.1803535466547326, "frac_reward_zero_std": 0.0, "grad_norm": 0.04561065137386322, "learning_rate": 4.980514818403642e-06, "loss": -0.0013, "num_tokens": 64586262.0, "reward": 3.251201629638672, "reward_std": 1.5547279119491577, "rewards/accuracy_reward/mean": 2.671875, "rewards/accuracy_reward/std": 3.037097215652466, "rewards/ngram_similarity_reward/mean": 0.579326868057251, "rewards/ngram_similarity_reward/std": 0.26619914174079895, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 531.46875, "completions/mean_terminated_length": 531.46875, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.18080107406578652, "frac_reward_zero_std": 0.0, "grad_norm": 0.04496293514966965, "learning_rate": 4.980295589805129e-06, "loss": 0.0011, "num_tokens": 64718356.0, "reward": 4.318756580352783, "reward_std": 0.5536075830459595, "rewards/accuracy_reward/mean": 3.890625, "rewards/accuracy_reward/std": 2.6998953819274902, "rewards/ngram_similarity_reward/mean": 0.42813175916671753, "rewards/ngram_similarity_reward/std": 0.35951486229896545, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 752.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 465.296875, "completions/mean_terminated_length": 465.296875, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.18124860147684047, "frac_reward_zero_std": 0.0, "grad_norm": 0.051899295300245285, "learning_rate": 4.980075140244548e-06, "loss": 0.0078, "num_tokens": 64855735.0, "reward": 3.4069631099700928, "reward_std": 0.9889413118362427, "rewards/accuracy_reward/mean": 3.03125, "rewards/accuracy_reward/std": 3.0130341053009033, "rewards/ngram_similarity_reward/mean": 0.37571316957473755, "rewards/ngram_similarity_reward/std": 0.3223586082458496, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 495.046875, "completions/mean_terminated_length": 495.046875, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.18169612888789438, "frac_reward_zero_std": 0.0, "grad_norm": 0.0473325289785862, "learning_rate": 4.979853469842584e-06, "loss": 0.0247, "num_tokens": 65002106.0, "reward": 3.3111226558685303, "reward_std": 0.7830137610435486, "rewards/accuracy_reward/mean": 2.78125, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.5298726558685303, "rewards/ngram_similarity_reward/std": 0.29405301809310913, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 481.140625, "completions/mean_terminated_length": 481.140625, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.1821436562989483, "frac_reward_zero_std": 0.0, "grad_norm": 0.046513911336660385, "learning_rate": 4.97963057872059e-06, "loss": 0.0184, "num_tokens": 65127523.0, "reward": 2.0190467834472656, "reward_std": 0.8510860800743103, "rewards/accuracy_reward/mean": 1.75, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.2690466642379761, "rewards/ngram_similarity_reward/std": 0.2904336154460907, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 418.40625, "completions/mean_terminated_length": 418.40625, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.18259118371000224, "frac_reward_zero_std": 0.0, "grad_norm": 0.05490051954984665, "learning_rate": 4.979406467000583e-06, "loss": -0.0138, "num_tokens": 65250397.0, "reward": 2.7550859451293945, "reward_std": 0.2528682351112366, "rewards/accuracy_reward/mean": 2.34375, "rewards/accuracy_reward/std": 3.1983067989349365, "rewards/ngram_similarity_reward/mean": 0.41133588552474976, "rewards/ngram_similarity_reward/std": 0.3694441020488739, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 513.296875, "completions/mean_terminated_length": 513.296875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.18303871112105616, "frac_reward_zero_std": 0.0, "grad_norm": 0.0506632924079895, "learning_rate": 4.979181134805255e-06, "loss": -0.0235, "num_tokens": 65420032.0, "reward": 3.2668845653533936, "reward_std": 1.1849749088287354, "rewards/accuracy_reward/mean": 2.78125, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.4856344759464264, "rewards/ngram_similarity_reward/std": 0.29462864995002747, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 484.734375, "completions/mean_terminated_length": 484.734375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.1834862385321101, "frac_reward_zero_std": 0.0, "grad_norm": 0.04931401088833809, "learning_rate": 4.978954582257961e-06, "loss": -0.0443, "num_tokens": 65605263.0, "reward": 2.908878803253174, "reward_std": 1.4477934837341309, "rewards/accuracy_reward/mean": 2.4375, "rewards/accuracy_reward/std": 3.095695972442627, "rewards/ngram_similarity_reward/mean": 0.47137901186943054, "rewards/ngram_similarity_reward/std": 0.2872864902019501, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1052.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 544.03125, "completions/mean_terminated_length": 544.03125, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.18393376594316402, "frac_reward_zero_std": 0.0, "grad_norm": 0.0414864607155323, "learning_rate": 4.978726809482727e-06, "loss": 0.0662, "num_tokens": 65764945.0, "reward": 3.0413129329681396, "reward_std": 1.360729694366455, "rewards/accuracy_reward/mean": 2.453125, "rewards/accuracy_reward/std": 3.287444591522217, "rewards/ngram_similarity_reward/mean": 0.5881880521774292, "rewards/ngram_similarity_reward/std": 0.3917367160320282, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 467.8125, "completions/mean_terminated_length": 467.8125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.18438129335421793, "frac_reward_zero_std": 0.0, "grad_norm": 0.049397651106119156, "learning_rate": 4.978497816604244e-06, "loss": -0.0061, "num_tokens": 65935605.0, "reward": 4.930202484130859, "reward_std": 0.7822836637496948, "rewards/accuracy_reward/mean": 4.265625, "rewards/accuracy_reward/std": 2.467195510864258, "rewards/ngram_similarity_reward/mean": 0.6645776033401489, "rewards/ngram_similarity_reward/std": 0.3582763373851776, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 499.21875, "completions/mean_terminated_length": 499.21875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.18482882076527188, "frac_reward_zero_std": 0.0, "grad_norm": 0.05851361155509949, "learning_rate": 4.978267603747875e-06, "loss": 0.0516, "num_tokens": 66133683.0, "reward": 2.65552020072937, "reward_std": 1.2815545797348022, "rewards/accuracy_reward/mean": 2.21875, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.4367702901363373, "rewards/ngram_similarity_reward/std": 0.2824799120426178, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 353.453125, "completions/mean_terminated_length": 353.453125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.1852763481763258, "frac_reward_zero_std": 0.0, "grad_norm": 0.06710066646337509, "learning_rate": 4.9780361710396475e-06, "loss": -0.0202, "num_tokens": 66249888.0, "reward": 2.8284106254577637, "reward_std": 1.3892254829406738, "rewards/accuracy_reward/mean": 2.46875, "rewards/accuracy_reward/std": 3.06007981300354, "rewards/ngram_similarity_reward/mean": 0.35966062545776367, "rewards/ngram_similarity_reward/std": 0.2577352225780487, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 420.3125, "completions/mean_terminated_length": 420.3125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.18572387558737974, "frac_reward_zero_std": 0.0, "grad_norm": 0.05144178494811058, "learning_rate": 4.977803518606258e-06, "loss": -0.0241, "num_tokens": 66384324.0, "reward": 4.493481636047363, "reward_std": 1.0047578811645508, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.5872312188148499, "rewards/ngram_similarity_reward/std": 0.29360121488571167, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 418.375, "completions/mean_terminated_length": 418.375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.18617140299843365, "frac_reward_zero_std": 0.0, "grad_norm": 0.06043994054198265, "learning_rate": 4.977569646575071e-06, "loss": -0.0143, "num_tokens": 66561676.0, "reward": 4.20809268951416, "reward_std": 0.9526137709617615, "rewards/accuracy_reward/mean": 3.796875, "rewards/accuracy_reward/std": 2.746886730194092, "rewards/ngram_similarity_reward/mean": 0.4112175703048706, "rewards/ngram_similarity_reward/std": 0.2842460870742798, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 450.71875, "completions/mean_terminated_length": 450.71875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.18661893040948757, "frac_reward_zero_std": 0.0, "grad_norm": 0.04853370413184166, "learning_rate": 4.977334555074119e-06, "loss": -0.0126, "num_tokens": 66737594.0, "reward": 0.9017431735992432, "reward_std": 1.474316954612732, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 2.312781572341919, "rewards/ngram_similarity_reward/mean": 0.41736823320388794, "rewards/ngram_similarity_reward/std": 0.3200174868106842, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 335.046875, "completions/mean_terminated_length": 335.046875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.18706645782054152, "frac_reward_zero_std": 0.0, "grad_norm": 0.06876197457313538, "learning_rate": 4.977098244232099e-06, "loss": -0.013, "num_tokens": 66936701.0, "reward": 4.07749080657959, "reward_std": 0.8316932916641235, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.45249098539352417, "rewards/ngram_similarity_reward/std": 0.2823086380958557, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 461.6875, "completions/mean_terminated_length": 461.6875, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.18751398523159543, "frac_reward_zero_std": 0.0, "grad_norm": 0.05493154749274254, "learning_rate": 4.97686071417838e-06, "loss": 0.0447, "num_tokens": 67153945.0, "reward": 5.43075704574585, "reward_std": 1.6792662143707275, "rewards/accuracy_reward/mean": 4.921875, "rewards/accuracy_reward/std": 1.8153201341629028, "rewards/ngram_similarity_reward/mean": 0.5088820457458496, "rewards/ngram_similarity_reward/std": 0.3658638596534729, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1072.0, "completions/max_terminated_length": 1072.0, "completions/mean_length": 549.71875, "completions/mean_terminated_length": 549.71875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.18796151264264938, "frac_reward_zero_std": 0.0, "grad_norm": 0.04870909824967384, "learning_rate": 4.976621965042996e-06, "loss": -0.0049, "num_tokens": 67330359.0, "reward": 3.679755210876465, "reward_std": 1.5922231674194336, "rewards/accuracy_reward/mean": 2.953125, "rewards/accuracy_reward/std": 3.0075550079345703, "rewards/ngram_similarity_reward/mean": 0.7266303300857544, "rewards/ngram_similarity_reward/std": 0.3412902057170868, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 462.234375, "completions/mean_terminated_length": 462.234375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.1884090400537033, "frac_reward_zero_std": 0.0, "grad_norm": 0.056523796170949936, "learning_rate": 4.97638199695665e-06, "loss": -0.0166, "num_tokens": 67511190.0, "reward": 2.990081787109375, "reward_std": 2.6330084800720215, "rewards/accuracy_reward/mean": 2.296875, "rewards/accuracy_reward/std": 3.0351366996765137, "rewards/ngram_similarity_reward/mean": 0.693206787109375, "rewards/ngram_similarity_reward/std": 0.2941289246082306, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 462.171875, "completions/mean_terminated_length": 462.171875, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.1888565674647572, "frac_reward_zero_std": 0.0, "grad_norm": 0.05080768093466759, "learning_rate": 4.9761408100507094e-06, "loss": 0.0207, "num_tokens": 67662577.0, "reward": 4.520578861236572, "reward_std": 0.07399225234985352, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5205788016319275, "rewards/ngram_similarity_reward/std": 0.4589429497718811, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 475.40625, "completions/mean_terminated_length": 475.40625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.18930409487581115, "frac_reward_zero_std": 0.0, "grad_norm": 0.05515008419752121, "learning_rate": 4.97589840445721e-06, "loss": -0.0312, "num_tokens": 67851435.0, "reward": 0.9724314212799072, "reward_std": 1.1153570413589478, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 2.6751632690429688, "rewards/ngram_similarity_reward/mean": 0.425556480884552, "rewards/ngram_similarity_reward/std": 0.25544473528862, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 475.71875, "completions/mean_terminated_length": 475.71875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.18975162228686507, "frac_reward_zero_std": 0.0, "grad_norm": 0.048770152032375336, "learning_rate": 4.975654780308857e-06, "loss": -0.0423, "num_tokens": 67989177.0, "reward": 3.2835075855255127, "reward_std": 1.7055535316467285, "rewards/accuracy_reward/mean": 2.90625, "rewards/accuracy_reward/std": 3.069143772125244, "rewards/ngram_similarity_reward/mean": 0.37725764513015747, "rewards/ngram_similarity_reward/std": 0.29064175486564636, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 823.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 491.15625, "completions/mean_terminated_length": 491.15625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.19019914969791898, "frac_reward_zero_std": 0.25, "grad_norm": 0.04984172061085701, "learning_rate": 4.975409937739021e-06, "loss": 0.0371, "num_tokens": 68125651.0, "reward": 5.302239418029785, "reward_std": 0.8608360290527344, "rewards/accuracy_reward/mean": 4.65625, "rewards/accuracy_reward/std": 2.102294683456421, "rewards/ngram_similarity_reward/mean": 0.6459895372390747, "rewards/ngram_similarity_reward/std": 0.2360706925392151, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 480.828125, "completions/mean_terminated_length": 480.828125, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.19064667710897293, "frac_reward_zero_std": 0.0, "grad_norm": 0.04941911622881889, "learning_rate": 4.9751638768817385e-06, "loss": 0.0341, "num_tokens": 68249832.0, "reward": 2.1362762451171875, "reward_std": 1.6013157367706299, "rewards/accuracy_reward/mean": 1.640625, "rewards/accuracy_reward/std": 2.91611385345459, "rewards/ngram_similarity_reward/mean": 0.4956514239311218, "rewards/ngram_similarity_reward/std": 0.2930045425891876, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 375.46875, "completions/mean_terminated_length": 375.46875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.19109420452002684, "frac_reward_zero_std": 0.0, "grad_norm": 0.0572495236992836, "learning_rate": 4.974916597871714e-06, "loss": 0.0218, "num_tokens": 68406470.0, "reward": 0.5266842246055603, "reward_std": 1.5081195831298828, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 2.0653390884399414, "rewards/ngram_similarity_reward/mean": 0.3860591948032379, "rewards/ngram_similarity_reward/std": 0.25203803181648254, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 931.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 577.796875, "completions/mean_terminated_length": 577.796875, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.1915417319310808, "frac_reward_zero_std": 0.0, "grad_norm": 0.04633355140686035, "learning_rate": 4.97466810084432e-06, "loss": 0.0083, "num_tokens": 68562169.0, "reward": 3.6001086235046387, "reward_std": 1.9819128513336182, "rewards/accuracy_reward/mean": 2.96875, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.6313588619232178, "rewards/ngram_similarity_reward/std": 0.24967895448207855, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 432.90625, "completions/mean_terminated_length": 432.90625, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.1919892593421347, "frac_reward_zero_std": 0.0, "grad_norm": 0.04926011711359024, "learning_rate": 4.974418385935594e-06, "loss": 0.0302, "num_tokens": 68738451.0, "reward": 4.414224624633789, "reward_std": 0.6799229383468628, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.6017246246337891, "rewards/ngram_similarity_reward/std": 0.3715902268886566, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 490.4375, "completions/mean_terminated_length": 490.4375, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.19243678675318862, "frac_reward_zero_std": 0.0, "grad_norm": 0.051539380103349686, "learning_rate": 4.97416745328224e-06, "loss": 0.0126, "num_tokens": 68862127.0, "reward": 3.573207378387451, "reward_std": 0.9432869553565979, "rewards/accuracy_reward/mean": 3.140625, "rewards/accuracy_reward/std": 2.9727182388305664, "rewards/ngram_similarity_reward/mean": 0.43258219957351685, "rewards/ngram_similarity_reward/std": 0.22983446717262268, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 526.6875, "completions/mean_terminated_length": 526.6875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.19288431416424257, "frac_reward_zero_std": 0.0, "grad_norm": 0.04874693974852562, "learning_rate": 4.973915303021632e-06, "loss": 0.0028, "num_tokens": 68999963.0, "reward": 2.221597194671631, "reward_std": 1.8298258781433105, "rewards/accuracy_reward/mean": 1.84375, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.37784695625305176, "rewards/ngram_similarity_reward/std": 0.1993177980184555, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 319.421875, "completions/mean_terminated_length": 319.421875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.19333184157529648, "frac_reward_zero_std": 0.0, "grad_norm": 0.08644500374794006, "learning_rate": 4.973661935291807e-06, "loss": -0.025, "num_tokens": 69236182.0, "reward": 3.3783063888549805, "reward_std": 1.6269056797027588, "rewards/accuracy_reward/mean": 2.703125, "rewards/accuracy_reward/std": 3.315091609954834, "rewards/ngram_similarity_reward/mean": 0.6751815676689148, "rewards/ngram_similarity_reward/std": 0.40489012002944946, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 709.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 518.09375, "completions/mean_terminated_length": 518.09375, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.19377936898635043, "frac_reward_zero_std": 0.0, "grad_norm": 0.05006977915763855, "learning_rate": 4.973407350231469e-06, "loss": 0.006, "num_tokens": 69429164.0, "reward": 4.490677833557129, "reward_std": 0.5806868076324463, "rewards/accuracy_reward/mean": 3.890625, "rewards/accuracy_reward/std": 2.6998953819274902, "rewards/ngram_similarity_reward/mean": 0.6000529527664185, "rewards/ngram_similarity_reward/std": 0.3240531384944916, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 512.03125, "completions/mean_terminated_length": 512.03125, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.19422689639740434, "frac_reward_zero_std": 0.0, "grad_norm": 0.04619980603456497, "learning_rate": 4.97315154797999e-06, "loss": 0.0044, "num_tokens": 69583438.0, "reward": 4.255029678344727, "reward_std": 0.7939695119857788, "rewards/accuracy_reward/mean": 3.78125, "rewards/accuracy_reward/std": 2.7744226455688477, "rewards/ngram_similarity_reward/mean": 0.4737798273563385, "rewards/ngram_similarity_reward/std": 0.2946584224700928, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 482.640625, "completions/mean_terminated_length": 482.640625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.19467442380845826, "frac_reward_zero_std": 0.0, "grad_norm": 0.05399477854371071, "learning_rate": 4.972894528677406e-06, "loss": 0.0359, "num_tokens": 69739415.0, "reward": 4.601756572723389, "reward_std": 2.62172532081604, "rewards/accuracy_reward/mean": 3.953125, "rewards/accuracy_reward/std": 2.7076005935668945, "rewards/ngram_similarity_reward/mean": 0.6486316919326782, "rewards/ngram_similarity_reward/std": 0.34401828050613403, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 365.109375, "completions/mean_terminated_length": 365.109375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.1951219512195122, "frac_reward_zero_std": 0.0, "grad_norm": 0.07542918622493744, "learning_rate": 4.972636292464423e-06, "loss": 0.0384, "num_tokens": 69959870.0, "reward": 2.8980531692504883, "reward_std": 1.627511978149414, "rewards/accuracy_reward/mean": 2.296875, "rewards/accuracy_reward/std": 3.143043279647827, "rewards/ngram_similarity_reward/mean": 0.6011780500411987, "rewards/ngram_similarity_reward/std": 0.3964519798755646, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 479.40625, "completions/mean_terminated_length": 479.40625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.19556947863056612, "frac_reward_zero_std": 0.0, "grad_norm": 0.052132755517959595, "learning_rate": 4.9723768394824085e-06, "loss": 0.0189, "num_tokens": 70115896.0, "reward": 4.600559234619141, "reward_std": 1.1112843751907349, "rewards/accuracy_reward/mean": 4.171875, "rewards/accuracy_reward/std": 2.5326733589172363, "rewards/ngram_similarity_reward/mean": 0.42868444323539734, "rewards/ngram_similarity_reward/std": 0.40038877725601196, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 828.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 509.609375, "completions/mean_terminated_length": 509.609375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.19601700604162006, "frac_reward_zero_std": 0.0, "grad_norm": 0.05371229350566864, "learning_rate": 4.9721161698734e-06, "loss": -0.0293, "num_tokens": 70343679.0, "reward": 2.7435035705566406, "reward_std": 0.9512712359428406, "rewards/accuracy_reward/mean": 2.171875, "rewards/accuracy_reward/std": 3.060525417327881, "rewards/ngram_similarity_reward/mean": 0.5716284513473511, "rewards/ngram_similarity_reward/std": 0.4092555046081543, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 508.78125, "completions/mean_terminated_length": 508.78125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.19646453345267398, "frac_reward_zero_std": 0.0, "grad_norm": 0.053464487195014954, "learning_rate": 4.971854283780099e-06, "loss": -0.0217, "num_tokens": 70481521.0, "reward": 4.153450012207031, "reward_std": 1.3068652153015137, "rewards/accuracy_reward/mean": 3.4375, "rewards/accuracy_reward/std": 2.872281312942505, "rewards/ngram_similarity_reward/mean": 0.7159500122070312, "rewards/ngram_similarity_reward/std": 0.3677367568016052, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 403.71875, "completions/mean_terminated_length": 403.71875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.1969120608637279, "frac_reward_zero_std": 0.0, "grad_norm": 0.05618702620267868, "learning_rate": 4.971591181345874e-06, "loss": -0.0447, "num_tokens": 70617183.0, "reward": 2.9440038204193115, "reward_std": 1.3587820529937744, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.4440038800239563, "rewards/ngram_similarity_reward/std": 0.38334041833877563, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 487.578125, "completions/mean_terminated_length": 487.578125, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.19735958827478184, "frac_reward_zero_std": 0.0, "grad_norm": 0.049524884670972824, "learning_rate": 4.971326862714757e-06, "loss": 0.0157, "num_tokens": 70767364.0, "reward": 4.090793609619141, "reward_std": 1.2506130933761597, "rewards/accuracy_reward/mean": 3.5, "rewards/accuracy_reward/std": 2.8894994258880615, "rewards/ngram_similarity_reward/mean": 0.5907935500144958, "rewards/ngram_similarity_reward/std": 0.29597073793411255, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 392.5, "completions/mean_terminated_length": 392.5, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.19780711568583575, "frac_reward_zero_std": 0.0, "grad_norm": 0.0690118670463562, "learning_rate": 4.97106132803145e-06, "loss": -0.0388, "num_tokens": 70945348.0, "reward": 2.892568588256836, "reward_std": 2.564103126525879, "rewards/accuracy_reward/mean": 2.53125, "rewards/accuracy_reward/std": 3.0961766242980957, "rewards/ngram_similarity_reward/mean": 0.36131858825683594, "rewards/ngram_similarity_reward/std": 0.3403457701206207, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 441.4375, "completions/mean_terminated_length": 441.4375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.19825464309688967, "frac_reward_zero_std": 0.0, "grad_norm": 0.05034182220697403, "learning_rate": 4.9707945774413194e-06, "loss": -0.019, "num_tokens": 71070976.0, "reward": 3.9274301528930664, "reward_std": 0.9560231566429138, "rewards/accuracy_reward/mean": 3.25, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.6774301528930664, "rewards/ngram_similarity_reward/std": 0.3557387590408325, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 499.6875, "completions/mean_terminated_length": 499.6875, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.19870217050794362, "frac_reward_zero_std": 0.0, "grad_norm": 0.04697749763727188, "learning_rate": 4.970526611090391e-06, "loss": 0.004, "num_tokens": 71228108.0, "reward": 3.823631763458252, "reward_std": 0.9873033165931702, "rewards/accuracy_reward/mean": 3.390625, "rewards/accuracy_reward/std": 2.944552183151245, "rewards/ngram_similarity_reward/mean": 0.4330069422721863, "rewards/ngram_similarity_reward/std": 0.2955899238586426, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 389.9375, "completions/mean_terminated_length": 389.9375, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.19914969791899753, "frac_reward_zero_std": 0.0, "grad_norm": 0.06391636282205582, "learning_rate": 4.970257429125368e-06, "loss": -0.0304, "num_tokens": 71359784.0, "reward": 1.6688861846923828, "reward_std": 0.9762160181999207, "rewards/accuracy_reward/mean": 1.359375, "rewards/accuracy_reward/std": 3.1515538692474365, "rewards/ngram_similarity_reward/mean": 0.3095111846923828, "rewards/ngram_similarity_reward/std": 0.3359506130218506, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 442.3125, "completions/mean_terminated_length": 442.3125, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.19959722533005148, "frac_reward_zero_std": 0.0, "grad_norm": 0.0558396577835083, "learning_rate": 4.969987031693606e-06, "loss": 0.0272, "num_tokens": 71522284.0, "reward": 4.019309997558594, "reward_std": 0.8522672653198242, "rewards/accuracy_reward/mean": 3.4375, "rewards/accuracy_reward/std": 2.872281312942505, "rewards/ngram_similarity_reward/mean": 0.5818102359771729, "rewards/ngram_similarity_reward/std": 0.2885514795780182, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 549.1875, "completions/mean_terminated_length": 549.1875, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.2000447527411054, "frac_reward_zero_std": 0.0, "grad_norm": 0.04167667776346207, "learning_rate": 4.969715418943137e-06, "loss": -0.0106, "num_tokens": 71676392.0, "reward": 4.276381492614746, "reward_std": 0.9113213419914246, "rewards/accuracy_reward/mean": 3.4375, "rewards/accuracy_reward/std": 2.872281312942505, "rewards/ngram_similarity_reward/mean": 0.8388814330101013, "rewards/ngram_similarity_reward/std": 0.19755546748638153, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 443.671875, "completions/mean_terminated_length": 443.671875, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.2004922801521593, "frac_reward_zero_std": 0.0, "grad_norm": 0.06569624692201614, "learning_rate": 4.969442591022653e-06, "loss": -0.0035, "num_tokens": 71835811.0, "reward": 2.307880163192749, "reward_std": 1.4398939609527588, "rewards/accuracy_reward/mean": 1.78125, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.5266302824020386, "rewards/ngram_similarity_reward/std": 0.37980908155441284, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 959.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 511.53125, "completions/mean_terminated_length": 511.53125, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.20093980756321325, "frac_reward_zero_std": 0.0, "grad_norm": 0.05349518358707428, "learning_rate": 4.969168548081511e-06, "loss": -0.0157, "num_tokens": 72012293.0, "reward": 4.331980228424072, "reward_std": 0.8488610982894897, "rewards/accuracy_reward/mean": 3.6875, "rewards/accuracy_reward/std": 2.816476583480835, "rewards/ngram_similarity_reward/mean": 0.6444799304008484, "rewards/ngram_similarity_reward/std": 0.289334774017334, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 812.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 461.234375, "completions/mean_terminated_length": 461.234375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.20138733497426717, "frac_reward_zero_std": 0.0, "grad_norm": 0.0612126961350441, "learning_rate": 4.968893290269734e-06, "loss": 0.029, "num_tokens": 72167716.0, "reward": 3.29874587059021, "reward_std": 1.4539015293121338, "rewards/accuracy_reward/mean": 2.9375, "rewards/accuracy_reward/std": 3.028305768966675, "rewards/ngram_similarity_reward/mean": 0.36124569177627563, "rewards/ngram_similarity_reward/std": 0.24575692415237427, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 406.046875, "completions/mean_terminated_length": 406.046875, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.2018348623853211, "frac_reward_zero_std": 0.0, "grad_norm": 0.062243226915597916, "learning_rate": 4.968616817738013e-06, "loss": -0.0154, "num_tokens": 72343751.0, "reward": 3.4030308723449707, "reward_std": 1.9362056255340576, "rewards/accuracy_reward/mean": 2.859375, "rewards/accuracy_reward/std": 3.0203921794891357, "rewards/ngram_similarity_reward/mean": 0.5436556339263916, "rewards/ngram_similarity_reward/std": 0.3432004451751709, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 403.109375, "completions/mean_terminated_length": 403.109375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.20228238979637503, "frac_reward_zero_std": 0.0, "grad_norm": 0.05695686489343643, "learning_rate": 4.968339130637696e-06, "loss": 0.0208, "num_tokens": 72479902.0, "reward": 3.9481887817382812, "reward_std": 0.9490264654159546, "rewards/accuracy_reward/mean": 3.4375, "rewards/accuracy_reward/std": 2.872281312942505, "rewards/ngram_similarity_reward/mean": 0.5106890201568604, "rewards/ngram_similarity_reward/std": 0.2993254065513611, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 416.84375, "completions/mean_terminated_length": 416.84375, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.20272991720742894, "frac_reward_zero_std": 0.0, "grad_norm": 0.06031221151351929, "learning_rate": 4.968060229120806e-06, "loss": -0.0202, "num_tokens": 72703972.0, "reward": 4.6372809410095215, "reward_std": 1.6896450519561768, "rewards/accuracy_reward/mean": 4.1875, "rewards/accuracy_reward/std": 2.5, "rewards/ngram_similarity_reward/mean": 0.44978126883506775, "rewards/ngram_similarity_reward/std": 0.31349703669548035, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 401.71875, "completions/mean_terminated_length": 401.71875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.2031774446184829, "frac_reward_zero_std": 0.0, "grad_norm": 0.06598486751317978, "learning_rate": 4.967780113340025e-06, "loss": 0.0328, "num_tokens": 72925394.0, "reward": 2.2886803150177, "reward_std": 1.0122326612472534, "rewards/accuracy_reward/mean": 1.859375, "rewards/accuracy_reward/std": 3.046555280685425, "rewards/ngram_similarity_reward/mean": 0.4293053448200226, "rewards/ngram_similarity_reward/std": 0.3749549984931946, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 836.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 529.28125, "completions/mean_terminated_length": 529.28125, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.2036249720295368, "frac_reward_zero_std": 0.0, "grad_norm": 0.047418881207704544, "learning_rate": 4.9674987834486986e-06, "loss": 0.0254, "num_tokens": 73091668.0, "reward": 4.394710063934326, "reward_std": 0.22299596667289734, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.3947100043296814, "rewards/ngram_similarity_reward/std": 0.3227953016757965, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 487.546875, "completions/mean_terminated_length": 487.546875, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.20407249944059075, "frac_reward_zero_std": 0.0, "grad_norm": 0.05645826458930969, "learning_rate": 4.967216239600842e-06, "loss": -0.0032, "num_tokens": 73235847.0, "reward": 2.1269445419311523, "reward_std": 2.6225547790527344, "rewards/accuracy_reward/mean": 1.71875, "rewards/accuracy_reward/std": 2.9572014808654785, "rewards/ngram_similarity_reward/mean": 0.40819472074508667, "rewards/ngram_similarity_reward/std": 0.2562521994113922, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 499.65625, "completions/mean_terminated_length": 499.65625, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.20452002685164467, "frac_reward_zero_std": 0.0, "grad_norm": 0.04859715700149536, "learning_rate": 4.966932481951129e-06, "loss": 0.0128, "num_tokens": 73428353.0, "reward": 2.2688302993774414, "reward_std": 0.929410457611084, "rewards/accuracy_reward/mean": 1.75, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.5188302993774414, "rewards/ngram_similarity_reward/std": 0.27700766921043396, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 362.0, "completions/mean_terminated_length": 362.0, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.20496755426269858, "frac_reward_zero_std": 0.0, "grad_norm": 0.05721988156437874, "learning_rate": 4.966647510654904e-06, "loss": 0.0061, "num_tokens": 73564145.0, "reward": 5.685698509216309, "reward_std": 1.1184368133544922, "rewards/accuracy_reward/mean": 5.125, "rewards/accuracy_reward/std": 1.4638501405715942, "rewards/ngram_similarity_reward/mean": 0.5606982707977295, "rewards/ngram_similarity_reward/std": 0.4015916883945465, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 557.5625, "completions/mean_terminated_length": 557.5625, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.20541508167375253, "frac_reward_zero_std": 0.0, "grad_norm": 0.05014842748641968, "learning_rate": 4.966361325868171e-06, "loss": -0.0044, "num_tokens": 73720213.0, "reward": 3.8704371452331543, "reward_std": 1.5883183479309082, "rewards/accuracy_reward/mean": 3.40625, "rewards/accuracy_reward/std": 2.920745372772217, "rewards/ngram_similarity_reward/mean": 0.4641871154308319, "rewards/ngram_similarity_reward/std": 0.29948803782463074, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 427.953125, "completions/mean_terminated_length": 427.953125, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.20586260908480644, "frac_reward_zero_std": 0.0, "grad_norm": 0.059216953814029694, "learning_rate": 4.9660739277476e-06, "loss": -0.0108, "num_tokens": 73874130.0, "reward": 1.846233606338501, "reward_std": 2.161407947540283, "rewards/accuracy_reward/mean": 1.296875, "rewards/accuracy_reward/std": 2.8406331539154053, "rewards/ngram_similarity_reward/mean": 0.5493584275245667, "rewards/ngram_similarity_reward/std": 0.36027437448501587, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 853.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 456.734375, "completions/mean_terminated_length": 456.734375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.20631013649586036, "frac_reward_zero_std": 0.0, "grad_norm": 0.06060659512877464, "learning_rate": 4.965785316450528e-06, "loss": 0.0245, "num_tokens": 74051041.0, "reward": 2.9211111068725586, "reward_std": 0.16564224660396576, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.42111140489578247, "rewards/ngram_similarity_reward/std": 0.2438206523656845, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 487.203125, "completions/mean_terminated_length": 487.203125, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.2067576639069143, "frac_reward_zero_std": 0.0, "grad_norm": 0.05391167104244232, "learning_rate": 4.9654954921349504e-06, "loss": -0.0394, "num_tokens": 74239342.0, "reward": 3.002830982208252, "reward_std": 1.39364492893219, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.502831220626831, "rewards/ngram_similarity_reward/std": 0.4091090261936188, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 490.59375, "completions/mean_terminated_length": 490.59375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.20720519131796822, "frac_reward_zero_std": 0.0, "grad_norm": 0.05286828801035881, "learning_rate": 4.965204454959531e-06, "loss": 0.0267, "num_tokens": 74439316.0, "reward": 2.826718807220459, "reward_std": 1.2495957612991333, "rewards/accuracy_reward/mean": 2.3125, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.5142186284065247, "rewards/ngram_similarity_reward/std": 0.43164652585983276, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 429.765625, "completions/mean_terminated_length": 429.765625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.20765271872902216, "frac_reward_zero_std": 0.0, "grad_norm": 0.06051198020577431, "learning_rate": 4.964912205083597e-06, "loss": 0.0362, "num_tokens": 74562565.0, "reward": 4.667201042175293, "reward_std": 2.926253318786621, "rewards/accuracy_reward/mean": 4.015625, "rewards/accuracy_reward/std": 2.713822364807129, "rewards/ngram_similarity_reward/mean": 0.651576042175293, "rewards/ngram_similarity_reward/std": 0.3751135468482971, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 315.484375, "completions/mean_terminated_length": 315.484375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.20810024614007608, "frac_reward_zero_std": 0.25, "grad_norm": 0.07512948662042618, "learning_rate": 4.964618742667139e-06, "loss": -0.0049, "num_tokens": 74709732.0, "reward": 3.7250890731811523, "reward_std": 0.9738008975982666, "rewards/accuracy_reward/mean": 3.21875, "rewards/accuracy_reward/std": 2.9732606410980225, "rewards/ngram_similarity_reward/mean": 0.5063392519950867, "rewards/ngram_similarity_reward/std": 0.46273863315582275, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 527.6875, "completions/mean_terminated_length": 527.6875, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.20854777355113, "frac_reward_zero_std": 0.0, "grad_norm": 0.04736018553376198, "learning_rate": 4.9643240678708085e-06, "loss": -0.0261, "num_tokens": 74866064.0, "reward": 4.556824207305908, "reward_std": 0.5301584005355835, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.6505742073059082, "rewards/ngram_similarity_reward/std": 0.28748565912246704, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 550.75, "completions/mean_terminated_length": 550.75, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.20899530096218394, "frac_reward_zero_std": 0.0, "grad_norm": 0.04929441213607788, "learning_rate": 4.964028180855927e-06, "loss": 0.0112, "num_tokens": 74993584.0, "reward": 1.8805651664733887, "reward_std": 1.1419997215270996, "rewards/accuracy_reward/mean": 1.34375, "rewards/accuracy_reward/std": 2.8296544551849365, "rewards/ngram_similarity_reward/mean": 0.5368151068687439, "rewards/ngram_similarity_reward/std": 0.3858891427516937, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 442.03125, "completions/mean_terminated_length": 442.03125, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.20944282837323785, "frac_reward_zero_std": 0.0, "grad_norm": 0.058800678700208664, "learning_rate": 4.9637310817844745e-06, "loss": 0.0347, "num_tokens": 75155474.0, "reward": 3.7268218994140625, "reward_std": 0.9965977072715759, "rewards/accuracy_reward/mean": 3.15625, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.5705718398094177, "rewards/ngram_similarity_reward/std": 0.36925530433654785, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 479.84375, "completions/mean_terminated_length": 479.84375, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.2098903557842918, "frac_reward_zero_std": 0.0, "grad_norm": 0.05136004462838173, "learning_rate": 4.963432770819096e-06, "loss": 0.0027, "num_tokens": 75310568.0, "reward": 4.841269493103027, "reward_std": 0.8033795356750488, "rewards/accuracy_reward/mean": 4.28125, "rewards/accuracy_reward/std": 2.4330317974090576, "rewards/ngram_similarity_reward/mean": 0.5600191354751587, "rewards/ngram_similarity_reward/std": 0.28958660364151, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 430.953125, "completions/mean_terminated_length": 430.953125, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.21033788319534572, "frac_reward_zero_std": 0.0, "grad_norm": 0.05915706232190132, "learning_rate": 4.9631332481231004e-06, "loss": -0.0117, "num_tokens": 75459349.0, "reward": 2.92802357673645, "reward_std": 0.11898425221443176, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.42802348732948303, "rewards/ngram_similarity_reward/std": 0.261121928691864, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 447.828125, "completions/mean_terminated_length": 447.828125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.21078541060639963, "frac_reward_zero_std": 0.0, "grad_norm": 0.06052910536527634, "learning_rate": 4.962832513860459e-06, "loss": 0.0336, "num_tokens": 75594586.0, "reward": 4.519056797027588, "reward_std": 1.371473789215088, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5190566778182983, "rewards/ngram_similarity_reward/std": 0.3726147711277008, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 455.3125, "completions/mean_terminated_length": 455.3125, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.21123293801745358, "frac_reward_zero_std": 0.0, "grad_norm": 0.05712325870990753, "learning_rate": 4.962530568195808e-06, "loss": -0.0486, "num_tokens": 75734366.0, "reward": 1.448999047279358, "reward_std": 0.5560168027877808, "rewards/accuracy_reward/mean": 0.828125, "rewards/accuracy_reward/std": 2.8622043132781982, "rewards/ngram_similarity_reward/mean": 0.6208740472793579, "rewards/ngram_similarity_reward/std": 0.4527631103992462, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 496.703125, "completions/mean_terminated_length": 496.703125, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.2116804654285075, "frac_reward_zero_std": 0.0, "grad_norm": 0.04869071766734123, "learning_rate": 4.962227411294446e-06, "loss": -0.0011, "num_tokens": 75903963.0, "reward": 1.5254019498825073, "reward_std": 0.2591969966888428, "rewards/accuracy_reward/mean": 0.984375, "rewards/accuracy_reward/std": 2.6306629180908203, "rewards/ngram_similarity_reward/mean": 0.5410269498825073, "rewards/ngram_similarity_reward/std": 0.337963342666626, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 463.625, "completions/mean_terminated_length": 463.625, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.21212799283956144, "frac_reward_zero_std": 0.0, "grad_norm": 0.056896839290857315, "learning_rate": 4.961923043322333e-06, "loss": 0.0054, "num_tokens": 76043411.0, "reward": 3.734736680984497, "reward_std": 1.8387919664382935, "rewards/accuracy_reward/mean": 3.140625, "rewards/accuracy_reward/std": 2.9727182388305664, "rewards/ngram_similarity_reward/mean": 0.5941117405891418, "rewards/ngram_similarity_reward/std": 0.4166860282421112, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 438.796875, "completions/mean_terminated_length": 438.796875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.21257552025061535, "frac_reward_zero_std": 0.0, "grad_norm": 0.05332396924495697, "learning_rate": 4.961617464446094e-06, "loss": 0.0008, "num_tokens": 76195686.0, "reward": 4.474213123321533, "reward_std": 2.7889294624328613, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.7554631233215332, "rewards/ngram_similarity_reward/std": 0.3857472538948059, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 474.046875, "completions/mean_terminated_length": 474.046875, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.21302304766166927, "frac_reward_zero_std": 0.0, "grad_norm": 0.0532119981944561, "learning_rate": 4.961310674833016e-06, "loss": -0.0179, "num_tokens": 76357321.0, "reward": 4.594073295593262, "reward_std": 1.177838921546936, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5940733551979065, "rewards/ngram_similarity_reward/std": 0.22110331058502197, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 519.40625, "completions/mean_terminated_length": 519.40625, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.2134705750727232, "frac_reward_zero_std": 0.0, "grad_norm": 0.047116197645664215, "learning_rate": 4.961002674651051e-06, "loss": -0.011, "num_tokens": 76511683.0, "reward": 5.643961429595947, "reward_std": 1.319187879562378, "rewards/accuracy_reward/mean": 4.828125, "rewards/accuracy_reward/std": 1.9359153509140015, "rewards/ngram_similarity_reward/mean": 0.8158363699913025, "rewards/ngram_similarity_reward/std": 0.24353596568107605, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 417.078125, "completions/mean_terminated_length": 417.078125, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.21391810248377713, "frac_reward_zero_std": 0.0, "grad_norm": 0.05638702213764191, "learning_rate": 4.960693464068809e-06, "loss": -0.0166, "num_tokens": 76684968.0, "reward": 4.017838954925537, "reward_std": 1.2605984210968018, "rewards/accuracy_reward/mean": 3.4375, "rewards/accuracy_reward/std": 2.872281312942505, "rewards/ngram_similarity_reward/mean": 0.5803389549255371, "rewards/ngram_similarity_reward/std": 0.3501337468624115, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 395.125, "completions/mean_terminated_length": 395.125, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.21436562989483104, "frac_reward_zero_std": 0.0, "grad_norm": 0.07186482101678848, "learning_rate": 4.960383043255568e-06, "loss": 0.0101, "num_tokens": 76812608.0, "reward": 2.7974987030029297, "reward_std": 1.5725497007369995, "rewards/accuracy_reward/mean": 2.125, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.6724984645843506, "rewards/ngram_similarity_reward/std": 0.44111230969429016, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 525.671875, "completions/mean_terminated_length": 525.671875, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.214813157305885, "frac_reward_zero_std": 0.0, "grad_norm": 0.04860546812415123, "learning_rate": 4.960071412381265e-06, "loss": 0.0029, "num_tokens": 76971259.0, "reward": 1.3983885049819946, "reward_std": 1.2901124954223633, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.3983883261680603, "rewards/ngram_similarity_reward/std": 0.23067422211170197, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 502.609375, "completions/mean_terminated_length": 502.609375, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.2152606847169389, "frac_reward_zero_std": 0.0, "grad_norm": 0.05025094002485275, "learning_rate": 4.9597585716165e-06, "loss": -0.0012, "num_tokens": 77119058.0, "reward": 3.136701822280884, "reward_std": 2.031083106994629, "rewards/accuracy_reward/mean": 2.765625, "rewards/accuracy_reward/std": 3.0302298069000244, "rewards/ngram_similarity_reward/mean": 0.37107694149017334, "rewards/ngram_similarity_reward/std": 0.2309703826904297, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 393.34375, "completions/mean_terminated_length": 393.34375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.21570821212799285, "frac_reward_zero_std": 0.0, "grad_norm": 0.06302059441804886, "learning_rate": 4.959444521132537e-06, "loss": 0.0032, "num_tokens": 77271688.0, "reward": 4.503539085388184, "reward_std": 0.938154935836792, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5035389065742493, "rewards/ngram_similarity_reward/std": 0.4234048128128052, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 885.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 487.625, "completions/mean_terminated_length": 487.625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.21615573953904677, "frac_reward_zero_std": 0.0, "grad_norm": 0.051766663789749146, "learning_rate": 4.959129261101301e-06, "loss": 0.0325, "num_tokens": 77417744.0, "reward": 5.006524085998535, "reward_std": 0.9188006520271301, "rewards/accuracy_reward/mean": 4.46875, "rewards/accuracy_reward/std": 2.2815253734588623, "rewards/ngram_similarity_reward/mean": 0.5377740859985352, "rewards/ngram_similarity_reward/std": 0.2830451428890228, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 564.5625, "completions/mean_terminated_length": 564.5625, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.21660326695010068, "frac_reward_zero_std": 0.0, "grad_norm": 0.04267222806811333, "learning_rate": 4.958812791695377e-06, "loss": -0.0045, "num_tokens": 77571620.0, "reward": 5.613181114196777, "reward_std": 1.2169044017791748, "rewards/accuracy_reward/mean": 4.75, "rewards/accuracy_reward/std": 2.0, "rewards/ngram_similarity_reward/mean": 0.863180935382843, "rewards/ngram_similarity_reward/std": 0.2767801284790039, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 388.328125, "completions/mean_terminated_length": 388.328125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.21705079436115463, "frac_reward_zero_std": 0.0, "grad_norm": 0.06155702844262123, "learning_rate": 4.958495113088016e-06, "loss": -0.0267, "num_tokens": 77788457.0, "reward": 1.7356253862380981, "reward_std": 2.3263394832611084, "rewards/accuracy_reward/mean": 1.28125, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.45437535643577576, "rewards/ngram_similarity_reward/std": 0.22731998562812805, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 417.609375, "completions/mean_terminated_length": 417.609375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.21749832177220854, "frac_reward_zero_std": 0.0, "grad_norm": 0.05934767797589302, "learning_rate": 4.95817622545313e-06, "loss": 0.0046, "num_tokens": 77942288.0, "reward": 4.513749122619629, "reward_std": 1.5365564823150635, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5137491822242737, "rewards/ngram_similarity_reward/std": 0.2162044197320938, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1549.0, "completions/max_terminated_length": 1549.0, "completions/mean_length": 427.296875, "completions/mean_terminated_length": 427.296875, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.21794584918326249, "frac_reward_zero_std": 0.0, "grad_norm": 0.06957036256790161, "learning_rate": 4.957856128965292e-06, "loss": 0.0439, "num_tokens": 78101459.0, "reward": 3.2877097129821777, "reward_std": 0.931866466999054, "rewards/accuracy_reward/mean": 2.703125, "rewards/accuracy_reward/std": 3.1074907779693604, "rewards/ngram_similarity_reward/mean": 0.5845849514007568, "rewards/ngram_similarity_reward/std": 0.360519677400589, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 769.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 507.796875, "completions/mean_terminated_length": 507.796875, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.2183933765943164, "frac_reward_zero_std": 0.0, "grad_norm": 0.06689263880252838, "learning_rate": 4.957534823799735e-06, "loss": 0.0176, "num_tokens": 78233910.0, "reward": 4.001543998718262, "reward_std": 0.8976078629493713, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.3765438199043274, "rewards/ngram_similarity_reward/std": 0.32272958755493164, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 434.65625, "completions/mean_terminated_length": 434.65625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.21884090400537032, "frac_reward_zero_std": 0.0, "grad_norm": 0.06068659573793411, "learning_rate": 4.957212310132357e-06, "loss": 0.0258, "num_tokens": 78419840.0, "reward": 3.849094867706299, "reward_std": 1.5910649299621582, "rewards/accuracy_reward/mean": 3.4375, "rewards/accuracy_reward/std": 2.872281312942505, "rewards/ngram_similarity_reward/mean": 0.4115948975086212, "rewards/ngram_similarity_reward/std": 0.29876673221588135, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 453.84375, "completions/mean_terminated_length": 453.84375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.21928843141642426, "frac_reward_zero_std": 0.0, "grad_norm": 0.0636509358882904, "learning_rate": 4.956888588139716e-06, "loss": 0.031, "num_tokens": 78581206.0, "reward": 5.014695167541504, "reward_std": 1.7021368741989136, "rewards/accuracy_reward/mean": 4.453125, "rewards/accuracy_reward/std": 2.319206953048706, "rewards/ngram_similarity_reward/mean": 0.56156986951828, "rewards/ngram_similarity_reward/std": 0.30171915888786316, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 905.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 574.078125, "completions/mean_terminated_length": 574.078125, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.21973595882747818, "frac_reward_zero_std": 0.0, "grad_norm": 0.05884205177426338, "learning_rate": 4.956563657999032e-06, "loss": -0.0081, "num_tokens": 78731227.0, "reward": 1.8951448202133179, "reward_std": 1.9604424238204956, "rewards/accuracy_reward/mean": 1.359375, "rewards/accuracy_reward/std": 2.816432476043701, "rewards/ngram_similarity_reward/mean": 0.5357697606086731, "rewards/ngram_similarity_reward/std": 0.2690957188606262, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 466.390625, "completions/mean_terminated_length": 466.390625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.22018348623853212, "frac_reward_zero_std": 0.0, "grad_norm": 0.05558258667588234, "learning_rate": 4.956237519888186e-06, "loss": -0.0245, "num_tokens": 78910468.0, "reward": 2.3418540954589844, "reward_std": 1.3856549263000488, "rewards/accuracy_reward/mean": 1.65625, "rewards/accuracy_reward/std": 2.9016621112823486, "rewards/ngram_similarity_reward/mean": 0.6856042146682739, "rewards/ngram_similarity_reward/std": 0.31204110383987427, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 434.578125, "completions/mean_terminated_length": 434.578125, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.22063101364958604, "frac_reward_zero_std": 0.0, "grad_norm": 0.05283565819263458, "learning_rate": 4.95591017398572e-06, "loss": 0.0229, "num_tokens": 79060265.0, "reward": 3.7463274002075195, "reward_std": 1.7352664470672607, "rewards/accuracy_reward/mean": 3.15625, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.5900774598121643, "rewards/ngram_similarity_reward/std": 0.32223933935165405, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 542.015625, "completions/mean_terminated_length": 542.015625, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.22107854106063995, "frac_reward_zero_std": 0.0, "grad_norm": 0.04560478776693344, "learning_rate": 4.955581620470838e-06, "loss": -0.0261, "num_tokens": 79234922.0, "reward": 2.386859893798828, "reward_std": 0.9993495941162109, "rewards/accuracy_reward/mean": 1.921875, "rewards/accuracy_reward/std": 2.9857051372528076, "rewards/ngram_similarity_reward/mean": 0.46498507261276245, "rewards/ngram_similarity_reward/std": 0.38773688673973083, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 417.0625, "completions/mean_terminated_length": 417.0625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.2215260684716939, "frac_reward_zero_std": 0.0, "grad_norm": 0.06720297783613205, "learning_rate": 4.955251859523404e-06, "loss": 0.0337, "num_tokens": 79384830.0, "reward": 5.404541015625, "reward_std": 1.311445713043213, "rewards/accuracy_reward/mean": 4.890625, "rewards/accuracy_reward/std": 1.915825366973877, "rewards/ngram_similarity_reward/mean": 0.5139156579971313, "rewards/ngram_similarity_reward/std": 0.22248059511184692, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 423.796875, "completions/mean_terminated_length": 423.796875, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.22197359588274782, "frac_reward_zero_std": 0.0, "grad_norm": 0.056853897869586945, "learning_rate": 4.954920891323944e-06, "loss": 0.028, "num_tokens": 79552049.0, "reward": 3.310576915740967, "reward_std": 1.6470513343811035, "rewards/accuracy_reward/mean": 2.78125, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.5293266773223877, "rewards/ngram_similarity_reward/std": 0.30274975299835205, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 406.03125, "completions/mean_terminated_length": 406.03125, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.22242112329380176, "frac_reward_zero_std": 0.0, "grad_norm": 0.05910646170377731, "learning_rate": 4.954588716053645e-06, "loss": 0.0401, "num_tokens": 79714627.0, "reward": 0.7363100051879883, "reward_std": 1.615598440170288, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 2.2457523345947266, "rewards/ngram_similarity_reward/mean": 0.3769349455833435, "rewards/ngram_similarity_reward/std": 0.23151440918445587, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 462.703125, "completions/mean_terminated_length": 462.703125, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.22286865070485568, "frac_reward_zero_std": 0.0, "grad_norm": 0.05820932984352112, "learning_rate": 4.954255333894354e-06, "loss": -0.0194, "num_tokens": 79846448.0, "reward": 0.9518995881080627, "reward_std": 1.9144651889801025, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 2.2815253734588623, "rewards/ngram_similarity_reward/mean": 0.42064958810806274, "rewards/ngram_similarity_reward/std": 0.2036563754081726, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 381.515625, "completions/mean_terminated_length": 381.515625, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.2233161781159096, "frac_reward_zero_std": 0.0, "grad_norm": 0.06320500373840332, "learning_rate": 4.953920745028579e-06, "loss": -0.0074, "num_tokens": 80019761.0, "reward": 3.8864622116088867, "reward_std": 0.8679347038269043, "rewards/accuracy_reward/mean": 3.25, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.6364620327949524, "rewards/ngram_similarity_reward/std": 0.3301069736480713, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 415.25, "completions/mean_terminated_length": 415.25, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.22376370552696354, "frac_reward_zero_std": 0.0, "grad_norm": 0.06903304904699326, "learning_rate": 4.9535849496394885e-06, "loss": 0.0473, "num_tokens": 80138369.0, "reward": 4.351874828338623, "reward_std": 1.3796770572662354, "rewards/accuracy_reward/mean": 3.796875, "rewards/accuracy_reward/std": 2.746886730194092, "rewards/ngram_similarity_reward/mean": 0.5550000667572021, "rewards/ngram_similarity_reward/std": 0.3303433656692505, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1114.0, "completions/max_terminated_length": 1114.0, "completions/mean_length": 513.875, "completions/mean_terminated_length": 513.875, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.22421123293801745, "frac_reward_zero_std": 0.0, "grad_norm": 0.05276532471179962, "learning_rate": 4.953247947910913e-06, "loss": -0.0046, "num_tokens": 80348649.0, "reward": 2.3902032375335693, "reward_std": 2.070019006729126, "rewards/accuracy_reward/mean": 2.015625, "rewards/accuracy_reward/std": 3.00260329246521, "rewards/ngram_similarity_reward/mean": 0.3745781481266022, "rewards/ngram_similarity_reward/std": 0.29120922088623047, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 790.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 505.25, "completions/mean_terminated_length": 505.25, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.22465876034907137, "frac_reward_zero_std": 0.0, "grad_norm": 0.05461040511727333, "learning_rate": 4.9529097400273395e-06, "loss": -0.0483, "num_tokens": 80511481.0, "reward": 3.9073472023010254, "reward_std": 1.3411670923233032, "rewards/accuracy_reward/mean": 3.328125, "rewards/accuracy_reward/std": 2.9252848625183105, "rewards/ngram_similarity_reward/mean": 0.5792225003242493, "rewards/ngram_similarity_reward/std": 0.3714323043823242, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 455.578125, "completions/mean_terminated_length": 455.578125, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.2251062877601253, "frac_reward_zero_std": 0.0, "grad_norm": 0.05344484746456146, "learning_rate": 4.95257032617392e-06, "loss": -0.0038, "num_tokens": 80701806.0, "reward": 4.105986595153809, "reward_std": 1.2655099630355835, "rewards/accuracy_reward/mean": 3.4375, "rewards/accuracy_reward/std": 2.872281312942505, "rewards/ngram_similarity_reward/mean": 0.6684862375259399, "rewards/ngram_similarity_reward/std": 0.33517885208129883, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 469.953125, "completions/mean_terminated_length": 469.953125, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.22555381517117923, "frac_reward_zero_std": 0.0, "grad_norm": 0.05020499974489212, "learning_rate": 4.952229706536465e-06, "loss": 0.0122, "num_tokens": 80860155.0, "reward": 1.7860900163650513, "reward_std": 0.6525092124938965, "rewards/accuracy_reward/mean": 1.171875, "rewards/accuracy_reward/std": 2.7316761016845703, "rewards/ngram_similarity_reward/mean": 0.6142149567604065, "rewards/ngram_similarity_reward/std": 0.31398242712020874, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 437.609375, "completions/mean_terminated_length": 437.609375, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.22600134258223317, "frac_reward_zero_std": 0.0, "grad_norm": 0.056204263120889664, "learning_rate": 4.951887881301443e-06, "loss": 0.0174, "num_tokens": 80996610.0, "reward": 5.491084098815918, "reward_std": 0.8083322048187256, "rewards/accuracy_reward/mean": 4.9375, "rewards/accuracy_reward/std": 1.7627090215682983, "rewards/ngram_similarity_reward/mean": 0.5535838603973389, "rewards/ngram_similarity_reward/std": 0.37020254135131836, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 504.28125, "completions/mean_terminated_length": 504.28125, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.2264488699932871, "frac_reward_zero_std": 0.0, "grad_norm": 0.0578279085457325, "learning_rate": 4.951544850655985e-06, "loss": 0.0115, "num_tokens": 81168100.0, "reward": 2.248349189758301, "reward_std": 1.4716249704360962, "rewards/accuracy_reward/mean": 1.828125, "rewards/accuracy_reward/std": 2.9657018184661865, "rewards/ngram_similarity_reward/mean": 0.42022407054901123, "rewards/ngram_similarity_reward/std": 0.18899768590927124, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 451.484375, "completions/mean_terminated_length": 451.484375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.226896397404341, "frac_reward_zero_std": 0.0, "grad_norm": 0.0604545883834362, "learning_rate": 4.951200614787881e-06, "loss": -0.0065, "num_tokens": 81325603.0, "reward": 3.941903591156006, "reward_std": 0.9505756497383118, "rewards/accuracy_reward/mean": 3.421875, "rewards/accuracy_reward/std": 2.896657705307007, "rewards/ngram_similarity_reward/mean": 0.5200284719467163, "rewards/ngram_similarity_reward/std": 0.2735101282596588, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 442.75, "completions/mean_terminated_length": 442.75, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.22734392481539495, "frac_reward_zero_std": 0.0, "grad_norm": 0.05478325113654137, "learning_rate": 4.950855173885582e-06, "loss": 0.0086, "num_tokens": 81451043.0, "reward": 4.029051780700684, "reward_std": 1.3405178785324097, "rewards/accuracy_reward/mean": 3.4375, "rewards/accuracy_reward/std": 2.872281312942505, "rewards/ngram_similarity_reward/mean": 0.5915517210960388, "rewards/ngram_similarity_reward/std": 0.3374991714954376, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 424.6875, "completions/mean_terminated_length": 424.6875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.22779145222644887, "frac_reward_zero_std": 0.0, "grad_norm": 0.07086840271949768, "learning_rate": 4.950508528138195e-06, "loss": 0.0058, "num_tokens": 81679215.0, "reward": 3.891526222229004, "reward_std": 1.43088698387146, "rewards/accuracy_reward/mean": 3.3125, "rewards/accuracy_reward/std": 2.948634386062622, "rewards/ngram_similarity_reward/mean": 0.5790262222290039, "rewards/ngram_similarity_reward/std": 0.4064956307411194, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 451.796875, "completions/mean_terminated_length": 451.796875, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.2282389796375028, "frac_reward_zero_std": 0.0, "grad_norm": 0.05182816833257675, "learning_rate": 4.9501606777354914e-06, "loss": 0.0048, "num_tokens": 81817906.0, "reward": 5.248233318328857, "reward_std": 1.3395018577575684, "rewards/accuracy_reward/mean": 4.46875, "rewards/accuracy_reward/std": 2.2815253734588623, "rewards/ngram_similarity_reward/mean": 0.7794830799102783, "rewards/ngram_similarity_reward/std": 0.3089151382446289, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 806.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 455.703125, "completions/mean_terminated_length": 455.703125, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.22868650704855673, "frac_reward_zero_std": 0.0, "grad_norm": 0.06050952896475792, "learning_rate": 4.949811622867899e-06, "loss": 0.0038, "num_tokens": 81981167.0, "reward": 3.9882125854492188, "reward_std": 2.2624154090881348, "rewards/accuracy_reward/mean": 3.515625, "rewards/accuracy_reward/std": 2.8646292686462402, "rewards/ngram_similarity_reward/mean": 0.4725874662399292, "rewards/ngram_similarity_reward/std": 0.2975623905658722, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 471.0, "completions/mean_terminated_length": 471.0, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.22913403445961064, "frac_reward_zero_std": 0.0, "grad_norm": 0.048352234065532684, "learning_rate": 4.949461363726506e-06, "loss": -0.0004, "num_tokens": 82172159.0, "reward": 4.106328964233398, "reward_std": 0.9110741019248962, "rewards/accuracy_reward/mean": 3.53125, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.5750784873962402, "rewards/ngram_similarity_reward/std": 0.3322790563106537, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 435.515625, "completions/mean_terminated_length": 435.515625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.22958156187066459, "frac_reward_zero_std": 0.0, "grad_norm": 0.05683273822069168, "learning_rate": 4.94910990050306e-06, "loss": -0.0037, "num_tokens": 82315248.0, "reward": 3.042149543762207, "reward_std": 0.1302480250597, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.5421494841575623, "rewards/ngram_similarity_reward/std": 0.35772615671157837, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 413.5625, "completions/mean_terminated_length": 413.5625, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.2300290892817185, "frac_reward_zero_std": 0.0, "grad_norm": 0.07653037458658218, "learning_rate": 4.9487572333899665e-06, "loss": 0.0415, "num_tokens": 82441892.0, "reward": 5.274172782897949, "reward_std": 0.9859016537666321, "rewards/accuracy_reward/mean": 4.5625, "rewards/accuracy_reward/std": 2.195775270462036, "rewards/ngram_similarity_reward/mean": 0.7116726040840149, "rewards/ngram_similarity_reward/std": 0.2897319495677948, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 792.0, "completions/max_terminated_length": 792.0, "completions/mean_length": 544.125, "completions/mean_terminated_length": 544.125, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.23047661669277245, "frac_reward_zero_std": 0.0, "grad_norm": 0.05028749629855156, "learning_rate": 4.948403362580291e-06, "loss": 0.0515, "num_tokens": 82599324.0, "reward": 2.9024624824523926, "reward_std": 0.1806321144104004, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.4024624228477478, "rewards/ngram_similarity_reward/std": 0.238870769739151, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 804.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 547.34375, "completions/mean_terminated_length": 547.34375, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.23092414410382636, "frac_reward_zero_std": 0.0, "grad_norm": 0.04469163715839386, "learning_rate": 4.9480482882677595e-06, "loss": 0.0052, "num_tokens": 82760306.0, "reward": 4.694647312164307, "reward_std": 0.9846379160881042, "rewards/accuracy_reward/mean": 3.984375, "rewards/accuracy_reward/std": 2.648702621459961, "rewards/ngram_similarity_reward/mean": 0.7102725505828857, "rewards/ngram_similarity_reward/std": 0.2618943154811859, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 384.46875, "completions/mean_terminated_length": 384.46875, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.23137167151488028, "frac_reward_zero_std": 0.0, "grad_norm": 0.06376522779464722, "learning_rate": 4.947692010646754e-06, "loss": -0.022, "num_tokens": 82960528.0, "reward": 2.4959452152252197, "reward_std": 1.714483380317688, "rewards/accuracy_reward/mean": 2.046875, "rewards/accuracy_reward/std": 3.080557107925415, "rewards/ngram_similarity_reward/mean": 0.44907036423683167, "rewards/ngram_similarity_reward/std": 0.3995765149593353, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 468.25, "completions/mean_terminated_length": 468.25, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.23181919892593422, "frac_reward_zero_std": 0.0, "grad_norm": 0.05506211891770363, "learning_rate": 4.9473345299123174e-06, "loss": 0.0672, "num_tokens": 83088208.0, "reward": 4.447715759277344, "reward_std": 0.7623867392539978, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.7289658188819885, "rewards/ngram_similarity_reward/std": 0.27043387293815613, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 496.59375, "completions/mean_terminated_length": 496.59375, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.23226672633698814, "frac_reward_zero_std": 0.0, "grad_norm": 0.06617296487092972, "learning_rate": 4.946975846260149e-06, "loss": -0.0103, "num_tokens": 83245510.0, "reward": 2.8303518295288086, "reward_std": 0.1702744960784912, "rewards/accuracy_reward/mean": 2.265625, "rewards/accuracy_reward/std": 3.2792866230010986, "rewards/ngram_similarity_reward/mean": 0.5647268295288086, "rewards/ngram_similarity_reward/std": 0.31829366087913513, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 531.671875, "completions/mean_terminated_length": 531.671875, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.23271425374804205, "frac_reward_zero_std": 0.0, "grad_norm": 0.055209528654813766, "learning_rate": 4.94661595988661e-06, "loss": 0.0148, "num_tokens": 83383393.0, "reward": 2.1428885459899902, "reward_std": 1.666260004043579, "rewards/accuracy_reward/mean": 1.921875, "rewards/accuracy_reward/std": 2.9857051372528076, "rewards/ngram_similarity_reward/mean": 0.22101356089115143, "rewards/ngram_similarity_reward/std": 0.13001449406147003, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 466.421875, "completions/mean_terminated_length": 466.421875, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.233161781159096, "frac_reward_zero_std": 0.0, "grad_norm": 0.061757054179906845, "learning_rate": 4.9462548709887165e-06, "loss": 0.0202, "num_tokens": 83538620.0, "reward": 2.524232864379883, "reward_std": 1.1955227851867676, "rewards/accuracy_reward/mean": 2.15625, "rewards/accuracy_reward/std": 3.0768916606903076, "rewards/ngram_similarity_reward/mean": 0.36798280477523804, "rewards/ngram_similarity_reward/std": 0.37950998544692993, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 401.828125, "completions/mean_terminated_length": 401.828125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.23360930857014992, "frac_reward_zero_std": 0.0, "grad_norm": 0.064455047249794, "learning_rate": 4.945892579764145e-06, "loss": 0.0213, "num_tokens": 83677825.0, "reward": 4.467645168304443, "reward_std": 1.6417489051818848, "rewards/accuracy_reward/mean": 3.796875, "rewards/accuracy_reward/std": 2.746886730194092, "rewards/ngram_similarity_reward/mean": 0.6707702279090881, "rewards/ngram_similarity_reward/std": 0.3119213581085205, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1616.0, "completions/max_terminated_length": 1616.0, "completions/mean_length": 559.234375, "completions/mean_terminated_length": 559.234375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.23405683598120386, "frac_reward_zero_std": 0.0, "grad_norm": 0.05815133824944496, "learning_rate": 4.94552908641123e-06, "loss": 0.0387, "num_tokens": 83805856.0, "reward": 1.0161261558532715, "reward_std": 0.9117844104766846, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 2.4028754234313965, "rewards/ngram_similarity_reward/mean": 0.45362603664398193, "rewards/ngram_similarity_reward/std": 0.24171987175941467, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 481.328125, "completions/mean_terminated_length": 481.328125, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.23450436339225778, "frac_reward_zero_std": 0.0, "grad_norm": 0.05317988619208336, "learning_rate": 4.945164391128962e-06, "loss": -0.0071, "num_tokens": 83971189.0, "reward": 4.062438011169434, "reward_std": 1.3103262186050415, "rewards/accuracy_reward/mean": 3.578125, "rewards/accuracy_reward/std": 2.880171298980713, "rewards/ngram_similarity_reward/mean": 0.4843129515647888, "rewards/ngram_similarity_reward/std": 0.27475109696388245, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 336.828125, "completions/mean_terminated_length": 336.828125, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.2349518908033117, "frac_reward_zero_std": 0.0, "grad_norm": 0.07858358323574066, "learning_rate": 4.944798494116994e-06, "loss": 0.0186, "num_tokens": 84112474.0, "reward": 5.912669658660889, "reward_std": 0.9706466197967529, "rewards/accuracy_reward/mean": 5.3125, "rewards/accuracy_reward/std": 1.0522085428237915, "rewards/ngram_similarity_reward/mean": 0.60016930103302, "rewards/ngram_similarity_reward/std": 0.3875333070755005, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 545.75, "completions/mean_terminated_length": 545.75, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.23539941821436564, "frac_reward_zero_std": 0.0, "grad_norm": 0.04891042038798332, "learning_rate": 4.944431395575633e-06, "loss": -0.0229, "num_tokens": 84290170.0, "reward": 4.428959846496582, "reward_std": 1.7386280298233032, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.616459846496582, "rewards/ngram_similarity_reward/std": 0.30519163608551025, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 430.734375, "completions/mean_terminated_length": 430.734375, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.23584694562541955, "frac_reward_zero_std": 0.0, "grad_norm": 0.06413199752569199, "learning_rate": 4.944063095705845e-06, "loss": 0.0077, "num_tokens": 84453289.0, "reward": 3.515164375305176, "reward_std": 1.7847726345062256, "rewards/accuracy_reward/mean": 3.046875, "rewards/accuracy_reward/std": 2.991680145263672, "rewards/ngram_similarity_reward/mean": 0.4682896137237549, "rewards/ngram_similarity_reward/std": 0.20700648427009583, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 422.65625, "completions/mean_terminated_length": 422.65625, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.2362944730364735, "frac_reward_zero_std": 0.0, "grad_norm": 0.06800197064876556, "learning_rate": 4.943693594709251e-06, "loss": -0.0152, "num_tokens": 84677251.0, "reward": 4.715863227844238, "reward_std": 1.7876237630844116, "rewards/accuracy_reward/mean": 4.078125, "rewards/accuracy_reward/std": 2.593059778213501, "rewards/ngram_similarity_reward/mean": 0.6377381086349487, "rewards/ngram_similarity_reward/std": 0.3081703186035156, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 375.625, "completions/mean_terminated_length": 375.625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.2367420004475274, "frac_reward_zero_std": 0.0, "grad_norm": 0.0659157931804657, "learning_rate": 4.943322892788136e-06, "loss": 0.0055, "num_tokens": 84828923.0, "reward": 3.566446304321289, "reward_std": 0.9382291436195374, "rewards/accuracy_reward/mean": 3.15625, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.4101959764957428, "rewards/ngram_similarity_reward/std": 0.34982484579086304, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 416.578125, "completions/mean_terminated_length": 416.578125, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.23718952785858133, "frac_reward_zero_std": 0.0, "grad_norm": 0.06760423630475998, "learning_rate": 4.942950990145438e-06, "loss": 0.0017, "num_tokens": 84963840.0, "reward": 4.4553704261779785, "reward_std": 0.24327006936073303, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.4553704261779785, "rewards/ngram_similarity_reward/std": 0.32216182351112366, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 395.46875, "completions/mean_terminated_length": 395.46875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.23763705526963527, "frac_reward_zero_std": 0.0, "grad_norm": 0.07767239212989807, "learning_rate": 4.9425778869847516e-06, "loss": 0.0157, "num_tokens": 85140046.0, "reward": 3.088334321975708, "reward_std": 2.573495864868164, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.5883344411849976, "rewards/ngram_similarity_reward/std": 0.31117013096809387, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 508.484375, "completions/mean_terminated_length": 508.484375, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.2380845826806892, "frac_reward_zero_std": 0.0, "grad_norm": 0.05766969919204712, "learning_rate": 4.94220358351033e-06, "loss": -0.0255, "num_tokens": 85300285.0, "reward": 1.054612398147583, "reward_std": 1.5021476745605469, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 2.292099714279175, "rewards/ngram_similarity_reward/mean": 0.5389874577522278, "rewards/ngram_similarity_reward/std": 0.26103782653808594, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 416.296875, "completions/mean_terminated_length": 416.296875, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.23853211009174313, "frac_reward_zero_std": 0.0, "grad_norm": 0.0637131929397583, "learning_rate": 4.941828079927083e-06, "loss": 0.0029, "num_tokens": 85415216.0, "reward": 3.05372953414917, "reward_std": 0.8926549553871155, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.5537295341491699, "rewards/ngram_similarity_reward/std": 0.30394914746284485, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 481.046875, "completions/mean_terminated_length": 481.046875, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.23897963750279705, "frac_reward_zero_std": 0.0, "grad_norm": 0.057355329394340515, "learning_rate": 4.941451376440579e-06, "loss": -0.0184, "num_tokens": 85631043.0, "reward": 3.322988510131836, "reward_std": 0.8318421244621277, "rewards/accuracy_reward/mean": 2.875, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.44798851013183594, "rewards/ngram_similarity_reward/std": 0.28369003534317017, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 461.140625, "completions/mean_terminated_length": 461.140625, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.23942716491385097, "frac_reward_zero_std": 0.0, "grad_norm": 0.058750320225954056, "learning_rate": 4.941073473257041e-06, "loss": -0.0238, "num_tokens": 85785164.0, "reward": 4.765369415283203, "reward_std": 1.5559390783309937, "rewards/accuracy_reward/mean": 4.1875, "rewards/accuracy_reward/std": 2.5, "rewards/ngram_similarity_reward/mean": 0.5778692960739136, "rewards/ngram_similarity_reward/std": 0.366667777299881, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 396.328125, "completions/mean_terminated_length": 396.328125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.2398746923249049, "frac_reward_zero_std": 0.0, "grad_norm": 0.06921112537384033, "learning_rate": 4.940694370583351e-06, "loss": -0.0388, "num_tokens": 85968417.0, "reward": 3.6020917892456055, "reward_std": 1.2433319091796875, "rewards/accuracy_reward/mean": 3.046875, "rewards/accuracy_reward/std": 2.991680145263672, "rewards/ngram_similarity_reward/mean": 0.555216908454895, "rewards/ngram_similarity_reward/std": 0.3706609606742859, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1172.0, "completions/max_terminated_length": 1172.0, "completions/mean_length": 389.203125, "completions/mean_terminated_length": 389.203125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.24032221973595883, "frac_reward_zero_std": 0.25, "grad_norm": 0.06513098627328873, "learning_rate": 4.9403140686270455e-06, "loss": 0.0764, "num_tokens": 86096974.0, "reward": 3.6188747882843018, "reward_std": 1.255029320716858, "rewards/accuracy_reward/mean": 2.9375, "rewards/accuracy_reward/std": 3.028305768966675, "rewards/ngram_similarity_reward/mean": 0.6813750267028809, "rewards/ngram_similarity_reward/std": 0.4902365207672119, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 367.234375, "completions/mean_terminated_length": 367.234375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.24076974714701274, "frac_reward_zero_std": 0.0, "grad_norm": 0.06107205152511597, "learning_rate": 4.939932567596319e-06, "loss": 0.0035, "num_tokens": 86263213.0, "reward": 2.052935838699341, "reward_std": 0.839949905872345, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 2.872281312942505, "rewards/ngram_similarity_reward/mean": 0.4904356896877289, "rewards/ngram_similarity_reward/std": 0.13649572432041168, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 384.78125, "completions/mean_terminated_length": 384.78125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.24121727455806669, "frac_reward_zero_std": 0.0, "grad_norm": 0.06742729246616364, "learning_rate": 4.939549867700022e-06, "loss": -0.0325, "num_tokens": 86399407.0, "reward": 4.329132080078125, "reward_std": 1.331215500831604, "rewards/accuracy_reward/mean": 3.53125, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.7978819608688354, "rewards/ngram_similarity_reward/std": 0.31928256154060364, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 413.546875, "completions/mean_terminated_length": 413.546875, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.2416648019691206, "frac_reward_zero_std": 0.0, "grad_norm": 0.06391128152608871, "learning_rate": 4.939165969147662e-06, "loss": -0.033, "num_tokens": 86631522.0, "reward": 3.186141014099121, "reward_std": 0.22396305203437805, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6861410737037659, "rewards/ngram_similarity_reward/std": 0.2991383373737335, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 441.0625, "completions/mean_terminated_length": 441.0625, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.24211232938017455, "frac_reward_zero_std": 0.0, "grad_norm": 0.05881042405962944, "learning_rate": 4.9387808721494e-06, "loss": -0.0381, "num_tokens": 86759174.0, "reward": 4.801782608032227, "reward_std": 1.611395239830017, "rewards/accuracy_reward/mean": 4.1875, "rewards/accuracy_reward/std": 2.5, "rewards/ngram_similarity_reward/mean": 0.6142823696136475, "rewards/ngram_similarity_reward/std": 0.27828091382980347, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 446.796875, "completions/mean_terminated_length": 446.796875, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.24255985679122846, "frac_reward_zero_std": 0.0, "grad_norm": 0.08241968601942062, "learning_rate": 4.938394576916057e-06, "loss": 0.0036, "num_tokens": 86895049.0, "reward": 5.214838027954102, "reward_std": 1.254507303237915, "rewards/accuracy_reward/mean": 4.640625, "rewards/accuracy_reward/std": 2.1445181369781494, "rewards/ngram_similarity_reward/mean": 0.5742127895355225, "rewards/ngram_similarity_reward/std": 0.232599139213562, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 489.828125, "completions/mean_terminated_length": 489.828125, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.24300738420228238, "frac_reward_zero_std": 0.0, "grad_norm": 0.09177494049072266, "learning_rate": 4.938007083659106e-06, "loss": -0.0308, "num_tokens": 87020622.0, "reward": 4.419434547424316, "reward_std": 1.546202301979065, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.6069345474243164, "rewards/ngram_similarity_reward/std": 0.2924477756023407, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 970.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 621.8125, "completions/mean_terminated_length": 621.8125, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.24345491161333632, "frac_reward_zero_std": 0.0, "grad_norm": 0.05039747431874275, "learning_rate": 4.937618392590681e-06, "loss": -0.002, "num_tokens": 87177474.0, "reward": 3.1579737663269043, "reward_std": 1.5208743810653687, "rewards/accuracy_reward/mean": 2.578125, "rewards/accuracy_reward/std": 3.0410144329071045, "rewards/ngram_similarity_reward/mean": 0.5798487067222595, "rewards/ngram_similarity_reward/std": 0.34609490633010864, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 330.390625, "completions/mean_terminated_length": 330.390625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.24390243902439024, "frac_reward_zero_std": 0.0, "grad_norm": 0.08228424936532974, "learning_rate": 4.9372285039235654e-06, "loss": -0.0113, "num_tokens": 87311051.0, "reward": 2.638988494873047, "reward_std": 0.8976145386695862, "rewards/accuracy_reward/mean": 2.03125, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.6077384352684021, "rewards/ngram_similarity_reward/std": 0.3406791687011719, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 377.875, "completions/mean_terminated_length": 377.875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.24434996643544418, "frac_reward_zero_std": 0.0, "grad_norm": 0.07295098900794983, "learning_rate": 4.9368374178712035e-06, "loss": -0.0202, "num_tokens": 87509763.0, "reward": 3.602973461151123, "reward_std": 0.9902347326278687, "rewards/accuracy_reward/mean": 3.140625, "rewards/accuracy_reward/std": 2.9727182388305664, "rewards/ngram_similarity_reward/mean": 0.4623487591743469, "rewards/ngram_similarity_reward/std": 0.2751040458679199, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 383.0, "completions/mean_terminated_length": 383.0, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.2447974938464981, "frac_reward_zero_std": 0.0, "grad_norm": 0.1062786802649498, "learning_rate": 4.936445134647692e-06, "loss": 0.0131, "num_tokens": 87664179.0, "reward": 4.703701972961426, "reward_std": 0.20016412436962128, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7037018537521362, "rewards/ngram_similarity_reward/std": 0.39074793457984924, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 402.8125, "completions/mean_terminated_length": 402.8125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.24524502125755201, "frac_reward_zero_std": 0.0, "grad_norm": 0.10711069405078888, "learning_rate": 4.9360516544677835e-06, "loss": 0.0029, "num_tokens": 87875207.0, "reward": 5.1088080406188965, "reward_std": 1.2155022621154785, "rewards/accuracy_reward/mean": 4.375, "rewards/accuracy_reward/std": 2.3603873252868652, "rewards/ngram_similarity_reward/mean": 0.7338082790374756, "rewards/ngram_similarity_reward/std": 0.3225592076778412, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 456.375, "completions/mean_terminated_length": 456.375, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.24569254866860596, "frac_reward_zero_std": 0.0, "grad_norm": 0.05813458189368248, "learning_rate": 4.935656977546889e-06, "loss": -0.0346, "num_tokens": 88034511.0, "reward": 5.104488372802734, "reward_std": 0.8898274302482605, "rewards/accuracy_reward/mean": 4.65625, "rewards/accuracy_reward/std": 2.102294683456421, "rewards/ngram_similarity_reward/mean": 0.44823816418647766, "rewards/ngram_similarity_reward/std": 0.2638348937034607, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 417.125, "completions/mean_terminated_length": 417.125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.24614007607965988, "frac_reward_zero_std": 0.0, "grad_norm": 0.06147436797618866, "learning_rate": 4.935261104101069e-06, "loss": 0.0314, "num_tokens": 88210951.0, "reward": 3.0853829383850098, "reward_std": 0.24257975816726685, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.5853830575942993, "rewards/ngram_similarity_reward/std": 0.3101142346858978, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 443.75, "completions/mean_terminated_length": 443.75, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.24658760349071382, "frac_reward_zero_std": 0.0, "grad_norm": 0.0830966979265213, "learning_rate": 4.9348640343470435e-06, "loss": 0.0007, "num_tokens": 88340759.0, "reward": 4.599246978759766, "reward_std": 0.25810831785202026, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5992466807365417, "rewards/ngram_similarity_reward/std": 0.3302537798881531, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 527.828125, "completions/mean_terminated_length": 527.828125, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.24703513090176774, "frac_reward_zero_std": 0.0, "grad_norm": 0.055706046521663666, "learning_rate": 4.934465768502187e-06, "loss": -0.0198, "num_tokens": 88516476.0, "reward": 5.346339225769043, "reward_std": 2.015418529510498, "rewards/accuracy_reward/mean": 4.59375, "rewards/accuracy_reward/std": 2.265817403793335, "rewards/ngram_similarity_reward/mean": 0.7525894641876221, "rewards/ngram_similarity_reward/std": 0.3101097345352173, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 355.40625, "completions/mean_terminated_length": 355.40625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.24748265831282165, "frac_reward_zero_std": 0.0, "grad_norm": 0.0677371695637703, "learning_rate": 4.934066306784525e-06, "loss": 0.0648, "num_tokens": 88676646.0, "reward": 4.451678276062012, "reward_std": 1.0497232675552368, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.5454282760620117, "rewards/ngram_similarity_reward/std": 0.360625684261322, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 370.1875, "completions/mean_terminated_length": 370.1875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.2479301857238756, "frac_reward_zero_std": 0.0, "grad_norm": 0.06734210252761841, "learning_rate": 4.933665649412743e-06, "loss": 0.0108, "num_tokens": 88806306.0, "reward": 3.965791702270508, "reward_std": 0.9451578259468079, "rewards/accuracy_reward/mean": 3.25, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.7157918810844421, "rewards/ngram_similarity_reward/std": 0.2672803997993469, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1025.0, "completions/max_terminated_length": 1025.0, "completions/mean_length": 435.890625, "completions/mean_terminated_length": 435.890625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.2483777131349295, "frac_reward_zero_std": 0.0, "grad_norm": 0.05665924772620201, "learning_rate": 4.933263796606178e-06, "loss": 0.0323, "num_tokens": 88980523.0, "reward": 4.632099628448486, "reward_std": 0.13429510593414307, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6320995688438416, "rewards/ngram_similarity_reward/std": 0.32506057620048523, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 434.3125, "completions/mean_terminated_length": 434.3125, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.24882524054598343, "frac_reward_zero_std": 0.0, "grad_norm": 0.0613069124519825, "learning_rate": 4.9328607485848205e-06, "loss": -0.0059, "num_tokens": 89145327.0, "reward": 1.708435297012329, "reward_std": 2.089996814727783, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.3334353566169739, "rewards/ngram_similarity_reward/std": 0.22431553900241852, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 421.765625, "completions/mean_terminated_length": 421.765625, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.24927276795703737, "frac_reward_zero_std": 0.0, "grad_norm": 0.0642596036195755, "learning_rate": 4.932456505569318e-06, "loss": -0.0034, "num_tokens": 89303664.0, "reward": 4.698173999786377, "reward_std": 1.4473786354064941, "rewards/accuracy_reward/mean": 4.0625, "rewards/accuracy_reward/std": 2.623913288116455, "rewards/ngram_similarity_reward/mean": 0.635674238204956, "rewards/ngram_similarity_reward/std": 0.35289034247398376, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 478.65625, "completions/mean_terminated_length": 478.65625, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.2497202953680913, "frac_reward_zero_std": 0.0, "grad_norm": 0.05370509624481201, "learning_rate": 4.9320510677809705e-06, "loss": -0.0157, "num_tokens": 89444426.0, "reward": 3.192556858062744, "reward_std": 1.486718773841858, "rewards/accuracy_reward/mean": 2.765625, "rewards/accuracy_reward/std": 3.0302298069000244, "rewards/ngram_similarity_reward/mean": 0.4269317388534546, "rewards/ngram_similarity_reward/std": 0.24186871945858002, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 431.53125, "completions/mean_terminated_length": 431.53125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.25016782277914523, "frac_reward_zero_std": 0.0, "grad_norm": 0.06211641803383827, "learning_rate": 4.931644435441732e-06, "loss": 0.0252, "num_tokens": 89578060.0, "reward": 4.079433917999268, "reward_std": 1.3576300144195557, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.4544336199760437, "rewards/ngram_similarity_reward/std": 0.30528175830841064, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 867.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 511.625, "completions/mean_terminated_length": 511.625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.25061535019019915, "frac_reward_zero_std": 0.0, "grad_norm": 0.06391696631908417, "learning_rate": 4.931236608774213e-06, "loss": 0.0084, "num_tokens": 89798164.0, "reward": 3.956958055496216, "reward_std": 1.2396081686019897, "rewards/accuracy_reward/mean": 3.421875, "rewards/accuracy_reward/std": 2.896657705307007, "rewards/ngram_similarity_reward/mean": 0.535082995891571, "rewards/ngram_similarity_reward/std": 0.20237237215042114, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 542.390625, "completions/mean_terminated_length": 542.390625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.25106287760125306, "frac_reward_zero_std": 0.0, "grad_norm": 0.06036326289176941, "learning_rate": 4.930827588001673e-06, "loss": 0.0104, "num_tokens": 90015293.0, "reward": 4.204543113708496, "reward_std": 1.2788963317871094, "rewards/accuracy_reward/mean": 3.796875, "rewards/accuracy_reward/std": 2.746886730194092, "rewards/ngram_similarity_reward/mean": 0.4076683521270752, "rewards/ngram_similarity_reward/std": 0.32381314039230347, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 342.453125, "completions/mean_terminated_length": 342.453125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.251510405012307, "frac_reward_zero_std": 0.0, "grad_norm": 0.07648829370737076, "learning_rate": 4.93041737334803e-06, "loss": 0.0066, "num_tokens": 90127866.0, "reward": 5.420676231384277, "reward_std": 2.02937912940979, "rewards/accuracy_reward/mean": 4.75, "rewards/accuracy_reward/std": 2.0, "rewards/ngram_similarity_reward/mean": 0.6706759333610535, "rewards/ngram_similarity_reward/std": 0.3660717010498047, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 757.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 482.9375, "completions/mean_terminated_length": 482.9375, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.25195793242336095, "frac_reward_zero_std": 0.0, "grad_norm": 0.05291726812720299, "learning_rate": 4.930005965037853e-06, "loss": 0.0074, "num_tokens": 90269814.0, "reward": 5.875433921813965, "reward_std": 0.8267433047294617, "rewards/accuracy_reward/mean": 5.21875, "rewards/accuracy_reward/std": 1.2782522439956665, "rewards/ngram_similarity_reward/mean": 0.6566839218139648, "rewards/ngram_similarity_reward/std": 0.3229796588420868, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 426.65625, "completions/mean_terminated_length": 426.65625, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.25240545983441487, "frac_reward_zero_std": 0.0, "grad_norm": 0.0659690722823143, "learning_rate": 4.929593363296365e-06, "loss": -0.0142, "num_tokens": 90419888.0, "reward": 5.506959915161133, "reward_std": 1.3257946968078613, "rewards/accuracy_reward/mean": 4.9375, "rewards/accuracy_reward/std": 1.7627090215682983, "rewards/ngram_similarity_reward/mean": 0.5694600939750671, "rewards/ngram_similarity_reward/std": 0.2535358667373657, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1374.0, "completions/max_terminated_length": 1374.0, "completions/mean_length": 612.265625, "completions/mean_terminated_length": 612.265625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.2528529872454688, "frac_reward_zero_std": 0.0, "grad_norm": 0.05176481604576111, "learning_rate": 4.929179568349442e-06, "loss": 0.0655, "num_tokens": 90609329.0, "reward": 3.9212515354156494, "reward_std": 1.6650469303131104, "rewards/accuracy_reward/mean": 3.328125, "rewards/accuracy_reward/std": 2.9252848625183105, "rewards/ngram_similarity_reward/mean": 0.5931264162063599, "rewards/ngram_similarity_reward/std": 0.31439313292503357, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 438.28125, "completions/mean_terminated_length": 438.28125, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.2533005146565227, "frac_reward_zero_std": 0.0, "grad_norm": 0.059034351259469986, "learning_rate": 4.928764580423615e-06, "loss": 0.0107, "num_tokens": 90790563.0, "reward": 5.449217796325684, "reward_std": 1.4397118091583252, "rewards/accuracy_reward/mean": 4.828125, "rewards/accuracy_reward/std": 1.9359153509140015, "rewards/ngram_similarity_reward/mean": 0.6210930943489075, "rewards/ngram_similarity_reward/std": 0.34128040075302124, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 453.390625, "completions/mean_terminated_length": 453.390625, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.2537480420675766, "frac_reward_zero_std": 0.0, "grad_norm": 0.05689390376210213, "learning_rate": 4.928348399746066e-06, "loss": -0.0143, "num_tokens": 90948892.0, "reward": 5.924552917480469, "reward_std": 0.4729365110397339, "rewards/accuracy_reward/mean": 5.40625, "rewards/accuracy_reward/std": 0.7500000596046448, "rewards/ngram_similarity_reward/mean": 0.5183030366897583, "rewards/ngram_similarity_reward/std": 0.37194132804870605, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 553.03125, "completions/mean_terminated_length": 553.03125, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.2541955694786306, "frac_reward_zero_std": 0.0, "grad_norm": 0.0526115708053112, "learning_rate": 4.927931026544628e-06, "loss": 0.0328, "num_tokens": 91128334.0, "reward": 1.770582914352417, "reward_std": 0.6807299852371216, "rewards/accuracy_reward/mean": 1.171875, "rewards/accuracy_reward/std": 2.7316761016845703, "rewards/ngram_similarity_reward/mean": 0.5987077951431274, "rewards/ngram_similarity_reward/std": 0.27312007546424866, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 375.359375, "completions/mean_terminated_length": 375.359375, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.2546430968896845, "frac_reward_zero_std": 0.0, "grad_norm": 0.06447634100914001, "learning_rate": 4.927512461047794e-06, "loss": -0.0145, "num_tokens": 91282837.0, "reward": 4.480569839477539, "reward_std": 0.23406577110290527, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.4805700480937958, "rewards/ngram_similarity_reward/std": 0.3614371120929718, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 416.34375, "completions/mean_terminated_length": 416.34375, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.2550906243007384, "frac_reward_zero_std": 0.0, "grad_norm": 0.060128338634967804, "learning_rate": 4.927092703484701e-06, "loss": 0.0107, "num_tokens": 91429115.0, "reward": 3.222437620162964, "reward_std": 0.2556608319282532, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.7224376797676086, "rewards/ngram_similarity_reward/std": 0.39764177799224854, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 459.890625, "completions/mean_terminated_length": 459.890625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.25553815171179234, "frac_reward_zero_std": 0.0, "grad_norm": 0.05413290858268738, "learning_rate": 4.926671754085146e-06, "loss": 0.0039, "num_tokens": 91551092.0, "reward": 4.4916276931762695, "reward_std": 0.5226418972015381, "rewards/accuracy_reward/mean": 4.078125, "rewards/accuracy_reward/std": 2.593059778213501, "rewards/ngram_similarity_reward/mean": 0.41350257396698, "rewards/ngram_similarity_reward/std": 0.3172895312309265, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 465.28125, "completions/mean_terminated_length": 465.28125, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.25598567912284625, "frac_reward_zero_std": 0.0, "grad_norm": 0.059520188719034195, "learning_rate": 4.9262496130795735e-06, "loss": -0.019, "num_tokens": 91701702.0, "reward": 3.140317916870117, "reward_std": 0.5068360567092896, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.5465677976608276, "rewards/ngram_similarity_reward/std": 0.3046760559082031, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 298.0, "completions/mean_terminated_length": 298.0, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.2564332065339002, "frac_reward_zero_std": 0.0, "grad_norm": 0.07516378164291382, "learning_rate": 4.925826280699083e-06, "loss": 0.0017, "num_tokens": 91828230.0, "reward": 4.201871395111084, "reward_std": 1.5170429944992065, "rewards/accuracy_reward/mean": 3.5, "rewards/accuracy_reward/std": 2.8894994258880615, "rewards/ngram_similarity_reward/mean": 0.7018713355064392, "rewards/ngram_similarity_reward/std": 0.43469229340553284, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 450.765625, "completions/mean_terminated_length": 450.765625, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.25688073394495414, "frac_reward_zero_std": 0.0, "grad_norm": 0.05854872614145279, "learning_rate": 4.9254017571754246e-06, "loss": 0.0312, "num_tokens": 91985671.0, "reward": 6.019687652587891, "reward_std": 0.4594746530056, "rewards/accuracy_reward/mean": 5.40625, "rewards/accuracy_reward/std": 0.7500000596046448, "rewards/ngram_similarity_reward/mean": 0.6134378910064697, "rewards/ngram_similarity_reward/std": 0.34068456292152405, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 473.34375, "completions/mean_terminated_length": 473.34375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.25732826135600806, "frac_reward_zero_std": 0.0, "grad_norm": 0.04823269322514534, "learning_rate": 4.924976042741001e-06, "loss": 0.0281, "num_tokens": 92129325.0, "reward": 2.2590408325195312, "reward_std": 0.914676308631897, "rewards/accuracy_reward/mean": 1.84375, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.4152909517288208, "rewards/ngram_similarity_reward/std": 0.27419641613960266, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 379.921875, "completions/mean_terminated_length": 379.921875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.257775788767062, "frac_reward_zero_std": 0.0, "grad_norm": 0.06938162446022034, "learning_rate": 4.924549137628868e-06, "loss": 0.0411, "num_tokens": 92242680.0, "reward": 5.770617961883545, "reward_std": 0.8946278095245361, "rewards/accuracy_reward/mean": 5.03125, "rewards/accuracy_reward/std": 1.6229382753372192, "rewards/ngram_similarity_reward/mean": 0.7393677234649658, "rewards/ngram_similarity_reward/std": 0.35116398334503174, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 379.328125, "completions/mean_terminated_length": 379.328125, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.2582233161781159, "frac_reward_zero_std": 0.0, "grad_norm": 0.06705532222986221, "learning_rate": 4.924121042072731e-06, "loss": -0.0318, "num_tokens": 92421613.0, "reward": 3.4321212768554688, "reward_std": 1.660041332244873, "rewards/accuracy_reward/mean": 2.78125, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.6508713960647583, "rewards/ngram_similarity_reward/std": 0.3138256072998047, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 391.171875, "completions/mean_terminated_length": 391.171875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.25867084358916986, "frac_reward_zero_std": 0.0, "grad_norm": 0.06797738373279572, "learning_rate": 4.92369175630695e-06, "loss": 0.0537, "num_tokens": 92648280.0, "reward": 0.8223634958267212, "reward_std": 1.387064814567566, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 2.195775270462036, "rewards/ngram_similarity_reward/mean": 0.3848634958267212, "rewards/ngram_similarity_reward/std": 0.22720791399478912, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 367.59375, "completions/mean_terminated_length": 367.59375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.2591183710002238, "frac_reward_zero_std": 0.0, "grad_norm": 0.0671374499797821, "learning_rate": 4.923261280566534e-06, "loss": -0.0254, "num_tokens": 92783486.0, "reward": 5.127212047576904, "reward_std": 2.018561363220215, "rewards/accuracy_reward/mean": 4.515625, "rewards/accuracy_reward/std": 2.312781572341919, "rewards/ngram_similarity_reward/mean": 0.6115868091583252, "rewards/ngram_similarity_reward/std": 0.35721004009246826, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 394.390625, "completions/mean_terminated_length": 394.390625, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.2595658984112777, "frac_reward_zero_std": 0.0, "grad_norm": 0.07511835545301437, "learning_rate": 4.922829615087144e-06, "loss": -0.0139, "num_tokens": 93007367.0, "reward": 3.1698343753814697, "reward_std": 0.7463058233261108, "rewards/accuracy_reward/mean": 2.765625, "rewards/accuracy_reward/std": 3.0302298069000244, "rewards/ngram_similarity_reward/mean": 0.4042094647884369, "rewards/ngram_similarity_reward/std": 0.3607175946235657, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 405.515625, "completions/mean_terminated_length": 405.515625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.2600134258223316, "frac_reward_zero_std": 0.0, "grad_norm": 0.051304783672094345, "learning_rate": 4.922396760105093e-06, "loss": -0.0243, "num_tokens": 93179992.0, "reward": 4.243990421295166, "reward_std": 1.482818365097046, "rewards/accuracy_reward/mean": 3.34375, "rewards/accuracy_reward/std": 2.9016621112823486, "rewards/ngram_similarity_reward/mean": 0.9002405405044556, "rewards/ngram_similarity_reward/std": 0.30516666173934937, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 734.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 444.40625, "completions/mean_terminated_length": 444.40625, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.26046095323338553, "frac_reward_zero_std": 0.0, "grad_norm": 0.06275703012943268, "learning_rate": 4.921962715857346e-06, "loss": 0.0095, "num_tokens": 93345746.0, "reward": 3.421440601348877, "reward_std": 0.8881598711013794, "rewards/accuracy_reward/mean": 2.96875, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.45269080996513367, "rewards/ngram_similarity_reward/std": 0.2987428307533264, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 409.375, "completions/mean_terminated_length": 409.375, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.26090848064443944, "frac_reward_zero_std": 0.0, "grad_norm": 0.06608902662992477, "learning_rate": 4.921527482581515e-06, "loss": 0.0212, "num_tokens": 93504730.0, "reward": 4.2971320152282715, "reward_std": 0.9389599561691284, "rewards/accuracy_reward/mean": 3.796875, "rewards/accuracy_reward/std": 2.746886730194092, "rewards/ngram_similarity_reward/mean": 0.5002568960189819, "rewards/ngram_similarity_reward/std": 0.30913400650024414, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 754.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 464.640625, "completions/mean_terminated_length": 464.640625, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.2613560080554934, "frac_reward_zero_std": 0.0, "grad_norm": 0.06220301240682602, "learning_rate": 4.921091060515869e-06, "loss": -0.001, "num_tokens": 93729187.0, "reward": 3.043252944946289, "reward_std": 1.370833158493042, "rewards/accuracy_reward/mean": 2.6875, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.3557531535625458, "rewards/ngram_similarity_reward/std": 0.26693427562713623, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 476.5, "completions/mean_terminated_length": 476.5, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.26180353546654733, "frac_reward_zero_std": 0.0, "grad_norm": 0.048784174025058746, "learning_rate": 4.920653449899324e-06, "loss": 0.0125, "num_tokens": 93899619.0, "reward": 4.929577350616455, "reward_std": 0.9999752044677734, "rewards/accuracy_reward/mean": 4.09375, "rewards/accuracy_reward/std": 2.5617377758026123, "rewards/ngram_similarity_reward/mean": 0.8358274698257446, "rewards/ngram_similarity_reward/std": 0.2971649169921875, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 414.375, "completions/mean_terminated_length": 414.375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.26225106287760125, "frac_reward_zero_std": 0.25, "grad_norm": 0.06329008936882019, "learning_rate": 4.920214650971446e-06, "loss": -0.013, "num_tokens": 94070251.0, "reward": 3.229645013809204, "reward_std": 1.3219417333602905, "rewards/accuracy_reward/mean": 2.671875, "rewards/accuracy_reward/std": 3.037097215652466, "rewards/ngram_similarity_reward/mean": 0.5577700138092041, "rewards/ngram_similarity_reward/std": 0.2504371404647827, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 413.078125, "completions/mean_terminated_length": 413.078125, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.26269859028865516, "frac_reward_zero_std": 0.0, "grad_norm": 0.06915739178657532, "learning_rate": 4.919774663972455e-06, "loss": 0.0026, "num_tokens": 94293536.0, "reward": 4.383290767669678, "reward_std": 1.0135438442230225, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.6645410060882568, "rewards/ngram_similarity_reward/std": 0.35068821907043457, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 404.625, "completions/mean_terminated_length": 404.625, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.2631461176997091, "frac_reward_zero_std": 0.0, "grad_norm": 0.06700963526964188, "learning_rate": 4.919333489143217e-06, "loss": 0.0152, "num_tokens": 94442600.0, "reward": 2.7941768169403076, "reward_std": 1.2157671451568604, "rewards/accuracy_reward/mean": 2.109375, "rewards/accuracy_reward/std": 3.0164480209350586, "rewards/ngram_similarity_reward/mean": 0.6848018169403076, "rewards/ngram_similarity_reward/std": 0.22740662097930908, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 379.25, "completions/mean_terminated_length": 379.25, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.26359364511076305, "frac_reward_zero_std": 0.0, "grad_norm": 0.07101466506719589, "learning_rate": 4.918891126725251e-06, "loss": -0.0079, "num_tokens": 94573016.0, "reward": 3.7072603702545166, "reward_std": 1.4600337743759155, "rewards/accuracy_reward/mean": 3.140625, "rewards/accuracy_reward/std": 2.9727182388305664, "rewards/ngram_similarity_reward/mean": 0.5666354894638062, "rewards/ngram_similarity_reward/std": 0.40288856625556946, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 446.828125, "completions/mean_terminated_length": 446.828125, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.26404117252181697, "frac_reward_zero_std": 0.0, "grad_norm": 0.057009000331163406, "learning_rate": 4.918447576960727e-06, "loss": -0.0193, "num_tokens": 94736301.0, "reward": 3.3594250679016113, "reward_std": 2.1236746311187744, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.7656752467155457, "rewards/ngram_similarity_reward/std": 0.3162704110145569, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 878.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 512.671875, "completions/mean_terminated_length": 512.671875, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.2644886999328709, "frac_reward_zero_std": 0.0, "grad_norm": 0.06178559362888336, "learning_rate": 4.918002840092462e-06, "loss": 0.0004, "num_tokens": 94929144.0, "reward": 2.7129454612731934, "reward_std": 1.9928107261657715, "rewards/accuracy_reward/mean": 2.21875, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.4941953420639038, "rewards/ngram_similarity_reward/std": 0.21917389333248138, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 337.3125, "completions/mean_terminated_length": 337.3125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.2649362273439248, "frac_reward_zero_std": 0.25, "grad_norm": 0.06375846266746521, "learning_rate": 4.917556916363926e-06, "loss": 0.0134, "num_tokens": 95060652.0, "reward": 4.1918253898620605, "reward_std": 1.1015440225601196, "rewards/accuracy_reward/mean": 3.53125, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.6605754494667053, "rewards/ngram_similarity_reward/std": 0.41147440671920776, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 410.96875, "completions/mean_terminated_length": 410.96875, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.2653837547549787, "frac_reward_zero_std": 0.0, "grad_norm": 0.05672091618180275, "learning_rate": 4.917109806019236e-06, "loss": -0.0009, "num_tokens": 95203018.0, "reward": 3.603363513946533, "reward_std": 1.3123035430908203, "rewards/accuracy_reward/mean": 2.78125, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.8221136331558228, "rewards/ngram_similarity_reward/std": 0.3107687532901764, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 941.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 539.28125, "completions/mean_terminated_length": 539.28125, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.2658312821660327, "frac_reward_zero_std": 0.0, "grad_norm": 0.048371218144893646, "learning_rate": 4.916661509303162e-06, "loss": 0.0377, "num_tokens": 95402348.0, "reward": 5.1871442794799805, "reward_std": 1.5259716510772705, "rewards/accuracy_reward/mean": 4.625, "rewards/accuracy_reward/std": 2.1858129501342773, "rewards/ngram_similarity_reward/mean": 0.5621447563171387, "rewards/ngram_similarity_reward/std": 0.3073975145816803, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 373.84375, "completions/mean_terminated_length": 373.84375, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.2662788095770866, "frac_reward_zero_std": 0.0, "grad_norm": 0.06661540269851685, "learning_rate": 4.9162120264611195e-06, "loss": 0.0047, "num_tokens": 95524130.0, "reward": 4.550005912780762, "reward_std": 0.2907702922821045, "rewards/accuracy_reward/mean": 3.984375, "rewards/accuracy_reward/std": 2.648702621459961, "rewards/ngram_similarity_reward/mean": 0.5656312704086304, "rewards/ngram_similarity_reward/std": 0.3623160123825073, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 468.09375, "completions/mean_terminated_length": 468.09375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.2667263369881405, "frac_reward_zero_std": 0.0, "grad_norm": 0.061852775514125824, "learning_rate": 4.915761357739175e-06, "loss": 0.0104, "num_tokens": 95680824.0, "reward": 3.7030439376831055, "reward_std": 1.5367881059646606, "rewards/accuracy_reward/mean": 3.34375, "rewards/accuracy_reward/std": 2.9016621112823486, "rewards/ngram_similarity_reward/mean": 0.359294056892395, "rewards/ngram_similarity_reward/std": 0.24002881348133087, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 505.375, "completions/mean_terminated_length": 505.375, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.26717386439919444, "frac_reward_zero_std": 0.0, "grad_norm": 0.05682015046477318, "learning_rate": 4.915309503384046e-06, "loss": 0.0316, "num_tokens": 95840192.0, "reward": 5.281811237335205, "reward_std": 1.534910798072815, "rewards/accuracy_reward/mean": 4.65625, "rewards/accuracy_reward/std": 2.102294683456421, "rewards/ngram_similarity_reward/mean": 0.6255611181259155, "rewards/ngram_similarity_reward/std": 0.31543436646461487, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1536.0, "completions/max_terminated_length": 1536.0, "completions/mean_length": 482.359375, "completions/mean_terminated_length": 482.359375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.26762139181024835, "frac_reward_zero_std": 0.0, "grad_norm": 0.05381014198064804, "learning_rate": 4.9148564636430965e-06, "loss": -0.0006, "num_tokens": 96013367.0, "reward": 0.008737348020076752, "reward_std": 0.182732954621315, "rewards/accuracy_reward/mean": -0.515625, "rewards/accuracy_reward/std": 0.125, "rewards/ngram_similarity_reward/mean": 0.524362325668335, "rewards/ngram_similarity_reward/std": 0.3303601145744324, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 415.1875, "completions/mean_terminated_length": 415.1875, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.2680689192213023, "frac_reward_zero_std": 0.0, "grad_norm": 0.05661553516983986, "learning_rate": 4.91440223876434e-06, "loss": -0.0005, "num_tokens": 96122579.0, "reward": 4.493249893188477, "reward_std": 0.5313665270805359, "rewards/accuracy_reward/mean": 3.890625, "rewards/accuracy_reward/std": 2.6998953819274902, "rewards/ngram_similarity_reward/mean": 0.6026246547698975, "rewards/ngram_similarity_reward/std": 0.1737067550420761, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 501.140625, "completions/mean_terminated_length": 501.140625, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.26851644663235624, "frac_reward_zero_std": 0.0, "grad_norm": 0.05517631024122238, "learning_rate": 4.91394682899644e-06, "loss": -0.0006, "num_tokens": 96263420.0, "reward": 5.909665107727051, "reward_std": 0.9372274875640869, "rewards/accuracy_reward/mean": 5.296875, "rewards/accuracy_reward/std": 1.1433686017990112, "rewards/ngram_similarity_reward/mean": 0.6127904057502747, "rewards/ngram_similarity_reward/std": 0.2696954011917114, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 486.03125, "completions/mean_terminated_length": 486.03125, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.26896397404341016, "frac_reward_zero_std": 0.0, "grad_norm": 0.05566961690783501, "learning_rate": 4.913490234588708e-06, "loss": -0.0116, "num_tokens": 96395678.0, "reward": 3.3153233528137207, "reward_std": 0.9597666263580322, "rewards/accuracy_reward/mean": 2.484375, "rewards/accuracy_reward/std": 3.0419929027557373, "rewards/ngram_similarity_reward/mean": 0.8309487104415894, "rewards/ngram_similarity_reward/std": 0.2732831835746765, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 758.0, "completions/max_terminated_length": 758.0, "completions/mean_length": 456.859375, "completions/mean_terminated_length": 456.859375, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.2694115014544641, "frac_reward_zero_std": 0.0, "grad_norm": 0.06226060166954994, "learning_rate": 4.913032455791102e-06, "loss": 0.0153, "num_tokens": 96627125.0, "reward": 3.7133123874664307, "reward_std": 1.417924165725708, "rewards/accuracy_reward/mean": 3.21875, "rewards/accuracy_reward/std": 3.0833332538604736, "rewards/ngram_similarity_reward/mean": 0.4945622682571411, "rewards/ngram_similarity_reward/std": 0.2964348793029785, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 800.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 478.359375, "completions/mean_terminated_length": 478.359375, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.269859028865518, "frac_reward_zero_std": 0.0, "grad_norm": 0.054972272366285324, "learning_rate": 4.912573492854233e-06, "loss": 0.0164, "num_tokens": 96799596.0, "reward": 1.3673251867294312, "reward_std": 0.9833556413650513, "rewards/accuracy_reward/mean": 0.90625, "rewards/accuracy_reward/std": 2.5617377758026123, "rewards/ngram_similarity_reward/mean": 0.4610751271247864, "rewards/ngram_similarity_reward/std": 0.244536891579628, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 376.5625, "completions/mean_terminated_length": 376.5625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.27030655627657196, "frac_reward_zero_std": 0.0, "grad_norm": 0.06426282227039337, "learning_rate": 4.912113346029356e-06, "loss": 0.0069, "num_tokens": 96967024.0, "reward": 3.4562032222747803, "reward_std": 0.5571986436843872, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.8624532222747803, "rewards/ngram_similarity_reward/std": 0.2347019612789154, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 465.515625, "completions/mean_terminated_length": 440.3968505859375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.2707540836876259, "frac_reward_zero_std": 0.0, "grad_norm": 0.07138356566429138, "learning_rate": 4.911652015568376e-06, "loss": 0.0217, "num_tokens": 97095601.0, "reward": 1.1763570308685303, "reward_std": 1.5949087142944336, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 2.455153465270996, "rewards/ngram_similarity_reward/mean": 0.48885706067085266, "rewards/ngram_similarity_reward/std": 0.38667434453964233, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 342.734375, "completions/mean_terminated_length": 342.734375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.2712016110986798, "frac_reward_zero_std": 0.0, "grad_norm": 0.07411551475524902, "learning_rate": 4.911189501723846e-06, "loss": 0.0243, "num_tokens": 97224736.0, "reward": 3.03371000289917, "reward_std": 0.28567981719970703, "rewards/accuracy_reward/mean": 2.484375, "rewards/accuracy_reward/std": 3.0419929027557373, "rewards/ngram_similarity_reward/mean": 0.5493350028991699, "rewards/ngram_similarity_reward/std": 0.4061363935470581, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 361.578125, "completions/mean_terminated_length": 361.578125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.2716491385097337, "frac_reward_zero_std": 0.25, "grad_norm": 0.05806978419423103, "learning_rate": 4.9107258047489654e-06, "loss": 0.0089, "num_tokens": 97350325.0, "reward": 2.486035108566284, "reward_std": 1.347016453742981, "rewards/accuracy_reward/mean": 1.75, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.7360352277755737, "rewards/ngram_similarity_reward/std": 0.2674678862094879, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 406.8125, "completions/mean_terminated_length": 406.8125, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.27209666592078763, "frac_reward_zero_std": 0.0, "grad_norm": 0.0656108483672142, "learning_rate": 4.910260924897583e-06, "loss": 0.0071, "num_tokens": 97494473.0, "reward": 4.586330413818359, "reward_std": 0.5184668302536011, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.6800804138183594, "rewards/ngram_similarity_reward/std": 0.25902506709098816, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 392.21875, "completions/mean_terminated_length": 392.21875, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.2725441933318416, "frac_reward_zero_std": 0.0, "grad_norm": 0.06588999181985855, "learning_rate": 4.909794862424195e-06, "loss": -0.0084, "num_tokens": 97619063.0, "reward": 3.3414297103881836, "reward_std": 2.093870162963867, "rewards/accuracy_reward/mean": 2.875, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.4664297103881836, "rewards/ngram_similarity_reward/std": 0.28803008794784546, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 483.03125, "completions/mean_terminated_length": 483.03125, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.2729917207428955, "frac_reward_zero_std": 0.0, "grad_norm": 0.06073449179530144, "learning_rate": 4.909327617583943e-06, "loss": -0.013, "num_tokens": 97786841.0, "reward": 3.875952959060669, "reward_std": 0.8475580215454102, "rewards/accuracy_reward/mean": 3.4375, "rewards/accuracy_reward/std": 2.872281312942505, "rewards/ngram_similarity_reward/mean": 0.43845313787460327, "rewards/ngram_similarity_reward/std": 0.3433631360530853, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 367.546875, "completions/mean_terminated_length": 367.546875, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.27343924815394943, "frac_reward_zero_std": 0.0, "grad_norm": 0.09106756001710892, "learning_rate": 4.90885919063262e-06, "loss": -0.0131, "num_tokens": 97904540.0, "reward": 3.272808790206909, "reward_std": 1.9939448833465576, "rewards/accuracy_reward/mean": 2.6875, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.5853087306022644, "rewards/ngram_similarity_reward/std": 0.3691607713699341, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 358.8125, "completions/mean_terminated_length": 358.8125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.27388677556500335, "frac_reward_zero_std": 0.0, "grad_norm": 0.06978929042816162, "learning_rate": 4.908389581826661e-06, "loss": -0.0039, "num_tokens": 98030768.0, "reward": 4.42750883102417, "reward_std": 1.1210732460021973, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.615009069442749, "rewards/ngram_similarity_reward/std": 0.36108338832855225, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 391.140625, "completions/mean_terminated_length": 391.140625, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.27433430297605726, "frac_reward_zero_std": 0.0, "grad_norm": 0.07510551810264587, "learning_rate": 4.9079187914231515e-06, "loss": 0.0097, "num_tokens": 98201481.0, "reward": 3.266921043395996, "reward_std": 2.1021249294281006, "rewards/accuracy_reward/mean": 2.734375, "rewards/accuracy_reward/std": 3.0692648887634277, "rewards/ngram_similarity_reward/mean": 0.5325460433959961, "rewards/ngram_similarity_reward/std": 0.3077428638935089, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 525.453125, "completions/mean_terminated_length": 525.453125, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.27478183038711124, "frac_reward_zero_std": 0.0, "grad_norm": 0.06352479755878448, "learning_rate": 4.907446819679822e-06, "loss": 0.0376, "num_tokens": 98410278.0, "reward": 4.358247756958008, "reward_std": 0.5445563793182373, "rewards/accuracy_reward/mean": 3.890625, "rewards/accuracy_reward/std": 2.6998953819274902, "rewards/ngram_similarity_reward/mean": 0.4676225185394287, "rewards/ngram_similarity_reward/std": 0.31227847933769226, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 389.625, "completions/mean_terminated_length": 389.625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.27522935779816515, "frac_reward_zero_std": 0.0, "grad_norm": 0.09563887119293213, "learning_rate": 4.906973666855053e-06, "loss": -0.0267, "num_tokens": 98536878.0, "reward": 4.742339611053467, "reward_std": 0.5450695157051086, "rewards/accuracy_reward/mean": 4.09375, "rewards/accuracy_reward/std": 2.5617377758026123, "rewards/ngram_similarity_reward/mean": 0.6485894918441772, "rewards/ngram_similarity_reward/std": 0.36741480231285095, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 844.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 471.078125, "completions/mean_terminated_length": 471.078125, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.27567688520921907, "frac_reward_zero_std": 0.0, "grad_norm": 0.060624074190855026, "learning_rate": 4.906499333207868e-06, "loss": 0.0118, "num_tokens": 98659075.0, "reward": 3.159956216812134, "reward_std": 1.605148434638977, "rewards/accuracy_reward/mean": 2.78125, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.37870633602142334, "rewards/ngram_similarity_reward/std": 0.2827274203300476, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 496.875, "completions/mean_terminated_length": 496.875, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.276124412620273, "frac_reward_zero_std": 0.0, "grad_norm": 0.0545518733561039, "learning_rate": 4.906023818997937e-06, "loss": 0.0125, "num_tokens": 98814603.0, "reward": 2.415585517883301, "reward_std": 1.351930022239685, "rewards/accuracy_reward/mean": 1.71875, "rewards/accuracy_reward/std": 2.9572014808654785, "rewards/ngram_similarity_reward/mean": 0.6968356370925903, "rewards/ngram_similarity_reward/std": 0.3501358926296234, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 437.890625, "completions/mean_terminated_length": 437.890625, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.2765719400313269, "frac_reward_zero_std": 0.0, "grad_norm": 0.06158144772052765, "learning_rate": 4.905547124485579e-06, "loss": -0.0449, "num_tokens": 98954596.0, "reward": 3.255443572998047, "reward_std": 1.197216510772705, "rewards/accuracy_reward/mean": 2.75, "rewards/accuracy_reward/std": 3.0498504638671875, "rewards/ngram_similarity_reward/mean": 0.5054433345794678, "rewards/ngram_similarity_reward/std": 0.292623907327652, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 439.890625, "completions/mean_terminated_length": 439.890625, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.2770194674423809, "frac_reward_zero_std": 0.0, "grad_norm": 0.06922630220651627, "learning_rate": 4.905069249931756e-06, "loss": -0.0113, "num_tokens": 99170269.0, "reward": 1.4937655925750732, "reward_std": 0.44643130898475647, "rewards/accuracy_reward/mean": 0.90625, "rewards/accuracy_reward/std": 2.5617377758026123, "rewards/ngram_similarity_reward/mean": 0.5875155925750732, "rewards/ngram_similarity_reward/std": 0.38779351115226746, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 448.75, "completions/mean_terminated_length": 448.75, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.2774669948534348, "frac_reward_zero_std": 0.0, "grad_norm": 0.06195135787129402, "learning_rate": 4.904590195598079e-06, "loss": -0.0009, "num_tokens": 99338333.0, "reward": 3.647702693939209, "reward_std": 0.9113937616348267, "rewards/accuracy_reward/mean": 3.15625, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.4914528727531433, "rewards/ngram_similarity_reward/std": 0.2231704592704773, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 299.078125, "completions/mean_terminated_length": 299.078125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.2779145222644887, "frac_reward_zero_std": 0.0, "grad_norm": 0.08496079593896866, "learning_rate": 4.904109961746803e-06, "loss": 0.037, "num_tokens": 99479458.0, "reward": 3.8821115493774414, "reward_std": 1.3969141244888306, "rewards/accuracy_reward/mean": 3.34375, "rewards/accuracy_reward/std": 2.9016621112823486, "rewards/ngram_similarity_reward/mean": 0.5383617281913757, "rewards/ngram_similarity_reward/std": 0.41145893931388855, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 418.546875, "completions/mean_terminated_length": 418.546875, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.2783620496755426, "frac_reward_zero_std": 0.0, "grad_norm": 0.06467875093221664, "learning_rate": 4.9036285486408284e-06, "loss": 0.0095, "num_tokens": 99599173.0, "reward": 1.9830317497253418, "reward_std": 1.8674609661102295, "rewards/accuracy_reward/mean": 1.515625, "rewards/accuracy_reward/std": 2.914072036743164, "rewards/ngram_similarity_reward/mean": 0.467406690120697, "rewards/ngram_similarity_reward/std": 0.3296161890029907, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 455.375, "completions/mean_terminated_length": 455.375, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.27880957708659654, "frac_reward_zero_std": 0.0, "grad_norm": 0.06274480372667313, "learning_rate": 4.903145956543704e-06, "loss": 0.0127, "num_tokens": 99729165.0, "reward": 5.742074012756348, "reward_std": 0.8376166224479675, "rewards/accuracy_reward/mean": 5.21875, "rewards/accuracy_reward/std": 1.2782522439956665, "rewards/ngram_similarity_reward/mean": 0.5233241319656372, "rewards/ngram_similarity_reward/std": 0.32514575123786926, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 487.609375, "completions/mean_terminated_length": 487.609375, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.27925710449765045, "frac_reward_zero_std": 0.0, "grad_norm": 0.057502295821905136, "learning_rate": 4.90266218571962e-06, "loss": 0.0013, "num_tokens": 99848244.0, "reward": 2.3932907581329346, "reward_std": 0.9242256283760071, "rewards/accuracy_reward/mean": 1.90625, "rewards/accuracy_reward/std": 3.001157283782959, "rewards/ngram_similarity_reward/mean": 0.48704057931900024, "rewards/ngram_similarity_reward/std": 0.3349016606807709, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 324.171875, "completions/mean_terminated_length": 324.171875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.2797046319087044, "frac_reward_zero_std": 0.0, "grad_norm": 0.07760117202997208, "learning_rate": 4.902177236433414e-06, "loss": 0.0132, "num_tokens": 100005519.0, "reward": 4.642778396606445, "reward_std": 0.7077197432518005, "rewards/accuracy_reward/mean": 3.796875, "rewards/accuracy_reward/std": 2.746886730194092, "rewards/ngram_similarity_reward/mean": 0.8459034562110901, "rewards/ngram_similarity_reward/std": 0.35508018732070923, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 434.484375, "completions/mean_terminated_length": 434.484375, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.28015215931975834, "frac_reward_zero_std": 0.0, "grad_norm": 0.07231221348047256, "learning_rate": 4.9016911089505695e-06, "loss": 0.0261, "num_tokens": 100249454.0, "reward": 3.2338109016418457, "reward_std": 1.6458979845046997, "rewards/accuracy_reward/mean": 2.765625, "rewards/accuracy_reward/std": 3.0302298069000244, "rewards/ngram_similarity_reward/mean": 0.4681856632232666, "rewards/ngram_similarity_reward/std": 0.31056922674179077, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 449.890625, "completions/mean_terminated_length": 449.890625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.28059968673081226, "frac_reward_zero_std": 0.0, "grad_norm": 0.06774061173200607, "learning_rate": 4.901203803537214e-06, "loss": -0.0067, "num_tokens": 100376119.0, "reward": 5.257980823516846, "reward_std": 2.083458662033081, "rewards/accuracy_reward/mean": 4.453125, "rewards/accuracy_reward/std": 2.319206953048706, "rewards/ngram_similarity_reward/mean": 0.8048558235168457, "rewards/ngram_similarity_reward/std": 0.3509519100189209, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 895.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 534.546875, "completions/mean_terminated_length": 534.546875, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.2810472141418662, "frac_reward_zero_std": 0.0, "grad_norm": 0.04980470985174179, "learning_rate": 4.900715320460119e-06, "loss": 0.0007, "num_tokens": 100515658.0, "reward": 4.141946315765381, "reward_std": 0.7935642004013062, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.5169461965560913, "rewards/ngram_similarity_reward/std": 0.20286968350410461, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 561.375, "completions/mean_terminated_length": 561.375, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.2814947415529201, "frac_reward_zero_std": 0.0, "grad_norm": 0.04802021011710167, "learning_rate": 4.900225659986703e-06, "loss": -0.0061, "num_tokens": 100706834.0, "reward": 4.6559953689575195, "reward_std": 0.5603998303413391, "rewards/accuracy_reward/mean": 3.890625, "rewards/accuracy_reward/std": 2.6998953819274902, "rewards/ngram_similarity_reward/mean": 0.76537024974823, "rewards/ngram_similarity_reward/std": 0.1904088407754898, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 392.5, "completions/mean_terminated_length": 392.5, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.28194226896397406, "frac_reward_zero_std": 0.0, "grad_norm": 0.07349932938814163, "learning_rate": 4.899734822385027e-06, "loss": 0.0164, "num_tokens": 100879762.0, "reward": 1.2480851411819458, "reward_std": 0.28011858463287354, "rewards/accuracy_reward/mean": 0.9375, "rewards/accuracy_reward/std": 2.6659226417541504, "rewards/ngram_similarity_reward/mean": 0.31058529019355774, "rewards/ngram_similarity_reward/std": 0.20735912024974823, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 427.21875, "completions/mean_terminated_length": 427.21875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.282389796375028, "frac_reward_zero_std": 0.0, "grad_norm": 0.05627664178609848, "learning_rate": 4.8992428079237966e-06, "loss": -0.0289, "num_tokens": 101004768.0, "reward": 1.7765861749649048, "reward_std": 0.7219223976135254, "rewards/accuracy_reward/mean": 1.15625, "rewards/accuracy_reward/std": 2.7442219257354736, "rewards/ngram_similarity_reward/mean": 0.6203360557556152, "rewards/ngram_similarity_reward/std": 0.26624545454978943, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 517.953125, "completions/mean_terminated_length": 493.66668701171875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.2828373237860819, "frac_reward_zero_std": 0.0, "grad_norm": 0.06717228144407272, "learning_rate": 4.898749616872363e-06, "loss": 0.0045, "num_tokens": 101171965.0, "reward": 2.8893322944641113, "reward_std": 0.5401940941810608, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.48308250308036804, "rewards/ngram_similarity_reward/std": 0.3030366599559784, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 425.515625, "completions/mean_terminated_length": 425.515625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.2832848511971358, "frac_reward_zero_std": 0.0, "grad_norm": 0.06200220435857773, "learning_rate": 4.89825524950072e-06, "loss": -0.0197, "num_tokens": 101308942.0, "reward": 4.342251777648926, "reward_std": 0.7502060532569885, "rewards/accuracy_reward/mean": 3.78125, "rewards/accuracy_reward/std": 2.7744226455688477, "rewards/ngram_similarity_reward/mean": 0.5610020756721497, "rewards/ngram_similarity_reward/std": 0.3634032905101776, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 410.03125, "completions/mean_terminated_length": 410.03125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.28373237860818973, "frac_reward_zero_std": 0.25, "grad_norm": 0.05336322635412216, "learning_rate": 4.897759706079508e-06, "loss": -0.0188, "num_tokens": 101439872.0, "reward": 5.264481544494629, "reward_std": 1.371099591255188, "rewards/accuracy_reward/mean": 4.65625, "rewards/accuracy_reward/std": 2.102294683456421, "rewards/ngram_similarity_reward/mean": 0.6082311868667603, "rewards/ngram_similarity_reward/std": 0.4162740409374237, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 485.96875, "completions/mean_terminated_length": 485.96875, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.2841799060192437, "frac_reward_zero_std": 0.0, "grad_norm": 0.05253211408853531, "learning_rate": 4.897262986880006e-06, "loss": -0.0079, "num_tokens": 101593086.0, "reward": 3.2981011867523193, "reward_std": 0.8422079086303711, "rewards/accuracy_reward/mean": 2.671875, "rewards/accuracy_reward/std": 3.037097215652466, "rewards/ngram_similarity_reward/mean": 0.6262260675430298, "rewards/ngram_similarity_reward/std": 0.3407411277294159, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 505.625, "completions/mean_terminated_length": 505.625, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.2846274334302976, "frac_reward_zero_std": 0.0, "grad_norm": 0.0634000152349472, "learning_rate": 4.896765092174143e-06, "loss": 0.0095, "num_tokens": 101715574.0, "reward": 3.890976905822754, "reward_std": 0.9697810411453247, "rewards/accuracy_reward/mean": 3.5625, "rewards/accuracy_reward/std": 2.905249834060669, "rewards/ngram_similarity_reward/mean": 0.3284766972064972, "rewards/ngram_similarity_reward/std": 0.25927233695983887, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 366.921875, "completions/mean_terminated_length": 366.921875, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.28507496084135153, "frac_reward_zero_std": 0.0, "grad_norm": 0.06476201117038727, "learning_rate": 4.896266022234487e-06, "loss": 0.0642, "num_tokens": 101864257.0, "reward": 4.763962745666504, "reward_std": 0.28067347407341003, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7639628052711487, "rewards/ngram_similarity_reward/std": 0.3518589437007904, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 421.375, "completions/mean_terminated_length": 421.375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.28552248825240545, "frac_reward_zero_std": 0.0, "grad_norm": 0.06546300649642944, "learning_rate": 4.895765777334251e-06, "loss": 0.0724, "num_tokens": 101997993.0, "reward": 2.0880303382873535, "reward_std": 1.4095115661621094, "rewards/accuracy_reward/mean": 1.59375, "rewards/accuracy_reward/std": 2.958543062210083, "rewards/ngram_similarity_reward/mean": 0.49428027868270874, "rewards/ngram_similarity_reward/std": 0.2925429344177246, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 564.78125, "completions/mean_terminated_length": 564.78125, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.28597001566345936, "frac_reward_zero_std": 0.0, "grad_norm": 0.051474377512931824, "learning_rate": 4.895264357747292e-06, "loss": 0.0091, "num_tokens": 102156187.0, "reward": 5.279137134552002, "reward_std": 1.6453781127929688, "rewards/accuracy_reward/mean": 4.640625, "rewards/accuracy_reward/std": 2.1445181369781494, "rewards/ngram_similarity_reward/mean": 0.638512134552002, "rewards/ngram_similarity_reward/std": 0.18305997550487518, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 457.1875, "completions/mean_terminated_length": 457.1875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.28641754307451334, "frac_reward_zero_std": 0.0, "grad_norm": 0.06602423638105392, "learning_rate": 4.8947617637481076e-06, "loss": -0.0184, "num_tokens": 102295063.0, "reward": 2.9706907272338867, "reward_std": 0.5678014755249023, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.37694060802459717, "rewards/ngram_similarity_reward/std": 0.32089659571647644, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 513.359375, "completions/mean_terminated_length": 489.0000305175781, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.28686507048556725, "frac_reward_zero_std": 0.0, "grad_norm": 0.057368215173482895, "learning_rate": 4.894257995611841e-06, "loss": -0.0077, "num_tokens": 102435582.0, "reward": 5.137367248535156, "reward_std": 1.5684716701507568, "rewards/accuracy_reward/mean": 4.546875, "rewards/accuracy_reward/std": 2.2355687618255615, "rewards/ngram_similarity_reward/mean": 0.590491771697998, "rewards/ngram_similarity_reward/std": 0.2243080735206604, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 541.828125, "completions/mean_terminated_length": 541.828125, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.28731259789662117, "frac_reward_zero_std": 0.0, "grad_norm": 0.05485881119966507, "learning_rate": 4.893753053614277e-06, "loss": -0.01, "num_tokens": 102592355.0, "reward": 3.4653518199920654, "reward_std": 0.8715257048606873, "rewards/accuracy_reward/mean": 2.84375, "rewards/accuracy_reward/std": 3.0405657291412354, "rewards/ngram_similarity_reward/mean": 0.6216020584106445, "rewards/ngram_similarity_reward/std": 0.2996468245983124, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 478.953125, "completions/mean_terminated_length": 478.953125, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.2877601253076751, "frac_reward_zero_std": 0.0, "grad_norm": 0.0552397146821022, "learning_rate": 4.893246938031842e-06, "loss": -0.0002, "num_tokens": 102731856.0, "reward": 2.9069056510925293, "reward_std": 0.688241720199585, "rewards/accuracy_reward/mean": 2.296875, "rewards/accuracy_reward/std": 3.0351366996765137, "rewards/ngram_similarity_reward/mean": 0.6100307106971741, "rewards/ngram_similarity_reward/std": 0.379904180765152, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 403.171875, "completions/mean_terminated_length": 403.171875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.288207652718729, "frac_reward_zero_std": 0.0, "grad_norm": 0.07412349432706833, "learning_rate": 4.8927396491416086e-06, "loss": 0.0488, "num_tokens": 102911979.0, "reward": 3.819849967956543, "reward_std": 1.0436134338378906, "rewards/accuracy_reward/mean": 3.078125, "rewards/accuracy_reward/std": 3.0592284202575684, "rewards/ngram_similarity_reward/mean": 0.7417250871658325, "rewards/ngram_similarity_reward/std": 0.3448202311992645, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 911.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 472.609375, "completions/mean_terminated_length": 472.609375, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.288655180129783, "frac_reward_zero_std": 0.0, "grad_norm": 0.06565144658088684, "learning_rate": 4.892231187221287e-06, "loss": 0.0324, "num_tokens": 103100290.0, "reward": 3.119292736053467, "reward_std": 1.507504940032959, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.5255429148674011, "rewards/ngram_similarity_reward/std": 0.14479827880859375, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 461.78125, "completions/mean_terminated_length": 461.78125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.2891027075408369, "frac_reward_zero_std": 0.0, "grad_norm": 0.05364995822310448, "learning_rate": 4.891721552549231e-06, "loss": -0.015, "num_tokens": 103222788.0, "reward": 4.745234966278076, "reward_std": 0.746932864189148, "rewards/accuracy_reward/mean": 4.1875, "rewards/accuracy_reward/std": 2.5, "rewards/ngram_similarity_reward/mean": 0.5577349662780762, "rewards/ngram_similarity_reward/std": 0.29511386156082153, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 863.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 414.734375, "completions/mean_terminated_length": 414.734375, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.2895502349518908, "frac_reward_zero_std": 0.0, "grad_norm": 0.0833452045917511, "learning_rate": 4.891210745404438e-06, "loss": -0.0237, "num_tokens": 103374115.0, "reward": 1.926346778869629, "reward_std": 2.1057116985321045, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 2.967708110809326, "rewards/ngram_similarity_reward/mean": 0.473222017288208, "rewards/ngram_similarity_reward/std": 0.4059741199016571, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 394.359375, "completions/mean_terminated_length": 394.359375, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.2899977623629447, "frac_reward_zero_std": 0.0, "grad_norm": 0.07352577894926071, "learning_rate": 4.8906987660665476e-06, "loss": -0.0028, "num_tokens": 103561898.0, "reward": 4.475122451782227, "reward_std": 0.20881153643131256, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.4751223921775818, "rewards/ngram_similarity_reward/std": 0.2813428044319153, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 405.03125, "completions/mean_terminated_length": 405.03125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.29044528977399864, "frac_reward_zero_std": 0.0, "grad_norm": 0.09115469455718994, "learning_rate": 4.8901856148158375e-06, "loss": 0.0315, "num_tokens": 103780764.0, "reward": 2.3142309188842773, "reward_std": 1.3402717113494873, "rewards/accuracy_reward/mean": 1.875, "rewards/accuracy_reward/std": 3.0315799713134766, "rewards/ngram_similarity_reward/mean": 0.43923094868659973, "rewards/ngram_similarity_reward/std": 0.20881466567516327, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 786.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 523.4375, "completions/mean_terminated_length": 523.4375, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.2908928171850526, "frac_reward_zero_std": 0.0, "grad_norm": 0.05662143975496292, "learning_rate": 4.889671291933231e-06, "loss": -0.0036, "num_tokens": 103930728.0, "reward": 1.6569280624389648, "reward_std": 0.4532318115234375, "rewards/accuracy_reward/mean": 1.09375, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.5631780028343201, "rewards/ngram_similarity_reward/std": 0.4234296381473541, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 475.84375, "completions/mean_terminated_length": 475.84375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.2913403445961065, "frac_reward_zero_std": 0.0, "grad_norm": 0.06426508724689484, "learning_rate": 4.8891557977002915e-06, "loss": 0.009, "num_tokens": 104147918.0, "reward": 4.594308853149414, "reward_std": 0.5861914753913879, "rewards/accuracy_reward/mean": 3.890625, "rewards/accuracy_reward/std": 2.6998953819274902, "rewards/ngram_similarity_reward/mean": 0.7036839723587036, "rewards/ngram_similarity_reward/std": 0.24967730045318604, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 462.875, "completions/mean_terminated_length": 462.875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.29178787200716044, "frac_reward_zero_std": 0.0, "grad_norm": 0.05419683828949928, "learning_rate": 4.888639132399221e-06, "loss": 0.0002, "num_tokens": 104316102.0, "reward": 3.444973945617676, "reward_std": 1.0651003122329712, "rewards/accuracy_reward/mean": 2.875, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.5699740648269653, "rewards/ngram_similarity_reward/std": 0.20455977320671082, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 559.9375, "completions/mean_terminated_length": 559.9375, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.29223539941821436, "frac_reward_zero_std": 0.0, "grad_norm": 0.058351751416921616, "learning_rate": 4.888121296312867e-06, "loss": 0.0426, "num_tokens": 104454386.0, "reward": 6.093581199645996, "reward_std": 0.6258885860443115, "rewards/accuracy_reward/mean": 5.390625, "rewards/accuracy_reward/std": 0.8750000596046448, "rewards/ngram_similarity_reward/mean": 0.7029563784599304, "rewards/ngram_similarity_reward/std": 0.31050267815589905, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 456.09375, "completions/mean_terminated_length": 456.09375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.2926829268292683, "frac_reward_zero_std": 0.0, "grad_norm": 0.06823959946632385, "learning_rate": 4.887602289724715e-06, "loss": 0.0201, "num_tokens": 104597688.0, "reward": 4.1921162605285645, "reward_std": 0.9890491962432861, "rewards/accuracy_reward/mean": 3.546875, "rewards/accuracy_reward/std": 2.9300289154052734, "rewards/ngram_similarity_reward/mean": 0.6452413201332092, "rewards/ngram_similarity_reward/std": 0.35380610823631287, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 496.9375, "completions/mean_terminated_length": 496.9375, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.29313045424032225, "frac_reward_zero_std": 0.0, "grad_norm": 0.05976645275950432, "learning_rate": 4.8870821129188915e-06, "loss": 0.0346, "num_tokens": 104694052.0, "reward": 2.5127129554748535, "reward_std": 1.7011172771453857, "rewards/accuracy_reward/mean": 2.09375, "rewards/accuracy_reward/std": 3.0327250957489014, "rewards/ngram_similarity_reward/mean": 0.41896289587020874, "rewards/ngram_similarity_reward/std": 0.30265194177627563, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 752.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 470.859375, "completions/mean_terminated_length": 470.859375, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.29357798165137616, "frac_reward_zero_std": 0.0, "grad_norm": 0.05093248561024666, "learning_rate": 4.886560766180165e-06, "loss": -0.012, "num_tokens": 104829035.0, "reward": 3.722310781478882, "reward_std": 1.4227113723754883, "rewards/accuracy_reward/mean": 3.25, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.4723109006881714, "rewards/ngram_similarity_reward/std": 0.33681726455688477, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 438.515625, "completions/mean_terminated_length": 438.515625, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.2940255090624301, "frac_reward_zero_std": 0.0, "grad_norm": 0.06808004528284073, "learning_rate": 4.886038249793943e-06, "loss": -0.0052, "num_tokens": 104990956.0, "reward": 3.241586685180664, "reward_std": 0.6259248852729797, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.6478367447853088, "rewards/ngram_similarity_reward/std": 0.3466797471046448, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 494.890625, "completions/mean_terminated_length": 494.890625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.294473036473484, "frac_reward_zero_std": 0.0, "grad_norm": 0.08694926649332047, "learning_rate": 4.885514564046276e-06, "loss": -0.0148, "num_tokens": 105229989.0, "reward": 4.648185729980469, "reward_std": 0.17134308815002441, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6481857895851135, "rewards/ngram_similarity_reward/std": 0.34424903988838196, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 460.671875, "completions/mean_terminated_length": 460.671875, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.2949205638845379, "frac_reward_zero_std": 0.0, "grad_norm": 0.06605000793933868, "learning_rate": 4.884989709223849e-06, "loss": 0.0273, "num_tokens": 105362992.0, "reward": 4.155838966369629, "reward_std": 0.8568543195724487, "rewards/accuracy_reward/mean": 3.53125, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.6245891451835632, "rewards/ngram_similarity_reward/std": 0.35362404584884644, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 337.890625, "completions/mean_terminated_length": 337.890625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.29536809129559183, "frac_reward_zero_std": 0.0, "grad_norm": 0.07516282796859741, "learning_rate": 4.8844636856139946e-06, "loss": -0.0098, "num_tokens": 105524457.0, "reward": 4.741536617279053, "reward_std": 1.5814883708953857, "rewards/accuracy_reward/mean": 4.375, "rewards/accuracy_reward/std": 2.3603873252868652, "rewards/ngram_similarity_reward/mean": 0.36653655767440796, "rewards/ngram_similarity_reward/std": 0.26399847865104675, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 344.59375, "completions/mean_terminated_length": 344.59375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.2958156187066458, "frac_reward_zero_std": 0.0, "grad_norm": 0.07712395489215851, "learning_rate": 4.883936493504678e-06, "loss": 0.0029, "num_tokens": 105719711.0, "reward": 4.29510498046875, "reward_std": 0.5769654512405396, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.48260533809661865, "rewards/ngram_similarity_reward/std": 0.33051398396492004, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 734.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 455.828125, "completions/mean_terminated_length": 455.828125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.2962631461176997, "frac_reward_zero_std": 0.0, "grad_norm": 0.052316777408123016, "learning_rate": 4.8834081331845095e-06, "loss": 0.0097, "num_tokens": 105881860.0, "reward": 4.3702569007873535, "reward_std": 0.45644718408584595, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.46400687098503113, "rewards/ngram_similarity_reward/std": 0.23395119607448578, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 468.65625, "completions/mean_terminated_length": 468.65625, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.29671067352875363, "frac_reward_zero_std": 0.0, "grad_norm": 0.0560014508664608, "learning_rate": 4.882878604942737e-06, "loss": -0.0453, "num_tokens": 106034334.0, "reward": 1.8861474990844727, "reward_std": 0.8671086430549622, "rewards/accuracy_reward/mean": 1.46875, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.4173974394798279, "rewards/ngram_similarity_reward/std": 0.3207104206085205, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 393.03125, "completions/mean_terminated_length": 393.03125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.29715820093980755, "frac_reward_zero_std": 0.0, "grad_norm": 0.06537173688411713, "learning_rate": 4.882347909069246e-06, "loss": -0.0027, "num_tokens": 106171744.0, "reward": 2.1393954753875732, "reward_std": 2.222316026687622, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 3.077979803085327, "rewards/ngram_similarity_reward/mean": 0.686270534992218, "rewards/ngram_similarity_reward/std": 0.24120375514030457, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 558.328125, "completions/mean_terminated_length": 558.328125, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.29760572835086146, "frac_reward_zero_std": 0.0, "grad_norm": 0.04718297719955444, "learning_rate": 4.881816045854562e-06, "loss": -0.0246, "num_tokens": 106351957.0, "reward": 3.061079502105713, "reward_std": 0.16121825575828552, "rewards/accuracy_reward/mean": 2.46875, "rewards/accuracy_reward/std": 3.06007981300354, "rewards/ngram_similarity_reward/mean": 0.5923295021057129, "rewards/ngram_similarity_reward/std": 0.2995980381965637, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 476.953125, "completions/mean_terminated_length": 476.953125, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.29805325576191544, "frac_reward_zero_std": 0.0, "grad_norm": 0.059488993138074875, "learning_rate": 4.8812830155898535e-06, "loss": 0.0113, "num_tokens": 106528754.0, "reward": 2.6967692375183105, "reward_std": 0.6507419347763062, "rewards/accuracy_reward/mean": 2.296875, "rewards/accuracy_reward/std": 3.0351366996765137, "rewards/ngram_similarity_reward/mean": 0.3998942971229553, "rewards/ngram_similarity_reward/std": 0.33484646677970886, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 712.0, "completions/max_terminated_length": 712.0, "completions/mean_length": 443.703125, "completions/mean_terminated_length": 443.703125, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.29850078317296935, "frac_reward_zero_std": 0.0, "grad_norm": 0.07318499684333801, "learning_rate": 4.880748818566923e-06, "loss": 0.0234, "num_tokens": 106660447.0, "reward": 2.829117774963379, "reward_std": 2.2205731868743896, "rewards/accuracy_reward/mean": 2.3125, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.5166178345680237, "rewards/ngram_similarity_reward/std": 0.2861684262752533, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 752.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 496.0625, "completions/mean_terminated_length": 496.0625, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.29894831058402327, "frac_reward_zero_std": 0.0, "grad_norm": 0.05720727518200874, "learning_rate": 4.880213455078214e-06, "loss": 0.0109, "num_tokens": 106802035.0, "reward": 4.265432834625244, "reward_std": 0.7786337733268738, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.881075382232666, "rewards/ngram_similarity_reward/mean": 0.5466828346252441, "rewards/ngram_similarity_reward/std": 0.2712513506412506, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 397.203125, "completions/mean_terminated_length": 397.203125, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.2993958379950772, "frac_reward_zero_std": 0.0, "grad_norm": 0.06245892122387886, "learning_rate": 4.879676925416806e-06, "loss": -0.0206, "num_tokens": 106967792.0, "reward": 4.526856422424316, "reward_std": 0.13491158187389374, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5268564224243164, "rewards/ngram_similarity_reward/std": 0.37578147649765015, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 457.390625, "completions/mean_terminated_length": 457.390625, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.2998433654061311, "frac_reward_zero_std": 0.0, "grad_norm": 0.06878712773323059, "learning_rate": 4.879139229876422e-06, "loss": 0.041, "num_tokens": 107112089.0, "reward": 2.2240772247314453, "reward_std": 1.5732771158218384, "rewards/accuracy_reward/mean": 1.625, "rewards/accuracy_reward/std": 2.9304099082946777, "rewards/ngram_similarity_reward/mean": 0.5990773439407349, "rewards/ngram_similarity_reward/std": 0.33353397250175476, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 504.71875, "completions/mean_terminated_length": 504.71875, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.3002908928171851, "frac_reward_zero_std": 0.0, "grad_norm": 0.056628819555044174, "learning_rate": 4.878600368751419e-06, "loss": 0.0248, "num_tokens": 107290327.0, "reward": 5.932975769042969, "reward_std": 0.2094021886587143, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.4329761564731598, "rewards/ngram_similarity_reward/std": 0.28314828872680664, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 447.234375, "completions/mean_terminated_length": 447.234375, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.300738420228239, "frac_reward_zero_std": 0.0, "grad_norm": 0.0657930076122284, "learning_rate": 4.8780603423367924e-06, "loss": 0.0008, "num_tokens": 107435574.0, "reward": 1.5740094184875488, "reward_std": 0.15247586369514465, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5740094184875488, "rewards/ngram_similarity_reward/std": 0.3831358850002289, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 341.15625, "completions/mean_terminated_length": 341.15625, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.3011859476392929, "frac_reward_zero_std": 0.0, "grad_norm": 0.08099042624235153, "learning_rate": 4.877519150928178e-06, "loss": -0.0101, "num_tokens": 107615760.0, "reward": 3.729093551635742, "reward_std": 1.170121431350708, "rewards/accuracy_reward/mean": 2.96875, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.7603434324264526, "rewards/ngram_similarity_reward/std": 0.31671011447906494, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 452.953125, "completions/mean_terminated_length": 452.953125, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.3016334750503468, "frac_reward_zero_std": 0.0, "grad_norm": 0.058367665857076645, "learning_rate": 4.876976794821847e-06, "loss": -0.0165, "num_tokens": 107815037.0, "reward": 3.6228060722351074, "reward_std": 0.8924241065979004, "rewards/accuracy_reward/mean": 3.15625, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.4665558934211731, "rewards/ngram_similarity_reward/std": 0.23758745193481445, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 434.859375, "completions/mean_terminated_length": 434.859375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.30208100246140074, "frac_reward_zero_std": 0.0, "grad_norm": 0.07167170941829681, "learning_rate": 4.876433274314709e-06, "loss": 0.0331, "num_tokens": 107947844.0, "reward": 4.721240997314453, "reward_std": 0.17073199152946472, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.721240758895874, "rewards/ngram_similarity_reward/std": 0.2963772416114807, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 391.765625, "completions/mean_terminated_length": 391.765625, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.3025285298724547, "frac_reward_zero_std": 0.25, "grad_norm": 0.05580935254693031, "learning_rate": 4.8758885897043115e-06, "loss": 0.0061, "num_tokens": 108083285.0, "reward": 5.102502822875977, "reward_std": 0.8618116974830627, "rewards/accuracy_reward/mean": 4.46875, "rewards/accuracy_reward/std": 2.2815253734588623, "rewards/ngram_similarity_reward/mean": 0.6337529420852661, "rewards/ngram_similarity_reward/std": 0.40071678161621094, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 470.109375, "completions/mean_terminated_length": 470.109375, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.3029760572835086, "frac_reward_zero_std": 0.0, "grad_norm": 0.050674766302108765, "learning_rate": 4.875342741288838e-06, "loss": -0.0062, "num_tokens": 108203852.0, "reward": 4.888950824737549, "reward_std": 0.1928871124982834, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.8889507055282593, "rewards/ngram_similarity_reward/std": 0.2250843644142151, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 453.796875, "completions/mean_terminated_length": 453.796875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.30342358469456254, "frac_reward_zero_std": 0.0, "grad_norm": 0.0681556984782219, "learning_rate": 4.87479572936711e-06, "loss": -0.0392, "num_tokens": 108421407.0, "reward": 2.8512043952941895, "reward_std": 0.8869644403457642, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.3512043356895447, "rewards/ngram_similarity_reward/std": 0.22943827509880066, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 411.15625, "completions/mean_terminated_length": 411.15625, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.30387111210561646, "frac_reward_zero_std": 0.0, "grad_norm": 0.06231510639190674, "learning_rate": 4.874247554238587e-06, "loss": 0.0187, "num_tokens": 108548841.0, "reward": 2.546853542327881, "reward_std": 0.9314303398132324, "rewards/accuracy_reward/mean": 2.015625, "rewards/accuracy_reward/std": 3.00260329246521, "rewards/ngram_similarity_reward/mean": 0.5312284231185913, "rewards/ngram_similarity_reward/std": 0.36710941791534424, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 373.359375, "completions/mean_terminated_length": 373.359375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.3043186395166704, "frac_reward_zero_std": 0.0, "grad_norm": 0.0652371495962143, "learning_rate": 4.873698216203364e-06, "loss": 0.0112, "num_tokens": 108667216.0, "reward": 6.076634407043457, "reward_std": 0.2062225341796875, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.5766341686248779, "rewards/ngram_similarity_reward/std": 0.3065849840641022, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 376.90625, "completions/mean_terminated_length": 376.90625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.30476616692772435, "frac_reward_zero_std": 0.0, "grad_norm": 0.07112135738134384, "learning_rate": 4.873147715562173e-06, "loss": 0.0233, "num_tokens": 108830522.0, "reward": 5.461245536804199, "reward_std": 0.980175256729126, "rewards/accuracy_reward/mean": 4.75, "rewards/accuracy_reward/std": 2.0, "rewards/ngram_similarity_reward/mean": 0.7112454175949097, "rewards/ngram_similarity_reward/std": 0.39009037613868713, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 384.453125, "completions/mean_terminated_length": 384.453125, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.30521369433877826, "frac_reward_zero_std": 0.0, "grad_norm": 0.07231220602989197, "learning_rate": 4.872596052616381e-06, "loss": 0.0189, "num_tokens": 108970679.0, "reward": 5.196628093719482, "reward_std": 1.5067083835601807, "rewards/accuracy_reward/mean": 4.5625, "rewards/accuracy_reward/std": 2.195775270462036, "rewards/ngram_similarity_reward/mean": 0.6341278553009033, "rewards/ngram_similarity_reward/std": 0.29436030983924866, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 326.125, "completions/mean_terminated_length": 326.125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.3056612217498322, "frac_reward_zero_std": 0.0, "grad_norm": 0.08024755865335464, "learning_rate": 4.872043227667993e-06, "loss": 0.0353, "num_tokens": 109087855.0, "reward": 3.7122302055358887, "reward_std": 1.7788197994232178, "rewards/accuracy_reward/mean": 3.15625, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.5559800863265991, "rewards/ngram_similarity_reward/std": 0.33515578508377075, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 430.1875, "completions/mean_terminated_length": 430.1875, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.3061087491608861, "frac_reward_zero_std": 0.0, "grad_norm": 0.07237806171178818, "learning_rate": 4.8714892410196504e-06, "loss": 0.0074, "num_tokens": 109239659.0, "reward": 0.10017599165439606, "reward_std": 0.5458903312683105, "rewards/accuracy_reward/mean": -0.40625, "rewards/accuracy_reward/std": 0.7500000596046448, "rewards/ngram_similarity_reward/mean": 0.5064259767532349, "rewards/ngram_similarity_reward/std": 0.26822736859321594, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 423.109375, "completions/mean_terminated_length": 423.109375, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.30655627657194, "frac_reward_zero_std": 0.0, "grad_norm": 0.07401303946971893, "learning_rate": 4.87093409297463e-06, "loss": -0.0152, "num_tokens": 109460226.0, "reward": 2.838829755783081, "reward_std": 0.14354142546653748, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.33882978558540344, "rewards/ngram_similarity_reward/std": 0.25283560156822205, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 423.21875, "completions/mean_terminated_length": 423.21875, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.307003803982994, "frac_reward_zero_std": 0.0, "grad_norm": 0.0632106363773346, "learning_rate": 4.8703777838368435e-06, "loss": -0.0158, "num_tokens": 109596736.0, "reward": 2.740299701690674, "reward_std": 1.2174031734466553, "rewards/accuracy_reward/mean": 2.0625, "rewards/accuracy_reward/std": 3.06477689743042, "rewards/ngram_similarity_reward/mean": 0.6777995824813843, "rewards/ngram_similarity_reward/std": 0.24209056794643402, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 457.765625, "completions/mean_terminated_length": 457.765625, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.3074513313940479, "frac_reward_zero_std": 0.0, "grad_norm": 0.053076568990945816, "learning_rate": 4.869820313910839e-06, "loss": 0.017, "num_tokens": 109724545.0, "reward": 2.3994107246398926, "reward_std": 0.8497989177703857, "rewards/accuracy_reward/mean": 2.0, "rewards/accuracy_reward/std": 2.7602622509002686, "rewards/ngram_similarity_reward/mean": 0.39941078424453735, "rewards/ngram_similarity_reward/std": 0.2008398175239563, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 458.5625, "completions/mean_terminated_length": 458.5625, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.3078988588051018, "frac_reward_zero_std": 0.0, "grad_norm": 0.06561536341905594, "learning_rate": 4.869261683501801e-06, "loss": -0.0267, "num_tokens": 109916965.0, "reward": 2.2723612785339355, "reward_std": 0.9010312557220459, "rewards/accuracy_reward/mean": 1.828125, "rewards/accuracy_reward/std": 2.9657018184661865, "rewards/ngram_similarity_reward/mean": 0.4442363679409027, "rewards/ngram_similarity_reward/std": 0.2966659963130951, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 465.296875, "completions/mean_terminated_length": 465.296875, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.30834638621615573, "frac_reward_zero_std": 0.0, "grad_norm": 0.0637257769703865, "learning_rate": 4.868701892915549e-06, "loss": -0.0295, "num_tokens": 110058120.0, "reward": 4.125423431396484, "reward_std": 2.302952289581299, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.500423789024353, "rewards/ngram_similarity_reward/std": 0.321915864944458, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 394.15625, "completions/mean_terminated_length": 394.15625, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.30879391362720965, "frac_reward_zero_std": 0.0, "grad_norm": 0.08084577322006226, "learning_rate": 4.868140942458535e-06, "loss": -0.0049, "num_tokens": 110201810.0, "reward": 4.3223981857299805, "reward_std": 0.7608660459518433, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.6036486625671387, "rewards/ngram_similarity_reward/std": 0.3710053563117981, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 403.5, "completions/mean_terminated_length": 403.5, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.3092414410382636, "frac_reward_zero_std": 0.0, "grad_norm": 0.07621369510889053, "learning_rate": 4.867578832437849e-06, "loss": -0.0119, "num_tokens": 110356402.0, "reward": 5.923033714294434, "reward_std": 0.14946463704109192, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.4230341911315918, "rewards/ngram_similarity_reward/std": 0.3749995529651642, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 318.03125, "completions/mean_terminated_length": 318.03125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.30968896844931754, "frac_reward_zero_std": 0.0, "grad_norm": 0.07899641990661621, "learning_rate": 4.867015563161216e-06, "loss": 0.0141, "num_tokens": 110489444.0, "reward": 5.396566390991211, "reward_std": 0.969638466835022, "rewards/accuracy_reward/mean": 4.75, "rewards/accuracy_reward/std": 2.0, "rewards/ngram_similarity_reward/mean": 0.6465665102005005, "rewards/ngram_similarity_reward/std": 0.3430982530117035, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 544.75, "completions/mean_terminated_length": 544.75, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.31013649586037145, "frac_reward_zero_std": 0.0, "grad_norm": 0.04846769943833351, "learning_rate": 4.866451134936991e-06, "loss": 0.009, "num_tokens": 110659156.0, "reward": 6.0783796310424805, "reward_std": 0.5822182297706604, "rewards/accuracy_reward/mean": 5.40625, "rewards/accuracy_reward/std": 0.7500000596046448, "rewards/ngram_similarity_reward/mean": 0.6721292734146118, "rewards/ngram_similarity_reward/std": 0.34718698263168335, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 472.421875, "completions/mean_terminated_length": 472.421875, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.31058402327142537, "frac_reward_zero_std": 0.0, "grad_norm": 0.0697740688920021, "learning_rate": 4.86588554807417e-06, "loss": 0.0322, "num_tokens": 110820767.0, "reward": 4.631163597106934, "reward_std": 0.09964464604854584, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6311637163162231, "rewards/ngram_similarity_reward/std": 0.3299922049045563, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 388.390625, "completions/mean_terminated_length": 388.390625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.3110315506824793, "frac_reward_zero_std": 0.0, "grad_norm": 0.08026900142431259, "learning_rate": 4.86531880288238e-06, "loss": 0.0145, "num_tokens": 110958792.0, "reward": 5.962247848510742, "reward_std": 0.5507302284240723, "rewards/accuracy_reward/mean": 5.390625, "rewards/accuracy_reward/std": 0.8750000596046448, "rewards/ngram_similarity_reward/mean": 0.5716227889060974, "rewards/ngram_similarity_reward/std": 0.38306570053100586, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 454.125, "completions/mean_terminated_length": 454.125, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.3114790780935332, "frac_reward_zero_std": 0.0, "grad_norm": 0.05776282772421837, "learning_rate": 4.86475089967188e-06, "loss": -0.014, "num_tokens": 111126784.0, "reward": 0.5443506836891174, "reward_std": 1.210315227508545, "rewards/accuracy_reward/mean": -0.03125, "rewards/accuracy_reward/std": 1.6229382753372192, "rewards/ngram_similarity_reward/mean": 0.5756007432937622, "rewards/ngram_similarity_reward/std": 0.27265891432762146, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 382.734375, "completions/mean_terminated_length": 382.734375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.3119266055045872, "frac_reward_zero_std": 0.0, "grad_norm": 0.06984729319810867, "learning_rate": 4.8641818387535674e-06, "loss": -0.0328, "num_tokens": 111280703.0, "reward": 2.9061031341552734, "reward_std": 1.9089854955673218, "rewards/accuracy_reward/mean": 2.125, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.7811028957366943, "rewards/ngram_similarity_reward/std": 0.2761169373989105, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 439.390625, "completions/mean_terminated_length": 439.390625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.3123741329156411, "frac_reward_zero_std": 0.0, "grad_norm": 0.07779427617788315, "learning_rate": 4.863611620438971e-06, "loss": 0.0051, "num_tokens": 111489048.0, "reward": 4.679882526397705, "reward_std": 1.6266264915466309, "rewards/accuracy_reward/mean": 4.1875, "rewards/accuracy_reward/std": 2.5, "rewards/ngram_similarity_reward/mean": 0.4923825263977051, "rewards/ngram_similarity_reward/std": 0.29693710803985596, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 372.234375, "completions/mean_terminated_length": 372.234375, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.312821660326695, "frac_reward_zero_std": 0.0, "grad_norm": 0.07299550622701645, "learning_rate": 4.863040245040253e-06, "loss": -0.0002, "num_tokens": 111636247.0, "reward": 3.861237049102783, "reward_std": 1.870938777923584, "rewards/accuracy_reward/mean": 3.53125, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.3299867808818817, "rewards/ngram_similarity_reward/std": 0.2714199423789978, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 472.578125, "completions/mean_terminated_length": 472.578125, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.3132691877377489, "frac_reward_zero_std": 0.0, "grad_norm": 0.0719018429517746, "learning_rate": 4.862467712870209e-06, "loss": 0.031, "num_tokens": 111793612.0, "reward": 4.607283592224121, "reward_std": 0.5640270113945007, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.7010337114334106, "rewards/ngram_similarity_reward/std": 0.3264153301715851, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 398.765625, "completions/mean_terminated_length": 398.765625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.31371671514880284, "frac_reward_zero_std": 0.0, "grad_norm": 0.05898962542414665, "learning_rate": 4.861894024242269e-06, "loss": -0.008, "num_tokens": 111933629.0, "reward": 5.79965877532959, "reward_std": 1.086059808731079, "rewards/accuracy_reward/mean": 5.125, "rewards/accuracy_reward/std": 1.4638501405715942, "rewards/ngram_similarity_reward/mean": 0.6746585369110107, "rewards/ngram_similarity_reward/std": 0.4353345036506653, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 423.421875, "completions/mean_terminated_length": 423.421875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.3141642425598568, "frac_reward_zero_std": 0.0, "grad_norm": 0.0693143978714943, "learning_rate": 4.861319179470495e-06, "loss": 0.0245, "num_tokens": 112075736.0, "reward": 2.0054614543914795, "reward_std": 1.171264410018921, "rewards/accuracy_reward/mean": 1.46875, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.5367113351821899, "rewards/ngram_similarity_reward/std": 0.327590674161911, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 464.71875, "completions/mean_terminated_length": 464.71875, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.3146117699709107, "frac_reward_zero_std": 0.0, "grad_norm": 0.05373014882206917, "learning_rate": 4.860743178869583e-06, "loss": 0.0038, "num_tokens": 112207318.0, "reward": 3.5712907314300537, "reward_std": 2.0306131839752197, "rewards/accuracy_reward/mean": 2.96875, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.6025407910346985, "rewards/ngram_similarity_reward/std": 0.28013697266578674, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 401.984375, "completions/mean_terminated_length": 401.984375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.31505929738196464, "frac_reward_zero_std": 0.0, "grad_norm": 0.064559206366539, "learning_rate": 4.86016602275486e-06, "loss": -0.0199, "num_tokens": 112322181.0, "reward": 2.7623391151428223, "reward_std": 1.1833332777023315, "rewards/accuracy_reward/mean": 2.203125, "rewards/accuracy_reward/std": 3.0272817611694336, "rewards/ngram_similarity_reward/mean": 0.5592142343521118, "rewards/ngram_similarity_reward/std": 0.19574899971485138, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 459.875, "completions/mean_terminated_length": 459.875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.31550682479301856, "frac_reward_zero_std": 0.0, "grad_norm": 0.0718189924955368, "learning_rate": 4.8595877114422884e-06, "loss": 0.0079, "num_tokens": 112496957.0, "reward": 3.048264980316162, "reward_std": 0.13818775117397308, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.5482649803161621, "rewards/ngram_similarity_reward/std": 0.24722221493721008, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 414.65625, "completions/mean_terminated_length": 414.65625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.3159543522040725, "frac_reward_zero_std": 0.0, "grad_norm": 0.06648236513137817, "learning_rate": 4.85900824524846e-06, "loss": 0.023, "num_tokens": 112662759.0, "reward": 4.96194314956665, "reward_std": 1.1429578065872192, "rewards/accuracy_reward/mean": 4.265625, "rewards/accuracy_reward/std": 2.467195510864258, "rewards/ngram_similarity_reward/mean": 0.6963184475898743, "rewards/ngram_similarity_reward/std": 0.315858393907547, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 402.984375, "completions/mean_terminated_length": 402.984375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.31640187961512645, "frac_reward_zero_std": 0.0, "grad_norm": 0.070754274725914, "learning_rate": 4.8584276244906e-06, "loss": 0.0114, "num_tokens": 112782006.0, "reward": 5.393426895141602, "reward_std": 1.2861722707748413, "rewards/accuracy_reward/mean": 4.84375, "rewards/accuracy_reward/std": 1.8874586820602417, "rewards/ngram_similarity_reward/mean": 0.549676775932312, "rewards/ngram_similarity_reward/std": 0.3782258629798889, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 355.546875, "completions/mean_terminated_length": 355.546875, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.31684940702618036, "frac_reward_zero_std": 0.0, "grad_norm": 0.0816711038351059, "learning_rate": 4.857845849486566e-06, "loss": -0.02, "num_tokens": 112916265.0, "reward": 3.490420341491699, "reward_std": 0.8595148324966431, "rewards/accuracy_reward/mean": 3.0625, "rewards/accuracy_reward/std": 2.9700891971588135, "rewards/ngram_similarity_reward/mean": 0.4279203712940216, "rewards/ngram_similarity_reward/std": 0.2959030866622925, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 847.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 481.234375, "completions/mean_terminated_length": 481.234375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.3172969344372343, "frac_reward_zero_std": 0.0, "grad_norm": 0.06152066960930824, "learning_rate": 4.857262920554848e-06, "loss": 0.0112, "num_tokens": 113025592.0, "reward": 5.843417167663574, "reward_std": 0.7281556725502014, "rewards/accuracy_reward/mean": 5.21875, "rewards/accuracy_reward/std": 1.2782522439956665, "rewards/ngram_similarity_reward/mean": 0.6246671676635742, "rewards/ngram_similarity_reward/std": 0.2800602614879608, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 367.125, "completions/mean_terminated_length": 367.125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.3177444618482882, "frac_reward_zero_std": 0.0, "grad_norm": 0.08010809123516083, "learning_rate": 4.8566788380145665e-06, "loss": -0.0067, "num_tokens": 113166368.0, "reward": 2.8130388259887695, "reward_std": 0.08101053535938263, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.31303858757019043, "rewards/ngram_similarity_reward/std": 0.27182063460350037, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 371.671875, "completions/mean_terminated_length": 371.671875, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.3181919892593421, "frac_reward_zero_std": 0.0, "grad_norm": 0.07613039761781693, "learning_rate": 4.856093602185473e-06, "loss": 0.0029, "num_tokens": 113373771.0, "reward": 4.351356506347656, "reward_std": 0.5276011228561401, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.44510650634765625, "rewards/ngram_similarity_reward/std": 0.3404860198497772, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 451.890625, "completions/mean_terminated_length": 451.890625, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.3186395166703961, "frac_reward_zero_std": 0.0, "grad_norm": 0.08762478083372116, "learning_rate": 4.855507213387954e-06, "loss": -0.0114, "num_tokens": 113518340.0, "reward": 6.231166839599609, "reward_std": 0.29846394062042236, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.7311668395996094, "rewards/ngram_similarity_reward/std": 0.3233652412891388, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 463.609375, "completions/mean_terminated_length": 463.609375, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.31908704408145, "frac_reward_zero_std": 0.0, "grad_norm": 0.05265612527728081, "learning_rate": 4.854919671943021e-06, "loss": 0.0227, "num_tokens": 113650379.0, "reward": 2.966203212738037, "reward_std": 1.021661400794983, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.5599533319473267, "rewards/ngram_similarity_reward/std": 0.26916348934173584, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 353.75, "completions/mean_terminated_length": 353.75, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.3195345714925039, "frac_reward_zero_std": 0.0, "grad_norm": 0.07360535115003586, "learning_rate": 4.8543309781723235e-06, "loss": -0.002, "num_tokens": 113788443.0, "reward": 6.086665153503418, "reward_std": 0.5752182006835938, "rewards/accuracy_reward/mean": 5.40625, "rewards/accuracy_reward/std": 0.7500000596046448, "rewards/ngram_similarity_reward/mean": 0.6804152131080627, "rewards/ngram_similarity_reward/std": 0.31077635288238525, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 388.46875, "completions/mean_terminated_length": 388.46875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.31998209890355783, "frac_reward_zero_std": 0.0, "grad_norm": 0.06774787604808807, "learning_rate": 4.853741132398136e-06, "loss": -0.0142, "num_tokens": 113931817.0, "reward": 4.677279949188232, "reward_std": 0.9036829471588135, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.677280068397522, "rewards/ngram_similarity_reward/std": 0.32574334740638733, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 382.46875, "completions/mean_terminated_length": 382.46875, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.32042962631461175, "frac_reward_zero_std": 0.25, "grad_norm": 0.061975304037332535, "learning_rate": 4.853150134943367e-06, "loss": -0.0007, "num_tokens": 114050775.0, "reward": 2.2052435874938965, "reward_std": 1.7568916082382202, "rewards/accuracy_reward/mean": 1.65625, "rewards/accuracy_reward/std": 2.9016621112823486, "rewards/ngram_similarity_reward/mean": 0.5489935874938965, "rewards/ngram_similarity_reward/std": 0.265791654586792, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 437.515625, "completions/mean_terminated_length": 437.515625, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.3208771537256657, "frac_reward_zero_std": 0.0, "grad_norm": 0.05750018358230591, "learning_rate": 4.852557986131555e-06, "loss": -0.0013, "num_tokens": 114183704.0, "reward": 5.553138732910156, "reward_std": 1.0766979455947876, "rewards/accuracy_reward/mean": 5.125, "rewards/accuracy_reward/std": 1.4638501405715942, "rewards/ngram_similarity_reward/mean": 0.4281388521194458, "rewards/ngram_similarity_reward/std": 0.2769782245159149, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 807.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 522.546875, "completions/mean_terminated_length": 522.546875, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.32132468113671964, "frac_reward_zero_std": 0.0, "grad_norm": 0.05540424585342407, "learning_rate": 4.8519646862868675e-06, "loss": 0.0045, "num_tokens": 114336971.0, "reward": 3.764643907546997, "reward_std": 1.3306835889816284, "rewards/accuracy_reward/mean": 3.203125, "rewards/accuracy_reward/std": 2.995656728744507, "rewards/ngram_similarity_reward/mean": 0.5615189075469971, "rewards/ngram_similarity_reward/std": 0.3798581063747406, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 403.09375, "completions/mean_terminated_length": 403.09375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.32177220854777355, "frac_reward_zero_std": 0.0, "grad_norm": 0.07903230935335159, "learning_rate": 4.851370235734103e-06, "loss": 0.0372, "num_tokens": 114479969.0, "reward": 5.639060974121094, "reward_std": 1.0079065561294556, "rewards/accuracy_reward/mean": 4.84375, "rewards/accuracy_reward/std": 1.8874586820602417, "rewards/ngram_similarity_reward/mean": 0.795311450958252, "rewards/ngram_similarity_reward/std": 0.32375478744506836, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 449.453125, "completions/mean_terminated_length": 449.453125, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.32221973595882747, "frac_reward_zero_std": 0.0, "grad_norm": 0.06887117773294449, "learning_rate": 4.85077463479869e-06, "loss": -0.0308, "num_tokens": 114705374.0, "reward": 1.1147079467773438, "reward_std": 0.863875150680542, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 2.381934404373169, "rewards/ngram_similarity_reward/mean": 0.5209579467773438, "rewards/ngram_similarity_reward/std": 0.2823002338409424, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 391.6875, "completions/mean_terminated_length": 391.6875, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.3226672633698814, "frac_reward_zero_std": 0.0, "grad_norm": 0.07737100124359131, "learning_rate": 4.850177883806688e-06, "loss": 0.0387, "num_tokens": 114853418.0, "reward": 3.036330223083496, "reward_std": 0.6309786438941956, "rewards/accuracy_reward/mean": 2.53125, "rewards/accuracy_reward/std": 3.0961766242980957, "rewards/ngram_similarity_reward/mean": 0.5050802230834961, "rewards/ngram_similarity_reward/std": 0.40981554985046387, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 490.0625, "completions/mean_terminated_length": 490.0625, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.32311479078093536, "frac_reward_zero_std": 0.0, "grad_norm": 0.06010664626955986, "learning_rate": 4.849579983084782e-06, "loss": 0.0351, "num_tokens": 115037470.0, "reward": 4.774776935577393, "reward_std": 1.4736918210983276, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7747770547866821, "rewards/ngram_similarity_reward/std": 0.26835912466049194, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 355.046875, "completions/mean_terminated_length": 355.046875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.3235623181919893, "frac_reward_zero_std": 0.0, "grad_norm": 0.07075344771146774, "learning_rate": 4.848980932960292e-06, "loss": 0.0033, "num_tokens": 115192289.0, "reward": 2.6875319480895996, "reward_std": 1.4031498432159424, "rewards/accuracy_reward/mean": 2.03125, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.6562820672988892, "rewards/ngram_similarity_reward/std": 0.36360806226730347, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 399.28125, "completions/mean_terminated_length": 399.28125, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.3240098456030432, "frac_reward_zero_std": 0.0, "grad_norm": 0.0756743997335434, "learning_rate": 4.848380733761164e-06, "loss": -0.0084, "num_tokens": 115352051.0, "reward": 4.374879837036133, "reward_std": 2.440624237060547, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.7498795986175537, "rewards/ngram_similarity_reward/std": 0.2921641767024994, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 411.890625, "completions/mean_terminated_length": 411.890625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.3244573730140971, "frac_reward_zero_std": 0.0, "grad_norm": 0.07049600034952164, "learning_rate": 4.847779385815971e-06, "loss": -0.0151, "num_tokens": 115532108.0, "reward": 4.49118709564209, "reward_std": 0.833372950553894, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.866187334060669, "rewards/ngram_similarity_reward/std": 0.28561943769454956, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 783.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 432.625, "completions/mean_terminated_length": 432.625, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.324904900425151, "frac_reward_zero_std": 0.0, "grad_norm": 0.08293016254901886, "learning_rate": 4.847176889453921e-06, "loss": -0.0273, "num_tokens": 115678692.0, "reward": 5.021811485290527, "reward_std": 0.9971826672554016, "rewards/accuracy_reward/mean": 4.46875, "rewards/accuracy_reward/std": 2.2815253734588623, "rewards/ngram_similarity_reward/mean": 0.5530613660812378, "rewards/ngram_similarity_reward/std": 0.3139452338218689, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 452.484375, "completions/mean_terminated_length": 452.484375, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.325352427836205, "frac_reward_zero_std": 0.0, "grad_norm": 0.06269259005784988, "learning_rate": 4.846573245004844e-06, "loss": -0.0029, "num_tokens": 115835971.0, "reward": 4.7243499755859375, "reward_std": 0.22585970163345337, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7243503332138062, "rewards/ngram_similarity_reward/std": 0.3056999444961548, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 360.0625, "completions/mean_terminated_length": 360.0625, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.3257999552472589, "frac_reward_zero_std": 0.0, "grad_norm": 0.08586455881595612, "learning_rate": 4.845968452799203e-06, "loss": -0.0181, "num_tokens": 115954839.0, "reward": 1.1131083965301514, "reward_std": 1.2290589809417725, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 2.3712377548217773, "rewards/ngram_similarity_reward/mean": 0.5037335157394409, "rewards/ngram_similarity_reward/std": 0.2511395812034607, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 386.21875, "completions/mean_terminated_length": 386.21875, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.3262474826583128, "frac_reward_zero_std": 0.0, "grad_norm": 0.06456200033426285, "learning_rate": 4.845362513168088e-06, "loss": 0.0137, "num_tokens": 116087573.0, "reward": 3.3358211517333984, "reward_std": 2.3673391342163086, "rewards/accuracy_reward/mean": 2.84375, "rewards/accuracy_reward/std": 3.0405657291412354, "rewards/ngram_similarity_reward/mean": 0.4920710325241089, "rewards/ngram_similarity_reward/std": 0.3257303833961487, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 417.484375, "completions/mean_terminated_length": 417.484375, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.32669501006936674, "frac_reward_zero_std": 0.0, "grad_norm": 0.069620780646801, "learning_rate": 4.844755426443216e-06, "loss": -0.0308, "num_tokens": 116250084.0, "reward": 4.713657855987549, "reward_std": 0.9554732441902161, "rewards/accuracy_reward/mean": 4.09375, "rewards/accuracy_reward/std": 2.5617377758026123, "rewards/ngram_similarity_reward/mean": 0.6199080348014832, "rewards/ngram_similarity_reward/std": 0.2207847386598587, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 398.28125, "completions/mean_terminated_length": 398.28125, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.32714253748042066, "frac_reward_zero_std": 0.0, "grad_norm": 0.06426721811294556, "learning_rate": 4.844147192956935e-06, "loss": -0.0432, "num_tokens": 116370214.0, "reward": 3.733760118484497, "reward_std": 0.9057095646858215, "rewards/accuracy_reward/mean": 3.0625, "rewards/accuracy_reward/std": 2.9700891971588135, "rewards/ngram_similarity_reward/mean": 0.6712601184844971, "rewards/ngram_similarity_reward/std": 0.30741092562675476, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 411.953125, "completions/mean_terminated_length": 411.953125, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.32759006489147463, "frac_reward_zero_std": 0.0, "grad_norm": 0.07242885231971741, "learning_rate": 4.843537813042217e-06, "loss": 0.0345, "num_tokens": 116514611.0, "reward": 4.382899284362793, "reward_std": 1.9558072090148926, "rewards/accuracy_reward/mean": 3.6875, "rewards/accuracy_reward/std": 2.816476583480835, "rewards/ngram_similarity_reward/mean": 0.695399284362793, "rewards/ngram_similarity_reward/std": 0.3914608657360077, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 476.765625, "completions/mean_terminated_length": 476.765625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.32803759230252855, "frac_reward_zero_std": 0.0, "grad_norm": 0.07082340866327286, "learning_rate": 4.8429272870326635e-06, "loss": -0.0057, "num_tokens": 116651812.0, "reward": 5.053439140319824, "reward_std": 1.3016170263290405, "rewards/accuracy_reward/mean": 4.46875, "rewards/accuracy_reward/std": 2.2815253734588623, "rewards/ngram_similarity_reward/mean": 0.5846890211105347, "rewards/ngram_similarity_reward/std": 0.24929215013980865, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1258.0, "completions/max_terminated_length": 1258.0, "completions/mean_length": 443.0625, "completions/mean_terminated_length": 443.0625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.32848511971358246, "frac_reward_zero_std": 0.0, "grad_norm": 0.07560112327337265, "learning_rate": 4.842315615262504e-06, "loss": -0.0019, "num_tokens": 116781128.0, "reward": -0.02607668563723564, "reward_std": 0.24621905386447906, "rewards/accuracy_reward/mean": -0.59375, "rewards/accuracy_reward/std": 0.29378482699394226, "rewards/ngram_similarity_reward/mean": 0.5676733255386353, "rewards/ngram_similarity_reward/std": 0.35898521542549133, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 322.625, "completions/mean_terminated_length": 322.625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.3289326471246364, "frac_reward_zero_std": 0.25, "grad_norm": 0.07953772693872452, "learning_rate": 4.8417027980665945e-06, "loss": -0.0063, "num_tokens": 116935056.0, "reward": 5.001359462738037, "reward_std": 0.703096866607666, "rewards/accuracy_reward/mean": 4.28125, "rewards/accuracy_reward/std": 2.4330317974090576, "rewards/ngram_similarity_reward/mean": 0.7201095819473267, "rewards/ngram_similarity_reward/std": 0.4384860098361969, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 386.515625, "completions/mean_terminated_length": 386.515625, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.3293801745356903, "frac_reward_zero_std": 0.0, "grad_norm": 0.07305855304002762, "learning_rate": 4.8410888357804176e-06, "loss": -0.009, "num_tokens": 117111985.0, "reward": 4.526159763336182, "reward_std": 0.1608743965625763, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5261598229408264, "rewards/ngram_similarity_reward/std": 0.30847957730293274, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1072.0, "completions/max_terminated_length": 1072.0, "completions/mean_length": 438.9375, "completions/mean_terminated_length": 438.9375, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.3298277019467442, "frac_reward_zero_std": 0.0, "grad_norm": 0.07190577685832977, "learning_rate": 4.840473728740084e-06, "loss": 0.0651, "num_tokens": 117270173.0, "reward": 3.601214647293091, "reward_std": 1.6631498336791992, "rewards/accuracy_reward/mean": 2.953125, "rewards/accuracy_reward/std": 3.0075550079345703, "rewards/ngram_similarity_reward/mean": 0.6480897665023804, "rewards/ngram_similarity_reward/std": 0.3352556824684143, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 902.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 562.625, "completions/mean_terminated_length": 562.625, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.3302752293577982, "frac_reward_zero_std": 0.0, "grad_norm": 0.06534037739038467, "learning_rate": 4.839857477282331e-06, "loss": -0.0154, "num_tokens": 117433365.0, "reward": 4.571503639221191, "reward_std": 0.13676393032073975, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5715036392211914, "rewards/ngram_similarity_reward/std": 0.23960945010185242, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 716.0, "completions/max_terminated_length": 716.0, "completions/mean_length": 418.8125, "completions/mean_terminated_length": 418.8125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.3307227567688521, "frac_reward_zero_std": 0.0, "grad_norm": 0.06592286378145218, "learning_rate": 4.83924008174452e-06, "loss": -0.0127, "num_tokens": 117608409.0, "reward": 6.083271026611328, "reward_std": 0.12093131989240646, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.5832710266113281, "rewards/ngram_similarity_reward/std": 0.24732209742069244, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 494.421875, "completions/mean_terminated_length": 494.421875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.331170284179906, "frac_reward_zero_std": 0.0, "grad_norm": 0.05999520793557167, "learning_rate": 4.838621542464642e-06, "loss": -0.0446, "num_tokens": 117759108.0, "reward": 4.562652587890625, "reward_std": 0.8388803601264954, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.562652587890625, "rewards/ngram_similarity_reward/std": 0.48074784874916077, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 366.03125, "completions/mean_terminated_length": 366.03125, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.33161781159095993, "frac_reward_zero_std": 0.0, "grad_norm": 0.09077738970518112, "learning_rate": 4.838001859781311e-06, "loss": 0.0005, "num_tokens": 117964454.0, "reward": 2.956620216369629, "reward_std": 0.8590655326843262, "rewards/accuracy_reward/mean": 2.46875, "rewards/accuracy_reward/std": 3.06007981300354, "rewards/ngram_similarity_reward/mean": 0.48787015676498413, "rewards/ngram_similarity_reward/std": 0.42288804054260254, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 484.375, "completions/mean_terminated_length": 484.375, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.33206533900201385, "frac_reward_zero_std": 0.0, "grad_norm": 0.0587838850915432, "learning_rate": 4.8373810340337704e-06, "loss": -0.0207, "num_tokens": 118144622.0, "reward": 5.071628570556641, "reward_std": 0.6989190578460693, "rewards/accuracy_reward/mean": 4.28125, "rewards/accuracy_reward/std": 2.4330317974090576, "rewards/ngram_similarity_reward/mean": 0.7903785705566406, "rewards/ngram_similarity_reward/std": 0.3258940875530243, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 374.671875, "completions/mean_terminated_length": 374.671875, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.3325128664130678, "frac_reward_zero_std": 0.0, "grad_norm": 0.06940338015556335, "learning_rate": 4.836759065561887e-06, "loss": -0.0062, "num_tokens": 118280185.0, "reward": 4.492125034332275, "reward_std": 1.3534945249557495, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.6796249151229858, "rewards/ngram_similarity_reward/std": 0.30835971236228943, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 943.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 614.46875, "completions/mean_terminated_length": 614.46875, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.33296039382412174, "frac_reward_zero_std": 0.0, "grad_norm": 0.05301578715443611, "learning_rate": 4.8361359547061535e-06, "loss": -0.0017, "num_tokens": 118436999.0, "reward": 1.7326395511627197, "reward_std": 2.1927366256713867, "rewards/accuracy_reward/mean": 1.34375, "rewards/accuracy_reward/std": 2.8296544551849365, "rewards/ngram_similarity_reward/mean": 0.3888895511627197, "rewards/ngram_similarity_reward/std": 0.20641785860061646, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 575.4375, "completions/mean_terminated_length": 575.4375, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.33340792123517565, "frac_reward_zero_std": 0.0, "grad_norm": 0.04947161301970482, "learning_rate": 4.835511701807689e-06, "loss": -0.0175, "num_tokens": 118618227.0, "reward": 3.2324278354644775, "reward_std": 0.16435107588768005, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.7324278354644775, "rewards/ngram_similarity_reward/std": 0.2556518018245697, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 444.5625, "completions/mean_terminated_length": 444.5625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.33385544864622957, "frac_reward_zero_std": 0.0, "grad_norm": 0.06482308357954025, "learning_rate": 4.834886307208235e-06, "loss": -0.0144, "num_tokens": 118742487.0, "reward": 2.9939393997192383, "reward_std": 0.10695018619298935, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.4939391314983368, "rewards/ngram_similarity_reward/std": 0.22385334968566895, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 466.1875, "completions/mean_terminated_length": 466.1875, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.3343029760572835, "frac_reward_zero_std": 0.0, "grad_norm": 0.06717753410339355, "learning_rate": 4.834259771250162e-06, "loss": 0.0452, "num_tokens": 118882035.0, "reward": 5.7880048751831055, "reward_std": 1.249608039855957, "rewards/accuracy_reward/mean": 5.015625, "rewards/accuracy_reward/std": 1.68081796169281, "rewards/ngram_similarity_reward/mean": 0.7723801732063293, "rewards/ngram_similarity_reward/std": 0.2816872000694275, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 434.125, "completions/mean_terminated_length": 434.125, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.33475050346833746, "frac_reward_zero_std": 0.0, "grad_norm": 0.07000329345464706, "learning_rate": 4.8336320942764634e-06, "loss": -0.0102, "num_tokens": 119045499.0, "reward": 4.440940856933594, "reward_std": 0.5408289432525635, "rewards/accuracy_reward/mean": 3.890625, "rewards/accuracy_reward/std": 2.6998953819274902, "rewards/ngram_similarity_reward/mean": 0.5503160357475281, "rewards/ngram_similarity_reward/std": 0.38620853424072266, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 495.296875, "completions/mean_terminated_length": 495.296875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.3351980308793914, "frac_reward_zero_std": 0.25, "grad_norm": 0.052323050796985626, "learning_rate": 4.833003276630756e-06, "loss": 0.0275, "num_tokens": 119183662.0, "reward": 1.5198101997375488, "reward_std": 0.09568122029304504, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5198101997375488, "rewards/ngram_similarity_reward/std": 0.4339534342288971, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 376.3125, "completions/mean_terminated_length": 376.3125, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.3356455582904453, "frac_reward_zero_std": 0.0, "grad_norm": 0.07794438302516937, "learning_rate": 4.832373318657283e-06, "loss": -0.0132, "num_tokens": 119396882.0, "reward": 4.292140960693359, "reward_std": 0.7269996404647827, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.5733910799026489, "rewards/ngram_similarity_reward/std": 0.24113622307777405, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 445.359375, "completions/mean_terminated_length": 445.359375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.3360930857014992, "frac_reward_zero_std": 0.0, "grad_norm": 0.0707058385014534, "learning_rate": 4.831742220700911e-06, "loss": 0.0098, "num_tokens": 119628553.0, "reward": 4.441817760467529, "reward_std": 1.7053776979446411, "rewards/accuracy_reward/mean": 3.609375, "rewards/accuracy_reward/std": 2.829084634780884, "rewards/ngram_similarity_reward/mean": 0.8324428796768188, "rewards/ngram_similarity_reward/std": 0.24423421919345856, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 437.1875, "completions/mean_terminated_length": 437.1875, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.3365406131125531, "frac_reward_zero_std": 0.0, "grad_norm": 0.0756855458021164, "learning_rate": 4.8311099831071316e-06, "loss": 0.0006, "num_tokens": 119767557.0, "reward": 2.9687860012054443, "reward_std": 1.2867786884307861, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.5625358819961548, "rewards/ngram_similarity_reward/std": 0.37464669346809387, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 426.015625, "completions/mean_terminated_length": 426.015625, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.3369881405236071, "frac_reward_zero_std": 0.0, "grad_norm": 0.07023408263921738, "learning_rate": 4.830476606222058e-06, "loss": -0.015, "num_tokens": 119951910.0, "reward": 1.4517123699188232, "reward_std": 0.5338603258132935, "rewards/accuracy_reward/mean": 0.890625, "rewards/accuracy_reward/std": 2.5734739303588867, "rewards/ngram_similarity_reward/mean": 0.5610873699188232, "rewards/ngram_similarity_reward/std": 0.2960151731967926, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 398.234375, "completions/mean_terminated_length": 398.234375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.337435667934661, "frac_reward_zero_std": 0.0, "grad_norm": 0.07396300137042999, "learning_rate": 4.8298420903924294e-06, "loss": 0.0015, "num_tokens": 120117189.0, "reward": 0.6071276664733887, "reward_std": 1.2192909717559814, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 1.8874586820602417, "rewards/ngram_similarity_reward/mean": 0.45087766647338867, "rewards/ngram_similarity_reward/std": 0.3548135757446289, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 406.609375, "completions/mean_terminated_length": 406.609375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.3378831953457149, "frac_reward_zero_std": 0.0, "grad_norm": 0.0677541047334671, "learning_rate": 4.829206435965608e-06, "loss": 0.038, "num_tokens": 120254060.0, "reward": 5.05821418762207, "reward_std": 1.5589964389801025, "rewards/accuracy_reward/mean": 4.5625, "rewards/accuracy_reward/std": 2.195775270462036, "rewards/ngram_similarity_reward/mean": 0.4957137703895569, "rewards/ngram_similarity_reward/std": 0.28042688965797424, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 461.9375, "completions/mean_terminated_length": 461.9375, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.33833072275676884, "frac_reward_zero_std": 0.0, "grad_norm": 0.07829014956951141, "learning_rate": 4.828569643289579e-06, "loss": 0.0159, "num_tokens": 120394472.0, "reward": 5.997868537902832, "reward_std": 0.09310601651668549, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.49786829948425293, "rewards/ngram_similarity_reward/std": 0.2516043186187744, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 406.875, "completions/mean_terminated_length": 406.875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.33877825016782276, "frac_reward_zero_std": 0.0, "grad_norm": 0.0906175747513771, "learning_rate": 4.827931712712951e-06, "loss": 0.0037, "num_tokens": 120568144.0, "reward": 5.099058151245117, "reward_std": 0.9228776097297668, "rewards/accuracy_reward/mean": 4.453125, "rewards/accuracy_reward/std": 2.319206953048706, "rewards/ngram_similarity_reward/mean": 0.6459333300590515, "rewards/ngram_similarity_reward/std": 0.30590537190437317, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 428.671875, "completions/mean_terminated_length": 428.671875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.33922577757887673, "frac_reward_zero_std": 0.0, "grad_norm": 0.06237898766994476, "learning_rate": 4.827292644584954e-06, "loss": -0.0214, "num_tokens": 120700987.0, "reward": 4.630217552185059, "reward_std": 0.129164919257164, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6302177906036377, "rewards/ngram_similarity_reward/std": 0.35490062832832336, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 370.0625, "completions/mean_terminated_length": 370.0625, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.33967330498993065, "frac_reward_zero_std": 0.0, "grad_norm": 0.06669400632381439, "learning_rate": 4.826652439255443e-06, "loss": -0.0037, "num_tokens": 120831663.0, "reward": 3.382824420928955, "reward_std": 1.113814115524292, "rewards/accuracy_reward/mean": 2.5625, "rewards/accuracy_reward/std": 3.2702362537384033, "rewards/ngram_similarity_reward/mean": 0.8203244805335999, "rewards/ngram_similarity_reward/std": 0.26841795444488525, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 439.1875, "completions/mean_terminated_length": 439.1875, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.34012083240098456, "frac_reward_zero_std": 0.0, "grad_norm": 0.06376458704471588, "learning_rate": 4.826011097074895e-06, "loss": 0.0206, "num_tokens": 120967899.0, "reward": 4.12385368347168, "reward_std": 0.9235842823982239, "rewards/accuracy_reward/mean": 3.53125, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.5926036238670349, "rewards/ngram_similarity_reward/std": 0.2803894877433777, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 401.859375, "completions/mean_terminated_length": 401.859375, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.3405683598120385, "frac_reward_zero_std": 0.0, "grad_norm": 0.08693066984415054, "learning_rate": 4.825368618394407e-06, "loss": 0.0204, "num_tokens": 121179874.0, "reward": 1.5503162145614624, "reward_std": 0.8148245215415955, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5503160953521729, "rewards/ngram_similarity_reward/std": 0.14231202006340027, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 380.671875, "completions/mean_terminated_length": 354.20635986328125, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.3410158872230924, "frac_reward_zero_std": 0.0, "grad_norm": 0.0824568048119545, "learning_rate": 4.8247250035657036e-06, "loss": 0.0296, "num_tokens": 121407437.0, "reward": 4.567397117614746, "reward_std": 0.2737312912940979, "rewards/accuracy_reward/mean": 3.921875, "rewards/accuracy_reward/std": 2.764885902404785, "rewards/ngram_similarity_reward/mean": 0.6455221176147461, "rewards/ngram_similarity_reward/std": 0.26874208450317383, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 464.359375, "completions/mean_terminated_length": 464.359375, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.34146341463414637, "frac_reward_zero_std": 0.0, "grad_norm": 0.07261194288730621, "learning_rate": 4.824080252941125e-06, "loss": 0.0008, "num_tokens": 121575588.0, "reward": 5.645310878753662, "reward_std": 0.9448514580726624, "rewards/accuracy_reward/mean": 5.03125, "rewards/accuracy_reward/std": 1.6229382753372192, "rewards/ngram_similarity_reward/mean": 0.614061176776886, "rewards/ngram_similarity_reward/std": 0.26361551880836487, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 477.1875, "completions/mean_terminated_length": 477.1875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.3419109420452003, "frac_reward_zero_std": 0.0, "grad_norm": 0.06376535445451736, "learning_rate": 4.823434366873636e-06, "loss": -0.0149, "num_tokens": 121735952.0, "reward": 5.15456485748291, "reward_std": 1.620121955871582, "rewards/accuracy_reward/mean": 4.5625, "rewards/accuracy_reward/std": 2.195775270462036, "rewards/ngram_similarity_reward/mean": 0.5920648574829102, "rewards/ngram_similarity_reward/std": 0.350754052400589, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 404.796875, "completions/mean_terminated_length": 378.71429443359375, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.3423584694562542, "frac_reward_zero_std": 0.0, "grad_norm": 0.07600030303001404, "learning_rate": 4.822787345716826e-06, "loss": 0.0008, "num_tokens": 121888595.0, "reward": 2.8059439659118652, "reward_std": 0.07767674326896667, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.3059441149234772, "rewards/ngram_similarity_reward/std": 0.10181257873773575, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 415.078125, "completions/mean_terminated_length": 415.078125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.3428059968673081, "frac_reward_zero_std": 0.0, "grad_norm": 0.09150941669940948, "learning_rate": 4.8221391898249005e-06, "loss": -0.0073, "num_tokens": 122045240.0, "reward": 2.7265686988830566, "reward_std": 0.7440415024757385, "rewards/accuracy_reward/mean": 2.125, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.6015689969062805, "rewards/ngram_similarity_reward/std": 0.32731401920318604, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 390.046875, "completions/mean_terminated_length": 390.046875, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.34325352427836203, "frac_reward_zero_std": 0.0, "grad_norm": 0.08688319474458694, "learning_rate": 4.821489899552688e-06, "loss": -0.0285, "num_tokens": 122209819.0, "reward": 4.436158657073975, "reward_std": 0.49523043632507324, "rewards/accuracy_reward/mean": 4.09375, "rewards/accuracy_reward/std": 2.5617377758026123, "rewards/ngram_similarity_reward/mean": 0.3424083888530731, "rewards/ngram_similarity_reward/std": 0.2369903326034546, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 460.25, "completions/mean_terminated_length": 460.25, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.343701051689416, "frac_reward_zero_std": 0.0, "grad_norm": 0.08063489198684692, "learning_rate": 4.820839475255641e-06, "loss": -0.0008, "num_tokens": 122410091.0, "reward": -0.36356696486473083, "reward_std": 0.26210200786590576, "rewards/accuracy_reward/mean": -0.625, "rewards/accuracy_reward/std": 0.3333333432674408, "rewards/ngram_similarity_reward/mean": 0.26143306493759155, "rewards/ngram_similarity_reward/std": 0.10493180900812149, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 444.28125, "completions/mean_terminated_length": 444.28125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.3441485791004699, "frac_reward_zero_std": 0.0, "grad_norm": 0.09552402794361115, "learning_rate": 4.820187917289829e-06, "loss": 0.0433, "num_tokens": 122593261.0, "reward": 2.9167373180389404, "reward_std": 0.44910675287246704, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.51048743724823, "rewards/ngram_similarity_reward/std": 0.2428485006093979, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 417.625, "completions/mean_terminated_length": 417.625, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.34459610651152384, "frac_reward_zero_std": 0.0, "grad_norm": 0.07206974923610687, "learning_rate": 4.819535226011943e-06, "loss": 0.0232, "num_tokens": 122745157.0, "reward": 3.581268072128296, "reward_std": 1.694632649421692, "rewards/accuracy_reward/mean": 2.96875, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.6125181317329407, "rewards/ngram_similarity_reward/std": 0.44275662302970886, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 320.015625, "completions/mean_terminated_length": 320.015625, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.34504363392257775, "frac_reward_zero_std": 0.0, "grad_norm": 0.09001357853412628, "learning_rate": 4.818881401779296e-06, "loss": 0.0099, "num_tokens": 122867302.0, "reward": 4.758194923400879, "reward_std": 0.22100293636322021, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7581952810287476, "rewards/ngram_similarity_reward/std": 0.3720369338989258, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 379.1875, "completions/mean_terminated_length": 379.1875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.34549116133363167, "frac_reward_zero_std": 0.0, "grad_norm": 0.0792035311460495, "learning_rate": 4.818226444949819e-06, "loss": -0.0154, "num_tokens": 123002306.0, "reward": 4.732837677001953, "reward_std": 0.5926983952522278, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.826587438583374, "rewards/ngram_similarity_reward/std": 0.31210973858833313, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 441.296875, "completions/mean_terminated_length": 441.296875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.3459386887446856, "frac_reward_zero_std": 0.0, "grad_norm": 0.07282985746860504, "learning_rate": 4.817570355882067e-06, "loss": 0.0359, "num_tokens": 123138501.0, "reward": 0.6833294630050659, "reward_std": 0.9691657423973083, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 1.8874586820602417, "rewards/ngram_similarity_reward/mean": 0.5270794630050659, "rewards/ngram_similarity_reward/std": 0.3545078933238983, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 416.359375, "completions/mean_terminated_length": 416.359375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.34638621615573956, "frac_reward_zero_std": 0.0, "grad_norm": 0.06430497765541077, "learning_rate": 4.816913134935208e-06, "loss": 0.0343, "num_tokens": 123296300.0, "reward": 5.439652442932129, "reward_std": 1.4340357780456543, "rewards/accuracy_reward/mean": 4.5625, "rewards/accuracy_reward/std": 2.195775270462036, "rewards/ngram_similarity_reward/mean": 0.8771523237228394, "rewards/ngram_similarity_reward/std": 0.29859817028045654, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 393.5625, "completions/mean_terminated_length": 393.5625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.3468337435667935, "frac_reward_zero_std": 0.0, "grad_norm": 0.0746258869767189, "learning_rate": 4.8162547824690365e-06, "loss": 0.0325, "num_tokens": 123456832.0, "reward": 4.419753551483154, "reward_std": 0.6457646489143372, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.6072536706924438, "rewards/ngram_similarity_reward/std": 0.2622377872467041, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 444.96875, "completions/mean_terminated_length": 444.96875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.3472812709778474, "frac_reward_zero_std": 0.0, "grad_norm": 0.07858631014823914, "learning_rate": 4.815595298843963e-06, "loss": 0.0147, "num_tokens": 123606398.0, "reward": 1.5636564493179321, "reward_std": 1.172206163406372, "rewards/accuracy_reward/mean": 0.96875, "rewards/accuracy_reward/std": 2.6425621509552, "rewards/ngram_similarity_reward/mean": 0.5949063897132874, "rewards/ngram_similarity_reward/std": 0.3762511909008026, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 532.890625, "completions/mean_terminated_length": 532.890625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.3477287983889013, "frac_reward_zero_std": 0.0, "grad_norm": 0.06011474132537842, "learning_rate": 4.814934684421018e-06, "loss": -0.0029, "num_tokens": 123752823.0, "reward": 3.070727586746216, "reward_std": 0.12144973129034042, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.5707273483276367, "rewards/ngram_similarity_reward/std": 0.39387303590774536, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 454.28125, "completions/mean_terminated_length": 454.28125, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.3481763257999552, "frac_reward_zero_std": 0.0, "grad_norm": 0.06932368129491806, "learning_rate": 4.8142729395618505e-06, "loss": -0.0243, "num_tokens": 123903465.0, "reward": 1.4879626035690308, "reward_std": 0.9695565104484558, "rewards/accuracy_reward/mean": 0.90625, "rewards/accuracy_reward/std": 2.5617377758026123, "rewards/ngram_similarity_reward/mean": 0.581712543964386, "rewards/ngram_similarity_reward/std": 0.21801921725273132, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 453.890625, "completions/mean_terminated_length": 453.890625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.3486238532110092, "frac_reward_zero_std": 0.0, "grad_norm": 0.0718824565410614, "learning_rate": 4.813610064628729e-06, "loss": -0.0058, "num_tokens": 124056482.0, "reward": 5.090573310852051, "reward_std": 1.7853457927703857, "rewards/accuracy_reward/mean": 4.28125, "rewards/accuracy_reward/std": 2.4330317974090576, "rewards/ngram_similarity_reward/mean": 0.8093235492706299, "rewards/ngram_similarity_reward/std": 0.39342236518859863, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 361.140625, "completions/mean_terminated_length": 361.140625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.3490713806220631, "frac_reward_zero_std": 0.0, "grad_norm": 0.13112983107566833, "learning_rate": 4.8129460599845416e-06, "loss": -0.0114, "num_tokens": 124233579.0, "reward": 2.248539686203003, "reward_std": 1.9886976480484009, "rewards/accuracy_reward/mean": 1.65625, "rewards/accuracy_reward/std": 3.32961106300354, "rewards/ngram_similarity_reward/mean": 0.5922897458076477, "rewards/ngram_similarity_reward/std": 0.321114718914032, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 391.046875, "completions/mean_terminated_length": 391.046875, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.349518908033117, "frac_reward_zero_std": 0.0, "grad_norm": 0.07911917567253113, "learning_rate": 4.812280925992791e-06, "loss": 0.0378, "num_tokens": 124383454.0, "reward": 2.1933369636535645, "reward_std": 1.624046802520752, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 2.872281312942505, "rewards/ngram_similarity_reward/mean": 0.630837082862854, "rewards/ngram_similarity_reward/std": 0.3428136706352234, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 405.171875, "completions/mean_terminated_length": 405.171875, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.34996643544417094, "frac_reward_zero_std": 0.0, "grad_norm": 0.08272143453359604, "learning_rate": 4.811614663017603e-06, "loss": -0.0183, "num_tokens": 124575481.0, "reward": 4.482691764831543, "reward_std": 0.5504340529441833, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.5764422416687012, "rewards/ngram_similarity_reward/std": 0.2652113139629364, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 458.015625, "completions/mean_terminated_length": 458.015625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.35041396285522486, "frac_reward_zero_std": 0.0, "grad_norm": 0.06321236491203308, "learning_rate": 4.810947271423719e-06, "loss": -0.0047, "num_tokens": 124745994.0, "reward": 4.337434768676758, "reward_std": 1.5873150825500488, "rewards/accuracy_reward/mean": 3.5625, "rewards/accuracy_reward/std": 2.905249834060669, "rewards/ngram_similarity_reward/mean": 0.7749345302581787, "rewards/ngram_similarity_reward/std": 0.27499550580978394, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 494.0, "completions/mean_terminated_length": 494.0, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.35086149026627883, "frac_reward_zero_std": 0.0, "grad_norm": 0.06817396730184555, "learning_rate": 4.810278751576498e-06, "loss": 0.0293, "num_tokens": 124899482.0, "reward": 1.5161876678466797, "reward_std": 0.1321793794631958, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5161875486373901, "rewards/ngram_similarity_reward/std": 0.29837268590927124, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 526.40625, "completions/mean_terminated_length": 526.40625, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.35130901767733275, "frac_reward_zero_std": 0.0, "grad_norm": 0.07075538486242294, "learning_rate": 4.809609103841917e-06, "loss": -0.0369, "num_tokens": 125075924.0, "reward": 4.434210300445557, "reward_std": 0.5480722188949585, "rewards/accuracy_reward/mean": 3.890625, "rewards/accuracy_reward/std": 2.6998953819274902, "rewards/ngram_similarity_reward/mean": 0.543585479259491, "rewards/ngram_similarity_reward/std": 0.30443742871284485, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 455.796875, "completions/mean_terminated_length": 455.796875, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.35175654508838666, "frac_reward_zero_std": 0.0, "grad_norm": 0.08288660645484924, "learning_rate": 4.808938328586573e-06, "loss": 0.0019, "num_tokens": 125272263.0, "reward": 0.5084143877029419, "reward_std": 1.5748283863067627, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 1.9065468311309814, "rewards/ngram_similarity_reward/mean": 0.3834143877029419, "rewards/ngram_similarity_reward/std": 0.21710476279258728, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 425.21875, "completions/mean_terminated_length": 425.21875, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.3522040724994406, "frac_reward_zero_std": 0.0, "grad_norm": 0.09674753993749619, "learning_rate": 4.808266426177674e-06, "loss": -0.0288, "num_tokens": 125513173.0, "reward": 1.8925563097000122, "reward_std": 0.9368428587913513, "rewards/accuracy_reward/mean": 1.484375, "rewards/accuracy_reward/std": 2.941181182861328, "rewards/ngram_similarity_reward/mean": 0.40818145871162415, "rewards/ngram_similarity_reward/std": 0.2608228325843811, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 356.671875, "completions/mean_terminated_length": 356.671875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.3526515999104945, "frac_reward_zero_std": 0.25, "grad_norm": 0.06609658896923065, "learning_rate": 4.807593396983053e-06, "loss": -0.0023, "num_tokens": 125681984.0, "reward": 4.787121295928955, "reward_std": 1.3088996410369873, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7871214151382446, "rewards/ngram_similarity_reward/std": 0.33557581901550293, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 446.96875, "completions/mean_terminated_length": 446.96875, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.35309912732154847, "frac_reward_zero_std": 0.0, "grad_norm": 0.0749947801232338, "learning_rate": 4.806919241371153e-06, "loss": -0.0101, "num_tokens": 125865406.0, "reward": 2.938890218734741, "reward_std": 1.8307483196258545, "rewards/accuracy_reward/mean": 2.28125, "rewards/accuracy_reward/std": 3.1596100330352783, "rewards/ngram_similarity_reward/mean": 0.6576401591300964, "rewards/ngram_similarity_reward/std": 0.3600626289844513, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 387.3125, "completions/mean_terminated_length": 387.3125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.3535466547326024, "frac_reward_zero_std": 0.0, "grad_norm": 0.07980632036924362, "learning_rate": 4.806243959711037e-06, "loss": -0.0154, "num_tokens": 126032418.0, "reward": 2.545635223388672, "reward_std": 1.1480920314788818, "rewards/accuracy_reward/mean": 2.015625, "rewards/accuracy_reward/std": 3.00260329246521, "rewards/ngram_similarity_reward/mean": 0.5300101041793823, "rewards/ngram_similarity_reward/std": 0.27905625104904175, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 391.875, "completions/mean_terminated_length": 391.875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.3539941821436563, "frac_reward_zero_std": 0.0, "grad_norm": 0.07998666167259216, "learning_rate": 4.805567552372385e-06, "loss": 0.0563, "num_tokens": 126182538.0, "reward": 6.1160430908203125, "reward_std": 0.5871672034263611, "rewards/accuracy_reward/mean": 5.390625, "rewards/accuracy_reward/std": 0.8750000596046448, "rewards/ngram_similarity_reward/mean": 0.7254180312156677, "rewards/ngram_similarity_reward/std": 0.30045783519744873, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 465.40625, "completions/mean_terminated_length": 465.40625, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.3544417095547102, "frac_reward_zero_std": 0.0, "grad_norm": 0.06252842396497726, "learning_rate": 4.804890019725492e-06, "loss": 0.0193, "num_tokens": 126348212.0, "reward": 3.336127519607544, "reward_std": 0.6081538200378418, "rewards/accuracy_reward/mean": 2.6875, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.648627519607544, "rewards/ngram_similarity_reward/std": 0.41256552934646606, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 411.5625, "completions/mean_terminated_length": 411.5625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.35488923696576413, "frac_reward_zero_std": 0.0, "grad_norm": 0.07198772579431534, "learning_rate": 4.804211362141267e-06, "loss": 0.0334, "num_tokens": 126462888.0, "reward": 4.9459052085876465, "reward_std": 0.7472313642501831, "rewards/accuracy_reward/mean": 4.1875, "rewards/accuracy_reward/std": 2.5, "rewards/ngram_similarity_reward/mean": 0.7584052085876465, "rewards/ngram_similarity_reward/std": 0.2952753007411957, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 369.765625, "completions/mean_terminated_length": 369.765625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.3553367643768181, "frac_reward_zero_std": 0.0, "grad_norm": 0.08004353940486908, "learning_rate": 4.8035315799912404e-06, "loss": -0.0228, "num_tokens": 126599897.0, "reward": 3.7750284671783447, "reward_std": 1.6776546239852905, "rewards/accuracy_reward/mean": 3.09375, "rewards/accuracy_reward/std": 3.037954330444336, "rewards/ngram_similarity_reward/mean": 0.6812787055969238, "rewards/ngram_similarity_reward/std": 0.440616250038147, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 472.640625, "completions/mean_terminated_length": 472.640625, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.355784291787872, "frac_reward_zero_std": 0.0, "grad_norm": 0.07230333983898163, "learning_rate": 4.802850673647553e-06, "loss": 0.0271, "num_tokens": 126781618.0, "reward": 3.80275297164917, "reward_std": 1.2974958419799805, "rewards/accuracy_reward/mean": 3.0625, "rewards/accuracy_reward/std": 2.9700891971588135, "rewards/ngram_similarity_reward/mean": 0.7402530908584595, "rewards/ngram_similarity_reward/std": 0.3653397262096405, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 970.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 508.1875, "completions/mean_terminated_length": 508.1875, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.35623181919892594, "frac_reward_zero_std": 0.0, "grad_norm": 0.06609354168176651, "learning_rate": 4.802168643482963e-06, "loss": -0.0023, "num_tokens": 126959294.0, "reward": 2.959211826324463, "reward_std": 1.0579187870025635, "rewards/accuracy_reward/mean": 2.3125, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.6467118263244629, "rewards/ngram_similarity_reward/std": 0.281398743391037, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 454.859375, "completions/mean_terminated_length": 454.859375, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.35667934660997985, "frac_reward_zero_std": 0.0, "grad_norm": 0.06357131153345108, "learning_rate": 4.801485489870845e-06, "loss": -0.0384, "num_tokens": 127091813.0, "reward": 2.302091598510742, "reward_std": 2.009054183959961, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 2.872281312942505, "rewards/ngram_similarity_reward/mean": 0.7395914793014526, "rewards/ngram_similarity_reward/std": 0.25282952189445496, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 348.390625, "completions/mean_terminated_length": 348.390625, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.35712687402103377, "frac_reward_zero_std": 0.0, "grad_norm": 0.07078349590301514, "learning_rate": 4.800801213185184e-06, "loss": 0.0286, "num_tokens": 127249374.0, "reward": 4.6454057693481445, "reward_std": 1.6441264152526855, "rewards/accuracy_reward/mean": 4.09375, "rewards/accuracy_reward/std": 2.5617377758026123, "rewards/ngram_similarity_reward/mean": 0.5516562461853027, "rewards/ngram_similarity_reward/std": 0.34286314249038696, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 495.890625, "completions/mean_terminated_length": 495.890625, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.35757440143208774, "frac_reward_zero_std": 0.0, "grad_norm": 0.0576631985604763, "learning_rate": 4.800115813800587e-06, "loss": 0.034, "num_tokens": 127385719.0, "reward": 5.4827423095703125, "reward_std": 0.9034347534179688, "rewards/accuracy_reward/mean": 4.9375, "rewards/accuracy_reward/std": 1.7627090215682983, "rewards/ngram_similarity_reward/mean": 0.5452421307563782, "rewards/ngram_similarity_reward/std": 0.347225159406662, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 453.78125, "completions/mean_terminated_length": 453.78125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.35802192884314166, "frac_reward_zero_std": 0.0, "grad_norm": 0.06489190459251404, "learning_rate": 4.799429292092272e-06, "loss": 0.0004, "num_tokens": 127558409.0, "reward": 3.6968436241149902, "reward_std": 1.5084903240203857, "rewards/accuracy_reward/mean": 3.078125, "rewards/accuracy_reward/std": 3.0592284202575684, "rewards/ngram_similarity_reward/mean": 0.6187184453010559, "rewards/ngram_similarity_reward/std": 0.39123642444610596, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 436.109375, "completions/mean_terminated_length": 436.109375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.3584694562541956, "frac_reward_zero_std": 0.0, "grad_norm": 0.06589549034833908, "learning_rate": 4.798741648436068e-06, "loss": 0.0169, "num_tokens": 127696000.0, "reward": 3.2289295196533203, "reward_std": 0.13976755738258362, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.7289294600486755, "rewards/ngram_similarity_reward/std": 0.22076164186000824, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 429.1875, "completions/mean_terminated_length": 429.1875, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.3589169836652495, "frac_reward_zero_std": 0.0, "grad_norm": 0.07002793252468109, "learning_rate": 4.798052883208424e-06, "loss": 0.0133, "num_tokens": 127861036.0, "reward": 4.147583484649658, "reward_std": 1.162605881690979, "rewards/accuracy_reward/mean": 3.6875, "rewards/accuracy_reward/std": 2.816476583480835, "rewards/ngram_similarity_reward/mean": 0.46008336544036865, "rewards/ngram_similarity_reward/std": 0.39208683371543884, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 372.09375, "completions/mean_terminated_length": 372.09375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.3593645110763034, "frac_reward_zero_std": 0.25, "grad_norm": 0.07022594660520554, "learning_rate": 4.797362996786398e-06, "loss": -0.0101, "num_tokens": 128016226.0, "reward": 4.534200668334961, "reward_std": 0.6052448153495789, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.72170090675354, "rewards/ngram_similarity_reward/std": 0.3617556691169739, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 381.140625, "completions/mean_terminated_length": 381.140625, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.3598120384873574, "frac_reward_zero_std": 0.0, "grad_norm": 0.09523069113492966, "learning_rate": 4.796671989547667e-06, "loss": 0.036, "num_tokens": 128213915.0, "reward": 1.9837148189544678, "reward_std": 0.841371476650238, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 2.872281312942505, "rewards/ngram_similarity_reward/mean": 0.42121487855911255, "rewards/ngram_similarity_reward/std": 0.22523535788059235, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 464.53125, "completions/mean_terminated_length": 464.53125, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.3602595658984113, "frac_reward_zero_std": 0.0, "grad_norm": 0.06792967766523361, "learning_rate": 4.795979861870517e-06, "loss": -0.0196, "num_tokens": 128337133.0, "reward": 3.660709857940674, "reward_std": 1.4073338508605957, "rewards/accuracy_reward/mean": 3.03125, "rewards/accuracy_reward/std": 3.0130341053009033, "rewards/ngram_similarity_reward/mean": 0.6294599771499634, "rewards/ngram_similarity_reward/std": 0.300203412771225, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 580.90625, "completions/mean_terminated_length": 580.90625, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.3607070933094652, "frac_reward_zero_std": 0.0, "grad_norm": 0.05525697395205498, "learning_rate": 4.79528661413385e-06, "loss": -0.0096, "num_tokens": 128487687.0, "reward": 4.130392074584961, "reward_std": 0.8165697455406189, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.5053920745849609, "rewards/ngram_similarity_reward/std": 0.29331842064857483, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 709.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 446.96875, "completions/mean_terminated_length": 446.96875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.3611546207205191, "frac_reward_zero_std": 0.0, "grad_norm": 0.06297452002763748, "learning_rate": 4.79459224671718e-06, "loss": -0.0148, "num_tokens": 128631093.0, "reward": 3.1118602752685547, "reward_std": 0.8096895217895508, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6118603348731995, "rewards/ngram_similarity_reward/std": 0.240114226937294, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 371.265625, "completions/mean_terminated_length": 371.265625, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.36160214813157304, "frac_reward_zero_std": 0.0, "grad_norm": 0.07497435808181763, "learning_rate": 4.7938967600006345e-06, "loss": -0.0025, "num_tokens": 128764758.0, "reward": 3.28179931640625, "reward_std": 1.3945362567901611, "rewards/accuracy_reward/mean": 2.96875, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.3130495250225067, "rewards/ngram_similarity_reward/std": 0.24311913549900055, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 385.6875, "completions/mean_terminated_length": 385.6875, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.36204967554262696, "frac_reward_zero_std": 0.0, "grad_norm": 0.07288374751806259, "learning_rate": 4.793200154364952e-06, "loss": 0.0016, "num_tokens": 128902994.0, "reward": 5.032406806945801, "reward_std": 0.857367217540741, "rewards/accuracy_reward/mean": 4.46875, "rewards/accuracy_reward/std": 2.2815253734588623, "rewards/ngram_similarity_reward/mean": 0.5636566877365112, "rewards/ngram_similarity_reward/std": 0.2474435567855835, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 495.0, "completions/mean_terminated_length": 495.0, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.36249720295368093, "frac_reward_zero_std": 0.0, "grad_norm": 0.060755811631679535, "learning_rate": 4.792502430191489e-06, "loss": -0.035, "num_tokens": 129072722.0, "reward": 5.3756914138793945, "reward_std": 1.7483309507369995, "rewards/accuracy_reward/mean": 4.5625, "rewards/accuracy_reward/std": 2.195775270462036, "rewards/ngram_similarity_reward/mean": 0.8131917119026184, "rewards/ngram_similarity_reward/std": 0.18953640758991241, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1525.0, "completions/max_terminated_length": 1525.0, "completions/mean_length": 508.984375, "completions/mean_terminated_length": 508.984375, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.36294473036473485, "frac_reward_zero_std": 0.0, "grad_norm": 0.10541582852602005, "learning_rate": 4.791803587862207e-06, "loss": 0.093, "num_tokens": 129302561.0, "reward": 4.274901866912842, "reward_std": 1.602287769317627, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.4624021053314209, "rewards/ngram_similarity_reward/std": 0.3578868508338928, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 402.828125, "completions/mean_terminated_length": 402.828125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.36339225777578876, "frac_reward_zero_std": 0.0, "grad_norm": 0.06787524372339249, "learning_rate": 4.791103627759684e-06, "loss": -0.004, "num_tokens": 129464822.0, "reward": 6.206984996795654, "reward_std": 0.16916683316230774, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.7069849967956543, "rewards/ngram_similarity_reward/std": 0.307819128036499, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 426.265625, "completions/mean_terminated_length": 426.265625, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.3638397851868427, "frac_reward_zero_std": 0.0, "grad_norm": 0.08130761981010437, "learning_rate": 4.7904025502671085e-06, "loss": 0.0069, "num_tokens": 129604519.0, "reward": 2.542485237121582, "reward_std": 0.8359096646308899, "rewards/accuracy_reward/mean": 2.03125, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.5112350583076477, "rewards/ngram_similarity_reward/std": 0.2730526328086853, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 417.15625, "completions/mean_terminated_length": 417.15625, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.3642873125978966, "frac_reward_zero_std": 0.0, "grad_norm": 0.07702480256557465, "learning_rate": 4.789700355768283e-06, "loss": -0.0035, "num_tokens": 129739105.0, "reward": 3.377739906311035, "reward_std": 1.9412826299667358, "rewards/accuracy_reward/mean": 2.78125, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.5964900851249695, "rewards/ngram_similarity_reward/std": 0.3124755024909973, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 324.546875, "completions/mean_terminated_length": 324.546875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.36473484000895057, "frac_reward_zero_std": 0.0, "grad_norm": 0.09259375929832458, "learning_rate": 4.788997044647618e-06, "loss": -0.0038, "num_tokens": 129846964.0, "reward": 3.172454833984375, "reward_std": 1.0749475955963135, "rewards/accuracy_reward/mean": 2.6875, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.4849551022052765, "rewards/ngram_similarity_reward/std": 0.23124173283576965, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 392.484375, "completions/mean_terminated_length": 392.484375, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.3651823674200045, "frac_reward_zero_std": 0.0, "grad_norm": 0.07372072339057922, "learning_rate": 4.788292617290137e-06, "loss": 0.0082, "num_tokens": 129981875.0, "reward": 2.8951284885406494, "reward_std": 0.7518053650856018, "rewards/accuracy_reward/mean": 2.21875, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.6763784885406494, "rewards/ngram_similarity_reward/std": 0.3303958475589752, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 478.296875, "completions/mean_terminated_length": 478.296875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.3656298948310584, "frac_reward_zero_std": 0.25, "grad_norm": 0.050354402512311935, "learning_rate": 4.787587074081476e-06, "loss": 0.0352, "num_tokens": 130164454.0, "reward": 4.420853614807129, "reward_std": 0.7614438533782959, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.7958537340164185, "rewards/ngram_similarity_reward/std": 0.4121065139770508, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 389.296875, "completions/mean_terminated_length": 389.296875, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.3660774222421123, "frac_reward_zero_std": 0.0, "grad_norm": 0.0717056393623352, "learning_rate": 4.786880415407879e-06, "loss": -0.0265, "num_tokens": 130348473.0, "reward": 3.9296135902404785, "reward_std": 1.0928839445114136, "rewards/accuracy_reward/mean": 3.125, "rewards/accuracy_reward/std": 3.1040170192718506, "rewards/ngram_similarity_reward/mean": 0.8046135306358337, "rewards/ngram_similarity_reward/std": 0.3383404314517975, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 471.6875, "completions/mean_terminated_length": 471.6875, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.36652494965316623, "frac_reward_zero_std": 0.0, "grad_norm": 0.06714696437120438, "learning_rate": 4.786172641656203e-06, "loss": 0.0059, "num_tokens": 130577109.0, "reward": 2.7257204055786133, "reward_std": 0.6096923351287842, "rewards/accuracy_reward/mean": 2.3125, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.41322028636932373, "rewards/ngram_similarity_reward/std": 0.16564498841762543, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 446.328125, "completions/mean_terminated_length": 446.328125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.3669724770642202, "frac_reward_zero_std": 0.0, "grad_norm": 0.07103020697832108, "learning_rate": 4.785463753213914e-06, "loss": 0.0415, "num_tokens": 130805866.0, "reward": 4.421981334686279, "reward_std": 1.171364188194275, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.796981692314148, "rewards/ngram_similarity_reward/std": 0.306751549243927, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/max_terminated_length": 610.0, "completions/mean_length": 444.984375, "completions/mean_terminated_length": 444.984375, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.3674200044752741, "frac_reward_zero_std": 0.0, "grad_norm": 0.08048597723245621, "learning_rate": 4.784753750469089e-06, "loss": -0.0204, "num_tokens": 130960793.0, "reward": 2.020061492919922, "reward_std": 0.7841981649398804, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.6450613737106323, "rewards/ngram_similarity_reward/std": 0.3096904754638672, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 398.796875, "completions/mean_terminated_length": 398.796875, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.36786753188632804, "frac_reward_zero_std": 0.0, "grad_norm": 0.0813199058175087, "learning_rate": 4.784042633810414e-06, "loss": 0.0313, "num_tokens": 131113244.0, "reward": 4.319128036499023, "reward_std": 0.8010765910148621, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.6941279172897339, "rewards/ngram_similarity_reward/std": 0.2977048456668854, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 430.421875, "completions/mean_terminated_length": 430.421875, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.36831505929738195, "frac_reward_zero_std": 0.0, "grad_norm": 0.08103030920028687, "learning_rate": 4.783330403627188e-06, "loss": -0.0209, "num_tokens": 131249975.0, "reward": 4.559375762939453, "reward_std": 0.19386625289916992, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5593758821487427, "rewards/ngram_similarity_reward/std": 0.3220345675945282, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 410.34375, "completions/mean_terminated_length": 410.34375, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.36876258670843587, "frac_reward_zero_std": 0.0, "grad_norm": 0.07567102462053299, "learning_rate": 4.782617060309314e-06, "loss": 0.0129, "num_tokens": 131397981.0, "reward": 4.7222490310668945, "reward_std": 0.1576964557170868, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7222490310668945, "rewards/ngram_similarity_reward/std": 0.2786788046360016, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 420.359375, "completions/mean_terminated_length": 420.359375, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.36921011411948984, "frac_reward_zero_std": 0.25, "grad_norm": 0.06875015050172806, "learning_rate": 4.7819026042473095e-06, "loss": 0.0181, "num_tokens": 131562948.0, "reward": 3.585540771484375, "reward_std": 0.8135175108909607, "rewards/accuracy_reward/mean": 2.875, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.7105408310890198, "rewards/ngram_similarity_reward/std": 0.3925935626029968, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 475.0, "completions/mean_terminated_length": 475.0, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.36965764153054376, "frac_reward_zero_std": 0.0, "grad_norm": 0.05377297103404999, "learning_rate": 4.7811870358322985e-06, "loss": 0.0007, "num_tokens": 131682884.0, "reward": 4.313448905944824, "reward_std": 1.2124178409576416, "rewards/accuracy_reward/mean": 3.53125, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.7821989059448242, "rewards/ngram_similarity_reward/std": 0.35354679822921753, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 436.78125, "completions/mean_terminated_length": 436.78125, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.3701051689415977, "frac_reward_zero_std": 0.0, "grad_norm": 0.0779440850019455, "learning_rate": 4.780470355456015e-06, "loss": 0.0133, "num_tokens": 131845030.0, "reward": 4.533501625061035, "reward_std": 0.4481460452079773, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.6272518634796143, "rewards/ngram_similarity_reward/std": 0.2969803512096405, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 454.671875, "completions/mean_terminated_length": 454.671875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.3705526963526516, "frac_reward_zero_std": 0.0, "grad_norm": 0.07512392848730087, "learning_rate": 4.779752563510802e-06, "loss": 0.0025, "num_tokens": 132009345.0, "reward": 1.7263811826705933, "reward_std": 0.6355293989181519, "rewards/accuracy_reward/mean": 1.171875, "rewards/accuracy_reward/std": 2.7316761016845703, "rewards/ngram_similarity_reward/mean": 0.554506242275238, "rewards/ngram_similarity_reward/std": 0.37061354517936707, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 420.96875, "completions/mean_terminated_length": 420.96875, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.3710002237637055, "frac_reward_zero_std": 0.0, "grad_norm": 0.07953844219446182, "learning_rate": 4.779033660389609e-06, "loss": 0.0005, "num_tokens": 132149967.0, "reward": 1.5752811431884766, "reward_std": 0.841187059879303, "rewards/accuracy_reward/mean": 1.171875, "rewards/accuracy_reward/std": 2.7316761016845703, "rewards/ngram_similarity_reward/mean": 0.40340620279312134, "rewards/ngram_similarity_reward/std": 0.2614307105541229, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 383.609375, "completions/mean_terminated_length": 383.609375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.3714477511747595, "frac_reward_zero_std": 0.25, "grad_norm": 0.05196192115545273, "learning_rate": 4.7783136464859955e-06, "loss": 0.0109, "num_tokens": 132311542.0, "reward": 6.148770809173584, "reward_std": 0.6349098086357117, "rewards/accuracy_reward/mean": 5.21875, "rewards/accuracy_reward/std": 1.2782522439956665, "rewards/ngram_similarity_reward/mean": 0.9300208687782288, "rewards/ngram_similarity_reward/std": 0.2650148272514343, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 570.96875, "completions/mean_terminated_length": 547.5238647460938, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.3718952785858134, "frac_reward_zero_std": 0.0, "grad_norm": 0.05970247462391853, "learning_rate": 4.77759252219413e-06, "loss": 0.0341, "num_tokens": 132453316.0, "reward": 4.687633037567139, "reward_std": 0.1622379720211029, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6876329183578491, "rewards/ngram_similarity_reward/std": 0.3011060059070587, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 462.34375, "completions/mean_terminated_length": 462.34375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.3723428059968673, "frac_reward_zero_std": 0.0, "grad_norm": 0.07145772129297256, "learning_rate": 4.776870287908788e-06, "loss": 0.0045, "num_tokens": 132643898.0, "reward": 3.6631765365600586, "reward_std": 1.224141001701355, "rewards/accuracy_reward/mean": 3.046875, "rewards/accuracy_reward/std": 2.991680145263672, "rewards/ngram_similarity_reward/mean": 0.6163015365600586, "rewards/ngram_similarity_reward/std": 0.2998967170715332, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 489.328125, "completions/mean_terminated_length": 489.328125, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.3727903334079212, "frac_reward_zero_std": 0.0, "grad_norm": 0.07153812050819397, "learning_rate": 4.776146944025351e-06, "loss": -0.0005, "num_tokens": 132822335.0, "reward": 4.376694679260254, "reward_std": 1.0175447463989258, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.657944917678833, "rewards/ngram_similarity_reward/std": 0.3186005651950836, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 458.421875, "completions/mean_terminated_length": 458.421875, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.37323786081897514, "frac_reward_zero_std": 0.0, "grad_norm": 0.0736360028386116, "learning_rate": 4.775422490939809e-06, "loss": -0.0249, "num_tokens": 132967290.0, "reward": 3.1810238361358643, "reward_std": 0.1251983940601349, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6810238361358643, "rewards/ngram_similarity_reward/std": 0.30840054154396057, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 444.46875, "completions/mean_terminated_length": 444.46875, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.3736853882300291, "frac_reward_zero_std": 0.0, "grad_norm": 0.08364483714103699, "learning_rate": 4.774696929048761e-06, "loss": -0.0182, "num_tokens": 133178968.0, "reward": 5.657986640930176, "reward_std": 1.2540444135665894, "rewards/accuracy_reward/mean": 5.015625, "rewards/accuracy_reward/std": 1.68081796169281, "rewards/ngram_similarity_reward/mean": 0.6423616409301758, "rewards/ngram_similarity_reward/std": 0.37156593799591064, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 424.53125, "completions/mean_terminated_length": 424.53125, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.37413291564108303, "frac_reward_zero_std": 0.0, "grad_norm": 0.08677200227975845, "learning_rate": 4.7739702587494105e-06, "loss": 0.0051, "num_tokens": 133359402.0, "reward": 1.4438471794128418, "reward_std": 0.5690076947212219, "rewards/accuracy_reward/mean": 1.0625, "rewards/accuracy_reward/std": 2.695528507232666, "rewards/ngram_similarity_reward/mean": 0.3813472390174866, "rewards/ngram_similarity_reward/std": 0.25122034549713135, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 552.40625, "completions/mean_terminated_length": 552.40625, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.37458044305213695, "frac_reward_zero_std": 0.0, "grad_norm": 0.06587916612625122, "learning_rate": 4.77324248043957e-06, "loss": 0.0143, "num_tokens": 133519300.0, "reward": 6.079070091247559, "reward_std": 0.2363794445991516, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.579070508480072, "rewards/ngram_similarity_reward/std": 0.33267608284950256, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 957.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 520.453125, "completions/mean_terminated_length": 520.453125, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.37502797046319086, "frac_reward_zero_std": 0.25, "grad_norm": 0.05705634877085686, "learning_rate": 4.7725135945176545e-06, "loss": -0.0057, "num_tokens": 133698721.0, "reward": 4.572680950164795, "reward_std": 0.8313631415367126, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5726807713508606, "rewards/ngram_similarity_reward/std": 0.42959290742874146, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 367.5625, "completions/mean_terminated_length": 367.5625, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.3754754978742448, "frac_reward_zero_std": 0.0, "grad_norm": 0.09058309346437454, "learning_rate": 4.771783601382693e-06, "loss": 0.0048, "num_tokens": 133819445.0, "reward": 3.2510876655578613, "reward_std": 0.20160742104053497, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.7510875463485718, "rewards/ngram_similarity_reward/std": 0.34454336762428284, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 370.3125, "completions/mean_terminated_length": 370.3125, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.37592302528529875, "frac_reward_zero_std": 0.0, "grad_norm": 0.07292758673429489, "learning_rate": 4.771052501434311e-06, "loss": 0.0127, "num_tokens": 133989113.0, "reward": 3.195587635040283, "reward_std": 0.4828079342842102, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.7893376350402832, "rewards/ngram_similarity_reward/std": 0.20010864734649658, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 388.28125, "completions/mean_terminated_length": 388.28125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.37637055269635267, "frac_reward_zero_std": 0.0, "grad_norm": 0.07030453532934189, "learning_rate": 4.770320295072748e-06, "loss": -0.0096, "num_tokens": 134169947.0, "reward": 5.593858242034912, "reward_std": 1.3770513534545898, "rewards/accuracy_reward/mean": 4.84375, "rewards/accuracy_reward/std": 1.8874586820602417, "rewards/ngram_similarity_reward/mean": 0.7501083016395569, "rewards/ngram_similarity_reward/std": 0.3666737973690033, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 516.5, "completions/mean_terminated_length": 516.5, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.3768180801074066, "frac_reward_zero_std": 0.0, "grad_norm": 0.06108309328556061, "learning_rate": 4.769586982698845e-06, "loss": 0.0281, "num_tokens": 134319675.0, "reward": 4.372067451477051, "reward_std": 1.1060233116149902, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.7470671534538269, "rewards/ngram_similarity_reward/std": 0.2952858507633209, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 488.125, "completions/mean_terminated_length": 488.125, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.3772656075184605, "frac_reward_zero_std": 0.0, "grad_norm": 0.06603636592626572, "learning_rate": 4.768852564714049e-06, "loss": 0.006, "num_tokens": 134473635.0, "reward": 4.100539207458496, "reward_std": 0.8879941701889038, "rewards/accuracy_reward/mean": 3.4375, "rewards/accuracy_reward/std": 2.872281312942505, "rewards/ngram_similarity_reward/mean": 0.6630393266677856, "rewards/ngram_similarity_reward/std": 0.257058322429657, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 379.78125, "completions/mean_terminated_length": 379.78125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.3777131349295144, "frac_reward_zero_std": 0.0, "grad_norm": 0.08025288581848145, "learning_rate": 4.768117041520414e-06, "loss": 0.0109, "num_tokens": 134636229.0, "reward": 5.075590133666992, "reward_std": 1.9024856090545654, "rewards/accuracy_reward/mean": 4.375, "rewards/accuracy_reward/std": 2.3603873252868652, "rewards/ngram_similarity_reward/mean": 0.7005902528762817, "rewards/ngram_similarity_reward/std": 0.2098340094089508, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1139.0, "completions/max_terminated_length": 1139.0, "completions/mean_length": 499.015625, "completions/mean_terminated_length": 499.015625, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.37816066234056833, "frac_reward_zero_std": 0.0, "grad_norm": 0.06738443672657013, "learning_rate": 4.767380413520598e-06, "loss": 0.0734, "num_tokens": 134835254.0, "reward": 2.6038081645965576, "reward_std": 1.4416784048080444, "rewards/accuracy_reward/mean": 2.125, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.4788079857826233, "rewards/ngram_similarity_reward/std": 0.2530724108219147, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 495.0625, "completions/mean_terminated_length": 495.0625, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.3786081897516223, "frac_reward_zero_std": 0.0, "grad_norm": 0.05641628056764603, "learning_rate": 4.766642681117862e-06, "loss": -0.0054, "num_tokens": 134958810.0, "reward": 4.782063961029053, "reward_std": 0.1804514229297638, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7820637226104736, "rewards/ngram_similarity_reward/std": 0.2899869680404663, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 379.15625, "completions/mean_terminated_length": 379.15625, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.3790557171626762, "frac_reward_zero_std": 0.25, "grad_norm": 0.06728257983922958, "learning_rate": 4.7659038447160735e-06, "loss": 0.0059, "num_tokens": 135081236.0, "reward": 6.2020463943481445, "reward_std": 0.6189609169960022, "rewards/accuracy_reward/mean": 5.3125, "rewards/accuracy_reward/std": 1.0522085428237915, "rewards/ngram_similarity_reward/mean": 0.8895463943481445, "rewards/ngram_similarity_reward/std": 0.23006707429885864, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 472.890625, "completions/mean_terminated_length": 472.890625, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.37950324457373014, "frac_reward_zero_std": 0.0, "grad_norm": 0.06494006514549255, "learning_rate": 4.7651639047197045e-06, "loss": -0.0379, "num_tokens": 135247101.0, "reward": 5.4059553146362305, "reward_std": 0.9737482070922852, "rewards/accuracy_reward/mean": 4.75, "rewards/accuracy_reward/std": 2.0, "rewards/ngram_similarity_reward/mean": 0.65595543384552, "rewards/ngram_similarity_reward/std": 0.3362525999546051, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 512.5625, "completions/mean_terminated_length": 512.5625, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.37995077198478405, "frac_reward_zero_std": 0.0, "grad_norm": 0.058171700686216354, "learning_rate": 4.764422861533832e-06, "loss": -0.0188, "num_tokens": 135407329.0, "reward": 5.091101169586182, "reward_std": 0.7125576138496399, "rewards/accuracy_reward/mean": 4.28125, "rewards/accuracy_reward/std": 2.4330317974090576, "rewards/ngram_similarity_reward/mean": 0.8098511695861816, "rewards/ngram_similarity_reward/std": 0.21350127458572388, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 425.890625, "completions/mean_terminated_length": 425.890625, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.38039829939583797, "frac_reward_zero_std": 0.0, "grad_norm": 0.07442991435527802, "learning_rate": 4.763680715564134e-06, "loss": -0.0164, "num_tokens": 135559594.0, "reward": 3.915182113647461, "reward_std": 1.5784138441085815, "rewards/accuracy_reward/mean": 3.109375, "rewards/accuracy_reward/std": 3.125000238418579, "rewards/ngram_similarity_reward/mean": 0.8058068752288818, "rewards/ngram_similarity_reward/std": 0.26200681924819946, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1321.0, "completions/max_terminated_length": 1321.0, "completions/mean_length": 561.984375, "completions/mean_terminated_length": 561.984375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.38084582680689194, "frac_reward_zero_std": 0.0, "grad_norm": 0.058048561215400696, "learning_rate": 4.762937467216894e-06, "loss": -0.0308, "num_tokens": 135688793.0, "reward": 4.483916282653809, "reward_std": 0.10813301801681519, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.4839160442352295, "rewards/ngram_similarity_reward/std": 0.26713013648986816, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 333.640625, "completions/mean_terminated_length": 333.640625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.38129335421794586, "frac_reward_zero_std": 0.25, "grad_norm": 0.07030487060546875, "learning_rate": 4.762193116898999e-06, "loss": -0.0266, "num_tokens": 135926914.0, "reward": 2.676629066467285, "reward_std": 0.8994293808937073, "rewards/accuracy_reward/mean": 1.9375, "rewards/accuracy_reward/std": 2.9700891971588135, "rewards/ngram_similarity_reward/mean": 0.7391289472579956, "rewards/ngram_similarity_reward/std": 0.2241910994052887, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 481.8125, "completions/mean_terminated_length": 481.8125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.3817408816289998, "frac_reward_zero_std": 0.0, "grad_norm": 0.07144621759653091, "learning_rate": 4.761447665017941e-06, "loss": -0.0084, "num_tokens": 136068998.0, "reward": 2.857234001159668, "reward_std": 1.1894598007202148, "rewards/accuracy_reward/mean": 2.265625, "rewards/accuracy_reward/std": 3.0692648887634277, "rewards/ngram_similarity_reward/mean": 0.591609001159668, "rewards/ngram_similarity_reward/std": 0.3215799927711487, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 845.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 495.328125, "completions/mean_terminated_length": 495.328125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.3821884090400537, "frac_reward_zero_std": 0.25, "grad_norm": 0.06216570734977722, "learning_rate": 4.760701111981811e-06, "loss": -0.0139, "num_tokens": 136221339.0, "reward": 4.3738694190979, "reward_std": 0.703353226184845, "rewards/accuracy_reward/mean": 3.765625, "rewards/accuracy_reward/std": 2.8015992641448975, "rewards/ngram_similarity_reward/mean": 0.6082445383071899, "rewards/ngram_similarity_reward/std": 0.5324356555938721, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 380.296875, "completions/mean_terminated_length": 380.296875, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.3826359364511076, "frac_reward_zero_std": 0.0, "grad_norm": 0.08928291499614716, "learning_rate": 4.759953458199306e-06, "loss": 0.0081, "num_tokens": 136367230.0, "reward": 3.6769230365753174, "reward_std": 1.3307888507843018, "rewards/accuracy_reward/mean": 3.0625, "rewards/accuracy_reward/std": 2.9700891971588135, "rewards/ngram_similarity_reward/mean": 0.6144229173660278, "rewards/ngram_similarity_reward/std": 0.22901320457458496, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 419.390625, "completions/mean_terminated_length": 419.390625, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.3830834638621616, "frac_reward_zero_std": 0.0, "grad_norm": 0.06559686362743378, "learning_rate": 4.759204704079724e-06, "loss": -0.0055, "num_tokens": 136505703.0, "reward": 2.4887826442718506, "reward_std": 0.9327364563941956, "rewards/accuracy_reward/mean": 1.921875, "rewards/accuracy_reward/std": 2.9857051372528076, "rewards/ngram_similarity_reward/mean": 0.5669077634811401, "rewards/ngram_similarity_reward/std": 0.41834351420402527, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 407.828125, "completions/mean_terminated_length": 407.828125, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.3835309912732155, "frac_reward_zero_std": 0.0, "grad_norm": 0.07738467305898666, "learning_rate": 4.7584548500329654e-06, "loss": 0.0314, "num_tokens": 136730732.0, "reward": 4.649050712585449, "reward_std": 0.19639216363430023, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6490504741668701, "rewards/ngram_similarity_reward/std": 0.28024277091026306, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 377.53125, "completions/mean_terminated_length": 377.53125, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.3839785186842694, "frac_reward_zero_std": 0.0, "grad_norm": 0.08628908544778824, "learning_rate": 4.757703896469535e-06, "loss": 0.0016, "num_tokens": 136854414.0, "reward": 4.2408447265625, "reward_std": 0.7140259742736816, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.5220947265625, "rewards/ngram_similarity_reward/std": 0.2464819848537445, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 735.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 447.328125, "completions/mean_terminated_length": 447.328125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.3844260460953233, "frac_reward_zero_std": 0.0, "grad_norm": 0.06706064939498901, "learning_rate": 4.756951843800537e-06, "loss": 0.011, "num_tokens": 136972851.0, "reward": 5.8076677322387695, "reward_std": 0.831523060798645, "rewards/accuracy_reward/mean": 5.125, "rewards/accuracy_reward/std": 1.4638501405715942, "rewards/ngram_similarity_reward/mean": 0.68266761302948, "rewards/ngram_similarity_reward/std": 0.4277319610118866, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 464.859375, "completions/mean_terminated_length": 464.859375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.38487357350637724, "frac_reward_zero_std": 0.0, "grad_norm": 0.07416027039289474, "learning_rate": 4.756198692437679e-06, "loss": -0.0561, "num_tokens": 137198074.0, "reward": 4.66461706161499, "reward_std": 0.14941942691802979, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6646170616149902, "rewards/ngram_similarity_reward/std": 0.4006292223930359, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 423.59375, "completions/mean_terminated_length": 423.59375, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.3853211009174312, "frac_reward_zero_std": 0.0, "grad_norm": 0.0800650343298912, "learning_rate": 4.755444442793269e-06, "loss": -0.0034, "num_tokens": 137372336.0, "reward": 3.025587320327759, "reward_std": 0.53461754322052, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.619337260723114, "rewards/ngram_similarity_reward/std": 0.2948019504547119, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 456.953125, "completions/mean_terminated_length": 456.953125, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.38576862832848513, "frac_reward_zero_std": 0.0, "grad_norm": 0.07621988654136658, "learning_rate": 4.754689095280214e-06, "loss": 0.0054, "num_tokens": 137511597.0, "reward": 4.9953718185424805, "reward_std": 1.2807860374450684, "rewards/accuracy_reward/mean": 4.46875, "rewards/accuracy_reward/std": 2.2815253734588623, "rewards/ngram_similarity_reward/mean": 0.5266216993331909, "rewards/ngram_similarity_reward/std": 0.2767489552497864, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 406.21875, "completions/mean_terminated_length": 406.21875, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.38621615573953905, "frac_reward_zero_std": 0.25, "grad_norm": 0.05878997966647148, "learning_rate": 4.753932650312028e-06, "loss": -0.0025, "num_tokens": 137692715.0, "reward": 2.9865684509277344, "reward_std": 0.4927492141723633, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.5803183913230896, "rewards/ngram_similarity_reward/std": 0.32269957661628723, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 388.03125, "completions/mean_terminated_length": 388.03125, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.38666368315059296, "frac_reward_zero_std": 0.0, "grad_norm": 0.08189192414283752, "learning_rate": 4.753175108302821e-06, "loss": -0.0043, "num_tokens": 137837405.0, "reward": 0.13617974519729614, "reward_std": 0.46592044830322266, "rewards/accuracy_reward/mean": -0.421875, "rewards/accuracy_reward/std": 0.7622999548912048, "rewards/ngram_similarity_reward/mean": 0.5580548048019409, "rewards/ngram_similarity_reward/std": 0.36090996861457825, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 396.96875, "completions/mean_terminated_length": 396.96875, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.3871112105616469, "frac_reward_zero_std": 0.25, "grad_norm": 0.06704279780387878, "learning_rate": 4.7524164696673035e-06, "loss": 0.0149, "num_tokens": 138019051.0, "reward": 4.136058807373047, "reward_std": 1.4905953407287598, "rewards/accuracy_reward/mean": 3.4375, "rewards/accuracy_reward/std": 2.872281312942505, "rewards/ngram_similarity_reward/mean": 0.698559045791626, "rewards/ngram_similarity_reward/std": 0.30539950728416443, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 419.5, "completions/mean_terminated_length": 419.5, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.38755873797270085, "frac_reward_zero_std": 0.0, "grad_norm": 0.0700206458568573, "learning_rate": 4.75165673482079e-06, "loss": 0.0044, "num_tokens": 138205643.0, "reward": 1.98746919631958, "reward_std": 0.6994311809539795, "rewards/accuracy_reward/mean": 1.28125, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.7062191367149353, "rewards/ngram_similarity_reward/std": 0.14535358548164368, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 431.546875, "completions/mean_terminated_length": 431.546875, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.38800626538375477, "frac_reward_zero_std": 0.0, "grad_norm": 0.07644589245319366, "learning_rate": 4.750895904179191e-06, "loss": 0.0037, "num_tokens": 138355630.0, "reward": 4.456112384796143, "reward_std": 1.1625944375991821, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.4561123549938202, "rewards/ngram_similarity_reward/std": 0.2667543292045593, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 428.0625, "completions/mean_terminated_length": 428.0625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.3884537927948087, "frac_reward_zero_std": 0.0, "grad_norm": 0.07240907847881317, "learning_rate": 4.75013397815902e-06, "loss": -0.0122, "num_tokens": 138484930.0, "reward": 4.688074111938477, "reward_std": 0.14621832966804504, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.688073992729187, "rewards/ngram_similarity_reward/std": 0.2483687698841095, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 377.15625, "completions/mean_terminated_length": 377.15625, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.3889013202058626, "frac_reward_zero_std": 0.0, "grad_norm": 0.09912554919719696, "learning_rate": 4.7493709571773875e-06, "loss": 0.0051, "num_tokens": 138620940.0, "reward": 3.0040464401245117, "reward_std": 0.5850521326065063, "rewards/accuracy_reward/mean": 2.484375, "rewards/accuracy_reward/std": 3.1496646404266357, "rewards/ngram_similarity_reward/mean": 0.5196714401245117, "rewards/ngram_similarity_reward/std": 0.3500573933124542, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 339.40625, "completions/mean_terminated_length": 339.40625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.3893488476169165, "frac_reward_zero_std": 0.0, "grad_norm": 0.09624762088060379, "learning_rate": 4.7486068416520065e-06, "loss": -0.0155, "num_tokens": 138746134.0, "reward": 3.2631068229675293, "reward_std": 0.6305399537086487, "rewards/accuracy_reward/mean": 2.6875, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.5756068825721741, "rewards/ngram_similarity_reward/std": 0.3386700749397278, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 497.53125, "completions/mean_terminated_length": 497.53125, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.3897963750279705, "frac_reward_zero_std": 0.0, "grad_norm": 0.06503531336784363, "learning_rate": 4.747841632001186e-06, "loss": -0.0295, "num_tokens": 138942824.0, "reward": 1.393243432044983, "reward_std": 0.48505699634552, "rewards/accuracy_reward/mean": 1.09375, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.29949337244033813, "rewards/ngram_similarity_reward/std": 0.18441572785377502, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 435.046875, "completions/mean_terminated_length": 435.046875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.3902439024390244, "frac_reward_zero_std": 0.0, "grad_norm": 0.07496998459100723, "learning_rate": 4.747075328643837e-06, "loss": 0.0195, "num_tokens": 139082443.0, "reward": 3.0874123573303223, "reward_std": 1.6260262727737427, "rewards/accuracy_reward/mean": 2.296875, "rewards/accuracy_reward/std": 3.0351366996765137, "rewards/ngram_similarity_reward/mean": 0.7905375361442566, "rewards/ngram_similarity_reward/std": 0.26953014731407166, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 833.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 396.09375, "completions/mean_terminated_length": 396.09375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.3906914298500783, "frac_reward_zero_std": 0.5, "grad_norm": 0.05977340787649155, "learning_rate": 4.7463079319994665e-06, "loss": -0.0123, "num_tokens": 139232753.0, "reward": 3.3932254314422607, "reward_std": 0.6104599237442017, "rewards/accuracy_reward/mean": 2.671875, "rewards/accuracy_reward/std": 3.037097215652466, "rewards/ngram_similarity_reward/mean": 0.7213504314422607, "rewards/ngram_similarity_reward/std": 0.3697623908519745, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 475.265625, "completions/mean_terminated_length": 475.265625, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.39113895726113224, "frac_reward_zero_std": 0.0, "grad_norm": 0.06421991437673569, "learning_rate": 4.745539442488181e-06, "loss": -0.0158, "num_tokens": 139369218.0, "reward": 2.3920938968658447, "reward_std": 1.329725742340088, "rewards/accuracy_reward/mean": 1.84375, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.5483440160751343, "rewards/ngram_similarity_reward/std": 0.29731571674346924, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1109.0, "completions/max_terminated_length": 1109.0, "completions/mean_length": 533.234375, "completions/mean_terminated_length": 533.234375, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.39158648467218615, "frac_reward_zero_std": 0.0, "grad_norm": 0.07584026455879211, "learning_rate": 4.744769860530687e-06, "loss": 0.0287, "num_tokens": 139521953.0, "reward": 1.8337633609771729, "reward_std": 0.9680390357971191, "rewards/accuracy_reward/mean": 1.390625, "rewards/accuracy_reward/std": 3.019078016281128, "rewards/ngram_similarity_reward/mean": 0.4431384205818176, "rewards/ngram_similarity_reward/std": 0.19377842545509338, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 473.1875, "completions/mean_terminated_length": 473.1875, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.3920340120832401, "frac_reward_zero_std": 0.0, "grad_norm": 0.06925900280475616, "learning_rate": 4.743999186548286e-06, "loss": -0.0045, "num_tokens": 139659837.0, "reward": 5.710826396942139, "reward_std": 1.1054452657699585, "rewards/accuracy_reward/mean": 5.109375, "rewards/accuracy_reward/std": 1.5287425518035889, "rewards/ngram_similarity_reward/mean": 0.6014513969421387, "rewards/ngram_similarity_reward/std": 0.3219076991081238, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 478.171875, "completions/mean_terminated_length": 478.171875, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.39248153949429404, "frac_reward_zero_std": 0.0, "grad_norm": 0.07373322546482086, "learning_rate": 4.74322742096288e-06, "loss": -0.0055, "num_tokens": 139868232.0, "reward": 4.700196266174316, "reward_std": 1.3519628047943115, "rewards/accuracy_reward/mean": 4.375, "rewards/accuracy_reward/std": 2.3603873252868652, "rewards/ngram_similarity_reward/mean": 0.32519641518592834, "rewards/ngram_similarity_reward/std": 0.20448847115039825, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 530.640625, "completions/mean_terminated_length": 530.640625, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.39292906690534796, "frac_reward_zero_std": 0.25, "grad_norm": 0.05321519076824188, "learning_rate": 4.742454564196966e-06, "loss": -0.0018, "num_tokens": 140030065.0, "reward": 4.054103851318359, "reward_std": 1.2656118869781494, "rewards/accuracy_reward/mean": 3.421875, "rewards/accuracy_reward/std": 2.896657705307007, "rewards/ngram_similarity_reward/mean": 0.6322291493415833, "rewards/ngram_similarity_reward/std": 0.31668195128440857, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 442.890625, "completions/mean_terminated_length": 442.890625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.3933765943164019, "frac_reward_zero_std": 0.25, "grad_norm": 0.06103889271616936, "learning_rate": 4.741680616673642e-06, "loss": 0.0291, "num_tokens": 140214026.0, "reward": 3.1103811264038086, "reward_std": 0.4770812392234802, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.7041311860084534, "rewards/ngram_similarity_reward/std": 0.3329959213733673, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 712.0, "completions/max_terminated_length": 712.0, "completions/mean_length": 439.609375, "completions/mean_terminated_length": 439.609375, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.3938241217274558, "frac_reward_zero_std": 0.0, "grad_norm": 0.0781811773777008, "learning_rate": 4.740905578816599e-06, "loss": 0.0171, "num_tokens": 140370721.0, "reward": 2.5304312705993652, "reward_std": 1.2637858390808105, "rewards/accuracy_reward/mean": 1.921875, "rewards/accuracy_reward/std": 2.9857051372528076, "rewards/ngram_similarity_reward/mean": 0.6085561513900757, "rewards/ngram_similarity_reward/std": 0.31637707352638245, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 381.265625, "completions/mean_terminated_length": 381.265625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.39427164913850976, "frac_reward_zero_std": 0.0, "grad_norm": 0.09314892441034317, "learning_rate": 4.740129451050129e-06, "loss": -0.0046, "num_tokens": 140518082.0, "reward": 4.54716682434082, "reward_std": 1.0344082117080688, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.8284169435501099, "rewards/ngram_similarity_reward/std": 0.23733720183372498, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 382.46875, "completions/mean_terminated_length": 382.46875, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.3947191765495637, "frac_reward_zero_std": 0.0, "grad_norm": 0.07349873334169388, "learning_rate": 4.739352233799116e-06, "loss": 0.011, "num_tokens": 140657392.0, "reward": 3.518798828125, "reward_std": 1.0502201318740845, "rewards/accuracy_reward/mean": 2.875, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.6437988877296448, "rewards/ngram_similarity_reward/std": 0.2269400656223297, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 389.671875, "completions/mean_terminated_length": 389.671875, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.3951667039606176, "frac_reward_zero_std": 0.0, "grad_norm": 0.08306004852056503, "learning_rate": 4.7385739274890444e-06, "loss": 0.0067, "num_tokens": 140779611.0, "reward": 1.7087817192077637, "reward_std": 1.6573550701141357, "rewards/accuracy_reward/mean": 1.109375, "rewards/accuracy_reward/std": 2.7809853553771973, "rewards/ngram_similarity_reward/mean": 0.5994066596031189, "rewards/ngram_similarity_reward/std": 0.20897532999515533, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 479.703125, "completions/mean_terminated_length": 479.703125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.3956142313716715, "frac_reward_zero_std": 0.0, "grad_norm": 0.06382640451192856, "learning_rate": 4.737794532545994e-06, "loss": 0.0178, "num_tokens": 140924744.0, "reward": 4.415198802947998, "reward_std": 0.7818960547447205, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.6964489221572876, "rewards/ngram_similarity_reward/std": 0.3221212327480316, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 502.859375, "completions/mean_terminated_length": 502.859375, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.3960617587827254, "frac_reward_zero_std": 0.0, "grad_norm": 0.06667076796293259, "learning_rate": 4.737014049396639e-06, "loss": -0.0226, "num_tokens": 141082239.0, "reward": 1.4672009944915771, "reward_std": 0.5684460401535034, "rewards/accuracy_reward/mean": 1.171875, "rewards/accuracy_reward/std": 2.7316761016845703, "rewards/ngram_similarity_reward/mean": 0.2953259348869324, "rewards/ngram_similarity_reward/std": 0.1921965628862381, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 403.53125, "completions/mean_terminated_length": 403.53125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.39650928619377934, "frac_reward_zero_std": 0.0, "grad_norm": 0.07745262235403061, "learning_rate": 4.736232478468249e-06, "loss": -0.0128, "num_tokens": 141191393.0, "reward": 1.348705768585205, "reward_std": 0.20715712010860443, "rewards/accuracy_reward/mean": 0.984375, "rewards/accuracy_reward/std": 2.6306629180908203, "rewards/ngram_similarity_reward/mean": 0.3643309473991394, "rewards/ngram_similarity_reward/std": 0.2165614366531372, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 351.15625, "completions/mean_terminated_length": 351.15625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.3969568136048333, "frac_reward_zero_std": 0.0, "grad_norm": 0.104799285531044, "learning_rate": 4.735449820188693e-06, "loss": 0.046, "num_tokens": 141304011.0, "reward": 3.7475767135620117, "reward_std": 2.1090166568756104, "rewards/accuracy_reward/mean": 3.34375, "rewards/accuracy_reward/std": 2.9016621112823486, "rewards/ngram_similarity_reward/mean": 0.4038265347480774, "rewards/ngram_similarity_reward/std": 0.25956711173057556, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 411.765625, "completions/mean_terminated_length": 411.765625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.39740434101588723, "frac_reward_zero_std": 0.0, "grad_norm": 0.07789560407400131, "learning_rate": 4.73466607498643e-06, "loss": -0.0069, "num_tokens": 141415644.0, "reward": 5.63222599029541, "reward_std": 0.7778723239898682, "rewards/accuracy_reward/mean": 5.21875, "rewards/accuracy_reward/std": 1.2782522439956665, "rewards/ngram_similarity_reward/mean": 0.41347628831863403, "rewards/ngram_similarity_reward/std": 0.28358733654022217, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 418.25, "completions/mean_terminated_length": 418.25, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.39785186842694115, "frac_reward_zero_std": 0.0, "grad_norm": 0.08741523325443268, "learning_rate": 4.73388124329052e-06, "loss": 0.0285, "num_tokens": 141587308.0, "reward": 3.2023792266845703, "reward_std": 0.5806549787521362, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.6086291074752808, "rewards/ngram_similarity_reward/std": 0.27053776383399963, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 392.71875, "completions/mean_terminated_length": 392.71875, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.39829939583799506, "frac_reward_zero_std": 0.0, "grad_norm": 0.07778114080429077, "learning_rate": 4.7330953255306114e-06, "loss": 0.0013, "num_tokens": 141727178.0, "reward": 2.7171683311462402, "reward_std": 1.6601142883300781, "rewards/accuracy_reward/mean": 2.125, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.592168390750885, "rewards/ngram_similarity_reward/std": 0.32917869091033936, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 792.0, "completions/max_terminated_length": 792.0, "completions/mean_length": 476.765625, "completions/mean_terminated_length": 476.765625, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.398746923249049, "frac_reward_zero_std": 0.0, "grad_norm": 0.06503872573375702, "learning_rate": 4.732308322136951e-06, "loss": 0.0005, "num_tokens": 141879595.0, "reward": 3.591559886932373, "reward_std": 1.2463946342468262, "rewards/accuracy_reward/mean": 3.15625, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.4353097677230835, "rewards/ngram_similarity_reward/std": 0.21791474521160126, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 480.890625, "completions/mean_terminated_length": 480.890625, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.39919445066010295, "frac_reward_zero_std": 0.0, "grad_norm": 0.07314197719097137, "learning_rate": 4.7315202335403794e-06, "loss": 0.031, "num_tokens": 142044292.0, "reward": 4.573571681976318, "reward_std": 0.5166336297988892, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.6673218607902527, "rewards/ngram_similarity_reward/std": 0.35875454545021057, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 435.046875, "completions/mean_terminated_length": 435.046875, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.39964197807115687, "frac_reward_zero_std": 0.0, "grad_norm": 0.07555290311574936, "learning_rate": 4.730731060172331e-06, "loss": -0.0136, "num_tokens": 142215895.0, "reward": 0.5935059189796448, "reward_std": 0.8443008661270142, "rewards/accuracy_reward/mean": -0.140625, "rewards/accuracy_reward/std": 1.473223328590393, "rewards/ngram_similarity_reward/mean": 0.7341309189796448, "rewards/ngram_similarity_reward/std": 0.284699022769928, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 516.21875, "completions/mean_terminated_length": 516.21875, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.4000895054822108, "frac_reward_zero_std": 0.0, "grad_norm": 0.0662178173661232, "learning_rate": 4.7299408024648345e-06, "loss": 0.0232, "num_tokens": 142362757.0, "reward": 6.127585411071777, "reward_std": 0.2145819067955017, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.6275853514671326, "rewards/ngram_similarity_reward/std": 0.23748208582401276, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1089.0, "completions/max_terminated_length": 1089.0, "completions/mean_length": 569.21875, "completions/mean_terminated_length": 569.21875, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.4005370328932647, "frac_reward_zero_std": 0.0, "grad_norm": 0.062257200479507446, "learning_rate": 4.729149460850512e-06, "loss": 0.0213, "num_tokens": 142544371.0, "reward": 2.992208957672119, "reward_std": 1.5508846044540405, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.49220892786979675, "rewards/ngram_similarity_reward/std": 0.2310357689857483, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 436.6875, "completions/mean_terminated_length": 436.6875, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.4009845603043186, "frac_reward_zero_std": 0.0, "grad_norm": 0.0728151872754097, "learning_rate": 4.728357035762577e-06, "loss": -0.0219, "num_tokens": 142679503.0, "reward": 3.1466708183288574, "reward_std": 0.1647435575723648, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6466709971427917, "rewards/ngram_similarity_reward/std": 0.294294536113739, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 420.0, "completions/mean_terminated_length": 420.0, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.4014320877153726, "frac_reward_zero_std": 0.25, "grad_norm": 0.0714784786105156, "learning_rate": 4.727563527634839e-06, "loss": -0.0301, "num_tokens": 142814111.0, "reward": 3.757340908050537, "reward_std": 1.262383222579956, "rewards/accuracy_reward/mean": 3.140625, "rewards/accuracy_reward/std": 2.9727182388305664, "rewards/ngram_similarity_reward/mean": 0.6167159080505371, "rewards/ngram_similarity_reward/std": 0.37762531638145447, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 387.90625, "completions/mean_terminated_length": 387.90625, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.4018796151264265, "frac_reward_zero_std": 0.0, "grad_norm": 0.09113533049821854, "learning_rate": 4.7267689369017e-06, "loss": -0.0, "num_tokens": 143032425.0, "reward": 5.159881114959717, "reward_std": 1.1793248653411865, "rewards/accuracy_reward/mean": 4.375, "rewards/accuracy_reward/std": 2.3603873252868652, "rewards/ngram_similarity_reward/mean": 0.7848809957504272, "rewards/ngram_similarity_reward/std": 0.3087009787559509, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 543.875, "completions/mean_terminated_length": 543.875, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.4023271425374804, "frac_reward_zero_std": 0.0, "grad_norm": 0.06111174076795578, "learning_rate": 4.725973263998154e-06, "loss": 0.0037, "num_tokens": 143185457.0, "reward": 6.120242118835449, "reward_std": 0.2244957983493805, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.6202424764633179, "rewards/ngram_similarity_reward/std": 0.2629983127117157, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 457.0625, "completions/mean_terminated_length": 457.0625, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.40277466994853434, "frac_reward_zero_std": 0.0, "grad_norm": 0.07256443798542023, "learning_rate": 4.725176509359784e-06, "loss": -0.008, "num_tokens": 143354309.0, "reward": 3.8044281005859375, "reward_std": 1.7351446151733398, "rewards/accuracy_reward/mean": 3.15625, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.648178219795227, "rewards/ngram_similarity_reward/std": 0.25434863567352295, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 513.1875, "completions/mean_terminated_length": 513.1875, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.40322219735958825, "frac_reward_zero_std": 0.0, "grad_norm": 0.05711884796619415, "learning_rate": 4.7243786734227745e-06, "loss": -0.0348, "num_tokens": 143517345.0, "reward": 2.1598706245422363, "reward_std": 1.3489621877670288, "rewards/accuracy_reward/mean": 1.65625, "rewards/accuracy_reward/std": 2.9016621112823486, "rewards/ngram_similarity_reward/mean": 0.5036205053329468, "rewards/ngram_similarity_reward/std": 0.2906215190887451, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 435.140625, "completions/mean_terminated_length": 435.140625, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.4036697247706422, "frac_reward_zero_std": 0.0, "grad_norm": 0.07961972057819366, "learning_rate": 4.72357975662389e-06, "loss": -0.0097, "num_tokens": 143646346.0, "reward": 2.9096150398254395, "reward_std": 0.5758260488510132, "rewards/accuracy_reward/mean": 2.3125, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.5971152186393738, "rewards/ngram_similarity_reward/std": 0.16422002017498016, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 413.5625, "completions/mean_terminated_length": 387.61907958984375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.40411725218169614, "frac_reward_zero_std": 0.0, "grad_norm": 0.09119610488414764, "learning_rate": 4.722779759400499e-06, "loss": -0.0105, "num_tokens": 143785374.0, "reward": 2.8944010734558105, "reward_std": 2.1692235469818115, "rewards/accuracy_reward/mean": 2.203125, "rewards/accuracy_reward/std": 3.0272817611694336, "rewards/ngram_similarity_reward/mean": 0.6912758350372314, "rewards/ngram_similarity_reward/std": 0.4539448022842407, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 464.125, "completions/mean_terminated_length": 464.125, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.40456477959275006, "frac_reward_zero_std": 0.0, "grad_norm": 0.058656852692365646, "learning_rate": 4.721978682190549e-06, "loss": 0.0133, "num_tokens": 143932070.0, "reward": 5.176098823547363, "reward_std": 1.2019221782684326, "rewards/accuracy_reward/mean": 4.375, "rewards/accuracy_reward/std": 2.3603873252868652, "rewards/ngram_similarity_reward/mean": 0.8010987043380737, "rewards/ngram_similarity_reward/std": 0.353232204914093, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 426.0625, "completions/mean_terminated_length": 426.0625, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.405012307003804, "frac_reward_zero_std": 0.0, "grad_norm": 0.07713035494089127, "learning_rate": 4.721176525432588e-06, "loss": -0.0042, "num_tokens": 144071882.0, "reward": 2.6432571411132812, "reward_std": 1.379880428314209, "rewards/accuracy_reward/mean": 1.84375, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.7995070219039917, "rewards/ngram_similarity_reward/std": 0.20705363154411316, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 421.203125, "completions/mean_terminated_length": 421.203125, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.4054598344148579, "frac_reward_zero_std": 0.25, "grad_norm": 0.06027873232960701, "learning_rate": 4.720373289565753e-06, "loss": -0.0172, "num_tokens": 144180327.0, "reward": 5.264594078063965, "reward_std": 0.8011810183525085, "rewards/accuracy_reward/mean": 4.375, "rewards/accuracy_reward/std": 2.3603873252868652, "rewards/ngram_similarity_reward/mean": 0.8895940184593201, "rewards/ngram_similarity_reward/std": 0.3584500551223755, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 530.984375, "completions/mean_terminated_length": 530.984375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.40590736182591186, "frac_reward_zero_std": 0.0, "grad_norm": 0.054708436131477356, "learning_rate": 4.719568975029769e-06, "loss": -0.0284, "num_tokens": 144370614.0, "reward": 4.3622941970825195, "reward_std": 1.1988966464996338, "rewards/accuracy_reward/mean": 3.53125, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.8310438394546509, "rewards/ngram_similarity_reward/std": 0.2436477094888687, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 363.265625, "completions/mean_terminated_length": 363.265625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.4063548892369658, "frac_reward_zero_std": 0.25, "grad_norm": 0.09403359144926071, "learning_rate": 4.718763582264954e-06, "loss": 0.0102, "num_tokens": 144493943.0, "reward": 3.0666751861572266, "reward_std": 0.14330953359603882, "rewards/accuracy_reward/mean": 2.484375, "rewards/accuracy_reward/std": 3.0419929027557373, "rewards/ngram_similarity_reward/mean": 0.5823002457618713, "rewards/ngram_similarity_reward/std": 0.22058938443660736, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 885.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 434.203125, "completions/mean_terminated_length": 434.203125, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.4068024166480197, "frac_reward_zero_std": 0.25, "grad_norm": 0.06283697485923767, "learning_rate": 4.7179571117122145e-06, "loss": -0.0004, "num_tokens": 144630804.0, "reward": 1.8347517251968384, "reward_std": 0.7851179838180542, "rewards/accuracy_reward/mean": 1.265625, "rewards/accuracy_reward/std": 2.775986671447754, "rewards/ngram_similarity_reward/mean": 0.5691266059875488, "rewards/ngram_similarity_reward/std": 0.2655256688594818, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1155.0, "completions/max_terminated_length": 1155.0, "completions/mean_length": 583.28125, "completions/mean_terminated_length": 583.28125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.4072499440590736, "frac_reward_zero_std": 0.0, "grad_norm": 0.0670977532863617, "learning_rate": 4.717149563813049e-06, "loss": 0.0408, "num_tokens": 144773222.0, "reward": 4.14152717590332, "reward_std": 1.1679092645645142, "rewards/accuracy_reward/mean": 3.515625, "rewards/accuracy_reward/std": 2.8646292686462402, "rewards/ngram_similarity_reward/mean": 0.6259022355079651, "rewards/ngram_similarity_reward/std": 0.2814485728740692, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1138.0, "completions/max_terminated_length": 1138.0, "completions/mean_length": 482.03125, "completions/mean_terminated_length": 482.03125, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.4076974714701275, "frac_reward_zero_std": 0.0, "grad_norm": 0.07531841844320297, "learning_rate": 4.716340939009544e-06, "loss": -0.0059, "num_tokens": 144910840.0, "reward": 3.2331812381744385, "reward_std": 0.5628366470336914, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.6394312381744385, "rewards/ngram_similarity_reward/std": 0.27154573798179626, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 390.734375, "completions/mean_terminated_length": 390.734375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.4081449988811815, "frac_reward_zero_std": 0.0, "grad_norm": 0.08654443919658661, "learning_rate": 4.715531237744377e-06, "loss": 0.0477, "num_tokens": 145079527.0, "reward": 1.4686279296875, "reward_std": 0.15591643750667572, "rewards/accuracy_reward/mean": 0.984375, "rewards/accuracy_reward/std": 2.6306629180908203, "rewards/ngram_similarity_reward/mean": 0.48425301909446716, "rewards/ngram_similarity_reward/std": 0.23591560125350952, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 446.140625, "completions/mean_terminated_length": 446.140625, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.4085925262922354, "frac_reward_zero_std": 0.0, "grad_norm": 0.07434502989053726, "learning_rate": 4.714720460460814e-06, "loss": 0.0123, "num_tokens": 145252720.0, "reward": 3.0408496856689453, "reward_std": 0.1428745537996292, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.5408496260643005, "rewards/ngram_similarity_reward/std": 0.20481747388839722, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 430.984375, "completions/mean_terminated_length": 430.984375, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.40904005370328933, "frac_reward_zero_std": 0.0, "grad_norm": 0.07099854946136475, "learning_rate": 4.713908607602712e-06, "loss": -0.0142, "num_tokens": 145423663.0, "reward": 3.6085739135742188, "reward_std": 1.2402714490890503, "rewards/accuracy_reward/mean": 3.15625, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.4523240327835083, "rewards/ngram_similarity_reward/std": 0.22499507665634155, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 401.21875, "completions/mean_terminated_length": 401.21875, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.40948758111434325, "frac_reward_zero_std": 0.0, "grad_norm": 0.07884711027145386, "learning_rate": 4.71309567961451e-06, "loss": -0.0077, "num_tokens": 145603309.0, "reward": 3.2621984481811523, "reward_std": 0.4593298137187958, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.6684484481811523, "rewards/ngram_similarity_reward/std": 0.283232718706131, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 408.40625, "completions/mean_terminated_length": 408.40625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.40993510852539716, "frac_reward_zero_std": 0.0, "grad_norm": 0.07480478286743164, "learning_rate": 4.712281676941246e-06, "loss": 0.0063, "num_tokens": 145756311.0, "reward": 4.161160469055176, "reward_std": 0.7893478274345398, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.5361604690551758, "rewards/ngram_similarity_reward/std": 0.35466256737709045, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/max_terminated_length": 610.0, "completions/mean_length": 460.734375, "completions/mean_terminated_length": 460.734375, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.41038263593645113, "frac_reward_zero_std": 0.0, "grad_norm": 0.06919507682323456, "learning_rate": 4.711466600028538e-06, "loss": 0.0027, "num_tokens": 145902550.0, "reward": 3.319559097290039, "reward_std": 1.3462203741073608, "rewards/accuracy_reward/mean": 2.6875, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.6320590972900391, "rewards/ngram_similarity_reward/std": 0.1879875659942627, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 400.984375, "completions/mean_terminated_length": 400.984375, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.41083016334750505, "frac_reward_zero_std": 0.25, "grad_norm": 0.06792223453521729, "learning_rate": 4.710650449322595e-06, "loss": -0.0116, "num_tokens": 146033989.0, "reward": 4.589241981506348, "reward_std": 0.4721425175666809, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.6829920411109924, "rewards/ngram_similarity_reward/std": 0.28800156712532043, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 466.828125, "completions/mean_terminated_length": 466.828125, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.41127769075855897, "frac_reward_zero_std": 0.0, "grad_norm": 0.07291481643915176, "learning_rate": 4.709833225270215e-06, "loss": -0.0149, "num_tokens": 146223274.0, "reward": 4.248568058013916, "reward_std": 0.7303920388221741, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.6235682368278503, "rewards/ngram_similarity_reward/std": 0.2175399512052536, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1037.0, "completions/max_terminated_length": 1037.0, "completions/mean_length": 474.5625, "completions/mean_terminated_length": 474.5625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.4117252181696129, "frac_reward_zero_std": 0.0, "grad_norm": 0.09087973088026047, "learning_rate": 4.709014928318783e-06, "loss": -0.0214, "num_tokens": 146420542.0, "reward": 3.0612213611602783, "reward_std": 1.0413897037506104, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.4674713611602783, "rewards/ngram_similarity_reward/std": 0.29778701066970825, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 357.28125, "completions/mean_terminated_length": 357.28125, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.4121727455806668, "frac_reward_zero_std": 0.0, "grad_norm": 0.07861248403787613, "learning_rate": 4.708195558916269e-06, "loss": 0.0205, "num_tokens": 146552192.0, "reward": 3.3277878761291504, "reward_std": 0.6478875279426575, "rewards/accuracy_reward/mean": 2.6875, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.6402877569198608, "rewards/ngram_similarity_reward/std": 0.35690537095069885, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 376.953125, "completions/mean_terminated_length": 376.953125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.4126202729917207, "frac_reward_zero_std": 0.25, "grad_norm": 0.07197709381580353, "learning_rate": 4.707375117511233e-06, "loss": -0.0047, "num_tokens": 146721805.0, "reward": 1.8992624282836914, "reward_std": 0.7386025190353394, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.5242624282836914, "rewards/ngram_similarity_reward/std": 0.38837382197380066, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 375.203125, "completions/mean_terminated_length": 375.203125, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.4130678004027747, "frac_reward_zero_std": 0.0, "grad_norm": 0.08947842568159103, "learning_rate": 4.70655360455282e-06, "loss": 0.001, "num_tokens": 146859530.0, "reward": 4.687874794006348, "reward_std": 1.5216972827911377, "rewards/accuracy_reward/mean": 4.28125, "rewards/accuracy_reward/std": 2.4330317974090576, "rewards/ngram_similarity_reward/mean": 0.4066246747970581, "rewards/ngram_similarity_reward/std": 0.287428081035614, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1461.0, "completions/max_terminated_length": 1461.0, "completions/mean_length": 558.484375, "completions/mean_terminated_length": 558.484375, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.4135153278138286, "frac_reward_zero_std": 0.0, "grad_norm": 0.06963956356048584, "learning_rate": 4.705731020490763e-06, "loss": -0.0101, "num_tokens": 147005209.0, "reward": 1.4135024547576904, "reward_std": 0.17173431813716888, "rewards/accuracy_reward/mean": 0.984375, "rewards/accuracy_reward/std": 2.6306629180908203, "rewards/ngram_similarity_reward/mean": 0.42912742495536804, "rewards/ngram_similarity_reward/std": 0.34089791774749756, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 396.0625, "completions/mean_terminated_length": 396.0625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.4139628552248825, "frac_reward_zero_std": 0.0, "grad_norm": 0.07255294173955917, "learning_rate": 4.70490736577538e-06, "loss": 0.0061, "num_tokens": 147153325.0, "reward": 3.5313873291015625, "reward_std": 2.2151896953582764, "rewards/accuracy_reward/mean": 2.78125, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.7501375675201416, "rewards/ngram_similarity_reward/std": 0.32853633165359497, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 748.0, "completions/max_terminated_length": 748.0, "completions/mean_length": 467.46875, "completions/mean_terminated_length": 467.46875, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.41441038263593644, "frac_reward_zero_std": 0.0, "grad_norm": 0.06902709603309631, "learning_rate": 4.704082640857578e-06, "loss": -0.0027, "num_tokens": 147271787.0, "reward": 3.3249874114990234, "reward_std": 0.1639261543750763, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.8249874114990234, "rewards/ngram_similarity_reward/std": 0.23711428046226501, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 447.34375, "completions/mean_terminated_length": 447.34375, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.41485791004699035, "frac_reward_zero_std": 0.0, "grad_norm": 0.066554494202137, "learning_rate": 4.703256846188846e-06, "loss": 0.0197, "num_tokens": 147425857.0, "reward": 2.882577657699585, "reward_std": 1.545878291130066, "rewards/accuracy_reward/mean": 2.21875, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.663827657699585, "rewards/ngram_similarity_reward/std": 0.33302900195121765, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 773.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 448.53125, "completions/mean_terminated_length": 448.53125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.4153054374580443, "frac_reward_zero_std": 0.0, "grad_norm": 0.07156302034854889, "learning_rate": 4.70242998222126e-06, "loss": 0.001, "num_tokens": 147555107.0, "reward": 4.492798328399658, "reward_std": 0.12431110441684723, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.49279868602752686, "rewards/ngram_similarity_reward/std": 0.32165202498435974, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 691.0, "completions/max_terminated_length": 691.0, "completions/mean_length": 476.046875, "completions/mean_terminated_length": 476.046875, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.41575296486909824, "frac_reward_zero_std": 0.0, "grad_norm": 0.0779896154999733, "learning_rate": 4.701602049407482e-06, "loss": 0.0273, "num_tokens": 147775574.0, "reward": 5.030037879943848, "reward_std": 0.7520781755447388, "rewards/accuracy_reward/mean": 4.375, "rewards/accuracy_reward/std": 2.3603873252868652, "rewards/ngram_similarity_reward/mean": 0.6550377607345581, "rewards/ngram_similarity_reward/std": 0.22018156945705414, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 461.484375, "completions/mean_terminated_length": 461.484375, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.41620049228015216, "frac_reward_zero_std": 0.0, "grad_norm": 0.07115507125854492, "learning_rate": 4.70077304820076e-06, "loss": -0.0071, "num_tokens": 147918117.0, "reward": 4.60894775390625, "reward_std": 1.4901347160339355, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.7964481711387634, "rewards/ngram_similarity_reward/std": 0.3186900317668915, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 399.90625, "completions/mean_terminated_length": 399.90625, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.4166480196912061, "frac_reward_zero_std": 0.0, "grad_norm": 0.07521829009056091, "learning_rate": 4.699942979054926e-06, "loss": 0.0065, "num_tokens": 148077247.0, "reward": 5.061558723449707, "reward_std": 1.702772855758667, "rewards/accuracy_reward/mean": 4.1875, "rewards/accuracy_reward/std": 2.5, "rewards/ngram_similarity_reward/mean": 0.8740589022636414, "rewards/ngram_similarity_reward/std": 0.30254870653152466, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 790.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 433.140625, "completions/mean_terminated_length": 433.140625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.41709554710226, "frac_reward_zero_std": 0.0, "grad_norm": 0.06832011044025421, "learning_rate": 4.699111842424394e-06, "loss": -0.0374, "num_tokens": 148223560.0, "reward": 5.83868408203125, "reward_std": 0.5857259035110474, "rewards/accuracy_reward/mean": 5.390625, "rewards/accuracy_reward/std": 0.8750000596046448, "rewards/ngram_similarity_reward/mean": 0.4480590522289276, "rewards/ngram_similarity_reward/std": 0.27246713638305664, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 436.328125, "completions/mean_terminated_length": 436.328125, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.41754307451331396, "frac_reward_zero_std": 0.0, "grad_norm": 0.07530754804611206, "learning_rate": 4.698279638764167e-06, "loss": 0.0037, "num_tokens": 148405437.0, "reward": 4.639106750488281, "reward_std": 0.6088700294494629, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.8266069293022156, "rewards/ngram_similarity_reward/std": 0.20045363903045654, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 388.921875, "completions/mean_terminated_length": 388.921875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.4179906019243679, "frac_reward_zero_std": 0.25, "grad_norm": 0.0640043392777443, "learning_rate": 4.697446368529829e-06, "loss": 0.0264, "num_tokens": 148569240.0, "reward": 3.4495487213134766, "reward_std": 1.641690731048584, "rewards/accuracy_reward/mean": 2.75, "rewards/accuracy_reward/std": 3.0498504638671875, "rewards/ngram_similarity_reward/mean": 0.6995489001274109, "rewards/ngram_similarity_reward/std": 0.3327275514602661, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 506.71875, "completions/mean_terminated_length": 506.71875, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.4184381293354218, "frac_reward_zero_std": 0.0, "grad_norm": 0.055351290851831436, "learning_rate": 4.69661203217755e-06, "loss": -0.0101, "num_tokens": 148767350.0, "reward": 4.562273025512695, "reward_std": 0.49909937381744385, "rewards/accuracy_reward/mean": 3.890625, "rewards/accuracy_reward/std": 2.6998953819274902, "rewards/ngram_similarity_reward/mean": 0.6716479659080505, "rewards/ngram_similarity_reward/std": 0.2990396022796631, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 357.046875, "completions/mean_terminated_length": 357.046875, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.4188856567464757, "frac_reward_zero_std": 0.0, "grad_norm": 0.09700652211904526, "learning_rate": 4.6957766301640814e-06, "loss": 0.005, "num_tokens": 148934665.0, "reward": 4.540606498718262, "reward_std": 1.3007328510284424, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.6343559622764587, "rewards/ngram_similarity_reward/std": 0.36280736327171326, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 431.671875, "completions/mean_terminated_length": 431.671875, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.4193331841575296, "frac_reward_zero_std": 0.0, "grad_norm": 0.08109112828969955, "learning_rate": 4.694940162946759e-06, "loss": 0.0015, "num_tokens": 149098900.0, "reward": 2.9900810718536377, "reward_std": 0.4328707456588745, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.396331250667572, "rewards/ngram_similarity_reward/std": 0.2602551281452179, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 512.09375, "completions/mean_terminated_length": 512.09375, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.4197807115685836, "frac_reward_zero_std": 0.0, "grad_norm": 0.07237434387207031, "learning_rate": 4.694102630983502e-06, "loss": 0.0048, "num_tokens": 149249674.0, "reward": 0.04293042793869972, "reward_std": 0.33587461709976196, "rewards/accuracy_reward/mean": -0.609375, "rewards/accuracy_reward/std": 0.3145764470100403, "rewards/ngram_similarity_reward/mean": 0.6523054242134094, "rewards/ngram_similarity_reward/std": 0.19416570663452148, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 896.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 443.796875, "completions/mean_terminated_length": 443.796875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.4202282389796375, "frac_reward_zero_std": 0.0, "grad_norm": 0.0778619572520256, "learning_rate": 4.6932640347328125e-06, "loss": -0.0313, "num_tokens": 149402525.0, "reward": 4.294356346130371, "reward_std": 1.9737029075622559, "rewards/accuracy_reward/mean": 3.53125, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.763106107711792, "rewards/ngram_similarity_reward/std": 0.29600730538368225, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 487.125, "completions/mean_terminated_length": 487.125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.42067576639069143, "frac_reward_zero_std": 0.0, "grad_norm": 0.07317737489938736, "learning_rate": 4.692424374653774e-06, "loss": 0.0081, "num_tokens": 149633157.0, "reward": 1.4493913650512695, "reward_std": 0.44835755228996277, "rewards/accuracy_reward/mean": 0.90625, "rewards/accuracy_reward/std": 2.5617377758026123, "rewards/ngram_similarity_reward/mean": 0.5431413650512695, "rewards/ngram_similarity_reward/std": 0.3096190392971039, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 354.609375, "completions/mean_terminated_length": 354.609375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.42112329380174535, "frac_reward_zero_std": 0.0, "grad_norm": 0.09393345564603806, "learning_rate": 4.691583651206055e-06, "loss": -0.0068, "num_tokens": 149765692.0, "reward": 3.0908572673797607, "reward_std": 1.6652133464813232, "rewards/accuracy_reward/mean": 2.3125, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.7783571481704712, "rewards/ngram_similarity_reward/std": 0.3887231647968292, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 321.640625, "completions/mean_terminated_length": 321.640625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.42157082121279926, "frac_reward_zero_std": 0.25, "grad_norm": 0.1032625362277031, "learning_rate": 4.6907418648499045e-06, "loss": -0.0107, "num_tokens": 149893029.0, "reward": 4.834690093994141, "reward_std": 0.12957683205604553, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.8346905708312988, "rewards/ngram_similarity_reward/std": 0.2965867519378662, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 473.359375, "completions/mean_terminated_length": 473.359375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.42201834862385323, "frac_reward_zero_std": 0.0, "grad_norm": 0.06951655447483063, "learning_rate": 4.689899016046152e-06, "loss": 0.0189, "num_tokens": 150042204.0, "reward": 4.776795864105225, "reward_std": 1.9362866878509521, "rewards/accuracy_reward/mean": 4.1875, "rewards/accuracy_reward/std": 2.5, "rewards/ngram_similarity_reward/mean": 0.5892958641052246, "rewards/ngram_similarity_reward/std": 0.34720170497894287, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 882.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 525.59375, "completions/mean_terminated_length": 525.59375, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.42246587603490715, "frac_reward_zero_std": 0.0, "grad_norm": 0.07009086012840271, "learning_rate": 4.689055105256212e-06, "loss": -0.002, "num_tokens": 150219410.0, "reward": 1.7767894268035889, "reward_std": 0.6507170796394348, "rewards/accuracy_reward/mean": 1.1875, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.5892894864082336, "rewards/ngram_similarity_reward/std": 0.26303830742836, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 364.5625, "completions/mean_terminated_length": 364.5625, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.42291340344596107, "frac_reward_zero_std": 0.25, "grad_norm": 0.07398659735918045, "learning_rate": 4.688210132942076e-06, "loss": -0.0034, "num_tokens": 150360694.0, "reward": 6.416528701782227, "reward_std": 0.16398948431015015, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.9165284037590027, "rewards/ngram_similarity_reward/std": 0.29204583168029785, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 512.015625, "completions/mean_terminated_length": 512.015625, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.423360930857015, "frac_reward_zero_std": 0.0, "grad_norm": 0.07332911342382431, "learning_rate": 4.687364099566321e-06, "loss": -0.0114, "num_tokens": 150493399.0, "reward": 4.729446887969971, "reward_std": 0.18728002905845642, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7294468283653259, "rewards/ngram_similarity_reward/std": 0.3016957938671112, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 440.40625, "completions/mean_terminated_length": 440.40625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.4238084582680689, "frac_reward_zero_std": 0.0, "grad_norm": 0.07343576103448868, "learning_rate": 4.686517005592102e-06, "loss": -0.0129, "num_tokens": 150703185.0, "reward": 3.696603298187256, "reward_std": 1.21620512008667, "rewards/accuracy_reward/mean": 2.96875, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.7278531789779663, "rewards/ngram_similarity_reward/std": 0.3106238543987274, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1224.0, "completions/max_terminated_length": 1224.0, "completions/mean_length": 539.71875, "completions/mean_terminated_length": 539.71875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.42425598567912287, "frac_reward_zero_std": 0.0, "grad_norm": 0.07591954618692398, "learning_rate": 4.6856688514831566e-06, "loss": 0.0079, "num_tokens": 150857951.0, "reward": 1.913421869277954, "reward_std": 1.941478967666626, "rewards/accuracy_reward/mean": 1.421875, "rewards/accuracy_reward/std": 2.880171298980713, "rewards/ngram_similarity_reward/mean": 0.49154698848724365, "rewards/ngram_similarity_reward/std": 0.2773827016353607, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 399.890625, "completions/mean_terminated_length": 399.890625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.4247035130901768, "frac_reward_zero_std": 0.0, "grad_norm": 0.08750013262033463, "learning_rate": 4.684819637703801e-06, "loss": 0.0123, "num_tokens": 150955320.0, "reward": 4.495274543762207, "reward_std": 0.1956636607646942, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.4952741265296936, "rewards/ngram_similarity_reward/std": 0.3157603144645691, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 453.078125, "completions/mean_terminated_length": 453.078125, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.4251510405012307, "frac_reward_zero_std": 0.0, "grad_norm": 0.06658633798360825, "learning_rate": 4.683969364718932e-06, "loss": 0.0164, "num_tokens": 151121853.0, "reward": 4.68372917175293, "reward_std": 0.49302682280540466, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.7774791717529297, "rewards/ngram_similarity_reward/std": 0.29103362560272217, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 422.515625, "completions/mean_terminated_length": 422.515625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.4255985679122846, "frac_reward_zero_std": 0.0, "grad_norm": 0.08575849235057831, "learning_rate": 4.6831180329940265e-06, "loss": 0.0187, "num_tokens": 151268398.0, "reward": 3.5084714889526367, "reward_std": 1.4491386413574219, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.9147217869758606, "rewards/ngram_similarity_reward/std": 0.289139986038208, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 781.0, "completions/max_terminated_length": 781.0, "completions/mean_length": 475.828125, "completions/mean_terminated_length": 475.828125, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.42604609532333854, "frac_reward_zero_std": 0.0, "grad_norm": 0.07986129075288773, "learning_rate": 4.6822656429951415e-06, "loss": -0.0008, "num_tokens": 151419187.0, "reward": 3.8873395919799805, "reward_std": 1.240060806274414, "rewards/accuracy_reward/mean": 3.4375, "rewards/accuracy_reward/std": 2.872281312942505, "rewards/ngram_similarity_reward/mean": 0.44983938336372375, "rewards/ngram_similarity_reward/std": 0.34512755274772644, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 470.96875, "completions/mean_terminated_length": 470.96875, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.4264936227343925, "frac_reward_zero_std": 0.0, "grad_norm": 0.07536032795906067, "learning_rate": 4.681412195188913e-06, "loss": -0.0214, "num_tokens": 151561681.0, "reward": 4.1212921142578125, "reward_std": 1.463741660118103, "rewards/accuracy_reward/mean": 3.25, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.8712917566299438, "rewards/ngram_similarity_reward/std": 0.2256009578704834, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 406.40625, "completions/mean_terminated_length": 406.40625, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.4269411501454464, "frac_reward_zero_std": 0.0, "grad_norm": 0.08135781437158585, "learning_rate": 4.680557690042555e-06, "loss": -0.0006, "num_tokens": 151730619.0, "reward": 5.661190032958984, "reward_std": 1.377014398574829, "rewards/accuracy_reward/mean": 4.75, "rewards/accuracy_reward/std": 2.0, "rewards/ngram_similarity_reward/mean": 0.9111900329589844, "rewards/ngram_similarity_reward/std": 0.2595444917678833, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 389.6875, "completions/mean_terminated_length": 389.6875, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.42738867755650034, "frac_reward_zero_std": 0.0, "grad_norm": 0.09303267300128937, "learning_rate": 4.679702128023862e-06, "loss": 0.007, "num_tokens": 151887047.0, "reward": 3.0402135848999023, "reward_std": 1.1083359718322754, "rewards/accuracy_reward/mean": 2.375, "rewards/accuracy_reward/std": 3.057647228240967, "rewards/ngram_similarity_reward/mean": 0.6652137041091919, "rewards/ngram_similarity_reward/std": 0.3403913974761963, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 378.5625, "completions/mean_terminated_length": 378.5625, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.42783620496755426, "frac_reward_zero_std": 0.0, "grad_norm": 0.08461212366819382, "learning_rate": 4.678845509601207e-06, "loss": 0.01, "num_tokens": 152050427.0, "reward": 5.111605644226074, "reward_std": 1.624394178390503, "rewards/accuracy_reward/mean": 4.453125, "rewards/accuracy_reward/std": 2.319206953048706, "rewards/ngram_similarity_reward/mean": 0.6584810018539429, "rewards/ngram_similarity_reward/std": 0.29018649458885193, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 462.71875, "completions/mean_terminated_length": 462.71875, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.4282837323786082, "frac_reward_zero_std": 0.0, "grad_norm": 0.08391781896352768, "learning_rate": 4.677987835243539e-06, "loss": -0.0126, "num_tokens": 152210937.0, "reward": 2.565617084503174, "reward_std": 1.311779260635376, "rewards/accuracy_reward/mean": 2.015625, "rewards/accuracy_reward/std": 3.111638069152832, "rewards/ngram_similarity_reward/mean": 0.5499922037124634, "rewards/ngram_similarity_reward/std": 0.3418310284614563, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 444.328125, "completions/mean_terminated_length": 444.328125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.4287312597896621, "frac_reward_zero_std": 0.0, "grad_norm": 0.07344631850719452, "learning_rate": 4.677129105420387e-06, "loss": -0.0588, "num_tokens": 152396702.0, "reward": 3.783191204071045, "reward_std": 1.4082891941070557, "rewards/accuracy_reward/mean": 3.21875, "rewards/accuracy_reward/std": 2.9732606410980225, "rewards/ngram_similarity_reward/mean": 0.5644412040710449, "rewards/ngram_similarity_reward/std": 0.20864370465278625, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 497.796875, "completions/mean_terminated_length": 497.796875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.42917878720071606, "frac_reward_zero_std": 0.0, "grad_norm": 0.06695375591516495, "learning_rate": 4.6762693206018585e-06, "loss": -0.0034, "num_tokens": 152605905.0, "reward": 0.6230151653289795, "reward_std": 1.6565158367156982, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 1.7627090215682983, "rewards/ngram_similarity_reward/mean": 0.5605151653289795, "rewards/ngram_similarity_reward/std": 0.36507290601730347, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 569.859375, "completions/mean_terminated_length": 497.1639099121094, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.42962631461177, "frac_reward_zero_std": 0.0, "grad_norm": 0.0528508760035038, "learning_rate": 4.675408481258637e-06, "loss": -0.0834, "num_tokens": 152764936.0, "reward": 4.562169075012207, "reward_std": 0.52070552110672, "rewards/accuracy_reward/mean": 3.890625, "rewards/accuracy_reward/std": 2.6998953819274902, "rewards/ngram_similarity_reward/mean": 0.671544075012207, "rewards/ngram_similarity_reward/std": 0.32839202880859375, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 508.265625, "completions/mean_terminated_length": 508.265625, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.4300738420228239, "frac_reward_zero_std": 0.0, "grad_norm": 0.06275948137044907, "learning_rate": 4.674546587861985e-06, "loss": 0.0134, "num_tokens": 152953705.0, "reward": 4.307257175445557, "reward_std": 1.8037517070770264, "rewards/accuracy_reward/mean": 3.515625, "rewards/accuracy_reward/std": 2.8646292686462402, "rewards/ngram_similarity_reward/mean": 0.7916322350502014, "rewards/ngram_similarity_reward/std": 0.32915112376213074, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 363.96875, "completions/mean_terminated_length": 363.96875, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.4305213694338778, "frac_reward_zero_std": 0.0, "grad_norm": 0.08565182983875275, "learning_rate": 4.67368364088374e-06, "loss": -0.0126, "num_tokens": 153143223.0, "reward": 2.904425621032715, "reward_std": 0.7966715097427368, "rewards/accuracy_reward/mean": 2.21875, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.6856756806373596, "rewards/ngram_similarity_reward/std": 0.2889115810394287, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 734.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 469.171875, "completions/mean_terminated_length": 469.171875, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.4309688968449317, "frac_reward_zero_std": 0.0, "grad_norm": 0.07181283831596375, "learning_rate": 4.6728196407963165e-06, "loss": 0.0015, "num_tokens": 153286130.0, "reward": 4.65209436416626, "reward_std": 1.5053563117980957, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6520941853523254, "rewards/ngram_similarity_reward/std": 0.3621197044849396, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 447.671875, "completions/mean_terminated_length": 447.671875, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.4314164242559857, "frac_reward_zero_std": 0.0, "grad_norm": 0.09691634774208069, "learning_rate": 4.671954588072706e-06, "loss": -0.0244, "num_tokens": 153407661.0, "reward": 3.785538673400879, "reward_std": 1.360426902770996, "rewards/accuracy_reward/mean": 3.25, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.5355386734008789, "rewards/ngram_similarity_reward/std": 0.19325245916843414, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 741.0, "completions/max_terminated_length": 741.0, "completions/mean_length": 524.9375, "completions/mean_terminated_length": 524.9375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.4318639516670396, "frac_reward_zero_std": 0.0, "grad_norm": 0.06560735404491425, "learning_rate": 4.671088483186478e-06, "loss": -0.0118, "num_tokens": 153565225.0, "reward": 4.425906658172607, "reward_std": 0.6114445328712463, "rewards/accuracy_reward/mean": 3.734375, "rewards/accuracy_reward/std": 2.969379186630249, "rewards/ngram_similarity_reward/mean": 0.6915316581726074, "rewards/ngram_similarity_reward/std": 0.37951141595840454, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 424.03125, "completions/mean_terminated_length": 424.03125, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.43231147907809353, "frac_reward_zero_std": 0.0, "grad_norm": 0.080103300511837, "learning_rate": 4.670221326611777e-06, "loss": 0.0107, "num_tokens": 153686011.0, "reward": 4.409912109375, "reward_std": 0.8295788168907166, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.7849124073982239, "rewards/ngram_similarity_reward/std": 0.2863464951515198, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 448.25, "completions/mean_terminated_length": 448.25, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.43275900648914745, "frac_reward_zero_std": 0.0, "grad_norm": 0.08570236712694168, "learning_rate": 4.6693531188233195e-06, "loss": 0.0356, "num_tokens": 153882811.0, "reward": 5.434075355529785, "reward_std": 1.3801506757736206, "rewards/accuracy_reward/mean": 4.734375, "rewards/accuracy_reward/std": 2.04506516456604, "rewards/ngram_similarity_reward/mean": 0.6996999382972717, "rewards/ngram_similarity_reward/std": 0.3544958233833313, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 449.515625, "completions/mean_terminated_length": 449.515625, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.43320653390020136, "frac_reward_zero_std": 0.0, "grad_norm": 0.07121328264474869, "learning_rate": 4.668483860296405e-06, "loss": 0.0139, "num_tokens": 154036332.0, "reward": 4.247105598449707, "reward_std": 1.1307997703552246, "rewards/accuracy_reward/mean": 3.53125, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.7158555388450623, "rewards/ngram_similarity_reward/std": 0.3953235149383545, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 474.28125, "completions/mean_terminated_length": 474.28125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.43365406131125533, "frac_reward_zero_std": 0.0, "grad_norm": 0.07322057336568832, "learning_rate": 4.667613551506901e-06, "loss": -0.0341, "num_tokens": 154164830.0, "reward": 3.941260576248169, "reward_std": 0.8153356909751892, "rewards/accuracy_reward/mean": 3.53125, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.4100106358528137, "rewards/ngram_similarity_reward/std": 0.2338978499174118, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 558.0625, "completions/mean_terminated_length": 558.0625, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.43410158872230925, "frac_reward_zero_std": 0.0, "grad_norm": 0.06403569877147675, "learning_rate": 4.666742192931252e-06, "loss": 0.0144, "num_tokens": 154337666.0, "reward": 6.299419403076172, "reward_std": 0.09389565885066986, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.7994194626808167, "rewards/ngram_similarity_reward/std": 0.17963354289531708, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 394.890625, "completions/mean_terminated_length": 394.890625, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.43454911613336317, "frac_reward_zero_std": 0.0, "grad_norm": 0.08646032214164734, "learning_rate": 4.665869785046481e-06, "loss": 0.0464, "num_tokens": 154472635.0, "reward": 2.6723878383636475, "reward_std": 1.3516616821289062, "rewards/accuracy_reward/mean": 2.21875, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.4536377191543579, "rewards/ngram_similarity_reward/std": 0.26994383335113525, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 435.046875, "completions/mean_terminated_length": 435.046875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.4349966435444171, "frac_reward_zero_std": 0.0, "grad_norm": 0.08001197129487991, "learning_rate": 4.664996328330181e-06, "loss": 0.0315, "num_tokens": 154654654.0, "reward": 2.80649471282959, "reward_std": 0.6137222647666931, "rewards/accuracy_reward/mean": 2.3125, "rewards/accuracy_reward/std": 3.126309394836426, "rewards/ngram_similarity_reward/mean": 0.49399474263191223, "rewards/ngram_similarity_reward/std": 0.3272567093372345, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1090.0, "completions/max_terminated_length": 1090.0, "completions/mean_length": 455.8125, "completions/mean_terminated_length": 455.8125, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.435444170955471, "frac_reward_zero_std": 0.0, "grad_norm": 0.0852704867720604, "learning_rate": 4.66412182326052e-06, "loss": -0.0294, "num_tokens": 154786994.0, "reward": 3.906424045562744, "reward_std": 1.2724534273147583, "rewards/accuracy_reward/mean": 3.40625, "rewards/accuracy_reward/std": 2.920745372772217, "rewards/ngram_similarity_reward/mean": 0.5001741051673889, "rewards/ngram_similarity_reward/std": 0.36396166682243347, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 423.453125, "completions/mean_terminated_length": 423.453125, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.43589169836652497, "frac_reward_zero_std": 0.0, "grad_norm": 0.08183480054140091, "learning_rate": 4.663246270316243e-06, "loss": 0.026, "num_tokens": 154937359.0, "reward": 3.4259989261627197, "reward_std": 0.7067916989326477, "rewards/accuracy_reward/mean": 2.78125, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.6447486877441406, "rewards/ngram_similarity_reward/std": 0.29538872838020325, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 441.4375, "completions/mean_terminated_length": 441.4375, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.4363392257775789, "frac_reward_zero_std": 0.0, "grad_norm": 0.07977086305618286, "learning_rate": 4.662369669976663e-06, "loss": -0.0062, "num_tokens": 155121643.0, "reward": 2.9232845306396484, "reward_std": 0.5824289321899414, "rewards/accuracy_reward/mean": 2.3125, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.6107844710350037, "rewards/ngram_similarity_reward/std": 0.18239139020442963, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 419.3125, "completions/mean_terminated_length": 419.3125, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.4367867531886328, "frac_reward_zero_std": 0.0, "grad_norm": 0.08666278421878815, "learning_rate": 4.661492022721672e-06, "loss": 0.0297, "num_tokens": 155272319.0, "reward": 1.7772903442382812, "reward_std": 1.3866727352142334, "rewards/accuracy_reward/mean": 1.28125, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.496040403842926, "rewards/ngram_similarity_reward/std": 0.1825767606496811, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1034.0, "completions/max_terminated_length": 1034.0, "completions/mean_length": 550.171875, "completions/mean_terminated_length": 550.171875, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.4372342805996867, "frac_reward_zero_std": 0.0, "grad_norm": 0.06390845775604248, "learning_rate": 4.660613329031733e-06, "loss": -0.0196, "num_tokens": 155429210.0, "reward": 5.810588836669922, "reward_std": 0.7339605093002319, "rewards/accuracy_reward/mean": 5.28125, "rewards/accuracy_reward/std": 1.227576732635498, "rewards/ngram_similarity_reward/mean": 0.5293385982513428, "rewards/ngram_similarity_reward/std": 0.27086836099624634, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 440.84375, "completions/mean_terminated_length": 440.84375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.43768180801074064, "frac_reward_zero_std": 0.0, "grad_norm": 0.07718484103679657, "learning_rate": 4.6597335893878795e-06, "loss": 0.0143, "num_tokens": 155596400.0, "reward": 4.276413917541504, "reward_std": 0.46639174222946167, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.37016376852989197, "rewards/ngram_similarity_reward/std": 0.30691027641296387, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 503.40625, "completions/mean_terminated_length": 503.40625, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.4381293354217946, "frac_reward_zero_std": 0.0, "grad_norm": 0.06703373044729233, "learning_rate": 4.6588528042717204e-06, "loss": -0.0112, "num_tokens": 155720618.0, "reward": 4.702371120452881, "reward_std": 1.5032615661621094, "rewards/accuracy_reward/mean": 4.1875, "rewards/accuracy_reward/std": 2.5, "rewards/ngram_similarity_reward/mean": 0.5148714184761047, "rewards/ngram_similarity_reward/std": 0.2904004454612732, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 576.984375, "completions/mean_terminated_length": 576.984375, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.4385768628328485, "frac_reward_zero_std": 0.0, "grad_norm": 0.05834938958287239, "learning_rate": 4.657970974165438e-06, "loss": 0.0058, "num_tokens": 155861065.0, "reward": 3.6889820098876953, "reward_std": 0.888426661491394, "rewards/accuracy_reward/mean": 3.0625, "rewards/accuracy_reward/std": 2.9700891971588135, "rewards/ngram_similarity_reward/mean": 0.6264820098876953, "rewards/ngram_similarity_reward/std": 0.18658064305782318, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 473.0625, "completions/mean_terminated_length": 473.0625, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.43902439024390244, "frac_reward_zero_std": 0.0, "grad_norm": 0.06484103947877884, "learning_rate": 4.6570880995517835e-06, "loss": 0.0014, "num_tokens": 156040989.0, "reward": 4.436846733093262, "reward_std": 0.5330394506454468, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.5305963754653931, "rewards/ngram_similarity_reward/std": 0.3162103593349457, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 502.78125, "completions/mean_terminated_length": 502.78125, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.43947191765495636, "frac_reward_zero_std": 0.0, "grad_norm": 0.06592470407485962, "learning_rate": 4.656204180914082e-06, "loss": 0.0361, "num_tokens": 156183343.0, "reward": 4.8697404861450195, "reward_std": 1.4862189292907715, "rewards/accuracy_reward/mean": 4.34375, "rewards/accuracy_reward/std": 2.4314002990722656, "rewards/ngram_similarity_reward/mean": 0.5259901285171509, "rewards/ngram_similarity_reward/std": 0.2956460118293762, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 583.34375, "completions/mean_terminated_length": 583.34375, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.4399194450660103, "frac_reward_zero_std": 0.0, "grad_norm": 0.059540845453739166, "learning_rate": 4.655319218736229e-06, "loss": 0.0079, "num_tokens": 156341733.0, "reward": 3.7660622596740723, "reward_std": 1.3947157859802246, "rewards/accuracy_reward/mean": 2.96875, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.7973122000694275, "rewards/ngram_similarity_reward/std": 0.28380465507507324, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 376.90625, "completions/mean_terminated_length": 376.90625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.44036697247706424, "frac_reward_zero_std": 0.0, "grad_norm": 0.09604540467262268, "learning_rate": 4.654433213502691e-06, "loss": -0.0188, "num_tokens": 156451903.0, "reward": 5.35115909576416, "reward_std": 0.9664373993873596, "rewards/accuracy_reward/mean": 4.65625, "rewards/accuracy_reward/std": 2.102294683456421, "rewards/ngram_similarity_reward/mean": 0.6949087977409363, "rewards/ngram_similarity_reward/std": 0.3469395935535431, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 436.890625, "completions/mean_terminated_length": 436.890625, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.44081449988811816, "frac_reward_zero_std": 0.0, "grad_norm": 0.08597588539123535, "learning_rate": 4.653546165698508e-06, "loss": 0.0151, "num_tokens": 156678072.0, "reward": 5.017669200897217, "reward_std": 0.8949160575866699, "rewards/accuracy_reward/mean": 4.5625, "rewards/accuracy_reward/std": 2.195775270462036, "rewards/ngram_similarity_reward/mean": 0.4551692605018616, "rewards/ngram_similarity_reward/std": 0.2327612191438675, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 419.03125, "completions/mean_terminated_length": 419.03125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.4412620272991721, "frac_reward_zero_std": 0.0, "grad_norm": 0.08380001038312912, "learning_rate": 4.652658075809289e-06, "loss": -0.001, "num_tokens": 156833658.0, "reward": 1.934064507484436, "reward_std": 1.6516462564468384, "rewards/accuracy_reward/mean": 1.46875, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.46531447768211365, "rewards/ngram_similarity_reward/std": 0.20871390402317047, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 319.515625, "completions/mean_terminated_length": 319.515625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.441709554710226, "frac_reward_zero_std": 0.25, "grad_norm": 0.07714425027370453, "learning_rate": 4.651768944321212e-06, "loss": -0.0181, "num_tokens": 156956747.0, "reward": 5.239560127258301, "reward_std": 1.2817769050598145, "rewards/accuracy_reward/mean": 4.375, "rewards/accuracy_reward/std": 2.3603873252868652, "rewards/ngram_similarity_reward/mean": 0.8645603656768799, "rewards/ngram_similarity_reward/std": 0.3636215925216675, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 497.078125, "completions/mean_terminated_length": 497.078125, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.4421570821212799, "frac_reward_zero_std": 0.0, "grad_norm": 0.05398174375295639, "learning_rate": 4.650878771721028e-06, "loss": 0.0073, "num_tokens": 157094496.0, "reward": 4.853190898895264, "reward_std": 0.1744401752948761, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.8531908988952637, "rewards/ngram_similarity_reward/std": 0.20298880338668823, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 389.234375, "completions/mean_terminated_length": 389.234375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.4426046095323339, "frac_reward_zero_std": 0.0, "grad_norm": 0.09710188210010529, "learning_rate": 4.649987558496056e-06, "loss": -0.0053, "num_tokens": 157235439.0, "reward": 5.915307998657227, "reward_std": 1.104044795036316, "rewards/accuracy_reward/mean": 5.125, "rewards/accuracy_reward/std": 1.4638501405715942, "rewards/ngram_similarity_reward/mean": 0.7903082370758057, "rewards/ngram_similarity_reward/std": 0.3099243938922882, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 407.921875, "completions/mean_terminated_length": 407.921875, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.4430521369433878, "frac_reward_zero_std": 0.0, "grad_norm": 0.10357832908630371, "learning_rate": 4.649095305134186e-06, "loss": 0.0067, "num_tokens": 157474378.0, "reward": 1.24918532371521, "reward_std": 1.3662900924682617, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 2.2815253734588623, "rewards/ngram_similarity_reward/mean": 0.71793532371521, "rewards/ngram_similarity_reward/std": 0.2971068024635315, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1197.0, "completions/max_terminated_length": 1197.0, "completions/mean_length": 626.25, "completions/mean_terminated_length": 626.25, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.4434996643544417, "frac_reward_zero_std": 0.0, "grad_norm": 0.048259276896715164, "learning_rate": 4.648202012123875e-06, "loss": 0.0095, "num_tokens": 157588394.0, "reward": 4.237071990966797, "reward_std": 0.8555949926376343, "rewards/accuracy_reward/mean": 3.796875, "rewards/accuracy_reward/std": 2.746886730194092, "rewards/ngram_similarity_reward/mean": 0.4401967227458954, "rewards/ngram_similarity_reward/std": 0.20750321447849274, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 501.796875, "completions/mean_terminated_length": 501.796875, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.44394719176549563, "frac_reward_zero_std": 0.0, "grad_norm": 0.07116010785102844, "learning_rate": 4.647307679954155e-06, "loss": 0.0502, "num_tokens": 157721405.0, "reward": 4.5370049476623535, "reward_std": 0.7521318793296814, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.9120049476623535, "rewards/ngram_similarity_reward/std": 0.13152669370174408, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 391.984375, "completions/mean_terminated_length": 391.984375, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.44439471917654955, "frac_reward_zero_std": 0.0, "grad_norm": 0.09845888614654541, "learning_rate": 4.646412309114618e-06, "loss": -0.0157, "num_tokens": 157885708.0, "reward": 4.514744758605957, "reward_std": 0.8385021686553955, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5147448182106018, "rewards/ngram_similarity_reward/std": 0.26765215396881104, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 446.546875, "completions/mean_terminated_length": 446.546875, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.4448422465876035, "frac_reward_zero_std": 0.0, "grad_norm": 0.07439401745796204, "learning_rate": 4.645515900095432e-06, "loss": -0.0023, "num_tokens": 158054975.0, "reward": 3.3904080390930176, "reward_std": 1.1648640632629395, "rewards/accuracy_reward/mean": 2.875, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.5154081583023071, "rewards/ngram_similarity_reward/std": 0.28233492374420166, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 734.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 457.5625, "completions/mean_terminated_length": 457.5625, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.44528977399865743, "frac_reward_zero_std": 0.0, "grad_norm": 0.07856795191764832, "learning_rate": 4.6446184533873315e-06, "loss": -0.0044, "num_tokens": 158187299.0, "reward": 4.359292984008789, "reward_std": 1.1848640441894531, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.7342928647994995, "rewards/ngram_similarity_reward/std": 0.3862909972667694, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1095.0, "completions/max_terminated_length": 1095.0, "completions/mean_length": 538.0625, "completions/mean_terminated_length": 538.0625, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.44573730140971135, "frac_reward_zero_std": 0.0, "grad_norm": 0.0661824569106102, "learning_rate": 4.643719969481616e-06, "loss": -0.0411, "num_tokens": 158339079.0, "reward": 2.278085470199585, "reward_std": 1.0091297626495361, "rewards/accuracy_reward/mean": 1.75, "rewards/accuracy_reward/std": 3.039423704147339, "rewards/ngram_similarity_reward/mean": 0.5280854105949402, "rewards/ngram_similarity_reward/std": 0.2838883399963379, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 368.09375, "completions/mean_terminated_length": 368.09375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.44618482882076527, "frac_reward_zero_std": 0.0, "grad_norm": 0.09019217640161514, "learning_rate": 4.642820448870158e-06, "loss": -0.0171, "num_tokens": 158464269.0, "reward": 2.937518835067749, "reward_std": 0.8208221793174744, "rewards/accuracy_reward/mean": 2.484375, "rewards/accuracy_reward/std": 3.0419929027557373, "rewards/ngram_similarity_reward/mean": 0.4531439542770386, "rewards/ngram_similarity_reward/std": 0.2831995189189911, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 880.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 474.5625, "completions/mean_terminated_length": 474.5625, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.4466323562318192, "frac_reward_zero_std": 0.0, "grad_norm": 0.08046316355466843, "learning_rate": 4.641919892045393e-06, "loss": -0.0133, "num_tokens": 158589153.0, "reward": 2.7983765602111816, "reward_std": 0.7150464653968811, "rewards/accuracy_reward/mean": 2.25, "rewards/accuracy_reward/std": 3.0860671997070312, "rewards/ngram_similarity_reward/mean": 0.5483765006065369, "rewards/ngram_similarity_reward/std": 0.31970739364624023, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 541.28125, "completions/mean_terminated_length": 541.28125, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.4470798836428731, "frac_reward_zero_std": 0.0, "grad_norm": 0.0517701581120491, "learning_rate": 4.641018299500324e-06, "loss": 0.0304, "num_tokens": 158717219.0, "reward": 4.679778099060059, "reward_std": 0.1837746500968933, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6797779202461243, "rewards/ngram_similarity_reward/std": 0.29137223958969116, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 435.109375, "completions/mean_terminated_length": 435.109375, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.44752741105392707, "frac_reward_zero_std": 0.0, "grad_norm": 0.08018787950277328, "learning_rate": 4.640115671728527e-06, "loss": -0.0115, "num_tokens": 158882922.0, "reward": 4.042120456695557, "reward_std": 1.1525325775146484, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.3233703374862671, "rewards/ngram_similarity_reward/std": 0.1967499703168869, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 518.046875, "completions/mean_terminated_length": 518.046875, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.447974938464981, "frac_reward_zero_std": 0.0, "grad_norm": 0.06527865678071976, "learning_rate": 4.639212009224135e-06, "loss": 0.0079, "num_tokens": 159055997.0, "reward": 3.4171600341796875, "reward_std": 0.47780343890190125, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.8234100341796875, "rewards/ngram_similarity_reward/std": 0.2530917525291443, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 401.171875, "completions/mean_terminated_length": 401.171875, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.4484224658760349, "frac_reward_zero_std": 0.0, "grad_norm": 0.08057080209255219, "learning_rate": 4.638307312481856e-06, "loss": -0.0164, "num_tokens": 159218520.0, "reward": 6.217398643493652, "reward_std": 0.13007661700248718, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.7173987627029419, "rewards/ngram_similarity_reward/std": 0.21581734716892242, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 424.640625, "completions/mean_terminated_length": 424.640625, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.4488699932870888, "frac_reward_zero_std": 0.0, "grad_norm": 0.08946868032217026, "learning_rate": 4.637401581996961e-06, "loss": 0.0141, "num_tokens": 159366369.0, "reward": 3.845090866088867, "reward_std": 1.2513779401779175, "rewards/accuracy_reward/mean": 3.15625, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.6888412237167358, "rewards/ngram_similarity_reward/std": 0.29713091254234314, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 347.984375, "completions/mean_terminated_length": 347.984375, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.44931752069814274, "frac_reward_zero_std": 0.0, "grad_norm": 0.09654480218887329, "learning_rate": 4.636494818265284e-06, "loss": -0.0059, "num_tokens": 159502880.0, "reward": 2.747676134109497, "reward_std": 1.8483915328979492, "rewards/accuracy_reward/mean": 2.03125, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.7164261341094971, "rewards/ngram_similarity_reward/std": 0.2764227092266083, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 390.171875, "completions/mean_terminated_length": 390.171875, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.4497650481091967, "frac_reward_zero_std": 0.0, "grad_norm": 0.07983547449111938, "learning_rate": 4.63558702178323e-06, "loss": -0.0131, "num_tokens": 159688955.0, "reward": 3.516596555709839, "reward_std": 0.8062986731529236, "rewards/accuracy_reward/mean": 2.96875, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.547846794128418, "rewards/ngram_similarity_reward/std": 0.21161513030529022, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 359.359375, "completions/mean_terminated_length": 359.359375, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.4502125755202506, "frac_reward_zero_std": 0.0, "grad_norm": 0.09040001779794693, "learning_rate": 4.634678193047765e-06, "loss": 0.0201, "num_tokens": 159850962.0, "reward": 4.5764336585998535, "reward_std": 1.0174980163574219, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.7639338970184326, "rewards/ngram_similarity_reward/std": 0.2882556617259979, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 407.59375, "completions/mean_terminated_length": 407.59375, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.45066010293130454, "frac_reward_zero_std": 0.0, "grad_norm": 0.08074507862329483, "learning_rate": 4.633768332556424e-06, "loss": -0.0025, "num_tokens": 159981528.0, "reward": 3.310504674911499, "reward_std": 1.0501964092254639, "rewards/accuracy_reward/mean": 2.6875, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.6230047941207886, "rewards/ngram_similarity_reward/std": 0.3178018629550934, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 416.125, "completions/mean_terminated_length": 390.2222595214844, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.45110763034235846, "frac_reward_zero_std": 0.0, "grad_norm": 0.10865113884210587, "learning_rate": 4.632857440807303e-06, "loss": -0.0083, "num_tokens": 160158640.0, "reward": 0.19361451268196106, "reward_std": 0.8938025832176208, "rewards/accuracy_reward/mean": -0.328125, "rewards/accuracy_reward/std": 1.0624125003814697, "rewards/ngram_similarity_reward/mean": 0.5217394828796387, "rewards/ngram_similarity_reward/std": 0.2538130581378937, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 372.3125, "completions/mean_terminated_length": 372.3125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.4515551577534124, "frac_reward_zero_std": 0.25, "grad_norm": 0.08638826757669449, "learning_rate": 4.631945518299064e-06, "loss": 0.0259, "num_tokens": 160397092.0, "reward": 4.829348087310791, "reward_std": 0.5159898400306702, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.9230982065200806, "rewards/ngram_similarity_reward/std": 0.20585164427757263, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 399.828125, "completions/mean_terminated_length": 399.828125, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.45200268516446634, "frac_reward_zero_std": 0.0, "grad_norm": 0.08075409382581711, "learning_rate": 4.631032565530935e-06, "loss": 0.013, "num_tokens": 160570809.0, "reward": 1.565451979637146, "reward_std": 0.09725593030452728, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5654520988464355, "rewards/ngram_similarity_reward/std": 0.22560539841651917, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 398.09375, "completions/mean_terminated_length": 398.09375, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.45245021257552026, "frac_reward_zero_std": 0.0, "grad_norm": 0.13738077878952026, "learning_rate": 4.630118583002706e-06, "loss": -0.0013, "num_tokens": 160764959.0, "reward": 3.276353359222412, "reward_std": 1.9264037609100342, "rewards/accuracy_reward/mean": 2.734375, "rewards/accuracy_reward/std": 3.0692648887634277, "rewards/ngram_similarity_reward/mean": 0.5419784188270569, "rewards/ngram_similarity_reward/std": 0.3680952787399292, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 807.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 657.671875, "completions/mean_terminated_length": 657.671875, "completions/min_length": 515.0, "completions/min_terminated_length": 515.0, "epoch": 0.4528977399865742, "frac_reward_zero_std": 0.0, "grad_norm": 0.05747548118233681, "learning_rate": 4.629203571214732e-06, "loss": 0.0019, "num_tokens": 160941930.0, "reward": 4.90988302230835, "reward_std": 0.7813419699668884, "rewards/accuracy_reward/mean": 4.265625, "rewards/accuracy_reward/std": 2.467195510864258, "rewards/ngram_similarity_reward/mean": 0.6442579030990601, "rewards/ngram_similarity_reward/std": 0.20877647399902344, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 470.1875, "completions/mean_terminated_length": 470.1875, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.4533452673976281, "frac_reward_zero_std": 0.0, "grad_norm": 0.09367240965366364, "learning_rate": 4.628287530667929e-06, "loss": 0.0205, "num_tokens": 161089654.0, "reward": 0.8358583450317383, "reward_std": 0.8505191206932068, "rewards/accuracy_reward/mean": 0.34375, "rewards/accuracy_reward/std": 2.102294683456421, "rewards/ngram_similarity_reward/mean": 0.49210840463638306, "rewards/ngram_similarity_reward/std": 0.35597845911979675, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 513.03125, "completions/mean_terminated_length": 513.03125, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.453792794808682, "frac_reward_zero_std": 0.0, "grad_norm": 0.07150578498840332, "learning_rate": 4.627370461863779e-06, "loss": 0.0288, "num_tokens": 161266152.0, "reward": 4.312531471252441, "reward_std": 0.986059844493866, "rewards/accuracy_reward/mean": 3.515625, "rewards/accuracy_reward/std": 2.8646292686462402, "rewards/ngram_similarity_reward/mean": 0.7969064712524414, "rewards/ngram_similarity_reward/std": 0.3095705509185791, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 401.265625, "completions/mean_terminated_length": 401.265625, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.454240322219736, "frac_reward_zero_std": 0.0, "grad_norm": 0.08253169804811478, "learning_rate": 4.626452365304327e-06, "loss": 0.0023, "num_tokens": 161433225.0, "reward": 6.062775611877441, "reward_std": 0.7393642663955688, "rewards/accuracy_reward/mean": 5.21875, "rewards/accuracy_reward/std": 1.2782522439956665, "rewards/ngram_similarity_reward/mean": 0.844025731086731, "rewards/ngram_similarity_reward/std": 0.2485669106245041, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 434.984375, "completions/mean_terminated_length": 434.984375, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.4546878496307899, "frac_reward_zero_std": 0.0, "grad_norm": 0.10803146660327911, "learning_rate": 4.625533241492177e-06, "loss": 0.0029, "num_tokens": 161669704.0, "reward": 3.029890537261963, "reward_std": 2.1167008876800537, "rewards/accuracy_reward/mean": 2.6875, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.34239059686660767, "rewards/ngram_similarity_reward/std": 0.14699290692806244, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 407.703125, "completions/mean_terminated_length": 407.703125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.4551353770418438, "frac_reward_zero_std": 0.0, "grad_norm": 0.08884494006633759, "learning_rate": 4.6246130909305e-06, "loss": -0.0355, "num_tokens": 161873925.0, "reward": 4.0044450759887695, "reward_std": 1.6529027223587036, "rewards/accuracy_reward/mean": 3.21875, "rewards/accuracy_reward/std": 2.9732606410980225, "rewards/ngram_similarity_reward/mean": 0.785695493221283, "rewards/ngram_similarity_reward/std": 0.3590853810310364, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 477.09375, "completions/mean_terminated_length": 477.09375, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.45558290445289773, "frac_reward_zero_std": 0.0, "grad_norm": 0.06873611360788345, "learning_rate": 4.623691914123025e-06, "loss": -0.0072, "num_tokens": 162011323.0, "reward": 4.906051158905029, "reward_std": 0.0931473970413208, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.9060510396957397, "rewards/ngram_similarity_reward/std": 0.19354379177093506, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 330.484375, "completions/mean_terminated_length": 330.484375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.45603043186395165, "frac_reward_zero_std": 0.25, "grad_norm": 0.08067482709884644, "learning_rate": 4.622769711574047e-06, "loss": -0.0173, "num_tokens": 162141162.0, "reward": 3.4138293266296387, "reward_std": 0.5554684996604919, "rewards/accuracy_reward/mean": 2.390625, "rewards/accuracy_reward/std": 3.0400354862213135, "rewards/ngram_similarity_reward/mean": 1.0232044458389282, "rewards/ngram_similarity_reward/std": 0.22352083027362823, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 368.375, "completions/mean_terminated_length": 368.375, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.4564779592750056, "frac_reward_zero_std": 0.0, "grad_norm": 0.08370555937290192, "learning_rate": 4.621846483788418e-06, "loss": 0.0242, "num_tokens": 162288434.0, "reward": 4.566494464874268, "reward_std": 0.437887579202652, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.6602445840835571, "rewards/ngram_similarity_reward/std": 0.2169688194990158, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 903.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 442.390625, "completions/mean_terminated_length": 442.390625, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.45692548668605953, "frac_reward_zero_std": 0.0, "grad_norm": 0.08730597794055939, "learning_rate": 4.620922231271555e-06, "loss": 0.0725, "num_tokens": 162469691.0, "reward": 2.3871564865112305, "reward_std": 0.8901073336601257, "rewards/accuracy_reward/mean": 1.9375, "rewards/accuracy_reward/std": 2.9700891971588135, "rewards/ngram_similarity_reward/mean": 0.44965660572052, "rewards/ngram_similarity_reward/std": 0.21127961575984955, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1079.0, "completions/max_terminated_length": 1079.0, "completions/mean_length": 503.8125, "completions/mean_terminated_length": 503.8125, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.45737301409711345, "frac_reward_zero_std": 0.0, "grad_norm": 0.07264433056116104, "learning_rate": 4.619996954529436e-06, "loss": -0.0433, "num_tokens": 162668127.0, "reward": 3.299483299255371, "reward_std": 0.41848915815353394, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.7057333588600159, "rewards/ngram_similarity_reward/std": 0.21536371111869812, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 467.796875, "completions/mean_terminated_length": 467.796875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.45782054150816737, "frac_reward_zero_std": 0.0, "grad_norm": 0.0833204984664917, "learning_rate": 4.619070654068595e-06, "loss": -0.0056, "num_tokens": 162825442.0, "reward": 4.313222885131836, "reward_std": 0.6884230375289917, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.5007230043411255, "rewards/ngram_similarity_reward/std": 0.2955898642539978, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 783.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 463.15625, "completions/mean_terminated_length": 463.15625, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.4582680689192213, "frac_reward_zero_std": 0.25, "grad_norm": 0.07459675520658493, "learning_rate": 4.618143330396132e-06, "loss": 0.0033, "num_tokens": 162976252.0, "reward": 2.2040796279907227, "reward_std": 0.8834795355796814, "rewards/accuracy_reward/mean": 1.640625, "rewards/accuracy_reward/std": 2.91611385345459, "rewards/ngram_similarity_reward/mean": 0.5634545683860779, "rewards/ngram_similarity_reward/std": 0.24133865535259247, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 470.9375, "completions/mean_terminated_length": 470.9375, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.45871559633027525, "frac_reward_zero_std": 0.0, "grad_norm": 0.07935074716806412, "learning_rate": 4.617214984019704e-06, "loss": 0.0048, "num_tokens": 163170760.0, "reward": 3.609093189239502, "reward_std": 0.8524174690246582, "rewards/accuracy_reward/mean": 3.25, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.35909339785575867, "rewards/ngram_similarity_reward/std": 0.259185254573822, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 512.203125, "completions/mean_terminated_length": 487.825439453125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.45916312374132917, "frac_reward_zero_std": 0.0, "grad_norm": 0.07026367634534836, "learning_rate": 4.616285615447528e-06, "loss": 0.029, "num_tokens": 163342117.0, "reward": 4.603669166564941, "reward_std": 1.6760941743850708, "rewards/accuracy_reward/mean": 3.984375, "rewards/accuracy_reward/std": 2.648702621459961, "rewards/ngram_similarity_reward/mean": 0.6192945241928101, "rewards/ngram_similarity_reward/std": 0.3728182911872864, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 412.0625, "completions/mean_terminated_length": 412.0625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.4596106511523831, "frac_reward_zero_std": 0.0, "grad_norm": 0.08737986534833908, "learning_rate": 4.615355225188383e-06, "loss": -0.0044, "num_tokens": 163482857.0, "reward": 3.2743639945983887, "reward_std": 0.7320421934127808, "rewards/accuracy_reward/mean": 2.875, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.3993642330169678, "rewards/ngram_similarity_reward/std": 0.23473213613033295, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 527.640625, "completions/mean_terminated_length": 527.640625, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.460058178563437, "frac_reward_zero_std": 0.0, "grad_norm": 0.05836676061153412, "learning_rate": 4.6144238137516055e-06, "loss": 0.0058, "num_tokens": 163624178.0, "reward": 4.261211395263672, "reward_std": 1.2127914428710938, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.6362113952636719, "rewards/ngram_similarity_reward/std": 0.21961015462875366, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 481.46875, "completions/mean_terminated_length": 481.46875, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.4605057059744909, "frac_reward_zero_std": 0.0, "grad_norm": 0.08901166915893555, "learning_rate": 4.613491381647089e-06, "loss": -0.0366, "num_tokens": 163788480.0, "reward": 2.892777919769287, "reward_std": 2.786778450012207, "rewards/accuracy_reward/mean": 2.390625, "rewards/accuracy_reward/std": 3.0400354862213135, "rewards/ngram_similarity_reward/mean": 0.5021529197692871, "rewards/ngram_similarity_reward/std": 0.19329185783863068, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 691.0, "completions/max_terminated_length": 691.0, "completions/mean_length": 411.125, "completions/mean_terminated_length": 411.125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.4609532333855449, "frac_reward_zero_std": 0.0, "grad_norm": 0.08067741245031357, "learning_rate": 4.61255792938529e-06, "loss": 0.0262, "num_tokens": 163947400.0, "reward": 3.001347064971924, "reward_std": 2.00323486328125, "rewards/accuracy_reward/mean": 2.21875, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.7825968861579895, "rewards/ngram_similarity_reward/std": 0.2769053876399994, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 455.21875, "completions/mean_terminated_length": 455.21875, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.4614007607965988, "frac_reward_zero_std": 0.0, "grad_norm": 0.06692025810480118, "learning_rate": 4.611623457477221e-06, "loss": 0.0132, "num_tokens": 164085014.0, "reward": 3.226724863052368, "reward_std": 0.6595999598503113, "rewards/accuracy_reward/mean": 2.78125, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.4454747140407562, "rewards/ngram_similarity_reward/std": 0.2540774643421173, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 358.171875, "completions/mean_terminated_length": 358.171875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.4618482882076527, "frac_reward_zero_std": 0.0, "grad_norm": 0.1052207350730896, "learning_rate": 4.610687966434451e-06, "loss": -0.0117, "num_tokens": 164198529.0, "reward": 4.967963218688965, "reward_std": 1.3966630697250366, "rewards/accuracy_reward/mean": 4.234375, "rewards/accuracy_reward/std": 2.662152051925659, "rewards/ngram_similarity_reward/mean": 0.7335888147354126, "rewards/ngram_similarity_reward/std": 0.434805691242218, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 384.359375, "completions/mean_terminated_length": 384.359375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.46229581561870664, "frac_reward_zero_std": 0.0, "grad_norm": 0.10399851948022842, "learning_rate": 4.609751456769112e-06, "loss": 0.0094, "num_tokens": 164369320.0, "reward": 3.617398262023926, "reward_std": 1.3301640748977661, "rewards/accuracy_reward/mean": 3.15625, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.4611482620239258, "rewards/ngram_similarity_reward/std": 0.3033144772052765, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 361.09375, "completions/mean_terminated_length": 361.09375, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.46274334302976056, "frac_reward_zero_std": 0.0, "grad_norm": 0.09007919579744339, "learning_rate": 4.60881392899389e-06, "loss": 0.0161, "num_tokens": 164494574.0, "reward": 4.622521877288818, "reward_std": 0.22130389511585236, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.622521698474884, "rewards/ngram_similarity_reward/std": 0.32046768069267273, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 468.03125, "completions/mean_terminated_length": 468.03125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.4631908704408145, "frac_reward_zero_std": 0.0, "grad_norm": 0.09070774167776108, "learning_rate": 4.607875383622028e-06, "loss": 0.0035, "num_tokens": 164724912.0, "reward": 2.7548575401306152, "reward_std": 1.8196587562561035, "rewards/accuracy_reward/mean": 2.03125, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.7236075401306152, "rewards/ngram_similarity_reward/std": 0.2541177272796631, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 438.015625, "completions/mean_terminated_length": 438.015625, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.46363839785186844, "frac_reward_zero_std": 0.0, "grad_norm": 0.07595244795084, "learning_rate": 4.606935821167327e-06, "loss": 0.0049, "num_tokens": 164883409.0, "reward": 6.129752159118652, "reward_std": 0.4484296143054962, "rewards/accuracy_reward/mean": 5.40625, "rewards/accuracy_reward/std": 0.7500000596046448, "rewards/ngram_similarity_reward/mean": 0.7235023975372314, "rewards/ngram_similarity_reward/std": 0.14641357958316803, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 472.3125, "completions/mean_terminated_length": 472.3125, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.46408592526292236, "frac_reward_zero_std": 0.0, "grad_norm": 0.06311800330877304, "learning_rate": 4.605995242144146e-06, "loss": -0.0237, "num_tokens": 165045765.0, "reward": 4.421064853668213, "reward_std": 2.2462971210479736, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.7960647940635681, "rewards/ngram_similarity_reward/std": 0.3977976143360138, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 466.0, "completions/mean_terminated_length": 466.0, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.4645334526739763, "frac_reward_zero_std": 0.0, "grad_norm": 0.07536507397890091, "learning_rate": 4.605053647067399e-06, "loss": 0.0244, "num_tokens": 165210757.0, "reward": 2.9640872478485107, "reward_std": 0.7854170799255371, "rewards/accuracy_reward/mean": 2.21875, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.7453370690345764, "rewards/ngram_similarity_reward/std": 0.2581217586994171, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 409.75, "completions/mean_terminated_length": 409.75, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.4649809800850302, "frac_reward_zero_std": 0.0, "grad_norm": 0.06313635408878326, "learning_rate": 4.6041110364525565e-06, "loss": 0.0, "num_tokens": 165397909.0, "reward": 6.239960670471191, "reward_std": 0.9716970920562744, "rewards/accuracy_reward/mean": 5.3125, "rewards/accuracy_reward/std": 1.0522085428237915, "rewards/ngram_similarity_reward/mean": 0.9274606704711914, "rewards/ngram_similarity_reward/std": 0.2883267104625702, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 492.453125, "completions/mean_terminated_length": 492.453125, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.4654285074960841, "frac_reward_zero_std": 0.0, "grad_norm": 0.07633720338344574, "learning_rate": 4.603167410815645e-06, "loss": -0.0282, "num_tokens": 165553186.0, "reward": 6.175039291381836, "reward_std": 0.20186586678028107, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.6750392913818359, "rewards/ngram_similarity_reward/std": 0.30154407024383545, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 391.875, "completions/mean_terminated_length": 391.875, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.4658760349071381, "frac_reward_zero_std": 0.25, "grad_norm": 0.07251864671707153, "learning_rate": 4.602222770673246e-06, "loss": 0.0544, "num_tokens": 165677898.0, "reward": 4.065640926361084, "reward_std": 0.8149410486221313, "rewards/accuracy_reward/mean": 3.53125, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.5343913435935974, "rewards/ngram_similarity_reward/std": 0.28121185302734375, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 435.578125, "completions/mean_terminated_length": 409.9841613769531, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.466323562318192, "frac_reward_zero_std": 0.25, "grad_norm": 0.06468340009450912, "learning_rate": 4.601277116542498e-06, "loss": 0.0504, "num_tokens": 165820575.0, "reward": 3.5892698764801025, "reward_std": 0.6493726372718811, "rewards/accuracy_reward/mean": 2.78125, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.8080199956893921, "rewards/ngram_similarity_reward/std": 0.3213406801223755, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 489.15625, "completions/mean_terminated_length": 464.4127197265625, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.4667710897292459, "frac_reward_zero_std": 0.0, "grad_norm": 0.07755690813064575, "learning_rate": 4.600330448941094e-06, "loss": -0.0507, "num_tokens": 166048185.0, "reward": 3.6401398181915283, "reward_std": 1.5206663608551025, "rewards/accuracy_reward/mean": 2.875, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.7651398777961731, "rewards/ngram_similarity_reward/std": 0.3352813422679901, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 448.046875, "completions/mean_terminated_length": 448.046875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.46721861714029983, "frac_reward_zero_std": 0.0, "grad_norm": 0.08614441007375717, "learning_rate": 4.599382768387282e-06, "loss": 0.0319, "num_tokens": 166185436.0, "reward": 6.143418788909912, "reward_std": 0.17175956070423126, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.6434186697006226, "rewards/ngram_similarity_reward/std": 0.3139803409576416, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 534.34375, "completions/mean_terminated_length": 534.34375, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.46766614455135375, "frac_reward_zero_std": 0.0, "grad_norm": 0.0726943388581276, "learning_rate": 4.598434075399862e-06, "loss": 0.0147, "num_tokens": 166341106.0, "reward": 1.8407868146896362, "reward_std": 0.6296216249465942, "rewards/accuracy_reward/mean": 1.1875, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.6532865762710571, "rewards/ngram_similarity_reward/std": 0.19341208040714264, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 397.578125, "completions/mean_terminated_length": 397.578125, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.4681136719624077, "frac_reward_zero_std": 0.0, "grad_norm": 0.07925833016633987, "learning_rate": 4.597484370498193e-06, "loss": 0.032, "num_tokens": 166481751.0, "reward": 6.3092122077941895, "reward_std": 0.528394341468811, "rewards/accuracy_reward/mean": 5.40625, "rewards/accuracy_reward/std": 0.7500000596046448, "rewards/ngram_similarity_reward/mean": 0.9029619693756104, "rewards/ngram_similarity_reward/std": 0.269043505191803, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 920.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 593.984375, "completions/mean_terminated_length": 593.984375, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.46856119937346163, "frac_reward_zero_std": 0.0, "grad_norm": 0.06135343015193939, "learning_rate": 4.596533654202183e-06, "loss": -0.016, "num_tokens": 166653334.0, "reward": 4.515833377838135, "reward_std": 0.46059608459472656, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.6095834374427795, "rewards/ngram_similarity_reward/std": 0.35220563411712646, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 435.40625, "completions/mean_terminated_length": 435.40625, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.46900872678451555, "frac_reward_zero_std": 0.0, "grad_norm": 0.08945198357105255, "learning_rate": 4.595581927032296e-06, "loss": -0.0074, "num_tokens": 166823584.0, "reward": 4.23256254196167, "reward_std": 0.5420059561729431, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.42006272077560425, "rewards/ngram_similarity_reward/std": 0.21719275414943695, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 568.578125, "completions/mean_terminated_length": 568.578125, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.46945625419556947, "frac_reward_zero_std": 0.0, "grad_norm": 0.07008595019578934, "learning_rate": 4.594629189509552e-06, "loss": 0.022, "num_tokens": 167063989.0, "reward": 1.4306988716125488, "reward_std": 0.5476556420326233, "rewards/accuracy_reward/mean": 1.046875, "rewards/accuracy_reward/std": 2.7076005935668945, "rewards/ngram_similarity_reward/mean": 0.3838239908218384, "rewards/ngram_similarity_reward/std": 0.24899296462535858, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 427.15625, "completions/mean_terminated_length": 427.15625, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.4699037816066234, "frac_reward_zero_std": 0.0, "grad_norm": 0.07404383271932602, "learning_rate": 4.59367544215552e-06, "loss": 0.0081, "num_tokens": 167206015.0, "reward": 2.6730198860168457, "reward_std": 0.8190189599990845, "rewards/accuracy_reward/mean": 2.125, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.5480199456214905, "rewards/ngram_similarity_reward/std": 0.30111372470855713, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 516.421875, "completions/mean_terminated_length": 516.421875, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.47035130901767735, "frac_reward_zero_std": 0.0, "grad_norm": 0.06461696326732635, "learning_rate": 4.5927206854923214e-06, "loss": -0.0275, "num_tokens": 167358298.0, "reward": 4.086188793182373, "reward_std": 1.0312371253967285, "rewards/accuracy_reward/mean": 3.390625, "rewards/accuracy_reward/std": 2.944552183151245, "rewards/ngram_similarity_reward/mean": 0.6955640316009521, "rewards/ngram_similarity_reward/std": 0.29851576685905457, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 423.765625, "completions/mean_terminated_length": 423.765625, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.47079883642873127, "frac_reward_zero_std": 0.0, "grad_norm": 0.0901014432311058, "learning_rate": 4.591764920042635e-06, "loss": 0.0026, "num_tokens": 167517691.0, "reward": 4.31773567199707, "reward_std": 0.7875552177429199, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.6927354335784912, "rewards/ngram_similarity_reward/std": 0.29080554842948914, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 440.171875, "completions/mean_terminated_length": 440.171875, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.4712463638397852, "frac_reward_zero_std": 0.0, "grad_norm": 0.06958474963903427, "learning_rate": 4.590808146329687e-06, "loss": -0.0019, "num_tokens": 167686438.0, "reward": 4.321617603302002, "reward_std": 0.8348760008811951, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.6966177225112915, "rewards/ngram_similarity_reward/std": 0.2306501567363739, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 398.9375, "completions/mean_terminated_length": 398.9375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.4716938912508391, "frac_reward_zero_std": 0.0, "grad_norm": 0.08572255074977875, "learning_rate": 4.589850364877258e-06, "loss": 0.0102, "num_tokens": 167821554.0, "reward": 5.629925727844238, "reward_std": 1.23585844039917, "rewards/accuracy_reward/mean": 4.84375, "rewards/accuracy_reward/std": 1.8874586820602417, "rewards/ngram_similarity_reward/mean": 0.7861757874488831, "rewards/ngram_similarity_reward/std": 0.3273070156574249, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 373.59375, "completions/mean_terminated_length": 373.59375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.472141418661893, "frac_reward_zero_std": 0.25, "grad_norm": 0.07060826569795609, "learning_rate": 4.588891576209682e-06, "loss": -0.0275, "num_tokens": 167970728.0, "reward": 3.7303578853607178, "reward_std": 1.4070768356323242, "rewards/accuracy_reward/mean": 3.0625, "rewards/accuracy_reward/std": 2.9700891971588135, "rewards/ngram_similarity_reward/mean": 0.6678580641746521, "rewards/ngram_similarity_reward/std": 0.2841942608356476, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 380.171875, "completions/mean_terminated_length": 380.171875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.472588946072947, "frac_reward_zero_std": 0.25, "grad_norm": 0.08772917836904526, "learning_rate": 4.587931780851838e-06, "loss": -0.0164, "num_tokens": 168108131.0, "reward": 4.362936019897461, "reward_std": 0.8165090084075928, "rewards/accuracy_reward/mean": 3.53125, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.8316863775253296, "rewards/ngram_similarity_reward/std": 0.3584800958633423, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 395.703125, "completions/mean_terminated_length": 395.703125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.4730364734840009, "frac_reward_zero_std": 0.25, "grad_norm": 0.07608242332935333, "learning_rate": 4.586970979329163e-06, "loss": 0.003, "num_tokens": 168278864.0, "reward": 5.808627128601074, "reward_std": 0.8873310685157776, "rewards/accuracy_reward/mean": 5.03125, "rewards/accuracy_reward/std": 1.6229382753372192, "rewards/ngram_similarity_reward/mean": 0.7773770689964294, "rewards/ngram_similarity_reward/std": 0.38519734144210815, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 423.8125, "completions/mean_terminated_length": 423.8125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.4734840008950548, "frac_reward_zero_std": 0.0, "grad_norm": 0.0858198031783104, "learning_rate": 4.586009172167642e-06, "loss": 0.025, "num_tokens": 168403908.0, "reward": 4.685708522796631, "reward_std": 0.6626394987106323, "rewards/accuracy_reward/mean": 4.171875, "rewards/accuracy_reward/std": 2.5326733589172363, "rewards/ngram_similarity_reward/mean": 0.5138335227966309, "rewards/ngram_similarity_reward/std": 0.2579622268676758, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 491.0625, "completions/mean_terminated_length": 491.0625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.47393152830610874, "frac_reward_zero_std": 0.25, "grad_norm": 0.05656769126653671, "learning_rate": 4.58504635989381e-06, "loss": -0.0338, "num_tokens": 168586472.0, "reward": 3.1912267208099365, "reward_std": 2.096855401992798, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6912267804145813, "rewards/ngram_similarity_reward/std": 0.4442880153656006, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 562.1875, "completions/mean_terminated_length": 562.1875, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.47437905571716266, "frac_reward_zero_std": 0.0, "grad_norm": 0.06605823338031769, "learning_rate": 4.584082543034751e-06, "loss": 0.0002, "num_tokens": 168722388.0, "reward": 6.094749450683594, "reward_std": 0.7265222072601318, "rewards/accuracy_reward/mean": 5.28125, "rewards/accuracy_reward/std": 1.227576732635498, "rewards/ngram_similarity_reward/mean": 0.8134993314743042, "rewards/ngram_similarity_reward/std": 0.23808826506137848, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 376.84375, "completions/mean_terminated_length": 376.84375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.47482658312821663, "frac_reward_zero_std": 0.0, "grad_norm": 0.08780424296855927, "learning_rate": 4.583117722118104e-06, "loss": 0.0191, "num_tokens": 168874474.0, "reward": 5.824024677276611, "reward_std": 1.1161339282989502, "rewards/accuracy_reward/mean": 5.203125, "rewards/accuracy_reward/std": 1.3531819581985474, "rewards/ngram_similarity_reward/mean": 0.6208996772766113, "rewards/ngram_similarity_reward/std": 0.3165114223957062, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 340.25, "completions/mean_terminated_length": 340.25, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.47527411053927054, "frac_reward_zero_std": 0.25, "grad_norm": 0.09583883732557297, "learning_rate": 4.582151897672054e-06, "loss": 0.018, "num_tokens": 169064650.0, "reward": 4.0373735427856445, "reward_std": 1.3963215351104736, "rewards/accuracy_reward/mean": 3.296875, "rewards/accuracy_reward/std": 2.97171688079834, "rewards/ngram_similarity_reward/mean": 0.7404987812042236, "rewards/ngram_similarity_reward/std": 0.3559059500694275, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 442.640625, "completions/mean_terminated_length": 442.640625, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.47572163795032446, "frac_reward_zero_std": 0.0, "grad_norm": 0.0804290696978569, "learning_rate": 4.581185070225335e-06, "loss": -0.0033, "num_tokens": 169238163.0, "reward": 1.472453236579895, "reward_std": 0.11882346868515015, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.47245335578918457, "rewards/ngram_similarity_reward/std": 0.15722110867500305, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 403.5625, "completions/mean_terminated_length": 403.5625, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.4761691653613784, "frac_reward_zero_std": 0.0, "grad_norm": 0.1044817566871643, "learning_rate": 4.5802172403072295e-06, "loss": 0.0141, "num_tokens": 169367575.0, "reward": 2.712646722793579, "reward_std": 1.088694453239441, "rewards/accuracy_reward/mean": 2.21875, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.4938966631889343, "rewards/ngram_similarity_reward/std": 0.24072708189487457, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 868.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 403.59375, "completions/mean_terminated_length": 403.59375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.4766166927724323, "frac_reward_zero_std": 0.0, "grad_norm": 0.1042836606502533, "learning_rate": 4.579248408447573e-06, "loss": 0.0357, "num_tokens": 169584109.0, "reward": 4.310004234313965, "reward_std": 1.4007518291473389, "rewards/accuracy_reward/mean": 3.890625, "rewards/accuracy_reward/std": 2.6998953819274902, "rewards/ngram_similarity_reward/mean": 0.4193789064884186, "rewards/ngram_similarity_reward/std": 0.2808838486671448, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 462.578125, "completions/mean_terminated_length": 462.578125, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.47706422018348627, "frac_reward_zero_std": 0.0, "grad_norm": 0.06781932711601257, "learning_rate": 4.578278575176745e-06, "loss": -0.0018, "num_tokens": 169706882.0, "reward": 3.504133701324463, "reward_std": 0.7482330799102783, "rewards/accuracy_reward/mean": 2.78125, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.722883939743042, "rewards/ngram_similarity_reward/std": 0.3692457675933838, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 452.9375, "completions/mean_terminated_length": 452.9375, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.4775117475945402, "frac_reward_zero_std": 0.0, "grad_norm": 0.1026446670293808, "learning_rate": 4.577307741025676e-06, "loss": -0.04, "num_tokens": 169957998.0, "reward": -0.06518572568893433, "reward_std": 0.12394097447395325, "rewards/accuracy_reward/mean": -0.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.4348142743110657, "rewards/ngram_similarity_reward/std": 0.1759442389011383, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 387.375, "completions/mean_terminated_length": 387.375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.4779592750055941, "frac_reward_zero_std": 0.25, "grad_norm": 0.08201418817043304, "learning_rate": 4.5763359065258424e-06, "loss": -0.0178, "num_tokens": 170062390.0, "reward": 4.6426544189453125, "reward_std": 0.1418628841638565, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6426544189453125, "rewards/ngram_similarity_reward/std": 0.25741419196128845, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 927.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 518.0, "completions/mean_terminated_length": 518.0, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.478406802416648, "frac_reward_zero_std": 0.0, "grad_norm": 0.07468228787183762, "learning_rate": 4.57536307220927e-06, "loss": 0.0201, "num_tokens": 170227590.0, "reward": 1.8692747354507446, "reward_std": 1.0762724876403809, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.494274765253067, "rewards/ngram_similarity_reward/std": 0.24930661916732788, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 741.0, "completions/max_terminated_length": 741.0, "completions/mean_length": 437.890625, "completions/mean_terminated_length": 437.890625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.47885432982770193, "frac_reward_zero_std": 0.0, "grad_norm": 0.10245736688375473, "learning_rate": 4.574389238608531e-06, "loss": 0.0663, "num_tokens": 170398463.0, "reward": 2.890944480895996, "reward_std": 0.4334186911582947, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.4846946597099304, "rewards/ngram_similarity_reward/std": 0.1906774491071701, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 403.921875, "completions/mean_terminated_length": 403.921875, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.47930185723875585, "frac_reward_zero_std": 0.0, "grad_norm": 0.07545072585344315, "learning_rate": 4.573414406256743e-06, "loss": 0.0008, "num_tokens": 170536826.0, "reward": 5.844933986663818, "reward_std": 0.5657603740692139, "rewards/accuracy_reward/mean": 5.3125, "rewards/accuracy_reward/std": 1.0522085428237915, "rewards/ngram_similarity_reward/mean": 0.5324341654777527, "rewards/ngram_similarity_reward/std": 0.27465152740478516, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 476.65625, "completions/mean_terminated_length": 451.7143249511719, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.4797493846498098, "frac_reward_zero_std": 0.0, "grad_norm": 0.08169378340244293, "learning_rate": 4.572438575687576e-06, "loss": -0.0067, "num_tokens": 170660932.0, "reward": 3.226865768432617, "reward_std": 0.9055585861206055, "rewards/accuracy_reward/mean": 2.6875, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.5393656492233276, "rewards/ngram_similarity_reward/std": 0.2904561460018158, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 468.84375, "completions/mean_terminated_length": 468.84375, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.48019691206086373, "frac_reward_zero_std": 0.25, "grad_norm": 0.06678974628448486, "learning_rate": 4.571461747435239e-06, "loss": 0.0046, "num_tokens": 170784650.0, "reward": 4.711894989013672, "reward_std": 0.4199894070625305, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.8056451678276062, "rewards/ngram_similarity_reward/std": 0.27729812264442444, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 455.140625, "completions/mean_terminated_length": 455.140625, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.48064443947191765, "frac_reward_zero_std": 0.0, "grad_norm": 0.0764194130897522, "learning_rate": 4.570483922034493e-06, "loss": -0.0194, "num_tokens": 170965043.0, "reward": 3.971428155899048, "reward_std": 0.8636301755905151, "rewards/accuracy_reward/mean": 3.4375, "rewards/accuracy_reward/std": 2.872281312942505, "rewards/ngram_similarity_reward/mean": 0.5339281558990479, "rewards/ngram_similarity_reward/std": 0.31490033864974976, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1042.0, "completions/max_terminated_length": 1042.0, "completions/mean_length": 592.734375, "completions/mean_terminated_length": 592.734375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.48109196688297157, "frac_reward_zero_std": 0.0, "grad_norm": 0.07808054238557816, "learning_rate": 4.569505100020642e-06, "loss": 0.0146, "num_tokens": 171111330.0, "reward": 1.527955174446106, "reward_std": 0.13413190841674805, "rewards/accuracy_reward/mean": 0.984375, "rewards/accuracy_reward/std": 2.6306629180908203, "rewards/ngram_similarity_reward/mean": 0.543580174446106, "rewards/ngram_similarity_reward/std": 0.12489572167396545, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 454.515625, "completions/mean_terminated_length": 454.515625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.4815394942940255, "frac_reward_zero_std": 0.0, "grad_norm": 0.09512680768966675, "learning_rate": 4.568525281929536e-06, "loss": 0.0293, "num_tokens": 171286947.0, "reward": 5.5437164306640625, "reward_std": 1.2956647872924805, "rewards/accuracy_reward/mean": 4.921875, "rewards/accuracy_reward/std": 1.8153201341629028, "rewards/ngram_similarity_reward/mean": 0.6218414306640625, "rewards/ngram_similarity_reward/std": 0.3024308979511261, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 468.671875, "completions/mean_terminated_length": 468.671875, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.48198702170507945, "frac_reward_zero_std": 0.0, "grad_norm": 0.06600500643253326, "learning_rate": 4.567544468297571e-06, "loss": -0.0094, "num_tokens": 171419294.0, "reward": 3.1191608905792236, "reward_std": 0.180575892329216, "rewards/accuracy_reward/mean": 2.453125, "rewards/accuracy_reward/std": 3.077979803085327, "rewards/ngram_similarity_reward/mean": 0.6660360097885132, "rewards/ngram_similarity_reward/std": 0.20528137683868408, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 485.046875, "completions/mean_terminated_length": 485.046875, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.48243454911613337, "frac_reward_zero_std": 0.0, "grad_norm": 0.08958762884140015, "learning_rate": 4.5665626596616875e-06, "loss": 0.0076, "num_tokens": 171635025.0, "reward": 2.6711831092834473, "reward_std": 0.5186856985092163, "rewards/accuracy_reward/mean": 2.390625, "rewards/accuracy_reward/std": 3.0400354862213135, "rewards/ngram_similarity_reward/mean": 0.28055813908576965, "rewards/ngram_similarity_reward/std": 0.17530040442943573, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 505.46875, "completions/mean_terminated_length": 505.46875, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.4828820765271873, "frac_reward_zero_std": 0.0, "grad_norm": 0.10286291688680649, "learning_rate": 4.565579856559371e-06, "loss": -0.0058, "num_tokens": 171867247.0, "reward": 2.890352725982666, "reward_std": 1.205926775932312, "rewards/accuracy_reward/mean": 2.390625, "rewards/accuracy_reward/std": 3.1477742195129395, "rewards/ngram_similarity_reward/mean": 0.4997277557849884, "rewards/ngram_similarity_reward/std": 0.30446815490722656, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 431.453125, "completions/mean_terminated_length": 431.453125, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.4833296039382412, "frac_reward_zero_std": 0.0, "grad_norm": 0.08253061771392822, "learning_rate": 4.564596059528651e-06, "loss": -0.0061, "num_tokens": 172012204.0, "reward": 3.1154181957244873, "reward_std": 0.16891822218894958, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6154181361198425, "rewards/ngram_similarity_reward/std": 0.3033756911754608, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 438.65625, "completions/mean_terminated_length": 438.65625, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.4837771313492951, "frac_reward_zero_std": 0.0, "grad_norm": 0.07555454969406128, "learning_rate": 4.563611269108101e-06, "loss": -0.0071, "num_tokens": 172151270.0, "reward": 6.350188255310059, "reward_std": 0.07179703563451767, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.8501883745193481, "rewards/ngram_similarity_reward/std": 0.4043208360671997, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 434.296875, "completions/mean_terminated_length": 434.296875, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.4842246587603491, "frac_reward_zero_std": 0.0, "grad_norm": 0.0908069908618927, "learning_rate": 4.562625485836839e-06, "loss": 0.0665, "num_tokens": 172310953.0, "reward": 4.73192024230957, "reward_std": 1.1997625827789307, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7319202423095703, "rewards/ngram_similarity_reward/std": 0.36777517199516296, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 536.03125, "completions/mean_terminated_length": 536.03125, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.484672186171403, "frac_reward_zero_std": 0.0, "grad_norm": 0.06916855275630951, "learning_rate": 4.561638710254526e-06, "loss": 0.0375, "num_tokens": 172481963.0, "reward": 3.339456558227539, "reward_std": 0.7031046748161316, "rewards/accuracy_reward/mean": 2.78125, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.5582062602043152, "rewards/ngram_similarity_reward/std": 0.2942054569721222, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 415.1875, "completions/mean_terminated_length": 415.1875, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.4851197135824569, "frac_reward_zero_std": 0.0, "grad_norm": 0.09040789306163788, "learning_rate": 4.560650942901367e-06, "loss": 0.0317, "num_tokens": 172630167.0, "reward": 3.6213231086730957, "reward_std": 1.5545053482055664, "rewards/accuracy_reward/mean": 2.875, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.7463233470916748, "rewards/ngram_similarity_reward/std": 0.2703729271888733, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 454.953125, "completions/mean_terminated_length": 454.953125, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.48556724099351084, "frac_reward_zero_std": 0.25, "grad_norm": 0.07023082673549652, "learning_rate": 4.55966218431811e-06, "loss": 0.0116, "num_tokens": 172749300.0, "reward": 3.944469928741455, "reward_std": 0.8143153190612793, "rewards/accuracy_reward/mean": 3.34375, "rewards/accuracy_reward/std": 2.9016621112823486, "rewards/ngram_similarity_reward/mean": 0.6007199287414551, "rewards/ngram_similarity_reward/std": 0.2684047520160675, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 465.9375, "completions/mean_terminated_length": 465.9375, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.48601476840456476, "frac_reward_zero_std": 0.0, "grad_norm": 0.08287008106708527, "learning_rate": 4.558672435046042e-06, "loss": -0.001, "num_tokens": 172885648.0, "reward": 2.8535852432250977, "reward_std": 0.753909170627594, "rewards/accuracy_reward/mean": 2.21875, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.6348351240158081, "rewards/ngram_similarity_reward/std": 0.33657437562942505, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 443.75, "completions/mean_terminated_length": 443.75, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.48646229581561873, "frac_reward_zero_std": 0.0, "grad_norm": 0.09232994168996811, "learning_rate": 4.557681695626998e-06, "loss": 0.0107, "num_tokens": 173050448.0, "reward": 2.7262744903564453, "reward_std": 1.1046934127807617, "rewards/accuracy_reward/mean": 2.265625, "rewards/accuracy_reward/std": 3.0692648887634277, "rewards/ngram_similarity_reward/mean": 0.46064963936805725, "rewards/ngram_similarity_reward/std": 0.1888895034790039, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 419.734375, "completions/mean_terminated_length": 419.734375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.48690982322667264, "frac_reward_zero_std": 0.0, "grad_norm": 0.09107498824596405, "learning_rate": 4.556689966603353e-06, "loss": 0.0156, "num_tokens": 173227503.0, "reward": 4.152169704437256, "reward_std": 0.8628009557723999, "rewards/accuracy_reward/mean": 3.53125, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.6209197044372559, "rewards/ngram_similarity_reward/std": 0.23462265729904175, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 429.109375, "completions/mean_terminated_length": 429.109375, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.48735735063772656, "frac_reward_zero_std": 0.0, "grad_norm": 0.08090570569038391, "learning_rate": 4.55569724851802e-06, "loss": 0.0017, "num_tokens": 173365062.0, "reward": 3.8782854080200195, "reward_std": 1.4964256286621094, "rewards/accuracy_reward/mean": 3.15625, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.7220356464385986, "rewards/ngram_similarity_reward/std": 0.2286946028470993, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 414.78125, "completions/mean_terminated_length": 414.78125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.4878048780487805, "frac_reward_zero_std": 0.0, "grad_norm": 0.09512459486722946, "learning_rate": 4.55470354191446e-06, "loss": -0.0028, "num_tokens": 173487208.0, "reward": 4.085986614227295, "reward_std": 1.3344218730926514, "rewards/accuracy_reward/mean": 3.34375, "rewards/accuracy_reward/std": 2.9016621112823486, "rewards/ngram_similarity_reward/mean": 0.7422366142272949, "rewards/ngram_similarity_reward/std": 0.3242091238498688, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 450.234375, "completions/mean_terminated_length": 450.234375, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.4882524054598344, "frac_reward_zero_std": 0.25, "grad_norm": 0.08115733414888382, "learning_rate": 4.55370884733667e-06, "loss": 0.0069, "num_tokens": 173732951.0, "reward": 3.8544161319732666, "reward_std": 1.337068796157837, "rewards/accuracy_reward/mean": 3.15625, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.6981660723686218, "rewards/ngram_similarity_reward/std": 0.30572065711021423, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 389.890625, "completions/mean_terminated_length": 389.890625, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.48869993287088836, "frac_reward_zero_std": 0.0, "grad_norm": 0.09762948006391525, "learning_rate": 4.55271316532919e-06, "loss": 0.0124, "num_tokens": 173914096.0, "reward": 2.8449859619140625, "reward_std": 2.1369941234588623, "rewards/accuracy_reward/mean": 2.125, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.7199859619140625, "rewards/ngram_similarity_reward/std": 0.38775435090065, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 416.4375, "completions/mean_terminated_length": 416.4375, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.4891474602819423, "frac_reward_zero_std": 0.0, "grad_norm": 0.08745718747377396, "learning_rate": 4.5517164964371e-06, "loss": -0.0236, "num_tokens": 174162316.0, "reward": 3.639711380004883, "reward_std": 1.6092393398284912, "rewards/accuracy_reward/mean": 2.96875, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.6709613800048828, "rewards/ngram_similarity_reward/std": 0.2689666748046875, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 409.0, "completions/mean_terminated_length": 409.0, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.4895949876929962, "frac_reward_zero_std": 0.0, "grad_norm": 0.09560834616422653, "learning_rate": 4.55071884120602e-06, "loss": -0.0177, "num_tokens": 174374860.0, "reward": 3.9378693103790283, "reward_std": 1.4928762912750244, "rewards/accuracy_reward/mean": 3.3125, "rewards/accuracy_reward/std": 2.948634386062622, "rewards/ngram_similarity_reward/mean": 0.6253694295883179, "rewards/ngram_similarity_reward/std": 0.2162037044763565, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 386.078125, "completions/mean_terminated_length": 386.078125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.4900425151040501, "frac_reward_zero_std": 0.0, "grad_norm": 0.09615163505077362, "learning_rate": 4.549720200182112e-06, "loss": -0.0073, "num_tokens": 174522305.0, "reward": 3.2679896354675293, "reward_std": 0.21264588832855225, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.7679896354675293, "rewards/ngram_similarity_reward/std": 0.29604002833366394, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 533.375, "completions/mean_terminated_length": 533.375, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.49049004251510403, "frac_reward_zero_std": 0.0, "grad_norm": 0.06156305968761444, "learning_rate": 4.548720573912074e-06, "loss": 0.0026, "num_tokens": 174673961.0, "reward": 3.1978039741516113, "reward_std": 0.1528598964214325, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6978040337562561, "rewards/ngram_similarity_reward/std": 0.2940730154514313, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 505.359375, "completions/mean_terminated_length": 505.359375, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.490937569926158, "frac_reward_zero_std": 0.0, "grad_norm": 0.08782241493463516, "learning_rate": 4.547719962943148e-06, "loss": -0.0012, "num_tokens": 174853888.0, "reward": 2.947543144226074, "reward_std": 0.18983094394207, "rewards/accuracy_reward/mean": 2.390625, "rewards/accuracy_reward/std": 3.1477742195129395, "rewards/ngram_similarity_reward/mean": 0.5569181442260742, "rewards/ngram_similarity_reward/std": 0.24300679564476013, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 415.203125, "completions/mean_terminated_length": 415.203125, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.4913850973372119, "frac_reward_zero_std": 0.0, "grad_norm": 0.09604466706514359, "learning_rate": 4.5467183678231105e-06, "loss": 0.0163, "num_tokens": 174995565.0, "reward": 6.056362152099609, "reward_std": 0.5539984703063965, "rewards/accuracy_reward/mean": 5.40625, "rewards/accuracy_reward/std": 0.7500000596046448, "rewards/ngram_similarity_reward/mean": 0.6501119136810303, "rewards/ngram_similarity_reward/std": 0.2691671550273895, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 434.75, "completions/mean_terminated_length": 434.75, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.49183262474826583, "frac_reward_zero_std": 0.0, "grad_norm": 0.0859757587313652, "learning_rate": 4.545715789100279e-06, "loss": -0.0052, "num_tokens": 175108797.0, "reward": 2.9753494262695312, "reward_std": 0.16992181539535522, "rewards/accuracy_reward/mean": 2.484375, "rewards/accuracy_reward/std": 3.0419929027557373, "rewards/ngram_similarity_reward/mean": 0.49097421765327454, "rewards/ngram_similarity_reward/std": 0.13519920408725739, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 559.046875, "completions/mean_terminated_length": 535.4127197265625, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.49228015215931975, "frac_reward_zero_std": 0.0, "grad_norm": 0.07323068380355835, "learning_rate": 4.544712227323511e-06, "loss": 0.004, "num_tokens": 175291264.0, "reward": 4.822459697723389, "reward_std": 0.23598459362983704, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.8224596977233887, "rewards/ngram_similarity_reward/std": 0.28161683678627014, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 548.15625, "completions/mean_terminated_length": 499.774169921875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.49272767957037367, "frac_reward_zero_std": 0.0, "grad_norm": 0.061277225613594055, "learning_rate": 4.543707683042199e-06, "loss": 0.0882, "num_tokens": 175450298.0, "reward": 4.542122840881348, "reward_std": 0.42735737562179565, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.6358727216720581, "rewards/ngram_similarity_reward/std": 0.20485639572143555, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 430.890625, "completions/mean_terminated_length": 430.890625, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.49317520698142764, "frac_reward_zero_std": 0.0, "grad_norm": 0.09584963321685791, "learning_rate": 4.542702156806273e-06, "loss": -0.0052, "num_tokens": 175616515.0, "reward": 1.153327226638794, "reward_std": 1.185215950012207, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 2.381934404373169, "rewards/ngram_similarity_reward/mean": 0.5595772862434387, "rewards/ngram_similarity_reward/std": 0.367759108543396, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 420.671875, "completions/mean_terminated_length": 420.671875, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.49362273439248155, "frac_reward_zero_std": 0.0, "grad_norm": 0.08076344430446625, "learning_rate": 4.5416956491662055e-06, "loss": 0.0055, "num_tokens": 175744814.0, "reward": 3.275195360183716, "reward_std": 0.12107278406620026, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.7751953601837158, "rewards/ngram_similarity_reward/std": 0.236673966050148, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 438.390625, "completions/mean_terminated_length": 438.390625, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.49407026180353547, "frac_reward_zero_std": 0.0, "grad_norm": 0.0785026103258133, "learning_rate": 4.540688160673002e-06, "loss": 0.005, "num_tokens": 175897671.0, "reward": 3.3170793056488037, "reward_std": 0.9156270623207092, "rewards/accuracy_reward/mean": 2.78125, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.5358291864395142, "rewards/ngram_similarity_reward/std": 0.2915536165237427, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 573.75, "completions/mean_terminated_length": 550.3492431640625, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.4945177892145894, "frac_reward_zero_std": 0.0, "grad_norm": 0.0661405697464943, "learning_rate": 4.5396796918782055e-06, "loss": 0.0107, "num_tokens": 176075527.0, "reward": 4.647940635681152, "reward_std": 0.15474610030651093, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6479406356811523, "rewards/ngram_similarity_reward/std": 0.2756750285625458, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1422.0, "completions/max_terminated_length": 1422.0, "completions/mean_length": 551.375, "completions/mean_terminated_length": 551.375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.4949653166256433, "frac_reward_zero_std": 0.25, "grad_norm": 0.06709405779838562, "learning_rate": 4.538670243333897e-06, "loss": 0.0234, "num_tokens": 176289183.0, "reward": 4.5249834060668945, "reward_std": 0.40416163206100464, "rewards/accuracy_reward/mean": 4.09375, "rewards/accuracy_reward/std": 2.5617377758026123, "rewards/ngram_similarity_reward/mean": 0.4312337338924408, "rewards/ngram_similarity_reward/std": 0.2670634090900421, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 423.359375, "completions/mean_terminated_length": 423.359375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.4954128440366973, "frac_reward_zero_std": 0.25, "grad_norm": 0.0584019236266613, "learning_rate": 4.537659815592693e-06, "loss": -0.0232, "num_tokens": 176453782.0, "reward": 4.692909240722656, "reward_std": 0.0988813266158104, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6929092407226562, "rewards/ngram_similarity_reward/std": 0.2770628333091736, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 410.421875, "completions/mean_terminated_length": 410.421875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.4958603714477512, "frac_reward_zero_std": 0.0, "grad_norm": 0.07954328507184982, "learning_rate": 4.536648409207746e-06, "loss": 0.025, "num_tokens": 176600049.0, "reward": 6.233366012573242, "reward_std": 0.09150787442922592, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.7333660125732422, "rewards/ngram_similarity_reward/std": 0.11544132232666016, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 488.65625, "completions/mean_terminated_length": 488.65625, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.4963078988588051, "frac_reward_zero_std": 0.0, "grad_norm": 0.08080743253231049, "learning_rate": 4.535636024732745e-06, "loss": 0.0072, "num_tokens": 176777147.0, "reward": 3.5629160404205322, "reward_std": 1.4889570474624634, "rewards/accuracy_reward/mean": 3.046875, "rewards/accuracy_reward/std": 2.991680145263672, "rewards/ngram_similarity_reward/mean": 0.5160409212112427, "rewards/ngram_similarity_reward/std": 0.2812080681324005, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1171.0, "completions/max_terminated_length": 1171.0, "completions/mean_length": 560.234375, "completions/mean_terminated_length": 560.234375, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.496755426269859, "frac_reward_zero_std": 0.0, "grad_norm": 0.06918640434741974, "learning_rate": 4.534622662721912e-06, "loss": -0.0248, "num_tokens": 176935706.0, "reward": 1.5414713621139526, "reward_std": 0.12708443403244019, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5414713025093079, "rewards/ngram_similarity_reward/std": 0.30945712327957153, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 360.796875, "completions/mean_terminated_length": 360.796875, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.49720295368091294, "frac_reward_zero_std": 0.0, "grad_norm": 0.0921863541007042, "learning_rate": 4.533608323730008e-06, "loss": 0.0021, "num_tokens": 177075405.0, "reward": 4.40880823135376, "reward_std": 0.9826715588569641, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.6900581121444702, "rewards/ngram_similarity_reward/std": 0.2976328730583191, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 407.046875, "completions/mean_terminated_length": 407.046875, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.49765048109196686, "frac_reward_zero_std": 0.25, "grad_norm": 0.07950065284967422, "learning_rate": 4.532593008312326e-06, "loss": 0.0296, "num_tokens": 177204672.0, "reward": 5.49090051651001, "reward_std": 1.4403270483016968, "rewards/accuracy_reward/mean": 4.75, "rewards/accuracy_reward/std": 2.0, "rewards/ngram_similarity_reward/mean": 0.740900456905365, "rewards/ngram_similarity_reward/std": 0.2728201150894165, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 347.59375, "completions/mean_terminated_length": 347.59375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.49809800850302083, "frac_reward_zero_std": 0.0, "grad_norm": 0.08854083716869354, "learning_rate": 4.5315767170246945e-06, "loss": 0.0007, "num_tokens": 177334950.0, "reward": 4.268105506896973, "reward_std": 1.4788068532943726, "rewards/accuracy_reward/mean": 3.4375, "rewards/accuracy_reward/std": 2.872281312942505, "rewards/ngram_similarity_reward/mean": 0.8306055068969727, "rewards/ngram_similarity_reward/std": 0.2775154411792755, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 423.921875, "completions/mean_terminated_length": 423.921875, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.49854553591407474, "frac_reward_zero_std": 0.0, "grad_norm": 0.10035360604524612, "learning_rate": 4.530559450423477e-06, "loss": 0.0388, "num_tokens": 177478081.0, "reward": 1.4108541011810303, "reward_std": 0.08538447320461273, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.4108540415763855, "rewards/ngram_similarity_reward/std": 0.25501975417137146, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 539.90625, "completions/mean_terminated_length": 539.90625, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.49899306332512866, "frac_reward_zero_std": 0.0, "grad_norm": 0.05983530730009079, "learning_rate": 4.529541209065568e-06, "loss": 0.0214, "num_tokens": 177640107.0, "reward": 4.336306571960449, "reward_std": 0.9542237520217896, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.6175564527511597, "rewards/ngram_similarity_reward/std": 0.19533227384090424, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 420.296875, "completions/mean_terminated_length": 420.296875, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.4994405907361826, "frac_reward_zero_std": 0.0, "grad_norm": 0.07837483286857605, "learning_rate": 4.5285219935084e-06, "loss": 0.0108, "num_tokens": 177827518.0, "reward": 5.884796142578125, "reward_std": 1.0983819961547852, "rewards/accuracy_reward/mean": 5.125, "rewards/accuracy_reward/std": 1.4638501405715942, "rewards/ngram_similarity_reward/mean": 0.7597965002059937, "rewards/ngram_similarity_reward/std": 0.3037159740924835, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 502.203125, "completions/mean_terminated_length": 502.203125, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.4998881181472365, "frac_reward_zero_std": 0.0, "grad_norm": 0.09346076101064682, "learning_rate": 4.527501804309935e-06, "loss": 0.0285, "num_tokens": 178005147.0, "reward": 5.347973823547363, "reward_std": 1.6126937866210938, "rewards/accuracy_reward/mean": 4.53125, "rewards/accuracy_reward/std": 2.27455735206604, "rewards/ngram_similarity_reward/mean": 0.8167234659194946, "rewards/ngram_similarity_reward/std": 0.27641963958740234, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 812.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 444.9375, "completions/mean_terminated_length": 444.9375, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.5003356455582905, "frac_reward_zero_std": 0.0, "grad_norm": 0.073221355676651, "learning_rate": 4.52648064202867e-06, "loss": 0.0217, "num_tokens": 178157671.0, "reward": 4.819570541381836, "reward_std": 0.08227217942476273, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.8195701837539673, "rewards/ngram_similarity_reward/std": 0.18116185069084167, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 399.359375, "completions/mean_terminated_length": 399.359375, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.5007831729693444, "frac_reward_zero_std": 0.0, "grad_norm": 0.09368530660867691, "learning_rate": 4.525458507223633e-06, "loss": 0.0076, "num_tokens": 178299662.0, "reward": 2.5764033794403076, "reward_std": 1.840775728225708, "rewards/accuracy_reward/mean": 1.9375, "rewards/accuracy_reward/std": 2.9700891971588135, "rewards/ngram_similarity_reward/mean": 0.6389033198356628, "rewards/ngram_similarity_reward/std": 0.21850425004959106, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 400.46875, "completions/mean_terminated_length": 400.46875, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.5012307003803983, "frac_reward_zero_std": 0.0, "grad_norm": 0.0986308604478836, "learning_rate": 4.524435400454388e-06, "loss": -0.0369, "num_tokens": 178455596.0, "reward": 6.323049545288086, "reward_std": 0.1640501320362091, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.8230493068695068, "rewards/ngram_similarity_reward/std": 0.21891455352306366, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 528.78125, "completions/mean_terminated_length": 528.78125, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.5016782277914522, "frac_reward_zero_std": 0.0, "grad_norm": 0.05695182830095291, "learning_rate": 4.523411322281027e-06, "loss": -0.0004, "num_tokens": 178566734.0, "reward": 6.216606616973877, "reward_std": 0.48135465383529663, "rewards/accuracy_reward/mean": 5.40625, "rewards/accuracy_reward/std": 0.7500000596046448, "rewards/ngram_similarity_reward/mean": 0.8103565573692322, "rewards/ngram_similarity_reward/std": 0.1939639449119568, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 797.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 468.90625, "completions/mean_terminated_length": 468.90625, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.5021257552025061, "frac_reward_zero_std": 0.0, "grad_norm": 0.085639089345932, "learning_rate": 4.5223862732641775e-06, "loss": -0.0116, "num_tokens": 178714232.0, "reward": 3.6673331260681152, "reward_std": 1.353724718093872, "rewards/accuracy_reward/mean": 3.0625, "rewards/accuracy_reward/std": 2.9700891971588135, "rewards/ngram_similarity_reward/mean": 0.6048333048820496, "rewards/ngram_similarity_reward/std": 0.2771512567996979, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 375.109375, "completions/mean_terminated_length": 375.109375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.50257328261356, "frac_reward_zero_std": 0.0, "grad_norm": 0.0942516103386879, "learning_rate": 4.5213602539649935e-06, "loss": -0.013, "num_tokens": 178839855.0, "reward": 4.5549468994140625, "reward_std": 0.5895836353302002, "rewards/accuracy_reward/mean": 3.890625, "rewards/accuracy_reward/std": 2.6998953819274902, "rewards/ngram_similarity_reward/mean": 0.664322018623352, "rewards/ngram_similarity_reward/std": 0.35421836376190186, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 754.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 587.4375, "completions/mean_terminated_length": 587.4375, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.503020810024614, "frac_reward_zero_std": 0.0, "grad_norm": 0.06759040802717209, "learning_rate": 4.520333264945164e-06, "loss": 0.0033, "num_tokens": 178983515.0, "reward": 4.4897613525390625, "reward_std": 0.7100332975387573, "rewards/accuracy_reward/mean": 3.78125, "rewards/accuracy_reward/std": 2.7744226455688477, "rewards/ngram_similarity_reward/mean": 0.7085116505622864, "rewards/ngram_similarity_reward/std": 0.2417541742324829, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 365.578125, "completions/mean_terminated_length": 365.578125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.503468337435668, "frac_reward_zero_std": 0.0, "grad_norm": 0.12146230787038803, "learning_rate": 4.519305306766911e-06, "loss": -0.0168, "num_tokens": 179108272.0, "reward": 4.922126770019531, "reward_std": 1.7761967182159424, "rewards/accuracy_reward/mean": 4.28125, "rewards/accuracy_reward/std": 2.4330317974090576, "rewards/ngram_similarity_reward/mean": 0.6408767104148865, "rewards/ngram_similarity_reward/std": 0.30786970257759094, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 783.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 467.796875, "completions/mean_terminated_length": 467.796875, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.5039158648467219, "frac_reward_zero_std": 0.0, "grad_norm": 0.08122233301401138, "learning_rate": 4.51827637999298e-06, "loss": -0.0069, "num_tokens": 179257155.0, "reward": 6.2950897216796875, "reward_std": 0.21259811520576477, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.7950894236564636, "rewards/ngram_similarity_reward/std": 0.2753046751022339, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 371.546875, "completions/mean_terminated_length": 371.546875, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.5043633922577758, "frac_reward_zero_std": 0.25, "grad_norm": 0.0807737484574318, "learning_rate": 4.517246485186653e-06, "loss": 0.0073, "num_tokens": 179423318.0, "reward": 4.422578811645508, "reward_std": 0.6542232036590576, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.7038288116455078, "rewards/ngram_similarity_reward/std": 0.3005114197731018, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 433.390625, "completions/mean_terminated_length": 433.390625, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.5048109196688297, "frac_reward_zero_std": 0.0, "grad_norm": 0.07679495215415955, "learning_rate": 4.51621562291174e-06, "loss": -0.0012, "num_tokens": 179544383.0, "reward": 4.917806148529053, "reward_std": 0.4657190442085266, "rewards/accuracy_reward/mean": 4.09375, "rewards/accuracy_reward/std": 2.5617377758026123, "rewards/ngram_similarity_reward/mean": 0.8240560293197632, "rewards/ngram_similarity_reward/std": 0.24326196312904358, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 388.765625, "completions/mean_terminated_length": 388.765625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.5052584470798837, "frac_reward_zero_std": 0.25, "grad_norm": 0.10028428584337234, "learning_rate": 4.515183793732579e-06, "loss": -0.0079, "num_tokens": 179676352.0, "reward": 3.615325450897217, "reward_std": 1.9288501739501953, "rewards/accuracy_reward/mean": 2.875, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.7403252124786377, "rewards/ngram_similarity_reward/std": 0.18498782813549042, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 455.203125, "completions/mean_terminated_length": 455.203125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.5057059744909376, "frac_reward_zero_std": 0.0, "grad_norm": 0.08782827854156494, "learning_rate": 4.5141509982140395e-06, "loss": -0.0153, "num_tokens": 179826701.0, "reward": 5.653952598571777, "reward_std": 1.243302822113037, "rewards/accuracy_reward/mean": 4.9375, "rewards/accuracy_reward/std": 1.7627090215682983, "rewards/ngram_similarity_reward/mean": 0.7164527773857117, "rewards/ngram_similarity_reward/std": 0.3032922148704529, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 421.46875, "completions/mean_terminated_length": 421.46875, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.5061535019019915, "frac_reward_zero_std": 0.0, "grad_norm": 0.08586135506629944, "learning_rate": 4.5131172369215205e-06, "loss": -0.0009, "num_tokens": 179978059.0, "reward": 4.517760276794434, "reward_std": 0.7564865350723267, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.7990100979804993, "rewards/ngram_similarity_reward/std": 0.22309666872024536, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 472.421875, "completions/mean_terminated_length": 472.421875, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.5066010293130454, "frac_reward_zero_std": 0.0, "grad_norm": 0.07734288275241852, "learning_rate": 4.512082510420946e-06, "loss": 0.0058, "num_tokens": 180163606.0, "reward": 4.513777732849121, "reward_std": 0.5919719934463501, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.7012777328491211, "rewards/ngram_similarity_reward/std": 0.39876317977905273, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1193.0, "completions/max_terminated_length": 1193.0, "completions/mean_length": 615.953125, "completions/mean_terminated_length": 615.953125, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.5070485567240993, "frac_reward_zero_std": 0.0, "grad_norm": 0.07883348315954208, "learning_rate": 4.511046819278773e-06, "loss": 0.0073, "num_tokens": 180363987.0, "reward": 2.52891206741333, "reward_std": 0.8345606923103333, "rewards/accuracy_reward/mean": 2.03125, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.4976623058319092, "rewards/ngram_similarity_reward/std": 0.2290460467338562, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 522.390625, "completions/mean_terminated_length": 522.390625, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.5074960841351532, "frac_reward_zero_std": 0.0, "grad_norm": 0.06173047423362732, "learning_rate": 4.510010164061984e-06, "loss": 0.0179, "num_tokens": 180516812.0, "reward": 6.1605224609375, "reward_std": 0.0839911550283432, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.6605223417282104, "rewards/ngram_similarity_reward/std": 0.3330098092556, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 757.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 480.90625, "completions/mean_terminated_length": 480.90625, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.5079436115462072, "frac_reward_zero_std": 0.0, "grad_norm": 0.09174945950508118, "learning_rate": 4.508972545338089e-06, "loss": 0.0102, "num_tokens": 180700966.0, "reward": 4.181850433349609, "reward_std": 1.16425359249115, "rewards/accuracy_reward/mean": 3.53125, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.6506001949310303, "rewards/ngram_similarity_reward/std": 0.2055261731147766, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1064.0, "completions/max_terminated_length": 1064.0, "completions/mean_length": 566.65625, "completions/mean_terminated_length": 566.65625, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.5083911389572612, "frac_reward_zero_std": 0.0, "grad_norm": 0.05755320191383362, "learning_rate": 4.507933963675128e-06, "loss": 0.0011, "num_tokens": 180878528.0, "reward": 6.443551063537598, "reward_std": 0.08974266052246094, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.9435515403747559, "rewards/ngram_similarity_reward/std": 0.1593705117702484, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 393.953125, "completions/mean_terminated_length": 393.953125, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.5088386663683151, "frac_reward_zero_std": 0.25, "grad_norm": 0.06603922694921494, "learning_rate": 4.506894419641663e-06, "loss": 0.0243, "num_tokens": 181015997.0, "reward": 2.74050235748291, "reward_std": 1.3518166542053223, "rewards/accuracy_reward/mean": 2.125, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.6155022978782654, "rewards/ngram_similarity_reward/std": 0.3120533525943756, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 538.90625, "completions/mean_terminated_length": 538.90625, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.509286193779369, "frac_reward_zero_std": 0.0, "grad_norm": 0.06870218366384506, "learning_rate": 4.505853913806789e-06, "loss": 0.0007, "num_tokens": 181179383.0, "reward": 4.546904563903809, "reward_std": 0.1394781917333603, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5469047427177429, "rewards/ngram_similarity_reward/std": 0.24873505532741547, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 421.125, "completions/mean_terminated_length": 421.125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.5097337211904229, "frac_reward_zero_std": 0.0, "grad_norm": 0.09319427609443665, "learning_rate": 4.504812446740124e-06, "loss": 0.052, "num_tokens": 181322175.0, "reward": 3.070396900177002, "reward_std": 0.22405299544334412, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.570397138595581, "rewards/ngram_similarity_reward/std": 0.3630915582180023, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 500.453125, "completions/mean_terminated_length": 500.453125, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.5101812486014768, "frac_reward_zero_std": 0.0, "grad_norm": 0.08894863724708557, "learning_rate": 4.5037700190118125e-06, "loss": 0.0099, "num_tokens": 181466556.0, "reward": 3.2207350730895996, "reward_std": 0.19931933283805847, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.7207349538803101, "rewards/ngram_similarity_reward/std": 0.22353129088878632, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 452.328125, "completions/mean_terminated_length": 452.328125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.5106287760125308, "frac_reward_zero_std": 0.0, "grad_norm": 0.0830058678984642, "learning_rate": 4.502726631192526e-06, "loss": -0.0097, "num_tokens": 181617409.0, "reward": 3.030059814453125, "reward_std": 0.12081623822450638, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.5300598740577698, "rewards/ngram_similarity_reward/std": 0.19034965336322784, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 845.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 506.5625, "completions/mean_terminated_length": 506.5625, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.5110763034235847, "frac_reward_zero_std": 0.0, "grad_norm": 0.07904507964849472, "learning_rate": 4.501682283853461e-06, "loss": -0.0101, "num_tokens": 181778421.0, "reward": 4.437538146972656, "reward_std": 0.523172914981842, "rewards/accuracy_reward/mean": 3.890625, "rewards/accuracy_reward/std": 2.6998953819274902, "rewards/ngram_similarity_reward/mean": 0.5469129085540771, "rewards/ngram_similarity_reward/std": 0.2477530837059021, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 490.296875, "completions/mean_terminated_length": 490.296875, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.5115238308346386, "frac_reward_zero_std": 0.0, "grad_norm": 0.0692552924156189, "learning_rate": 4.500636977566339e-06, "loss": 0.0159, "num_tokens": 181923240.0, "reward": 4.113036632537842, "reward_std": 1.2630510330200195, "rewards/accuracy_reward/mean": 3.25, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.8630364537239075, "rewards/ngram_similarity_reward/std": 0.20219597220420837, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 415.453125, "completions/mean_terminated_length": 415.453125, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.5119713582456925, "frac_reward_zero_std": 0.25, "grad_norm": 0.08201241493225098, "learning_rate": 4.499590712903406e-06, "loss": -0.0128, "num_tokens": 182052917.0, "reward": 4.288032531738281, "reward_std": 0.6296678781509399, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.5692823529243469, "rewards/ngram_similarity_reward/std": 0.21475210785865784, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 482.15625, "completions/mean_terminated_length": 482.15625, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.5124188856567464, "frac_reward_zero_std": 0.0, "grad_norm": 0.08425358682870865, "learning_rate": 4.498543490437435e-06, "loss": -0.0208, "num_tokens": 182215071.0, "reward": 6.372078895568848, "reward_std": 0.20720459520816803, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.8720792531967163, "rewards/ngram_similarity_reward/std": 0.24663090705871582, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 337.328125, "completions/mean_terminated_length": 337.328125, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.5128664130678005, "frac_reward_zero_std": 0.0, "grad_norm": 0.12950652837753296, "learning_rate": 4.49749531074172e-06, "loss": 0.0082, "num_tokens": 182326564.0, "reward": 5.764260292053223, "reward_std": 1.266313076019287, "rewards/accuracy_reward/mean": 5.03125, "rewards/accuracy_reward/std": 1.6229382753372192, "rewards/ngram_similarity_reward/mean": 0.7330105304718018, "rewards/ngram_similarity_reward/std": 0.3188089430332184, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 418.328125, "completions/mean_terminated_length": 418.328125, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.5133139404788544, "frac_reward_zero_std": 0.0, "grad_norm": 0.09356052428483963, "learning_rate": 4.496446174390082e-06, "loss": -0.005, "num_tokens": 182469689.0, "reward": 3.608661413192749, "reward_std": 1.2372500896453857, "rewards/accuracy_reward/mean": 3.0625, "rewards/accuracy_reward/std": 2.9700891971588135, "rewards/ngram_similarity_reward/mean": 0.5461612939834595, "rewards/ngram_similarity_reward/std": 0.14150479435920715, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 465.40625, "completions/mean_terminated_length": 465.40625, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.5137614678899083, "frac_reward_zero_std": 0.0, "grad_norm": 0.089523546397686, "learning_rate": 4.495396081956864e-06, "loss": -0.0015, "num_tokens": 182633267.0, "reward": 2.098345994949341, "reward_std": 1.1912450790405273, "rewards/accuracy_reward/mean": 1.296875, "rewards/accuracy_reward/std": 2.9823806285858154, "rewards/ngram_similarity_reward/mean": 0.8014709949493408, "rewards/ngram_similarity_reward/std": 0.26014891266822815, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 337.75, "completions/mean_terminated_length": 337.75, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.5142089953009622, "frac_reward_zero_std": 0.0, "grad_norm": 0.11801254749298096, "learning_rate": 4.494345034016932e-06, "loss": -0.0032, "num_tokens": 182752707.0, "reward": 3.136491060256958, "reward_std": 0.16078060865402222, "rewards/accuracy_reward/mean": 2.484375, "rewards/accuracy_reward/std": 3.0419929027557373, "rewards/ngram_similarity_reward/mean": 0.6521161794662476, "rewards/ngram_similarity_reward/std": 0.3107413053512573, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 438.640625, "completions/mean_terminated_length": 438.640625, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.5146565227120161, "frac_reward_zero_std": 0.25, "grad_norm": 0.07768989354372025, "learning_rate": 4.4932930311456774e-06, "loss": 0.0026, "num_tokens": 182898876.0, "reward": 3.6199331283569336, "reward_std": 0.8400613069534302, "rewards/accuracy_reward/mean": 2.96875, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.6511832475662231, "rewards/ngram_similarity_reward/std": 0.43442097306251526, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 405.0, "completions/mean_terminated_length": 405.0, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.51510405012307, "frac_reward_zero_std": 0.0, "grad_norm": 0.08477354794740677, "learning_rate": 4.492240073919013e-06, "loss": 0.0245, "num_tokens": 183055804.0, "reward": 6.137720584869385, "reward_std": 0.46785545349121094, "rewards/accuracy_reward/mean": 5.40625, "rewards/accuracy_reward/std": 0.7500000596046448, "rewards/ngram_similarity_reward/mean": 0.7314706444740295, "rewards/ngram_similarity_reward/std": 0.3978707492351532, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 351.796875, "completions/mean_terminated_length": 351.796875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.515551577534124, "frac_reward_zero_std": 0.0, "grad_norm": 0.11964279413223267, "learning_rate": 4.4911861629133724e-06, "loss": -0.0126, "num_tokens": 183288719.0, "reward": 3.1458687782287598, "reward_std": 0.5388427972793579, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.7396188974380493, "rewards/ngram_similarity_reward/std": 0.3106095492839813, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 354.421875, "completions/mean_terminated_length": 354.421875, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.5159991049451779, "frac_reward_zero_std": 0.0, "grad_norm": 0.09884260594844818, "learning_rate": 4.490131298705714e-06, "loss": 0.0228, "num_tokens": 183470058.0, "reward": 4.703275203704834, "reward_std": 0.062054891139268875, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7032753229141235, "rewards/ngram_similarity_reward/std": 0.3125001788139343, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 385.53125, "completions/mean_terminated_length": 385.53125, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.5164466323562318, "frac_reward_zero_std": 0.25, "grad_norm": 0.09258336573839188, "learning_rate": 4.489075481873517e-06, "loss": -0.0091, "num_tokens": 183725836.0, "reward": 4.709859848022461, "reward_std": 1.5861510038375854, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.70986008644104, "rewards/ngram_similarity_reward/std": 0.3363460302352905, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 486.6875, "completions/mean_terminated_length": 486.6875, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.5168941597672857, "frac_reward_zero_std": 0.25, "grad_norm": 0.0801980122923851, "learning_rate": 4.488018712994782e-06, "loss": -0.0039, "num_tokens": 183890904.0, "reward": 3.1528306007385254, "reward_std": 0.1401413083076477, "rewards/accuracy_reward/mean": 2.484375, "rewards/accuracy_reward/std": 3.0419929027557373, "rewards/ngram_similarity_reward/mean": 0.6684555411338806, "rewards/ngram_similarity_reward/std": 0.372976154088974, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 512.796875, "completions/mean_terminated_length": 512.796875, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.5173416871783397, "frac_reward_zero_std": 0.25, "grad_norm": 0.06421882659196854, "learning_rate": 4.48696099264803e-06, "loss": 0.0141, "num_tokens": 184026987.0, "reward": 4.849231719970703, "reward_std": 0.05383665859699249, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.8492318391799927, "rewards/ngram_similarity_reward/std": 0.4074471592903137, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 507.953125, "completions/mean_terminated_length": 507.953125, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.5177892145893936, "frac_reward_zero_std": 0.0, "grad_norm": 0.06870316714048386, "learning_rate": 4.485902321412304e-06, "loss": -0.006, "num_tokens": 184177672.0, "reward": 5.312700271606445, "reward_std": 0.9201359152793884, "rewards/accuracy_reward/mean": 4.75, "rewards/accuracy_reward/std": 2.0, "rewards/ngram_similarity_reward/mean": 0.5627005100250244, "rewards/ngram_similarity_reward/std": 0.2898489534854889, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 467.71875, "completions/mean_terminated_length": 467.71875, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.5182367420004476, "frac_reward_zero_std": 0.25, "grad_norm": 0.06772363185882568, "learning_rate": 4.484842699867168e-06, "loss": -0.0277, "num_tokens": 184329974.0, "reward": 4.685681343078613, "reward_std": 0.20069976150989532, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6856814622879028, "rewards/ngram_similarity_reward/std": 0.26604440808296204, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 421.234375, "completions/mean_terminated_length": 421.234375, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.5186842694115015, "frac_reward_zero_std": 0.0, "grad_norm": 0.08867233991622925, "learning_rate": 4.483782128592706e-06, "loss": -0.0217, "num_tokens": 184509573.0, "reward": 6.380373001098633, "reward_std": 0.4640117585659027, "rewards/accuracy_reward/mean": 5.40625, "rewards/accuracy_reward/std": 0.7500000596046448, "rewards/ngram_similarity_reward/mean": 0.9741232395172119, "rewards/ngram_similarity_reward/std": 0.22101424634456635, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 437.1875, "completions/mean_terminated_length": 437.1875, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.5191317968225554, "frac_reward_zero_std": 0.0, "grad_norm": 0.08081317692995071, "learning_rate": 4.48272060816952e-06, "loss": 0.0263, "num_tokens": 184663793.0, "reward": 3.2799012660980225, "reward_std": 0.060639895498752594, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.7799011468887329, "rewards/ngram_similarity_reward/std": 0.3067109286785126, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 444.9375, "completions/mean_terminated_length": 444.9375, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.5195793242336093, "frac_reward_zero_std": 0.25, "grad_norm": 0.06213630363345146, "learning_rate": 4.481658139178734e-06, "loss": 0.0039, "num_tokens": 184800813.0, "reward": 6.081175804138184, "reward_std": 0.09067673981189728, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.5811758637428284, "rewards/ngram_similarity_reward/std": 0.3027797341346741, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 864.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 562.46875, "completions/mean_terminated_length": 562.46875, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.5200268516446632, "frac_reward_zero_std": 0.0, "grad_norm": 0.058607082813978195, "learning_rate": 4.4805947222019895e-06, "loss": -0.0046, "num_tokens": 184989755.0, "reward": 5.9834794998168945, "reward_std": 0.7814930081367493, "rewards/accuracy_reward/mean": 5.125, "rewards/accuracy_reward/std": 1.4638501405715942, "rewards/ngram_similarity_reward/mean": 0.8584795594215393, "rewards/ngram_similarity_reward/std": 0.22565433382987976, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 440.78125, "completions/mean_terminated_length": 440.78125, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.5204743790557171, "frac_reward_zero_std": 0.0, "grad_norm": 0.07914085686206818, "learning_rate": 4.479530357821448e-06, "loss": 0.0066, "num_tokens": 185155181.0, "reward": 5.609979629516602, "reward_std": 1.271763801574707, "rewards/accuracy_reward/mean": 4.828125, "rewards/accuracy_reward/std": 1.9359153509140015, "rewards/ngram_similarity_reward/mean": 0.7818544507026672, "rewards/ngram_similarity_reward/std": 0.1954008787870407, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 805.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 522.46875, "completions/mean_terminated_length": 522.46875, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.5209219064667711, "frac_reward_zero_std": 0.25, "grad_norm": 0.06199885904788971, "learning_rate": 4.47846504661979e-06, "loss": 0.0148, "num_tokens": 185303163.0, "reward": 4.539876937866211, "reward_std": 0.41415005922317505, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.6336269378662109, "rewards/ngram_similarity_reward/std": 0.2983126640319824, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 482.046875, "completions/mean_terminated_length": 482.046875, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.521369433877825, "frac_reward_zero_std": 0.0, "grad_norm": 0.06533315032720566, "learning_rate": 4.477398789180214e-06, "loss": -0.0054, "num_tokens": 185441102.0, "reward": 3.6075239181518555, "reward_std": 0.7264617681503296, "rewards/accuracy_reward/mean": 2.875, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.7325237989425659, "rewards/ngram_similarity_reward/std": 0.2967837154865265, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 518.390625, "completions/mean_terminated_length": 518.390625, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.5218169612888789, "frac_reward_zero_std": 0.0, "grad_norm": 0.08033981174230576, "learning_rate": 4.476331586086435e-06, "loss": -0.0108, "num_tokens": 185581527.0, "reward": 2.005021810531616, "reward_std": 0.960408091545105, "rewards/accuracy_reward/mean": 1.28125, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.723771870136261, "rewards/ngram_similarity_reward/std": 0.2440297156572342, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 471.796875, "completions/mean_terminated_length": 471.796875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.5222644886999329, "frac_reward_zero_std": 0.25, "grad_norm": 0.06338745355606079, "learning_rate": 4.475263437922689e-06, "loss": 0.0028, "num_tokens": 185712426.0, "reward": 4.866727828979492, "reward_std": 0.10598120093345642, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.8667280673980713, "rewards/ngram_similarity_reward/std": 0.3441309630870819, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 645.953125, "completions/mean_terminated_length": 645.953125, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.5227120161109868, "frac_reward_zero_std": 0.0, "grad_norm": 0.05194732919335365, "learning_rate": 4.474194345273726e-06, "loss": 0.0358, "num_tokens": 185883015.0, "reward": 6.271397590637207, "reward_std": 0.13656187057495117, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.7713972330093384, "rewards/ngram_similarity_reward/std": 0.20546936988830566, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 388.6875, "completions/mean_terminated_length": 388.6875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.5231595435220407, "frac_reward_zero_std": 0.0, "grad_norm": 0.08884132653474808, "learning_rate": 4.473124308724814e-06, "loss": 0.0119, "num_tokens": 185998451.0, "reward": 3.39813232421875, "reward_std": 0.6454095244407654, "rewards/accuracy_reward/mean": 2.6875, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.7106325626373291, "rewards/ngram_similarity_reward/std": 0.32445278763771057, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 305.453125, "completions/mean_terminated_length": 305.453125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.5236070709330947, "frac_reward_zero_std": 0.0, "grad_norm": 0.11478908360004425, "learning_rate": 4.472053328861738e-06, "loss": 0.0363, "num_tokens": 186162144.0, "reward": 5.454303741455078, "reward_std": 1.8677128553390503, "rewards/accuracy_reward/mean": 4.46875, "rewards/accuracy_reward/std": 2.2815253734588623, "rewards/ngram_similarity_reward/mean": 0.9855537414550781, "rewards/ngram_similarity_reward/std": 0.2933935523033142, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 374.828125, "completions/mean_terminated_length": 374.828125, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.5240545983441486, "frac_reward_zero_std": 0.25, "grad_norm": 0.09934691339731216, "learning_rate": 4.470981406270802e-06, "loss": 0.0094, "num_tokens": 186365653.0, "reward": 3.188791513442993, "reward_std": 0.1223050057888031, "rewards/accuracy_reward/mean": 2.484375, "rewards/accuracy_reward/std": 3.0419929027557373, "rewards/ngram_similarity_reward/mean": 0.7044165134429932, "rewards/ngram_similarity_reward/std": 0.1886785477399826, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 456.9375, "completions/mean_terminated_length": 456.9375, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.5245021257552025, "frac_reward_zero_std": 0.0, "grad_norm": 0.09197183698415756, "learning_rate": 4.469908541538821e-06, "loss": 0.0019, "num_tokens": 186516833.0, "reward": 3.5278964042663574, "reward_std": 0.8360381722450256, "rewards/accuracy_reward/mean": 3.0625, "rewards/accuracy_reward/std": 2.9700891971588135, "rewards/ngram_similarity_reward/mean": 0.4653964638710022, "rewards/ngram_similarity_reward/std": 0.18946166336536407, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 754.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 477.09375, "completions/mean_terminated_length": 477.09375, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.5249496531662564, "frac_reward_zero_std": 0.0, "grad_norm": 0.08766616135835648, "learning_rate": 4.468834735253129e-06, "loss": -0.0272, "num_tokens": 186673159.0, "reward": 3.9475269317626953, "reward_std": 0.9023805856704712, "rewards/accuracy_reward/mean": 3.34375, "rewards/accuracy_reward/std": 2.9016621112823486, "rewards/ngram_similarity_reward/mean": 0.6037769317626953, "rewards/ngram_similarity_reward/std": 0.3598421514034271, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 360.390625, "completions/mean_terminated_length": 360.390625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.5253971805773103, "frac_reward_zero_std": 0.0, "grad_norm": 0.11231087893247604, "learning_rate": 4.467759988001576e-06, "loss": 0.0126, "num_tokens": 186914208.0, "reward": 1.7884948253631592, "reward_std": 0.534112274646759, "rewards/accuracy_reward/mean": 1.03125, "rewards/accuracy_reward/std": 2.7195281982421875, "rewards/ngram_similarity_reward/mean": 0.7572449445724487, "rewards/ngram_similarity_reward/std": 0.28806519508361816, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 473.125, "completions/mean_terminated_length": 473.125, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.5258447079883642, "frac_reward_zero_std": 0.25, "grad_norm": 0.08042744547128677, "learning_rate": 4.466684300372524e-06, "loss": 0.0121, "num_tokens": 187085416.0, "reward": 3.117652177810669, "reward_std": 0.814605712890625, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6176521182060242, "rewards/ngram_similarity_reward/std": 0.17957745492458344, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 357.75, "completions/mean_terminated_length": 357.75, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.5262922353994182, "frac_reward_zero_std": 0.0, "grad_norm": 0.138007253408432, "learning_rate": 4.465607672954855e-06, "loss": -0.0453, "num_tokens": 187231064.0, "reward": 5.352939605712891, "reward_std": 0.8770514726638794, "rewards/accuracy_reward/mean": 4.46875, "rewards/accuracy_reward/std": 2.2815253734588623, "rewards/ngram_similarity_reward/mean": 0.8841900825500488, "rewards/ngram_similarity_reward/std": 0.2505876123905182, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 338.390625, "completions/mean_terminated_length": 338.390625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.5267397628104722, "frac_reward_zero_std": 0.0, "grad_norm": 0.11972807347774506, "learning_rate": 4.464530106337959e-06, "loss": 0.0243, "num_tokens": 187372497.0, "reward": 3.768209218978882, "reward_std": 0.9440657496452332, "rewards/accuracy_reward/mean": 3.15625, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.6119591593742371, "rewards/ngram_similarity_reward/std": 0.2602013051509857, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 432.234375, "completions/mean_terminated_length": 432.234375, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.5271872902215261, "frac_reward_zero_std": 0.0, "grad_norm": 0.11133215576410294, "learning_rate": 4.4634516011117455e-06, "loss": -0.0083, "num_tokens": 187599568.0, "reward": 2.9685022830963135, "reward_std": 1.5151430368423462, "rewards/accuracy_reward/mean": 2.203125, "rewards/accuracy_reward/std": 3.0272817611694336, "rewards/ngram_similarity_reward/mean": 0.7653775215148926, "rewards/ngram_similarity_reward/std": 0.40268921852111816, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 434.78125, "completions/mean_terminated_length": 434.78125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.52763481763258, "frac_reward_zero_std": 0.25, "grad_norm": 0.06760997325181961, "learning_rate": 4.4623721578666345e-06, "loss": -0.0061, "num_tokens": 187845202.0, "reward": 2.0149641036987305, "reward_std": 0.9120714664459229, "rewards/accuracy_reward/mean": 1.28125, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.7337141633033752, "rewards/ngram_similarity_reward/std": 0.2726683020591736, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 449.578125, "completions/mean_terminated_length": 449.578125, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.5280823450436339, "frac_reward_zero_std": 0.0, "grad_norm": 0.0716506615281105, "learning_rate": 4.461291777193562e-06, "loss": 0.0071, "num_tokens": 188014775.0, "reward": 5.391098976135254, "reward_std": 1.4299428462982178, "rewards/accuracy_reward/mean": 4.640625, "rewards/accuracy_reward/std": 2.1445181369781494, "rewards/ngram_similarity_reward/mean": 0.7504733800888062, "rewards/ngram_similarity_reward/std": 0.2910078167915344, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 500.984375, "completions/mean_terminated_length": 500.984375, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.5285298724546879, "frac_reward_zero_std": 0.0, "grad_norm": 0.07876599580049515, "learning_rate": 4.460210459683975e-06, "loss": 0.0322, "num_tokens": 188172854.0, "reward": 4.6322832107543945, "reward_std": 0.08452863991260529, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6322831511497498, "rewards/ngram_similarity_reward/std": 0.3113572597503662, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 431.34375, "completions/mean_terminated_length": 431.34375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.5289773998657418, "frac_reward_zero_std": 0.0, "grad_norm": 0.08243060857057571, "learning_rate": 4.459128205929835e-06, "loss": 0.0084, "num_tokens": 188320764.0, "reward": 4.336956977844238, "reward_std": 0.7198574542999268, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.6182069778442383, "rewards/ngram_similarity_reward/std": 0.19275131821632385, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 435.265625, "completions/mean_terminated_length": 435.265625, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.5294249272767957, "frac_reward_zero_std": 0.0, "grad_norm": 0.08188221603631973, "learning_rate": 4.458045016523615e-06, "loss": -0.0147, "num_tokens": 188481293.0, "reward": 4.523244857788086, "reward_std": 1.700342059135437, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.6169949173927307, "rewards/ngram_similarity_reward/std": 0.325880229473114, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 408.3125, "completions/mean_terminated_length": 408.3125, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.5298724546878496, "frac_reward_zero_std": 0.0, "grad_norm": 0.12688559293746948, "learning_rate": 4.4569608920582994e-06, "loss": -0.0039, "num_tokens": 188771185.0, "reward": 2.5959887504577637, "reward_std": 0.7658644914627075, "rewards/accuracy_reward/mean": 2.109375, "rewards/accuracy_reward/std": 3.0164480209350586, "rewards/ngram_similarity_reward/mean": 0.4866139888763428, "rewards/ngram_similarity_reward/std": 0.2558864653110504, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 836.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 577.03125, "completions/mean_terminated_length": 577.03125, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.5303199820989035, "frac_reward_zero_std": 0.0, "grad_norm": 0.06306634098291397, "learning_rate": 4.455875833127388e-06, "loss": 0.0149, "num_tokens": 188898899.0, "reward": 3.541544198989868, "reward_std": 0.8483132123947144, "rewards/accuracy_reward/mean": 2.859375, "rewards/accuracy_reward/std": 3.0203921794891357, "rewards/ngram_similarity_reward/mean": 0.6821693181991577, "rewards/ngram_similarity_reward/std": 0.26785826683044434, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 402.6875, "completions/mean_terminated_length": 402.6875, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.5307675095099574, "frac_reward_zero_std": 0.0, "grad_norm": 0.10755161941051483, "learning_rate": 4.4547898403248885e-06, "loss": 0.0222, "num_tokens": 189054527.0, "reward": 4.52208948135376, "reward_std": 0.13962656259536743, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.522089421749115, "rewards/ngram_similarity_reward/std": 0.29837271571159363, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 504.546875, "completions/mean_terminated_length": 504.546875, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.5312150369210115, "frac_reward_zero_std": 0.0, "grad_norm": 0.06728096306324005, "learning_rate": 4.4537029142453215e-06, "loss": -0.0118, "num_tokens": 189207106.0, "reward": 4.069512367248535, "reward_std": 0.899407148361206, "rewards/accuracy_reward/mean": 3.453125, "rewards/accuracy_reward/std": 2.962354898452759, "rewards/ngram_similarity_reward/mean": 0.6163874864578247, "rewards/ngram_similarity_reward/std": 0.2319907248020172, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 355.28125, "completions/mean_terminated_length": 355.28125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.5316625643320654, "frac_reward_zero_std": 0.0, "grad_norm": 0.09989286214113235, "learning_rate": 4.452615055483719e-06, "loss": 0.0002, "num_tokens": 189361076.0, "reward": 6.271515369415283, "reward_std": 0.5231516361236572, "rewards/accuracy_reward/mean": 5.40625, "rewards/accuracy_reward/std": 0.7500000596046448, "rewards/ngram_similarity_reward/mean": 0.8652653694152832, "rewards/ngram_similarity_reward/std": 0.3173908591270447, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 402.515625, "completions/mean_terminated_length": 402.515625, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.5321100917431193, "frac_reward_zero_std": 0.0, "grad_norm": 0.0916324183344841, "learning_rate": 4.451526264635622e-06, "loss": -0.0148, "num_tokens": 189512661.0, "reward": 4.111636638641357, "reward_std": 0.8803005814552307, "rewards/accuracy_reward/mean": 3.34375, "rewards/accuracy_reward/std": 2.9016621112823486, "rewards/ngram_similarity_reward/mean": 0.7678864598274231, "rewards/ngram_similarity_reward/std": 0.2462681084871292, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 432.78125, "completions/mean_terminated_length": 432.78125, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.5325576191541732, "frac_reward_zero_std": 0.0, "grad_norm": 0.10155756026506424, "learning_rate": 4.450436542297082e-06, "loss": 0.0177, "num_tokens": 189640967.0, "reward": 1.6175291538238525, "reward_std": 0.1283179223537445, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6175291538238525, "rewards/ngram_similarity_reward/std": 0.1763124018907547, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 691.109375, "completions/mean_terminated_length": 497.26788330078125, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.5330051465652271, "frac_reward_zero_std": 0.0, "grad_norm": 0.07862675935029984, "learning_rate": 4.4493458890646615e-06, "loss": -0.1386, "num_tokens": 189758030.0, "reward": 3.4345579147338867, "reward_std": 1.5686781406402588, "rewards/accuracy_reward/mean": 2.953125, "rewards/accuracy_reward/std": 3.0075550079345703, "rewards/ngram_similarity_reward/mean": 0.4814329743385315, "rewards/ngram_similarity_reward/std": 0.26114898920059204, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 439.953125, "completions/mean_terminated_length": 439.953125, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.533452673976281, "frac_reward_zero_std": 0.0, "grad_norm": 0.10595672577619553, "learning_rate": 4.448254305535432e-06, "loss": -0.0085, "num_tokens": 189893019.0, "reward": 3.101775884628296, "reward_std": 0.18303251266479492, "rewards/accuracy_reward/mean": 2.484375, "rewards/accuracy_reward/std": 3.0419929027557373, "rewards/ngram_similarity_reward/mean": 0.6174007654190063, "rewards/ngram_similarity_reward/std": 0.2428828775882721, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 377.53125, "completions/mean_terminated_length": 377.53125, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.533900201387335, "frac_reward_zero_std": 0.0, "grad_norm": 0.09084273874759674, "learning_rate": 4.447161792306976e-06, "loss": -0.0022, "num_tokens": 190032973.0, "reward": 2.9964699745178223, "reward_std": 0.48650163412094116, "rewards/accuracy_reward/mean": 2.578125, "rewards/accuracy_reward/std": 3.0410144329071045, "rewards/ngram_similarity_reward/mean": 0.41834527254104614, "rewards/ngram_similarity_reward/std": 0.40859779715538025, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 445.21875, "completions/mean_terminated_length": 445.21875, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.5343477287983889, "frac_reward_zero_std": 0.0, "grad_norm": 0.07660823315382004, "learning_rate": 4.446068349977381e-06, "loss": -0.0193, "num_tokens": 190167883.0, "reward": 4.553478240966797, "reward_std": 0.8184431195259094, "rewards/accuracy_reward/mean": 3.703125, "rewards/accuracy_reward/std": 2.789889335632324, "rewards/ngram_similarity_reward/mean": 0.850353479385376, "rewards/ngram_similarity_reward/std": 0.3576010465621948, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 419.4375, "completions/mean_terminated_length": 419.4375, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.5347952562094428, "frac_reward_zero_std": 0.0, "grad_norm": 0.12094959616661072, "learning_rate": 4.444973979145247e-06, "loss": -0.0241, "num_tokens": 190289575.0, "reward": 3.085411548614502, "reward_std": 1.3554508686065674, "rewards/accuracy_reward/mean": 2.4375, "rewards/accuracy_reward/std": 3.095695972442627, "rewards/ngram_similarity_reward/mean": 0.6479116678237915, "rewards/ngram_similarity_reward/std": 0.3733079135417938, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 428.1875, "completions/mean_terminated_length": 428.1875, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.5352427836204967, "frac_reward_zero_std": 0.0, "grad_norm": 0.07159439474344254, "learning_rate": 4.443878680409681e-06, "loss": -0.0079, "num_tokens": 190427731.0, "reward": 6.272378921508789, "reward_std": 0.09958252310752869, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.7723792195320129, "rewards/ngram_similarity_reward/std": 0.3313034176826477, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 387.4375, "completions/mean_terminated_length": 387.4375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.5356903110315507, "frac_reward_zero_std": 0.0, "grad_norm": 0.08593352138996124, "learning_rate": 4.442782454370296e-06, "loss": 0.0077, "num_tokens": 190591567.0, "reward": 4.706838130950928, "reward_std": 0.5320311188697815, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.8005880117416382, "rewards/ngram_similarity_reward/std": 0.26705488562583923, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 459.296875, "completions/mean_terminated_length": 459.296875, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.5361378384426047, "frac_reward_zero_std": 0.0, "grad_norm": 0.09371272474527359, "learning_rate": 4.441685301627216e-06, "loss": 0.0152, "num_tokens": 190817410.0, "reward": 3.785367488861084, "reward_std": 0.9774179458618164, "rewards/accuracy_reward/mean": 3.453125, "rewards/accuracy_reward/std": 2.962354898452759, "rewards/ngram_similarity_reward/mean": 0.33224231004714966, "rewards/ngram_similarity_reward/std": 0.2916002869606018, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 383.078125, "completions/mean_terminated_length": 383.078125, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.5365853658536586, "frac_reward_zero_std": 0.0, "grad_norm": 0.10788188129663467, "learning_rate": 4.440587222781071e-06, "loss": 0.0254, "num_tokens": 190947559.0, "reward": 1.6308579444885254, "reward_std": 1.6703917980194092, "rewards/accuracy_reward/mean": 1.140625, "rewards/accuracy_reward/std": 2.7566208839416504, "rewards/ngram_similarity_reward/mean": 0.4902329742908478, "rewards/ngram_similarity_reward/std": 0.21183845400810242, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 408.375, "completions/mean_terminated_length": 408.375, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.5370328932647125, "frac_reward_zero_std": 0.0, "grad_norm": 0.09294694662094116, "learning_rate": 4.439488218432996e-06, "loss": 0.0038, "num_tokens": 191123359.0, "reward": 3.347154140472412, "reward_std": 1.6056389808654785, "rewards/accuracy_reward/mean": 2.78125, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.5659040212631226, "rewards/ngram_similarity_reward/std": 0.32600826025009155, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 438.234375, "completions/mean_terminated_length": 438.234375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.5374804206757664, "frac_reward_zero_std": 0.0, "grad_norm": 0.09265422821044922, "learning_rate": 4.438388289184637e-06, "loss": -0.0324, "num_tokens": 191257998.0, "reward": 3.1129822731018066, "reward_std": 1.1772977113723755, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.519232451915741, "rewards/ngram_similarity_reward/std": 0.20867078006267548, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 504.6875, "completions/mean_terminated_length": 504.6875, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.5379279480868203, "frac_reward_zero_std": 0.0, "grad_norm": 0.07943686842918396, "learning_rate": 4.437287435638141e-06, "loss": -0.02, "num_tokens": 191419034.0, "reward": 1.963949203491211, "reward_std": 1.6095166206359863, "rewards/accuracy_reward/mean": 1.359375, "rewards/accuracy_reward/std": 2.816432476043701, "rewards/ngram_similarity_reward/mean": 0.6045742034912109, "rewards/ngram_similarity_reward/std": 0.3855489194393158, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 341.03125, "completions/mean_terminated_length": 341.03125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.5383754754978742, "frac_reward_zero_std": 0.25, "grad_norm": 0.1066887304186821, "learning_rate": 4.436185658396165e-06, "loss": -0.0023, "num_tokens": 191526300.0, "reward": 4.599543571472168, "reward_std": 1.4790633916854858, "rewards/accuracy_reward/mean": 3.875, "rewards/accuracy_reward/std": 2.7284510135650635, "rewards/ngram_similarity_reward/mean": 0.7245436906814575, "rewards/ngram_similarity_reward/std": 0.1518775224685669, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 427.9375, "completions/mean_terminated_length": 427.9375, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.5388230029089282, "frac_reward_zero_std": 0.0, "grad_norm": 0.1014091968536377, "learning_rate": 4.435082958061871e-06, "loss": -0.0068, "num_tokens": 191684808.0, "reward": 0.0338769257068634, "reward_std": 0.5324877500534058, "rewards/accuracy_reward/mean": -0.453125, "rewards/accuracy_reward/std": 0.7853760123252869, "rewards/ngram_similarity_reward/mean": 0.4870019257068634, "rewards/ngram_similarity_reward/std": 0.18385639786720276, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 513.0, "completions/mean_terminated_length": 513.0, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.5392705303199821, "frac_reward_zero_std": 0.0, "grad_norm": 0.07513110339641571, "learning_rate": 4.433979335238925e-06, "loss": -0.0052, "num_tokens": 191822568.0, "reward": 4.481563568115234, "reward_std": 0.7649232149124146, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.8565635681152344, "rewards/ngram_similarity_reward/std": 0.19029684364795685, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 846.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 517.5, "completions/mean_terminated_length": 517.5, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.539718057731036, "frac_reward_zero_std": 0.0, "grad_norm": 0.08555588126182556, "learning_rate": 4.4328747905314985e-06, "loss": -0.0155, "num_tokens": 192039816.0, "reward": 1.8265221118927002, "reward_std": 2.752342700958252, "rewards/accuracy_reward/mean": 1.28125, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.5452721118927002, "rewards/ngram_similarity_reward/std": 0.3277539610862732, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 418.625, "completions/mean_terminated_length": 418.625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.5401655851420899, "frac_reward_zero_std": 0.25, "grad_norm": 0.09938278049230576, "learning_rate": 4.431769324544268e-06, "loss": 0.0287, "num_tokens": 192202608.0, "reward": 4.789680004119873, "reward_std": 0.07458257675170898, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7896800637245178, "rewards/ngram_similarity_reward/std": 0.32243624329566956, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 358.78125, "completions/mean_terminated_length": 358.78125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.5406131125531439, "frac_reward_zero_std": 0.0, "grad_norm": 0.10705152899026871, "learning_rate": 4.430662937882415e-06, "loss": -0.0476, "num_tokens": 192384818.0, "reward": 4.78361177444458, "reward_std": 1.1920162439346313, "rewards/accuracy_reward/mean": 4.359375, "rewards/accuracy_reward/std": 2.3962087631225586, "rewards/ngram_similarity_reward/mean": 0.4242364764213562, "rewards/ngram_similarity_reward/std": 0.24500861763954163, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 416.859375, "completions/mean_terminated_length": 416.859375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.5410606399641978, "frac_reward_zero_std": 0.0, "grad_norm": 0.10445639491081238, "learning_rate": 4.429555631151624e-06, "loss": 0.0699, "num_tokens": 192601529.0, "reward": 3.170267105102539, "reward_std": 0.23001646995544434, "rewards/accuracy_reward/mean": 2.359375, "rewards/accuracy_reward/std": 3.1816298961639404, "rewards/ngram_similarity_reward/mean": 0.8108919858932495, "rewards/ngram_similarity_reward/std": 0.39939984679222107, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 366.109375, "completions/mean_terminated_length": 366.109375, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.5415081673752518, "frac_reward_zero_std": 0.0, "grad_norm": 0.09566810727119446, "learning_rate": 4.428447404958084e-06, "loss": 0.0063, "num_tokens": 192781024.0, "reward": 2.16746187210083, "reward_std": 2.0318212509155273, "rewards/accuracy_reward/mean": 1.46875, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.6987118124961853, "rewards/ngram_similarity_reward/std": 0.3588160574436188, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 349.296875, "completions/mean_terminated_length": 349.296875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.5419556947863057, "frac_reward_zero_std": 0.25, "grad_norm": 0.09934712946414948, "learning_rate": 4.427338259908485e-06, "loss": 0.0101, "num_tokens": 192888531.0, "reward": 5.944144248962402, "reward_std": 0.8069709539413452, "rewards/accuracy_reward/mean": 5.3125, "rewards/accuracy_reward/std": 1.0522085428237915, "rewards/ngram_similarity_reward/mean": 0.6316438913345337, "rewards/ngram_similarity_reward/std": 0.28903961181640625, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 379.828125, "completions/mean_terminated_length": 379.828125, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.5424032221973596, "frac_reward_zero_std": 0.0, "grad_norm": 0.10533181577920914, "learning_rate": 4.426228196610024e-06, "loss": -0.0253, "num_tokens": 192991032.0, "reward": 4.857659339904785, "reward_std": 0.11671176552772522, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.8576596975326538, "rewards/ngram_similarity_reward/std": 0.32358139753341675, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 712.0, "completions/max_terminated_length": 712.0, "completions/mean_length": 446.375, "completions/mean_terminated_length": 446.375, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.5428507496084135, "frac_reward_zero_std": 0.0, "grad_norm": 0.0953093022108078, "learning_rate": 4.4251172156703974e-06, "loss": -0.0166, "num_tokens": 193172656.0, "reward": 2.717423915863037, "reward_std": 2.219616174697876, "rewards/accuracy_reward/mean": 2.125, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.5924241542816162, "rewards/ngram_similarity_reward/std": 0.3202495872974396, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 398.171875, "completions/mean_terminated_length": 398.171875, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.5432982770194674, "frac_reward_zero_std": 0.0, "grad_norm": 0.12990568578243256, "learning_rate": 4.424005317697805e-06, "loss": -0.0166, "num_tokens": 193324651.0, "reward": 4.243120193481445, "reward_std": 0.8481252789497375, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.6181201934814453, "rewards/ngram_similarity_reward/std": 0.2649337351322174, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 402.25, "completions/mean_terminated_length": 402.25, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.5437458044305213, "frac_reward_zero_std": 0.0, "grad_norm": 0.10022781044244766, "learning_rate": 4.422892503300949e-06, "loss": -0.0005, "num_tokens": 193469899.0, "reward": 4.512398719787598, "reward_std": 0.8765596747398376, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.8873988389968872, "rewards/ngram_similarity_reward/std": 0.29553747177124023, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 924.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 493.53125, "completions/mean_terminated_length": 493.53125, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.5441933318415753, "frac_reward_zero_std": 0.0, "grad_norm": 0.09071893244981766, "learning_rate": 4.421778773089035e-06, "loss": 0.0228, "num_tokens": 193605133.0, "reward": 4.1777238845825195, "reward_std": 2.1853396892547607, "rewards/accuracy_reward/mean": 3.59375, "rewards/accuracy_reward/std": 2.854785919189453, "rewards/ngram_similarity_reward/mean": 0.5839738249778748, "rewards/ngram_similarity_reward/std": 0.24924445152282715, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 353.8125, "completions/mean_terminated_length": 353.8125, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.5446408592526292, "frac_reward_zero_std": 0.0, "grad_norm": 0.10845791548490524, "learning_rate": 4.420664127671764e-06, "loss": 0.0341, "num_tokens": 193749793.0, "reward": 5.829488754272461, "reward_std": 0.9526141881942749, "rewards/accuracy_reward/mean": 4.84375, "rewards/accuracy_reward/std": 1.8874586820602417, "rewards/ngram_similarity_reward/mean": 0.9857387542724609, "rewards/ngram_similarity_reward/std": 0.2537689805030823, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 466.78125, "completions/mean_terminated_length": 466.78125, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.5450883866636832, "frac_reward_zero_std": 0.25, "grad_norm": 0.07989437133073807, "learning_rate": 4.419548567659344e-06, "loss": 0.0066, "num_tokens": 193930371.0, "reward": 2.572287082672119, "reward_std": 0.8746790885925293, "rewards/accuracy_reward/mean": 1.84375, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.7285372018814087, "rewards/ngram_similarity_reward/std": 0.2515488862991333, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 487.046875, "completions/mean_terminated_length": 487.046875, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.5455359140747371, "frac_reward_zero_std": 0.0, "grad_norm": 0.08332957327365875, "learning_rate": 4.418432093662483e-06, "loss": 0.0294, "num_tokens": 194075942.0, "reward": 4.446735382080078, "reward_std": 0.46390363574028015, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.5404852032661438, "rewards/ngram_similarity_reward/std": 0.28095483779907227, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 412.859375, "completions/mean_terminated_length": 412.859375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.545983441485791, "frac_reward_zero_std": 0.0, "grad_norm": 0.0877496600151062, "learning_rate": 4.417314706292386e-06, "loss": 0.019, "num_tokens": 194225917.0, "reward": 5.413695812225342, "reward_std": 1.3082340955734253, "rewards/accuracy_reward/mean": 4.5625, "rewards/accuracy_reward/std": 2.195775270462036, "rewards/ngram_similarity_reward/mean": 0.8511958122253418, "rewards/ngram_similarity_reward/std": 0.26645204424858093, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 380.0625, "completions/mean_terminated_length": 380.0625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.546430968896845, "frac_reward_zero_std": 0.0, "grad_norm": 0.12315753102302551, "learning_rate": 4.416196406160762e-06, "loss": 0.061, "num_tokens": 194423985.0, "reward": 2.7438230514526367, "reward_std": 1.4954211711883545, "rewards/accuracy_reward/mean": 2.125, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.618823230266571, "rewards/ngram_similarity_reward/std": 0.3517220914363861, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 432.234375, "completions/mean_terminated_length": 432.234375, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.5468784963078989, "frac_reward_zero_std": 0.0, "grad_norm": 0.16123153269290924, "learning_rate": 4.415077193879816e-06, "loss": 0.0049, "num_tokens": 194566000.0, "reward": 3.785012722015381, "reward_std": 1.0094263553619385, "rewards/accuracy_reward/mean": 3.109375, "rewards/accuracy_reward/std": 3.125000238418579, "rewards/ngram_similarity_reward/mean": 0.6756376624107361, "rewards/ngram_similarity_reward/std": 0.3560906946659088, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 331.125, "completions/mean_terminated_length": 331.125, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.5473260237189528, "frac_reward_zero_std": 0.25, "grad_norm": 0.09261993318796158, "learning_rate": 4.413957070062256e-06, "loss": -0.0121, "num_tokens": 194725432.0, "reward": 6.400691986083984, "reward_std": 0.8437039256095886, "rewards/accuracy_reward/mean": 5.3125, "rewards/accuracy_reward/std": 1.0522085428237915, "rewards/ngram_similarity_reward/mean": 1.0881922245025635, "rewards/ngram_similarity_reward/std": 0.16068026423454285, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 407.578125, "completions/mean_terminated_length": 407.578125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.5477735511300067, "frac_reward_zero_std": 0.0, "grad_norm": 0.11597134172916412, "learning_rate": 4.4128360353212846e-06, "loss": -0.0016, "num_tokens": 194847341.0, "reward": 4.762581825256348, "reward_std": 0.218172088265419, "rewards/accuracy_reward/mean": 3.984375, "rewards/accuracy_reward/std": 2.648702621459961, "rewards/ngram_similarity_reward/mean": 0.7782068252563477, "rewards/ngram_similarity_reward/std": 0.40134501457214355, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 491.21875, "completions/mean_terminated_length": 466.5079650878906, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.5482210785410606, "frac_reward_zero_std": 0.0, "grad_norm": 0.09415656328201294, "learning_rate": 4.411714090270606e-06, "loss": 0.0172, "num_tokens": 195059947.0, "reward": 3.096059799194336, "reward_std": 0.42152872681617737, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.6898097991943359, "rewards/ngram_similarity_reward/std": 0.22004714608192444, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 459.6875, "completions/mean_terminated_length": 459.6875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.5486686059521145, "frac_reward_zero_std": 0.0, "grad_norm": 0.08568942546844482, "learning_rate": 4.4105912355244255e-06, "loss": 0.01, "num_tokens": 195212551.0, "reward": 6.1095170974731445, "reward_std": 0.8938249349594116, "rewards/accuracy_reward/mean": 5.3125, "rewards/accuracy_reward/std": 1.0522085428237915, "rewards/ngram_similarity_reward/mean": 0.7970174551010132, "rewards/ngram_similarity_reward/std": 0.36820217967033386, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 428.046875, "completions/mean_terminated_length": 428.046875, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.5491161333631684, "frac_reward_zero_std": 0.0, "grad_norm": 0.09603255242109299, "learning_rate": 4.40946747169744e-06, "loss": 0.0154, "num_tokens": 195347834.0, "reward": 2.0748066902160645, "reward_std": 1.199176549911499, "rewards/accuracy_reward/mean": 1.640625, "rewards/accuracy_reward/std": 2.91611385345459, "rewards/ngram_similarity_reward/mean": 0.4341817796230316, "rewards/ngram_similarity_reward/std": 0.22849342226982117, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1066.0, "completions/max_terminated_length": 1066.0, "completions/mean_length": 398.203125, "completions/mean_terminated_length": 398.203125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.5495636607742225, "frac_reward_zero_std": 0.0, "grad_norm": 0.09439033269882202, "learning_rate": 4.4083427994048484e-06, "loss": 0.0072, "num_tokens": 195524167.0, "reward": 6.176176071166992, "reward_std": 0.10318027436733246, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.6761763095855713, "rewards/ngram_similarity_reward/std": 0.2377578467130661, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 504.359375, "completions/mean_terminated_length": 504.359375, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.5500111881852764, "frac_reward_zero_std": 0.0, "grad_norm": 0.07317329943180084, "learning_rate": 4.407217219262347e-06, "loss": -0.0342, "num_tokens": 195671966.0, "reward": 3.126587390899658, "reward_std": 0.4775499105453491, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.7203375101089478, "rewards/ngram_similarity_reward/std": 0.19649796187877655, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 428.515625, "completions/mean_terminated_length": 428.515625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.5504587155963303, "frac_reward_zero_std": 0.25, "grad_norm": 0.08198384940624237, "learning_rate": 4.406090731886125e-06, "loss": 0.0426, "num_tokens": 195794463.0, "reward": 4.950111389160156, "reward_std": 0.13821549713611603, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.9501111507415771, "rewards/ngram_similarity_reward/std": 0.2974853515625, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 420.203125, "completions/mean_terminated_length": 420.203125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.5509062430073842, "frac_reward_zero_std": 0.25, "grad_norm": 0.09043321013450623, "learning_rate": 4.404963337892874e-06, "loss": 0.005, "num_tokens": 195923756.0, "reward": 2.980551242828369, "reward_std": 0.0410948246717453, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.4805510640144348, "rewards/ngram_similarity_reward/std": 0.2743230164051056, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 350.15625, "completions/mean_terminated_length": 350.15625, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.5513537704184381, "frac_reward_zero_std": 0.0, "grad_norm": 0.1214212104678154, "learning_rate": 4.403835037899778e-06, "loss": -0.0058, "num_tokens": 196088710.0, "reward": 3.1802761554718018, "reward_std": 0.18114234507083893, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6802761554718018, "rewards/ngram_similarity_reward/std": 0.26738229393959045, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 383.6875, "completions/mean_terminated_length": 383.6875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.551801297829492, "frac_reward_zero_std": 0.0, "grad_norm": 0.11884015798568726, "learning_rate": 4.4027058325245186e-06, "loss": 0.0295, "num_tokens": 196264658.0, "reward": 3.2437744140625, "reward_std": 1.6648763418197632, "rewards/accuracy_reward/mean": 2.6875, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.5562744140625, "rewards/ngram_similarity_reward/std": 0.2828984260559082, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 450.328125, "completions/mean_terminated_length": 450.328125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.552248825240546, "frac_reward_zero_std": 0.0, "grad_norm": 0.08260753005743027, "learning_rate": 4.401575722385272e-06, "loss": -0.0038, "num_tokens": 196394135.0, "reward": 5.003075122833252, "reward_std": 1.365034580230713, "rewards/accuracy_reward/mean": 4.5625, "rewards/accuracy_reward/std": 2.195775270462036, "rewards/ngram_similarity_reward/mean": 0.4405752122402191, "rewards/ngram_similarity_reward/std": 0.19160227477550507, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 757.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 462.46875, "completions/mean_terminated_length": 462.46875, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.5526963526515999, "frac_reward_zero_std": 0.0, "grad_norm": 0.08738621324300766, "learning_rate": 4.400444708100712e-06, "loss": -0.0076, "num_tokens": 196559973.0, "reward": 2.6876659393310547, "reward_std": 1.1295281648635864, "rewards/accuracy_reward/mean": 2.125, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.5626658201217651, "rewards/ngram_similarity_reward/std": 0.11666838079690933, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 354.875, "completions/mean_terminated_length": 354.875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.5531438800626538, "frac_reward_zero_std": 0.0, "grad_norm": 0.11991672962903976, "learning_rate": 4.399312790290002e-06, "loss": 0.0102, "num_tokens": 196731933.0, "reward": 5.695017337799072, "reward_std": 1.147326946258545, "rewards/accuracy_reward/mean": 5.03125, "rewards/accuracy_reward/std": 1.6229382753372192, "rewards/ngram_similarity_reward/mean": 0.6637676954269409, "rewards/ngram_similarity_reward/std": 0.3793351352214813, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 386.046875, "completions/mean_terminated_length": 386.046875, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.5535914074737077, "frac_reward_zero_std": 0.25, "grad_norm": 0.12412123382091522, "learning_rate": 4.398179969572807e-06, "loss": 0.0123, "num_tokens": 196894464.0, "reward": 1.0227073431015015, "reward_std": 0.8533003330230713, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 2.2060859203338623, "rewards/ngram_similarity_reward/mean": 0.6008323431015015, "rewards/ngram_similarity_reward/std": 0.287748783826828, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 469.078125, "completions/mean_terminated_length": 469.078125, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.5540389348847617, "frac_reward_zero_std": 0.0, "grad_norm": 0.07732565701007843, "learning_rate": 4.397046246569281e-06, "loss": 0.0105, "num_tokens": 197067893.0, "reward": 1.4631134271621704, "reward_std": 0.08608405292034149, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.46311336755752563, "rewards/ngram_similarity_reward/std": 0.13780225813388824, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 479.53125, "completions/mean_terminated_length": 479.53125, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.5544864622958157, "frac_reward_zero_std": 0.0, "grad_norm": 0.07165555655956268, "learning_rate": 4.395911621900076e-06, "loss": 0.0031, "num_tokens": 197229799.0, "reward": 4.127664566040039, "reward_std": 2.0247697830200195, "rewards/accuracy_reward/mean": 3.53125, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.5964146852493286, "rewards/ngram_similarity_reward/std": 0.3093799948692322, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 439.234375, "completions/mean_terminated_length": 439.234375, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.5549339897068696, "frac_reward_zero_std": 0.0, "grad_norm": 0.10605686157941818, "learning_rate": 4.394776096186334e-06, "loss": 0.0, "num_tokens": 197377766.0, "reward": 4.677614688873291, "reward_std": 0.205747589468956, "rewards/accuracy_reward/mean": 3.984375, "rewards/accuracy_reward/std": 2.648702621459961, "rewards/ngram_similarity_reward/mean": 0.693239688873291, "rewards/ngram_similarity_reward/std": 0.40632468461990356, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 427.984375, "completions/mean_terminated_length": 427.984375, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.5553815171179235, "frac_reward_zero_std": 0.0, "grad_norm": 0.09386388212442398, "learning_rate": 4.393639670049692e-06, "loss": 0.0178, "num_tokens": 197547941.0, "reward": 3.404207229614258, "reward_std": 1.5193042755126953, "rewards/accuracy_reward/mean": 2.96875, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.435457319021225, "rewards/ngram_similarity_reward/std": 0.2644595503807068, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 465.765625, "completions/mean_terminated_length": 465.765625, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.5558290445289774, "frac_reward_zero_std": 0.0, "grad_norm": 0.07347415387630463, "learning_rate": 4.392502344112279e-06, "loss": -0.0266, "num_tokens": 197685126.0, "reward": 3.5573441982269287, "reward_std": 1.2206635475158691, "rewards/accuracy_reward/mean": 3.0625, "rewards/accuracy_reward/std": 2.9700891971588135, "rewards/ngram_similarity_reward/mean": 0.49484407901763916, "rewards/ngram_similarity_reward/std": 0.29545167088508606, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 442.4375, "completions/mean_terminated_length": 442.4375, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.5562765719400313, "frac_reward_zero_std": 0.0, "grad_norm": 0.09946972131729126, "learning_rate": 4.391364118996719e-06, "loss": 0.0076, "num_tokens": 197872834.0, "reward": 1.5781934261322021, "reward_std": 0.5496147871017456, "rewards/accuracy_reward/mean": 1.1875, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.3906933069229126, "rewards/ngram_similarity_reward/std": 0.4072954058647156, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 325.6875, "completions/mean_terminated_length": 325.6875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.5567240993510852, "frac_reward_zero_std": 0.5, "grad_norm": 0.07552611082792282, "learning_rate": 4.390224995326126e-06, "loss": -0.0015, "num_tokens": 198019518.0, "reward": 3.113879919052124, "reward_std": 0.4030926525592804, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.5201297998428345, "rewards/ngram_similarity_reward/std": 0.3127535283565521, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 959.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 618.53125, "completions/mean_terminated_length": 618.53125, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.5571716267621392, "frac_reward_zero_std": 0.0, "grad_norm": 0.05382458493113518, "learning_rate": 4.389084973724106e-06, "loss": -0.0017, "num_tokens": 198208240.0, "reward": 4.324518203735352, "reward_std": 0.7830394506454468, "rewards/accuracy_reward/mean": 3.609375, "rewards/accuracy_reward/std": 2.829084634780884, "rewards/ngram_similarity_reward/mean": 0.7151432633399963, "rewards/ngram_similarity_reward/std": 0.22197884321212769, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1178.0, "completions/mean_length": 516.671875, "completions/mean_terminated_length": 492.3651123046875, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.5576191541731931, "frac_reward_zero_std": 0.0, "grad_norm": 0.07948078960180283, "learning_rate": 4.3879440548147575e-06, "loss": -0.0031, "num_tokens": 198337131.0, "reward": 4.668152809143066, "reward_std": 0.5126365423202515, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.7619028687477112, "rewards/ngram_similarity_reward/std": 0.23275285959243774, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 463.5625, "completions/mean_terminated_length": 463.5625, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.558066681584247, "frac_reward_zero_std": 0.0, "grad_norm": 0.12463536858558655, "learning_rate": 4.386802239222669e-06, "loss": 0.0054, "num_tokens": 198543423.0, "reward": 4.298855304718018, "reward_std": 0.7150664925575256, "rewards/accuracy_reward/mean": 3.703125, "rewards/accuracy_reward/std": 2.789889335632324, "rewards/ngram_similarity_reward/mean": 0.5957306623458862, "rewards/ngram_similarity_reward/std": 0.4176064431667328, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 478.5625, "completions/mean_terminated_length": 478.5625, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.5585142089953009, "frac_reward_zero_std": 0.0, "grad_norm": 0.07589740306138992, "learning_rate": 4.385659527572922e-06, "loss": -0.0055, "num_tokens": 198674355.0, "reward": 5.731651782989502, "reward_std": 0.9091787934303284, "rewards/accuracy_reward/mean": 4.84375, "rewards/accuracy_reward/std": 1.8874586820602417, "rewards/ngram_similarity_reward/mean": 0.8879019021987915, "rewards/ngram_similarity_reward/std": 0.17611658573150635, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1075.0, "completions/max_terminated_length": 1075.0, "completions/mean_length": 530.96875, "completions/mean_terminated_length": 530.96875, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.5589617364063549, "frac_reward_zero_std": 0.0, "grad_norm": 0.07664234191179276, "learning_rate": 4.384515920491086e-06, "loss": 0.0182, "num_tokens": 198791345.0, "reward": 2.9477784633636475, "reward_std": 0.7595309019088745, "rewards/accuracy_reward/mean": 2.125, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.8227784633636475, "rewards/ngram_similarity_reward/std": 0.264209121465683, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 502.703125, "completions/mean_terminated_length": 502.703125, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.5594092638174089, "frac_reward_zero_std": 0.0, "grad_norm": 0.08277595043182373, "learning_rate": 4.383371418603222e-06, "loss": -0.0294, "num_tokens": 198948110.0, "reward": 3.1016769409179688, "reward_std": 0.1552654504776001, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6016767024993896, "rewards/ngram_similarity_reward/std": 0.300509512424469, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 395.53125, "completions/mean_terminated_length": 395.53125, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.5598567912284628, "frac_reward_zero_std": 0.25, "grad_norm": 0.07156224548816681, "learning_rate": 4.382226022535882e-06, "loss": 0.0104, "num_tokens": 199110688.0, "reward": 4.769443035125732, "reward_std": 0.07713472843170166, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7694433927536011, "rewards/ngram_similarity_reward/std": 0.4159128963947296, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 409.390625, "completions/mean_terminated_length": 409.390625, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.5603043186395167, "frac_reward_zero_std": 0.0, "grad_norm": 0.08550208061933517, "learning_rate": 4.381079732916104e-06, "loss": -0.0078, "num_tokens": 199290969.0, "reward": 3.241121768951416, "reward_std": 0.6393508911132812, "rewards/accuracy_reward/mean": 2.6875, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.5536216497421265, "rewards/ngram_similarity_reward/std": 0.28667500615119934, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 450.578125, "completions/mean_terminated_length": 450.578125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.5607518460505706, "frac_reward_zero_std": 0.0, "grad_norm": 0.09212508797645569, "learning_rate": 4.3799325503714205e-06, "loss": -0.0084, "num_tokens": 199500894.0, "reward": 2.061561346054077, "reward_std": 0.8142240643501282, "rewards/accuracy_reward/mean": 1.46875, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.5928114652633667, "rewards/ngram_similarity_reward/std": 0.2968214154243469, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 355.953125, "completions/mean_terminated_length": 355.953125, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.5611993734616245, "frac_reward_zero_std": 0.25, "grad_norm": 0.08225823193788528, "learning_rate": 4.378784475529847e-06, "loss": -0.0251, "num_tokens": 199671707.0, "reward": 6.1100172996521, "reward_std": 0.10467161983251572, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.6100171804428101, "rewards/ngram_similarity_reward/std": 0.2915937900543213, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 479.15625, "completions/mean_terminated_length": 479.15625, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.5616469008726784, "frac_reward_zero_std": 0.0, "grad_norm": 0.0778251439332962, "learning_rate": 4.377635509019891e-06, "loss": -0.0155, "num_tokens": 199827045.0, "reward": 3.1580843925476074, "reward_std": 0.5416929125785828, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.564334511756897, "rewards/ngram_similarity_reward/std": 0.20748886466026306, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1102.0, "completions/max_terminated_length": 1102.0, "completions/mean_length": 419.9375, "completions/mean_terminated_length": 419.9375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.5620944282837324, "frac_reward_zero_std": 0.25, "grad_norm": 0.08607825636863708, "learning_rate": 4.376485651470549e-06, "loss": -0.0038, "num_tokens": 199985249.0, "reward": 1.464486837387085, "reward_std": 0.8320564031600952, "rewards/accuracy_reward/mean": 0.984375, "rewards/accuracy_reward/std": 2.6306629180908203, "rewards/ngram_similarity_reward/mean": 0.48011189699172974, "rewards/ngram_similarity_reward/std": 0.2833652198314667, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 428.390625, "completions/mean_terminated_length": 428.390625, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.5625419556947863, "frac_reward_zero_std": 0.0, "grad_norm": 0.09921771287918091, "learning_rate": 4.375334903511302e-06, "loss": -0.0273, "num_tokens": 200126858.0, "reward": 3.4128971099853516, "reward_std": 0.6406792402267456, "rewards/accuracy_reward/mean": 2.6875, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.7253968715667725, "rewards/ngram_similarity_reward/std": 0.21139651536941528, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 529.0625, "completions/mean_terminated_length": 529.0625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.5629894831058402, "frac_reward_zero_std": 0.0, "grad_norm": 0.0691845715045929, "learning_rate": 4.37418326577212e-06, "loss": 0.0244, "num_tokens": 200282302.0, "reward": 3.6800453662872314, "reward_std": 0.8345037698745728, "rewards/accuracy_reward/mean": 2.96875, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.7112956047058105, "rewards/ngram_similarity_reward/std": 0.25884538888931274, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 460.84375, "completions/mean_terminated_length": 460.84375, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.5634370105168942, "frac_reward_zero_std": 0.0, "grad_norm": 0.09184509515762329, "learning_rate": 4.37303073888346e-06, "loss": 0.0102, "num_tokens": 200438996.0, "reward": 4.765527725219727, "reward_std": 0.5454859733581543, "rewards/accuracy_reward/mean": 4.09375, "rewards/accuracy_reward/std": 2.5617377758026123, "rewards/ngram_similarity_reward/mean": 0.6717779636383057, "rewards/ngram_similarity_reward/std": 0.3333250880241394, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 413.65625, "completions/mean_terminated_length": 413.65625, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.5638845379279481, "frac_reward_zero_std": 0.0, "grad_norm": 0.1030343621969223, "learning_rate": 4.3718773234762684e-06, "loss": -0.0029, "num_tokens": 200573614.0, "reward": 3.8357348442077637, "reward_std": 1.9523077011108398, "rewards/accuracy_reward/mean": 3.15625, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.6794849634170532, "rewards/ngram_similarity_reward/std": 0.3307085335254669, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 396.015625, "completions/mean_terminated_length": 396.015625, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.564332065339002, "frac_reward_zero_std": 0.0, "grad_norm": 0.11136715114116669, "learning_rate": 4.370723020181973e-06, "loss": -0.0062, "num_tokens": 200701919.0, "reward": 4.722345352172852, "reward_std": 1.2175238132476807, "rewards/accuracy_reward/mean": 4.09375, "rewards/accuracy_reward/std": 2.5617377758026123, "rewards/ngram_similarity_reward/mean": 0.6285953521728516, "rewards/ngram_similarity_reward/std": 0.27477124333381653, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 601.625, "completions/mean_terminated_length": 601.625, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.564779592750056, "frac_reward_zero_std": 0.0, "grad_norm": 0.06975237280130386, "learning_rate": 4.369567829632491e-06, "loss": 0.0124, "num_tokens": 200867223.0, "reward": 3.798196792602539, "reward_std": 1.3255693912506104, "rewards/accuracy_reward/mean": 3.25, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.54819655418396, "rewards/ngram_similarity_reward/std": 0.23060840368270874, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 409.546875, "completions/mean_terminated_length": 409.546875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.5652271201611099, "frac_reward_zero_std": 0.0, "grad_norm": 0.11291968077421188, "learning_rate": 4.368411752460226e-06, "loss": 0.0079, "num_tokens": 200974810.0, "reward": 1.9527816772460938, "reward_std": 2.1675050258636475, "rewards/accuracy_reward/mean": 1.28125, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.6715317964553833, "rewards/ngram_similarity_reward/std": 0.31274405121803284, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 370.84375, "completions/mean_terminated_length": 370.84375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.5656746475721638, "frac_reward_zero_std": 0.0, "grad_norm": 0.117449551820755, "learning_rate": 4.367254789298064e-06, "loss": -0.0587, "num_tokens": 201134016.0, "reward": 4.690761089324951, "reward_std": 0.18102560937404633, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6907616257667542, "rewards/ngram_similarity_reward/std": 0.31563398241996765, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 358.828125, "completions/mean_terminated_length": 358.828125, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.5661221749832177, "frac_reward_zero_std": 0.5, "grad_norm": 0.09254030138254166, "learning_rate": 4.366096940779378e-06, "loss": 0.0127, "num_tokens": 201380165.0, "reward": 4.939835548400879, "reward_std": 0.05255156755447388, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.9398356080055237, "rewards/ngram_similarity_reward/std": 0.26915013790130615, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 371.234375, "completions/mean_terminated_length": 371.234375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.5665697023942716, "frac_reward_zero_std": 0.0, "grad_norm": 0.12018410861492157, "learning_rate": 4.364938207538025e-06, "loss": 0.0074, "num_tokens": 201576292.0, "reward": 3.246954917907715, "reward_std": 1.396032452583313, "rewards/accuracy_reward/mean": 2.4375, "rewards/accuracy_reward/std": 3.095695972442627, "rewards/ngram_similarity_reward/mean": 0.8094548583030701, "rewards/ngram_similarity_reward/std": 0.3154228627681732, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 455.75, "completions/mean_terminated_length": 455.75, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.5670172298053255, "frac_reward_zero_std": 0.0, "grad_norm": 0.08729907870292664, "learning_rate": 4.3637785902083465e-06, "loss": 0.0213, "num_tokens": 201734020.0, "reward": 1.7822763919830322, "reward_std": 1.6111629009246826, "rewards/accuracy_reward/mean": 1.265625, "rewards/accuracy_reward/std": 2.775986671447754, "rewards/ngram_similarity_reward/mean": 0.5166513919830322, "rewards/ngram_similarity_reward/std": 0.30705127120018005, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 390.90625, "completions/mean_terminated_length": 390.90625, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.5674647572163795, "frac_reward_zero_std": 0.0, "grad_norm": 0.10130637139081955, "learning_rate": 4.362618089425169e-06, "loss": 0.0195, "num_tokens": 201891838.0, "reward": 3.8338756561279297, "reward_std": 2.096841335296631, "rewards/accuracy_reward/mean": 3.25, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.5838757753372192, "rewards/ngram_similarity_reward/std": 0.34362757205963135, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 464.875, "completions/mean_terminated_length": 439.7460632324219, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.5679122846274335, "frac_reward_zero_std": 0.0, "grad_norm": 0.08763626962900162, "learning_rate": 4.361456705823802e-06, "loss": -0.0274, "num_tokens": 202039766.0, "reward": 5.238713264465332, "reward_std": 1.1649223566055298, "rewards/accuracy_reward/mean": 4.375, "rewards/accuracy_reward/std": 2.3603873252868652, "rewards/ngram_similarity_reward/mean": 0.8637134432792664, "rewards/ngram_similarity_reward/std": 0.2816106379032135, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 418.171875, "completions/mean_terminated_length": 418.171875, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.5683598120384874, "frac_reward_zero_std": 0.0, "grad_norm": 0.11126846075057983, "learning_rate": 4.3602944400400364e-06, "loss": -0.0053, "num_tokens": 202186593.0, "reward": 2.743793487548828, "reward_std": 2.2492334842681885, "rewards/accuracy_reward/mean": 2.03125, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.7125436067581177, "rewards/ngram_similarity_reward/std": 0.23575741052627563, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1637.0, "completions/mean_length": 489.609375, "completions/mean_terminated_length": 439.33868408203125, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.5688073394495413, "frac_reward_zero_std": 0.0, "grad_norm": 0.06754752993583679, "learning_rate": 4.359131292710149e-06, "loss": -0.0835, "num_tokens": 202316040.0, "reward": 5.6284284591674805, "reward_std": 1.3737437725067139, "rewards/accuracy_reward/mean": 4.84375, "rewards/accuracy_reward/std": 1.8874586820602417, "rewards/ngram_similarity_reward/mean": 0.7846779227256775, "rewards/ngram_similarity_reward/std": 0.31788432598114014, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 481.921875, "completions/mean_terminated_length": 481.921875, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.5692548668605952, "frac_reward_zero_std": 0.0, "grad_norm": 0.08389648795127869, "learning_rate": 4.357967264470898e-06, "loss": -0.0086, "num_tokens": 202488723.0, "reward": 3.3738982677459717, "reward_std": 0.7010356187820435, "rewards/accuracy_reward/mean": 2.875, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.4988984763622284, "rewards/ngram_similarity_reward/std": 0.2545229494571686, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1064.0, "completions/mean_length": 620.859375, "completions/mean_terminated_length": 574.8225708007812, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.5697023942716491, "frac_reward_zero_std": 0.0, "grad_norm": 0.09216287732124329, "learning_rate": 4.356802355959524e-06, "loss": 0.0526, "num_tokens": 202655034.0, "reward": 3.811068534851074, "reward_std": 1.2754812240600586, "rewards/accuracy_reward/mean": 2.96875, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.8423184752464294, "rewards/ngram_similarity_reward/std": 0.2445010542869568, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 894.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 513.625, "completions/mean_terminated_length": 513.625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.5701499216827031, "frac_reward_zero_std": 0.0, "grad_norm": 0.08699966967105865, "learning_rate": 4.355636567813747e-06, "loss": 0.006, "num_tokens": 202846946.0, "reward": 4.7357497215271, "reward_std": 0.1327037811279297, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7357498407363892, "rewards/ngram_similarity_reward/std": 0.3276141285896301, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 401.890625, "completions/mean_terminated_length": 401.890625, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.570597449093757, "frac_reward_zero_std": 0.0, "grad_norm": 0.09865225851535797, "learning_rate": 4.354469900671773e-06, "loss": -0.0169, "num_tokens": 202977371.0, "reward": 3.6046531200408936, "reward_std": 0.8405933976173401, "rewards/accuracy_reward/mean": 2.96875, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.6359032392501831, "rewards/ngram_similarity_reward/std": 0.2587908208370209, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 418.8125, "completions/mean_terminated_length": 418.8125, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.5710449765048109, "frac_reward_zero_std": 0.25, "grad_norm": 0.10138774663209915, "learning_rate": 4.353302355172286e-06, "loss": -0.0067, "num_tokens": 203110127.0, "reward": 3.164700984954834, "reward_std": 0.06457286328077316, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6647011041641235, "rewards/ngram_similarity_reward/std": 0.3551769256591797, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 444.203125, "completions/mean_terminated_length": 444.203125, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.5714925039158648, "frac_reward_zero_std": 0.0, "grad_norm": 0.0864931121468544, "learning_rate": 4.3521339319544526e-06, "loss": 0.0059, "num_tokens": 203257500.0, "reward": 5.6036834716796875, "reward_std": 1.960383415222168, "rewards/accuracy_reward/mean": 4.84375, "rewards/accuracy_reward/std": 1.8874586820602417, "rewards/ngram_similarity_reward/mean": 0.759933352470398, "rewards/ngram_similarity_reward/std": 0.28998276591300964, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 408.625, "completions/mean_terminated_length": 408.625, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.5719400313269187, "frac_reward_zero_std": 0.25, "grad_norm": 0.08241990953683853, "learning_rate": 4.350964631657918e-06, "loss": 0.0027, "num_tokens": 203417636.0, "reward": 3.7263097763061523, "reward_std": 0.8650047183036804, "rewards/accuracy_reward/mean": 3.15625, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.5700598955154419, "rewards/ngram_similarity_reward/std": 0.46347489953041077, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 390.75, "completions/mean_terminated_length": 390.75, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.5723875587379726, "frac_reward_zero_std": 0.0, "grad_norm": 0.09796450287103653, "learning_rate": 4.349794454922811e-06, "loss": 0.0355, "num_tokens": 203556820.0, "reward": 4.256396293640137, "reward_std": 1.2533926963806152, "rewards/accuracy_reward/mean": 3.59375, "rewards/accuracy_reward/std": 2.854785919189453, "rewards/ngram_similarity_reward/mean": 0.6626464128494263, "rewards/ngram_similarity_reward/std": 0.2611728310585022, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1564.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 435.828125, "completions/mean_terminated_length": 417.920654296875, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.5728350861490267, "frac_reward_zero_std": 0.0, "grad_norm": 0.20437754690647125, "learning_rate": 4.348623402389735e-06, "loss": 0.0418, "num_tokens": 203834909.0, "reward": 2.4070346355438232, "reward_std": 0.9201688766479492, "rewards/accuracy_reward/mean": 1.984375, "rewards/accuracy_reward/std": 3.03415584564209, "rewards/ngram_similarity_reward/mean": 0.42265960574150085, "rewards/ngram_similarity_reward/std": 0.29591092467308044, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 480.421875, "completions/mean_terminated_length": 480.421875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.5732826135600806, "frac_reward_zero_std": 0.25, "grad_norm": 0.0753888338804245, "learning_rate": 4.347451474699777e-06, "loss": -0.0334, "num_tokens": 203976904.0, "reward": 2.8891162872314453, "reward_std": 0.1184445470571518, "rewards/accuracy_reward/mean": 2.484375, "rewards/accuracy_reward/std": 3.0419929027557373, "rewards/ngram_similarity_reward/mean": 0.40474116802215576, "rewards/ngram_similarity_reward/std": 0.23125404119491577, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 797.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 498.015625, "completions/mean_terminated_length": 498.015625, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.5737301409711345, "frac_reward_zero_std": 0.0, "grad_norm": 0.07478698343038559, "learning_rate": 4.346278672494504e-06, "loss": 0.0139, "num_tokens": 204172025.0, "reward": 2.332071542739868, "reward_std": 1.2610900402069092, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 2.886364698410034, "rewards/ngram_similarity_reward/mean": 0.7851964235305786, "rewards/ngram_similarity_reward/std": 0.16327303647994995, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 459.890625, "completions/mean_terminated_length": 459.890625, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.5741776683821884, "frac_reward_zero_std": 0.0, "grad_norm": 0.08617988973855972, "learning_rate": 4.345104996415955e-06, "loss": 0.0302, "num_tokens": 204340898.0, "reward": 4.557013511657715, "reward_std": 0.5276709794998169, "rewards/accuracy_reward/mean": 3.875, "rewards/accuracy_reward/std": 2.7284510135650635, "rewards/ngram_similarity_reward/mean": 0.682013750076294, "rewards/ngram_similarity_reward/std": 0.2925887703895569, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 396.453125, "completions/mean_terminated_length": 396.453125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.5746251957932423, "frac_reward_zero_std": 0.0, "grad_norm": 0.08058377355337143, "learning_rate": 4.343930447106656e-06, "loss": 0.0124, "num_tokens": 204519631.0, "reward": 1.8502486944198608, "reward_std": 0.8469141721725464, "rewards/accuracy_reward/mean": 1.1875, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.6627487540245056, "rewards/ngram_similarity_reward/std": 0.1869189441204071, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 407.5, "completions/mean_terminated_length": 407.5, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.5750727232042963, "frac_reward_zero_std": 0.0, "grad_norm": 0.10437867790460587, "learning_rate": 4.342755025209604e-06, "loss": 0.0234, "num_tokens": 204655599.0, "reward": 5.386307239532471, "reward_std": 2.221829652786255, "rewards/accuracy_reward/mean": 4.5625, "rewards/accuracy_reward/std": 2.195775270462036, "rewards/ngram_similarity_reward/mean": 0.8238072991371155, "rewards/ngram_similarity_reward/std": 0.26526692509651184, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 564.3125, "completions/mean_terminated_length": 564.3125, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.5755202506153502, "frac_reward_zero_std": 0.25, "grad_norm": 0.066917784512043, "learning_rate": 4.34157873136828e-06, "loss": 0.0032, "num_tokens": 204823587.0, "reward": 6.086606979370117, "reward_std": 0.7912037372589111, "rewards/accuracy_reward/mean": 5.125, "rewards/accuracy_reward/std": 1.4638501405715942, "rewards/ngram_similarity_reward/mean": 0.9616069793701172, "rewards/ngram_similarity_reward/std": 0.18773657083511353, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 481.75, "completions/mean_terminated_length": 481.75, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.5759677780264041, "frac_reward_zero_std": 0.0, "grad_norm": 0.08909545838832855, "learning_rate": 4.340401566226636e-06, "loss": -0.0426, "num_tokens": 204978611.0, "reward": 4.891693115234375, "reward_std": 0.5363629460334778, "rewards/accuracy_reward/mean": 4.09375, "rewards/accuracy_reward/std": 2.5617377758026123, "rewards/ngram_similarity_reward/mean": 0.797943115234375, "rewards/ngram_similarity_reward/std": 0.23069414496421814, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 521.484375, "completions/mean_terminated_length": 521.484375, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.576415305437458, "frac_reward_zero_std": 0.0, "grad_norm": 0.08944880217313766, "learning_rate": 4.339223530429107e-06, "loss": 0.0096, "num_tokens": 205223346.0, "reward": 4.646847248077393, "reward_std": 0.12239819020032883, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.646847128868103, "rewards/ngram_similarity_reward/std": 0.3760032057762146, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 693.90625, "completions/mean_terminated_length": 500.46429443359375, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.5768628328485119, "frac_reward_zero_std": 0.25, "grad_norm": 0.12657584249973297, "learning_rate": 4.338044624620599e-06, "loss": -0.0352, "num_tokens": 205432556.0, "reward": 1.4632389545440674, "reward_std": 1.4085140228271484, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 2.455153465270996, "rewards/ngram_similarity_reward/mean": 0.7757389545440674, "rewards/ngram_similarity_reward/std": 0.2819943130016327, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 449.234375, "completions/mean_terminated_length": 397.6612854003906, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.577310360259566, "frac_reward_zero_std": 0.0, "grad_norm": 0.11718279868364334, "learning_rate": 4.336864849446499e-06, "loss": 0.0201, "num_tokens": 205583355.0, "reward": 3.979405641555786, "reward_std": 1.3052562475204468, "rewards/accuracy_reward/mean": 3.34375, "rewards/accuracy_reward/std": 2.9016621112823486, "rewards/ngram_similarity_reward/mean": 0.6356555223464966, "rewards/ngram_similarity_reward/std": 0.35236215591430664, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 575.4375, "completions/mean_terminated_length": 527.9354858398438, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.5777578876706199, "frac_reward_zero_std": 0.0, "grad_norm": 0.0954417958855629, "learning_rate": 4.335684205552666e-06, "loss": 0.0487, "num_tokens": 205836455.0, "reward": 0.9510928988456726, "reward_std": 1.331400752067566, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 2.3426685333251953, "rewards/ngram_similarity_reward/mean": 0.5135928988456726, "rewards/ngram_similarity_reward/std": 0.3271593749523163, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 344.09375, "completions/mean_terminated_length": 344.09375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.5782054150816738, "frac_reward_zero_std": 0.0, "grad_norm": 0.11226638406515121, "learning_rate": 4.334502693585438e-06, "loss": -0.011, "num_tokens": 206006973.0, "reward": 3.1843786239624023, "reward_std": 0.1548232138156891, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6843785643577576, "rewards/ngram_similarity_reward/std": 0.288402259349823, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 375.921875, "completions/mean_terminated_length": 375.921875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.5786529424927277, "frac_reward_zero_std": 0.25, "grad_norm": 0.07860208302736282, "learning_rate": 4.333320314191625e-06, "loss": 0.0095, "num_tokens": 206148088.0, "reward": 4.348507881164551, "reward_std": 0.7970369458198547, "rewards/accuracy_reward/mean": 3.53125, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.8172580599784851, "rewards/ngram_similarity_reward/std": 0.1973295956850052, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 828.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 436.125, "completions/mean_terminated_length": 436.125, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.5791004699037816, "frac_reward_zero_std": 0.0, "grad_norm": 0.09916546195745468, "learning_rate": 4.332137068018517e-06, "loss": -0.0098, "num_tokens": 206294736.0, "reward": 5.092933177947998, "reward_std": 1.3503336906433105, "rewards/accuracy_reward/mean": 4.28125, "rewards/accuracy_reward/std": 2.4330317974090576, "rewards/ngram_similarity_reward/mean": 0.8116834163665771, "rewards/ngram_similarity_reward/std": 0.3395226299762726, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 874.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 548.0, "completions/mean_terminated_length": 548.0, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.5795479973148355, "frac_reward_zero_std": 0.0, "grad_norm": 0.07296004146337509, "learning_rate": 4.330952955713871e-06, "loss": -0.0303, "num_tokens": 206444656.0, "reward": 6.197272300720215, "reward_std": 0.13062140345573425, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.6972724795341492, "rewards/ngram_similarity_reward/std": 0.31922462582588196, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 397.390625, "completions/mean_terminated_length": 397.390625, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.5799955247258894, "frac_reward_zero_std": 0.0, "grad_norm": 0.09220290929079056, "learning_rate": 4.329767977925926e-06, "loss": 0.0017, "num_tokens": 206595369.0, "reward": 5.39586067199707, "reward_std": 1.393269658088684, "rewards/accuracy_reward/mean": 4.75, "rewards/accuracy_reward/std": 2.0, "rewards/ngram_similarity_reward/mean": 0.645860493183136, "rewards/ngram_similarity_reward/std": 0.1155499666929245, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 432.25, "completions/mean_terminated_length": 432.25, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.5804430521369434, "frac_reward_zero_std": 0.0, "grad_norm": 0.08327258378267288, "learning_rate": 4.328582135303387e-06, "loss": -0.0105, "num_tokens": 206748617.0, "reward": 3.1711244583129883, "reward_std": 0.4597923755645752, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.5773743391036987, "rewards/ngram_similarity_reward/std": 0.30644235014915466, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 902.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 500.75, "completions/mean_terminated_length": 500.75, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.5808905795479973, "frac_reward_zero_std": 0.0, "grad_norm": 0.07713520526885986, "learning_rate": 4.327395428495441e-06, "loss": -0.0364, "num_tokens": 206921913.0, "reward": 1.6427359580993652, "reward_std": 0.09815908223390579, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6427358984947205, "rewards/ngram_similarity_reward/std": 0.23064683377742767, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 510.515625, "completions/mean_terminated_length": 510.515625, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.5813381069590512, "frac_reward_zero_std": 0.0, "grad_norm": 0.06505846977233887, "learning_rate": 4.326207858151739e-06, "loss": 0.0012, "num_tokens": 207084122.0, "reward": 4.891460418701172, "reward_std": 0.6402475237846375, "rewards/accuracy_reward/mean": 4.1875, "rewards/accuracy_reward/std": 2.5, "rewards/ngram_similarity_reward/mean": 0.703960657119751, "rewards/ngram_similarity_reward/std": 0.29561981558799744, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 479.375, "completions/mean_terminated_length": 454.4762268066406, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.5817856343701052, "frac_reward_zero_std": 0.25, "grad_norm": 0.09155668318271637, "learning_rate": 4.325019424922412e-06, "loss": -0.0135, "num_tokens": 207233314.0, "reward": 3.167853355407715, "reward_std": 0.10784564912319183, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6678533554077148, "rewards/ngram_similarity_reward/std": 0.25225144624710083, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 443.890625, "completions/mean_terminated_length": 443.890625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.5822331617811591, "frac_reward_zero_std": 0.0, "grad_norm": 0.0908183827996254, "learning_rate": 4.323830129458061e-06, "loss": 0.024, "num_tokens": 207518395.0, "reward": 4.62649393081665, "reward_std": 2.101778030395508, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.9077439308166504, "rewards/ngram_similarity_reward/std": 0.28142043948173523, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 446.359375, "completions/mean_terminated_length": 446.359375, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.582680689192213, "frac_reward_zero_std": 0.0, "grad_norm": 0.09624571353197098, "learning_rate": 4.322639972409759e-06, "loss": 0.0042, "num_tokens": 207667986.0, "reward": 4.585222244262695, "reward_std": 0.47039100527763367, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.6789719462394714, "rewards/ngram_similarity_reward/std": 0.23788464069366455, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 324.828125, "completions/mean_terminated_length": 324.828125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.583128216603267, "frac_reward_zero_std": 0.25, "grad_norm": 0.10973282158374786, "learning_rate": 4.321448954429048e-06, "loss": -0.0066, "num_tokens": 207798759.0, "reward": 4.582745552062988, "reward_std": 0.039319053292274475, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5827455520629883, "rewards/ngram_similarity_reward/std": 0.1293172389268875, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 522.328125, "completions/mean_terminated_length": 522.328125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.5835757440143209, "frac_reward_zero_std": 0.0, "grad_norm": 0.09233658760786057, "learning_rate": 4.320257076167945e-06, "loss": -0.0145, "num_tokens": 207954252.0, "reward": 2.652021646499634, "reward_std": 2.0199222564697266, "rewards/accuracy_reward/mean": 2.1875, "rewards/accuracy_reward/std": 3.043989896774292, "rewards/ngram_similarity_reward/mean": 0.4645217955112457, "rewards/ngram_similarity_reward/std": 0.389131098985672, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 924.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 506.390625, "completions/mean_terminated_length": 506.390625, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.5840232714253748, "frac_reward_zero_std": 0.0, "grad_norm": 0.0873550996184349, "learning_rate": 4.319064338278937e-06, "loss": 0.0418, "num_tokens": 208083957.0, "reward": 1.8698256015777588, "reward_std": 2.0062408447265625, "rewards/accuracy_reward/mean": 1.1875, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.6823254823684692, "rewards/ngram_similarity_reward/std": 0.21194741129875183, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 403.28125, "completions/mean_terminated_length": 403.28125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.5844707988364287, "frac_reward_zero_std": 0.0, "grad_norm": 0.09757047891616821, "learning_rate": 4.317870741414981e-06, "loss": 0.0312, "num_tokens": 208239943.0, "reward": 2.9031214714050293, "reward_std": 0.4686318039894104, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.4968714416027069, "rewards/ngram_similarity_reward/std": 0.21332430839538574, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1324.0, "completions/max_terminated_length": 1324.0, "completions/mean_length": 412.046875, "completions/mean_terminated_length": 412.046875, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.5849183262474826, "frac_reward_zero_std": 0.0, "grad_norm": 0.12136556953191757, "learning_rate": 4.3166762862295055e-06, "loss": 0.0032, "num_tokens": 208373370.0, "reward": 3.7895092964172363, "reward_std": 0.8431544303894043, "rewards/accuracy_reward/mean": 3.25, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.5395092368125916, "rewards/ngram_similarity_reward/std": 0.2623805105686188, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 369.65625, "completions/mean_terminated_length": 369.65625, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.5853658536585366, "frac_reward_zero_std": 0.0, "grad_norm": 0.11417142301797867, "learning_rate": 4.315480973376406e-06, "loss": 0.0012, "num_tokens": 208533684.0, "reward": 3.0894782543182373, "reward_std": 1.7220485210418701, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.5894783735275269, "rewards/ngram_similarity_reward/std": 0.32322585582733154, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 423.84375, "completions/mean_terminated_length": 423.84375, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.5858133810695905, "frac_reward_zero_std": 0.0, "grad_norm": 0.0967361107468605, "learning_rate": 4.314284803510051e-06, "loss": -0.0074, "num_tokens": 208704058.0, "reward": 6.2280683517456055, "reward_std": 0.5649706125259399, "rewards/accuracy_reward/mean": 5.390625, "rewards/accuracy_reward/std": 0.8750000596046448, "rewards/ngram_similarity_reward/mean": 0.837443470954895, "rewards/ngram_similarity_reward/std": 0.19172249734401703, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 474.171875, "completions/mean_terminated_length": 474.171875, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.5862609084806445, "frac_reward_zero_std": 0.0, "grad_norm": 0.14397962391376495, "learning_rate": 4.313087777285275e-06, "loss": 0.0034, "num_tokens": 208902981.0, "reward": 2.9218571186065674, "reward_std": 0.44614848494529724, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.5156069993972778, "rewards/ngram_similarity_reward/std": 0.34570419788360596, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 382.453125, "completions/mean_terminated_length": 382.453125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.5867084358916984, "frac_reward_zero_std": 0.0, "grad_norm": 0.1117420345544815, "learning_rate": 4.311889895357385e-06, "loss": -0.0281, "num_tokens": 209042322.0, "reward": 3.054821491241455, "reward_std": 0.15742333233356476, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.5548213720321655, "rewards/ngram_similarity_reward/std": 0.35419711470603943, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 614.03125, "completions/mean_terminated_length": 614.03125, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.5871559633027523, "frac_reward_zero_std": 0.0, "grad_norm": 0.13993960618972778, "learning_rate": 4.310691158382153e-06, "loss": -0.0063, "num_tokens": 209275700.0, "reward": 3.5995821952819824, "reward_std": 0.7862210273742676, "rewards/accuracy_reward/mean": 2.84375, "rewards/accuracy_reward/std": 3.0405657291412354, "rewards/ngram_similarity_reward/mean": 0.7558322548866272, "rewards/ngram_similarity_reward/std": 0.3166537582874298, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 452.140625, "completions/mean_terminated_length": 452.140625, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.5876034907138062, "frac_reward_zero_std": 0.25, "grad_norm": 0.08446729928255081, "learning_rate": 4.3094915670158205e-06, "loss": 0.003, "num_tokens": 209448637.0, "reward": 3.330838203430176, "reward_std": 0.4240117073059082, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.7370883226394653, "rewards/ngram_similarity_reward/std": 0.304819792509079, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 432.84375, "completions/mean_terminated_length": 432.84375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.5880510181248602, "frac_reward_zero_std": 0.5, "grad_norm": 0.04957375302910805, "learning_rate": 4.308291121915097e-06, "loss": -0.0012, "num_tokens": 209582851.0, "reward": 4.887948989868164, "reward_std": 0.016795890405774117, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.8879486918449402, "rewards/ngram_similarity_reward/std": 0.27080172300338745, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1070.0, "completions/max_terminated_length": 1070.0, "completions/mean_length": 617.921875, "completions/mean_terminated_length": 617.921875, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.5884985455359141, "frac_reward_zero_std": 0.0, "grad_norm": 0.07341840118169785, "learning_rate": 4.307089823737158e-06, "loss": -0.0015, "num_tokens": 209758686.0, "reward": 1.6291905641555786, "reward_std": 0.14190228283405304, "rewards/accuracy_reward/mean": 0.984375, "rewards/accuracy_reward/std": 2.6306629180908203, "rewards/ngram_similarity_reward/mean": 0.6448154449462891, "rewards/ngram_similarity_reward/std": 0.18305432796478271, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 477.125, "completions/mean_terminated_length": 477.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.588946072946968, "frac_reward_zero_std": 0.0, "grad_norm": 0.10296616703271866, "learning_rate": 4.30588767313965e-06, "loss": -0.0146, "num_tokens": 209898118.0, "reward": 4.714685440063477, "reward_std": 0.08106479048728943, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.714685320854187, "rewards/ngram_similarity_reward/std": 0.306520938873291, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 487.75, "completions/mean_terminated_length": 437.4193420410156, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.5893936003580219, "frac_reward_zero_std": 0.0, "grad_norm": 0.08141181617975235, "learning_rate": 4.304684670780679e-06, "loss": -0.0747, "num_tokens": 210028694.0, "reward": 4.732090950012207, "reward_std": 1.1350460052490234, "rewards/accuracy_reward/mean": 4.28125, "rewards/accuracy_reward/std": 2.4330317974090576, "rewards/ngram_similarity_reward/mean": 0.4508410394191742, "rewards/ngram_similarity_reward/std": 0.41943320631980896, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1030.0, "completions/max_terminated_length": 1030.0, "completions/mean_length": 468.109375, "completions/mean_terminated_length": 468.109375, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.5898411277690758, "frac_reward_zero_std": 0.0, "grad_norm": 0.08218703418970108, "learning_rate": 4.303480817318824e-06, "loss": -0.0047, "num_tokens": 210170541.0, "reward": 3.772031784057617, "reward_std": 1.4102468490600586, "rewards/accuracy_reward/mean": 3.15625, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.6157817244529724, "rewards/ngram_similarity_reward/std": 0.36207425594329834, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 844.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 448.953125, "completions/mean_terminated_length": 448.953125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.5902886551801297, "frac_reward_zero_std": 0.25, "grad_norm": 0.0812632218003273, "learning_rate": 4.302276113413127e-06, "loss": 0.0048, "num_tokens": 210345034.0, "reward": 3.6715292930603027, "reward_std": 0.7349720597267151, "rewards/accuracy_reward/mean": 2.875, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.7965295910835266, "rewards/ngram_similarity_reward/std": 0.3704771399497986, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 413.8125, "completions/mean_terminated_length": 413.8125, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.5907361825911837, "frac_reward_zero_std": 0.0, "grad_norm": 0.08711085468530655, "learning_rate": 4.301070559723097e-06, "loss": 0.0093, "num_tokens": 210501582.0, "reward": 4.417667388916016, "reward_std": 0.8771721720695496, "rewards/accuracy_reward/mean": 3.53125, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.8864171504974365, "rewards/ngram_similarity_reward/std": 0.3051711618900299, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 428.40625, "completions/mean_terminated_length": 428.40625, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.5911837100022377, "frac_reward_zero_std": 0.25, "grad_norm": 0.09898845851421356, "learning_rate": 4.2998641569087055e-06, "loss": -0.0295, "num_tokens": 210654440.0, "reward": 3.3565735816955566, "reward_std": 0.10630813241004944, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.8565737009048462, "rewards/ngram_similarity_reward/std": 0.16159357130527496, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 419.75, "completions/mean_terminated_length": 419.75, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.5916312374132916, "frac_reward_zero_std": 0.0, "grad_norm": 0.09474169462919235, "learning_rate": 4.2986569056303914e-06, "loss": 0.0152, "num_tokens": 210808600.0, "reward": 1.5890698432922363, "reward_std": 0.4605858623981476, "rewards/accuracy_reward/mean": 0.90625, "rewards/accuracy_reward/std": 2.5617377758026123, "rewards/ngram_similarity_reward/mean": 0.6828198432922363, "rewards/ngram_similarity_reward/std": 0.1803705096244812, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 448.8125, "completions/mean_terminated_length": 448.8125, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.5920787648243455, "frac_reward_zero_std": 0.0, "grad_norm": 0.10569164901971817, "learning_rate": 4.297448806549057e-06, "loss": 0.0042, "num_tokens": 210951836.0, "reward": 4.156012058258057, "reward_std": 0.843189537525177, "rewards/accuracy_reward/mean": 3.53125, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.6247619986534119, "rewards/ngram_similarity_reward/std": 0.3260897397994995, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 510.09375, "completions/mean_terminated_length": 510.09375, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.5925262922353994, "frac_reward_zero_std": 0.0, "grad_norm": 0.06643950194120407, "learning_rate": 4.2962398603260685e-06, "loss": 0.0263, "num_tokens": 211098034.0, "reward": 6.3764495849609375, "reward_std": 0.10241048783063889, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.8764500021934509, "rewards/ngram_similarity_reward/std": 0.1761569082736969, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 478.046875, "completions/mean_terminated_length": 478.046875, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.5929738196464533, "frac_reward_zero_std": 0.0, "grad_norm": 0.08487922698259354, "learning_rate": 4.295030067623258e-06, "loss": -0.0116, "num_tokens": 211261285.0, "reward": 4.322232723236084, "reward_std": 1.0263280868530273, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.6972328424453735, "rewards/ngram_similarity_reward/std": 0.24187816679477692, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 781.0, "completions/max_terminated_length": 781.0, "completions/mean_length": 502.90625, "completions/mean_terminated_length": 502.90625, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.5934213470575073, "frac_reward_zero_std": 0.25, "grad_norm": 0.07794208824634552, "learning_rate": 4.293819429102917e-06, "loss": 0.0161, "num_tokens": 211430095.0, "reward": 4.888006210327148, "reward_std": 0.5880323648452759, "rewards/accuracy_reward/mean": 4.1875, "rewards/accuracy_reward/std": 2.5, "rewards/ngram_similarity_reward/mean": 0.7005060911178589, "rewards/ngram_similarity_reward/std": 0.26562538743019104, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 556.421875, "completions/mean_terminated_length": 508.3064270019531, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.5938688744685612, "frac_reward_zero_std": 0.0, "grad_norm": 0.09490156918764114, "learning_rate": 4.2926079454278055e-06, "loss": -0.0211, "num_tokens": 211566714.0, "reward": 2.9497318267822266, "reward_std": 2.1695144176483154, "rewards/accuracy_reward/mean": 2.3125, "rewards/accuracy_reward/std": 3.126309394836426, "rewards/ngram_similarity_reward/mean": 0.6372320652008057, "rewards/ngram_similarity_reward/std": 0.3465478718280792, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 439.9375, "completions/mean_terminated_length": 439.9375, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.5943164018796151, "frac_reward_zero_std": 0.25, "grad_norm": 0.09296488761901855, "learning_rate": 4.29139561726114e-06, "loss": -0.0155, "num_tokens": 211746230.0, "reward": 0.4452609121799469, "reward_std": 0.621183454990387, "rewards/accuracy_reward/mean": -0.21875, "rewards/accuracy_reward/std": 1.2782522439956665, "rewards/ngram_similarity_reward/mean": 0.6640108823776245, "rewards/ngram_similarity_reward/std": 0.26765748858451843, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1079.0, "completions/max_terminated_length": 1079.0, "completions/mean_length": 563.703125, "completions/mean_terminated_length": 563.703125, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.594763929290669, "frac_reward_zero_std": 0.0, "grad_norm": 0.10059570521116257, "learning_rate": 4.2901824452666025e-06, "loss": -0.0006, "num_tokens": 211888707.0, "reward": 3.1512491703033447, "reward_std": 0.14171501994132996, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6512490510940552, "rewards/ngram_similarity_reward/std": 0.23141607642173767, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 463.609375, "completions/mean_terminated_length": 463.609375, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.5952114567017229, "frac_reward_zero_std": 0.0, "grad_norm": 0.09031175076961517, "learning_rate": 4.288968430108339e-06, "loss": 0.0001, "num_tokens": 212059626.0, "reward": 4.033574104309082, "reward_std": 0.9780128002166748, "rewards/accuracy_reward/mean": 3.46875, "rewards/accuracy_reward/std": 3.157097101211548, "rewards/ngram_similarity_reward/mean": 0.5648245811462402, "rewards/ngram_similarity_reward/std": 0.3023799657821655, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 517.96875, "completions/mean_terminated_length": 517.96875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.595658984112777, "frac_reward_zero_std": 0.0, "grad_norm": 0.07907724380493164, "learning_rate": 4.287753572450953e-06, "loss": 0.0591, "num_tokens": 212239240.0, "reward": 4.20205020904541, "reward_std": 0.9947676658630371, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.48330050706863403, "rewards/ngram_similarity_reward/std": 0.2970724403858185, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1128.0, "completions/mean_length": 565.203125, "completions/mean_terminated_length": 517.3709716796875, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.5961065115238309, "frac_reward_zero_std": 0.0, "grad_norm": 0.11148945987224579, "learning_rate": 4.286537872959513e-06, "loss": -0.0255, "num_tokens": 212376005.0, "reward": 5.483572959899902, "reward_std": 0.9000318050384521, "rewards/accuracy_reward/mean": 4.828125, "rewards/accuracy_reward/std": 1.9359153509140015, "rewards/ngram_similarity_reward/mean": 0.6554478406906128, "rewards/ngram_similarity_reward/std": 0.3480670750141144, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 446.21875, "completions/mean_terminated_length": 446.21875, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.5965540389348848, "frac_reward_zero_std": 0.0, "grad_norm": 0.09494276344776154, "learning_rate": 4.285321332299544e-06, "loss": -0.0225, "num_tokens": 212503747.0, "reward": 3.0399563312530518, "reward_std": 1.0980381965637207, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.5399561524391174, "rewards/ngram_similarity_reward/std": 0.1877039521932602, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 402.125, "completions/mean_terminated_length": 402.125, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.5970015663459387, "frac_reward_zero_std": 0.0, "grad_norm": 0.10471401363611221, "learning_rate": 4.284103951137036e-06, "loss": -0.0133, "num_tokens": 212656683.0, "reward": 6.392208099365234, "reward_std": 0.1683715581893921, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.892208456993103, "rewards/ngram_similarity_reward/std": 0.23434390127658844, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 474.921875, "completions/mean_terminated_length": 449.952392578125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.5974490937569926, "frac_reward_zero_std": 0.0, "grad_norm": 0.09312959760427475, "learning_rate": 4.2828857301384355e-06, "loss": 0.0336, "num_tokens": 212838998.0, "reward": 3.833740711212158, "reward_std": 2.504103183746338, "rewards/accuracy_reward/mean": 3.25, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.5837405920028687, "rewards/ngram_similarity_reward/std": 0.33722445368766785, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 403.84375, "completions/mean_terminated_length": 403.84375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.5978966211680465, "frac_reward_zero_std": 0.0, "grad_norm": 0.10815006494522095, "learning_rate": 4.281666669970652e-06, "loss": -0.0152, "num_tokens": 213023052.0, "reward": 4.467321395874023, "reward_std": 0.10678227245807648, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.46732163429260254, "rewards/ngram_similarity_reward/std": 0.24562959372997284, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 484.578125, "completions/mean_terminated_length": 434.1451416015625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.5983441485791005, "frac_reward_zero_std": 0.0, "grad_norm": 0.10885749757289886, "learning_rate": 4.280446771301051e-06, "loss": -0.0005, "num_tokens": 213146865.0, "reward": 4.570272445678711, "reward_std": 0.7078070640563965, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.85152268409729, "rewards/ngram_similarity_reward/std": 0.2377517968416214, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 909.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 495.953125, "completions/mean_terminated_length": 495.953125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.5987916759901544, "frac_reward_zero_std": 0.25, "grad_norm": 0.06886874884366989, "learning_rate": 4.279226034797459e-06, "loss": -0.0594, "num_tokens": 213301934.0, "reward": 3.3245933055877686, "reward_std": 0.5592687726020813, "rewards/accuracy_reward/mean": 2.6875, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.6370932459831238, "rewards/ngram_similarity_reward/std": 0.2646704316139221, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 712.0, "completions/max_terminated_length": 712.0, "completions/mean_length": 430.671875, "completions/mean_terminated_length": 430.671875, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.5992392034012083, "frac_reward_zero_std": 0.25, "grad_norm": 0.07862003147602081, "learning_rate": 4.278004461128163e-06, "loss": -0.0119, "num_tokens": 213425081.0, "reward": 4.431199550628662, "reward_std": 0.7673516869544983, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.8061997890472412, "rewards/ngram_similarity_reward/std": 0.307853102684021, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 459.03125, "completions/mean_terminated_length": 459.03125, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.5996867308122622, "frac_reward_zero_std": 0.0, "grad_norm": 0.07831700146198273, "learning_rate": 4.276782050961905e-06, "loss": -0.0355, "num_tokens": 213607291.0, "reward": 6.349282741546631, "reward_std": 0.14734327793121338, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.8492826223373413, "rewards/ngram_similarity_reward/std": 0.25453436374664307, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 403.359375, "completions/mean_terminated_length": 403.359375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.6001342582233162, "frac_reward_zero_std": 0.0, "grad_norm": 0.08558699488639832, "learning_rate": 4.2755588049678845e-06, "loss": -0.0252, "num_tokens": 213735954.0, "reward": 4.715620994567871, "reward_std": 0.10598556697368622, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7156206369400024, "rewards/ngram_similarity_reward/std": 0.1637507677078247, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1031.0, "completions/max_terminated_length": 1031.0, "completions/mean_length": 562.46875, "completions/mean_terminated_length": 562.46875, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.6005817856343701, "frac_reward_zero_std": 0.0, "grad_norm": 0.06608100980520248, "learning_rate": 4.274334723815763e-06, "loss": 0.005, "num_tokens": 213871072.0, "reward": 5.264054775238037, "reward_std": 1.5914874076843262, "rewards/accuracy_reward/mean": 4.46875, "rewards/accuracy_reward/std": 2.2815253734588623, "rewards/ngram_similarity_reward/mean": 0.7953047156333923, "rewards/ngram_similarity_reward/std": 0.29335033893585205, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 523.109375, "completions/mean_terminated_length": 523.109375, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.6010293130454241, "frac_reward_zero_std": 0.0, "grad_norm": 0.07100894302129745, "learning_rate": 4.273109808175655e-06, "loss": 0.008, "num_tokens": 214082583.0, "reward": 1.8311067819595337, "reward_std": 0.6609352231025696, "rewards/accuracy_reward/mean": 1.1875, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.6436067819595337, "rewards/ngram_similarity_reward/std": 0.268633633852005, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 932.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 576.75, "completions/mean_terminated_length": 576.75, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.601476840456478, "frac_reward_zero_std": 0.0, "grad_norm": 0.09211838990449905, "learning_rate": 4.271884058718133e-06, "loss": 0.0255, "num_tokens": 214230647.0, "reward": 2.031980276107788, "reward_std": 1.8648428916931152, "rewards/accuracy_reward/mean": 1.53125, "rewards/accuracy_reward/std": 3.0130341053009033, "rewards/ngram_similarity_reward/mean": 0.5007302761077881, "rewards/ngram_similarity_reward/std": 0.1377912163734436, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 573.265625, "completions/mean_terminated_length": 549.857177734375, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.6019243678675319, "frac_reward_zero_std": 0.0, "grad_norm": 0.1130412220954895, "learning_rate": 4.270657476114227e-06, "loss": 0.0084, "num_tokens": 214439080.0, "reward": 3.2013514041900635, "reward_std": 0.5560939311981201, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.6076014041900635, "rewards/ngram_similarity_reward/std": 0.3042972981929779, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 434.265625, "completions/mean_terminated_length": 434.265625, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.6023718952785858, "frac_reward_zero_std": 0.0, "grad_norm": 0.07032214850187302, "learning_rate": 4.269430061035423e-06, "loss": -0.0116, "num_tokens": 214577033.0, "reward": 6.2450032234191895, "reward_std": 0.14689236879348755, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.7450035810470581, "rewards/ngram_similarity_reward/std": 0.4236237108707428, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 846.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 494.703125, "completions/mean_terminated_length": 494.703125, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.6028194226896397, "frac_reward_zero_std": 0.0, "grad_norm": 0.07221122086048126, "learning_rate": 4.26820181415366e-06, "loss": 0.0114, "num_tokens": 214700678.0, "reward": 3.9844794273376465, "reward_std": 1.3448883295059204, "rewards/accuracy_reward/mean": 3.34375, "rewards/accuracy_reward/std": 2.9016621112823486, "rewards/ngram_similarity_reward/mean": 0.640729546546936, "rewards/ngram_similarity_reward/std": 0.4196074306964874, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 438.375, "completions/mean_terminated_length": 438.375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.6032669501006936, "frac_reward_zero_std": 0.25, "grad_norm": 0.07972569018602371, "learning_rate": 4.266972736141337e-06, "loss": -0.0108, "num_tokens": 214825182.0, "reward": 3.4212043285369873, "reward_std": 0.5146507620811462, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.8274544477462769, "rewards/ngram_similarity_reward/std": 0.31825268268585205, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 434.1875, "completions/mean_terminated_length": 434.1875, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.6037144775117476, "frac_reward_zero_std": 0.0, "grad_norm": 0.10139903426170349, "learning_rate": 4.2657428276713025e-06, "loss": -0.0588, "num_tokens": 214964810.0, "reward": 4.542989730834961, "reward_std": 1.9339182376861572, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.9179896116256714, "rewards/ngram_similarity_reward/std": 0.23786461353302002, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 466.921875, "completions/mean_terminated_length": 466.921875, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.6041620049228015, "frac_reward_zero_std": 0.25, "grad_norm": 0.08900474011898041, "learning_rate": 4.264512089416864e-06, "loss": 0.0183, "num_tokens": 215158261.0, "reward": 3.9474077224731445, "reward_std": 0.9407780170440674, "rewards/accuracy_reward/mean": 3.140625, "rewards/accuracy_reward/std": 2.9727182388305664, "rewards/ngram_similarity_reward/mean": 0.8067828416824341, "rewards/ngram_similarity_reward/std": 0.3917195498943329, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 496.171875, "completions/mean_terminated_length": 496.171875, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.6046095323338555, "frac_reward_zero_std": 0.0, "grad_norm": 0.08319786190986633, "learning_rate": 4.263280522051784e-06, "loss": 0.0029, "num_tokens": 215324656.0, "reward": 4.686700344085693, "reward_std": 0.5952214598655701, "rewards/accuracy_reward/mean": 4.171875, "rewards/accuracy_reward/std": 2.5326733589172363, "rewards/ngram_similarity_reward/mean": 0.5148252248764038, "rewards/ngram_similarity_reward/std": 0.3073686361312866, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 459.1875, "completions/mean_terminated_length": 459.1875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.6050570597449094, "frac_reward_zero_std": 0.25, "grad_norm": 0.0919172465801239, "learning_rate": 4.262048126250274e-06, "loss": -0.0199, "num_tokens": 215483900.0, "reward": 5.952759742736816, "reward_std": 0.4562895894050598, "rewards/accuracy_reward/mean": 5.40625, "rewards/accuracy_reward/std": 0.7500000596046448, "rewards/ngram_similarity_reward/mean": 0.5465095639228821, "rewards/ngram_similarity_reward/std": 0.1557082086801529, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 386.296875, "completions/mean_terminated_length": 386.296875, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.6055045871559633, "frac_reward_zero_std": 0.0, "grad_norm": 0.10624445974826813, "learning_rate": 4.260814902687001e-06, "loss": 0.0154, "num_tokens": 215644111.0, "reward": 4.7336578369140625, "reward_std": 0.19047802686691284, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7336578369140625, "rewards/ngram_similarity_reward/std": 0.38012969493865967, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 453.0625, "completions/mean_terminated_length": 453.0625, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.6059521145670173, "frac_reward_zero_std": 0.25, "grad_norm": 0.09647325426340103, "learning_rate": 4.259580852037089e-06, "loss": 0.0303, "num_tokens": 215809331.0, "reward": 3.023860216140747, "reward_std": 0.9079327583312988, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.6176100969314575, "rewards/ngram_similarity_reward/std": 0.2777595520019531, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 421.6875, "completions/mean_terminated_length": 421.6875, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.6063996419780712, "frac_reward_zero_std": 0.0, "grad_norm": 0.0940016433596611, "learning_rate": 4.258345974976111e-06, "loss": 0.0039, "num_tokens": 215965631.0, "reward": 3.2174174785614014, "reward_std": 0.8066022992134094, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.7174174189567566, "rewards/ngram_similarity_reward/std": 0.2216615527868271, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 398.921875, "completions/mean_terminated_length": 398.921875, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.6068471693891251, "frac_reward_zero_std": 0.0, "grad_norm": 0.09868673235177994, "learning_rate": 4.257110272180091e-06, "loss": -0.0161, "num_tokens": 216138090.0, "reward": 4.321117401123047, "reward_std": 0.7568840980529785, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.6961173415184021, "rewards/ngram_similarity_reward/std": 0.2414599359035492, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 480.140625, "completions/mean_terminated_length": 480.140625, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.607294696800179, "frac_reward_zero_std": 0.0, "grad_norm": 0.08821909129619598, "learning_rate": 4.255873744325509e-06, "loss": -0.0002, "num_tokens": 216324867.0, "reward": 6.164969444274902, "reward_std": 0.12031973153352737, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.6649699211120605, "rewards/ngram_similarity_reward/std": 0.20555303990840912, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 628.125, "completions/mean_terminated_length": 481.2413635253906, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.6077422242112329, "frac_reward_zero_std": 0.0, "grad_norm": 0.14948788285255432, "learning_rate": 4.254636392089293e-06, "loss": -0.1383, "num_tokens": 216485579.0, "reward": 4.539266586303711, "reward_std": 1.2547316551208496, "rewards/accuracy_reward/mean": 3.890625, "rewards/accuracy_reward/std": 2.6998953819274902, "rewards/ngram_similarity_reward/mean": 0.6486419439315796, "rewards/ngram_similarity_reward/std": 0.44756120443344116, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 498.9375, "completions/mean_terminated_length": 498.9375, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.6081897516222868, "frac_reward_zero_std": 0.0, "grad_norm": 0.07231821864843369, "learning_rate": 4.253398216148826e-06, "loss": -0.014, "num_tokens": 216622135.0, "reward": 4.751773834228516, "reward_std": 0.11481022834777832, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7517737150192261, "rewards/ngram_similarity_reward/std": 0.30630967020988464, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 471.796875, "completions/mean_terminated_length": 471.796875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.6086372790333408, "frac_reward_zero_std": 0.0, "grad_norm": 0.09079179167747498, "learning_rate": 4.25215921718194e-06, "loss": -0.0029, "num_tokens": 216787658.0, "reward": 3.768434524536133, "reward_std": 1.1472851037979126, "rewards/accuracy_reward/mean": 3.15625, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.6121848225593567, "rewards/ngram_similarity_reward/std": 0.3595779836177826, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 872.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 477.421875, "completions/mean_terminated_length": 477.421875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.6090848064443947, "frac_reward_zero_std": 0.0, "grad_norm": 0.09562075883150101, "learning_rate": 4.250919395866917e-06, "loss": -0.008, "num_tokens": 216939541.0, "reward": 2.0378975868225098, "reward_std": 1.4186642169952393, "rewards/accuracy_reward/mean": 1.4375, "rewards/accuracy_reward/std": 2.8667497634887695, "rewards/ngram_similarity_reward/mean": 0.6003977060317993, "rewards/ngram_similarity_reward/std": 0.17121119797229767, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 463.984375, "completions/mean_terminated_length": 463.984375, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.6095323338554487, "frac_reward_zero_std": 0.0, "grad_norm": 0.09416069835424423, "learning_rate": 4.249678752882488e-06, "loss": 0.0243, "num_tokens": 217049812.0, "reward": 4.938436031341553, "reward_std": 0.18189571797847748, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.9384360313415527, "rewards/ngram_similarity_reward/std": 0.2724663317203522, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 516.859375, "completions/mean_terminated_length": 516.859375, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.6099798612665026, "frac_reward_zero_std": 0.0, "grad_norm": 0.0826227068901062, "learning_rate": 4.24843728890784e-06, "loss": -0.0194, "num_tokens": 217225803.0, "reward": 3.1972994804382324, "reward_std": 0.837207555770874, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6972993612289429, "rewards/ngram_similarity_reward/std": 0.298595130443573, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 391.28125, "completions/mean_terminated_length": 391.28125, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.6104273886775565, "frac_reward_zero_std": 0.0, "grad_norm": 0.09286059439182281, "learning_rate": 4.247195004622601e-06, "loss": -0.0084, "num_tokens": 217351901.0, "reward": 5.607362747192383, "reward_std": 0.8946105241775513, "rewards/accuracy_reward/mean": 4.9375, "rewards/accuracy_reward/std": 1.7627090215682983, "rewards/ngram_similarity_reward/mean": 0.6698628067970276, "rewards/ngram_similarity_reward/std": 0.2770031988620758, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 418.671875, "completions/mean_terminated_length": 418.671875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.6108749160886104, "frac_reward_zero_std": 0.0, "grad_norm": 0.09607899934053421, "learning_rate": 4.245951900706854e-06, "loss": -0.0446, "num_tokens": 217496344.0, "reward": 2.730788230895996, "reward_std": 1.7564964294433594, "rewards/accuracy_reward/mean": 2.03125, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.6995381712913513, "rewards/ngram_similarity_reward/std": 0.39579570293426514, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 457.09375, "completions/mean_terminated_length": 457.09375, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.6113224434996644, "frac_reward_zero_std": 0.0, "grad_norm": 0.1026829332113266, "learning_rate": 4.244707977841129e-06, "loss": -0.0009, "num_tokens": 217687294.0, "reward": 1.8190507888793945, "reward_std": 0.49981939792633057, "rewards/accuracy_reward/mean": 1.09375, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.7253009080886841, "rewards/ngram_similarity_reward/std": 0.1935221403837204, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 442.453125, "completions/mean_terminated_length": 442.453125, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.6117699709107183, "frac_reward_zero_std": 0.25, "grad_norm": 0.08293341845273972, "learning_rate": 4.243463236706404e-06, "loss": 0.0288, "num_tokens": 217851771.0, "reward": 3.426107883453369, "reward_std": 0.4723675549030304, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.8323577642440796, "rewards/ngram_similarity_reward/std": 0.3235006332397461, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 446.609375, "completions/mean_terminated_length": 446.609375, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.6122174983217722, "frac_reward_zero_std": 0.0, "grad_norm": 0.13072438538074493, "learning_rate": 4.242217677984104e-06, "loss": 0.0042, "num_tokens": 218055330.0, "reward": 1.628633737564087, "reward_std": 1.9026405811309814, "rewards/accuracy_reward/mean": 1.078125, "rewards/accuracy_reward/std": 2.683309316635132, "rewards/ngram_similarity_reward/mean": 0.5505087375640869, "rewards/ngram_similarity_reward/std": 0.19984152913093567, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 423.765625, "completions/mean_terminated_length": 423.765625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.6126650257328261, "frac_reward_zero_std": 0.5, "grad_norm": 0.05521805211901665, "learning_rate": 4.2409713023561045e-06, "loss": -0.0124, "num_tokens": 218204051.0, "reward": 4.729894161224365, "reward_std": 0.12251585721969604, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7298941612243652, "rewards/ngram_similarity_reward/std": 0.3434084355831146, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 517.515625, "completions/mean_terminated_length": 517.515625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.61311255314388, "frac_reward_zero_std": 0.0, "grad_norm": 0.06719180196523666, "learning_rate": 4.239724110504725e-06, "loss": -0.0246, "num_tokens": 218367764.0, "reward": 4.951564788818359, "reward_std": 0.5328652858734131, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 1.0453150272369385, "rewards/ngram_similarity_reward/std": 0.22503575682640076, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 430.796875, "completions/mean_terminated_length": 430.796875, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.6135600805549339, "frac_reward_zero_std": 0.25, "grad_norm": 0.07877121865749359, "learning_rate": 4.238476103112734e-06, "loss": 0.0225, "num_tokens": 218514135.0, "reward": 3.2725768089294434, "reward_std": 0.972164511680603, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.8663267493247986, "rewards/ngram_similarity_reward/std": 0.17302057147026062, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 804.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 524.265625, "completions/mean_terminated_length": 524.265625, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.614007607965988, "frac_reward_zero_std": 0.0, "grad_norm": 0.09043522924184799, "learning_rate": 4.237227280863345e-06, "loss": -0.0094, "num_tokens": 218713976.0, "reward": 5.182313919067383, "reward_std": 0.796884298324585, "rewards/accuracy_reward/mean": 4.375, "rewards/accuracy_reward/std": 2.3603873252868652, "rewards/ngram_similarity_reward/mean": 0.807313859462738, "rewards/ngram_similarity_reward/std": 0.25411897897720337, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 584.484375, "completions/mean_terminated_length": 537.274169921875, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.6144551353770419, "frac_reward_zero_std": 0.0, "grad_norm": 0.08146406710147858, "learning_rate": 4.235977644440219e-06, "loss": -0.0147, "num_tokens": 218857367.0, "reward": 5.981154441833496, "reward_std": 0.5761600732803345, "rewards/accuracy_reward/mean": 5.3125, "rewards/accuracy_reward/std": 1.0522085428237915, "rewards/ngram_similarity_reward/mean": 0.6686544418334961, "rewards/ngram_similarity_reward/std": 0.3249327838420868, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 542.3125, "completions/mean_terminated_length": 542.3125, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.6149026627880958, "frac_reward_zero_std": 0.0, "grad_norm": 0.0895610898733139, "learning_rate": 4.234727194527462e-06, "loss": 0.0409, "num_tokens": 218992123.0, "reward": 4.557622909545898, "reward_std": 0.7898719906806946, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.557623028755188, "rewards/ngram_similarity_reward/std": 0.3491251766681671, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1182.0, "completions/max_terminated_length": 1182.0, "completions/mean_length": 494.109375, "completions/mean_terminated_length": 494.109375, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.6153501901991497, "frac_reward_zero_std": 0.0, "grad_norm": 0.08254073560237885, "learning_rate": 4.233475931809626e-06, "loss": 0.0243, "num_tokens": 219199634.0, "reward": 5.252430438995361, "reward_std": 0.8898434638977051, "rewards/accuracy_reward/mean": 4.5625, "rewards/accuracy_reward/std": 2.195775270462036, "rewards/ngram_similarity_reward/mean": 0.6899309158325195, "rewards/ngram_similarity_reward/std": 0.3919968903064728, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 783.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 486.015625, "completions/mean_terminated_length": 486.015625, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.6157977176102036, "frac_reward_zero_std": 0.25, "grad_norm": 0.0924331396818161, "learning_rate": 4.232223856971705e-06, "loss": 0.0518, "num_tokens": 219412211.0, "reward": 1.6161731481552124, "reward_std": 0.0947863757610321, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6161731481552124, "rewards/ngram_similarity_reward/std": 0.30199921131134033, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1607.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 478.03125, "completions/mean_terminated_length": 478.03125, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.6162452450212575, "frac_reward_zero_std": 0.0, "grad_norm": 0.09547276049852371, "learning_rate": 4.230970970699143e-06, "loss": -0.0305, "num_tokens": 219547125.0, "reward": 3.140580177307129, "reward_std": 0.0856446921825409, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6405801177024841, "rewards/ngram_similarity_reward/std": 0.2989759147167206, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1193.0, "completions/max_terminated_length": 1193.0, "completions/mean_length": 584.90625, "completions/mean_terminated_length": 584.90625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.6166927724323115, "frac_reward_zero_std": 0.0, "grad_norm": 0.07841598987579346, "learning_rate": 4.229717273677823e-06, "loss": -0.0298, "num_tokens": 219699087.0, "reward": 3.326608657836914, "reward_std": 0.7123034596443176, "rewards/accuracy_reward/mean": 2.875, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.4516085982322693, "rewards/ngram_similarity_reward/std": 0.28527435660362244, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 807.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 530.78125, "completions/mean_terminated_length": 530.78125, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.6171402998433654, "frac_reward_zero_std": 0.0, "grad_norm": 0.08135497570037842, "learning_rate": 4.228462766594075e-06, "loss": -0.0011, "num_tokens": 219858593.0, "reward": 3.2081475257873535, "reward_std": 0.4310663640499115, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.6143976449966431, "rewards/ngram_similarity_reward/std": 0.29245811700820923, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 493.625, "completions/mean_terminated_length": 493.625, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.6175878272544193, "frac_reward_zero_std": 0.0, "grad_norm": 0.1111777052283287, "learning_rate": 4.22720745013467e-06, "loss": 0.02, "num_tokens": 220012825.0, "reward": 5.311241626739502, "reward_std": 1.3066898584365845, "rewards/accuracy_reward/mean": 4.75, "rewards/accuracy_reward/std": 2.0, "rewards/ngram_similarity_reward/mean": 0.561241626739502, "rewards/ngram_similarity_reward/std": 0.144589364528656, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 351.859375, "completions/mean_terminated_length": 351.859375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.6180353546654732, "frac_reward_zero_std": 0.5, "grad_norm": 0.07743772119283676, "learning_rate": 4.225951324986826e-06, "loss": -0.0064, "num_tokens": 220124000.0, "reward": 2.014042615890503, "reward_std": 0.5598815083503723, "rewards/accuracy_reward/mean": 1.1875, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.8265426158905029, "rewards/ngram_similarity_reward/std": 0.26197633147239685, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1032.0, "completions/max_terminated_length": 1032.0, "completions/mean_length": 591.046875, "completions/mean_terminated_length": 591.046875, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.6184828820765272, "frac_reward_zero_std": 0.0, "grad_norm": 0.09611193090677261, "learning_rate": 4.2246943918382e-06, "loss": -0.0242, "num_tokens": 220326835.0, "reward": 3.06565260887146, "reward_std": 0.4496273994445801, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.6594024896621704, "rewards/ngram_similarity_reward/std": 0.18902292847633362, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 515.78125, "completions/mean_terminated_length": 515.78125, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.6189304094875812, "frac_reward_zero_std": 0.0, "grad_norm": 0.07921776920557022, "learning_rate": 4.223436651376892e-06, "loss": 0.0166, "num_tokens": 220566773.0, "reward": 4.603166103363037, "reward_std": 0.44305115938186646, "rewards/accuracy_reward/mean": 4.09375, "rewards/accuracy_reward/std": 2.5617377758026123, "rewards/ngram_similarity_reward/mean": 0.5094163417816162, "rewards/ngram_similarity_reward/std": 0.1663961410522461, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 462.15625, "completions/mean_terminated_length": 462.15625, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.6193779368986351, "frac_reward_zero_std": 0.25, "grad_norm": 0.0742671862244606, "learning_rate": 4.222178104291445e-06, "loss": -0.0078, "num_tokens": 220772879.0, "reward": 4.781156539916992, "reward_std": 0.1699088215827942, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7811561822891235, "rewards/ngram_similarity_reward/std": 0.2659415304660797, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 487.75, "completions/mean_terminated_length": 487.75, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.619825464309689, "frac_reward_zero_std": 0.0, "grad_norm": 0.08986928313970566, "learning_rate": 4.220918751270843e-06, "loss": -0.0167, "num_tokens": 220899711.0, "reward": 2.0840022563934326, "reward_std": 1.6939761638641357, "rewards/accuracy_reward/mean": 1.1875, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.8965021371841431, "rewards/ngram_similarity_reward/std": 0.20797498524188995, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 920.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 530.765625, "completions/mean_terminated_length": 530.765625, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.6202729917207429, "frac_reward_zero_std": 0.25, "grad_norm": 0.07207165658473969, "learning_rate": 4.219658593004512e-06, "loss": 0.0213, "num_tokens": 221057792.0, "reward": 1.5318541526794434, "reward_std": 0.8543497920036316, "rewards/accuracy_reward/mean": 0.984375, "rewards/accuracy_reward/std": 2.6306629180908203, "rewards/ngram_similarity_reward/mean": 0.5474792718887329, "rewards/ngram_similarity_reward/std": 0.28610652685165405, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 390.5, "completions/mean_terminated_length": 390.5, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.6207205191317968, "frac_reward_zero_std": 0.25, "grad_norm": 0.09263911843299866, "learning_rate": 4.2183976301823164e-06, "loss": -0.0364, "num_tokens": 221180448.0, "reward": 2.463513135910034, "reward_std": 0.8897101879119873, "rewards/accuracy_reward/mean": 1.75, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.713513195514679, "rewards/ngram_similarity_reward/std": 0.3683803081512451, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 441.359375, "completions/mean_terminated_length": 441.359375, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.6211680465428507, "frac_reward_zero_std": 0.0, "grad_norm": 0.08527567982673645, "learning_rate": 4.217135863494564e-06, "loss": 0.0011, "num_tokens": 221346135.0, "reward": 6.123813152313232, "reward_std": 0.12822559475898743, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.6238132119178772, "rewards/ngram_similarity_reward/std": 0.3540512025356293, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 607.703125, "completions/mean_terminated_length": 607.703125, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.6216155739539047, "frac_reward_zero_std": 0.0, "grad_norm": 0.07611112296581268, "learning_rate": 4.215873293632003e-06, "loss": 0.0155, "num_tokens": 221528196.0, "reward": 3.1610960960388184, "reward_std": 2.0345959663391113, "rewards/accuracy_reward/mean": 2.3125, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.8485962152481079, "rewards/ngram_similarity_reward/std": 0.2572100758552551, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 807.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 494.921875, "completions/mean_terminated_length": 494.921875, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.6220631013649586, "frac_reward_zero_std": 0.0, "grad_norm": 0.08504889160394669, "learning_rate": 4.214609921285818e-06, "loss": 0.01, "num_tokens": 221653263.0, "reward": 3.1183433532714844, "reward_std": 0.13722378015518188, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6183434128761292, "rewards/ngram_similarity_reward/std": 0.28606945276260376, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 545.40625, "completions/mean_terminated_length": 545.40625, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 0.6225106287760125, "frac_reward_zero_std": 0.0, "grad_norm": 0.08638472110033035, "learning_rate": 4.2133457471476345e-06, "loss": 0.0115, "num_tokens": 221812377.0, "reward": 2.7754156589508057, "reward_std": 1.5384128093719482, "rewards/accuracy_reward/mean": 2.015625, "rewards/accuracy_reward/std": 3.00260329246521, "rewards/ngram_similarity_reward/mean": 0.7597907185554504, "rewards/ngram_similarity_reward/std": 0.31502005457878113, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 433.03125, "completions/mean_terminated_length": 433.03125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.6229581561870664, "frac_reward_zero_std": 0.25, "grad_norm": 0.07527010142803192, "learning_rate": 4.2120807719095166e-06, "loss": 0.0026, "num_tokens": 221934555.0, "reward": 6.035556793212891, "reward_std": 0.10023511946201324, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.5355567336082458, "rewards/ngram_similarity_reward/std": 0.2512805461883545, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 436.96875, "completions/mean_terminated_length": 436.96875, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.6234056835981204, "frac_reward_zero_std": 0.0, "grad_norm": 0.09910791367292404, "learning_rate": 4.2108149962639695e-06, "loss": -0.0045, "num_tokens": 222097513.0, "reward": 6.168093204498291, "reward_std": 0.47341105341911316, "rewards/accuracy_reward/mean": 5.40625, "rewards/accuracy_reward/std": 0.7500000596046448, "rewards/ngram_similarity_reward/mean": 0.761843204498291, "rewards/ngram_similarity_reward/std": 0.2521355152130127, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 494.171875, "completions/mean_terminated_length": 494.171875, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.6238532110091743, "frac_reward_zero_std": 0.0, "grad_norm": 0.09825079888105392, "learning_rate": 4.209548420903934e-06, "loss": -0.0026, "num_tokens": 222248404.0, "reward": 3.4757986068725586, "reward_std": 1.0716943740844727, "rewards/accuracy_reward/mean": 2.96875, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.5070487260818481, "rewards/ngram_similarity_reward/std": 0.28137826919555664, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 495.625, "completions/mean_terminated_length": 470.9841613769531, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.6243007384202283, "frac_reward_zero_std": 0.0, "grad_norm": 0.08490008115768433, "learning_rate": 4.208281046522788e-06, "loss": -0.0214, "num_tokens": 222377292.0, "reward": 4.1201958656311035, "reward_std": 0.8970295190811157, "rewards/accuracy_reward/mean": 3.34375, "rewards/accuracy_reward/std": 2.9016621112823486, "rewards/ngram_similarity_reward/mean": 0.7764459848403931, "rewards/ngram_similarity_reward/std": 0.2914271056652069, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 420.53125, "completions/mean_terminated_length": 420.53125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.6247482658312822, "frac_reward_zero_std": 0.25, "grad_norm": 0.09060279279947281, "learning_rate": 4.207012873814349e-06, "loss": 0.0085, "num_tokens": 222572638.0, "reward": 4.718295574188232, "reward_std": 0.0775412917137146, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7182955145835876, "rewards/ngram_similarity_reward/std": 0.30478185415267944, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 392.390625, "completions/mean_terminated_length": 392.390625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.6251957932423361, "frac_reward_zero_std": 0.0, "grad_norm": 0.11478184163570404, "learning_rate": 4.20574390347287e-06, "loss": -0.0186, "num_tokens": 222742759.0, "reward": 4.449161529541016, "reward_std": 0.7801280617713928, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.8241614103317261, "rewards/ngram_similarity_reward/std": 0.4052240550518036, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 441.484375, "completions/mean_terminated_length": 441.484375, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.62564332065339, "frac_reward_zero_std": 0.0, "grad_norm": 0.0892636626958847, "learning_rate": 4.2044741361930425e-06, "loss": 0.0056, "num_tokens": 222887830.0, "reward": 3.9365570545196533, "reward_std": 0.8537122011184692, "rewards/accuracy_reward/mean": 2.96875, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.9678070545196533, "rewards/ngram_similarity_reward/std": 0.15813502669334412, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 476.828125, "completions/mean_terminated_length": 476.828125, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.6260908480644439, "frac_reward_zero_std": 0.0, "grad_norm": 0.07625213265419006, "learning_rate": 4.203203572669992e-06, "loss": 0.012, "num_tokens": 223017387.0, "reward": 4.069248676300049, "reward_std": 0.9240694046020508, "rewards/accuracy_reward/mean": 3.453125, "rewards/accuracy_reward/std": 2.962354898452759, "rewards/ngram_similarity_reward/mean": 0.6161236763000488, "rewards/ngram_similarity_reward/std": 0.24392534792423248, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 453.90625, "completions/mean_terminated_length": 453.90625, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.6265383754754978, "frac_reward_zero_std": 0.0, "grad_norm": 0.0871860533952713, "learning_rate": 4.201932213599281e-06, "loss": 0.0197, "num_tokens": 223158677.0, "reward": 4.026385307312012, "reward_std": 0.9551817774772644, "rewards/accuracy_reward/mean": 3.34375, "rewards/accuracy_reward/std": 2.9016621112823486, "rewards/ngram_similarity_reward/mean": 0.6826354265213013, "rewards/ngram_similarity_reward/std": 0.33283618092536926, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 457.34375, "completions/mean_terminated_length": 457.34375, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.6269859028865518, "frac_reward_zero_std": 0.0, "grad_norm": 0.08433037996292114, "learning_rate": 4.200660059676908e-06, "loss": -0.0166, "num_tokens": 223337515.0, "reward": 5.095659255981445, "reward_std": 1.2406134605407715, "rewards/accuracy_reward/mean": 4.375, "rewards/accuracy_reward/std": 2.3603873252868652, "rewards/ngram_similarity_reward/mean": 0.7206593751907349, "rewards/ngram_similarity_reward/std": 0.3060435652732849, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 409.21875, "completions/mean_terminated_length": 409.21875, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.6274334302976057, "frac_reward_zero_std": 0.0, "grad_norm": 0.10011816769838333, "learning_rate": 4.199387111599305e-06, "loss": -0.0469, "num_tokens": 223519049.0, "reward": 4.542868614196777, "reward_std": 0.11508417129516602, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5428687334060669, "rewards/ngram_similarity_reward/std": 0.3072071075439453, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 478.453125, "completions/mean_terminated_length": 478.453125, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.6278809577086597, "frac_reward_zero_std": 0.0, "grad_norm": 0.06337883323431015, "learning_rate": 4.198113370063342e-06, "loss": -0.0013, "num_tokens": 223674614.0, "reward": 5.18435001373291, "reward_std": 1.8982305526733398, "rewards/accuracy_reward/mean": 4.375, "rewards/accuracy_reward/std": 2.3603873252868652, "rewards/ngram_similarity_reward/mean": 0.8093503713607788, "rewards/ngram_similarity_reward/std": 0.3255639672279358, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 490.171875, "completions/mean_terminated_length": 439.9193420410156, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.6283284851197136, "frac_reward_zero_std": 0.25, "grad_norm": 0.10022468119859695, "learning_rate": 4.196838835766318e-06, "loss": 0.0202, "num_tokens": 223784241.0, "reward": 2.7793374061584473, "reward_std": 1.5091160535812378, "rewards/accuracy_reward/mean": 2.125, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.6543375849723816, "rewards/ngram_similarity_reward/std": 0.2150864601135254, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 484.859375, "completions/mean_terminated_length": 460.0476379394531, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.6287760125307675, "frac_reward_zero_std": 0.0, "grad_norm": 0.08851854503154755, "learning_rate": 4.1955635094059725e-06, "loss": -0.0504, "num_tokens": 223979768.0, "reward": 3.8104681968688965, "reward_std": 1.2830171585083008, "rewards/accuracy_reward/mean": 3.34375, "rewards/accuracy_reward/std": 2.9016621112823486, "rewards/ngram_similarity_reward/mean": 0.4667181074619293, "rewards/ngram_similarity_reward/std": 0.28389641642570496, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 484.109375, "completions/mean_terminated_length": 484.109375, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.6292235399418215, "frac_reward_zero_std": 0.0, "grad_norm": 0.07858376204967499, "learning_rate": 4.1942873916804736e-06, "loss": -0.0391, "num_tokens": 224116303.0, "reward": 4.786311149597168, "reward_std": 0.15001791715621948, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7863106727600098, "rewards/ngram_similarity_reward/std": 0.24081631004810333, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1050.0, "completions/mean_length": 616.734375, "completions/mean_terminated_length": 594.0159301757812, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.6296710673528754, "frac_reward_zero_std": 0.0, "grad_norm": 0.09792505204677582, "learning_rate": 4.193010483288424e-06, "loss": -0.0839, "num_tokens": 224303886.0, "reward": 3.317302703857422, "reward_std": 0.6904056668281555, "rewards/accuracy_reward/mean": 2.640625, "rewards/accuracy_reward/std": 3.075077533721924, "rewards/ngram_similarity_reward/mean": 0.676677942276001, "rewards/ngram_similarity_reward/std": 0.278852641582489, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 500.9375, "completions/mean_terminated_length": 500.9375, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.6301185947639293, "frac_reward_zero_std": 0.0, "grad_norm": 0.09619695693254471, "learning_rate": 4.191732784928862e-06, "loss": -0.02, "num_tokens": 224436426.0, "reward": 3.7580008506774902, "reward_std": 0.8680193424224854, "rewards/accuracy_reward/mean": 3.0625, "rewards/accuracy_reward/std": 2.9700891971588135, "rewards/ngram_similarity_reward/mean": 0.6955007314682007, "rewards/ngram_similarity_reward/std": 0.18768362700939178, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 411.078125, "completions/mean_terminated_length": 411.078125, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.6305661221749832, "frac_reward_zero_std": 0.0, "grad_norm": 0.10975297540426254, "learning_rate": 4.190454297301254e-06, "loss": -0.0123, "num_tokens": 224555967.0, "reward": 3.212692975997925, "reward_std": 0.2119302749633789, "rewards/accuracy_reward/mean": 2.46875, "rewards/accuracy_reward/std": 3.06007981300354, "rewards/ngram_similarity_reward/mean": 0.7439426779747009, "rewards/ngram_similarity_reward/std": 0.3938526213169098, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 389.703125, "completions/mean_terminated_length": 389.703125, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.6310136495860371, "frac_reward_zero_std": 0.0, "grad_norm": 0.14062047004699707, "learning_rate": 4.189175021105499e-06, "loss": -0.0113, "num_tokens": 224741740.0, "reward": 5.415818214416504, "reward_std": 0.9347876906394958, "rewards/accuracy_reward/mean": 4.84375, "rewards/accuracy_reward/std": 1.8874586820602417, "rewards/ngram_similarity_reward/mean": 0.5720680952072144, "rewards/ngram_similarity_reward/std": 0.4173468351364136, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1158.0, "completions/mean_length": 772.265625, "completions/mean_terminated_length": 477.86541748046875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.631461176997091, "frac_reward_zero_std": 0.0, "grad_norm": 0.13017436861991882, "learning_rate": 4.187894957041933e-06, "loss": -0.0176, "num_tokens": 224883661.0, "reward": 2.0202715396881104, "reward_std": 0.8831048011779785, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.6452715396881104, "rewards/ngram_similarity_reward/std": 0.41355928778648376, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1232.0, "completions/mean_length": 643.875, "completions/mean_terminated_length": 598.5806274414062, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.631908704408145, "frac_reward_zero_std": 0.0, "grad_norm": 0.06196107342839241, "learning_rate": 4.186614105811317e-06, "loss": 0.0645, "num_tokens": 225031029.0, "reward": 3.5483577251434326, "reward_std": 1.4880199432373047, "rewards/accuracy_reward/mean": 2.78125, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.7671077251434326, "rewards/ngram_similarity_reward/std": 0.30070042610168457, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 428.84375, "completions/mean_terminated_length": 428.84375, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.632356231819199, "frac_reward_zero_std": 0.25, "grad_norm": 0.07660739868879318, "learning_rate": 4.185332468114844e-06, "loss": 0.0095, "num_tokens": 225170971.0, "reward": 3.1542890071868896, "reward_std": 0.09491442143917084, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6542891263961792, "rewards/ngram_similarity_reward/std": 0.20330768823623657, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 769.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 493.921875, "completions/mean_terminated_length": 493.921875, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.6328037592302529, "frac_reward_zero_std": 0.0, "grad_norm": 0.09530206024646759, "learning_rate": 4.184050044654142e-06, "loss": 0.002, "num_tokens": 225292902.0, "reward": 4.7313737869262695, "reward_std": 0.144403874874115, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7313735485076904, "rewards/ngram_similarity_reward/std": 0.2395082414150238, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 420.109375, "completions/mean_terminated_length": 420.109375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.6332512866413068, "frac_reward_zero_std": 0.0, "grad_norm": 0.09896707534790039, "learning_rate": 4.182766836131265e-06, "loss": 0.0097, "num_tokens": 225479677.0, "reward": 6.169071674346924, "reward_std": 0.5309245586395264, "rewards/accuracy_reward/mean": 5.390625, "rewards/accuracy_reward/std": 0.8750000596046448, "rewards/ngram_similarity_reward/mean": 0.7784466743469238, "rewards/ngram_similarity_reward/std": 0.2396049052476883, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 483.59375, "completions/mean_terminated_length": 483.59375, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.6336988140523607, "frac_reward_zero_std": 0.0, "grad_norm": 0.07389920204877853, "learning_rate": 4.181482843248697e-06, "loss": -0.0102, "num_tokens": 225630275.0, "reward": 5.170687198638916, "reward_std": 0.7861341238021851, "rewards/accuracy_reward/mean": 4.375, "rewards/accuracy_reward/std": 2.3603873252868652, "rewards/ngram_similarity_reward/mean": 0.7956870794296265, "rewards/ngram_similarity_reward/std": 0.2827925384044647, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 529.90625, "completions/mean_terminated_length": 529.90625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.6341463414634146, "frac_reward_zero_std": 0.0, "grad_norm": 0.07031376659870148, "learning_rate": 4.180198066709354e-06, "loss": 0.0024, "num_tokens": 225791293.0, "reward": 6.244063854217529, "reward_std": 0.11442062258720398, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.7440634369850159, "rewards/ngram_similarity_reward/std": 0.34232163429260254, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 438.3125, "completions/mean_terminated_length": 438.3125, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.6345938688744686, "frac_reward_zero_std": 0.0, "grad_norm": 0.12037988752126694, "learning_rate": 4.178912507216577e-06, "loss": 0.0319, "num_tokens": 225940417.0, "reward": 3.188974618911743, "reward_std": 0.4819395840167999, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.7827244997024536, "rewards/ngram_similarity_reward/std": 0.28460362553596497, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 511.234375, "completions/mean_terminated_length": 511.234375, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.6350413962855225, "frac_reward_zero_std": 0.0, "grad_norm": 0.10589804500341415, "learning_rate": 4.17762616547414e-06, "loss": 0.0219, "num_tokens": 226085168.0, "reward": 2.5460028648376465, "reward_std": 1.662891149520874, "rewards/accuracy_reward/mean": 2.109375, "rewards/accuracy_reward/std": 3.125000238418579, "rewards/ngram_similarity_reward/mean": 0.43662798404693604, "rewards/ngram_similarity_reward/std": 0.3201160132884979, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 457.375, "completions/mean_terminated_length": 457.375, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.6354889236965764, "frac_reward_zero_std": 0.0, "grad_norm": 0.08328361809253693, "learning_rate": 4.176339042186242e-06, "loss": -0.0262, "num_tokens": 226249320.0, "reward": 4.569077491760254, "reward_std": 0.9987720251083374, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.66282719373703, "rewards/ngram_similarity_reward/std": 0.17006200551986694, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 847.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 455.21875, "completions/mean_terminated_length": 455.21875, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.6359364511076303, "frac_reward_zero_std": 0.25, "grad_norm": 0.08780533075332642, "learning_rate": 4.175051138057512e-06, "loss": 0.0133, "num_tokens": 226405046.0, "reward": 2.935864210128784, "reward_std": 0.4295510947704315, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.34211423993110657, "rewards/ngram_similarity_reward/std": 0.15545785427093506, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 373.9375, "completions/mean_terminated_length": 373.9375, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.6363839785186842, "frac_reward_zero_std": 0.0, "grad_norm": 0.11097009479999542, "learning_rate": 4.173762453793004e-06, "loss": 0.0066, "num_tokens": 226561602.0, "reward": 4.57109260559082, "reward_std": 1.2318971157073975, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.6648430824279785, "rewards/ngram_similarity_reward/std": 0.3468696177005768, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 467.015625, "completions/mean_terminated_length": 467.015625, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.6368315059297383, "frac_reward_zero_std": 0.0, "grad_norm": 0.09840057790279388, "learning_rate": 4.172472990098201e-06, "loss": -0.0136, "num_tokens": 226692723.0, "reward": 5.230722427368164, "reward_std": 0.8436187505722046, "rewards/accuracy_reward/mean": 4.53125, "rewards/accuracy_reward/std": 2.27455735206604, "rewards/ngram_similarity_reward/mean": 0.6994720697402954, "rewards/ngram_similarity_reward/std": 0.3748303949832916, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 476.546875, "completions/mean_terminated_length": 476.546875, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.6372790333407922, "frac_reward_zero_std": 0.0, "grad_norm": 0.09602098912000656, "learning_rate": 4.171182747679013e-06, "loss": 0.0086, "num_tokens": 226869206.0, "reward": 4.479099273681641, "reward_std": 0.679693341255188, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.7603493332862854, "rewards/ngram_similarity_reward/std": 0.23113702237606049, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 480.1875, "completions/mean_terminated_length": 455.3016052246094, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.6377265607518461, "frac_reward_zero_std": 0.0, "grad_norm": 0.08428952097892761, "learning_rate": 4.169891727241775e-06, "loss": -0.047, "num_tokens": 227013986.0, "reward": 1.4020637273788452, "reward_std": 0.5320973992347717, "rewards/accuracy_reward/mean": 0.890625, "rewards/accuracy_reward/std": 2.5734739303588867, "rewards/ngram_similarity_reward/mean": 0.5114387273788452, "rewards/ngram_similarity_reward/std": 0.2666884958744049, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 949.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 510.0625, "completions/mean_terminated_length": 510.0625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.6381740881629, "frac_reward_zero_std": 0.25, "grad_norm": 0.08204381167888641, "learning_rate": 4.168599929493249e-06, "loss": 0.0165, "num_tokens": 227161734.0, "reward": 1.611677885055542, "reward_std": 0.07850028574466705, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6116780042648315, "rewards/ngram_similarity_reward/std": 0.27441543340682983, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 447.28125, "completions/mean_terminated_length": 447.28125, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.6386216155739539, "frac_reward_zero_std": 0.0, "grad_norm": 0.11197412759065628, "learning_rate": 4.16730735514062e-06, "loss": 0.0244, "num_tokens": 227332968.0, "reward": 4.392323017120361, "reward_std": 2.07807993888855, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.6735729575157166, "rewards/ngram_similarity_reward/std": 0.31565147638320923, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 457.984375, "completions/mean_terminated_length": 457.984375, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.6390691429850078, "frac_reward_zero_std": 0.0, "grad_norm": 0.10292767733335495, "learning_rate": 4.166014004891504e-06, "loss": 0.0246, "num_tokens": 227463543.0, "reward": 2.124846935272217, "reward_std": 1.1881186962127686, "rewards/accuracy_reward/mean": 1.4375, "rewards/accuracy_reward/std": 2.8667497634887695, "rewards/ngram_similarity_reward/mean": 0.687346875667572, "rewards/ngram_similarity_reward/std": 0.1769050657749176, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 420.515625, "completions/mean_terminated_length": 420.515625, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.6395166703960617, "frac_reward_zero_std": 0.0, "grad_norm": 0.09785809367895126, "learning_rate": 4.164719879453934e-06, "loss": 0.0257, "num_tokens": 227646712.0, "reward": 4.452017307281494, "reward_std": 0.5627508163452148, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.6395174264907837, "rewards/ngram_similarity_reward/std": 0.26190394163131714, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 429.484375, "completions/mean_terminated_length": 429.484375, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.6399641978071157, "frac_reward_zero_std": 0.0, "grad_norm": 0.12155099213123322, "learning_rate": 4.163424979536373e-06, "loss": -0.0046, "num_tokens": 227828087.0, "reward": 4.365470886230469, "reward_std": 2.231382369995117, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.6467206478118896, "rewards/ngram_similarity_reward/std": 0.2218778282403946, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 441.546875, "completions/mean_terminated_length": 441.546875, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.6404117252181696, "frac_reward_zero_std": 0.0, "grad_norm": 0.09199751913547516, "learning_rate": 4.162129305847707e-06, "loss": 0.0159, "num_tokens": 227932506.0, "reward": 4.9037675857543945, "reward_std": 0.1362752467393875, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.9037677049636841, "rewards/ngram_similarity_reward/std": 0.22845500707626343, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1219.0, "completions/max_terminated_length": 1219.0, "completions/mean_length": 487.265625, "completions/mean_terminated_length": 487.265625, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.6408592526292235, "frac_reward_zero_std": 0.0, "grad_norm": 0.10971736162900925, "learning_rate": 4.160832859097243e-06, "loss": -0.0071, "num_tokens": 228162699.0, "reward": 2.998798131942749, "reward_std": 0.5218923687934875, "rewards/accuracy_reward/mean": 2.375, "rewards/accuracy_reward/std": 3.057647228240967, "rewards/ngram_similarity_reward/mean": 0.6237983703613281, "rewards/ngram_similarity_reward/std": 0.26022130250930786, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 494.171875, "completions/mean_terminated_length": 494.171875, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.6413067800402774, "frac_reward_zero_std": 0.0, "grad_norm": 0.08202996104955673, "learning_rate": 4.159535639994714e-06, "loss": 0.0028, "num_tokens": 228326134.0, "reward": 6.2203145027160645, "reward_std": 0.07296191155910492, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.720314621925354, "rewards/ngram_similarity_reward/std": 0.3275502622127533, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1251.0, "completions/mean_length": 510.65625, "completions/mean_terminated_length": 486.2539978027344, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.6417543074513314, "frac_reward_zero_std": 0.0, "grad_norm": 0.1214066594839096, "learning_rate": 4.158237649250276e-06, "loss": -0.0613, "num_tokens": 228500368.0, "reward": 1.6632508039474487, "reward_std": 0.6656859517097473, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 2.861900806427002, "rewards/ngram_similarity_reward/mean": 0.6632509231567383, "rewards/ngram_similarity_reward/std": 0.3105030953884125, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 908.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 551.90625, "completions/mean_terminated_length": 551.90625, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.6422018348623854, "frac_reward_zero_std": 0.0, "grad_norm": 0.08250430971384048, "learning_rate": 4.1569388875745044e-06, "loss": 0.0044, "num_tokens": 228654202.0, "reward": 4.726158618927002, "reward_std": 0.8597042560577393, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7261587381362915, "rewards/ngram_similarity_reward/std": 0.15916042029857635, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 462.015625, "completions/mean_terminated_length": 462.015625, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.6426493622734393, "frac_reward_zero_std": 0.0, "grad_norm": 0.07860218733549118, "learning_rate": 4.1556393556784e-06, "loss": 0.0014, "num_tokens": 228800971.0, "reward": 6.067918300628662, "reward_std": 0.13865211606025696, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.5679180026054382, "rewards/ngram_similarity_reward/std": 0.2406175434589386, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1525.0, "completions/max_terminated_length": 1525.0, "completions/mean_length": 579.734375, "completions/mean_terminated_length": 579.734375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.6430968896844932, "frac_reward_zero_std": 0.0, "grad_norm": 0.08776629716157913, "learning_rate": 4.154339054273383e-06, "loss": 0.0446, "num_tokens": 228971802.0, "reward": 3.1615724563598633, "reward_std": 0.05697369575500488, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6615725159645081, "rewards/ngram_similarity_reward/std": 0.36188453435897827, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 481.359375, "completions/mean_terminated_length": 481.359375, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.6435444170955471, "frac_reward_zero_std": 0.0, "grad_norm": 0.12041721493005753, "learning_rate": 4.153037984071296e-06, "loss": 0.0159, "num_tokens": 229177937.0, "reward": 5.157722473144531, "reward_std": 1.9307384490966797, "rewards/accuracy_reward/mean": 4.5625, "rewards/accuracy_reward/std": 2.195775270462036, "rewards/ngram_similarity_reward/mean": 0.5952223539352417, "rewards/ngram_similarity_reward/std": 0.37407711148262024, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 527.296875, "completions/mean_terminated_length": 527.296875, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.643991944506601, "frac_reward_zero_std": 0.0, "grad_norm": 0.08580014109611511, "learning_rate": 4.151736145784402e-06, "loss": -0.0043, "num_tokens": 229319604.0, "reward": 3.193204641342163, "reward_std": 0.19513703882694244, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6932047605514526, "rewards/ngram_similarity_reward/std": 0.3354808986186981, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 477.859375, "completions/mean_terminated_length": 477.859375, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.6444394719176549, "frac_reward_zero_std": 0.0, "grad_norm": 0.0718073844909668, "learning_rate": 4.150433540125385e-06, "loss": -0.0022, "num_tokens": 229481003.0, "reward": 6.222909450531006, "reward_std": 0.6646191477775574, "rewards/accuracy_reward/mean": 5.3125, "rewards/accuracy_reward/std": 1.0522085428237915, "rewards/ngram_similarity_reward/mean": 0.9104093313217163, "rewards/ngram_similarity_reward/std": 0.22802779078483582, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 505.671875, "completions/mean_terminated_length": 505.671875, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.6448869993287089, "frac_reward_zero_std": 0.0, "grad_norm": 0.08394629508256912, "learning_rate": 4.149130167807347e-06, "loss": 0.0163, "num_tokens": 229615846.0, "reward": 5.157923698425293, "reward_std": 0.9705162048339844, "rewards/accuracy_reward/mean": 4.625, "rewards/accuracy_reward/std": 2.3333334922790527, "rewards/ngram_similarity_reward/mean": 0.532923698425293, "rewards/ngram_similarity_reward/std": 0.39065343141555786, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 390.0, "completions/mean_terminated_length": 390.0, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.6453345267397628, "frac_reward_zero_std": 0.0, "grad_norm": 0.1086539626121521, "learning_rate": 4.147826029543815e-06, "loss": 0.0071, "num_tokens": 229777062.0, "reward": 4.193568706512451, "reward_std": 1.4892494678497314, "rewards/accuracy_reward/mean": 3.53125, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.6623190641403198, "rewards/ngram_similarity_reward/std": 0.3748938739299774, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 595.953125, "completions/mean_terminated_length": 595.953125, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 0.6457820541508167, "frac_reward_zero_std": 0.0, "grad_norm": 0.06477946043014526, "learning_rate": 4.146521126048729e-06, "loss": -0.0083, "num_tokens": 229990707.0, "reward": 4.260105609893799, "reward_std": 1.5706150531768799, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.5413554906845093, "rewards/ngram_similarity_reward/std": 0.22571967542171478, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 361.046875, "completions/mean_terminated_length": 361.046875, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.6462295815618707, "frac_reward_zero_std": 0.25, "grad_norm": 0.0986800566315651, "learning_rate": 4.145215458036451e-06, "loss": -0.0243, "num_tokens": 230160774.0, "reward": 4.196253776550293, "reward_std": 1.152472972869873, "rewards/accuracy_reward/mean": 3.34375, "rewards/accuracy_reward/std": 2.9016621112823486, "rewards/ngram_similarity_reward/mean": 0.8525041341781616, "rewards/ngram_similarity_reward/std": 0.22282861173152924, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 485.859375, "completions/mean_terminated_length": 485.859375, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.6466771089729246, "frac_reward_zero_std": 0.25, "grad_norm": 0.06249188259243965, "learning_rate": 4.1439090262217614e-06, "loss": -0.0188, "num_tokens": 230323053.0, "reward": 6.313204765319824, "reward_std": 0.754632830619812, "rewards/accuracy_reward/mean": 5.3125, "rewards/accuracy_reward/std": 1.0522085428237915, "rewards/ngram_similarity_reward/mean": 1.0007052421569824, "rewards/ngram_similarity_reward/std": 0.1831846833229065, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 455.734375, "completions/mean_terminated_length": 455.734375, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.6471246363839785, "frac_reward_zero_std": 0.0, "grad_norm": 0.07594480365514755, "learning_rate": 4.142601831319859e-06, "loss": 0.0022, "num_tokens": 230487692.0, "reward": 6.028704643249512, "reward_std": 0.8274354934692383, "rewards/accuracy_reward/mean": 5.3125, "rewards/accuracy_reward/std": 1.0522085428237915, "rewards/ngram_similarity_reward/mean": 0.7162050008773804, "rewards/ngram_similarity_reward/std": 0.33539679646492004, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 448.5, "completions/mean_terminated_length": 448.5, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.6475721637950325, "frac_reward_zero_std": 0.25, "grad_norm": 0.09063916653394699, "learning_rate": 4.141293874046359e-06, "loss": -0.0091, "num_tokens": 230630204.0, "reward": 3.5175516605377197, "reward_std": 1.1414765119552612, "rewards/accuracy_reward/mean": 2.875, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.6425517201423645, "rewards/ngram_similarity_reward/std": 0.29616227746009827, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 812.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 484.234375, "completions/mean_terminated_length": 484.234375, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.6480196912060864, "frac_reward_zero_std": 0.0, "grad_norm": 0.0769663080573082, "learning_rate": 4.139985155117296e-06, "loss": -0.0222, "num_tokens": 230793227.0, "reward": 5.935922622680664, "reward_std": 0.811854898929596, "rewards/accuracy_reward/mean": 5.3125, "rewards/accuracy_reward/std": 1.0522085428237915, "rewards/ngram_similarity_reward/mean": 0.6234228014945984, "rewards/ngram_similarity_reward/std": 0.2343846708536148, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 391.78125, "completions/mean_terminated_length": 391.78125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.6484672186171403, "frac_reward_zero_std": 0.25, "grad_norm": 0.08732519298791885, "learning_rate": 4.138675675249119e-06, "loss": 0.0031, "num_tokens": 230926109.0, "reward": 4.527279853820801, "reward_std": 0.4422011971473694, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.6210297346115112, "rewards/ngram_similarity_reward/std": 0.3546171486377716, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 487.765625, "completions/mean_terminated_length": 487.765625, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.6489147460281942, "frac_reward_zero_std": 0.0, "grad_norm": 0.10862072557210922, "learning_rate": 4.1373654351586955e-06, "loss": -0.0008, "num_tokens": 231127534.0, "reward": 3.2383363246917725, "reward_std": 2.2389791011810303, "rewards/accuracy_reward/mean": 2.484375, "rewards/accuracy_reward/std": 3.0419929027557373, "rewards/ngram_similarity_reward/mean": 0.7539613842964172, "rewards/ngram_similarity_reward/std": 0.3312397003173828, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 450.9375, "completions/mean_terminated_length": 450.9375, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.6493622734392481, "frac_reward_zero_std": 0.0, "grad_norm": 0.07241150736808777, "learning_rate": 4.13605443556331e-06, "loss": 0.0168, "num_tokens": 231288010.0, "reward": 5.961592674255371, "reward_std": 0.5978045463562012, "rewards/accuracy_reward/mean": 5.3125, "rewards/accuracy_reward/std": 1.0522085428237915, "rewards/ngram_similarity_reward/mean": 0.6490925550460815, "rewards/ngram_similarity_reward/std": 0.27072691917419434, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 473.9375, "completions/mean_terminated_length": 473.9375, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.649809800850302, "frac_reward_zero_std": 0.0, "grad_norm": 0.11711332201957703, "learning_rate": 4.1347426771806595e-06, "loss": 0.0014, "num_tokens": 231479702.0, "reward": 1.2033401727676392, "reward_std": 0.5603854060173035, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 2.5, "rewards/ngram_similarity_reward/mean": 0.39084017276763916, "rewards/ngram_similarity_reward/std": 0.14672933518886566, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 628.734375, "completions/mean_terminated_length": 628.734375, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.650257328261356, "frac_reward_zero_std": 0.0, "grad_norm": 0.0671830028295517, "learning_rate": 4.133430160728859e-06, "loss": 0.0051, "num_tokens": 231666981.0, "reward": 6.058471202850342, "reward_std": 0.42626237869262695, "rewards/accuracy_reward/mean": 5.40625, "rewards/accuracy_reward/std": 0.7500000596046448, "rewards/ngram_similarity_reward/mean": 0.6522212028503418, "rewards/ngram_similarity_reward/std": 0.17144225537776947, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 471.546875, "completions/mean_terminated_length": 471.546875, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.65070485567241, "frac_reward_zero_std": 0.0, "grad_norm": 0.10306880623102188, "learning_rate": 4.132116886926438e-06, "loss": 0.0074, "num_tokens": 231802824.0, "reward": 6.152955055236816, "reward_std": 0.18371598422527313, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.6529550552368164, "rewards/ngram_similarity_reward/std": 0.25471946597099304, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 863.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 426.421875, "completions/mean_terminated_length": 426.421875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.6511523830834639, "frac_reward_zero_std": 0.25, "grad_norm": 0.0816989615559578, "learning_rate": 4.130802856492341e-06, "loss": -0.0171, "num_tokens": 231965587.0, "reward": 4.789144515991211, "reward_std": 0.15055951476097107, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7891442775726318, "rewards/ngram_similarity_reward/std": 0.3067992925643921, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 508.359375, "completions/mean_terminated_length": 483.920654296875, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.6515999104945178, "frac_reward_zero_std": 0.0, "grad_norm": 0.1071004718542099, "learning_rate": 4.129488070145925e-06, "loss": -0.0149, "num_tokens": 232104602.0, "reward": 4.685388565063477, "reward_std": 0.09721273183822632, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6853882670402527, "rewards/ngram_similarity_reward/std": 0.2215292751789093, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 426.84375, "completions/mean_terminated_length": 426.84375, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.6520474379055717, "frac_reward_zero_std": 0.0, "grad_norm": 0.11188356578350067, "learning_rate": 4.128172528606963e-06, "loss": 0.0136, "num_tokens": 232242368.0, "reward": 5.314410209655762, "reward_std": 1.6823713779449463, "rewards/accuracy_reward/mean": 4.34375, "rewards/accuracy_reward/std": 2.4314002990722656, "rewards/ngram_similarity_reward/mean": 0.9706601500511169, "rewards/ngram_similarity_reward/std": 0.21696914732456207, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 434.15625, "completions/mean_terminated_length": 434.15625, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.6524949653166257, "frac_reward_zero_std": 0.25, "grad_norm": 0.11057297885417938, "learning_rate": 4.126856232595639e-06, "loss": -0.0247, "num_tokens": 232398378.0, "reward": 5.810824394226074, "reward_std": 1.0515462160110474, "rewards/accuracy_reward/mean": 5.125, "rewards/accuracy_reward/std": 1.4638501405715942, "rewards/ngram_similarity_reward/mean": 0.6858242750167847, "rewards/ngram_similarity_reward/std": 0.138453409075737, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 473.484375, "completions/mean_terminated_length": 473.484375, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.6529424927276796, "frac_reward_zero_std": 0.5, "grad_norm": 0.058471474796533585, "learning_rate": 4.125539182832553e-06, "loss": -0.008, "num_tokens": 232579209.0, "reward": 4.679767608642578, "reward_std": 0.07971321046352386, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6797680854797363, "rewards/ngram_similarity_reward/std": 0.3421189785003662, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 930.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 454.890625, "completions/mean_terminated_length": 454.890625, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.6533900201387335, "frac_reward_zero_std": 0.0, "grad_norm": 0.10413812100887299, "learning_rate": 4.124221380038716e-06, "loss": -0.0558, "num_tokens": 232762962.0, "reward": 3.1186277866363525, "reward_std": 0.09125609695911407, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6186277866363525, "rewards/ngram_similarity_reward/std": 0.29530709981918335, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 892.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 509.765625, "completions/mean_terminated_length": 509.765625, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.6538375475497874, "frac_reward_zero_std": 0.0, "grad_norm": 0.18827714025974274, "learning_rate": 4.1229028249355505e-06, "loss": -0.0112, "num_tokens": 233006467.0, "reward": 1.6649301052093506, "reward_std": 0.5911146402359009, "rewards/accuracy_reward/mean": 1.0625, "rewards/accuracy_reward/std": 2.695528507232666, "rewards/ngram_similarity_reward/mean": 0.6024301648139954, "rewards/ngram_similarity_reward/std": 0.2865312695503235, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 547.984375, "completions/mean_terminated_length": 547.984375, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.6542850749608413, "frac_reward_zero_std": 0.0, "grad_norm": 0.07518404722213745, "learning_rate": 4.121583518244891e-06, "loss": -0.0003, "num_tokens": 233143810.0, "reward": 4.681684494018555, "reward_std": 0.10513784736394882, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6816844940185547, "rewards/ngram_similarity_reward/std": 0.24526838958263397, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 336.671875, "completions/mean_terminated_length": 336.671875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.6547326023718952, "frac_reward_zero_std": 0.5, "grad_norm": 0.09193456172943115, "learning_rate": 4.120263460688986e-06, "loss": 0.0114, "num_tokens": 233263373.0, "reward": 0.32907330989837646, "reward_std": 0.03988669440150261, "rewards/accuracy_reward/mean": -0.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.8290732502937317, "rewards/ngram_similarity_reward/std": 0.3824120759963989, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1205.0, "completions/max_terminated_length": 1205.0, "completions/mean_length": 498.90625, "completions/mean_terminated_length": 498.90625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.6551801297829493, "frac_reward_zero_std": 0.0, "grad_norm": 0.08146632462739944, "learning_rate": 4.1189426529904905e-06, "loss": 0.0205, "num_tokens": 233429063.0, "reward": 5.3825764656066895, "reward_std": 0.8494433760643005, "rewards/accuracy_reward/mean": 4.828125, "rewards/accuracy_reward/std": 1.9359153509140015, "rewards/ngram_similarity_reward/mean": 0.5544514656066895, "rewards/ngram_similarity_reward/std": 0.2992057204246521, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 468.3125, "completions/mean_terminated_length": 468.3125, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.6556276571940032, "frac_reward_zero_std": 0.0, "grad_norm": 0.08296996355056763, "learning_rate": 4.117621095872476e-06, "loss": -0.0241, "num_tokens": 233599163.0, "reward": 3.1460554599761963, "reward_std": 0.46030616760253906, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.5523055791854858, "rewards/ngram_similarity_reward/std": 0.18339583277702332, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 451.03125, "completions/mean_terminated_length": 451.03125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.6560751846050571, "frac_reward_zero_std": 0.0, "grad_norm": 0.12082185596227646, "learning_rate": 4.11629879005842e-06, "loss": 0.0199, "num_tokens": 233770557.0, "reward": 4.085703372955322, "reward_std": 0.9208123683929443, "rewards/accuracy_reward/mean": 3.25, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.8357031345367432, "rewards/ngram_similarity_reward/std": 0.2821987569332123, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1053.0, "completions/max_terminated_length": 1053.0, "completions/mean_length": 441.65625, "completions/mean_terminated_length": 441.65625, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.656522712016111, "frac_reward_zero_std": 0.0, "grad_norm": 0.15086643397808075, "learning_rate": 4.114975736272209e-06, "loss": 0.015, "num_tokens": 233971447.0, "reward": 5.505659580230713, "reward_std": 0.8936091065406799, "rewards/accuracy_reward/mean": 4.96875, "rewards/accuracy_reward/std": 1.8427786827087402, "rewards/ngram_similarity_reward/mean": 0.5369095802307129, "rewards/ngram_similarity_reward/std": 0.3775620758533478, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 552.65625, "completions/mean_terminated_length": 504.4193420410156, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.6569702394271649, "frac_reward_zero_std": 0.0, "grad_norm": 0.10218188911676407, "learning_rate": 4.113651935238144e-06, "loss": 0.0136, "num_tokens": 234132161.0, "reward": 2.611851215362549, "reward_std": 0.8556914925575256, "rewards/accuracy_reward/mean": 2.03125, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.5806010961532593, "rewards/ngram_similarity_reward/std": 0.3110412359237671, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 421.53125, "completions/mean_terminated_length": 369.06451416015625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.6574177668382188, "frac_reward_zero_std": 0.0, "grad_norm": 0.16987138986587524, "learning_rate": 4.11232738768093e-06, "loss": -0.0114, "num_tokens": 234330659.0, "reward": 4.307737827301025, "reward_std": 1.6615090370178223, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.6827379465103149, "rewards/ngram_similarity_reward/std": 0.3940820097923279, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 468.40625, "completions/mean_terminated_length": 468.40625, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.6578652942492728, "frac_reward_zero_std": 0.0, "grad_norm": 0.10134927183389664, "learning_rate": 4.111002094325682e-06, "loss": -0.0136, "num_tokens": 234512029.0, "reward": 2.7333059310913086, "reward_std": 1.3934979438781738, "rewards/accuracy_reward/mean": 2.21875, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.5145559310913086, "rewards/ngram_similarity_reward/std": 0.1624448001384735, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 654.875, "completions/mean_terminated_length": 562.0, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.6583128216603267, "frac_reward_zero_std": 0.0, "grad_norm": 0.08241728693246841, "learning_rate": 4.109676055897926e-06, "loss": -0.0989, "num_tokens": 234658613.0, "reward": 6.001367568969727, "reward_std": 0.606982409954071, "rewards/accuracy_reward/mean": 5.3125, "rewards/accuracy_reward/std": 1.0522085428237915, "rewards/ngram_similarity_reward/mean": 0.6888673305511475, "rewards/ngram_similarity_reward/std": 0.1470806747674942, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 466.828125, "completions/mean_terminated_length": 466.828125, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.6587603490713806, "frac_reward_zero_std": 0.25, "grad_norm": 0.10049612820148468, "learning_rate": 4.108349273123593e-06, "loss": -0.0132, "num_tokens": 234791530.0, "reward": 5.122507095336914, "reward_std": 0.6011908650398254, "rewards/accuracy_reward/mean": 4.1875, "rewards/accuracy_reward/std": 2.5, "rewards/ngram_similarity_reward/mean": 0.9350072145462036, "rewards/ngram_similarity_reward/std": 0.2942538559436798, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 797.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 452.265625, "completions/mean_terminated_length": 452.265625, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.6592078764824345, "frac_reward_zero_std": 0.0, "grad_norm": 0.10474356263875961, "learning_rate": 4.1070217467290215e-06, "loss": 0.0213, "num_tokens": 234929131.0, "reward": 3.1596713066101074, "reward_std": 0.05670752003788948, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6596712470054626, "rewards/ngram_similarity_reward/std": 0.20737488567829132, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 652.109375, "completions/mean_terminated_length": 629.952392578125, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.6596554038934884, "frac_reward_zero_std": 0.0, "grad_norm": 0.08270086348056793, "learning_rate": 4.10569347744096e-06, "loss": 0.047, "num_tokens": 235098018.0, "reward": 4.487462997436523, "reward_std": 0.09502571821212769, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.48746317625045776, "rewards/ngram_similarity_reward/std": 0.28049421310424805, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1129.0, "completions/max_terminated_length": 1129.0, "completions/mean_length": 520.828125, "completions/mean_terminated_length": 520.828125, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.6601029313045425, "frac_reward_zero_std": 0.0, "grad_norm": 0.08269716799259186, "learning_rate": 4.10436446598656e-06, "loss": -0.0041, "num_tokens": 235249527.0, "reward": 4.493458271026611, "reward_std": 0.44828683137893677, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.5872083902359009, "rewards/ngram_similarity_reward/std": 0.4154118299484253, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 428.765625, "completions/mean_terminated_length": 428.765625, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.6605504587155964, "frac_reward_zero_std": 0.0, "grad_norm": 0.12616463005542755, "learning_rate": 4.1030347130933815e-06, "loss": 0.0004, "num_tokens": 235413224.0, "reward": 3.0580625534057617, "reward_std": 0.8669981956481934, "rewards/accuracy_reward/mean": 2.3125, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.7455626130104065, "rewards/ngram_similarity_reward/std": 0.3908692002296448, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 454.03125, "completions/mean_terminated_length": 454.03125, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.6609979861266503, "frac_reward_zero_std": 0.0, "grad_norm": 0.0840691477060318, "learning_rate": 4.101704219489389e-06, "loss": -0.0135, "num_tokens": 235542730.0, "reward": 6.504312515258789, "reward_std": 0.21266743540763855, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 1.0043129920959473, "rewards/ngram_similarity_reward/std": 0.26560497283935547, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 510.890625, "completions/mean_terminated_length": 510.890625, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.6614455135377042, "frac_reward_zero_std": 0.0, "grad_norm": 0.09798114001750946, "learning_rate": 4.100372985902955e-06, "loss": -0.014, "num_tokens": 235705171.0, "reward": 4.69714879989624, "reward_std": 0.16241833567619324, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.697148859500885, "rewards/ngram_similarity_reward/std": 0.38769692182540894, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 806.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 504.609375, "completions/mean_terminated_length": 504.609375, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.6618930409487581, "frac_reward_zero_std": 0.0, "grad_norm": 0.09201952069997787, "learning_rate": 4.099041013062854e-06, "loss": 0.0368, "num_tokens": 235850890.0, "reward": 5.716346740722656, "reward_std": 1.29547119140625, "rewards/accuracy_reward/mean": 4.84375, "rewards/accuracy_reward/std": 1.8874586820602417, "rewards/ngram_similarity_reward/mean": 0.8725964426994324, "rewards/ngram_similarity_reward/std": 0.24577252566814423, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 512.921875, "completions/mean_terminated_length": 512.921875, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.662340568359812, "frac_reward_zero_std": 0.0, "grad_norm": 0.09567677229642868, "learning_rate": 4.097708301698266e-06, "loss": 0.0454, "num_tokens": 236003349.0, "reward": 3.1331405639648438, "reward_std": 0.16849875450134277, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6331405639648438, "rewards/ngram_similarity_reward/std": 0.2982363998889923, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1031.0, "completions/max_terminated_length": 1031.0, "completions/mean_length": 431.640625, "completions/mean_terminated_length": 431.640625, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.662788095770866, "frac_reward_zero_std": 0.5, "grad_norm": 0.060426075011491776, "learning_rate": 4.0963748525387774e-06, "loss": -0.0121, "num_tokens": 236150846.0, "reward": 6.335261344909668, "reward_std": 0.8731868267059326, "rewards/accuracy_reward/mean": 5.296875, "rewards/accuracy_reward/std": 1.1433686017990112, "rewards/ngram_similarity_reward/mean": 1.038386583328247, "rewards/ngram_similarity_reward/std": 0.17555567622184753, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 476.546875, "completions/mean_terminated_length": 476.546875, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.6632356231819199, "frac_reward_zero_std": 0.25, "grad_norm": 0.0681709423661232, "learning_rate": 4.095040666314377e-06, "loss": 0.0307, "num_tokens": 236315233.0, "reward": 4.7674665451049805, "reward_std": 0.029874470084905624, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7674665451049805, "rewards/ngram_similarity_reward/std": 0.20821696519851685, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 842.0, "completions/max_terminated_length": 842.0, "completions/mean_length": 472.328125, "completions/mean_terminated_length": 472.328125, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.6636831505929738, "frac_reward_zero_std": 0.0, "grad_norm": 0.0849587619304657, "learning_rate": 4.0937057437554565e-06, "loss": -0.0165, "num_tokens": 236471446.0, "reward": 3.6301369667053223, "reward_std": 0.9095160961151123, "rewards/accuracy_reward/mean": 2.96875, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.661387026309967, "rewards/ngram_similarity_reward/std": 0.2558072805404663, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 447.421875, "completions/mean_terminated_length": 447.421875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.6641306780040277, "frac_reward_zero_std": 0.0, "grad_norm": 0.11404269188642502, "learning_rate": 4.092370085592812e-06, "loss": -0.0398, "num_tokens": 236661297.0, "reward": 0.8565977215766907, "reward_std": 0.9256554841995239, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 2.0098345279693604, "rewards/ngram_similarity_reward/mean": 0.6222226619720459, "rewards/ngram_similarity_reward/std": 0.29372283816337585, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 460.640625, "completions/mean_terminated_length": 460.640625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.6645782054150817, "frac_reward_zero_std": 0.5, "grad_norm": 0.06449608504772186, "learning_rate": 4.091033692557641e-06, "loss": -0.0014, "num_tokens": 236790250.0, "reward": 6.271068572998047, "reward_std": 0.07193907350301743, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.7710685133934021, "rewards/ngram_similarity_reward/std": 0.3435302674770355, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 527.671875, "completions/mean_terminated_length": 527.671875, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.6650257328261356, "frac_reward_zero_std": 0.0, "grad_norm": 0.080342136323452, "learning_rate": 4.089696565381543e-06, "loss": -0.0054, "num_tokens": 236930501.0, "reward": 1.0850721597671509, "reward_std": 1.204647183418274, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 2.195775270462036, "rewards/ngram_similarity_reward/mean": 0.6475721597671509, "rewards/ngram_similarity_reward/std": 0.19706475734710693, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 424.796875, "completions/mean_terminated_length": 424.796875, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.6654732602371896, "frac_reward_zero_std": 0.25, "grad_norm": 0.08424574881792068, "learning_rate": 4.088358704796522e-06, "loss": 0.0063, "num_tokens": 237080952.0, "reward": 5.506689071655273, "reward_std": 0.8653192520141602, "rewards/accuracy_reward/mean": 4.75, "rewards/accuracy_reward/std": 2.0, "rewards/ngram_similarity_reward/mean": 0.7566890716552734, "rewards/ngram_similarity_reward/std": 0.2732362449169159, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 601.34375, "completions/mean_terminated_length": 504.9000244140625, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.6659207876482435, "frac_reward_zero_std": 0.25, "grad_norm": 0.08288619667291641, "learning_rate": 4.087020111534981e-06, "loss": 0.0117, "num_tokens": 237208430.0, "reward": 6.119905948638916, "reward_std": 0.5858101844787598, "rewards/accuracy_reward/mean": 5.3125, "rewards/accuracy_reward/std": 1.0522085428237915, "rewards/ngram_similarity_reward/mean": 0.8074060082435608, "rewards/ngram_similarity_reward/std": 0.22066588699817657, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 443.5625, "completions/mean_terminated_length": 443.5625, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.6663683150592974, "frac_reward_zero_std": 0.0, "grad_norm": 0.11840573698282242, "learning_rate": 4.085680786329725e-06, "loss": 0.012, "num_tokens": 237426818.0, "reward": 3.0602216720581055, "reward_std": 0.8027669787406921, "rewards/accuracy_reward/mean": 2.21875, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.841471791267395, "rewards/ngram_similarity_reward/std": 0.22687605023384094, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 870.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 590.65625, "completions/mean_terminated_length": 590.65625, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.6668158424703513, "frac_reward_zero_std": 0.25, "grad_norm": 0.05773892626166344, "learning_rate": 4.084340729913959e-06, "loss": 0.0107, "num_tokens": 237584524.0, "reward": 5.3110551834106445, "reward_std": 0.8410439491271973, "rewards/accuracy_reward/mean": 4.609375, "rewards/accuracy_reward/std": 2.2262303829193115, "rewards/ngram_similarity_reward/mean": 0.7016801238059998, "rewards/ngram_similarity_reward/std": 0.3378947675228119, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 502.65625, "completions/mean_terminated_length": 478.12701416015625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.6672633698814052, "frac_reward_zero_std": 0.25, "grad_norm": 0.0763397067785263, "learning_rate": 4.082999943021291e-06, "loss": -0.0176, "num_tokens": 237762454.0, "reward": 3.3543882369995117, "reward_std": 0.6753823757171631, "rewards/accuracy_reward/mean": 2.78125, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.5731382966041565, "rewards/ngram_similarity_reward/std": 0.4696844816207886, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 539.0625, "completions/mean_terminated_length": 515.1111450195312, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.6677108972924591, "frac_reward_zero_std": 0.0, "grad_norm": 0.10276427865028381, "learning_rate": 4.081658426385725e-06, "loss": -0.0187, "num_tokens": 237909962.0, "reward": 4.054726600646973, "reward_std": 1.9290709495544434, "rewards/accuracy_reward/mean": 3.15625, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.8984768390655518, "rewards/ngram_similarity_reward/std": 0.2063111960887909, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 428.03125, "completions/mean_terminated_length": 428.03125, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.668158424703513, "frac_reward_zero_std": 0.25, "grad_norm": 0.08124004304409027, "learning_rate": 4.080316180741667e-06, "loss": -0.001, "num_tokens": 238071116.0, "reward": 6.309140205383301, "reward_std": 0.09563571214675903, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.8091399669647217, "rewards/ngram_similarity_reward/std": 0.2800677418708801, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 588.9375, "completions/mean_terminated_length": 565.77783203125, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.668605952114567, "frac_reward_zero_std": 0.0, "grad_norm": 0.09261708706617355, "learning_rate": 4.0789732068239215e-06, "loss": -0.0255, "num_tokens": 238219688.0, "reward": 6.038747787475586, "reward_std": 1.0299474000930786, "rewards/accuracy_reward/mean": 5.125, "rewards/accuracy_reward/std": 1.4638501405715942, "rewards/ngram_similarity_reward/mean": 0.9137482047080994, "rewards/ngram_similarity_reward/std": 0.23467347025871277, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 531.8125, "completions/mean_terminated_length": 531.8125, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.669053479525621, "frac_reward_zero_std": 0.0, "grad_norm": 0.08115254342556, "learning_rate": 4.07762950536769e-06, "loss": -0.0014, "num_tokens": 238362236.0, "reward": 3.2857232093811035, "reward_std": 0.10580325871706009, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.7857229113578796, "rewards/ngram_similarity_reward/std": 0.2741377651691437, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 566.6875, "completions/mean_terminated_length": 566.6875, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.6695010069366749, "frac_reward_zero_std": 0.0, "grad_norm": 0.08066169172525406, "learning_rate": 4.076285077108576e-06, "loss": 0.0104, "num_tokens": 238497832.0, "reward": 4.507000923156738, "reward_std": 0.5692435503005981, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.6945005655288696, "rewards/ngram_similarity_reward/std": 0.32864856719970703, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 530.578125, "completions/mean_terminated_length": 530.578125, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.6699485343477288, "frac_reward_zero_std": 0.0, "grad_norm": 0.07869521528482437, "learning_rate": 4.0749399227825775e-06, "loss": -0.0118, "num_tokens": 238662765.0, "reward": 4.468442440032959, "reward_std": 1.3427462577819824, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.5621925592422485, "rewards/ngram_similarity_reward/std": 0.23733443021774292, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 522.421875, "completions/mean_terminated_length": 498.2063903808594, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.6703960617587827, "frac_reward_zero_std": 0.0, "grad_norm": 0.08195405453443527, "learning_rate": 4.073594043126093e-06, "loss": -0.0238, "num_tokens": 238830936.0, "reward": 3.77846622467041, "reward_std": 1.9166414737701416, "rewards/accuracy_reward/mean": 3.15625, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.6222162246704102, "rewards/ngram_similarity_reward/std": 0.24115103483200073, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1221.0, "completions/max_terminated_length": 1221.0, "completions/mean_length": 496.796875, "completions/mean_terminated_length": 496.796875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.6708435891698367, "frac_reward_zero_std": 0.0, "grad_norm": 0.10084085911512375, "learning_rate": 4.0722474388759135e-06, "loss": -0.0317, "num_tokens": 238995371.0, "reward": 4.701803207397461, "reward_std": 0.5719506144523621, "rewards/accuracy_reward/mean": 3.875, "rewards/accuracy_reward/std": 2.7284510135650635, "rewards/ngram_similarity_reward/mean": 0.8268033266067505, "rewards/ngram_similarity_reward/std": 0.2679290175437927, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1388.0, "completions/mean_length": 595.65625, "completions/mean_terminated_length": 572.6032104492188, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.6712911165808906, "frac_reward_zero_std": 0.0, "grad_norm": 0.10507504642009735, "learning_rate": 4.0709001107692305e-06, "loss": 0.0258, "num_tokens": 239127797.0, "reward": 4.470786094665527, "reward_std": 0.19319896399974823, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.4707862436771393, "rewards/ngram_similarity_reward/std": 0.3042636215686798, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 457.03125, "completions/mean_terminated_length": 457.03125, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.6717386439919445, "frac_reward_zero_std": 0.25, "grad_norm": 0.07408589124679565, "learning_rate": 4.069552059543632e-06, "loss": 0.0489, "num_tokens": 239313671.0, "reward": 0.45077264308929443, "reward_std": 0.7991616725921631, "rewards/accuracy_reward/mean": -0.03125, "rewards/accuracy_reward/std": 1.3209995031356812, "rewards/ngram_similarity_reward/mean": 0.48202264308929443, "rewards/ngram_similarity_reward/std": 0.15846078097820282, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 423.234375, "completions/mean_terminated_length": 423.234375, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.6721861714029984, "frac_reward_zero_std": 0.0, "grad_norm": 0.10048261284828186, "learning_rate": 4.068203285937101e-06, "loss": 0.011, "num_tokens": 239434678.0, "reward": 4.453929901123047, "reward_std": 0.5523361563682556, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.6414296627044678, "rewards/ngram_similarity_reward/std": 0.3741045892238617, "step": 1502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 498.53125, "completions/mean_terminated_length": 498.53125, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.6726336988140523, "frac_reward_zero_std": 0.0, "grad_norm": 0.11623698472976685, "learning_rate": 4.066853790688013e-06, "loss": -0.0322, "num_tokens": 239666312.0, "reward": 2.941746234893799, "reward_std": 0.8182100057601929, "rewards/accuracy_reward/mean": 2.25, "rewards/accuracy_reward/std": 3.295017957687378, "rewards/ngram_similarity_reward/mean": 0.6917462944984436, "rewards/ngram_similarity_reward/std": 0.2632776200771332, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 556.203125, "completions/mean_terminated_length": 556.203125, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.6730812262251062, "frac_reward_zero_std": 0.0, "grad_norm": 0.15846550464630127, "learning_rate": 4.065503574535143e-06, "loss": 0.0191, "num_tokens": 239884917.0, "reward": 2.8816068172454834, "reward_std": 0.4729578495025635, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.4753568470478058, "rewards/ngram_similarity_reward/std": 0.20575736463069916, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 636.21875, "completions/mean_terminated_length": 613.8095703125, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.6735287536361602, "frac_reward_zero_std": 0.0, "grad_norm": 0.07791364938020706, "learning_rate": 4.064152638217657e-06, "loss": 0.0034, "num_tokens": 240057539.0, "reward": 4.2021660804748535, "reward_std": 0.5542839765548706, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.3896661102771759, "rewards/ngram_similarity_reward/std": 0.29823753237724304, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 578.484375, "completions/mean_terminated_length": 555.1587524414062, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.6739762810472142, "frac_reward_zero_std": 0.0, "grad_norm": 0.08407315611839294, "learning_rate": 4.062800982475121e-06, "loss": 0.0153, "num_tokens": 240206690.0, "reward": 0.8947778940200806, "reward_std": 0.812238335609436, "rewards/accuracy_reward/mean": 0.34375, "rewards/accuracy_reward/std": 2.102294683456421, "rewards/ngram_similarity_reward/mean": 0.5510278940200806, "rewards/ngram_similarity_reward/std": 0.34097588062286377, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 927.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 544.359375, "completions/mean_terminated_length": 544.359375, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.6744238084582681, "frac_reward_zero_std": 0.0, "grad_norm": 0.10727450996637344, "learning_rate": 4.061448608047487e-06, "loss": 0.0123, "num_tokens": 240400585.0, "reward": 4.1962361335754395, "reward_std": 1.6066899299621582, "rewards/accuracy_reward/mean": 3.421875, "rewards/accuracy_reward/std": 2.896657705307007, "rewards/ngram_similarity_reward/mean": 0.7743610739707947, "rewards/ngram_similarity_reward/std": 0.2609521448612213, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1208.0, "completions/max_terminated_length": 1208.0, "completions/mean_length": 658.203125, "completions/mean_terminated_length": 658.203125, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.674871335869322, "frac_reward_zero_std": 0.0, "grad_norm": 0.09349507093429565, "learning_rate": 4.060095515675107e-06, "loss": 0.0134, "num_tokens": 240552502.0, "reward": 2.9654061794281006, "reward_std": 1.4760980606079102, "rewards/accuracy_reward/mean": 2.21875, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.7466559410095215, "rewards/ngram_similarity_reward/std": 0.2863253355026245, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 432.96875, "completions/mean_terminated_length": 432.96875, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.6753188632803759, "frac_reward_zero_std": 0.0, "grad_norm": 0.09913067519664764, "learning_rate": 4.058741706098721e-06, "loss": 0.016, "num_tokens": 240693380.0, "reward": 6.0691680908203125, "reward_std": 0.6833319067955017, "rewards/accuracy_reward/mean": 5.21875, "rewards/accuracy_reward/std": 1.2782522439956665, "rewards/ngram_similarity_reward/mean": 0.8504183292388916, "rewards/ngram_similarity_reward/std": 0.4402073323726654, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 428.921875, "completions/mean_terminated_length": 428.921875, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.6757663906914299, "frac_reward_zero_std": 0.0, "grad_norm": 0.12266264855861664, "learning_rate": 4.057387180059465e-06, "loss": -0.0191, "num_tokens": 240841247.0, "reward": 3.98134446144104, "reward_std": 0.8376370072364807, "rewards/accuracy_reward/mean": 3.15625, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.8250945806503296, "rewards/ngram_similarity_reward/std": 0.29401248693466187, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 823.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 595.375, "completions/mean_terminated_length": 595.375, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 0.6762139181024838, "frac_reward_zero_std": 0.0, "grad_norm": 0.07436852157115936, "learning_rate": 4.0560319382988665e-06, "loss": 0.0041, "num_tokens": 241011655.0, "reward": 2.9112842082977295, "reward_std": 0.09545248746871948, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.411283940076828, "rewards/ngram_similarity_reward/std": 0.19912941753864288, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 518.71875, "completions/mean_terminated_length": 518.71875, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.6766614455135377, "frac_reward_zero_std": 0.0, "grad_norm": 0.1377800852060318, "learning_rate": 4.054675981558845e-06, "loss": -0.0225, "num_tokens": 241212437.0, "reward": 4.467209815979004, "reward_std": 0.8089848756790161, "rewards/accuracy_reward/mean": 3.59375, "rewards/accuracy_reward/std": 2.854785919189453, "rewards/ngram_similarity_reward/mean": 0.8734598159790039, "rewards/ngram_similarity_reward/std": 0.2185596376657486, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 515.453125, "completions/mean_terminated_length": 515.453125, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.6771089729245916, "frac_reward_zero_std": 0.0, "grad_norm": 0.08017116785049438, "learning_rate": 4.053319310581709e-06, "loss": -0.0084, "num_tokens": 241396898.0, "reward": 1.6002922058105469, "reward_std": 0.4597763121128082, "rewards/accuracy_reward/mean": 1.09375, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.5065422654151917, "rewards/ngram_similarity_reward/std": 0.19871973991394043, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 395.546875, "completions/mean_terminated_length": 395.546875, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.6775565003356455, "frac_reward_zero_std": 0.25, "grad_norm": 0.11000040918588638, "learning_rate": 4.051961926110161e-06, "loss": -0.043, "num_tokens": 241563813.0, "reward": 6.277076721191406, "reward_std": 0.17032390832901, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.7770768404006958, "rewards/ngram_similarity_reward/std": 0.39133378863334656, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1112.0, "completions/max_terminated_length": 1112.0, "completions/mean_length": 488.8125, "completions/mean_terminated_length": 488.8125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.6780040277466994, "frac_reward_zero_std": 0.25, "grad_norm": 0.15393635630607605, "learning_rate": 4.0506038288872955e-06, "loss": 0.0103, "num_tokens": 241792089.0, "reward": 4.695099830627441, "reward_std": 0.04499488323926926, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6950998306274414, "rewards/ngram_similarity_reward/std": 0.42729097604751587, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 474.625, "completions/mean_terminated_length": 474.625, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.6784515551577535, "frac_reward_zero_std": 0.0, "grad_norm": 0.17051804065704346, "learning_rate": 4.049245019656592e-06, "loss": -0.0037, "num_tokens": 242007793.0, "reward": 4.716339111328125, "reward_std": 0.4745529294013977, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.8100893497467041, "rewards/ngram_similarity_reward/std": 0.2591150999069214, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 902.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 492.71875, "completions/mean_terminated_length": 492.71875, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.6788990825688074, "frac_reward_zero_std": 0.0, "grad_norm": 0.09130734205245972, "learning_rate": 4.047885499161923e-06, "loss": 0.022, "num_tokens": 242175439.0, "reward": 4.156372547149658, "reward_std": 0.7526333332061768, "rewards/accuracy_reward/mean": 3.53125, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.6251224279403687, "rewards/ngram_similarity_reward/std": 0.2308683544397354, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 519.9375, "completions/mean_terminated_length": 495.68255615234375, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.6793466099798613, "frac_reward_zero_std": 0.0, "grad_norm": 0.11831668764352798, "learning_rate": 4.0465252681475505e-06, "loss": -0.036, "num_tokens": 242350523.0, "reward": 1.7951796054840088, "reward_std": 0.5443094968795776, "rewards/accuracy_reward/mean": 1.09375, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.7014296650886536, "rewards/ngram_similarity_reward/std": 0.31123247742652893, "step": 1518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 438.765625, "completions/mean_terminated_length": 438.765625, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.6797941373909152, "frac_reward_zero_std": 0.0, "grad_norm": 0.10574766993522644, "learning_rate": 4.0451643273581235e-06, "loss": 0.0051, "num_tokens": 242480716.0, "reward": 4.843680381774902, "reward_std": 0.9988704919815063, "rewards/accuracy_reward/mean": 4.09375, "rewards/accuracy_reward/std": 2.5617377758026123, "rewards/ngram_similarity_reward/mean": 0.749930202960968, "rewards/ngram_similarity_reward/std": 0.2892991900444031, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1395.0, "completions/max_terminated_length": 1395.0, "completions/mean_length": 567.921875, "completions/mean_terminated_length": 567.921875, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.6802416648019691, "frac_reward_zero_std": 0.25, "grad_norm": 0.07118367403745651, "learning_rate": 4.043802677538682e-06, "loss": 0.03, "num_tokens": 242637063.0, "reward": 1.5142168998718262, "reward_std": 0.10006881505250931, "rewards/accuracy_reward/mean": 0.984375, "rewards/accuracy_reward/std": 2.6306629180908203, "rewards/ngram_similarity_reward/mean": 0.5298418998718262, "rewards/ngram_similarity_reward/std": 0.14635279774665833, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 479.34375, "completions/mean_terminated_length": 479.34375, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.680689192213023, "frac_reward_zero_std": 0.0, "grad_norm": 0.10519301891326904, "learning_rate": 4.042440319434653e-06, "loss": 0.0257, "num_tokens": 242758237.0, "reward": 4.176997661590576, "reward_std": 0.9178718328475952, "rewards/accuracy_reward/mean": 3.53125, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.6457475423812866, "rewards/ngram_similarity_reward/std": 0.2892041802406311, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 532.0, "completions/mean_terminated_length": 532.0, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.681136719624077, "frac_reward_zero_std": 0.0, "grad_norm": 0.08786152303218842, "learning_rate": 4.041077253791853e-06, "loss": 0.0127, "num_tokens": 242908973.0, "reward": 3.625276565551758, "reward_std": 0.7645278573036194, "rewards/accuracy_reward/mean": 2.875, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.7502766847610474, "rewards/ngram_similarity_reward/std": 0.33395424485206604, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 564.578125, "completions/mean_terminated_length": 541.0317993164062, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.6815842470351309, "frac_reward_zero_std": 0.0, "grad_norm": 0.10636013746261597, "learning_rate": 4.03971348135648e-06, "loss": 0.0508, "num_tokens": 243068882.0, "reward": 2.9963223934173584, "reward_std": 1.0687932968139648, "rewards/accuracy_reward/mean": 2.3125, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.6838223934173584, "rewards/ngram_similarity_reward/std": 0.32443106174468994, "step": 1523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 499.46875, "completions/mean_terminated_length": 499.46875, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.6820317744461848, "frac_reward_zero_std": 0.25, "grad_norm": 0.08839923143386841, "learning_rate": 4.038349002875127e-06, "loss": -0.0321, "num_tokens": 243221936.0, "reward": 3.3763747215270996, "reward_std": 0.6003474593162537, "rewards/accuracy_reward/mean": 2.6875, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.6888746023178101, "rewards/ngram_similarity_reward/std": 0.25925931334495544, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 465.359375, "completions/mean_terminated_length": 465.359375, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.6824793018572387, "frac_reward_zero_std": 0.0, "grad_norm": 0.0927591472864151, "learning_rate": 4.036983819094769e-06, "loss": 0.0248, "num_tokens": 243377463.0, "reward": 4.6241655349731445, "reward_std": 0.5007472038269043, "rewards/accuracy_reward/mean": 3.890625, "rewards/accuracy_reward/std": 2.6998953819274902, "rewards/ngram_similarity_reward/mean": 0.7335406541824341, "rewards/ngram_similarity_reward/std": 0.30895963311195374, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 839.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 522.5625, "completions/mean_terminated_length": 522.5625, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.6829268292682927, "frac_reward_zero_std": 0.25, "grad_norm": 0.09703091531991959, "learning_rate": 4.0356179307627654e-06, "loss": 0.0057, "num_tokens": 243560683.0, "reward": 4.5334930419921875, "reward_std": 0.6946358680725098, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.8147426247596741, "rewards/ngram_similarity_reward/std": 0.26535749435424805, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 489.296875, "completions/mean_terminated_length": 464.5555725097656, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.6833743566793467, "frac_reward_zero_std": 0.0, "grad_norm": 0.1149318590760231, "learning_rate": 4.034251338626867e-06, "loss": -0.0194, "num_tokens": 243772062.0, "reward": 1.503861665725708, "reward_std": 0.12548813223838806, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5038615465164185, "rewards/ngram_similarity_reward/std": 0.2763817310333252, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1032.0, "completions/max_terminated_length": 1032.0, "completions/mean_length": 519.0, "completions/mean_terminated_length": 519.0, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.6838218840904006, "frac_reward_zero_std": 0.0, "grad_norm": 0.08250933140516281, "learning_rate": 4.032884043435204e-06, "loss": 0.0276, "num_tokens": 243936014.0, "reward": 2.9738001823425293, "reward_std": 0.1276642084121704, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.4738001227378845, "rewards/ngram_similarity_reward/std": 0.30533042550086975, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 569.0625, "completions/mean_terminated_length": 569.0625, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.6842694115014545, "frac_reward_zero_std": 0.0, "grad_norm": 0.08203459531068802, "learning_rate": 4.031516045936295e-06, "loss": 0.0132, "num_tokens": 244071554.0, "reward": 3.4614744186401367, "reward_std": 1.5858569145202637, "rewards/accuracy_reward/mean": 2.84375, "rewards/accuracy_reward/std": 3.0405657291412354, "rewards/ngram_similarity_reward/mean": 0.6177245378494263, "rewards/ngram_similarity_reward/std": 0.25231122970581055, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 457.59375, "completions/mean_terminated_length": 457.59375, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.6847169389125084, "frac_reward_zero_std": 0.0, "grad_norm": 0.11150728911161423, "learning_rate": 4.030147346879042e-06, "loss": -0.0276, "num_tokens": 244246120.0, "reward": 3.9611706733703613, "reward_std": 0.8779897689819336, "rewards/accuracy_reward/mean": 3.34375, "rewards/accuracy_reward/std": 2.9016621112823486, "rewards/ngram_similarity_reward/mean": 0.6174206733703613, "rewards/ngram_similarity_reward/std": 0.2506239712238312, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 501.078125, "completions/mean_terminated_length": 501.078125, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.6851644663235623, "frac_reward_zero_std": 0.0, "grad_norm": 0.08861125260591507, "learning_rate": 4.028777947012732e-06, "loss": 0.0065, "num_tokens": 244404237.0, "reward": 3.6665449142456055, "reward_std": 1.2334115505218506, "rewards/accuracy_reward/mean": 3.25, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.41654497385025024, "rewards/ngram_similarity_reward/std": 0.2642189562320709, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 480.46875, "completions/mean_terminated_length": 480.46875, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.6856119937346162, "frac_reward_zero_std": 0.0, "grad_norm": 0.08565875142812729, "learning_rate": 4.027407847087032e-06, "loss": 0.0087, "num_tokens": 244568651.0, "reward": 3.4153053760528564, "reward_std": 0.43495145440101624, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.8215552568435669, "rewards/ngram_similarity_reward/std": 0.18351885676383972, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 867.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 512.484375, "completions/mean_terminated_length": 512.484375, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.6860595211456701, "frac_reward_zero_std": 0.0, "grad_norm": 0.0879283994436264, "learning_rate": 4.0260370478519986e-06, "loss": -0.0244, "num_tokens": 244779178.0, "reward": 4.797738075256348, "reward_std": 0.10125033557415009, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7977379560470581, "rewards/ngram_similarity_reward/std": 0.20789459347724915, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 459.265625, "completions/mean_terminated_length": 459.265625, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.6865070485567241, "frac_reward_zero_std": 0.25, "grad_norm": 0.10340742766857147, "learning_rate": 4.024665550058065e-06, "loss": -0.0148, "num_tokens": 244986763.0, "reward": 4.677967071533203, "reward_std": 0.09323635697364807, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6779671311378479, "rewards/ngram_similarity_reward/std": 0.2466803640127182, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 550.515625, "completions/mean_terminated_length": 450.683349609375, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.686954575967778, "frac_reward_zero_std": 0.0, "grad_norm": 0.10321526229381561, "learning_rate": 4.023293354456051e-06, "loss": 0.0764, "num_tokens": 245177868.0, "reward": 4.347613334655762, "reward_std": 1.1923820972442627, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.6288636922836304, "rewards/ngram_similarity_reward/std": 0.3350275158882141, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 465.953125, "completions/mean_terminated_length": 440.84130859375, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.687402103378832, "frac_reward_zero_std": 0.0, "grad_norm": 0.10282690078020096, "learning_rate": 4.021920461797157e-06, "loss": 0.0222, "num_tokens": 245346457.0, "reward": 4.505459785461426, "reward_std": 0.4533050060272217, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.5992101430892944, "rewards/ngram_similarity_reward/std": 0.13468407094478607, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 520.5, "completions/mean_terminated_length": 496.2539978027344, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.6878496307898859, "frac_reward_zero_std": 0.0, "grad_norm": 0.08532318472862244, "learning_rate": 4.020546872832965e-06, "loss": -0.024, "num_tokens": 245493289.0, "reward": 4.433938026428223, "reward_std": 0.6108426451683044, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.6214381456375122, "rewards/ngram_similarity_reward/std": 0.1984623521566391, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 430.21875, "completions/mean_terminated_length": 430.21875, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.6882971582009398, "frac_reward_zero_std": 0.25, "grad_norm": 0.10850238800048828, "learning_rate": 4.019172588315436e-06, "loss": 0.0154, "num_tokens": 245625319.0, "reward": 4.919859886169434, "reward_std": 0.453775554895401, "rewards/accuracy_reward/mean": 4.078125, "rewards/accuracy_reward/std": 2.593059778213501, "rewards/ngram_similarity_reward/mean": 0.8417348861694336, "rewards/ngram_similarity_reward/std": 0.335245281457901, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1050.0, "completions/max_terminated_length": 1050.0, "completions/mean_length": 518.5, "completions/mean_terminated_length": 518.5, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.6887446856119938, "frac_reward_zero_std": 0.0, "grad_norm": 0.08004138618707657, "learning_rate": 4.017797608996918e-06, "loss": 0.0577, "num_tokens": 245760711.0, "reward": 6.246609687805176, "reward_std": 0.09443674236536026, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.7466100454330444, "rewards/ngram_similarity_reward/std": 0.18616412580013275, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 583.765625, "completions/mean_terminated_length": 583.765625, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.6891922130230477, "frac_reward_zero_std": 0.0, "grad_norm": 0.08965511620044708, "learning_rate": 4.016421935630131e-06, "loss": 0.0032, "num_tokens": 245944392.0, "reward": 4.213710308074951, "reward_std": 1.1697884798049927, "rewards/accuracy_reward/mean": 3.59375, "rewards/accuracy_reward/std": 2.854785919189453, "rewards/ngram_similarity_reward/mean": 0.6199604272842407, "rewards/ngram_similarity_reward/std": 0.3590463697910309, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 735.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 438.359375, "completions/mean_terminated_length": 438.359375, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.6896397404341016, "frac_reward_zero_std": 0.0, "grad_norm": 0.11437889188528061, "learning_rate": 4.0150455689681805e-06, "loss": -0.0005, "num_tokens": 246081599.0, "reward": 3.605909824371338, "reward_std": 1.5501439571380615, "rewards/accuracy_reward/mean": 2.8125, "rewards/accuracy_reward/std": 3.080275297164917, "rewards/ngram_similarity_reward/mean": 0.7934097647666931, "rewards/ngram_similarity_reward/std": 0.28618934750556946, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 806.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 478.21875, "completions/mean_terminated_length": 478.21875, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.6900872678451555, "frac_reward_zero_std": 0.0, "grad_norm": 0.07439448684453964, "learning_rate": 4.013668509764552e-06, "loss": -0.0131, "num_tokens": 246252285.0, "reward": 6.477996349334717, "reward_std": 0.10430392622947693, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.9779964089393616, "rewards/ngram_similarity_reward/std": 0.19902634620666504, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 442.765625, "completions/mean_terminated_length": 442.765625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.6905347952562094, "frac_reward_zero_std": 0.25, "grad_norm": 0.09308404475450516, "learning_rate": 4.012290758773106e-06, "loss": 0.0194, "num_tokens": 246411934.0, "reward": 2.091059446334839, "reward_std": 1.5338801145553589, "rewards/accuracy_reward/mean": 1.1875, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.9035593271255493, "rewards/ngram_similarity_reward/std": 0.14946863055229187, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 430.171875, "completions/mean_terminated_length": 430.171875, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.6909823226672633, "frac_reward_zero_std": 0.25, "grad_norm": 0.09399006515741348, "learning_rate": 4.010912316748085e-06, "loss": -0.0073, "num_tokens": 246566153.0, "reward": 4.816415786743164, "reward_std": 0.13335630297660828, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.8164159655570984, "rewards/ngram_similarity_reward/std": 0.3032083511352539, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 537.8125, "completions/mean_terminated_length": 537.8125, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.6914298500783173, "frac_reward_zero_std": 0.25, "grad_norm": 0.0755753219127655, "learning_rate": 4.009533184444111e-06, "loss": -0.016, "num_tokens": 246769197.0, "reward": 3.2236671447753906, "reward_std": 0.09550964832305908, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.723667323589325, "rewards/ngram_similarity_reward/std": 0.3105979263782501, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 501.546875, "completions/mean_terminated_length": 501.546875, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.6918773774893712, "frac_reward_zero_std": 0.25, "grad_norm": 0.07646788656711578, "learning_rate": 4.008153362616179e-06, "loss": 0.0118, "num_tokens": 246949312.0, "reward": 3.505671977996826, "reward_std": 1.1457102298736572, "rewards/accuracy_reward/mean": 2.765625, "rewards/accuracy_reward/std": 3.0302298069000244, "rewards/ngram_similarity_reward/mean": 0.7400468587875366, "rewards/ngram_similarity_reward/std": 0.15715715289115906, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 828.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 508.1875, "completions/mean_terminated_length": 508.1875, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.6923249049004252, "frac_reward_zero_std": 0.25, "grad_norm": 0.11373983323574066, "learning_rate": 4.006772852019664e-06, "loss": -0.002, "num_tokens": 247099228.0, "reward": 1.571900486946106, "reward_std": 0.07764595746994019, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5719004273414612, "rewards/ngram_similarity_reward/std": 0.2418157160282135, "step": 1547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 511.453125, "completions/mean_terminated_length": 511.453125, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.6927724323114791, "frac_reward_zero_std": 0.0, "grad_norm": 0.09819337725639343, "learning_rate": 4.0053916534103205e-06, "loss": 0.0368, "num_tokens": 247300873.0, "reward": 3.8611645698547363, "reward_std": 0.7926970720291138, "rewards/accuracy_reward/mean": 3.25, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.6111645102500916, "rewards/ngram_similarity_reward/std": 0.3728707730770111, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 515.5, "completions/mean_terminated_length": 515.5, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.693219959722533, "frac_reward_zero_std": 0.0, "grad_norm": 0.08585703372955322, "learning_rate": 4.004009767544276e-06, "loss": -0.0004, "num_tokens": 247444361.0, "reward": 3.245168924331665, "reward_std": 0.12304390221834183, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.745168924331665, "rewards/ngram_similarity_reward/std": 0.1642819494009018, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 536.3125, "completions/mean_terminated_length": 512.3175048828125, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.693667487133587, "frac_reward_zero_std": 0.0, "grad_norm": 0.08912550657987595, "learning_rate": 4.002627195178037e-06, "loss": 0.002, "num_tokens": 247592973.0, "reward": 3.1012842655181885, "reward_std": 0.11610934138298035, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.601284384727478, "rewards/ngram_similarity_reward/std": 0.3811487853527069, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 352.1875, "completions/mean_terminated_length": 352.1875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.6941150145446409, "frac_reward_zero_std": 0.25, "grad_norm": 0.11063137650489807, "learning_rate": 4.001243937068482e-06, "loss": -0.0079, "num_tokens": 247728553.0, "reward": 6.3173675537109375, "reward_std": 0.6602230072021484, "rewards/accuracy_reward/mean": 5.3125, "rewards/accuracy_reward/std": 1.0522085428237915, "rewards/ngram_similarity_reward/mean": 1.0048675537109375, "rewards/ngram_similarity_reward/std": 0.2490445375442505, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 482.28125, "completions/mean_terminated_length": 482.28125, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.6945625419556948, "frac_reward_zero_std": 0.0, "grad_norm": 0.15815483033657074, "learning_rate": 3.999859993972871e-06, "loss": -0.0043, "num_tokens": 247999067.0, "reward": 3.154822587966919, "reward_std": 0.07147978246212006, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6548227071762085, "rewards/ngram_similarity_reward/std": 0.3912094235420227, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 902.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 481.359375, "completions/mean_terminated_length": 481.359375, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.6950100693667487, "frac_reward_zero_std": 0.25, "grad_norm": 0.0771925300359726, "learning_rate": 3.998475366648832e-06, "loss": 0.0053, "num_tokens": 248126306.0, "reward": 5.524537086486816, "reward_std": 0.746326744556427, "rewards/accuracy_reward/mean": 5.03125, "rewards/accuracy_reward/std": 1.6229382753372192, "rewards/ngram_similarity_reward/mean": 0.49328747391700745, "rewards/ngram_similarity_reward/std": 0.16534650325775146, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 528.5, "completions/mean_terminated_length": 528.5, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.6954575967778026, "frac_reward_zero_std": 0.0, "grad_norm": 0.08068442344665527, "learning_rate": 3.9970900558543744e-06, "loss": 0.0076, "num_tokens": 248270066.0, "reward": 3.2489542961120605, "reward_std": 2.6484427452087402, "rewards/accuracy_reward/mean": 2.453125, "rewards/accuracy_reward/std": 3.077979803085327, "rewards/ngram_similarity_reward/mean": 0.7958290576934814, "rewards/ngram_similarity_reward/std": 0.2966296970844269, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 867.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 559.140625, "completions/mean_terminated_length": 559.140625, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.6959051241888565, "frac_reward_zero_std": 0.0, "grad_norm": 0.07420410960912704, "learning_rate": 3.995704062347874e-06, "loss": 0.0251, "num_tokens": 248448299.0, "reward": 4.802103519439697, "reward_std": 0.5186323523521423, "rewards/accuracy_reward/mean": 4.09375, "rewards/accuracy_reward/std": 2.5617377758026123, "rewards/ngram_similarity_reward/mean": 0.7083532810211182, "rewards/ngram_similarity_reward/std": 0.2901707589626312, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 637.265625, "completions/mean_terminated_length": 637.265625, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 0.6963526515999104, "frac_reward_zero_std": 0.0, "grad_norm": 0.06902759522199631, "learning_rate": 3.994317386888089e-06, "loss": 0.0283, "num_tokens": 248630572.0, "reward": 2.794793128967285, "reward_std": 0.7708073258399963, "rewards/accuracy_reward/mean": 2.125, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.6697932481765747, "rewards/ngram_similarity_reward/std": 0.3138458728790283, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 742.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 497.765625, "completions/mean_terminated_length": 497.765625, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.6968001790109645, "frac_reward_zero_std": 0.0, "grad_norm": 0.09973353892564774, "learning_rate": 3.992930030234144e-06, "loss": -0.0031, "num_tokens": 248775325.0, "reward": 0.4560176730155945, "reward_std": 1.1246377229690552, "rewards/accuracy_reward/mean": -0.03125, "rewards/accuracy_reward/std": 1.6229382753372192, "rewards/ngram_similarity_reward/mean": 0.4872676730155945, "rewards/ngram_similarity_reward/std": 0.1638980507850647, "step": 1557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 475.453125, "completions/mean_terminated_length": 475.453125, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.6972477064220184, "frac_reward_zero_std": 0.0, "grad_norm": 0.08650655299425125, "learning_rate": 3.9915419931455414e-06, "loss": 0.0133, "num_tokens": 248951002.0, "reward": 3.812142848968506, "reward_std": 0.8941503167152405, "rewards/accuracy_reward/mean": 2.96875, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.8433928489685059, "rewards/ngram_similarity_reward/std": 0.2216542363166809, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 477.71875, "completions/mean_terminated_length": 477.71875, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.6976952338330723, "frac_reward_zero_std": 0.0, "grad_norm": 0.0839940756559372, "learning_rate": 3.990153276382151e-06, "loss": -0.0006, "num_tokens": 249099256.0, "reward": 4.668988227844238, "reward_std": 0.07187686860561371, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6689878106117249, "rewards/ngram_similarity_reward/std": 0.24075810611248016, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 400.40625, "completions/mean_terminated_length": 400.40625, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.6981427612441262, "frac_reward_zero_std": 0.0, "grad_norm": 0.10552025586366653, "learning_rate": 3.988763880704218e-06, "loss": -0.016, "num_tokens": 249260130.0, "reward": 2.995523452758789, "reward_std": 0.06344757974147797, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.49552327394485474, "rewards/ngram_similarity_reward/std": 0.26851361989974976, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 472.953125, "completions/mean_terminated_length": 447.952392578125, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.6985902886551801, "frac_reward_zero_std": 0.0, "grad_norm": 0.10093589872121811, "learning_rate": 3.987373806872362e-06, "loss": -0.009, "num_tokens": 249401023.0, "reward": 4.924391269683838, "reward_std": 1.5129709243774414, "rewards/accuracy_reward/mean": 4.09375, "rewards/accuracy_reward/std": 2.5617377758026123, "rewards/ngram_similarity_reward/mean": 0.830641508102417, "rewards/ngram_similarity_reward/std": 0.3078822195529938, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 403.46875, "completions/mean_terminated_length": 403.46875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.699037816066234, "frac_reward_zero_std": 0.25, "grad_norm": 0.08264653384685516, "learning_rate": 3.985983055647567e-06, "loss": -0.005, "num_tokens": 249551613.0, "reward": 4.847428798675537, "reward_std": 0.09491054713726044, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.8474288582801819, "rewards/ngram_similarity_reward/std": 0.3070674538612366, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 485.9375, "completions/mean_terminated_length": 485.9375, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.699485343477288, "frac_reward_zero_std": 0.0, "grad_norm": 0.08242128044366837, "learning_rate": 3.984591627791194e-06, "loss": 0.0139, "num_tokens": 249702969.0, "reward": 3.1901519298553467, "reward_std": 0.16040048003196716, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6901519894599915, "rewards/ngram_similarity_reward/std": 0.32409507036209106, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 454.109375, "completions/mean_terminated_length": 454.109375, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.6999328708883419, "frac_reward_zero_std": 0.25, "grad_norm": 0.09580738097429276, "learning_rate": 3.98319952406497e-06, "loss": -0.0167, "num_tokens": 249885008.0, "reward": 4.166101932525635, "reward_std": 1.3613916635513306, "rewards/accuracy_reward/mean": 3.296875, "rewards/accuracy_reward/std": 2.97171688079834, "rewards/ngram_similarity_reward/mean": 0.8692273497581482, "rewards/ngram_similarity_reward/std": 0.3243560492992401, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 524.296875, "completions/mean_terminated_length": 475.1451416015625, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.7003803982993958, "frac_reward_zero_std": 0.0, "grad_norm": 0.09756094217300415, "learning_rate": 3.981806745230995e-06, "loss": -0.1086, "num_tokens": 250025651.0, "reward": 4.522310256958008, "reward_std": 1.0277568101882935, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.8035602569580078, "rewards/ngram_similarity_reward/std": 0.29595068097114563, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/max_terminated_length": 610.0, "completions/mean_length": 446.734375, "completions/mean_terminated_length": 446.734375, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.7008279257104497, "frac_reward_zero_std": 0.0, "grad_norm": 0.08303641527891159, "learning_rate": 3.980413292051737e-06, "loss": 0.0055, "num_tokens": 250157410.0, "reward": 6.100274085998535, "reward_std": 0.6671102046966553, "rewards/accuracy_reward/mean": 5.21875, "rewards/accuracy_reward/std": 1.2782522439956665, "rewards/ngram_similarity_reward/mean": 0.8815240859985352, "rewards/ngram_similarity_reward/std": 0.26843953132629395, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 455.3125, "completions/mean_terminated_length": 455.3125, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.7012754531215037, "frac_reward_zero_std": 0.0, "grad_norm": 0.10217029601335526, "learning_rate": 3.979019165290034e-06, "loss": 0.0239, "num_tokens": 250302742.0, "reward": 5.739469528198242, "reward_std": 0.4549495577812195, "rewards/accuracy_reward/mean": 5.390625, "rewards/accuracy_reward/std": 0.8750000596046448, "rewards/ngram_similarity_reward/mean": 0.34884434938430786, "rewards/ngram_similarity_reward/std": 0.268588125705719, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 549.09375, "completions/mean_terminated_length": 549.09375, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.7017229805325577, "frac_reward_zero_std": 0.0, "grad_norm": 0.1244237869977951, "learning_rate": 3.977624365709093e-06, "loss": -0.0067, "num_tokens": 250482428.0, "reward": 4.415213584899902, "reward_std": 0.19421711564064026, "rewards/accuracy_reward/mean": 3.984375, "rewards/accuracy_reward/std": 2.648702621459961, "rewards/ngram_similarity_reward/mean": 0.4308384656906128, "rewards/ngram_similarity_reward/std": 0.22356411814689636, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 438.59375, "completions/mean_terminated_length": 438.59375, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.7021705079436116, "frac_reward_zero_std": 0.0, "grad_norm": 0.09684059768915176, "learning_rate": 3.9762288940724875e-06, "loss": -0.0028, "num_tokens": 250659778.0, "reward": 5.599760055541992, "reward_std": 0.8724799752235413, "rewards/accuracy_reward/mean": 4.84375, "rewards/accuracy_reward/std": 1.8874586820602417, "rewards/ngram_similarity_reward/mean": 0.7560100555419922, "rewards/ngram_similarity_reward/std": 0.19458892941474915, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 447.90625, "completions/mean_terminated_length": 447.90625, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.7026180353546655, "frac_reward_zero_std": 0.25, "grad_norm": 0.09088005870580673, "learning_rate": 3.974832751144161e-06, "loss": 0.0236, "num_tokens": 250825052.0, "reward": 4.93842887878418, "reward_std": 0.06765494495630264, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.9384286999702454, "rewards/ngram_similarity_reward/std": 0.3072734475135803, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 421.640625, "completions/mean_terminated_length": 421.640625, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.7030655627657194, "frac_reward_zero_std": 0.0, "grad_norm": 0.12094370275735855, "learning_rate": 3.973435937688424e-06, "loss": 0.0044, "num_tokens": 251016517.0, "reward": 4.936328887939453, "reward_std": 0.7063286900520325, "rewards/accuracy_reward/mean": 4.1875, "rewards/accuracy_reward/std": 2.5, "rewards/ngram_similarity_reward/mean": 0.748828649520874, "rewards/ngram_similarity_reward/std": 0.2553362250328064, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 463.921875, "completions/mean_terminated_length": 463.921875, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.7035130901767733, "frac_reward_zero_std": 0.0, "grad_norm": 0.12409054487943649, "learning_rate": 3.972038454469951e-06, "loss": 0.0217, "num_tokens": 251184816.0, "reward": 2.1978726387023926, "reward_std": 0.8047740459442139, "rewards/accuracy_reward/mean": 1.46875, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.7291226387023926, "rewards/ngram_similarity_reward/std": 0.1631511002779007, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 512.40625, "completions/mean_terminated_length": 512.40625, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.7039606175878272, "frac_reward_zero_std": 0.0, "grad_norm": 0.10297142714262009, "learning_rate": 3.97064030225379e-06, "loss": -0.0207, "num_tokens": 251333130.0, "reward": 2.899536609649658, "reward_std": 0.5663940906524658, "rewards/accuracy_reward/mean": 2.546875, "rewards/accuracy_reward/std": 3.077979803085327, "rewards/ngram_similarity_reward/mean": 0.352661669254303, "rewards/ngram_similarity_reward/std": 0.2078230232000351, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 839.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 568.125, "completions/mean_terminated_length": 568.125, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.7044081449988812, "frac_reward_zero_std": 0.0, "grad_norm": 0.12223118543624878, "learning_rate": 3.969241481805349e-06, "loss": -0.0197, "num_tokens": 251512290.0, "reward": 2.919192314147949, "reward_std": 0.6469785571098328, "rewards/accuracy_reward/mean": 2.3125, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.6066923141479492, "rewards/ngram_similarity_reward/std": 0.35552290081977844, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 438.765625, "completions/mean_terminated_length": 438.765625, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.7048556724099351, "frac_reward_zero_std": 0.0, "grad_norm": 0.1319248229265213, "learning_rate": 3.9678419938904024e-06, "loss": 0.0399, "num_tokens": 251614851.0, "reward": 4.733461380004883, "reward_std": 2.085000514984131, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7334614992141724, "rewards/ngram_similarity_reward/std": 0.2388819009065628, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 460.921875, "completions/mean_terminated_length": 460.921875, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.705303199820989, "frac_reward_zero_std": 0.0, "grad_norm": 0.12093572318553925, "learning_rate": 3.966441839275096e-06, "loss": 0.0198, "num_tokens": 251819342.0, "reward": 4.596960067749023, "reward_std": 0.17896360158920288, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5969597101211548, "rewards/ngram_similarity_reward/std": 0.26587197184562683, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 423.78125, "completions/mean_terminated_length": 423.78125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.705750727232043, "frac_reward_zero_std": 0.25, "grad_norm": 0.12576833367347717, "learning_rate": 3.965041018725931e-06, "loss": -0.0031, "num_tokens": 251989600.0, "reward": 3.3358778953552246, "reward_std": 1.182431697845459, "rewards/accuracy_reward/mean": 2.6875, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.6483778953552246, "rewards/ngram_similarity_reward/std": 0.3610571026802063, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 506.46875, "completions/mean_terminated_length": 506.46875, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.7061982546430969, "frac_reward_zero_std": 0.0, "grad_norm": 0.12079954892396927, "learning_rate": 3.9636395330097805e-06, "loss": -0.0386, "num_tokens": 252148190.0, "reward": 4.8347649574279785, "reward_std": 0.6759188175201416, "rewards/accuracy_reward/mean": 4.28125, "rewards/accuracy_reward/std": 2.4330317974090576, "rewards/ngram_similarity_reward/mean": 0.553514838218689, "rewards/ngram_similarity_reward/std": 0.31114792823791504, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 614.390625, "completions/mean_terminated_length": 518.8167114257812, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.7066457820541509, "frac_reward_zero_std": 0.0, "grad_norm": 0.13940861821174622, "learning_rate": 3.96223738289388e-06, "loss": -0.0085, "num_tokens": 252293943.0, "reward": 4.402132987976074, "reward_std": 0.8495877981185913, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.5896329879760742, "rewards/ngram_similarity_reward/std": 0.29393261671066284, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 480.640625, "completions/mean_terminated_length": 455.7619323730469, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.7070933094652048, "frac_reward_zero_std": 0.0, "grad_norm": 0.09094911813735962, "learning_rate": 3.960834569145829e-06, "loss": -0.0164, "num_tokens": 252435440.0, "reward": 3.8131513595581055, "reward_std": 1.889657974243164, "rewards/accuracy_reward/mean": 3.0625, "rewards/accuracy_reward/std": 2.9700891971588135, "rewards/ngram_similarity_reward/mean": 0.7506515383720398, "rewards/ngram_similarity_reward/std": 0.3241376578807831, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1069.0, "completions/max_terminated_length": 1069.0, "completions/mean_length": 564.640625, "completions/mean_terminated_length": 564.640625, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.7075408368762587, "frac_reward_zero_std": 0.25, "grad_norm": 0.06971049308776855, "learning_rate": 3.959431092533588e-06, "loss": -0.0067, "num_tokens": 252647945.0, "reward": 5.790827751159668, "reward_std": 0.6532076001167297, "rewards/accuracy_reward/mean": 5.21875, "rewards/accuracy_reward/std": 1.2782522439956665, "rewards/ngram_similarity_reward/mean": 0.5720778107643127, "rewards/ngram_similarity_reward/std": 0.325927734375, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 440.34375, "completions/mean_terminated_length": 440.34375, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.7079883642873126, "frac_reward_zero_std": 0.0, "grad_norm": 0.10880489647388458, "learning_rate": 3.958026953825482e-06, "loss": 0.008, "num_tokens": 252777839.0, "reward": 4.542692184448242, "reward_std": 0.07795602083206177, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5426921844482422, "rewards/ngram_similarity_reward/std": 0.28923484683036804, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 479.71875, "completions/mean_terminated_length": 454.8254089355469, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.7084358916983665, "frac_reward_zero_std": 0.0, "grad_norm": 0.13043983280658722, "learning_rate": 3.9566221537901985e-06, "loss": 0.0307, "num_tokens": 252915069.0, "reward": 3.0502660274505615, "reward_std": 0.10106834024190903, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.5502660274505615, "rewards/ngram_similarity_reward/std": 0.21487337350845337, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 508.28125, "completions/mean_terminated_length": 508.28125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.7088834191094204, "frac_reward_zero_std": 0.0, "grad_norm": 0.09665711969137192, "learning_rate": 3.955216693196787e-06, "loss": -0.0044, "num_tokens": 253079119.0, "reward": 4.397170066833496, "reward_std": 0.5835081338882446, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.5846704244613647, "rewards/ngram_similarity_reward/std": 0.08703292161226273, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 783.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 496.1875, "completions/mean_terminated_length": 496.1875, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.7093309465204743, "frac_reward_zero_std": 0.0, "grad_norm": 0.07534337788820267, "learning_rate": 3.95381057281466e-06, "loss": -0.0034, "num_tokens": 253218875.0, "reward": 4.854658126831055, "reward_std": 0.0570051483809948, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.854658305644989, "rewards/ngram_similarity_reward/std": 0.33798474073410034, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 345.46875, "completions/mean_terminated_length": 345.46875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.7097784739315283, "frac_reward_zero_std": 0.5, "grad_norm": 0.08280720561742783, "learning_rate": 3.952403793413587e-06, "loss": 0.0222, "num_tokens": 253326457.0, "reward": 4.747356414794922, "reward_std": 0.03861922398209572, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7473564147949219, "rewards/ngram_similarity_reward/std": 0.4246657192707062, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 414.625, "completions/mean_terminated_length": 414.625, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.7102260013425822, "frac_reward_zero_std": 0.0, "grad_norm": 0.11492746323347092, "learning_rate": 3.950996355763704e-06, "loss": 0.0138, "num_tokens": 253440257.0, "reward": 5.836974143981934, "reward_std": 0.7199723124504089, "rewards/accuracy_reward/mean": 5.21875, "rewards/accuracy_reward/std": 1.2782522439956665, "rewards/ngram_similarity_reward/mean": 0.618224024772644, "rewards/ngram_similarity_reward/std": 0.3036477565765381, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 465.90625, "completions/mean_terminated_length": 465.90625, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.7106735287536362, "frac_reward_zero_std": 0.0, "grad_norm": 0.09943797439336777, "learning_rate": 3.949588260635502e-06, "loss": 0.02, "num_tokens": 253591483.0, "reward": 4.5792107582092285, "reward_std": 0.09714803099632263, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5792109370231628, "rewards/ngram_similarity_reward/std": 0.3758523166179657, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1084.0, "completions/max_terminated_length": 1084.0, "completions/mean_length": 634.03125, "completions/mean_terminated_length": 634.03125, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.7111210561646901, "frac_reward_zero_std": 0.0, "grad_norm": 0.07041703164577484, "learning_rate": 3.948179508799835e-06, "loss": 0.0055, "num_tokens": 253754333.0, "reward": 4.734063148498535, "reward_std": 0.43159162998199463, "rewards/accuracy_reward/mean": 4.09375, "rewards/accuracy_reward/std": 2.5617377758026123, "rewards/ngram_similarity_reward/mean": 0.640312910079956, "rewards/ngram_similarity_reward/std": 0.3765218257904053, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 477.6875, "completions/mean_terminated_length": 477.6875, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.711568583575744, "frac_reward_zero_std": 0.0, "grad_norm": 0.11647907644510269, "learning_rate": 3.946770101027917e-06, "loss": 0.0276, "num_tokens": 253934393.0, "reward": 3.272484540939331, "reward_std": 0.09996727108955383, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.7724847793579102, "rewards/ngram_similarity_reward/std": 0.1062595322728157, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 437.765625, "completions/mean_terminated_length": 437.765625, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.712016110986798, "frac_reward_zero_std": 0.0, "grad_norm": 0.1312415599822998, "learning_rate": 3.945360038091317e-06, "loss": 0.0219, "num_tokens": 254156698.0, "reward": 3.7038168907165527, "reward_std": 2.1661648750305176, "rewards/accuracy_reward/mean": 2.96875, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.7350670099258423, "rewards/ngram_similarity_reward/std": 0.2963406443595886, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 491.609375, "completions/mean_terminated_length": 491.609375, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.7124636383978519, "frac_reward_zero_std": 0.0, "grad_norm": 0.12840750813484192, "learning_rate": 3.9439493207619695e-06, "loss": -0.0066, "num_tokens": 254378305.0, "reward": 5.733854293823242, "reward_std": 1.2718347311019897, "rewards/accuracy_reward/mean": 5.03125, "rewards/accuracy_reward/std": 1.6229382753372192, "rewards/ngram_similarity_reward/mean": 0.7026040554046631, "rewards/ngram_similarity_reward/std": 0.33394986391067505, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1079.0, "completions/mean_length": 675.25, "completions/mean_terminated_length": 558.915283203125, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.7129111658089058, "frac_reward_zero_std": 0.0, "grad_norm": 0.10053110122680664, "learning_rate": 3.942537949812161e-06, "loss": -0.0773, "num_tokens": 254525825.0, "reward": 3.607769250869751, "reward_std": 0.7840161323547363, "rewards/accuracy_reward/mean": 2.875, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.732769250869751, "rewards/ngram_similarity_reward/std": 0.4627540111541748, "step": 1593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 467.234375, "completions/mean_terminated_length": 467.234375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.7133586932199597, "frac_reward_zero_std": 0.0, "grad_norm": 0.07122514396905899, "learning_rate": 3.941125926014537e-06, "loss": 0.0148, "num_tokens": 254681904.0, "reward": 4.114041805267334, "reward_std": 1.4079729318618774, "rewards/accuracy_reward/mean": 3.25, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.8640419244766235, "rewards/ngram_similarity_reward/std": 0.2687089443206787, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 653.484375, "completions/mean_terminated_length": 608.5, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.7138062206310136, "frac_reward_zero_std": 0.0, "grad_norm": 0.08103739470243454, "learning_rate": 3.939713250142104e-06, "loss": -0.0419, "num_tokens": 254838271.0, "reward": 5.720540523529053, "reward_std": 0.8042657375335693, "rewards/accuracy_reward/mean": 5.03125, "rewards/accuracy_reward/std": 1.6229382753372192, "rewards/ngram_similarity_reward/mean": 0.6892905235290527, "rewards/ngram_similarity_reward/std": 0.24583858251571655, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 544.15625, "completions/mean_terminated_length": 495.6451416015625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.7142537480420675, "frac_reward_zero_std": 0.5, "grad_norm": 0.0563306026160717, "learning_rate": 3.938299922968223e-06, "loss": 0.0129, "num_tokens": 254982857.0, "reward": 4.760640621185303, "reward_std": 0.04776597395539284, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7606406211853027, "rewards/ngram_similarity_reward/std": 0.267974317073822, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 513.84375, "completions/mean_terminated_length": 513.84375, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.7147012754531215, "frac_reward_zero_std": 0.0, "grad_norm": 0.0818086639046669, "learning_rate": 3.936885945266609e-06, "loss": -0.0053, "num_tokens": 255121967.0, "reward": 1.653555989265442, "reward_std": 0.6274036765098572, "rewards/accuracy_reward/mean": 1.1875, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.46605589985847473, "rewards/ngram_similarity_reward/std": 0.19381797313690186, "step": 1597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 528.53125, "completions/mean_terminated_length": 528.53125, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.7151488028641755, "frac_reward_zero_std": 0.0, "grad_norm": 0.08734636008739471, "learning_rate": 3.935471317811338e-06, "loss": 0.0064, "num_tokens": 255254913.0, "reward": 5.333474159240723, "reward_std": 0.8434069156646729, "rewards/accuracy_reward/mean": 4.65625, "rewards/accuracy_reward/std": 2.102294683456421, "rewards/ngram_similarity_reward/mean": 0.6772241592407227, "rewards/ngram_similarity_reward/std": 0.3647167682647705, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 920.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 501.078125, "completions/mean_terminated_length": 501.078125, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.7155963302752294, "frac_reward_zero_std": 0.5, "grad_norm": 0.05728432536125183, "learning_rate": 3.9340560413768384e-06, "loss": 0.0021, "num_tokens": 255387462.0, "reward": 1.715783715248108, "reward_std": 0.13839605450630188, "rewards/accuracy_reward/mean": 0.96875, "rewards/accuracy_reward/std": 2.6425621509552, "rewards/ngram_similarity_reward/mean": 0.7470337152481079, "rewards/ngram_similarity_reward/std": 0.3163996934890747, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 520.71875, "completions/mean_terminated_length": 520.71875, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.7160438576862833, "frac_reward_zero_std": 0.0, "grad_norm": 0.1150129958987236, "learning_rate": 3.932640116737896e-06, "loss": -0.0153, "num_tokens": 255568452.0, "reward": 1.610967755317688, "reward_std": 0.1124044731259346, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6109675168991089, "rewards/ngram_similarity_reward/std": 0.20065845549106598, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 478.046875, "completions/mean_terminated_length": 478.046875, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.7164913850973372, "frac_reward_zero_std": 0.0, "grad_norm": 0.10712695121765137, "learning_rate": 3.931223544669649e-06, "loss": -0.0093, "num_tokens": 255726407.0, "reward": 1.8906025886535645, "reward_std": 1.1437827348709106, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.515602707862854, "rewards/ngram_similarity_reward/std": 0.3079753518104553, "step": 1601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 425.71875, "completions/mean_terminated_length": 425.71875, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.7169389125083911, "frac_reward_zero_std": 0.25, "grad_norm": 0.09294099360704422, "learning_rate": 3.929806325947591e-06, "loss": -0.0115, "num_tokens": 255862725.0, "reward": 2.9013173580169678, "reward_std": 0.1599607616662979, "rewards/accuracy_reward/mean": 2.46875, "rewards/accuracy_reward/std": 3.06007981300354, "rewards/ngram_similarity_reward/mean": 0.4325675070285797, "rewards/ngram_similarity_reward/std": 0.17533192038536072, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 450.4375, "completions/mean_terminated_length": 450.4375, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.7173864399194451, "frac_reward_zero_std": 0.0, "grad_norm": 0.10121306031942368, "learning_rate": 3.9283884613475706e-06, "loss": -0.0115, "num_tokens": 256010737.0, "reward": 3.354915142059326, "reward_std": 1.0463835000991821, "rewards/accuracy_reward/mean": 2.6875, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.6674151420593262, "rewards/ngram_similarity_reward/std": 0.2502375841140747, "step": 1603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1283.0, "completions/mean_length": 632.484375, "completions/mean_terminated_length": 610.0159301757812, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.717833967330499, "frac_reward_zero_std": 0.0, "grad_norm": 0.1088629886507988, "learning_rate": 3.92696995164579e-06, "loss": 0.002, "num_tokens": 256189120.0, "reward": 4.064433574676514, "reward_std": 1.296493649482727, "rewards/accuracy_reward/mean": 3.421875, "rewards/accuracy_reward/std": 2.896657705307007, "rewards/ngram_similarity_reward/mean": 0.6425585746765137, "rewards/ngram_similarity_reward/std": 0.35586273670196533, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 432.828125, "completions/mean_terminated_length": 432.828125, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.7182814947415529, "frac_reward_zero_std": 0.0, "grad_norm": 0.10701368004083633, "learning_rate": 3.925550797618804e-06, "loss": 0.0042, "num_tokens": 256354469.0, "reward": 4.758089065551758, "reward_std": 0.5311642289161682, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.8518393635749817, "rewards/ngram_similarity_reward/std": 0.24525253474712372, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 453.359375, "completions/mean_terminated_length": 453.359375, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.7187290221526068, "frac_reward_zero_std": 0.25, "grad_norm": 0.1467125415802002, "learning_rate": 3.92413100004352e-06, "loss": -0.0263, "num_tokens": 256571372.0, "reward": 6.293436050415039, "reward_std": 0.13439010083675385, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.7934361100196838, "rewards/ngram_similarity_reward/std": 0.3361327350139618, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 837.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 433.765625, "completions/mean_terminated_length": 433.765625, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.7191765495636607, "frac_reward_zero_std": 0.0, "grad_norm": 0.10858814418315887, "learning_rate": 3.922710559697196e-06, "loss": 0.0318, "num_tokens": 256800589.0, "reward": 5.375655651092529, "reward_std": 1.4599380493164062, "rewards/accuracy_reward/mean": 4.46875, "rewards/accuracy_reward/std": 2.2815253734588623, "rewards/ngram_similarity_reward/mean": 0.9069056510925293, "rewards/ngram_similarity_reward/std": 0.26185810565948486, "step": 1607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 579.421875, "completions/mean_terminated_length": 579.421875, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.7196240769747148, "frac_reward_zero_std": 0.0, "grad_norm": 0.09472686797380447, "learning_rate": 3.921289477357445e-06, "loss": 0.0087, "num_tokens": 256949304.0, "reward": 4.724700450897217, "reward_std": 0.12030484527349472, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7247006297111511, "rewards/ngram_similarity_reward/std": 0.21106944978237152, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 431.421875, "completions/mean_terminated_length": 431.421875, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.7200716043857687, "frac_reward_zero_std": 0.25, "grad_norm": 0.11076363176107407, "learning_rate": 3.919867753802231e-06, "loss": 0.0411, "num_tokens": 257139651.0, "reward": 3.8031749725341797, "reward_std": 1.1167179346084595, "rewards/accuracy_reward/mean": 2.96875, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.8344252109527588, "rewards/ngram_similarity_reward/std": 0.3208877742290497, "step": 1609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 475.5625, "completions/mean_terminated_length": 475.5625, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.7205191317968226, "frac_reward_zero_std": 0.25, "grad_norm": 0.08394243568181992, "learning_rate": 3.918445389809866e-06, "loss": -0.0311, "num_tokens": 257289575.0, "reward": 6.31013822555542, "reward_std": 0.06235632672905922, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.8101381063461304, "rewards/ngram_similarity_reward/std": 0.304575115442276, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1028.0, "completions/mean_length": 568.328125, "completions/mean_terminated_length": 520.5967407226562, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.7209666592078765, "frac_reward_zero_std": 0.0, "grad_norm": 0.08711081743240356, "learning_rate": 3.9170223861590165e-06, "loss": 0.0208, "num_tokens": 257431964.0, "reward": 3.1470353603363037, "reward_std": 0.9846519231796265, "rewards/accuracy_reward/mean": 2.390625, "rewards/accuracy_reward/std": 3.0400354862213135, "rewards/ngram_similarity_reward/mean": 0.7564102411270142, "rewards/ngram_similarity_reward/std": 0.21799270808696747, "step": 1611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 424.515625, "completions/mean_terminated_length": 424.515625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.7214141866189304, "frac_reward_zero_std": 0.0, "grad_norm": 0.10729598253965378, "learning_rate": 3.915598743628695e-06, "loss": 0.0266, "num_tokens": 257562317.0, "reward": 3.5499908924102783, "reward_std": 1.1870309114456177, "rewards/accuracy_reward/mean": 2.78125, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.7687408924102783, "rewards/ngram_similarity_reward/std": 0.3213869333267212, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 823.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 490.859375, "completions/mean_terminated_length": 490.859375, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.7218617140299843, "frac_reward_zero_std": 0.25, "grad_norm": 0.06329638510942459, "learning_rate": 3.9141744629982695e-06, "loss": -0.0139, "num_tokens": 257709844.0, "reward": 3.0198938846588135, "reward_std": 0.43414995074272156, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.6136438846588135, "rewards/ngram_similarity_reward/std": 0.38070380687713623, "step": 1613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 477.171875, "completions/mean_terminated_length": 477.171875, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.7223092414410383, "frac_reward_zero_std": 0.25, "grad_norm": 0.0825599655508995, "learning_rate": 3.912749545047452e-06, "loss": 0.0323, "num_tokens": 257902575.0, "reward": 4.817748069763184, "reward_std": 0.1024700403213501, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.8177484273910522, "rewards/ngram_similarity_reward/std": 0.30864542722702026, "step": 1614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 513.359375, "completions/mean_terminated_length": 513.359375, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.7227567688520922, "frac_reward_zero_std": 0.0, "grad_norm": 0.1094784289598465, "learning_rate": 3.911323990556305e-06, "loss": 0.0087, "num_tokens": 258045206.0, "reward": 3.0424697399139404, "reward_std": 0.1478545069694519, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.5424696803092957, "rewards/ngram_similarity_reward/std": 0.2889968752861023, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1260.0, "completions/max_terminated_length": 1260.0, "completions/mean_length": 596.25, "completions/mean_terminated_length": 596.25, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.7232042962631461, "frac_reward_zero_std": 0.0, "grad_norm": 0.07353372126817703, "learning_rate": 3.90989780030524e-06, "loss": -0.0124, "num_tokens": 258212918.0, "reward": 3.8062007427215576, "reward_std": 1.2309160232543945, "rewards/accuracy_reward/mean": 3.25, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.5562007427215576, "rewards/ngram_similarity_reward/std": 0.3006802201271057, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1678.0, "completions/mean_length": 603.078125, "completions/mean_terminated_length": 556.4677124023438, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.7236518236742, "frac_reward_zero_std": 0.0, "grad_norm": 0.09781184047460556, "learning_rate": 3.9084709750750185e-06, "loss": -0.0391, "num_tokens": 258380235.0, "reward": 1.91732919216156, "reward_std": 0.7352028489112854, "rewards/accuracy_reward/mean": 1.21875, "rewards/accuracy_reward/std": 2.8141860961914062, "rewards/ngram_similarity_reward/mean": 0.6985790729522705, "rewards/ngram_similarity_reward/std": 0.27080607414245605, "step": 1617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 743.140625, "completions/mean_terminated_length": 529.6181640625, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.7240993510852539, "frac_reward_zero_std": 0.0, "grad_norm": 0.11270378530025482, "learning_rate": 3.907043515646745e-06, "loss": -0.0517, "num_tokens": 258552276.0, "reward": 3.2019901275634766, "reward_std": 1.109002709388733, "rewards/accuracy_reward/mean": 2.578125, "rewards/accuracy_reward/std": 3.0410144329071045, "rewards/ngram_similarity_reward/mean": 0.6238652467727661, "rewards/ngram_similarity_reward/std": 0.37827983498573303, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1641.0, "completions/max_terminated_length": 1641.0, "completions/mean_length": 488.875, "completions/mean_terminated_length": 488.875, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.724546878496308, "frac_reward_zero_std": 0.0, "grad_norm": 0.12448471784591675, "learning_rate": 3.905615422801875e-06, "loss": 0.1146, "num_tokens": 258716364.0, "reward": 0.19647154211997986, "reward_std": 0.22791625559329987, "rewards/accuracy_reward/mean": -0.515625, "rewards/accuracy_reward/std": 0.125, "rewards/ngram_similarity_reward/mean": 0.7120965719223022, "rewards/ngram_similarity_reward/std": 0.22158174216747284, "step": 1619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 842.0, "completions/mean_length": 547.75, "completions/mean_terminated_length": 523.9365234375, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.7249944059073619, "frac_reward_zero_std": 0.25, "grad_norm": 0.08939939737319946, "learning_rate": 3.904186697322209e-06, "loss": 0.0042, "num_tokens": 258862076.0, "reward": 4.555126190185547, "reward_std": 0.5544596910476685, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.7426260709762573, "rewards/ngram_similarity_reward/std": 0.316176176071167, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 412.21875, "completions/mean_terminated_length": 412.21875, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.7254419333184158, "frac_reward_zero_std": 0.25, "grad_norm": 0.0931994691491127, "learning_rate": 3.902757339989893e-06, "loss": 0.0073, "num_tokens": 259029306.0, "reward": 6.047171115875244, "reward_std": 0.4441218376159668, "rewards/accuracy_reward/mean": 5.40625, "rewards/accuracy_reward/std": 0.7500000596046448, "rewards/ngram_similarity_reward/mean": 0.6409210562705994, "rewards/ngram_similarity_reward/std": 0.23308861255645752, "step": 1621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 458.203125, "completions/mean_terminated_length": 458.203125, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.7258894607294697, "frac_reward_zero_std": 0.0, "grad_norm": 0.10467223078012466, "learning_rate": 3.9013273515874225e-06, "loss": -0.0121, "num_tokens": 259172087.0, "reward": 4.927596569061279, "reward_std": 0.18634790182113647, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.9275966882705688, "rewards/ngram_similarity_reward/std": 0.2064925581216812, "step": 1622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 406.703125, "completions/mean_terminated_length": 406.703125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.7263369881405236, "frac_reward_zero_std": 0.0, "grad_norm": 0.11514323949813843, "learning_rate": 3.899896732897635e-06, "loss": -0.0087, "num_tokens": 259337988.0, "reward": 5.104360580444336, "reward_std": 0.10654406249523163, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 1.1043603420257568, "rewards/ngram_similarity_reward/std": 0.13795940577983856, "step": 1623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 491.75, "completions/mean_terminated_length": 491.75, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.7267845155515775, "frac_reward_zero_std": 0.0, "grad_norm": 0.10860388725996017, "learning_rate": 3.898465484703713e-06, "loss": 0.0192, "num_tokens": 259482692.0, "reward": 5.41290283203125, "reward_std": 1.1937735080718994, "rewards/accuracy_reward/mean": 4.65625, "rewards/accuracy_reward/std": 2.102294683456421, "rewards/ngram_similarity_reward/mean": 0.7566529512405396, "rewards/ngram_similarity_reward/std": 0.16372349858283997, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 471.40625, "completions/mean_terminated_length": 446.3809814453125, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.7272320429626314, "frac_reward_zero_std": 0.0, "grad_norm": 0.12862062454223633, "learning_rate": 3.897033607789187e-06, "loss": -0.0043, "num_tokens": 259662430.0, "reward": 3.2956037521362305, "reward_std": 0.11109915375709534, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.7956037521362305, "rewards/ngram_similarity_reward/std": 0.2544381022453308, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 441.921875, "completions/mean_terminated_length": 416.4285888671875, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.7276795703736854, "frac_reward_zero_std": 0.0, "grad_norm": 0.15487508475780487, "learning_rate": 3.895601102937929e-06, "loss": -0.0359, "num_tokens": 259842057.0, "reward": 3.8578715324401855, "reward_std": 0.7877907752990723, "rewards/accuracy_reward/mean": 3.0625, "rewards/accuracy_reward/std": 2.9700891971588135, "rewards/ngram_similarity_reward/mean": 0.7953715920448303, "rewards/ngram_similarity_reward/std": 0.35424208641052246, "step": 1626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 463.0, "completions/mean_terminated_length": 463.0, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.7281270977847393, "frac_reward_zero_std": 0.0, "grad_norm": 0.09510334581136703, "learning_rate": 3.894167970934155e-06, "loss": -0.0346, "num_tokens": 259996745.0, "reward": 5.00093936920166, "reward_std": 0.6892349123954773, "rewards/accuracy_reward/mean": 4.28125, "rewards/accuracy_reward/std": 2.4330317974090576, "rewards/ngram_similarity_reward/mean": 0.7196898460388184, "rewards/ngram_similarity_reward/std": 0.29325228929519653, "step": 1627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 400.28125, "completions/mean_terminated_length": 400.28125, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.7285746251957932, "frac_reward_zero_std": 0.0, "grad_norm": 0.12284601479768753, "learning_rate": 3.892734212562423e-06, "loss": -0.0202, "num_tokens": 260148267.0, "reward": 5.090183734893799, "reward_std": 1.8874471187591553, "rewards/accuracy_reward/mean": 4.5625, "rewards/accuracy_reward/std": 2.195775270462036, "rewards/ngram_similarity_reward/mean": 0.5276838541030884, "rewards/ngram_similarity_reward/std": 0.2353626936674118, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 571.59375, "completions/mean_terminated_length": 571.59375, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.7290221526068472, "frac_reward_zero_std": 0.0, "grad_norm": 0.09873700886964798, "learning_rate": 3.891299828607639e-06, "loss": 0.0696, "num_tokens": 260276849.0, "reward": 3.9734854698181152, "reward_std": 1.4497017860412598, "rewards/accuracy_reward/mean": 3.15625, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.8172354698181152, "rewards/ngram_similarity_reward/std": 0.32224228978157043, "step": 1629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 508.5625, "completions/mean_terminated_length": 508.5625, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.7294696800179011, "frac_reward_zero_std": 0.0, "grad_norm": 0.07983771711587906, "learning_rate": 3.889864819855044e-06, "loss": -0.0056, "num_tokens": 260453285.0, "reward": 4.735306739807129, "reward_std": 0.15424823760986328, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.735306978225708, "rewards/ngram_similarity_reward/std": 0.24989010393619537, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 911.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 518.203125, "completions/mean_terminated_length": 518.203125, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.729917207428955, "frac_reward_zero_std": 0.0, "grad_norm": 0.13449940085411072, "learning_rate": 3.8884291870902285e-06, "loss": -0.0069, "num_tokens": 260610450.0, "reward": 3.005706310272217, "reward_std": 0.621200680732727, "rewards/accuracy_reward/mean": 2.6875, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.31820613145828247, "rewards/ngram_similarity_reward/std": 0.18179622292518616, "step": 1631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 431.796875, "completions/mean_terminated_length": 431.796875, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.730364734840009, "frac_reward_zero_std": 0.0, "grad_norm": 0.112546406686306, "learning_rate": 3.886992931099118e-06, "loss": 0.0252, "num_tokens": 260754421.0, "reward": 2.45912504196167, "reward_std": 0.8453289270401001, "rewards/accuracy_reward/mean": 1.84375, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.6153750419616699, "rewards/ngram_similarity_reward/std": 0.33217865228652954, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 424.546875, "completions/mean_terminated_length": 424.546875, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.7308122622510629, "frac_reward_zero_std": 0.0, "grad_norm": 0.09754940122365952, "learning_rate": 3.885556052667985e-06, "loss": 0.0249, "num_tokens": 260894936.0, "reward": 6.336062908172607, "reward_std": 0.10131148993968964, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.8360626697540283, "rewards/ngram_similarity_reward/std": 0.15345709025859833, "step": 1633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 428.375, "completions/mean_terminated_length": 428.375, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.7312597896621168, "frac_reward_zero_std": 0.0, "grad_norm": 0.12307172268629074, "learning_rate": 3.88411855258344e-06, "loss": 0.0222, "num_tokens": 261005072.0, "reward": 6.0622735023498535, "reward_std": 1.0507888793945312, "rewards/accuracy_reward/mean": 5.125, "rewards/accuracy_reward/std": 1.4638501405715942, "rewards/ngram_similarity_reward/mean": 0.9372736215591431, "rewards/ngram_similarity_reward/std": 0.2874011695384979, "step": 1634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 493.65625, "completions/mean_terminated_length": 493.65625, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.7317073170731707, "frac_reward_zero_std": 0.0, "grad_norm": 0.10293906927108765, "learning_rate": 3.88268043163243e-06, "loss": -0.0203, "num_tokens": 261142730.0, "reward": 4.750032424926758, "reward_std": 0.14096632599830627, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7500326633453369, "rewards/ngram_similarity_reward/std": 0.24176648259162903, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 443.28125, "completions/mean_terminated_length": 443.28125, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.7321548444842246, "frac_reward_zero_std": 0.25, "grad_norm": 0.06822414696216583, "learning_rate": 3.881241690602251e-06, "loss": 0.0066, "num_tokens": 261299244.0, "reward": 5.113492012023926, "reward_std": 0.5794992446899414, "rewards/accuracy_reward/mean": 4.1875, "rewards/accuracy_reward/std": 2.5, "rewards/ngram_similarity_reward/mean": 0.9259923100471497, "rewards/ngram_similarity_reward/std": 0.17460310459136963, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1122.0, "completions/max_terminated_length": 1122.0, "completions/mean_length": 617.828125, "completions/mean_terminated_length": 617.828125, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.7326023718952785, "frac_reward_zero_std": 0.0, "grad_norm": 0.08448846638202667, "learning_rate": 3.879802330280531e-06, "loss": 0.0022, "num_tokens": 261468753.0, "reward": 4.327930927276611, "reward_std": 0.6729658842086792, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.6091808676719666, "rewards/ngram_similarity_reward/std": 0.269133061170578, "step": 1637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 480.234375, "completions/mean_terminated_length": 480.234375, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.7330498993063325, "frac_reward_zero_std": 0.0, "grad_norm": 0.13625337183475494, "learning_rate": 3.878362351455237e-06, "loss": -0.0071, "num_tokens": 261594816.0, "reward": 4.344447135925293, "reward_std": 0.835814893245697, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.7194472551345825, "rewards/ngram_similarity_reward/std": 0.2889397144317627, "step": 1638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 511.921875, "completions/mean_terminated_length": 487.5397033691406, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.7334974267173865, "frac_reward_zero_std": 0.25, "grad_norm": 0.07077033817768097, "learning_rate": 3.87692175491468e-06, "loss": 0.0621, "num_tokens": 261713163.0, "reward": 4.173326015472412, "reward_std": 0.8966077566146851, "rewards/accuracy_reward/mean": 3.53125, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.6420760154724121, "rewards/ngram_similarity_reward/std": 0.3697829842567444, "step": 1639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 833.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 558.265625, "completions/mean_terminated_length": 558.265625, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.7339449541284404, "frac_reward_zero_std": 0.0, "grad_norm": 0.07937997579574585, "learning_rate": 3.875480541447505e-06, "loss": -0.0224, "num_tokens": 261874140.0, "reward": 3.18398380279541, "reward_std": 0.10827402770519257, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6839838027954102, "rewards/ngram_similarity_reward/std": 0.29379263520240784, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 567.0625, "completions/mean_terminated_length": 543.5556030273438, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.7343924815394943, "frac_reward_zero_std": 0.0, "grad_norm": 0.09610205143690109, "learning_rate": 3.874038711842696e-06, "loss": 0.009, "num_tokens": 262002864.0, "reward": 4.4142889976501465, "reward_std": 0.9095728993415833, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.6955393552780151, "rewards/ngram_similarity_reward/std": 0.2655259668827057, "step": 1641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 501.8125, "completions/mean_terminated_length": 477.2698669433594, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.7348400089505482, "frac_reward_zero_std": 0.0, "grad_norm": 0.12218975275754929, "learning_rate": 3.872596266889572e-06, "loss": 0.0507, "num_tokens": 262143108.0, "reward": 3.1528711318969727, "reward_std": 0.2617282271385193, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.1305904388427734, "rewards/ngram_similarity_reward/mean": 0.7466210126876831, "rewards/ngram_similarity_reward/std": 0.25369471311569214, "step": 1642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1146.0, "completions/max_terminated_length": 1146.0, "completions/mean_length": 515.96875, "completions/mean_terminated_length": 515.96875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.7352875363616022, "frac_reward_zero_std": 0.0, "grad_norm": 0.09477532655000687, "learning_rate": 3.871153207377795e-06, "loss": 0.0101, "num_tokens": 262303218.0, "reward": 4.540840148925781, "reward_std": 0.11984336376190186, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5408403277397156, "rewards/ngram_similarity_reward/std": 0.34968307614326477, "step": 1643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 440.109375, "completions/mean_terminated_length": 440.109375, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.7357350637726561, "frac_reward_zero_std": 0.0, "grad_norm": 0.11149965226650238, "learning_rate": 3.869709534097355e-06, "loss": 0.0133, "num_tokens": 262478761.0, "reward": 2.4047563076019287, "reward_std": 1.3677046298980713, "rewards/accuracy_reward/mean": 1.75, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.6547563672065735, "rewards/ngram_similarity_reward/std": 0.2777254283428192, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1148.0, "completions/max_terminated_length": 1148.0, "completions/mean_length": 475.46875, "completions/mean_terminated_length": 475.46875, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.73618259118371, "frac_reward_zero_std": 0.0, "grad_norm": 0.11225708574056625, "learning_rate": 3.868265247838586e-06, "loss": 0.0002, "num_tokens": 262645991.0, "reward": 4.428684234619141, "reward_std": 0.18725836277008057, "rewards/accuracy_reward/mean": 3.828125, "rewards/accuracy_reward/std": 2.9279966354370117, "rewards/ngram_similarity_reward/mean": 0.6005595922470093, "rewards/ngram_similarity_reward/std": 0.22134476900100708, "step": 1645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 462.203125, "completions/mean_terminated_length": 462.203125, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.7366301185947639, "frac_reward_zero_std": 0.25, "grad_norm": 0.10217977315187454, "learning_rate": 3.866820349392152e-06, "loss": -0.0361, "num_tokens": 262765892.0, "reward": 2.707876205444336, "reward_std": 1.2375640869140625, "rewards/accuracy_reward/mean": 1.9375, "rewards/accuracy_reward/std": 2.9700891971588135, "rewards/ngram_similarity_reward/mean": 0.7703762054443359, "rewards/ngram_similarity_reward/std": 0.2640071213245392, "step": 1646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 846.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 416.34375, "completions/mean_terminated_length": 416.34375, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.7370776460058178, "frac_reward_zero_std": 0.25, "grad_norm": 0.08163312077522278, "learning_rate": 3.865374839549054e-06, "loss": 0.0528, "num_tokens": 262899818.0, "reward": 3.4912028312683105, "reward_std": 0.12502653896808624, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.9912027716636658, "rewards/ngram_similarity_reward/std": 0.18320074677467346, "step": 1647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 445.28125, "completions/mean_terminated_length": 445.28125, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.7375251734168717, "frac_reward_zero_std": 0.0, "grad_norm": 0.13695028424263, "learning_rate": 3.863928719100628e-06, "loss": 0.0157, "num_tokens": 263042124.0, "reward": 4.577659606933594, "reward_std": 0.6269991993904114, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.765159547328949, "rewards/ngram_similarity_reward/std": 0.27326247096061707, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1583.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 460.15625, "completions/mean_terminated_length": 442.3333740234375, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.7379727008279258, "frac_reward_zero_std": 0.0, "grad_norm": 0.11578591167926788, "learning_rate": 3.862481988838544e-06, "loss": -0.0722, "num_tokens": 263254199.0, "reward": 4.636940956115723, "reward_std": 0.883092999458313, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6369410753250122, "rewards/ngram_similarity_reward/std": 0.376800537109375, "step": 1649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 462.890625, "completions/mean_terminated_length": 462.890625, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.7384202282389797, "frac_reward_zero_std": 0.0, "grad_norm": 0.10988388955593109, "learning_rate": 3.861034649554807e-06, "loss": -0.0338, "num_tokens": 263384112.0, "reward": 3.533763885498047, "reward_std": 1.0205178260803223, "rewards/accuracy_reward/mean": 2.765625, "rewards/accuracy_reward/std": 3.0302298069000244, "rewards/ngram_similarity_reward/mean": 0.7681391835212708, "rewards/ngram_similarity_reward/std": 0.2180566042661667, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 462.21875, "completions/mean_terminated_length": 437.0476379394531, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.7388677556500336, "frac_reward_zero_std": 0.0, "grad_norm": 0.13337351381778717, "learning_rate": 3.8595867020417525e-06, "loss": -0.0145, "num_tokens": 263544094.0, "reward": 3.9548983573913574, "reward_std": 1.4088985919952393, "rewards/accuracy_reward/mean": 3.25, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.7048982381820679, "rewards/ngram_similarity_reward/std": 0.22468942403793335, "step": 1651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1078.0, "completions/max_terminated_length": 1078.0, "completions/mean_length": 513.984375, "completions/mean_terminated_length": 513.984375, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.7393152830610875, "frac_reward_zero_std": 0.0, "grad_norm": 0.11442842334508896, "learning_rate": 3.858138147092051e-06, "loss": -0.016, "num_tokens": 263675885.0, "reward": 5.1964521408081055, "reward_std": 1.0896323919296265, "rewards/accuracy_reward/mean": 4.28125, "rewards/accuracy_reward/std": 2.4330317974090576, "rewards/ngram_similarity_reward/mean": 0.9152022004127502, "rewards/ngram_similarity_reward/std": 0.3200232684612274, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 426.484375, "completions/mean_terminated_length": 426.484375, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.7397628104721414, "frac_reward_zero_std": 0.0, "grad_norm": 0.10637952387332916, "learning_rate": 3.856688985498707e-06, "loss": -0.0017, "num_tokens": 263810892.0, "reward": 4.461467266082764, "reward_std": 0.4828716814517975, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.5552173256874084, "rewards/ngram_similarity_reward/std": 0.2920112907886505, "step": 1653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 485.203125, "completions/mean_terminated_length": 485.203125, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.7402103378831953, "frac_reward_zero_std": 0.0, "grad_norm": 0.1339215636253357, "learning_rate": 3.855239218055055e-06, "loss": -0.013, "num_tokens": 263964265.0, "reward": 4.873979568481445, "reward_std": 1.5473817586898804, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.8739794492721558, "rewards/ngram_similarity_reward/std": 0.22126427292823792, "step": 1654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 420.0625, "completions/mean_terminated_length": 420.0625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.7406578652942493, "frac_reward_zero_std": 0.0, "grad_norm": 0.10984153300523758, "learning_rate": 3.8537888455547595e-06, "loss": -0.0018, "num_tokens": 264089837.0, "reward": 5.766287803649902, "reward_std": 1.2951836585998535, "rewards/accuracy_reward/mean": 4.921875, "rewards/accuracy_reward/std": 1.8153201341629028, "rewards/ngram_similarity_reward/mean": 0.8444128036499023, "rewards/ngram_similarity_reward/std": 0.23695027828216553, "step": 1655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 950.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 569.640625, "completions/mean_terminated_length": 569.640625, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.7411053927053032, "frac_reward_zero_std": 0.0, "grad_norm": 0.09492413699626923, "learning_rate": 3.85233786879182e-06, "loss": 0.0257, "num_tokens": 264220230.0, "reward": 3.574841022491455, "reward_std": 1.4794020652770996, "rewards/accuracy_reward/mean": 2.96875, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.6060912609100342, "rewards/ngram_similarity_reward/std": 0.24550238251686096, "step": 1656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 415.125, "completions/mean_terminated_length": 415.125, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.7415529201163571, "frac_reward_zero_std": 0.25, "grad_norm": 0.14240625500679016, "learning_rate": 3.850886288560565e-06, "loss": -0.0264, "num_tokens": 264360862.0, "reward": 4.8376312255859375, "reward_std": 0.09781719744205475, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.837631344795227, "rewards/ngram_similarity_reward/std": 0.21419653296470642, "step": 1657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 553.96875, "completions/mean_terminated_length": 427.3559265136719, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.742000447527411, "frac_reward_zero_std": 0.0, "grad_norm": 0.13936901092529297, "learning_rate": 3.849434105655653e-06, "loss": -0.0329, "num_tokens": 264520412.0, "reward": 3.005133628845215, "reward_std": 0.6597869396209717, "rewards/accuracy_reward/mean": 2.3125, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.6926336884498596, "rewards/ngram_similarity_reward/std": 0.25201693177223206, "step": 1658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 385.703125, "completions/mean_terminated_length": 385.703125, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.7424479749384649, "frac_reward_zero_std": 0.0, "grad_norm": 0.12147440761327744, "learning_rate": 3.847981320872074e-06, "loss": 0.0279, "num_tokens": 264686105.0, "reward": 2.880711078643799, "reward_std": 1.4913966655731201, "rewards/accuracy_reward/mean": 2.21875, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.6619612574577332, "rewards/ngram_similarity_reward/std": 0.3021509349346161, "step": 1659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 435.453125, "completions/mean_terminated_length": 435.453125, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.742895502349519, "frac_reward_zero_std": 0.0, "grad_norm": 0.11805487424135208, "learning_rate": 3.846527935005145e-06, "loss": 0.0191, "num_tokens": 264839110.0, "reward": 3.053068161010742, "reward_std": 0.47172486782073975, "rewards/accuracy_reward/mean": 2.359375, "rewards/accuracy_reward/std": 3.075077533721924, "rewards/ngram_similarity_reward/mean": 0.6936930418014526, "rewards/ngram_similarity_reward/std": 0.31582656502723694, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 491.265625, "completions/mean_terminated_length": 491.265625, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.7433430297605729, "frac_reward_zero_std": 0.25, "grad_norm": 0.0991583988070488, "learning_rate": 3.845073948850513e-06, "loss": -0.0153, "num_tokens": 265004391.0, "reward": 2.2496678829193115, "reward_std": 1.1892794370651245, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 2.872281312942505, "rewards/ngram_similarity_reward/mean": 0.687167763710022, "rewards/ngram_similarity_reward/std": 0.2876705527305603, "step": 1661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1055.0, "completions/max_terminated_length": 1055.0, "completions/mean_length": 531.0625, "completions/mean_terminated_length": 531.0625, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.7437905571716268, "frac_reward_zero_std": 0.25, "grad_norm": 0.10559695959091187, "learning_rate": 3.843619363204157e-06, "loss": 0.0238, "num_tokens": 265174059.0, "reward": 3.0393824577331543, "reward_std": 0.5305798053741455, "rewards/accuracy_reward/mean": 2.390625, "rewards/accuracy_reward/std": 3.0400354862213135, "rewards/ngram_similarity_reward/mean": 0.6487575769424438, "rewards/ngram_similarity_reward/std": 0.33010560274124146, "step": 1662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 578.15625, "completions/mean_terminated_length": 530.741943359375, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.7442380845826807, "frac_reward_zero_std": 0.0, "grad_norm": 0.10930699855089188, "learning_rate": 3.842164178862378e-06, "loss": -0.0353, "num_tokens": 265346533.0, "reward": 3.4777369499206543, "reward_std": 0.6202689409255981, "rewards/accuracy_reward/mean": 2.6875, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.7902370691299438, "rewards/ngram_similarity_reward/std": 0.23186592757701874, "step": 1663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 864.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 531.15625, "completions/mean_terminated_length": 531.15625, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.7446856119937346, "frac_reward_zero_std": 0.0, "grad_norm": 0.10400087386369705, "learning_rate": 3.84070839662181e-06, "loss": 0.0315, "num_tokens": 265511775.0, "reward": 3.239142417907715, "reward_std": 0.08492843806743622, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.7391422986984253, "rewards/ngram_similarity_reward/std": 0.2556718587875366, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 378.46875, "completions/mean_terminated_length": 378.46875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.7451331394047885, "frac_reward_zero_std": 0.25, "grad_norm": 0.12179583311080933, "learning_rate": 3.839252017279412e-06, "loss": 0.0367, "num_tokens": 265716189.0, "reward": 5.8060760498046875, "reward_std": 0.6848320364952087, "rewards/accuracy_reward/mean": 5.21875, "rewards/accuracy_reward/std": 1.2782522439956665, "rewards/ngram_similarity_reward/mean": 0.5873256921768188, "rewards/ngram_similarity_reward/std": 0.4143521785736084, "step": 1665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 426.953125, "completions/mean_terminated_length": 426.953125, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.7455806668158425, "frac_reward_zero_std": 0.25, "grad_norm": 0.09366989880800247, "learning_rate": 3.83779504163247e-06, "loss": 0.0202, "num_tokens": 265836186.0, "reward": 5.846687316894531, "reward_std": 0.8473169207572937, "rewards/accuracy_reward/mean": 5.03125, "rewards/accuracy_reward/std": 1.6229382753372192, "rewards/ngram_similarity_reward/mean": 0.8154371380805969, "rewards/ngram_similarity_reward/std": 0.3438029885292053, "step": 1666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 435.828125, "completions/mean_terminated_length": 435.828125, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.7460281942268964, "frac_reward_zero_std": 0.0, "grad_norm": 0.12821348011493683, "learning_rate": 3.836337470478596e-06, "loss": 0.0414, "num_tokens": 265992207.0, "reward": 2.483971118927002, "reward_std": 1.8114967346191406, "rewards/accuracy_reward/mean": 2.03125, "rewards/accuracy_reward/std": 2.986577272415161, "rewards/ngram_similarity_reward/mean": 0.4527212977409363, "rewards/ngram_similarity_reward/std": 0.11671894788742065, "step": 1667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 877.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 482.046875, "completions/mean_terminated_length": 482.046875, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.7464757216379503, "frac_reward_zero_std": 0.0, "grad_norm": 0.0851159542798996, "learning_rate": 3.834879304615729e-06, "loss": -0.0183, "num_tokens": 266129634.0, "reward": 2.963693618774414, "reward_std": 0.6637592315673828, "rewards/accuracy_reward/mean": 2.28125, "rewards/accuracy_reward/std": 3.0522892475128174, "rewards/ngram_similarity_reward/mean": 0.6824436187744141, "rewards/ngram_similarity_reward/std": 0.16195547580718994, "step": 1668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 525.28125, "completions/mean_terminated_length": 476.1612854003906, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.7469232490490042, "frac_reward_zero_std": 0.0, "grad_norm": 0.11129762977361679, "learning_rate": 3.833420544842135e-06, "loss": -0.0332, "num_tokens": 266249396.0, "reward": 6.464842796325684, "reward_std": 0.08006338775157928, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.9648431539535522, "rewards/ngram_similarity_reward/std": 0.17534130811691284, "step": 1669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 406.078125, "completions/mean_terminated_length": 406.078125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.7473707764600582, "frac_reward_zero_std": 0.0, "grad_norm": 0.13044987618923187, "learning_rate": 3.831961191956401e-06, "loss": 0.0307, "num_tokens": 266456153.0, "reward": 4.403616905212402, "reward_std": 1.0030522346496582, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.6848673820495605, "rewards/ngram_similarity_reward/std": 0.2170443832874298, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 742.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 495.09375, "completions/mean_terminated_length": 495.09375, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.7478183038711121, "frac_reward_zero_std": 0.25, "grad_norm": 0.07495610415935516, "learning_rate": 3.830501246757442e-06, "loss": -0.021, "num_tokens": 266640687.0, "reward": 1.750803828239441, "reward_std": 0.42258378863334656, "rewards/accuracy_reward/mean": 1.09375, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.6570536494255066, "rewards/ngram_similarity_reward/std": 0.10247427970170975, "step": 1671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 418.734375, "completions/mean_terminated_length": 418.734375, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.7482658312821661, "frac_reward_zero_std": 0.0, "grad_norm": 0.12411162257194519, "learning_rate": 3.829040710044495e-06, "loss": 0.0021, "num_tokens": 266802990.0, "reward": 5.783257484436035, "reward_std": 1.2124266624450684, "rewards/accuracy_reward/mean": 4.984375, "rewards/accuracy_reward/std": 1.790558934211731, "rewards/ngram_similarity_reward/mean": 0.7988826036453247, "rewards/ngram_similarity_reward/std": 0.21857917308807373, "step": 1672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 541.8125, "completions/mean_terminated_length": 541.8125, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.74871335869322, "frac_reward_zero_std": 0.0, "grad_norm": 0.14073511958122253, "learning_rate": 3.827579582617126e-06, "loss": -0.002, "num_tokens": 266970130.0, "reward": 3.7232022285461426, "reward_std": 1.6067060232162476, "rewards/accuracy_reward/mean": 2.78125, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.9419524669647217, "rewards/ngram_similarity_reward/std": 0.2151007354259491, "step": 1673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 402.15625, "completions/mean_terminated_length": 402.15625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.7491608861042739, "frac_reward_zero_std": 0.25, "grad_norm": 0.09434747695922852, "learning_rate": 3.826117865275216e-06, "loss": 0.0164, "num_tokens": 267157980.0, "reward": 2.489003896713257, "reward_std": 0.8085304498672485, "rewards/accuracy_reward/mean": 1.65625, "rewards/accuracy_reward/std": 2.9016621112823486, "rewards/ngram_similarity_reward/mean": 0.8327538967132568, "rewards/ngram_similarity_reward/std": 0.19913837313652039, "step": 1674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 410.8125, "completions/mean_terminated_length": 410.8125, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.7496084135153278, "frac_reward_zero_std": 0.0, "grad_norm": 0.12702973186969757, "learning_rate": 3.824655558818976e-06, "loss": 0.0109, "num_tokens": 267287584.0, "reward": 4.820849895477295, "reward_std": 0.1471467763185501, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.8208498954772949, "rewards/ngram_similarity_reward/std": 0.235096737742424, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 505.234375, "completions/mean_terminated_length": 505.234375, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.7500559409263817, "frac_reward_zero_std": 0.0, "grad_norm": 0.09892650693655014, "learning_rate": 3.823192664048936e-06, "loss": 0.0008, "num_tokens": 267458399.0, "reward": 3.194218873977661, "reward_std": 0.08959155529737473, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6942191123962402, "rewards/ngram_similarity_reward/std": 0.254846453666687, "step": 1676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 413.5, "completions/mean_terminated_length": 360.774169921875, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.7505034683374356, "frac_reward_zero_std": 0.25, "grad_norm": 0.1347137987613678, "learning_rate": 3.82172918176595e-06, "loss": 0.0065, "num_tokens": 267674559.0, "reward": 1.778525710105896, "reward_std": 0.16010552644729614, "rewards/accuracy_reward/mean": 0.984375, "rewards/accuracy_reward/std": 2.6306629180908203, "rewards/ngram_similarity_reward/mean": 0.7941508293151855, "rewards/ngram_similarity_reward/std": 0.32695913314819336, "step": 1677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 558.28125, "completions/mean_terminated_length": 534.6349487304688, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.7509509957484896, "frac_reward_zero_std": 0.0, "grad_norm": 0.10573645681142807, "learning_rate": 3.820265112771192e-06, "loss": -0.0171, "num_tokens": 267826433.0, "reward": 3.6088709831237793, "reward_std": 0.9199876189231873, "rewards/accuracy_reward/mean": 3.3125, "rewards/accuracy_reward/std": 2.948634386062622, "rewards/ngram_similarity_reward/mean": 0.29637086391448975, "rewards/ngram_similarity_reward/std": 0.1301681399345398, "step": 1678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 522.46875, "completions/mean_terminated_length": 522.46875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.7513985231595435, "frac_reward_zero_std": 0.25, "grad_norm": 0.0744960680603981, "learning_rate": 3.818800457866158e-06, "loss": -0.0546, "num_tokens": 267986175.0, "reward": 4.9039812088012695, "reward_std": 1.5562037229537964, "rewards/accuracy_reward/mean": 4.09375, "rewards/accuracy_reward/std": 2.5617377758026123, "rewards/ngram_similarity_reward/mean": 0.8102311491966248, "rewards/ngram_similarity_reward/std": 0.200358584523201, "step": 1679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 473.15625, "completions/mean_terminated_length": 473.15625, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.7518460505705975, "frac_reward_zero_std": 0.25, "grad_norm": 0.09306388348340988, "learning_rate": 3.817335217852664e-06, "loss": 0.0059, "num_tokens": 268119897.0, "reward": 5.6161651611328125, "reward_std": 0.8234336972236633, "rewards/accuracy_reward/mean": 4.65625, "rewards/accuracy_reward/std": 2.102294683456421, "rewards/ngram_similarity_reward/mean": 0.9599153995513916, "rewards/ngram_similarity_reward/std": 0.31456896662712097, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 494.0, "completions/mean_terminated_length": 494.0, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.7522935779816514, "frac_reward_zero_std": 0.0, "grad_norm": 0.1134127527475357, "learning_rate": 3.8158693935328485e-06, "loss": -0.0021, "num_tokens": 268301369.0, "reward": 4.461511611938477, "reward_std": 0.17666134238243103, "rewards/accuracy_reward/mean": 3.984375, "rewards/accuracy_reward/std": 2.648702621459961, "rewards/ngram_similarity_reward/mean": 0.4771363437175751, "rewards/ngram_similarity_reward/std": 0.3310698866844177, "step": 1681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 527.53125, "completions/mean_terminated_length": 527.53125, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.7527411053927053, "frac_reward_zero_std": 0.0, "grad_norm": 0.09310425817966461, "learning_rate": 3.814402985709167e-06, "loss": -0.0041, "num_tokens": 268438523.0, "reward": 3.3843183517456055, "reward_std": 0.16243869066238403, "rewards/accuracy_reward/mean": 2.484375, "rewards/accuracy_reward/std": 3.0419929027557373, "rewards/ngram_similarity_reward/mean": 0.899943470954895, "rewards/ngram_similarity_reward/std": 0.33668068051338196, "step": 1682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 461.125, "completions/mean_terminated_length": 461.125, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.7531886328037593, "frac_reward_zero_std": 0.0, "grad_norm": 0.12268368154764175, "learning_rate": 3.8129359951843963e-06, "loss": 0.0474, "num_tokens": 268600499.0, "reward": 5.946240425109863, "reward_std": 1.2668113708496094, "rewards/accuracy_reward/mean": 5.203125, "rewards/accuracy_reward/std": 1.3531819581985474, "rewards/ngram_similarity_reward/mean": 0.7431154251098633, "rewards/ngram_similarity_reward/std": 0.3180932402610779, "step": 1683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 484.71875, "completions/mean_terminated_length": 459.90478515625, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.7536361602148132, "frac_reward_zero_std": 0.0, "grad_norm": 0.10037180781364441, "learning_rate": 3.811468422761631e-06, "loss": 0.0605, "num_tokens": 268782273.0, "reward": 6.026999473571777, "reward_std": 0.5377567410469055, "rewards/accuracy_reward/mean": 5.40625, "rewards/accuracy_reward/std": 0.7500000596046448, "rewards/ngram_similarity_reward/mean": 0.6207495331764221, "rewards/ngram_similarity_reward/std": 0.32750198245048523, "step": 1684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 443.203125, "completions/mean_terminated_length": 443.203125, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.7540836876258671, "frac_reward_zero_std": 0.25, "grad_norm": 0.1014128252863884, "learning_rate": 3.8100002692442855e-06, "loss": 0.0167, "num_tokens": 268932286.0, "reward": 2.9368717670440674, "reward_std": 0.045694585889577866, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.43687185645103455, "rewards/ngram_similarity_reward/std": 0.22525519132614136, "step": 1685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 479.1875, "completions/mean_terminated_length": 479.1875, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.754531215036921, "frac_reward_zero_std": 0.25, "grad_norm": 0.08453965187072754, "learning_rate": 3.8085315354360917e-06, "loss": -0.0105, "num_tokens": 269097226.0, "reward": 6.354506492614746, "reward_std": 0.05074494332075119, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.8545065522193909, "rewards/ngram_similarity_reward/std": 0.27964460849761963, "step": 1686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 416.03125, "completions/mean_terminated_length": 416.03125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.7549787424479749, "frac_reward_zero_std": 0.0, "grad_norm": 0.0906090959906578, "learning_rate": 3.807062222141099e-06, "loss": 0.0053, "num_tokens": 269278268.0, "reward": 4.438193321228027, "reward_std": 0.6636765003204346, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.6256935596466064, "rewards/ngram_similarity_reward/std": 0.28762826323509216, "step": 1687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1198.0, "completions/max_terminated_length": 1198.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 576.0, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.7554262698590288, "frac_reward_zero_std": 0.25, "grad_norm": 0.0749097540974617, "learning_rate": 3.805592330163675e-06, "loss": -0.0114, "num_tokens": 269426076.0, "reward": 4.431105136871338, "reward_std": 0.4843612611293793, "rewards/accuracy_reward/mean": 3.890625, "rewards/accuracy_reward/std": 2.6998953819274902, "rewards/ngram_similarity_reward/mean": 0.5404797792434692, "rewards/ngram_similarity_reward/std": 0.2098207175731659, "step": 1688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 505.078125, "completions/mean_terminated_length": 505.078125, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.7558737972700827, "frac_reward_zero_std": 0.0, "grad_norm": 0.10263252258300781, "learning_rate": 3.804121860308502e-06, "loss": 0.0431, "num_tokens": 269539921.0, "reward": 5.977553367614746, "reward_std": 0.07637765258550644, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.4775530993938446, "rewards/ngram_similarity_reward/std": 0.18223613500595093, "step": 1689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1534.0, "completions/mean_length": 701.5625, "completions/mean_terminated_length": 587.4576416015625, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.7563213246811367, "frac_reward_zero_std": 0.0, "grad_norm": 0.13813172280788422, "learning_rate": 3.8026508133805806e-06, "loss": -0.01, "num_tokens": 269707701.0, "reward": 2.7967331409454346, "reward_std": 0.5754700899124146, "rewards/accuracy_reward/mean": 2.3125, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.4842332601547241, "rewards/ngram_similarity_reward/std": 0.23618005216121674, "step": 1690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 453.140625, "completions/mean_terminated_length": 453.140625, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.7567688520921907, "frac_reward_zero_std": 0.0, "grad_norm": 0.11035829782485962, "learning_rate": 3.801179190185227e-06, "loss": -0.0044, "num_tokens": 269895326.0, "reward": 0.7430910468101501, "reward_std": 0.8383191823959351, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 1.8874586820602417, "rewards/ngram_similarity_reward/mean": 0.5868409872055054, "rewards/ngram_similarity_reward/std": 0.20448924601078033, "step": 1691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 445.28125, "completions/mean_terminated_length": 445.28125, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.7572163795032446, "frac_reward_zero_std": 0.0, "grad_norm": 0.16396446526050568, "learning_rate": 3.799706991528072e-06, "loss": 0.0013, "num_tokens": 270028832.0, "reward": 4.434956073760986, "reward_std": 0.7647863626480103, "rewards/accuracy_reward/mean": 3.71875, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.7162062525749207, "rewards/ngram_similarity_reward/std": 0.3267899751663208, "step": 1692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 448.203125, "completions/mean_terminated_length": 422.8095397949219, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.7576639069142985, "frac_reward_zero_std": 0.25, "grad_norm": 0.10294193774461746, "learning_rate": 3.7982342182150627e-06, "loss": -0.0186, "num_tokens": 270168109.0, "reward": 4.480494499206543, "reward_std": 0.8373005390167236, "rewards/accuracy_reward/mean": 3.4375, "rewards/accuracy_reward/std": 2.872281312942505, "rewards/ngram_similarity_reward/mean": 1.0429949760437012, "rewards/ngram_similarity_reward/std": 0.19750304520130157, "step": 1693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 821.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 504.078125, "completions/mean_terminated_length": 504.078125, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.7581114343253524, "frac_reward_zero_std": 0.0, "grad_norm": 0.12504678964614868, "learning_rate": 3.7967608710524596e-06, "loss": -0.0071, "num_tokens": 270353554.0, "reward": 1.8410844802856445, "reward_std": 1.1014564037322998, "rewards/accuracy_reward/mean": 1.28125, "rewards/accuracy_reward/std": 2.7629566192626953, "rewards/ngram_similarity_reward/mean": 0.5598344206809998, "rewards/ngram_similarity_reward/std": 0.330642431974411, "step": 1694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 512.375, "completions/mean_terminated_length": 462.83868408203125, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.7585589617364064, "frac_reward_zero_std": 0.0, "grad_norm": 0.13782398402690887, "learning_rate": 3.7952869508468375e-06, "loss": 0.0321, "num_tokens": 270513194.0, "reward": 3.069380760192871, "reward_std": 1.9833970069885254, "rewards/accuracy_reward/mean": 2.328125, "rewards/accuracy_reward/std": 3.109405755996704, "rewards/ngram_similarity_reward/mean": 0.7412558197975159, "rewards/ngram_similarity_reward/std": 0.23963335156440735, "step": 1695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 974.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 584.734375, "completions/mean_terminated_length": 584.734375, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.7590064891474603, "frac_reward_zero_std": 0.25, "grad_norm": 0.08866584300994873, "learning_rate": 3.793812458405086e-06, "loss": 0.0096, "num_tokens": 270672537.0, "reward": 3.525939464569092, "reward_std": 0.8003028631210327, "rewards/accuracy_reward/mean": 3.25, "rewards/accuracy_reward/std": 2.9277002811431885, "rewards/ngram_similarity_reward/mean": 0.27593910694122314, "rewards/ngram_similarity_reward/std": 0.16110624372959137, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 550.828125, "completions/mean_terminated_length": 527.0635375976562, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.7594540165585142, "frac_reward_zero_std": 0.0, "grad_norm": 0.12685289978981018, "learning_rate": 3.792337394534407e-06, "loss": -0.0162, "num_tokens": 270880942.0, "reward": 4.023468494415283, "reward_std": 1.6493728160858154, "rewards/accuracy_reward/mean": 3.21875, "rewards/accuracy_reward/std": 2.9732606410980225, "rewards/ngram_similarity_reward/mean": 0.8047187328338623, "rewards/ngram_similarity_reward/std": 0.2672255039215088, "step": 1697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 554.03125, "completions/mean_terminated_length": 530.3175048828125, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.7599015439695681, "frac_reward_zero_std": 0.0, "grad_norm": 0.08748823404312134, "learning_rate": 3.7908617600423146e-06, "loss": 0.0346, "num_tokens": 271034832.0, "reward": 5.809333801269531, "reward_std": 1.2132689952850342, "rewards/accuracy_reward/mean": 4.921875, "rewards/accuracy_reward/std": 1.8153201341629028, "rewards/ngram_similarity_reward/mean": 0.8874589204788208, "rewards/ngram_similarity_reward/std": 0.20694543421268463, "step": 1698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 392.015625, "completions/mean_terminated_length": 392.015625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.760349071380622, "frac_reward_zero_std": 0.0, "grad_norm": 0.13442236185073853, "learning_rate": 3.789385555736638e-06, "loss": 0.0282, "num_tokens": 271186993.0, "reward": 6.382952690124512, "reward_std": 0.2047431319952011, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.8829526305198669, "rewards/ngram_similarity_reward/std": 0.29756835103034973, "step": 1699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 458.203125, "completions/mean_terminated_length": 458.203125, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.7607965987916759, "frac_reward_zero_std": 0.0, "grad_norm": 0.10752265900373459, "learning_rate": 3.7879087824255155e-06, "loss": 0.0086, "num_tokens": 271334814.0, "reward": 6.193717956542969, "reward_std": 0.10046614706516266, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.6937180757522583, "rewards/ngram_similarity_reward/std": 0.2641277313232422, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 442.3125, "completions/mean_terminated_length": 442.3125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.76124412620273, "frac_reward_zero_std": 0.0, "grad_norm": 0.1317412108182907, "learning_rate": 3.7864314409173977e-06, "loss": 0.0131, "num_tokens": 271500034.0, "reward": 3.051396369934082, "reward_std": 1.8890447616577148, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.645146369934082, "rewards/ngram_similarity_reward/std": 0.323547899723053, "step": 1701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 395.84375, "completions/mean_terminated_length": 395.84375, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.7616916536137839, "frac_reward_zero_std": 0.0, "grad_norm": 0.10261240601539612, "learning_rate": 3.7849535320210456e-06, "loss": 0.0094, "num_tokens": 271673816.0, "reward": 6.512883186340332, "reward_std": 0.15527239441871643, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 1.0128833055496216, "rewards/ngram_similarity_reward/std": 0.1972905546426773, "step": 1702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 530.703125, "completions/mean_terminated_length": 530.703125, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.7621391810248378, "frac_reward_zero_std": 0.0, "grad_norm": 0.1261419802904129, "learning_rate": 3.7834750565455337e-06, "loss": -0.0152, "num_tokens": 271904837.0, "reward": 3.066795587539673, "reward_std": 0.5125082731246948, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.6605455875396729, "rewards/ngram_similarity_reward/std": 0.3532313108444214, "step": 1703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 494.140625, "completions/mean_terminated_length": 469.4762268066406, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.7625867084358917, "frac_reward_zero_std": 0.0, "grad_norm": 0.0991603285074234, "learning_rate": 3.7819960153002423e-06, "loss": -0.003, "num_tokens": 272044974.0, "reward": 3.3295774459838867, "reward_std": 0.6637980341911316, "rewards/accuracy_reward/mean": 2.671875, "rewards/accuracy_reward/std": 3.037097215652466, "rewards/ngram_similarity_reward/mean": 0.6577024459838867, "rewards/ngram_similarity_reward/std": 0.24875979125499725, "step": 1704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 461.53125, "completions/mean_terminated_length": 461.53125, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.7630342358469456, "frac_reward_zero_std": 0.25, "grad_norm": 0.1032610833644867, "learning_rate": 3.7805164090948658e-06, "loss": -0.0488, "num_tokens": 272190544.0, "reward": 3.4105939865112305, "reward_std": 1.555128812789917, "rewards/accuracy_reward/mean": 2.875, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.5355940461158752, "rewards/ngram_similarity_reward/std": 0.1661282628774643, "step": 1705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 491.5, "completions/mean_terminated_length": 466.7936706542969, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.7634817632579995, "frac_reward_zero_std": 0.0, "grad_norm": 0.09144928306341171, "learning_rate": 3.779036238739404e-06, "loss": -0.0045, "num_tokens": 272327696.0, "reward": 5.4266486167907715, "reward_std": 0.7983545660972595, "rewards/accuracy_reward/mean": 4.9375, "rewards/accuracy_reward/std": 1.7627090215682983, "rewards/ngram_similarity_reward/mean": 0.4891488254070282, "rewards/ngram_similarity_reward/std": 0.2724377512931824, "step": 1706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 554.875, "completions/mean_terminated_length": 531.1746215820312, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.7639292906690535, "frac_reward_zero_std": 0.0, "grad_norm": 0.11611030995845795, "learning_rate": 3.7775555050441693e-06, "loss": 0.0478, "num_tokens": 272491048.0, "reward": 3.4665048122406006, "reward_std": 1.703005313873291, "rewards/accuracy_reward/mean": 2.71875, "rewards/accuracy_reward/std": 3.08847713470459, "rewards/ngram_similarity_reward/mean": 0.7477551102638245, "rewards/ngram_similarity_reward/std": 0.30098485946655273, "step": 1707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 441.015625, "completions/mean_terminated_length": 441.015625, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.7643768180801074, "frac_reward_zero_std": 0.25, "grad_norm": 0.0940161719918251, "learning_rate": 3.7760742088197794e-06, "loss": 0.0098, "num_tokens": 272644425.0, "reward": 4.572781562805176, "reward_std": 0.4228802025318146, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.6665315628051758, "rewards/ngram_similarity_reward/std": 0.4215388596057892, "step": 1708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 878.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 468.671875, "completions/mean_terminated_length": 468.671875, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.7648243454911613, "frac_reward_zero_std": 0.25, "grad_norm": 0.11359170079231262, "learning_rate": 3.774592350877161e-06, "loss": -0.0031, "num_tokens": 272790532.0, "reward": 0.6768046617507935, "reward_std": 0.7949711680412292, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 1.7354841232299805, "rewards/ngram_similarity_reward/mean": 0.36430463194847107, "rewards/ngram_similarity_reward/std": 0.21512088179588318, "step": 1709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 501.953125, "completions/mean_terminated_length": 501.953125, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.7652718729022152, "frac_reward_zero_std": 0.0, "grad_norm": 0.09563387185335159, "learning_rate": 3.7731099320275484e-06, "loss": 0.002, "num_tokens": 272966801.0, "reward": 3.2861225605010986, "reward_std": 0.14564277231693268, "rewards/accuracy_reward/mean": 2.484375, "rewards/accuracy_reward/std": 3.0419929027557373, "rewards/ngram_similarity_reward/mean": 0.8017475605010986, "rewards/ngram_similarity_reward/std": 0.1357528418302536, "step": 1710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 419.34375, "completions/mean_terminated_length": 419.34375, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.7657194003132692, "frac_reward_zero_std": 0.25, "grad_norm": 0.1052631288766861, "learning_rate": 3.7716269530824835e-06, "loss": 0.0322, "num_tokens": 273093031.0, "reward": 3.015035629272461, "reward_std": 1.1161115169525146, "rewards/accuracy_reward/mean": 2.125, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.8900353908538818, "rewards/ngram_similarity_reward/std": 0.22037431597709656, "step": 1711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1088.0, "completions/max_terminated_length": 1088.0, "completions/mean_length": 465.921875, "completions/mean_terminated_length": 465.921875, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.7661669277243232, "frac_reward_zero_std": 0.0, "grad_norm": 0.11694445461034775, "learning_rate": 3.770143414853814e-06, "loss": -0.0549, "num_tokens": 273235154.0, "reward": 1.3984278440475464, "reward_std": 1.64403235912323, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 2.3603873252868652, "rewards/ngram_similarity_reward/mean": 0.7734278440475464, "rewards/ngram_similarity_reward/std": 0.29485347867012024, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 758.0, "completions/max_terminated_length": 758.0, "completions/mean_length": 470.265625, "completions/mean_terminated_length": 470.265625, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.7666144551353771, "frac_reward_zero_std": 0.25, "grad_norm": 0.18651999533176422, "learning_rate": 3.7686593181536946e-06, "loss": 0.0215, "num_tokens": 273364435.0, "reward": 6.231356620788574, "reward_std": 0.06699755787849426, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.7313565611839294, "rewards/ngram_similarity_reward/std": 0.3515256941318512, "step": 1713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 667.796875, "completions/mean_terminated_length": 470.6250305175781, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.767061982546431, "frac_reward_zero_std": 0.0, "grad_norm": 0.10543458163738251, "learning_rate": 3.7671746637945845e-06, "loss": -0.1146, "num_tokens": 273501654.0, "reward": 3.568516254425049, "reward_std": 2.31913161277771, "rewards/accuracy_reward/mean": 2.875, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.6935162544250488, "rewards/ngram_similarity_reward/std": 0.444365531206131, "step": 1714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 453.03125, "completions/mean_terminated_length": 453.03125, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.7675095099574849, "frac_reward_zero_std": 0.25, "grad_norm": 0.09862305223941803, "learning_rate": 3.765689452589249e-06, "loss": 0.0132, "num_tokens": 273623208.0, "reward": 4.911007881164551, "reward_std": 0.06254077702760696, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.9110076427459717, "rewards/ngram_similarity_reward/std": 0.22745934128761292, "step": 1715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 496.78125, "completions/mean_terminated_length": 472.15875244140625, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.7679570373685388, "frac_reward_zero_std": 0.0, "grad_norm": 0.11860388517379761, "learning_rate": 3.764203685350759e-06, "loss": 0.0158, "num_tokens": 273849834.0, "reward": 4.710654258728027, "reward_std": 1.5064451694488525, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7106548547744751, "rewards/ngram_similarity_reward/std": 0.35538649559020996, "step": 1716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 549.90625, "completions/mean_terminated_length": 422.94915771484375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.7684045647795927, "frac_reward_zero_std": 0.0, "grad_norm": 0.11047927290201187, "learning_rate": 3.7627173628924878e-06, "loss": 0.0317, "num_tokens": 273995604.0, "reward": 4.477602958679199, "reward_std": 0.08160172402858734, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.4776029586791992, "rewards/ngram_similarity_reward/std": 0.18633443117141724, "step": 1717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 486.5, "completions/mean_terminated_length": 461.7143249511719, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.7688520921906467, "frac_reward_zero_std": 0.0, "grad_norm": 0.10446258634328842, "learning_rate": 3.7612304860281142e-06, "loss": -0.0119, "num_tokens": 274136580.0, "reward": 3.9350204467773438, "reward_std": 0.8723170757293701, "rewards/accuracy_reward/mean": 3.15625, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.7787700891494751, "rewards/ngram_similarity_reward/std": 0.22662793099880219, "step": 1718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 371.171875, "completions/mean_terminated_length": 371.171875, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.7692996196017006, "frac_reward_zero_std": 0.0, "grad_norm": 0.14761660993099213, "learning_rate": 3.7597430555716204e-06, "loss": 0.0037, "num_tokens": 274312911.0, "reward": 3.5941073894500732, "reward_std": 0.7306583523750305, "rewards/accuracy_reward/mean": 2.875, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.7191075086593628, "rewards/ngram_similarity_reward/std": 0.36091846227645874, "step": 1719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 512.484375, "completions/mean_terminated_length": 512.484375, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.7697471470127545, "frac_reward_zero_std": 0.25, "grad_norm": 0.08637955039739609, "learning_rate": 3.7582550723372912e-06, "loss": -0.001, "num_tokens": 274448222.0, "reward": 6.481459617614746, "reward_std": 0.08800999820232391, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.9814598560333252, "rewards/ngram_similarity_reward/std": 0.13310606777668, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1366.0, "completions/max_terminated_length": 1366.0, "completions/mean_length": 589.578125, "completions/mean_terminated_length": 589.578125, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.7701946744238085, "frac_reward_zero_std": 0.0, "grad_norm": 0.126104474067688, "learning_rate": 3.7567665371397137e-06, "loss": 0.0233, "num_tokens": 274696915.0, "reward": 3.328207492828369, "reward_std": 0.5135577917098999, "rewards/accuracy_reward/mean": 2.578125, "rewards/accuracy_reward/std": 3.0410144329071045, "rewards/ngram_similarity_reward/mean": 0.7500826120376587, "rewards/ngram_similarity_reward/std": 0.2727869749069214, "step": 1721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 565.15625, "completions/mean_terminated_length": 541.6190795898438, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.7706422018348624, "frac_reward_zero_std": 0.0, "grad_norm": 0.08273998647928238, "learning_rate": 3.7552774507937787e-06, "loss": -0.0139, "num_tokens": 274832173.0, "reward": 4.147893905639648, "reward_std": 1.1679359674453735, "rewards/accuracy_reward/mean": 3.53125, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.6166439056396484, "rewards/ngram_similarity_reward/std": 0.17714762687683105, "step": 1722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 318.828125, "completions/mean_terminated_length": 318.828125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.7710897292459163, "frac_reward_zero_std": 0.25, "grad_norm": 0.1377553939819336, "learning_rate": 3.753787814114677e-06, "loss": -0.0052, "num_tokens": 274980034.0, "reward": 3.1438984870910645, "reward_std": 0.4386993646621704, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.5501485466957092, "rewards/ngram_similarity_reward/std": 0.19643662869930267, "step": 1723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 620.140625, "completions/mean_terminated_length": 524.9500122070312, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.7715372566569703, "frac_reward_zero_std": 0.0, "grad_norm": 0.10216798633337021, "learning_rate": 3.7522976279179013e-06, "loss": 0.0368, "num_tokens": 275136171.0, "reward": 5.586757659912109, "reward_std": 1.5851632356643677, "rewards/accuracy_reward/mean": 4.84375, "rewards/accuracy_reward/std": 1.8874586820602417, "rewards/ngram_similarity_reward/mean": 0.7430075407028198, "rewards/ngram_similarity_reward/std": 0.22118467092514038, "step": 1724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 468.421875, "completions/mean_terminated_length": 468.421875, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.7719847840680242, "frac_reward_zero_std": 0.0, "grad_norm": 0.1122996062040329, "learning_rate": 3.7508068930192455e-06, "loss": -0.0024, "num_tokens": 275314214.0, "reward": 3.8260443210601807, "reward_std": 1.3485864400863647, "rewards/accuracy_reward/mean": 3.15625, "rewards/accuracy_reward/std": 2.950484275817871, "rewards/ngram_similarity_reward/mean": 0.6697943210601807, "rewards/ngram_similarity_reward/std": 0.2788363993167877, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 464.921875, "completions/mean_terminated_length": 464.921875, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.7724323114790781, "frac_reward_zero_std": 0.0, "grad_norm": 0.13570821285247803, "learning_rate": 3.749315610234802e-06, "loss": -0.013, "num_tokens": 275434529.0, "reward": 5.731655120849609, "reward_std": 0.5602239966392517, "rewards/accuracy_reward/mean": 5.3125, "rewards/accuracy_reward/std": 1.0522085428237915, "rewards/ngram_similarity_reward/mean": 0.419155091047287, "rewards/ngram_similarity_reward/std": 0.2204188108444214, "step": 1726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 407.65625, "completions/mean_terminated_length": 407.65625, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.772879838890132, "frac_reward_zero_std": 0.25, "grad_norm": 0.09329795837402344, "learning_rate": 3.7478237803809677e-06, "loss": -0.0051, "num_tokens": 275629819.0, "reward": 3.1534743309020996, "reward_std": 1.598434567451477, "rewards/accuracy_reward/mean": 2.3125, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.8409743309020996, "rewards/ngram_similarity_reward/std": 0.2519665062427521, "step": 1727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 388.265625, "completions/mean_terminated_length": 388.265625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.7733273663011859, "frac_reward_zero_std": 0.5, "grad_norm": 0.08827347308397293, "learning_rate": 3.7463314042744336e-06, "loss": 0.0172, "num_tokens": 275784204.0, "reward": 4.787602424621582, "reward_std": 0.42931514978408813, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.8813523054122925, "rewards/ngram_similarity_reward/std": 0.28254538774490356, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 482.546875, "completions/mean_terminated_length": 482.546875, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.7737748937122398, "frac_reward_zero_std": 0.0, "grad_norm": 0.16683056950569153, "learning_rate": 3.7448384827321932e-06, "loss": 0.0021, "num_tokens": 275946383.0, "reward": 4.025887966156006, "reward_std": 1.2076332569122314, "rewards/accuracy_reward/mean": 3.4375, "rewards/accuracy_reward/std": 2.872281312942505, "rewards/ngram_similarity_reward/mean": 0.5883880257606506, "rewards/ngram_similarity_reward/std": 0.13875126838684082, "step": 1729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1591.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 491.875, "completions/mean_terminated_length": 456.4193420410156, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.7742224211232938, "frac_reward_zero_std": 0.0, "grad_norm": 0.16087134182453156, "learning_rate": 3.7433450165715372e-06, "loss": 0.0374, "num_tokens": 276168649.0, "reward": 4.721775054931641, "reward_std": 0.24048519134521484, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7217749357223511, "rewards/ngram_similarity_reward/std": 0.3546585142612457, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1195.0, "completions/max_terminated_length": 1195.0, "completions/mean_length": 494.21875, "completions/mean_terminated_length": 494.21875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.7746699485343477, "frac_reward_zero_std": 0.0, "grad_norm": 0.13882260024547577, "learning_rate": 3.7418510066100544e-06, "loss": 0.0169, "num_tokens": 276310327.0, "reward": 2.9754281044006348, "reward_std": 0.050156496465206146, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.47542816400527954, "rewards/ngram_similarity_reward/std": 0.19159510731697083, "step": 1731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 444.359375, "completions/mean_terminated_length": 444.359375, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.7751174759454017, "frac_reward_zero_std": 0.25, "grad_norm": 0.07742560654878616, "learning_rate": 3.740356453665632e-06, "loss": 0.0196, "num_tokens": 276477038.0, "reward": 6.4582014083862305, "reward_std": 0.10869987308979034, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.9582017660140991, "rewards/ngram_similarity_reward/std": 0.1663198322057724, "step": 1732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 369.203125, "completions/mean_terminated_length": 369.203125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.7755650033564556, "frac_reward_zero_std": 0.25, "grad_norm": 0.08513515442609787, "learning_rate": 3.738861358556455e-06, "loss": 0.007, "num_tokens": 276607531.0, "reward": 6.405516624450684, "reward_std": 0.11752209067344666, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.9055166244506836, "rewards/ngram_similarity_reward/std": 0.29450514912605286, "step": 1733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 447.515625, "completions/mean_terminated_length": 447.515625, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.7760125307675095, "frac_reward_zero_std": 0.25, "grad_norm": 0.11043563485145569, "learning_rate": 3.7373657221010027e-06, "loss": -0.0045, "num_tokens": 276743020.0, "reward": 5.889714241027832, "reward_std": 0.8159486651420593, "rewards/accuracy_reward/mean": 5.03125, "rewards/accuracy_reward/std": 1.6229382753372192, "rewards/ngram_similarity_reward/mean": 0.8584644794464111, "rewards/ngram_similarity_reward/std": 0.18844884634017944, "step": 1734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 461.734375, "completions/mean_terminated_length": 410.56451416015625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.7764600581785635, "frac_reward_zero_std": 0.25, "grad_norm": 0.17382760345935822, "learning_rate": 3.735869545118053e-06, "loss": -0.0272, "num_tokens": 276867899.0, "reward": 0.08718939870595932, "reward_std": 0.5110728740692139, "rewards/accuracy_reward/mean": -0.53125, "rewards/accuracy_reward/std": 0.8351171612739563, "rewards/ngram_similarity_reward/mean": 0.6184394359588623, "rewards/ngram_similarity_reward/std": 0.1825799196958542, "step": 1735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 479.953125, "completions/mean_terminated_length": 479.953125, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.7769075855896174, "frac_reward_zero_std": 0.0, "grad_norm": 0.10287876427173615, "learning_rate": 3.73437282842668e-06, "loss": -0.0017, "num_tokens": 277028088.0, "reward": 4.718317985534668, "reward_std": 0.22731512784957886, "rewards/accuracy_reward/mean": 3.984375, "rewards/accuracy_reward/std": 2.648702621459961, "rewards/ngram_similarity_reward/mean": 0.733942449092865, "rewards/ngram_similarity_reward/std": 0.34278374910354614, "step": 1736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 904.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 459.65625, "completions/mean_terminated_length": 459.65625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.7773551130006713, "frac_reward_zero_std": 0.0, "grad_norm": 0.09848344326019287, "learning_rate": 3.7328755728462513e-06, "loss": -0.0324, "num_tokens": 277177938.0, "reward": 4.4768171310424805, "reward_std": 0.5262054800987244, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.5705671310424805, "rewards/ngram_similarity_reward/std": 0.27129846811294556, "step": 1737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 380.203125, "completions/mean_terminated_length": 380.203125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.7778026404117252, "frac_reward_zero_std": 0.5, "grad_norm": 0.08698117733001709, "learning_rate": 3.731377779196431e-06, "loss": -0.0085, "num_tokens": 277319519.0, "reward": 2.3189797401428223, "reward_std": 0.8595125675201416, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 2.872281312942505, "rewards/ngram_similarity_reward/mean": 0.7564799785614014, "rewards/ngram_similarity_reward/std": 0.3434167206287384, "step": 1738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 716.0, "completions/mean_length": 562.765625, "completions/mean_terminated_length": 436.8983154296875, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.7782501678227791, "frac_reward_zero_std": 0.0, "grad_norm": 0.09498909115791321, "learning_rate": 3.7298794482971773e-06, "loss": -0.1533, "num_tokens": 277471408.0, "reward": 5.730620384216309, "reward_std": 1.2200387716293335, "rewards/accuracy_reward/mean": 4.9375, "rewards/accuracy_reward/std": 1.7627090215682983, "rewards/ngram_similarity_reward/mean": 0.7931205034255981, "rewards/ngram_similarity_reward/std": 0.3747904598712921, "step": 1739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 603.5, "completions/mean_terminated_length": 580.5714721679688, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.778697695233833, "frac_reward_zero_std": 0.0, "grad_norm": 0.13159863650798798, "learning_rate": 3.7283805809687427e-06, "loss": -0.0029, "num_tokens": 277598768.0, "reward": 3.0115487575531006, "reward_std": 0.5999206304550171, "rewards/accuracy_reward/mean": 2.359375, "rewards/accuracy_reward/std": 3.075077533721924, "rewards/ngram_similarity_reward/mean": 0.652173638343811, "rewards/ngram_similarity_reward/std": 0.17643523216247559, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 433.03125, "completions/mean_terminated_length": 433.03125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.779145222644887, "frac_reward_zero_std": 0.25, "grad_norm": 0.10145675390958786, "learning_rate": 3.7268811780316726e-06, "loss": 0.0091, "num_tokens": 277718882.0, "reward": 3.3468050956726074, "reward_std": 0.6583366990089417, "rewards/accuracy_reward/mean": 2.78125, "rewards/accuracy_reward/std": 3.0103988647460938, "rewards/ngram_similarity_reward/mean": 0.5655550956726074, "rewards/ngram_similarity_reward/std": 0.34843239188194275, "step": 1741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 397.96875, "completions/mean_terminated_length": 397.96875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.779592750055941, "frac_reward_zero_std": 0.25, "grad_norm": 0.10779356956481934, "learning_rate": 3.725381240306807e-06, "loss": -0.0318, "num_tokens": 277857664.0, "reward": 6.0483808517456055, "reward_std": 0.10932482033967972, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.5483807921409607, "rewards/ngram_similarity_reward/std": 0.19631454348564148, "step": 1742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1445.0, "completions/max_terminated_length": 1445.0, "completions/mean_length": 751.46875, "completions/mean_terminated_length": 751.46875, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.7800402774669949, "frac_reward_zero_std": 0.0, "grad_norm": 0.06561101973056793, "learning_rate": 3.7238807686152773e-06, "loss": -0.0637, "num_tokens": 278055854.0, "reward": 3.3921141624450684, "reward_std": 1.6131346225738525, "rewards/accuracy_reward/mean": 2.671875, "rewards/accuracy_reward/std": 3.037097215652466, "rewards/ngram_similarity_reward/mean": 0.7202394008636475, "rewards/ngram_similarity_reward/std": 0.19713854789733887, "step": 1743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 411.828125, "completions/mean_terminated_length": 411.828125, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.7804878048780488, "frac_reward_zero_std": 0.25, "grad_norm": 0.12284394353628159, "learning_rate": 3.722379763778508e-06, "loss": 0.0036, "num_tokens": 278187603.0, "reward": 5.143313407897949, "reward_std": 1.265124797821045, "rewards/accuracy_reward/mean": 4.28125, "rewards/accuracy_reward/std": 2.4330317974090576, "rewards/ngram_similarity_reward/mean": 0.8620636463165283, "rewards/ngram_similarity_reward/std": 0.26387763023376465, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 817.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 501.3125, "completions/mean_terminated_length": 501.3125, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.7809353322891027, "frac_reward_zero_std": 0.25, "grad_norm": 0.09614390879869461, "learning_rate": 3.7208782266182153e-06, "loss": -0.0188, "num_tokens": 278337799.0, "reward": 2.9313480854034424, "reward_std": 0.04841892421245575, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.43134796619415283, "rewards/ngram_similarity_reward/std": 0.1135719045996666, "step": 1745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 505.640625, "completions/mean_terminated_length": 505.640625, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.7813828597001566, "frac_reward_zero_std": 0.0, "grad_norm": 0.08681753277778625, "learning_rate": 3.7193761579564075e-06, "loss": -0.0226, "num_tokens": 278503488.0, "reward": 3.15793514251709, "reward_std": 0.09037813544273376, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6579351425170898, "rewards/ngram_similarity_reward/std": 0.2862730324268341, "step": 1746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 459.90625, "completions/mean_terminated_length": 459.90625, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.7818303871112106, "frac_reward_zero_std": 0.0, "grad_norm": 0.0824127346277237, "learning_rate": 3.7178735586153817e-06, "loss": 0.017, "num_tokens": 278653306.0, "reward": 4.810904502868652, "reward_std": 0.09839123487472534, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.8109046220779419, "rewards/ngram_similarity_reward/std": 0.23212027549743652, "step": 1747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 867.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 476.0, "completions/mean_terminated_length": 476.0, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.7822779145222645, "frac_reward_zero_std": 0.0, "grad_norm": 0.1105496883392334, "learning_rate": 3.716370429417728e-06, "loss": 0.02, "num_tokens": 278806858.0, "reward": 3.7836852073669434, "reward_std": 0.7806867361068726, "rewards/accuracy_reward/mean": 3.53125, "rewards/accuracy_reward/std": 2.839454174041748, "rewards/ngram_similarity_reward/mean": 0.252435564994812, "rewards/ngram_similarity_reward/std": 0.16177693009376526, "step": 1748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 494.140625, "completions/mean_terminated_length": 494.140625, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.7827254419333184, "frac_reward_zero_std": 0.0, "grad_norm": 0.14065444469451904, "learning_rate": 3.7148667711863253e-06, "loss": -0.0056, "num_tokens": 279012995.0, "reward": 1.6374659538269043, "reward_std": 0.7110859155654907, "rewards/accuracy_reward/mean": 1.140625, "rewards/accuracy_reward/std": 2.7566208839416504, "rewards/ngram_similarity_reward/mean": 0.4968408942222595, "rewards/ngram_similarity_reward/std": 0.195643812417984, "step": 1749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 525.65625, "completions/mean_terminated_length": 525.65625, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.7831729693443723, "frac_reward_zero_std": 0.5, "grad_norm": 0.05567352473735809, "learning_rate": 3.7133625847443426e-06, "loss": -0.0103, "num_tokens": 279174237.0, "reward": 3.3230068683624268, "reward_std": 0.04326403886079788, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.8230066895484924, "rewards/ngram_similarity_reward/std": 0.23887582123279572, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 754.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 487.984375, "completions/mean_terminated_length": 487.984375, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.7836204967554262, "frac_reward_zero_std": 0.0, "grad_norm": 0.09342952072620392, "learning_rate": 3.711857870915237e-06, "loss": 0.0294, "num_tokens": 279361340.0, "reward": 4.5902204513549805, "reward_std": 1.5695034265518188, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.7777203321456909, "rewards/ngram_similarity_reward/std": 0.2926555573940277, "step": 1751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 410.03125, "completions/mean_terminated_length": 384.0317687988281, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.7840680241664802, "frac_reward_zero_std": 0.25, "grad_norm": 0.16596359014511108, "learning_rate": 3.7103526305227565e-06, "loss": -0.0127, "num_tokens": 279463742.0, "reward": 5.742364883422852, "reward_std": 0.785595178604126, "rewards/accuracy_reward/mean": 5.03125, "rewards/accuracy_reward/std": 1.6229382753372192, "rewards/ngram_similarity_reward/mean": 0.711115300655365, "rewards/ngram_similarity_reward/std": 0.48928964138031006, "step": 1752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 452.484375, "completions/mean_terminated_length": 452.484375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.7845155515775342, "frac_reward_zero_std": 0.25, "grad_norm": 0.11051739007234573, "learning_rate": 3.7088468643909346e-06, "loss": 0.0023, "num_tokens": 279602941.0, "reward": 5.040217876434326, "reward_std": 0.06647270172834396, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 1.0402178764343262, "rewards/ngram_similarity_reward/std": 0.1375802755355835, "step": 1753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 935.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 549.875, "completions/mean_terminated_length": 549.875, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.7849630789885881, "frac_reward_zero_std": 0.0, "grad_norm": 0.07461415231227875, "learning_rate": 3.7073405733440955e-06, "loss": -0.0009, "num_tokens": 279748853.0, "reward": 4.671903610229492, "reward_std": 0.1308913230895996, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.671903669834137, "rewards/ngram_similarity_reward/std": 0.26790651679039, "step": 1754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 370.5, "completions/mean_terminated_length": 370.5, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.785410606399642, "frac_reward_zero_std": 0.25, "grad_norm": 0.09640100598335266, "learning_rate": 3.7058337582068476e-06, "loss": -0.0201, "num_tokens": 279904661.0, "reward": 3.4819796085357666, "reward_std": 0.39395928382873535, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.8882298469543457, "rewards/ngram_similarity_reward/std": 0.42067092657089233, "step": 1755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 409.375, "completions/mean_terminated_length": 409.375, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.7858581338106959, "frac_reward_zero_std": 0.25, "grad_norm": 0.14509481191635132, "learning_rate": 3.7043264198040897e-06, "loss": 0.0049, "num_tokens": 280145949.0, "reward": 2.6730215549468994, "reward_std": 0.7071306705474854, "rewards/accuracy_reward/mean": 2.109375, "rewards/accuracy_reward/std": 3.0164480209350586, "rewards/ngram_similarity_reward/mean": 0.5636464357376099, "rewards/ngram_similarity_reward/std": 0.2723451256752014, "step": 1756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 515.6875, "completions/mean_terminated_length": 515.6875, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.7863056612217498, "frac_reward_zero_std": 0.0, "grad_norm": 0.09780410677194595, "learning_rate": 3.7028185589610035e-06, "loss": 0.0056, "num_tokens": 280297049.0, "reward": 6.146766185760498, "reward_std": 0.6536001563072205, "rewards/accuracy_reward/mean": 5.3125, "rewards/accuracy_reward/std": 1.0522085428237915, "rewards/ngram_similarity_reward/mean": 0.834265947341919, "rewards/ngram_similarity_reward/std": 0.2860173285007477, "step": 1757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 439.15625, "completions/mean_terminated_length": 439.15625, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.7867531886328037, "frac_reward_zero_std": 0.25, "grad_norm": 0.10675951093435287, "learning_rate": 3.7013101765030597e-06, "loss": 0.0213, "num_tokens": 280432499.0, "reward": 4.9260478019714355, "reward_std": 0.06432859599590302, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.9260480403900146, "rewards/ngram_similarity_reward/std": 0.18244914710521698, "step": 1758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 458.71875, "completions/mean_terminated_length": 433.4920959472656, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.7872007160438577, "frac_reward_zero_std": 0.25, "grad_norm": 0.1172085776925087, "learning_rate": 3.6998012732560127e-06, "loss": -0.0153, "num_tokens": 280644529.0, "reward": 2.6222519874572754, "reward_std": 0.7941980957984924, "rewards/accuracy_reward/mean": 2.046875, "rewards/accuracy_reward/std": 3.080557107925415, "rewards/ngram_similarity_reward/mean": 0.5753771066665649, "rewards/ngram_similarity_reward/std": 0.2146225869655609, "step": 1759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 444.5, "completions/mean_terminated_length": 444.5, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.7876482434549116, "frac_reward_zero_std": 0.25, "grad_norm": 0.08716736733913422, "learning_rate": 3.698291850045902e-06, "loss": 0.0079, "num_tokens": 280783713.0, "reward": 3.271043539047241, "reward_std": 0.10971924662590027, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.7710434198379517, "rewards/ngram_similarity_reward/std": 0.2659313380718231, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 515.5, "completions/mean_terminated_length": 515.5, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.7880957708659655, "frac_reward_zero_std": 0.25, "grad_norm": 0.07007109373807907, "learning_rate": 3.6967819076990546e-06, "loss": -0.0025, "num_tokens": 280967249.0, "reward": 3.259934425354004, "reward_std": 0.5627215504646301, "rewards/accuracy_reward/mean": 2.6875, "rewards/accuracy_reward/std": 3.0178043842315674, "rewards/ngram_similarity_reward/mean": 0.5724344253540039, "rewards/ngram_similarity_reward/std": 0.5026712417602539, "step": 1761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 752.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 453.109375, "completions/mean_terminated_length": 453.109375, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.7885432982770195, "frac_reward_zero_std": 0.25, "grad_norm": 0.0714978277683258, "learning_rate": 3.695271447042077e-06, "loss": -0.0219, "num_tokens": 281114312.0, "reward": 4.756280422210693, "reward_std": 0.062186598777770996, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7562806606292725, "rewards/ngram_similarity_reward/std": 0.20702359080314636, "step": 1762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 509.125, "completions/mean_terminated_length": 509.125, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.7889908256880734, "frac_reward_zero_std": 0.25, "grad_norm": 0.06607312709093094, "learning_rate": 3.6937604689018634e-06, "loss": -0.0346, "num_tokens": 281232960.0, "reward": 6.391016960144043, "reward_std": 0.08699575811624527, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.8910167813301086, "rewards/ngram_similarity_reward/std": 0.2577274441719055, "step": 1763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 943.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 460.875, "completions/mean_terminated_length": 460.875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.7894383530991274, "frac_reward_zero_std": 0.5, "grad_norm": 0.06838354468345642, "learning_rate": 3.692248974105589e-06, "loss": 0.0448, "num_tokens": 281375896.0, "reward": 3.369481086730957, "reward_std": 0.39635372161865234, "rewards/accuracy_reward/mean": 2.578125, "rewards/accuracy_reward/std": 3.0410144329071045, "rewards/ngram_similarity_reward/mean": 0.791356086730957, "rewards/ngram_similarity_reward/std": 0.30302315950393677, "step": 1764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 514.453125, "completions/mean_terminated_length": 514.453125, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.7898858805101813, "frac_reward_zero_std": 0.0, "grad_norm": 0.08800314366817474, "learning_rate": 3.6907369634807132e-06, "loss": -0.0067, "num_tokens": 281549157.0, "reward": 5.869156837463379, "reward_std": 0.7548459768295288, "rewards/accuracy_reward/mean": 5.21875, "rewards/accuracy_reward/std": 1.2782522439956665, "rewards/ngram_similarity_reward/mean": 0.6504068374633789, "rewards/ngram_similarity_reward/std": 0.30050766468048096, "step": 1765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 438.5625, "completions/mean_terminated_length": 438.5625, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.7903334079212352, "frac_reward_zero_std": 0.0, "grad_norm": 0.090070940554142, "learning_rate": 3.6892244378549775e-06, "loss": -0.0097, "num_tokens": 281669769.0, "reward": 4.827174663543701, "reward_std": 0.10513508319854736, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.8271746635437012, "rewards/ngram_similarity_reward/std": 0.24546904861927032, "step": 1766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 443.4375, "completions/mean_terminated_length": 443.4375, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.7907809353322891, "frac_reward_zero_std": 0.0, "grad_norm": 0.103789322078228, "learning_rate": 3.687711398056404e-06, "loss": 0.0321, "num_tokens": 281818997.0, "reward": 3.0784761905670166, "reward_std": 2.0790152549743652, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.5784761309623718, "rewards/ngram_similarity_reward/std": 0.26098406314849854, "step": 1767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 864.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 445.875, "completions/mean_terminated_length": 445.875, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.791228462743343, "frac_reward_zero_std": 0.25, "grad_norm": 0.10282961279153824, "learning_rate": 3.6861978449132974e-06, "loss": 0.0062, "num_tokens": 281959437.0, "reward": 1.7224863767623901, "reward_std": 0.16980081796646118, "rewards/accuracy_reward/mean": 0.90625, "rewards/accuracy_reward/std": 2.688710927963257, "rewards/ngram_similarity_reward/mean": 0.8162364363670349, "rewards/ngram_similarity_reward/std": 0.3230360150337219, "step": 1768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 445.015625, "completions/mean_terminated_length": 445.015625, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.7916759901543969, "frac_reward_zero_std": 0.25, "grad_norm": 0.09563589841127396, "learning_rate": 3.684683779254245e-06, "loss": -0.0191, "num_tokens": 282092974.0, "reward": 6.333760738372803, "reward_std": 0.41795966029167175, "rewards/accuracy_reward/mean": 5.40625, "rewards/accuracy_reward/std": 0.7500000596046448, "rewards/ngram_similarity_reward/mean": 0.9275108575820923, "rewards/ngram_similarity_reward/std": 0.22054578363895416, "step": 1769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 379.234375, "completions/mean_terminated_length": 379.234375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.7921235175654509, "frac_reward_zero_std": 0.0, "grad_norm": 0.13181842863559723, "learning_rate": 3.6831692019081118e-06, "loss": 0.0045, "num_tokens": 282212301.0, "reward": 6.2495317459106445, "reward_std": 0.08606675267219543, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.749532163143158, "rewards/ngram_similarity_reward/std": 0.26874497532844543, "step": 1770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 554.671875, "completions/mean_terminated_length": 554.671875, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.7925710449765048, "frac_reward_zero_std": 0.0, "grad_norm": 0.07528512924909592, "learning_rate": 3.681654113704044e-06, "loss": -0.023, "num_tokens": 282384152.0, "reward": 4.789369583129883, "reward_std": 0.1011795625090599, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7893695831298828, "rewards/ngram_similarity_reward/std": 0.25145819783210754, "step": 1771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 420.796875, "completions/mean_terminated_length": 420.796875, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.7930185723875587, "frac_reward_zero_std": 0.25, "grad_norm": 0.10464499145746231, "learning_rate": 3.6801385154714676e-06, "loss": -0.0179, "num_tokens": 282589547.0, "reward": 1.6843279600143433, "reward_std": 0.05340495705604553, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.6843281388282776, "rewards/ngram_similarity_reward/std": 0.13969381153583527, "step": 1772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 462.640625, "completions/mean_terminated_length": 462.640625, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.7934660997986127, "frac_reward_zero_std": 0.25, "grad_norm": 0.09221204370260239, "learning_rate": 3.6786224080400886e-06, "loss": 0.011, "num_tokens": 282751956.0, "reward": 4.661773681640625, "reward_std": 0.5073695778846741, "rewards/accuracy_reward/mean": 3.890625, "rewards/accuracy_reward/std": 2.6998953819274902, "rewards/ngram_similarity_reward/mean": 0.7711489200592041, "rewards/ngram_similarity_reward/std": 0.41562578082084656, "step": 1773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 413.140625, "completions/mean_terminated_length": 413.140625, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.7939136272096666, "frac_reward_zero_std": 0.25, "grad_norm": 0.11431113630533218, "learning_rate": 3.6771057922398905e-06, "loss": 0.0065, "num_tokens": 282903709.0, "reward": 2.763495922088623, "reward_std": 1.5588185787200928, "rewards/accuracy_reward/mean": 2.125, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.6384957432746887, "rewards/ngram_similarity_reward/std": 0.19880542159080505, "step": 1774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 438.359375, "completions/mean_terminated_length": 438.359375, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.7943611546207205, "frac_reward_zero_std": 0.0, "grad_norm": 0.10721493512392044, "learning_rate": 3.6755886689011355e-06, "loss": -0.0138, "num_tokens": 283071956.0, "reward": 1.9768624305725098, "reward_std": 1.5460984706878662, "rewards/accuracy_reward/mean": 1.1875, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.7893625497817993, "rewards/ngram_similarity_reward/std": 0.27228429913520813, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 691.0, "completions/mean_length": 460.96875, "completions/mean_terminated_length": 435.7778015136719, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.7948086820317745, "frac_reward_zero_std": 0.5, "grad_norm": 0.08337391912937164, "learning_rate": 3.674071038854364e-06, "loss": -0.0193, "num_tokens": 283212034.0, "reward": 4.8831915855407715, "reward_std": 0.09472521394491196, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.883191704750061, "rewards/ngram_similarity_reward/std": 0.2588989734649658, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 402.265625, "completions/mean_terminated_length": 402.265625, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.7952562094428284, "frac_reward_zero_std": 0.0, "grad_norm": 0.13653582334518433, "learning_rate": 3.672552902930394e-06, "loss": 0.0153, "num_tokens": 283378675.0, "reward": 4.7928595542907715, "reward_std": 0.11162438988685608, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.7928594350814819, "rewards/ngram_similarity_reward/std": 0.2462923675775528, "step": 1777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 492.921875, "completions/mean_terminated_length": 468.2381286621094, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.7957037368538823, "frac_reward_zero_std": 0.0, "grad_norm": 0.10597686469554901, "learning_rate": 3.671034261960319e-06, "loss": 0.0321, "num_tokens": 283553006.0, "reward": 2.369813919067383, "reward_std": 1.3356808423995972, "rewards/accuracy_reward/mean": 1.65625, "rewards/accuracy_reward/std": 2.9016621112823486, "rewards/ngram_similarity_reward/mean": 0.7135640382766724, "rewards/ngram_similarity_reward/std": 0.18256360292434692, "step": 1778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 416.734375, "completions/mean_terminated_length": 390.8412780761719, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.7961512642649362, "frac_reward_zero_std": 0.0, "grad_norm": 0.12527470290660858, "learning_rate": 3.669515116775511e-06, "loss": 0.0801, "num_tokens": 283715805.0, "reward": 2.960556745529175, "reward_std": 0.07754544168710709, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.46055683493614197, "rewards/ngram_similarity_reward/std": 0.23064681887626648, "step": 1779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 807.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 547.859375, "completions/mean_terminated_length": 547.859375, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.7965987916759901, "frac_reward_zero_std": 0.25, "grad_norm": 0.0594840906560421, "learning_rate": 3.6679954682076158e-06, "loss": -0.0067, "num_tokens": 283880004.0, "reward": 6.205974578857422, "reward_std": 0.06984958052635193, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.7059743404388428, "rewards/ngram_similarity_reward/std": 0.29315268993377686, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 378.46875, "completions/mean_terminated_length": 378.46875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.797046319087044, "frac_reward_zero_std": 0.25, "grad_norm": 0.11517168581485748, "learning_rate": 3.6664753170885574e-06, "loss": -0.0129, "num_tokens": 284110754.0, "reward": 4.98866081237793, "reward_std": 0.08581934124231339, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.988660991191864, "rewards/ngram_similarity_reward/std": 0.23940570652484894, "step": 1781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 388.03125, "completions/mean_terminated_length": 388.03125, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.797493846498098, "frac_reward_zero_std": 0.25, "grad_norm": 0.10036811977624893, "learning_rate": 3.6649546642505324e-06, "loss": 0.0273, "num_tokens": 284260772.0, "reward": 4.801386833190918, "reward_std": 0.18152017891407013, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.801386833190918, "rewards/ngram_similarity_reward/std": 0.3517768681049347, "step": 1782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 538.703125, "completions/mean_terminated_length": 538.703125, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.797941373909152, "frac_reward_zero_std": 0.0, "grad_norm": 0.08701319992542267, "learning_rate": 3.663433510526014e-06, "loss": -0.0013, "num_tokens": 284418113.0, "reward": 2.274028778076172, "reward_std": 0.8565681576728821, "rewards/accuracy_reward/mean": 1.9375, "rewards/accuracy_reward/std": 2.9700891971588135, "rewards/ngram_similarity_reward/mean": 0.33652883768081665, "rewards/ngram_similarity_reward/std": 0.21484392881393433, "step": 1783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1211.0, "completions/max_terminated_length": 1211.0, "completions/mean_length": 509.15625, "completions/mean_terminated_length": 509.15625, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.7983889013202059, "frac_reward_zero_std": 0.0, "grad_norm": 0.1106950119137764, "learning_rate": 3.6619118567477474e-06, "loss": -0.0118, "num_tokens": 284573179.0, "reward": 5.514496326446533, "reward_std": 0.9562253952026367, "rewards/accuracy_reward/mean": 4.65625, "rewards/accuracy_reward/std": 2.102294683456421, "rewards/ngram_similarity_reward/mean": 0.8582462668418884, "rewards/ngram_similarity_reward/std": 0.21829448640346527, "step": 1784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 468.109375, "completions/mean_terminated_length": 468.109375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.7988364287312598, "frac_reward_zero_std": 0.0, "grad_norm": 0.0942654237151146, "learning_rate": 3.660389703748754e-06, "loss": 0.0083, "num_tokens": 284724146.0, "reward": 6.243251800537109, "reward_std": 0.4922480583190918, "rewards/accuracy_reward/mean": 5.40625, "rewards/accuracy_reward/std": 0.7500000596046448, "rewards/ngram_similarity_reward/mean": 0.8370020389556885, "rewards/ngram_similarity_reward/std": 0.14942000806331635, "step": 1785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 394.921875, "completions/mean_terminated_length": 394.921875, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.7992839561423137, "frac_reward_zero_std": 0.25, "grad_norm": 0.09098318219184875, "learning_rate": 3.658867052362328e-06, "loss": 0.0327, "num_tokens": 284875453.0, "reward": 5.2692790031433105, "reward_std": 0.8968341946601868, "rewards/accuracy_reward/mean": 4.75, "rewards/accuracy_reward/std": 2.0, "rewards/ngram_similarity_reward/mean": 0.5192788243293762, "rewards/ngram_similarity_reward/std": 0.296322762966156, "step": 1786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1232.0, "completions/mean_length": 508.359375, "completions/mean_terminated_length": 458.69354248046875, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.7997314835533677, "frac_reward_zero_std": 0.0, "grad_norm": 0.11340255290269852, "learning_rate": 3.6573439034220336e-06, "loss": -0.0219, "num_tokens": 285025012.0, "reward": 4.652604579925537, "reward_std": 0.4156516492366791, "rewards/accuracy_reward/mean": 3.90625, "rewards/accuracy_reward/std": 2.6709415912628174, "rewards/ngram_similarity_reward/mean": 0.7463546991348267, "rewards/ngram_similarity_reward/std": 0.3110392391681671, "step": 1787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 863.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 497.953125, "completions/mean_terminated_length": 497.953125, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.8001790109644216, "frac_reward_zero_std": 0.0, "grad_norm": 0.1081513836979866, "learning_rate": 3.6558202577617125e-06, "loss": 0.0199, "num_tokens": 285132033.0, "reward": 3.0789520740509033, "reward_std": 0.4339574873447418, "rewards/accuracy_reward/mean": 2.40625, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.6727021932601929, "rewards/ngram_similarity_reward/std": 0.37716013193130493, "step": 1788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 557.921875, "completions/mean_terminated_length": 534.2698974609375, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.8006265383754755, "frac_reward_zero_std": 0.0, "grad_norm": 0.09166616201400757, "learning_rate": 3.654296116215473e-06, "loss": -0.0014, "num_tokens": 285305788.0, "reward": 4.567384719848633, "reward_std": 0.7663179636001587, "rewards/accuracy_reward/mean": 3.625, "rewards/accuracy_reward/std": 2.8030595779418945, "rewards/ngram_similarity_reward/mean": 0.9423847198486328, "rewards/ngram_similarity_reward/std": 0.2744849920272827, "step": 1789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 440.828125, "completions/mean_terminated_length": 440.828125, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.8010740657865294, "frac_reward_zero_std": 0.0, "grad_norm": 0.13537095487117767, "learning_rate": 3.6527714796176996e-06, "loss": -0.0083, "num_tokens": 285402305.0, "reward": 4.586366653442383, "reward_std": 0.16081549227237701, "rewards/accuracy_reward/mean": 4.0, "rewards/accuracy_reward/std": 2.618614673614502, "rewards/ngram_similarity_reward/mean": 0.5863662958145142, "rewards/ngram_similarity_reward/std": 0.267704576253891, "step": 1790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 392.03125, "completions/mean_terminated_length": 392.03125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.8015215931975833, "frac_reward_zero_std": 0.0, "grad_norm": 0.11276783049106598, "learning_rate": 3.6512463488030443e-06, "loss": 0.0058, "num_tokens": 285537891.0, "reward": 3.4525465965270996, "reward_std": 0.44818663597106934, "rewards/accuracy_reward/mean": 2.59375, "rewards/accuracy_reward/std": 3.0222392082214355, "rewards/ngram_similarity_reward/mean": 0.8587964177131653, "rewards/ngram_similarity_reward/std": 0.2226724773645401, "step": 1791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 783.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 478.5625, "completions/mean_terminated_length": 478.5625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.8019691206086372, "frac_reward_zero_std": 0.0, "grad_norm": 0.115529865026474, "learning_rate": 3.6497207246064296e-06, "loss": 0.0023, "num_tokens": 285670903.0, "reward": 2.5690736770629883, "reward_std": 0.7571008801460266, "rewards/accuracy_reward/mean": 2.125, "rewards/accuracy_reward/std": 3.000000238418579, "rewards/ngram_similarity_reward/mean": 0.44407370686531067, "rewards/ngram_similarity_reward/std": 0.3232196569442749, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1029.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 557.09375, "completions/mean_terminated_length": 557.09375, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.8024166480196913, "frac_reward_zero_std": 0.0, "grad_norm": 0.08564143627882004, "learning_rate": 3.648194607863052e-06, "loss": 0.0524, "num_tokens": 285806157.0, "reward": 4.387675762176514, "reward_std": 0.807447075843811, "rewards/accuracy_reward/mean": 3.8125, "rewards/accuracy_reward/std": 2.7189810276031494, "rewards/ngram_similarity_reward/mean": 0.5751754641532898, "rewards/ngram_similarity_reward/std": 0.21848054230213165, "step": 1793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 453.515625, "completions/mean_terminated_length": 453.515625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.8028641754307452, "frac_reward_zero_std": 0.0, "grad_norm": 0.14359252154827118, "learning_rate": 3.646667999408373e-06, "loss": -0.0296, "num_tokens": 285955630.0, "reward": 4.477880477905273, "reward_std": 1.0613147020339966, "rewards/accuracy_reward/mean": 3.703125, "rewards/accuracy_reward/std": 2.789889335632324, "rewards/ngram_similarity_reward/mean": 0.7747553586959839, "rewards/ngram_similarity_reward/std": 0.289902001619339, "step": 1794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1058.0, "completions/max_terminated_length": 1058.0, "completions/mean_length": 519.09375, "completions/mean_terminated_length": 519.09375, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.8033117028417991, "frac_reward_zero_std": 0.0, "grad_norm": 0.11683569103479385, "learning_rate": 3.6451409000781263e-06, "loss": 0.0152, "num_tokens": 286067060.0, "reward": 5.668905735015869, "reward_std": 0.8359593152999878, "rewards/accuracy_reward/mean": 5.171875, "rewards/accuracy_reward/std": 1.491294264793396, "rewards/ngram_similarity_reward/mean": 0.4970306158065796, "rewards/ngram_similarity_reward/std": 0.3316640555858612, "step": 1795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 526.0, "completions/mean_terminated_length": 451.14752197265625, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.803759230252853, "frac_reward_zero_std": 0.0, "grad_norm": 0.1556176394224167, "learning_rate": 3.643613310708314e-06, "loss": 0.0674, "num_tokens": 286204356.0, "reward": 6.280373573303223, "reward_std": 0.08985870331525803, "rewards/accuracy_reward/mean": 5.5, "rewards/accuracy_reward/std": 0.0, "rewards/ngram_similarity_reward/mean": 0.7803735733032227, "rewards/ngram_similarity_reward/std": 0.33898887038230896, "step": 1796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 492.515625, "completions/mean_terminated_length": 492.515625, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.8042067576639069, "frac_reward_zero_std": 0.0, "grad_norm": 0.11379936337471008, "learning_rate": 3.642085232135204e-06, "loss": -0.022, "num_tokens": 286339941.0, "reward": 5.672510147094727, "reward_std": 1.0292677879333496, "rewards/accuracy_reward/mean": 5.1875, "rewards/accuracy_reward/std": 1.42400062084198, "rewards/ngram_similarity_reward/mean": 0.48501038551330566, "rewards/ngram_similarity_reward/std": 0.2070183902978897, "step": 1797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 943.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 507.125, "completions/mean_terminated_length": 507.125, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.8046542850749608, "frac_reward_zero_std": 0.0, "grad_norm": 0.1011330708861351, "learning_rate": 3.640556665195335e-06, "loss": 0.0084, "num_tokens": 286502445.0, "reward": 0.8693095445632935, "reward_std": 0.9130166172981262, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 1.8970921039581299, "rewards/ngram_similarity_reward/mean": 0.7286845445632935, "rewards/ngram_similarity_reward/std": 0.25176477432250977, "step": 1798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 464.796875, "completions/mean_terminated_length": 439.66668701171875, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.8051018124860148, "frac_reward_zero_std": 0.25, "grad_norm": 0.09837361425161362, "learning_rate": 3.639027610725512e-06, "loss": -0.0147, "num_tokens": 286651008.0, "reward": 3.1708290576934814, "reward_std": 0.09569500386714935, "rewards/accuracy_reward/mean": 2.5, "rewards/accuracy_reward/std": 3.0237157344818115, "rewards/ngram_similarity_reward/mean": 0.6708289980888367, "rewards/ngram_similarity_reward/std": 0.3423401713371277, "step": 1799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 501.6875, "completions/mean_terminated_length": 501.6875, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.8055493398970687, "frac_reward_zero_std": 0.25, "grad_norm": 0.07266926765441895, "learning_rate": 3.6374980695628064e-06, "loss": 0.0053, "num_tokens": 286785260.0, "reward": 5.345115661621094, "reward_std": 0.8422203063964844, "rewards/accuracy_reward/mean": 4.5625, "rewards/accuracy_reward/std": 2.195775270462036, "rewards/ngram_similarity_reward/mean": 0.7826155424118042, "rewards/ngram_similarity_reward/std": 0.24295289814472198, "step": 1800 } ], "logging_steps": 1, "max_steps": 4470, "num_input_tokens_seen": 286785260, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }