diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12697 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.10005333333333333, + "eval_steps": 500, + "global_step": 469, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3841.0, + "completions/mean_length": 1272.1640625, + "completions/mean_terminated_length": 1249.9290771484375, + "completions/min_length": 443.0, + "completions/min_terminated_length": 443.0, + "epoch": 0.00021333333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22031651437282562, + "learning_rate": 1e-06, + "loss": -0.0145, + "num_tokens": 482922.0, + "reward": 2.009742021560669, + "reward_std": 0.5785077810287476, + "rewards/cosine_scaled_reward/mean": 0.3822018802165985, + "rewards/cosine_scaled_reward/std": 0.3494165539741516, + "rewards/repetition_penalty_reward/mean": -0.060741037130355835, + "rewards/repetition_penalty_reward/std": 0.045807842165231705, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.69140625, + "rewards/reward_reference/std": 0.46281787753105164, + "step": 1 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3649.0, + "completions/mean_length": 1335.62890625, + "completions/mean_terminated_length": 1302.8973388671875, + "completions/min_length": 576.0, + "completions/min_terminated_length": 576.0, + "epoch": 0.00042666666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22476382553577423, + "learning_rate": 1e-06, + "loss": -0.0109, + "num_tokens": 974007.0, + "reward": 2.014347553253174, + "reward_std": 0.6111550331115723, + "rewards/cosine_scaled_reward/mean": 0.3857782185077667, + "rewards/cosine_scaled_reward/std": 0.35800445079803467, + "rewards/repetition_penalty_reward/mean": -0.06752431392669678, + "rewards/repetition_penalty_reward/std": 0.05347849428653717, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.69921875, + "rewards/reward_reference/std": 0.45949608087539673, + "step": 2 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 4086.0, + "completions/max_terminated_length": 4086.0, + "completions/mean_length": 1288.3828125, + "completions/mean_terminated_length": 1288.3828125, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "epoch": 0.00064, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20334957540035248, + "learning_rate": 1e-06, + "loss": 0.0042, + "num_tokens": 1465649.0, + "reward": 2.1703991889953613, + "reward_std": 0.5016853213310242, + "rewards/cosine_scaled_reward/mean": 0.4534637928009033, + "rewards/cosine_scaled_reward/std": 0.3181309998035431, + "rewards/repetition_penalty_reward/mean": -0.06431479007005692, + "rewards/repetition_penalty_reward/std": 0.03901338577270508, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.78125, + "rewards/reward_reference/std": 0.41420844197273254, + "step": 3 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3700.0, + "completions/mean_length": 1424.984375, + "completions/mean_terminated_length": 1403.9527587890625, + "completions/min_length": 523.0, + "completions/min_terminated_length": 523.0, + "epoch": 0.0008533333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19816721975803375, + "learning_rate": 1e-06, + "loss": -0.0077, + "num_tokens": 1986721.0, + "reward": 2.2232859134674072, + "reward_std": 0.4856342077255249, + "rewards/cosine_scaled_reward/mean": 0.48364967107772827, + "rewards/cosine_scaled_reward/std": 0.32795608043670654, + "rewards/repetition_penalty_reward/mean": -0.07364509999752045, + "rewards/repetition_penalty_reward/std": 0.055652402341365814, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.81640625, + "rewards/reward_reference/std": 0.387910932302475, + "step": 4 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 3592.0, + "completions/max_terminated_length": 3592.0, + "completions/mean_length": 1338.8125, + "completions/mean_terminated_length": 1338.8125, + "completions/min_length": 570.0, + "completions/min_terminated_length": 570.0, + "epoch": 0.0010666666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13079342246055603, + "learning_rate": 1e-06, + "loss": -0.0036, + "num_tokens": 2486485.0, + "reward": 2.3322033882141113, + "reward_std": 0.22138240933418274, + "rewards/cosine_scaled_reward/mean": 0.5233694314956665, + "rewards/cosine_scaled_reward/std": 0.27166542410850525, + "rewards/repetition_penalty_reward/mean": -0.06616615504026413, + "rewards/repetition_penalty_reward/std": 0.04239708185195923, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.875, + "rewards/reward_reference/std": 0.33136674761772156, + "step": 5 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3989.0, + "completions/mean_length": 1401.1953125, + "completions/mean_terminated_length": 1379.976318359375, + "completions/min_length": 472.0, + "completions/min_terminated_length": 472.0, + "epoch": 0.00128, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11242787539958954, + "learning_rate": 1e-06, + "loss": -0.0081, + "num_tokens": 2997299.0, + "reward": 2.324979066848755, + "reward_std": 0.17272153496742249, + "rewards/cosine_scaled_reward/mean": 0.5257084369659424, + "rewards/cosine_scaled_reward/std": 0.2853335738182068, + "rewards/repetition_penalty_reward/mean": -0.06869829446077347, + "rewards/repetition_penalty_reward/std": 0.05674975365400314, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.87109375, + "rewards/reward_reference/std": 0.33575257658958435, + "step": 6 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 3755.0, + "completions/max_terminated_length": 3755.0, + "completions/mean_length": 1290.30859375, + "completions/mean_terminated_length": 1290.30859375, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "epoch": 0.0014933333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09726474434137344, + "learning_rate": 1e-06, + "loss": 0.0066, + "num_tokens": 3489290.0, + "reward": 2.472348928451538, + "reward_std": 0.11173544079065323, + "rewards/cosine_scaled_reward/mean": 0.5793841481208801, + "rewards/cosine_scaled_reward/std": 0.1878894716501236, + "rewards/repetition_penalty_reward/mean": -0.0679725855588913, + "rewards/repetition_penalty_reward/std": 0.05107080563902855, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9609375, + "rewards/reward_reference/std": 0.19412322342395782, + "step": 7 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4009.0, + "completions/mean_length": 1331.0390625, + "completions/mean_terminated_length": 1320.1961669921875, + "completions/min_length": 413.0, + "completions/min_terminated_length": 413.0, + "epoch": 0.0017066666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10814463347196579, + "learning_rate": 1e-06, + "loss": -0.0024, + "num_tokens": 3989000.0, + "reward": 2.4125254154205322, + "reward_std": 0.13539515435695648, + "rewards/cosine_scaled_reward/mean": 0.5574886202812195, + "rewards/cosine_scaled_reward/std": 0.22672739624977112, + "rewards/repetition_penalty_reward/mean": -0.06683817505836487, + "rewards/repetition_penalty_reward/std": 0.042216457426548004, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.921875, + "rewards/reward_reference/std": 0.26889389753341675, + "step": 8 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 3870.0, + "completions/max_terminated_length": 3870.0, + "completions/mean_length": 1249.4140625, + "completions/mean_terminated_length": 1249.4140625, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "epoch": 0.00192, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0961054190993309, + "learning_rate": 1e-06, + "loss": 0.0157, + "num_tokens": 4473702.0, + "reward": 2.467017650604248, + "reward_std": 0.11271210014820099, + "rewards/cosine_scaled_reward/mean": 0.5698755979537964, + "rewards/cosine_scaled_reward/std": 0.18185605108737946, + "rewards/repetition_penalty_reward/mean": -0.05988934636116028, + "rewards/repetition_penalty_reward/std": 0.037777043879032135, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.95703125, + "rewards/reward_reference/std": 0.20318391919136047, + "step": 9 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 3699.0, + "completions/max_terminated_length": 3699.0, + "completions/mean_length": 1270.58984375, + "completions/mean_terminated_length": 1270.58984375, + "completions/min_length": 532.0, + "completions/min_terminated_length": 532.0, + "epoch": 0.0021333333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10294565558433533, + "learning_rate": 1e-06, + "loss": 0.005, + "num_tokens": 4969001.0, + "reward": 2.4084889888763428, + "reward_std": 0.12390808016061783, + "rewards/cosine_scaled_reward/mean": 0.543577253818512, + "rewards/cosine_scaled_reward/std": 0.2268955409526825, + "rewards/repetition_penalty_reward/mean": -0.05696332827210426, + "rewards/repetition_penalty_reward/std": 0.033603210002183914, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.921875, + "rewards/reward_reference/std": 0.26889389753341675, + "step": 10 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3572.0, + "completions/mean_length": 1343.6171875, + "completions/mean_terminated_length": 1321.94482421875, + "completions/min_length": 443.0, + "completions/min_terminated_length": 443.0, + "epoch": 0.0023466666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10278859734535217, + "learning_rate": 1e-06, + "loss": -0.018, + "num_tokens": 5464207.0, + "reward": 2.426042318344116, + "reward_std": 0.14806506037712097, + "rewards/cosine_scaled_reward/mean": 0.560874342918396, + "rewards/cosine_scaled_reward/std": 0.22602853178977966, + "rewards/repetition_penalty_reward/mean": -0.06530068069696426, + "rewards/repetition_penalty_reward/std": 0.048898182809352875, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 11 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3606.0, + "completions/mean_length": 1317.421875, + "completions/mean_terminated_length": 1306.5255126953125, + "completions/min_length": 495.0, + "completions/min_terminated_length": 495.0, + "epoch": 0.00256, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1048831194639206, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 5956883.0, + "reward": 2.4089107513427734, + "reward_std": 0.14584854245185852, + "rewards/cosine_scaled_reward/mean": 0.5502076148986816, + "rewards/cosine_scaled_reward/std": 0.2330145388841629, + "rewards/repetition_penalty_reward/mean": -0.06707821041345596, + "rewards/repetition_penalty_reward/std": 0.045541103929281235, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.92578125, + "rewards/reward_reference/std": 0.2626400291919708, + "step": 12 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4063.0, + "completions/mean_length": 1363.87109375, + "completions/mean_terminated_length": 1320.5040283203125, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.0027733333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1066160574555397, + "learning_rate": 1e-06, + "loss": -0.0122, + "num_tokens": 6444286.0, + "reward": 2.264991044998169, + "reward_std": 0.16080796718597412, + "rewards/cosine_scaled_reward/mean": 0.4897496700286865, + "rewards/cosine_scaled_reward/std": 0.30944791436195374, + "rewards/repetition_penalty_reward/mean": -0.07788346707820892, + "rewards/repetition_penalty_reward/std": 0.07128535211086273, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.859375, + "rewards/reward_reference/std": 0.3483152687549591, + "step": 13 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3437.0, + "completions/mean_length": 1307.9140625, + "completions/mean_terminated_length": 1296.98046875, + "completions/min_length": 565.0, + "completions/min_terminated_length": 565.0, + "epoch": 0.0029866666666666665, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1263291835784912, + "learning_rate": 1e-06, + "loss": 0.0038, + "num_tokens": 6929804.0, + "reward": 2.425128936767578, + "reward_std": 0.18342849612236023, + "rewards/cosine_scaled_reward/mean": 0.5549286603927612, + "rewards/cosine_scaled_reward/std": 0.22249378263950348, + "rewards/repetition_penalty_reward/mean": -0.0633934885263443, + "rewards/repetition_penalty_reward/std": 0.0460335873067379, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 14 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4006.0, + "completions/mean_length": 1294.0390625, + "completions/mean_terminated_length": 1283.051025390625, + "completions/min_length": 523.0, + "completions/min_terminated_length": 523.0, + "epoch": 0.0032, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08281808346509933, + "learning_rate": 1e-06, + "loss": -0.0017, + "num_tokens": 7419574.0, + "reward": 2.4733660221099854, + "reward_std": 0.09956402331590652, + "rewards/cosine_scaled_reward/mean": 0.5843473076820374, + "rewards/cosine_scaled_reward/std": 0.1703619360923767, + "rewards/repetition_penalty_reward/mean": -0.06801241636276245, + "rewards/repetition_penalty_reward/std": 0.055640317499637604, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.95703125, + "rewards/reward_reference/std": 0.20318391919136047, + "step": 15 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 3652.0, + "completions/max_terminated_length": 3652.0, + "completions/mean_length": 1338.58984375, + "completions/mean_terminated_length": 1338.58984375, + "completions/min_length": 510.0, + "completions/min_terminated_length": 510.0, + "epoch": 0.0034133333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09150482714176178, + "learning_rate": 1e-06, + "loss": 0.0128, + "num_tokens": 7930145.0, + "reward": 2.4295272827148438, + "reward_std": 0.1320231556892395, + "rewards/cosine_scaled_reward/mean": 0.5594995021820068, + "rewards/cosine_scaled_reward/std": 0.22843553125858307, + "rewards/repetition_penalty_reward/mean": -0.06356588006019592, + "rewards/repetition_penalty_reward/std": 0.044449321925640106, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 16 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4089.0, + "completions/mean_length": 1384.94921875, + "completions/mean_terminated_length": 1374.3177490234375, + "completions/min_length": 541.0, + "completions/min_terminated_length": 541.0, + "epoch": 0.0036266666666666665, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12217939645051956, + "learning_rate": 1e-06, + "loss": 0.0034, + "num_tokens": 8441464.0, + "reward": 2.330857753753662, + "reward_std": 0.18522366881370544, + "rewards/cosine_scaled_reward/mean": 0.5207034945487976, + "rewards/cosine_scaled_reward/std": 0.2845655679702759, + "rewards/repetition_penalty_reward/mean": -0.06875194609165192, + "rewards/repetition_penalty_reward/std": 0.04453708976507187, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.87890625, + "rewards/reward_reference/std": 0.3268752694129944, + "step": 17 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3870.0, + "completions/mean_length": 1400.40234375, + "completions/mean_terminated_length": 1389.8314208984375, + "completions/min_length": 638.0, + "completions/min_terminated_length": 638.0, + "epoch": 0.00384, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09449199587106705, + "learning_rate": 1e-06, + "loss": -0.0024, + "num_tokens": 8958591.0, + "reward": 2.371668577194214, + "reward_std": 0.15123680233955383, + "rewards/cosine_scaled_reward/mean": 0.5336047410964966, + "rewards/cosine_scaled_reward/std": 0.2732281982898712, + "rewards/repetition_penalty_reward/mean": -0.06506102532148361, + "rewards/repetition_penalty_reward/std": 0.04685278609395027, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.90625, + "rewards/reward_reference/std": 0.2920515835285187, + "step": 18 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3348.0, + "completions/mean_length": 1337.62109375, + "completions/mean_terminated_length": 1304.9130859375, + "completions/min_length": 593.0, + "completions/min_terminated_length": 593.0, + "epoch": 0.004053333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09695859253406525, + "learning_rate": 1e-06, + "loss": -0.0154, + "num_tokens": 9451434.0, + "reward": 2.4069466590881348, + "reward_std": 0.11520685255527496, + "rewards/cosine_scaled_reward/mean": 0.5477147102355957, + "rewards/cosine_scaled_reward/std": 0.24599869549274445, + "rewards/repetition_penalty_reward/mean": -0.06733058393001556, + "rewards/repetition_penalty_reward/std": 0.055632736533880234, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.9296875, + "rewards/reward_reference/std": 0.2561737895011902, + "step": 19 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 3706.0, + "completions/max_terminated_length": 3706.0, + "completions/mean_length": 1414.38671875, + "completions/mean_terminated_length": 1414.38671875, + "completions/min_length": 545.0, + "completions/min_terminated_length": 545.0, + "epoch": 0.004266666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11205258220434189, + "learning_rate": 1e-06, + "loss": -0.0116, + "num_tokens": 9976625.0, + "reward": 2.3993844985961914, + "reward_std": 0.15085497498512268, + "rewards/cosine_scaled_reward/mean": 0.5560339093208313, + "rewards/cosine_scaled_reward/std": 0.25779932737350464, + "rewards/repetition_penalty_reward/mean": -0.06680548191070557, + "rewards/repetition_penalty_reward/std": 0.0472581572830677, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.91015625, + "rewards/reward_reference/std": 0.2865179479122162, + "step": 20 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3791.0, + "completions/mean_length": 1426.796875, + "completions/mean_terminated_length": 1384.4287109375, + "completions/min_length": 465.0, + "completions/min_terminated_length": 465.0, + "epoch": 0.00448, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09845885634422302, + "learning_rate": 1e-06, + "loss": 0.0064, + "num_tokens": 10482365.0, + "reward": 2.345158338546753, + "reward_std": 0.14726582169532776, + "rewards/cosine_scaled_reward/mean": 0.5292760133743286, + "rewards/cosine_scaled_reward/std": 0.2867335379123688, + "rewards/repetition_penalty_reward/mean": -0.07630515843629837, + "rewards/repetition_penalty_reward/std": 0.05343927443027496, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.8984375, + "rewards/reward_reference/std": 0.3026638329029083, + "step": 21 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 3537.0, + "completions/max_terminated_length": 3537.0, + "completions/mean_length": 1426.203125, + "completions/mean_terminated_length": 1426.203125, + "completions/min_length": 525.0, + "completions/min_terminated_length": 525.0, + "epoch": 0.004693333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1269364356994629, + "learning_rate": 1e-06, + "loss": 0.0115, + "num_tokens": 11005109.0, + "reward": 2.360685348510742, + "reward_std": 0.18901582062244415, + "rewards/cosine_scaled_reward/mean": 0.5373590588569641, + "rewards/cosine_scaled_reward/std": 0.278967946767807, + "rewards/repetition_penalty_reward/mean": -0.0712050348520279, + "rewards/repetition_penalty_reward/std": 0.05502448230981827, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.89453125, + "rewards/reward_reference/std": 0.3077581524848938, + "step": 22 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.75, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4044.0, + "completions/mean_length": 1537.12890625, + "completions/mean_terminated_length": 1454.5845947265625, + "completions/min_length": 592.0, + "completions/min_terminated_length": 592.0, + "epoch": 0.004906666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1407482624053955, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 11524066.0, + "reward": 2.2200188636779785, + "reward_std": 0.21890655159950256, + "rewards/cosine_scaled_reward/mean": 0.48634082078933716, + "rewards/cosine_scaled_reward/std": 0.3508237600326538, + "rewards/repetition_penalty_reward/mean": -0.08272843062877655, + "rewards/repetition_penalty_reward/std": 0.07371426373720169, + "rewards/reward_format/mean": 0.984375, + "rewards/reward_format/std": 0.11092304438352585, + "rewards/reward_reference/mean": 0.83203125, + "rewards/reward_reference/std": 0.3745708465576172, + "step": 23 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3893.0, + "completions/mean_length": 1413.125, + "completions/mean_terminated_length": 1370.539794921875, + "completions/min_length": 591.0, + "completions/min_terminated_length": 591.0, + "epoch": 0.00512, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09089359641075134, + "learning_rate": 1e-06, + "loss": -0.0207, + "num_tokens": 12036734.0, + "reward": 2.3405649662017822, + "reward_std": 0.13450174033641815, + "rewards/cosine_scaled_reward/mean": 0.522262454032898, + "rewards/cosine_scaled_reward/std": 0.2896619141101837, + "rewards/repetition_penalty_reward/mean": -0.06919749081134796, + "rewards/repetition_penalty_reward/std": 0.05012732744216919, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.890625, + "rewards/reward_reference/std": 0.31272050738334656, + "step": 24 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 2961.0, + "completions/max_terminated_length": 2961.0, + "completions/mean_length": 1314.49609375, + "completions/mean_terminated_length": 1314.49609375, + "completions/min_length": 540.0, + "completions/min_terminated_length": 540.0, + "epoch": 0.005333333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06929586082696915, + "learning_rate": 1e-06, + "loss": 0.0022, + "num_tokens": 12535273.0, + "reward": 2.4096481800079346, + "reward_std": 0.09283026307821274, + "rewards/cosine_scaled_reward/mean": 0.5563814640045166, + "rewards/cosine_scaled_reward/std": 0.22016239166259766, + "rewards/repetition_penalty_reward/mean": -0.06470194458961487, + "rewards/repetition_penalty_reward/std": 0.042412176728248596, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.91796875, + "rewards/reward_reference/std": 0.2749498784542084, + "step": 25 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.78125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3672.0, + "completions/mean_length": 1402.0390625, + "completions/mean_terminated_length": 1326.30517578125, + "completions/min_length": 574.0, + "completions/min_terminated_length": 574.0, + "epoch": 0.005546666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10325585305690765, + "learning_rate": 1e-06, + "loss": -0.0165, + "num_tokens": 13034299.0, + "reward": 2.3359270095825195, + "reward_std": 0.13954925537109375, + "rewards/cosine_scaled_reward/mean": 0.5185195803642273, + "rewards/cosine_scaled_reward/std": 0.2903260290622711, + "rewards/repetition_penalty_reward/mean": -0.07399865984916687, + "rewards/repetition_penalty_reward/std": 0.06387010216712952, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.89453125, + "rewards/reward_reference/std": 0.3077581524848938, + "step": 26 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 2552.0, + "completions/max_terminated_length": 2552.0, + "completions/mean_length": 1295.125, + "completions/mean_terminated_length": 1295.125, + "completions/min_length": 543.0, + "completions/min_terminated_length": 543.0, + "epoch": 0.00576, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1057233065366745, + "learning_rate": 1e-06, + "loss": -0.0065, + "num_tokens": 13528279.0, + "reward": 2.425154447555542, + "reward_std": 0.12597481906414032, + "rewards/cosine_scaled_reward/mean": 0.5589825510978699, + "rewards/cosine_scaled_reward/std": 0.20628339052200317, + "rewards/repetition_penalty_reward/mean": -0.05960933491587639, + "rewards/repetition_penalty_reward/std": 0.027123799547553062, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.92578125, + "rewards/reward_reference/std": 0.2626400291919708, + "step": 27 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4035.0, + "completions/mean_length": 1445.77734375, + "completions/mean_terminated_length": 1435.3843994140625, + "completions/min_length": 601.0, + "completions/min_terminated_length": 601.0, + "epoch": 0.005973333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13786497712135315, + "learning_rate": 1e-06, + "loss": 0.0026, + "num_tokens": 14062230.0, + "reward": 2.406646251678467, + "reward_std": 0.22535404562950134, + "rewards/cosine_scaled_reward/mean": 0.561261773109436, + "rewards/cosine_scaled_reward/std": 0.25999942421913147, + "rewards/repetition_penalty_reward/mean": -0.06945927441120148, + "rewards/repetition_penalty_reward/std": 0.053039710968732834, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.91796875, + "rewards/reward_reference/std": 0.2749498784542084, + "step": 28 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4093.0, + "completions/mean_length": 1456.67578125, + "completions/mean_terminated_length": 1435.8936767578125, + "completions/min_length": 607.0, + "completions/min_terminated_length": 607.0, + "epoch": 0.006186666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12175247073173523, + "learning_rate": 1e-06, + "loss": -0.0315, + "num_tokens": 14588299.0, + "reward": 2.405393123626709, + "reward_std": 0.19517257809638977, + "rewards/cosine_scaled_reward/mean": 0.5614031553268433, + "rewards/cosine_scaled_reward/std": 0.26180750131607056, + "rewards/repetition_penalty_reward/mean": -0.07085388153791428, + "rewards/repetition_penalty_reward/std": 0.05317673459649086, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.91796875, + "rewards/reward_reference/std": 0.2749498784542084, + "step": 29 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3379.0, + "completions/mean_length": 1438.1015625, + "completions/mean_terminated_length": 1417.1732177734375, + "completions/min_length": 574.0, + "completions/min_terminated_length": 574.0, + "epoch": 0.0064, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12587440013885498, + "learning_rate": 1e-06, + "loss": -0.0077, + "num_tokens": 15107513.0, + "reward": 2.3349556922912598, + "reward_std": 0.2020719051361084, + "rewards/cosine_scaled_reward/mean": 0.5276631116867065, + "rewards/cosine_scaled_reward/std": 0.29128366708755493, + "rewards/repetition_penalty_reward/mean": -0.06770722568035126, + "rewards/repetition_penalty_reward/std": 0.04827690124511719, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.875, + "rewards/reward_reference/std": 0.33136674761772156, + "step": 30 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3383.0, + "completions/mean_length": 1442.3125, + "completions/mean_terminated_length": 1431.906005859375, + "completions/min_length": 644.0, + "completions/min_terminated_length": 644.0, + "epoch": 0.006613333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1460960954427719, + "learning_rate": 1e-06, + "loss": -0.0041, + "num_tokens": 15636525.0, + "reward": 2.3774428367614746, + "reward_std": 0.21828415989875793, + "rewards/cosine_scaled_reward/mean": 0.5441263914108276, + "rewards/cosine_scaled_reward/std": 0.2732025682926178, + "rewards/repetition_penalty_reward/mean": -0.06512115895748138, + "rewards/repetition_penalty_reward/std": 0.04758916050195694, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.8984375, + "rewards/reward_reference/std": 0.3026638329029083, + "step": 31 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3752.0, + "completions/mean_length": 1484.0546875, + "completions/mean_terminated_length": 1463.4881591796875, + "completions/min_length": 617.0, + "completions/min_terminated_length": 617.0, + "epoch": 0.006826666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14973130822181702, + "learning_rate": 1e-06, + "loss": -0.0236, + "num_tokens": 16167843.0, + "reward": 2.349381685256958, + "reward_std": 0.24881769716739655, + "rewards/cosine_scaled_reward/mean": 0.5373992919921875, + "rewards/cosine_scaled_reward/std": 0.2915312647819519, + "rewards/repetition_penalty_reward/mean": -0.07161141186952591, + "rewards/repetition_penalty_reward/std": 0.054504405707120895, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.88671875, + "rewards/reward_reference/std": 0.31755712628364563, + "step": 32 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3925.0, + "completions/mean_length": 1444.78515625, + "completions/mean_terminated_length": 1423.909423828125, + "completions/min_length": 515.0, + "completions/min_terminated_length": 515.0, + "epoch": 0.00704, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10494732111692429, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 16693652.0, + "reward": 2.421522855758667, + "reward_std": 0.1476818323135376, + "rewards/cosine_scaled_reward/mean": 0.567467451095581, + "rewards/cosine_scaled_reward/std": 0.249455064535141, + "rewards/repetition_penalty_reward/mean": -0.07172583043575287, + "rewards/repetition_penalty_reward/std": 0.057648930698633194, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.92578125, + "rewards/reward_reference/std": 0.2626400291919708, + "step": 33 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 3803.0, + "completions/max_terminated_length": 3803.0, + "completions/mean_length": 1356.015625, + "completions/mean_terminated_length": 1356.015625, + "completions/min_length": 602.0, + "completions/min_terminated_length": 602.0, + "epoch": 0.007253333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09943025559186935, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 17200788.0, + "reward": 2.4666028022766113, + "reward_std": 0.14464417099952698, + "rewards/cosine_scaled_reward/mean": 0.5810573101043701, + "rewards/cosine_scaled_reward/std": 0.1915411502122879, + "rewards/repetition_penalty_reward/mean": -0.05976710468530655, + "rewards/repetition_penalty_reward/std": 0.03395003080368042, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9453125, + "rewards/reward_reference/std": 0.22781464457511902, + "step": 34 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3482.0, + "completions/mean_length": 1397.02734375, + "completions/mean_terminated_length": 1375.775634765625, + "completions/min_length": 446.0, + "completions/min_terminated_length": 446.0, + "epoch": 0.007466666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10014528036117554, + "learning_rate": 1e-06, + "loss": -0.0077, + "num_tokens": 17711819.0, + "reward": 2.452289342880249, + "reward_std": 0.15739864110946655, + "rewards/cosine_scaled_reward/mean": 0.574344277381897, + "rewards/cosine_scaled_reward/std": 0.22381597757339478, + "rewards/repetition_penalty_reward/mean": -0.06424228847026825, + "rewards/repetition_penalty_reward/std": 0.04820853844285011, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.9453125, + "rewards/reward_reference/std": 0.22781464457511902, + "step": 35 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3419.0, + "completions/mean_length": 1457.8203125, + "completions/mean_terminated_length": 1437.0472412109375, + "completions/min_length": 576.0, + "completions/min_terminated_length": 576.0, + "epoch": 0.00768, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12512792646884918, + "learning_rate": 1e-06, + "loss": -0.0205, + "num_tokens": 18241569.0, + "reward": 2.326671600341797, + "reward_std": 0.20083504915237427, + "rewards/cosine_scaled_reward/mean": 0.5292975306510925, + "rewards/cosine_scaled_reward/std": 0.2945072054862976, + "rewards/repetition_penalty_reward/mean": -0.074501171708107, + "rewards/repetition_penalty_reward/std": 0.055338870733976364, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.875, + "rewards/reward_reference/std": 0.33136674761772156, + "step": 36 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3618.0, + "completions/mean_length": 1373.0703125, + "completions/mean_terminated_length": 1362.3922119140625, + "completions/min_length": 513.0, + "completions/min_terminated_length": 513.0, + "epoch": 0.007893333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08265010267496109, + "learning_rate": 1e-06, + "loss": -0.0097, + "num_tokens": 18748431.0, + "reward": 2.423821449279785, + "reward_std": 0.1160140410065651, + "rewards/cosine_scaled_reward/mean": 0.5647482872009277, + "rewards/cosine_scaled_reward/std": 0.2289552241563797, + "rewards/repetition_penalty_reward/mean": -0.06280169636011124, + "rewards/repetition_penalty_reward/std": 0.04026205465197563, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.921875, + "rewards/reward_reference/std": 0.26889389753341675, + "step": 37 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3864.0, + "completions/mean_length": 1464.5546875, + "completions/mean_terminated_length": 1443.8345947265625, + "completions/min_length": 629.0, + "completions/min_terminated_length": 629.0, + "epoch": 0.008106666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21051424741744995, + "learning_rate": 1e-06, + "loss": -0.0092, + "num_tokens": 19270157.0, + "reward": 2.389101028442383, + "reward_std": 0.2080249786376953, + "rewards/cosine_scaled_reward/mean": 0.5605127811431885, + "rewards/cosine_scaled_reward/std": 0.25950494408607483, + "rewards/repetition_penalty_reward/mean": -0.07141192257404327, + "rewards/repetition_penalty_reward/std": 0.05478326603770256, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.90625, + "rewards/reward_reference/std": 0.2920515835285187, + "step": 38 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 4076.0, + "completions/max_terminated_length": 4076.0, + "completions/mean_length": 1544.125, + "completions/mean_terminated_length": 1544.125, + "completions/min_length": 640.0, + "completions/min_terminated_length": 640.0, + "epoch": 0.00832, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13599231839179993, + "learning_rate": 1e-06, + "loss": 0.0083, + "num_tokens": 19814469.0, + "reward": 2.3371737003326416, + "reward_std": 0.21445316076278687, + "rewards/cosine_scaled_reward/mean": 0.5401486158370972, + "rewards/cosine_scaled_reward/std": 0.30538463592529297, + "rewards/repetition_penalty_reward/mean": -0.07797486335039139, + "rewards/repetition_penalty_reward/std": 0.05641409009695053, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.875, + "rewards/reward_reference/std": 0.33136674761772156, + "step": 39 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3719.0, + "completions/mean_length": 1509.828125, + "completions/mean_terminated_length": 1489.464599609375, + "completions/min_length": 649.0, + "completions/min_terminated_length": 649.0, + "epoch": 0.008533333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12129685282707214, + "learning_rate": 1e-06, + "loss": -0.0166, + "num_tokens": 20351553.0, + "reward": 2.3992435932159424, + "reward_std": 0.21829761564731598, + "rewards/cosine_scaled_reward/mean": 0.5619106888771057, + "rewards/cosine_scaled_reward/std": 0.2718035876750946, + "rewards/repetition_penalty_reward/mean": -0.07282336056232452, + "rewards/repetition_penalty_reward/std": 0.0561990961432457, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.91015625, + "rewards/reward_reference/std": 0.2865179479122162, + "step": 40 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3967.0, + "completions/mean_length": 1472.12109375, + "completions/mean_terminated_length": 1461.8314208984375, + "completions/min_length": 641.0, + "completions/min_terminated_length": 641.0, + "epoch": 0.008746666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1293008178472519, + "learning_rate": 1e-06, + "loss": -0.0028, + "num_tokens": 20887544.0, + "reward": 2.3384013175964355, + "reward_std": 0.20636960864067078, + "rewards/cosine_scaled_reward/mean": 0.5350905060768127, + "rewards/cosine_scaled_reward/std": 0.29096055030822754, + "rewards/repetition_penalty_reward/mean": -0.07168925553560257, + "rewards/repetition_penalty_reward/std": 0.04815354570746422, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.875, + "rewards/reward_reference/std": 0.33136674761772156, + "step": 41 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.78125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4084.0, + "completions/mean_length": 1545.1015625, + "completions/mean_terminated_length": 1473.3895263671875, + "completions/min_length": 778.0, + "completions/min_terminated_length": 778.0, + "epoch": 0.00896, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1378139704465866, + "learning_rate": 1e-06, + "loss": 0.0222, + "num_tokens": 21419114.0, + "reward": 2.301651954650879, + "reward_std": 0.1993783414363861, + "rewards/cosine_scaled_reward/mean": 0.521456241607666, + "rewards/cosine_scaled_reward/std": 0.3168460726737976, + "rewards/repetition_penalty_reward/mean": -0.07058563828468323, + "rewards/repetition_penalty_reward/std": 0.05316044017672539, + "rewards/reward_format/mean": 0.9874999523162842, + "rewards/reward_format/std": 0.0994100272655487, + "rewards/reward_reference/mean": 0.86328125, + "rewards/reward_reference/std": 0.34422317147254944, + "step": 42 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3695.0, + "completions/mean_length": 1472.265625, + "completions/mean_terminated_length": 1461.9765625, + "completions/min_length": 690.0, + "completions/min_terminated_length": 690.0, + "epoch": 0.009173333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10675688087940216, + "learning_rate": 1e-06, + "loss": -0.0072, + "num_tokens": 21958298.0, + "reward": 2.4147229194641113, + "reward_std": 0.14962312579154968, + "rewards/cosine_scaled_reward/mean": 0.5648523569107056, + "rewards/cosine_scaled_reward/std": 0.2577730417251587, + "rewards/repetition_penalty_reward/mean": -0.06497295200824738, + "rewards/repetition_penalty_reward/std": 0.04444069415330887, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.91796875, + "rewards/reward_reference/std": 0.2749498784542084, + "step": 43 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3991.0, + "completions/mean_length": 1509.57421875, + "completions/mean_terminated_length": 1478.9051513671875, + "completions/min_length": 543.0, + "completions/min_terminated_length": 543.0, + "epoch": 0.009386666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10473625361919403, + "learning_rate": 1e-06, + "loss": 0.0025, + "num_tokens": 22491545.0, + "reward": 2.3497352600097656, + "reward_std": 0.1567041128873825, + "rewards/cosine_scaled_reward/mean": 0.5374966859817505, + "rewards/cosine_scaled_reward/std": 0.2960168719291687, + "rewards/repetition_penalty_reward/mean": -0.06666764616966248, + "rewards/repetition_penalty_reward/std": 0.04307934269309044, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.87890625, + "rewards/reward_reference/std": 0.3268752694129944, + "step": 44 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.8125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4091.0, + "completions/mean_length": 1599.6640625, + "completions/mean_terminated_length": 1539.7520751953125, + "completions/min_length": 780.0, + "completions/min_terminated_length": 780.0, + "epoch": 0.0096, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11697933822870255, + "learning_rate": 1e-06, + "loss": -0.0498, + "num_tokens": 23036343.0, + "reward": 2.4010181427001953, + "reward_std": 0.23659676313400269, + "rewards/cosine_scaled_reward/mean": 0.5671864151954651, + "rewards/cosine_scaled_reward/std": 0.28708207607269287, + "rewards/repetition_penalty_reward/mean": -0.0739808902144432, + "rewards/repetition_penalty_reward/std": 0.05830969288945198, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.9140625, + "rewards/reward_reference/std": 0.28082075715065, + "step": 45 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3781.0, + "completions/mean_length": 1523.109375, + "completions/mean_terminated_length": 1502.850341796875, + "completions/min_length": 728.0, + "completions/min_terminated_length": 728.0, + "epoch": 0.009813333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13121408224105835, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 23589571.0, + "reward": 2.303783893585205, + "reward_std": 0.2267310470342636, + "rewards/cosine_scaled_reward/mean": 0.5189235806465149, + "rewards/cosine_scaled_reward/std": 0.31321921944618225, + "rewards/repetition_penalty_reward/mean": -0.06748341023921967, + "rewards/repetition_penalty_reward/std": 0.05285539850592613, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.85546875, + "rewards/reward_reference/std": 0.35231640934944153, + "step": 46 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 3932.0, + "completions/max_terminated_length": 3932.0, + "completions/mean_length": 1465.68359375, + "completions/mean_terminated_length": 1465.68359375, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, + "epoch": 0.010026666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12762288749217987, + "learning_rate": 1e-06, + "loss": 0.0015, + "num_tokens": 24123094.0, + "reward": 2.360924482345581, + "reward_std": 0.1576913297176361, + "rewards/cosine_scaled_reward/mean": 0.5412070751190186, + "rewards/cosine_scaled_reward/std": 0.279924601316452, + "rewards/repetition_penalty_reward/mean": -0.06309500336647034, + "rewards/repetition_penalty_reward/std": 0.04309338703751564, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.8828125, + "rewards/reward_reference/std": 0.3222736418247223, + "step": 47 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4094.0, + "completions/mean_length": 1575.7734375, + "completions/mean_terminated_length": 1545.889404296875, + "completions/min_length": 760.0, + "completions/min_terminated_length": 760.0, + "epoch": 0.01024, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1287793666124344, + "learning_rate": 1e-06, + "loss": -0.0152, + "num_tokens": 24679392.0, + "reward": 2.373932361602783, + "reward_std": 0.24167510867118835, + "rewards/cosine_scaled_reward/mean": 0.5552927255630493, + "rewards/cosine_scaled_reward/std": 0.2919711470603943, + "rewards/repetition_penalty_reward/mean": -0.07198527455329895, + "rewards/repetition_penalty_reward/std": 0.052775539457798004, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.890625, + "rewards/reward_reference/std": 0.31272050738334656, + "step": 48 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.71875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3708.0, + "completions/mean_length": 1634.5, + "completions/mean_terminated_length": 1544.809814453125, + "completions/min_length": 644.0, + "completions/min_terminated_length": 644.0, + "epoch": 0.010453333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12446154654026031, + "learning_rate": 1e-06, + "loss": -0.0409, + "num_tokens": 25222184.0, + "reward": 2.3497486114501953, + "reward_std": 0.2241676151752472, + "rewards/cosine_scaled_reward/mean": 0.550166130065918, + "rewards/cosine_scaled_reward/std": 0.31175705790519714, + "rewards/repetition_penalty_reward/mean": -0.0793239176273346, + "rewards/repetition_penalty_reward/std": 0.06476996839046478, + "rewards/reward_format/mean": 0.984375, + "rewards/reward_format/std": 0.11092304438352585, + "rewards/reward_reference/mean": 0.89453125, + "rewards/reward_reference/std": 0.3077581524848938, + "step": 49 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.84375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4042.0, + "completions/mean_length": 1526.890625, + "completions/mean_terminated_length": 1475.713134765625, + "completions/min_length": 657.0, + "completions/min_terminated_length": 657.0, + "epoch": 0.010666666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09193289279937744, + "learning_rate": 1e-06, + "loss": -0.011, + "num_tokens": 25756788.0, + "reward": 2.4244213104248047, + "reward_std": 0.15554851293563843, + "rewards/cosine_scaled_reward/mean": 0.5812386274337769, + "rewards/cosine_scaled_reward/std": 0.25069162249565125, + "rewards/repetition_penalty_reward/mean": -0.0724422037601471, + "rewards/repetition_penalty_reward/std": 0.05959644913673401, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.921875, + "rewards/reward_reference/std": 0.26889389753341675, + "step": 50 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4044.0, + "completions/mean_length": 1562.3515625, + "completions/mean_terminated_length": 1522.135009765625, + "completions/min_length": 590.0, + "completions/min_terminated_length": 590.0, + "epoch": 0.01088, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13995690643787384, + "learning_rate": 1e-06, + "loss": 0.027, + "num_tokens": 26302410.0, + "reward": 2.387965679168701, + "reward_std": 0.20636487007141113, + "rewards/cosine_scaled_reward/mean": 0.5577713251113892, + "rewards/cosine_scaled_reward/std": 0.2876478433609009, + "rewards/repetition_penalty_reward/mean": -0.06589921563863754, + "rewards/repetition_penalty_reward/std": 0.049709804356098175, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.90234375, + "rewards/reward_reference/std": 0.29743078351020813, + "step": 51 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4010.0, + "completions/mean_length": 1536.3125, + "completions/mean_terminated_length": 1505.9605712890625, + "completions/min_length": 626.0, + "completions/min_terminated_length": 626.0, + "epoch": 0.011093333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12113416194915771, + "learning_rate": 1e-06, + "loss": -0.0104, + "num_tokens": 26855006.0, + "reward": 2.423652410507202, + "reward_std": 0.17835842072963715, + "rewards/cosine_scaled_reward/mean": 0.5704998970031738, + "rewards/cosine_scaled_reward/std": 0.2632061541080475, + "rewards/repetition_penalty_reward/mean": -0.06559744477272034, + "rewards/repetition_penalty_reward/std": 0.05109817534685135, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.921875, + "rewards/reward_reference/std": 0.26889389753341675, + "step": 52 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.75, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3876.0, + "completions/mean_length": 1622.54296875, + "completions/mean_terminated_length": 1542.7540283203125, + "completions/min_length": 839.0, + "completions/min_terminated_length": 839.0, + "epoch": 0.011306666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1375443935394287, + "learning_rate": 1e-06, + "loss": -0.0355, + "num_tokens": 27399657.0, + "reward": 2.3556900024414062, + "reward_std": 0.13406921923160553, + "rewards/cosine_scaled_reward/mean": 0.542635440826416, + "rewards/cosine_scaled_reward/std": 0.31481704115867615, + "rewards/repetition_penalty_reward/mean": -0.07210142910480499, + "rewards/repetition_penalty_reward/std": 0.06866049021482468, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626226335763931, + "rewards/reward_reference/mean": 0.89453125, + "rewards/reward_reference/std": 0.3077581524848938, + "step": 53 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3825.0, + "completions/mean_length": 1507.78125, + "completions/mean_terminated_length": 1487.401611328125, + "completions/min_length": 733.0, + "completions/min_terminated_length": 733.0, + "epoch": 0.01152, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09410291910171509, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 27940741.0, + "reward": 2.426107406616211, + "reward_std": 0.12273335456848145, + "rewards/cosine_scaled_reward/mean": 0.5737569332122803, + "rewards/cosine_scaled_reward/std": 0.24874204397201538, + "rewards/repetition_penalty_reward/mean": -0.0624934583902359, + "rewards/repetition_penalty_reward/std": 0.04799724370241165, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.91796875, + "rewards/reward_reference/std": 0.2749498784542084, + "step": 54 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3801.0, + "completions/mean_length": 1612.08203125, + "completions/mean_terminated_length": 1602.34130859375, + "completions/min_length": 801.0, + "completions/min_terminated_length": 801.0, + "epoch": 0.011733333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12277078628540039, + "learning_rate": 1e-06, + "loss": 0.0149, + "num_tokens": 28519370.0, + "reward": 2.3675689697265625, + "reward_std": 0.20383627712726593, + "rewards/cosine_scaled_reward/mean": 0.5437647104263306, + "rewards/cosine_scaled_reward/std": 0.3121073544025421, + "rewards/repetition_penalty_reward/mean": -0.06682077050209045, + "rewards/repetition_penalty_reward/std": 0.04597178474068642, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.890625, + "rewards/reward_reference/std": 0.31272050738334656, + "step": 55 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3848.0, + "completions/mean_length": 1528.94140625, + "completions/mean_terminated_length": 1518.8746337890625, + "completions/min_length": 734.0, + "completions/min_terminated_length": 734.0, + "epoch": 0.011946666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06064042076468468, + "learning_rate": 1e-06, + "loss": -0.0061, + "num_tokens": 29074019.0, + "reward": 2.4705758094787598, + "reward_std": 0.09662184119224548, + "rewards/cosine_scaled_reward/mean": 0.5923464298248291, + "rewards/cosine_scaled_reward/std": 0.23122240602970123, + "rewards/repetition_penalty_reward/mean": -0.05927072837948799, + "rewards/repetition_penalty_reward/std": 0.036969270557165146, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9375, + "rewards/reward_reference/std": 0.24253563582897186, + "step": 56 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3724.0, + "completions/mean_length": 1532.11328125, + "completions/mean_terminated_length": 1501.7115478515625, + "completions/min_length": 642.0, + "completions/min_terminated_length": 642.0, + "epoch": 0.01216, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09682509303092957, + "learning_rate": 1e-06, + "loss": -0.0066, + "num_tokens": 29622644.0, + "reward": 2.3833389282226562, + "reward_std": 0.14281076192855835, + "rewards/cosine_scaled_reward/mean": 0.552849531173706, + "rewards/cosine_scaled_reward/std": 0.28276121616363525, + "rewards/repetition_penalty_reward/mean": -0.06404202431440353, + "rewards/repetition_penalty_reward/std": 0.05114806815981865, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.89453125, + "rewards/reward_reference/std": 0.3077581524848938, + "step": 57 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3964.0, + "completions/mean_length": 1569.6953125, + "completions/mean_terminated_length": 1549.8031005859375, + "completions/min_length": 648.0, + "completions/min_terminated_length": 648.0, + "epoch": 0.012373333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10780574381351471, + "learning_rate": 1e-06, + "loss": 0.0088, + "num_tokens": 30178094.0, + "reward": 2.3729443550109863, + "reward_std": 0.16647489368915558, + "rewards/cosine_scaled_reward/mean": 0.5469235181808472, + "rewards/cosine_scaled_reward/std": 0.29688480496406555, + "rewards/repetition_penalty_reward/mean": -0.06069795787334442, + "rewards/repetition_penalty_reward/std": 0.04385991394519806, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.88671875, + "rewards/reward_reference/std": 0.31755712628364563, + "step": 58 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3892.0, + "completions/mean_length": 1562.03515625, + "completions/mean_terminated_length": 1552.09814453125, + "completions/min_length": 788.0, + "completions/min_terminated_length": 788.0, + "epoch": 0.012586666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10283534973859787, + "learning_rate": 1e-06, + "loss": -0.0042, + "num_tokens": 30732855.0, + "reward": 2.440593719482422, + "reward_std": 0.18287886679172516, + "rewards/cosine_scaled_reward/mean": 0.584496796131134, + "rewards/cosine_scaled_reward/std": 0.253009557723999, + "rewards/repetition_penalty_reward/mean": -0.06265303492546082, + "rewards/repetition_penalty_reward/std": 0.04739008843898773, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.921875, + "rewards/reward_reference/std": 0.26889389753341675, + "step": 59 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4010.0, + "completions/mean_length": 1643.4375, + "completions/mean_terminated_length": 1604.508056640625, + "completions/min_length": 588.0, + "completions/min_terminated_length": 588.0, + "epoch": 0.0128, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10289309918880463, + "learning_rate": 1e-06, + "loss": -0.0138, + "num_tokens": 31302263.0, + "reward": 2.4196486473083496, + "reward_std": 0.19348298013210297, + "rewards/cosine_scaled_reward/mean": 0.578532338142395, + "rewards/cosine_scaled_reward/std": 0.28303632140159607, + "rewards/repetition_penalty_reward/mean": -0.0659150779247284, + "rewards/repetition_penalty_reward/std": 0.048271458595991135, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.91015625, + "rewards/reward_reference/std": 0.2865179479122162, + "step": 60 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3836.0, + "completions/mean_length": 1636.42578125, + "completions/mean_terminated_length": 1617.05908203125, + "completions/min_length": 696.0, + "completions/min_terminated_length": 696.0, + "epoch": 0.013013333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0934101939201355, + "learning_rate": 1e-06, + "loss": -0.0078, + "num_tokens": 31874212.0, + "reward": 2.444908380508423, + "reward_std": 0.1609368920326233, + "rewards/cosine_scaled_reward/mean": 0.5966625213623047, + "rewards/cosine_scaled_reward/std": 0.26409921050071716, + "rewards/repetition_penalty_reward/mean": -0.07441024482250214, + "rewards/repetition_penalty_reward/std": 0.061948809772729874, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.92578125, + "rewards/reward_reference/std": 0.2626400291919708, + "step": 61 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4018.0, + "completions/mean_length": 1661.08203125, + "completions/mean_terminated_length": 1622.4326171875, + "completions/min_length": 790.0, + "completions/min_terminated_length": 790.0, + "epoch": 0.013226666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10913221538066864, + "learning_rate": 1e-06, + "loss": -0.0188, + "num_tokens": 32445621.0, + "reward": 2.4171042442321777, + "reward_std": 0.18818770349025726, + "rewards/cosine_scaled_reward/mean": 0.5740669965744019, + "rewards/cosine_scaled_reward/std": 0.2936588227748871, + "rewards/repetition_penalty_reward/mean": -0.06790010631084442, + "rewards/repetition_penalty_reward/std": 0.05626881867647171, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.9140625, + "rewards/reward_reference/std": 0.28082075715065, + "step": 62 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3497.0, + "completions/mean_length": 1632.52734375, + "completions/mean_terminated_length": 1613.1298828125, + "completions/min_length": 756.0, + "completions/min_terminated_length": 756.0, + "epoch": 0.01344, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09852778911590576, + "learning_rate": 1e-06, + "loss": -0.0068, + "num_tokens": 33019764.0, + "reward": 2.3997602462768555, + "reward_std": 0.15767687559127808, + "rewards/cosine_scaled_reward/mean": 0.5749585628509521, + "rewards/cosine_scaled_reward/std": 0.286191463470459, + "rewards/repetition_penalty_reward/mean": -0.06738582998514175, + "rewards/repetition_penalty_reward/std": 0.04598398134112358, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.8984375, + "rewards/reward_reference/std": 0.3026638329029083, + "step": 63 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4005.0, + "completions/mean_length": 1688.46484375, + "completions/mean_terminated_length": 1659.9171142578125, + "completions/min_length": 632.0, + "completions/min_terminated_length": 632.0, + "epoch": 0.013653333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11886442452669144, + "learning_rate": 1e-06, + "loss": -0.0041, + "num_tokens": 33606475.0, + "reward": 2.3002028465270996, + "reward_std": 0.19646117091178894, + "rewards/cosine_scaled_reward/mean": 0.5263907313346863, + "rewards/cosine_scaled_reward/std": 0.3443312644958496, + "rewards/repetition_penalty_reward/mean": -0.07775020599365234, + "rewards/repetition_penalty_reward/std": 0.058893799781799316, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.8515625, + "rewards/reward_reference/std": 0.3562295734882355, + "step": 64 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 4025.0, + "completions/max_terminated_length": 4025.0, + "completions/mean_length": 1567.79296875, + "completions/mean_terminated_length": 1567.79296875, + "completions/min_length": 746.0, + "completions/min_terminated_length": 746.0, + "epoch": 0.013866666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09965450316667557, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 34171822.0, + "reward": 2.453932285308838, + "reward_std": 0.18380096554756165, + "rewards/cosine_scaled_reward/mean": 0.5916061997413635, + "rewards/cosine_scaled_reward/std": 0.24207937717437744, + "rewards/repetition_penalty_reward/mean": -0.06345503032207489, + "rewards/repetition_penalty_reward/std": 0.037380099296569824, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.92578125, + "rewards/reward_reference/std": 0.2626400291919708, + "step": 65 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 4061.0, + "completions/max_terminated_length": 4061.0, + "completions/mean_length": 1609.9296875, + "completions/mean_terminated_length": 1609.9296875, + "completions/min_length": 513.0, + "completions/min_terminated_length": 513.0, + "epoch": 0.01408, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11172179132699966, + "learning_rate": 1e-06, + "loss": -0.005, + "num_tokens": 34739068.0, + "reward": 2.3695924282073975, + "reward_std": 0.19150829315185547, + "rewards/cosine_scaled_reward/mean": 0.5578655004501343, + "rewards/cosine_scaled_reward/std": 0.29703474044799805, + "rewards/repetition_penalty_reward/mean": -0.07108558714389801, + "rewards/repetition_penalty_reward/std": 0.04733828827738762, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.8828125, + "rewards/reward_reference/std": 0.3222736418247223, + "step": 66 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.8125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3984.0, + "completions/mean_length": 1664.4140625, + "completions/mean_terminated_length": 1606.0560302734375, + "completions/min_length": 603.0, + "completions/min_terminated_length": 603.0, + "epoch": 0.014293333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15400172770023346, + "learning_rate": 1e-06, + "loss": -0.0638, + "num_tokens": 35304030.0, + "reward": 2.4119789600372314, + "reward_std": 0.2544468939304352, + "rewards/cosine_scaled_reward/mean": 0.5801342725753784, + "rewards/cosine_scaled_reward/std": 0.28957661986351013, + "rewards/repetition_penalty_reward/mean": -0.07284298539161682, + "rewards/repetition_penalty_reward/std": 0.07479114085435867, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626226335763931, + "rewards/reward_reference/mean": 0.9140625, + "rewards/reward_reference/std": 0.28082075715065, + "step": 67 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.71875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4025.0, + "completions/mean_length": 1690.984375, + "completions/mean_terminated_length": 1603.352294921875, + "completions/min_length": 663.0, + "completions/min_terminated_length": 663.0, + "epoch": 0.014506666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08533817529678345, + "learning_rate": 1e-06, + "loss": -0.0186, + "num_tokens": 35867958.0, + "reward": 2.358893394470215, + "reward_std": 0.1489824652671814, + "rewards/cosine_scaled_reward/mean": 0.556054949760437, + "rewards/cosine_scaled_reward/std": 0.3177521228790283, + "rewards/repetition_penalty_reward/mean": -0.07684915512800217, + "rewards/repetition_penalty_reward/std": 0.07336685806512833, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.8828125, + "rewards/reward_reference/std": 0.3222736418247223, + "step": 68 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3912.0, + "completions/mean_length": 1666.86328125, + "completions/mean_terminated_length": 1647.7362060546875, + "completions/min_length": 687.0, + "completions/min_terminated_length": 687.0, + "epoch": 0.01472, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12211061269044876, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 36458403.0, + "reward": 2.379150390625, + "reward_std": 0.22196923196315765, + "rewards/cosine_scaled_reward/mean": 0.5638763904571533, + "rewards/cosine_scaled_reward/std": 0.30511754751205444, + "rewards/repetition_penalty_reward/mean": -0.07222599536180496, + "rewards/repetition_penalty_reward/std": 0.04784548282623291, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.890625, + "rewards/reward_reference/std": 0.31272050738334656, + "step": 69 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3987.0, + "completions/mean_length": 1706.54296875, + "completions/mean_terminated_length": 1697.172607421875, + "completions/min_length": 777.0, + "completions/min_terminated_length": 777.0, + "epoch": 0.014933333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08350487053394318, + "learning_rate": 1e-06, + "loss": -0.0159, + "num_tokens": 37059562.0, + "reward": 2.497920036315918, + "reward_std": 0.12415796518325806, + "rewards/cosine_scaled_reward/mean": 0.6232722997665405, + "rewards/cosine_scaled_reward/std": 0.24374060332775116, + "rewards/repetition_penalty_reward/mean": -0.06675856560468674, + "rewards/repetition_penalty_reward/std": 0.04270366206765175, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.94140625, + "rewards/reward_reference/std": 0.23532284796237946, + "step": 70 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3730.0, + "completions/mean_length": 1697.40625, + "completions/mean_terminated_length": 1688.0001220703125, + "completions/min_length": 882.0, + "completions/min_terminated_length": 882.0, + "epoch": 0.015146666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09371346235275269, + "learning_rate": 1e-06, + "loss": -0.0116, + "num_tokens": 37652426.0, + "reward": 2.481433868408203, + "reward_std": 0.16789013147354126, + "rewards/cosine_scaled_reward/mean": 0.6213948726654053, + "rewards/cosine_scaled_reward/std": 0.2410319447517395, + "rewards/repetition_penalty_reward/mean": -0.07433594018220901, + "rewards/repetition_penalty_reward/std": 0.05567564442753792, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.9375, + "rewards/reward_reference/std": 0.24253563582897186, + "step": 71 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3244.0, + "completions/mean_length": 1703.26953125, + "completions/mean_terminated_length": 1674.8973388671875, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "epoch": 0.01536, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08914853632450104, + "learning_rate": 1e-06, + "loss": -0.0221, + "num_tokens": 38244783.0, + "reward": 2.4869542121887207, + "reward_std": 0.12982487678527832, + "rewards/cosine_scaled_reward/mean": 0.6192671060562134, + "rewards/cosine_scaled_reward/std": 0.24536055326461792, + "rewards/repetition_penalty_reward/mean": -0.06746931374073029, + "rewards/repetition_penalty_reward/std": 0.053976256400346756, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.94140625, + "rewards/reward_reference/std": 0.23532284796237946, + "step": 72 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3789.0, + "completions/mean_length": 1795.80859375, + "completions/mean_terminated_length": 1768.53369140625, + "completions/min_length": 762.0, + "completions/min_terminated_length": 762.0, + "epoch": 0.015573333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13626892864704132, + "learning_rate": 1e-06, + "loss": -0.0101, + "num_tokens": 38857138.0, + "reward": 2.3470053672790527, + "reward_std": 0.2492281198501587, + "rewards/cosine_scaled_reward/mean": 0.5566455125808716, + "rewards/cosine_scaled_reward/std": 0.3459410071372986, + "rewards/repetition_penalty_reward/mean": -0.07760874181985855, + "rewards/repetition_penalty_reward/std": 0.054391711950302124, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.87109375, + "rewards/reward_reference/std": 0.33575257658958435, + "step": 73 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3685.0, + "completions/mean_length": 1721.2734375, + "completions/mean_terminated_length": 1711.9609375, + "completions/min_length": 762.0, + "completions/min_terminated_length": 762.0, + "epoch": 0.015786666666666668, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06626079231500626, + "learning_rate": 1e-06, + "loss": 0.0066, + "num_tokens": 39464616.0, + "reward": 2.4405510425567627, + "reward_std": 0.11783230304718018, + "rewards/cosine_scaled_reward/mean": 0.5952588319778442, + "rewards/cosine_scaled_reward/std": 0.2840306758880615, + "rewards/repetition_penalty_reward/mean": -0.06486397236585617, + "rewards/repetition_penalty_reward/std": 0.04353988170623779, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.91015625, + "rewards/reward_reference/std": 0.2865179479122162, + "step": 74 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3728.0, + "completions/mean_length": 1689.69140625, + "completions/mean_terminated_length": 1661.158203125, + "completions/min_length": 830.0, + "completions/min_terminated_length": 830.0, + "epoch": 0.016, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1122131422162056, + "learning_rate": 1e-06, + "loss": -0.0067, + "num_tokens": 40046925.0, + "reward": 2.3153374195098877, + "reward_std": 0.18534180521965027, + "rewards/cosine_scaled_reward/mean": 0.5333756804466248, + "rewards/cosine_scaled_reward/std": 0.3348003029823303, + "rewards/repetition_penalty_reward/mean": -0.06647560000419617, + "rewards/repetition_penalty_reward/std": 0.05163053795695305, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.8515625, + "rewards/reward_reference/std": 0.3562295734882355, + "step": 75 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.78125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4094.0, + "completions/mean_length": 1863.859375, + "completions/mean_terminated_length": 1801.1083984375, + "completions/min_length": 896.0, + "completions/min_terminated_length": 896.0, + "epoch": 0.016213333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0832432359457016, + "learning_rate": 1e-06, + "loss": 0.0064, + "num_tokens": 40660097.0, + "reward": 2.320946216583252, + "reward_std": 0.14731647074222565, + "rewards/cosine_scaled_reward/mean": 0.5464308857917786, + "rewards/cosine_scaled_reward/std": 0.36857691407203674, + "rewards/repetition_penalty_reward/mean": -0.07860984653234482, + "rewards/repetition_penalty_reward/std": 0.059814877808094025, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.859375, + "rewards/reward_reference/std": 0.3483152687549591, + "step": 76 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 3703.0, + "completions/max_terminated_length": 3703.0, + "completions/mean_length": 1677.28515625, + "completions/mean_terminated_length": 1677.28515625, + "completions/min_length": 679.0, + "completions/min_terminated_length": 679.0, + "epoch": 0.016426666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09381895512342453, + "learning_rate": 1e-06, + "loss": 0.0024, + "num_tokens": 41260350.0, + "reward": 2.333367347717285, + "reward_std": 0.16393491625785828, + "rewards/cosine_scaled_reward/mean": 0.5406162738800049, + "rewards/cosine_scaled_reward/std": 0.327106237411499, + "rewards/repetition_penalty_reward/mean": -0.06271764636039734, + "rewards/repetition_penalty_reward/std": 0.03436657041311264, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.85546875, + "rewards/reward_reference/std": 0.35231640934944153, + "step": 77 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.84375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3929.0, + "completions/mean_length": 1841.54296875, + "completions/mean_terminated_length": 1796.633544921875, + "completions/min_length": 1048.0, + "completions/min_terminated_length": 1048.0, + "epoch": 0.01664, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1164449080824852, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 41870129.0, + "reward": 2.2937211990356445, + "reward_std": 0.2213471531867981, + "rewards/cosine_scaled_reward/mean": 0.5263729691505432, + "rewards/cosine_scaled_reward/std": 0.37787869572639465, + "rewards/repetition_penalty_reward/mean": -0.07718320190906525, + "rewards/repetition_penalty_reward/std": 0.052163559943437576, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.84765625, + "rewards/reward_reference/std": 0.3600577116012573, + "step": 78 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.65625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4027.0, + "completions/mean_length": 1882.796875, + "completions/mean_terminated_length": 1783.428466796875, + "completions/min_length": 822.0, + "completions/min_terminated_length": 822.0, + "epoch": 0.016853333333333335, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11616773903369904, + "learning_rate": 1e-06, + "loss": -0.0399, + "num_tokens": 42464209.0, + "reward": 2.329927444458008, + "reward_std": 0.2060365378856659, + "rewards/cosine_scaled_reward/mean": 0.5598228573799133, + "rewards/cosine_scaled_reward/std": 0.36188092827796936, + "rewards/repetition_penalty_reward/mean": -0.08770774304866791, + "rewards/repetition_penalty_reward/std": 0.07510842382907867, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626225590705872, + "rewards/reward_reference/mean": 0.8671875, + "rewards/reward_reference/std": 0.3400367796421051, + "step": 79 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 3895.0, + "completions/max_terminated_length": 3895.0, + "completions/mean_length": 1749.42578125, + "completions/mean_terminated_length": 1749.42578125, + "completions/min_length": 1026.0, + "completions/min_terminated_length": 1026.0, + "epoch": 0.017066666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1016409620642662, + "learning_rate": 1e-06, + "loss": 0.0087, + "num_tokens": 43071518.0, + "reward": 2.4230754375457764, + "reward_std": 0.15485021471977234, + "rewards/cosine_scaled_reward/mean": 0.5909368395805359, + "rewards/cosine_scaled_reward/std": 0.29618415236473083, + "rewards/repetition_penalty_reward/mean": -0.07020512223243713, + "rewards/repetition_penalty_reward/std": 0.041945360600948334, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.90234375, + "rewards/reward_reference/std": 0.29743078351020813, + "step": 80 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3290.0, + "completions/mean_length": 1692.96484375, + "completions/mean_terminated_length": 1664.470458984375, + "completions/min_length": 775.0, + "completions/min_terminated_length": 775.0, + "epoch": 0.01728, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08751388639211655, + "learning_rate": 1e-06, + "loss": 0.0026, + "num_tokens": 43656253.0, + "reward": 2.5253281593322754, + "reward_std": 0.12936024367809296, + "rewards/cosine_scaled_reward/mean": 0.6400842070579529, + "rewards/cosine_scaled_reward/std": 0.20249401032924652, + "rewards/repetition_penalty_reward/mean": -0.0717872753739357, + "rewards/repetition_penalty_reward/std": 0.05126966908574104, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.95703125, + "rewards/reward_reference/std": 0.20318391919136047, + "step": 81 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.84375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3955.0, + "completions/mean_length": 1845.78515625, + "completions/mean_terminated_length": 1800.960205078125, + "completions/min_length": 908.0, + "completions/min_terminated_length": 908.0, + "epoch": 0.017493333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12477394193410873, + "learning_rate": 1e-06, + "loss": -0.0299, + "num_tokens": 44268842.0, + "reward": 2.382167339324951, + "reward_std": 0.20765957236289978, + "rewards/cosine_scaled_reward/mean": 0.5757420063018799, + "rewards/cosine_scaled_reward/std": 0.3393123149871826, + "rewards/repetition_penalty_reward/mean": -0.07794953882694244, + "rewards/repetition_penalty_reward/std": 0.05676357075572014, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.890625, + "rewards/reward_reference/std": 0.31272050738334656, + "step": 82 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3896.0, + "completions/mean_length": 1813.8359375, + "completions/mean_terminated_length": 1777.6112060546875, + "completions/min_length": 1021.0, + "completions/min_terminated_length": 1021.0, + "epoch": 0.017706666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11310097575187683, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 44878580.0, + "reward": 2.4027891159057617, + "reward_std": 0.20823755860328674, + "rewards/cosine_scaled_reward/mean": 0.5920209884643555, + "rewards/cosine_scaled_reward/std": 0.31304964423179626, + "rewards/repetition_penalty_reward/mean": -0.08063797652721405, + "rewards/repetition_penalty_reward/std": 0.05689622461795807, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.89453125, + "rewards/reward_reference/std": 0.3077581524848938, + "step": 83 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4028.0, + "completions/mean_length": 1746.9296875, + "completions/mean_terminated_length": 1737.7177734375, + "completions/min_length": 877.0, + "completions/min_terminated_length": 877.0, + "epoch": 0.01792, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0818057730793953, + "learning_rate": 1e-06, + "loss": -0.0061, + "num_tokens": 45484070.0, + "reward": 2.4049665927886963, + "reward_std": 0.12297569960355759, + "rewards/cosine_scaled_reward/mean": 0.5779396891593933, + "rewards/cosine_scaled_reward/std": 0.3099997639656067, + "rewards/repetition_penalty_reward/mean": -0.0682855099439621, + "rewards/repetition_penalty_reward/std": 0.04200433939695358, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.8984375, + "rewards/reward_reference/std": 0.3026638329029083, + "step": 84 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3760.0, + "completions/mean_length": 1762.9609375, + "completions/mean_terminated_length": 1735.2965087890625, + "completions/min_length": 901.0, + "completions/min_terminated_length": 901.0, + "epoch": 0.018133333333333335, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07859272509813309, + "learning_rate": 1e-06, + "loss": -0.0049, + "num_tokens": 46083580.0, + "reward": 2.47398042678833, + "reward_std": 0.1363099366426468, + "rewards/cosine_scaled_reward/mean": 0.6111050248146057, + "rewards/cosine_scaled_reward/std": 0.27342745661735535, + "rewards/repetition_penalty_reward/mean": -0.06681206077337265, + "rewards/repetition_penalty_reward/std": 0.050561416894197464, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9296875, + "rewards/reward_reference/std": 0.2561737895011902, + "step": 85 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3901.0, + "completions/mean_length": 1759.5546875, + "completions/mean_terminated_length": 1722.4683837890625, + "completions/min_length": 825.0, + "completions/min_terminated_length": 825.0, + "epoch": 0.018346666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07375761866569519, + "learning_rate": 1e-06, + "loss": -0.0134, + "num_tokens": 46677474.0, + "reward": 2.5168585777282715, + "reward_std": 0.10227377712726593, + "rewards/cosine_scaled_reward/mean": 0.6395881175994873, + "rewards/cosine_scaled_reward/std": 0.22985778748989105, + "rewards/repetition_penalty_reward/mean": -0.07194818556308746, + "rewards/repetition_penalty_reward/std": 0.042965106666088104, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.94921875, + "rewards/reward_reference/std": 0.21998079121112823, + "step": 86 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3865.0, + "completions/mean_length": 1764.23046875, + "completions/mean_terminated_length": 1745.8701171875, + "completions/min_length": 986.0, + "completions/min_terminated_length": 986.0, + "epoch": 0.01856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09555403888225555, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 47287537.0, + "reward": 2.4244163036346436, + "reward_std": 0.1566460132598877, + "rewards/cosine_scaled_reward/mean": 0.5867924690246582, + "rewards/cosine_scaled_reward/std": 0.30464106798171997, + "rewards/repetition_penalty_reward/mean": -0.07253240048885345, + "rewards/repetition_penalty_reward/std": 0.04896243289113045, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.91015625, + "rewards/reward_reference/std": 0.2865179479122162, + "step": 87 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3812.0, + "completions/mean_length": 1809.65625, + "completions/mean_terminated_length": 1800.6903076171875, + "completions/min_length": 1076.0, + "completions/min_terminated_length": 1076.0, + "epoch": 0.018773333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08929698914289474, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 47910045.0, + "reward": 2.4683656692504883, + "reward_std": 0.13447409868240356, + "rewards/cosine_scaled_reward/mean": 0.6193373203277588, + "rewards/cosine_scaled_reward/std": 0.2785070240497589, + "rewards/repetition_penalty_reward/mean": -0.07284662127494812, + "rewards/repetition_penalty_reward/std": 0.04199657589197159, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.921875, + "rewards/reward_reference/std": 0.26889389753341675, + "step": 88 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4006.0, + "completions/mean_length": 1735.84375, + "completions/mean_terminated_length": 1717.2598876953125, + "completions/min_length": 712.0, + "completions/min_terminated_length": 712.0, + "epoch": 0.018986666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05542474985122681, + "learning_rate": 1e-06, + "loss": -0.0044, + "num_tokens": 48511993.0, + "reward": 2.4176106452941895, + "reward_std": 0.06906415522098541, + "rewards/cosine_scaled_reward/mean": 0.5868261456489563, + "rewards/cosine_scaled_reward/std": 0.296097069978714, + "rewards/repetition_penalty_reward/mean": -0.06765298545360565, + "rewards/repetition_penalty_reward/std": 0.03946821764111519, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.8984375, + "rewards/reward_reference/std": 0.3026638329029083, + "step": 89 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.78125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3982.0, + "completions/mean_length": 1745.66796875, + "completions/mean_terminated_length": 1679.5943603515625, + "completions/min_length": 513.0, + "completions/min_terminated_length": 513.0, + "epoch": 0.0192, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09133629500865936, + "learning_rate": 1e-06, + "loss": -0.011, + "num_tokens": 49090192.0, + "reward": 2.322629451751709, + "reward_std": 0.1549871861934662, + "rewards/cosine_scaled_reward/mean": 0.5519901514053345, + "rewards/cosine_scaled_reward/std": 0.33279502391815186, + "rewards/repetition_penalty_reward/mean": -0.08170440793037415, + "rewards/repetition_penalty_reward/std": 0.06908124685287476, + "rewards/reward_format/mean": 0.9812500476837158, + "rewards/reward_format/std": 0.12126781791448593, + "rewards/reward_reference/mean": 0.87109375, + "rewards/reward_reference/std": 0.33575257658958435, + "step": 90 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.78125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3837.0, + "completions/mean_length": 1705.6953125, + "completions/mean_terminated_length": 1638.4979248046875, + "completions/min_length": 984.0, + "completions/min_terminated_length": 984.0, + "epoch": 0.019413333333333335, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08402293175458908, + "learning_rate": 1e-06, + "loss": -0.0185, + "num_tokens": 49654050.0, + "reward": 2.4880495071411133, + "reward_std": 0.12586981058120728, + "rewards/cosine_scaled_reward/mean": 0.6164243221282959, + "rewards/cosine_scaled_reward/std": 0.24689579010009766, + "rewards/repetition_penalty_reward/mean": -0.06743744760751724, + "rewards/repetition_penalty_reward/std": 0.04767215624451637, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.9453125, + "rewards/reward_reference/std": 0.22781464457511902, + "step": 91 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 3693.0, + "completions/max_terminated_length": 3693.0, + "completions/mean_length": 1792.48828125, + "completions/mean_terminated_length": 1792.48828125, + "completions/min_length": 1058.0, + "completions/min_terminated_length": 1058.0, + "epoch": 0.019626666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08743888139724731, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 50275411.0, + "reward": 2.3785476684570312, + "reward_std": 0.1486774981021881, + "rewards/cosine_scaled_reward/mean": 0.5760810971260071, + "rewards/cosine_scaled_reward/std": 0.3226264417171478, + "rewards/repetition_penalty_reward/mean": -0.07643987238407135, + "rewards/repetition_penalty_reward/std": 0.04226500540971756, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.87890625, + "rewards/reward_reference/std": 0.3268752694129944, + "step": 92 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4028.0, + "completions/mean_length": 1894.8125, + "completions/mean_terminated_length": 1786.5572509765625, + "completions/min_length": 1093.0, + "completions/min_terminated_length": 1093.0, + "epoch": 0.01984, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13492800295352936, + "learning_rate": 1e-06, + "loss": -0.0355, + "num_tokens": 50865851.0, + "reward": 2.384434700012207, + "reward_std": 0.23413318395614624, + "rewards/cosine_scaled_reward/mean": 0.581243634223938, + "rewards/cosine_scaled_reward/std": 0.3413110673427582, + "rewards/repetition_penalty_reward/mean": -0.0858713760972023, + "rewards/repetition_penalty_reward/std": 0.0653069019317627, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626225590705872, + "rewards/reward_reference/mean": 0.8984375, + "rewards/reward_reference/std": 0.3026638329029083, + "step": 93 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3629.0, + "completions/mean_length": 1738.40625, + "completions/mean_terminated_length": 1719.842529296875, + "completions/min_length": 885.0, + "completions/min_terminated_length": 885.0, + "epoch": 0.020053333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06449340283870697, + "learning_rate": 1e-06, + "loss": -0.015, + "num_tokens": 51469135.0, + "reward": 2.540250778198242, + "reward_std": 0.10148729383945465, + "rewards/cosine_scaled_reward/mean": 0.6441509127616882, + "rewards/cosine_scaled_reward/std": 0.21270470321178436, + "rewards/repetition_penalty_reward/mean": -0.06483766436576843, + "rewards/repetition_penalty_reward/std": 0.03865836560726166, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9609375, + "rewards/reward_reference/std": 0.19412322342395782, + "step": 94 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.84375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3683.0, + "completions/mean_length": 1811.0859375, + "completions/mean_terminated_length": 1765.56982421875, + "completions/min_length": 1000.0, + "completions/min_terminated_length": 1000.0, + "epoch": 0.020266666666666665, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10099369287490845, + "learning_rate": 1e-06, + "loss": -0.0201, + "num_tokens": 52071265.0, + "reward": 2.4201340675354004, + "reward_std": 0.1737205535173416, + "rewards/cosine_scaled_reward/mean": 0.6000106334686279, + "rewards/cosine_scaled_reward/std": 0.3004744052886963, + "rewards/repetition_penalty_reward/mean": -0.0798763632774353, + "rewards/repetition_penalty_reward/std": 0.06247806176543236, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.90625, + "rewards/reward_reference/std": 0.2920515835285187, + "step": 95 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3961.0, + "completions/mean_length": 1730.5625, + "completions/mean_terminated_length": 1721.286376953125, + "completions/min_length": 1064.0, + "completions/min_terminated_length": 1064.0, + "epoch": 0.02048, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07633478939533234, + "learning_rate": 1e-06, + "loss": 0.0119, + "num_tokens": 52681929.0, + "reward": 2.4455995559692383, + "reward_std": 0.1185745969414711, + "rewards/cosine_scaled_reward/mean": 0.6002703905105591, + "rewards/cosine_scaled_reward/std": 0.2775324583053589, + "rewards/repetition_penalty_reward/mean": -0.06873318552970886, + "rewards/repetition_penalty_reward/std": 0.042261138558387756, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9140625, + "rewards/reward_reference/std": 0.28082075715065, + "step": 96 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3986.0, + "completions/mean_length": 1756.83203125, + "completions/mean_terminated_length": 1747.658935546875, + "completions/min_length": 976.0, + "completions/min_terminated_length": 976.0, + "epoch": 0.020693333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0499541349709034, + "learning_rate": 1e-06, + "loss": -0.0041, + "num_tokens": 53282854.0, + "reward": 2.4268693923950195, + "reward_std": 0.07386516034603119, + "rewards/cosine_scaled_reward/mean": 0.594407320022583, + "rewards/cosine_scaled_reward/std": 0.29383519291877747, + "rewards/repetition_penalty_reward/mean": -0.07378794252872467, + "rewards/repetition_penalty_reward/std": 0.048145439475774765, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.90625, + "rewards/reward_reference/std": 0.2920515835285187, + "step": 97 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.84375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4080.0, + "completions/mean_length": 1892.8359375, + "completions/mean_terminated_length": 1848.9482421875, + "completions/min_length": 993.0, + "completions/min_terminated_length": 993.0, + "epoch": 0.020906666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11263293027877808, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 53905152.0, + "reward": 2.305772542953491, + "reward_std": 0.18838255107402802, + "rewards/cosine_scaled_reward/mean": 0.5442582368850708, + "rewards/cosine_scaled_reward/std": 0.3724890649318695, + "rewards/repetition_penalty_reward/mean": -0.07129818201065063, + "rewards/repetition_penalty_reward/std": 0.04757782071828842, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.8359375, + "rewards/reward_reference/std": 0.3710577189922333, + "step": 98 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.84375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4051.0, + "completions/mean_length": 1890.90625, + "completions/mean_terminated_length": 1846.9801025390625, + "completions/min_length": 971.0, + "completions/min_terminated_length": 971.0, + "epoch": 0.02112, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11732571572065353, + "learning_rate": 1e-06, + "loss": -0.0177, + "num_tokens": 54530496.0, + "reward": 2.391127109527588, + "reward_std": 0.19609452784061432, + "rewards/cosine_scaled_reward/mean": 0.5890634059906006, + "rewards/cosine_scaled_reward/std": 0.3334295153617859, + "rewards/repetition_penalty_reward/mean": -0.07684235274791718, + "rewards/repetition_penalty_reward/std": 0.04716531187295914, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.87890625, + "rewards/reward_reference/std": 0.3268752694129944, + "step": 99 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4045.0, + "completions/mean_length": 1772.96484375, + "completions/mean_terminated_length": 1745.4190673828125, + "completions/min_length": 1082.0, + "completions/min_terminated_length": 1082.0, + "epoch": 0.021333333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08264286071062088, + "learning_rate": 1e-06, + "loss": -0.0155, + "num_tokens": 55142359.0, + "reward": 2.398106575012207, + "reward_std": 0.10862451791763306, + "rewards/cosine_scaled_reward/mean": 0.5802006721496582, + "rewards/cosine_scaled_reward/std": 0.3117934763431549, + "rewards/repetition_penalty_reward/mean": -0.06959399580955505, + "rewards/repetition_penalty_reward/std": 0.04771846532821655, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.890625, + "rewards/reward_reference/std": 0.31272050738334656, + "step": 100 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.8125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4033.0, + "completions/mean_length": 1838.60546875, + "completions/mean_terminated_length": 1784.4281005859375, + "completions/min_length": 1020.0, + "completions/min_terminated_length": 1020.0, + "epoch": 0.021546666666666665, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10220247507095337, + "learning_rate": 1e-06, + "loss": -0.0234, + "num_tokens": 55750342.0, + "reward": 2.4891393184661865, + "reward_std": 0.1632259488105774, + "rewards/cosine_scaled_reward/mean": 0.628940224647522, + "rewards/cosine_scaled_reward/std": 0.27056002616882324, + "rewards/repetition_penalty_reward/mean": -0.07339469343423843, + "rewards/repetition_penalty_reward/std": 0.051973793655633926, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 101 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3805.0, + "completions/mean_length": 1867.9296875, + "completions/mean_terminated_length": 1859.1922607421875, + "completions/min_length": 1127.0, + "completions/min_terminated_length": 1127.0, + "epoch": 0.02176, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11397871375083923, + "learning_rate": 1e-06, + "loss": 0.0026, + "num_tokens": 56386668.0, + "reward": 2.396754264831543, + "reward_std": 0.1740192025899887, + "rewards/cosine_scaled_reward/mean": 0.5868169069290161, + "rewards/cosine_scaled_reward/std": 0.33226945996284485, + "rewards/repetition_penalty_reward/mean": -0.06975027173757553, + "rewards/repetition_penalty_reward/std": 0.057229943573474884, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.8828125, + "rewards/reward_reference/std": 0.3222736418247223, + "step": 102 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 4009.0, + "completions/max_terminated_length": 4009.0, + "completions/mean_length": 1682.66015625, + "completions/mean_terminated_length": 1682.66015625, + "completions/min_length": 997.0, + "completions/min_terminated_length": 997.0, + "epoch": 0.021973333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.056475766003131866, + "learning_rate": 1e-06, + "loss": 0.0015, + "num_tokens": 56986793.0, + "reward": 2.5267632007598877, + "reward_std": 0.06429925560951233, + "rewards/cosine_scaled_reward/mean": 0.6376233100891113, + "rewards/cosine_scaled_reward/std": 0.20191699266433716, + "rewards/repetition_penalty_reward/mean": -0.0639849454164505, + "rewards/repetition_penalty_reward/std": 0.03479979932308197, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.953125, + "rewards/reward_reference/std": 0.21178513765335083, + "step": 103 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3586.0, + "completions/mean_length": 1783.94921875, + "completions/mean_terminated_length": 1765.744140625, + "completions/min_length": 980.0, + "completions/min_terminated_length": 980.0, + "epoch": 0.022186666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11576827615499496, + "learning_rate": 1e-06, + "loss": 0.003, + "num_tokens": 57592760.0, + "reward": 2.4529004096984863, + "reward_std": 0.18349312245845795, + "rewards/cosine_scaled_reward/mean": 0.6015920639038086, + "rewards/cosine_scaled_reward/std": 0.2936531603336334, + "rewards/repetition_penalty_reward/mean": -0.06275419145822525, + "rewards/repetition_penalty_reward/std": 0.03655506670475006, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9140625, + "rewards/reward_reference/std": 0.28082075715065, + "step": 104 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3947.0, + "completions/mean_length": 1912.76171875, + "completions/mean_terminated_length": 1878.1072998046875, + "completions/min_length": 943.0, + "completions/min_terminated_length": 943.0, + "epoch": 0.0224, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10496613383293152, + "learning_rate": 1e-06, + "loss": -0.0234, + "num_tokens": 58232699.0, + "reward": 2.2054691314697266, + "reward_std": 0.20665660500526428, + "rewards/cosine_scaled_reward/mean": 0.48843634128570557, + "rewards/cosine_scaled_reward/std": 0.4195135831832886, + "rewards/repetition_penalty_reward/mean": -0.06968589127063751, + "rewards/repetition_penalty_reward/std": 0.044393159449100494, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.79296875, + "rewards/reward_reference/std": 0.40597182512283325, + "step": 105 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.84375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3891.0, + "completions/mean_length": 1855.42578125, + "completions/mean_terminated_length": 1810.7928466796875, + "completions/min_length": 947.0, + "completions/min_terminated_length": 947.0, + "epoch": 0.022613333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07337377220392227, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 58844188.0, + "reward": 2.4211018085479736, + "reward_std": 0.12404131889343262, + "rewards/cosine_scaled_reward/mean": 0.5939671993255615, + "rewards/cosine_scaled_reward/std": 0.32044410705566406, + "rewards/repetition_penalty_reward/mean": -0.06817790120840073, + "rewards/repetition_penalty_reward/std": 0.045555904507637024, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.8984375, + "rewards/reward_reference/std": 0.3026638329029083, + "step": 106 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4044.0, + "completions/mean_length": 1896.109375, + "completions/mean_terminated_length": 1861.1905517578125, + "completions/min_length": 1049.0, + "completions/min_terminated_length": 1049.0, + "epoch": 0.022826666666666665, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10826458781957626, + "learning_rate": 1e-06, + "loss": 0.0048, + "num_tokens": 59486044.0, + "reward": 2.4502131938934326, + "reward_std": 0.17930112779140472, + "rewards/cosine_scaled_reward/mean": 0.6160438656806946, + "rewards/cosine_scaled_reward/std": 0.30508628487586975, + "rewards/repetition_penalty_reward/mean": -0.0728619247674942, + "rewards/repetition_penalty_reward/std": 0.05683686584234238, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.91015625, + "rewards/reward_reference/std": 0.2865179479122162, + "step": 107 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3477.0, + "completions/mean_length": 1825.83984375, + "completions/mean_terminated_length": 1798.9210205078125, + "completions/min_length": 1036.0, + "completions/min_terminated_length": 1036.0, + "epoch": 0.02304, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09361043572425842, + "learning_rate": 1e-06, + "loss": -0.0163, + "num_tokens": 60105751.0, + "reward": 2.4900074005126953, + "reward_std": 0.12909862399101257, + "rewards/cosine_scaled_reward/mean": 0.6301261186599731, + "rewards/cosine_scaled_reward/std": 0.26686570048332214, + "rewards/repetition_penalty_reward/mean": -0.0705873891711235, + "rewards/repetition_penalty_reward/std": 0.043489061295986176, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 108 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3810.0, + "completions/mean_length": 1801.35546875, + "completions/mean_terminated_length": 1792.35693359375, + "completions/min_length": 972.0, + "completions/min_terminated_length": 972.0, + "epoch": 0.023253333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09894856065511703, + "learning_rate": 1e-06, + "loss": -0.0122, + "num_tokens": 60725642.0, + "reward": 2.4845056533813477, + "reward_std": 0.14575153589248657, + "rewards/cosine_scaled_reward/mean": 0.6300021409988403, + "rewards/cosine_scaled_reward/std": 0.2584840953350067, + "rewards/repetition_penalty_reward/mean": -0.06815264374017715, + "rewards/repetition_penalty_reward/std": 0.049226224422454834, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.92578125, + "rewards/reward_reference/std": 0.2626400291919708, + "step": 109 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.84375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3955.0, + "completions/mean_length": 1938.015625, + "completions/mean_terminated_length": 1895.0279541015625, + "completions/min_length": 1074.0, + "completions/min_terminated_length": 1074.0, + "epoch": 0.023466666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08501613140106201, + "learning_rate": 1e-06, + "loss": 0.0038, + "num_tokens": 61361878.0, + "reward": 2.5716586112976074, + "reward_std": 0.13666792213916779, + "rewards/cosine_scaled_reward/mean": 0.6835469007492065, + "rewards/cosine_scaled_reward/std": 0.21558046340942383, + "rewards/repetition_penalty_reward/mean": -0.07048188149929047, + "rewards/repetition_penalty_reward/std": 0.040720950812101364, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718994140625, + "rewards/reward_reference/mean": 0.96484375, + "rewards/reward_reference/std": 0.18453538417816162, + "step": 110 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4026.0, + "completions/mean_length": 1827.046875, + "completions/mean_terminated_length": 1791.0318603515625, + "completions/min_length": 955.0, + "completions/min_terminated_length": 955.0, + "epoch": 0.02368, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08992420881986618, + "learning_rate": 1e-06, + "loss": -0.0133, + "num_tokens": 61965338.0, + "reward": 2.503587007522583, + "reward_std": 0.15698347985744476, + "rewards/cosine_scaled_reward/mean": 0.635814905166626, + "rewards/cosine_scaled_reward/std": 0.2587208151817322, + "rewards/repetition_penalty_reward/mean": -0.06582161784172058, + "rewards/repetition_penalty_reward/std": 0.04158446192741394, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 111 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4043.0, + "completions/mean_length": 1858.609375, + "completions/mean_terminated_length": 1832.0791015625, + "completions/min_length": 857.0, + "completions/min_terminated_length": 857.0, + "epoch": 0.023893333333333332, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09149877727031708, + "learning_rate": 1e-06, + "loss": -0.0306, + "num_tokens": 62585210.0, + "reward": 2.4405405521392822, + "reward_std": 0.13705675303936005, + "rewards/cosine_scaled_reward/mean": 0.6155904531478882, + "rewards/cosine_scaled_reward/std": 0.295393705368042, + "rewards/repetition_penalty_reward/mean": -0.06801863014698029, + "rewards/repetition_penalty_reward/std": 0.04448341205716133, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626226335763931, + "rewards/reward_reference/mean": 0.90234375, + "rewards/reward_reference/std": 0.29743078351020813, + "step": 112 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.84375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3933.0, + "completions/mean_length": 1910.171875, + "completions/mean_terminated_length": 1866.6295166015625, + "completions/min_length": 1114.0, + "completions/min_terminated_length": 1114.0, + "epoch": 0.024106666666666665, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08117534965276718, + "learning_rate": 1e-06, + "loss": 0.0109, + "num_tokens": 63218522.0, + "reward": 2.3720033168792725, + "reward_std": 0.11090590059757233, + "rewards/cosine_scaled_reward/mean": 0.5723944902420044, + "rewards/cosine_scaled_reward/std": 0.35445037484169006, + "rewards/repetition_penalty_reward/mean": -0.07539094239473343, + "rewards/repetition_penalty_reward/std": 0.0551920086145401, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.875, + "rewards/reward_reference/std": 0.33136674761772156, + "step": 113 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 3800.0, + "completions/max_terminated_length": 3800.0, + "completions/mean_length": 1847.49609375, + "completions/mean_terminated_length": 1847.49609375, + "completions/min_length": 1090.0, + "completions/min_terminated_length": 1090.0, + "epoch": 0.02432, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09835077077150345, + "learning_rate": 1e-06, + "loss": -0.0039, + "num_tokens": 63849337.0, + "reward": 2.4445981979370117, + "reward_std": 0.15487414598464966, + "rewards/cosine_scaled_reward/mean": 0.6096133589744568, + "rewards/cosine_scaled_reward/std": 0.2992246448993683, + "rewards/repetition_penalty_reward/mean": -0.06735902279615402, + "rewards/repetition_penalty_reward/std": 0.039419885724782944, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.90234375, + "rewards/reward_reference/std": 0.29743078351020813, + "step": 114 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4093.0, + "completions/mean_length": 1888.6796875, + "completions/mean_terminated_length": 1871.2991943359375, + "completions/min_length": 995.0, + "completions/min_terminated_length": 995.0, + "epoch": 0.024533333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09056826680898666, + "learning_rate": 1e-06, + "loss": 0.0046, + "num_tokens": 64491495.0, + "reward": 2.4354710578918457, + "reward_std": 0.1214878261089325, + "rewards/cosine_scaled_reward/mean": 0.6106761693954468, + "rewards/cosine_scaled_reward/std": 0.309811532497406, + "rewards/repetition_penalty_reward/mean": -0.07051754742860794, + "rewards/repetition_penalty_reward/std": 0.049914922565221786, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.8984375, + "rewards/reward_reference/std": 0.3026638329029083, + "step": 115 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.8125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4029.0, + "completions/mean_length": 2027.52734375, + "completions/mean_terminated_length": 1977.884033203125, + "completions/min_length": 973.0, + "completions/min_terminated_length": 973.0, + "epoch": 0.024746666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09796389192342758, + "learning_rate": 1e-06, + "loss": -0.0353, + "num_tokens": 65146530.0, + "reward": 2.429624080657959, + "reward_std": 0.19394034147262573, + "rewards/cosine_scaled_reward/mean": 0.6130001544952393, + "rewards/cosine_scaled_reward/std": 0.34257784485816956, + "rewards/repetition_penalty_reward/mean": -0.07400095462799072, + "rewards/repetition_penalty_reward/std": 0.041537486016750336, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.890625, + "rewards/reward_reference/std": 0.31272050738334656, + "step": 116 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.84375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3967.0, + "completions/mean_length": 1915.296875, + "completions/mean_terminated_length": 1871.856689453125, + "completions/min_length": 1020.0, + "completions/min_terminated_length": 1020.0, + "epoch": 0.02496, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09959474205970764, + "learning_rate": 1e-06, + "loss": -0.0367, + "num_tokens": 65778562.0, + "reward": 2.3682241439819336, + "reward_std": 0.15466037392616272, + "rewards/cosine_scaled_reward/mean": 0.5779094696044922, + "rewards/cosine_scaled_reward/std": 0.35092681646347046, + "rewards/repetition_penalty_reward/mean": -0.06984167546033859, + "rewards/repetition_penalty_reward/std": 0.05132605880498886, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.86328125, + "rewards/reward_reference/std": 0.34422317147254944, + "step": 117 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.78125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3970.0, + "completions/mean_length": 1996.67578125, + "completions/mean_terminated_length": 1937.6585693359375, + "completions/min_length": 1097.0, + "completions/min_terminated_length": 1097.0, + "epoch": 0.025173333333333332, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10874690860509872, + "learning_rate": 1e-06, + "loss": -0.0386, + "num_tokens": 66428271.0, + "reward": 2.454425573348999, + "reward_std": 0.18959057331085205, + "rewards/cosine_scaled_reward/mean": 0.6289983987808228, + "rewards/cosine_scaled_reward/std": 0.317272812128067, + "rewards/repetition_penalty_reward/mean": -0.0847291648387909, + "rewards/repetition_penalty_reward/std": 0.05817628279328346, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.91015625, + "rewards/reward_reference/std": 0.2865179479122162, + "step": 118 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3950.0, + "completions/mean_length": 1973.50390625, + "completions/mean_terminated_length": 1948.3360595703125, + "completions/min_length": 1139.0, + "completions/min_terminated_length": 1139.0, + "epoch": 0.025386666666666665, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0805424302816391, + "learning_rate": 1e-06, + "loss": 0.0047, + "num_tokens": 67085776.0, + "reward": 2.535606861114502, + "reward_std": 0.12788008153438568, + "rewards/cosine_scaled_reward/mean": 0.6682774424552917, + "rewards/cosine_scaled_reward/std": 0.2557551860809326, + "rewards/repetition_penalty_reward/mean": -0.07095189392566681, + "rewards/repetition_penalty_reward/std": 0.04125396907329559, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.94140625, + "rewards/reward_reference/std": 0.23532284796237946, + "step": 119 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3879.0, + "completions/mean_length": 1930.96484375, + "completions/mean_terminated_length": 1905.2926025390625, + "completions/min_length": 1056.0, + "completions/min_terminated_length": 1056.0, + "epoch": 0.0256, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11947871744632721, + "learning_rate": 1e-06, + "loss": 0.0029, + "num_tokens": 67731251.0, + "reward": 2.4403347969055176, + "reward_std": 0.18954503536224365, + "rewards/cosine_scaled_reward/mean": 0.6084376573562622, + "rewards/cosine_scaled_reward/std": 0.32468897104263306, + "rewards/repetition_penalty_reward/mean": -0.07044674456119537, + "rewards/repetition_penalty_reward/std": 0.043707504868507385, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.90234375, + "rewards/reward_reference/std": 0.29743078351020813, + "step": 120 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.65625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3712.0, + "completions/mean_length": 2024.26953125, + "completions/mean_terminated_length": 1931.2529296875, + "completions/min_length": 1157.0, + "completions/min_terminated_length": 1157.0, + "epoch": 0.025813333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07135090231895447, + "learning_rate": 1e-06, + "loss": -0.0286, + "num_tokens": 68365360.0, + "reward": 2.4914162158966064, + "reward_std": 0.11930274963378906, + "rewards/cosine_scaled_reward/mean": 0.6463751792907715, + "rewards/cosine_scaled_reward/std": 0.30041638016700745, + "rewards/repetition_penalty_reward/mean": -0.07214657962322235, + "rewards/repetition_penalty_reward/std": 0.05392837896943092, + "rewards/reward_format/mean": 0.987500011920929, + "rewards/reward_format/std": 0.0994100272655487, + "rewards/reward_reference/mean": 0.9296875, + "rewards/reward_reference/std": 0.2561737895011902, + "step": 121 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4022.0, + "completions/mean_length": 2024.44140625, + "completions/mean_terminated_length": 1991.5596923828125, + "completions/min_length": 1027.0, + "completions/min_terminated_length": 1027.0, + "epoch": 0.026026666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12721113860607147, + "learning_rate": 1e-06, + "loss": -0.0302, + "num_tokens": 69023685.0, + "reward": 2.404798984527588, + "reward_std": 0.2287992238998413, + "rewards/cosine_scaled_reward/mean": 0.6008037328720093, + "rewards/cosine_scaled_reward/std": 0.3563878536224365, + "rewards/repetition_penalty_reward/mean": -0.07569223642349243, + "rewards/repetition_penalty_reward/std": 0.05869562551379204, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.8828125, + "rewards/reward_reference/std": 0.3222736418247223, + "step": 122 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.78125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3926.0, + "completions/mean_length": 2080.6953125, + "completions/mean_terminated_length": 2024.0400390625, + "completions/min_length": 1153.0, + "completions/min_terminated_length": 1153.0, + "epoch": 0.02624, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07020558416843414, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 69699179.0, + "reward": 2.508188009262085, + "reward_std": 0.09734837710857391, + "rewards/cosine_scaled_reward/mean": 0.6614010334014893, + "rewards/cosine_scaled_reward/std": 0.29930007457733154, + "rewards/repetition_penalty_reward/mean": -0.0758691132068634, + "rewards/repetition_penalty_reward/std": 0.05258987843990326, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.92578125, + "rewards/reward_reference/std": 0.2626400291919708, + "step": 123 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3513.0, + "completions/mean_length": 2013.26171875, + "completions/mean_terminated_length": 2005.09423828125, + "completions/min_length": 987.0, + "completions/min_terminated_length": 987.0, + "epoch": 0.026453333333333332, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10724949091672897, + "learning_rate": 1e-06, + "loss": -0.0031, + "num_tokens": 70370350.0, + "reward": 2.5433502197265625, + "reward_std": 0.1528834104537964, + "rewards/cosine_scaled_reward/mean": 0.6737625598907471, + "rewards/cosine_scaled_reward/std": 0.2616587281227112, + "rewards/repetition_penalty_reward/mean": -0.06791241466999054, + "rewards/repetition_penalty_reward/std": 0.04298345372080803, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9375, + "rewards/reward_reference/std": 0.24253563582897186, + "step": 124 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.78125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3957.0, + "completions/mean_length": 2100.45703125, + "completions/mean_terminated_length": 2044.3572998046875, + "completions/min_length": 1012.0, + "completions/min_terminated_length": 1012.0, + "epoch": 0.02666666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10530390590429306, + "learning_rate": 1e-06, + "loss": 0.0058, + "num_tokens": 71047395.0, + "reward": 2.407808780670166, + "reward_std": 0.19165164232254028, + "rewards/cosine_scaled_reward/mean": 0.6080106496810913, + "rewards/cosine_scaled_reward/std": 0.36637604236602783, + "rewards/repetition_penalty_reward/mean": -0.0798891931772232, + "rewards/repetition_penalty_reward/std": 0.052827976644039154, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.8828125, + "rewards/reward_reference/std": 0.3222736418247223, + "step": 125 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3566.0, + "completions/mean_length": 1920.1484375, + "completions/mean_terminated_length": 1911.6158447265625, + "completions/min_length": 1080.0, + "completions/min_terminated_length": 1080.0, + "epoch": 0.02688, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.026686642318964005, + "learning_rate": 1e-06, + "loss": -0.0103, + "num_tokens": 71693881.0, + "reward": 2.619581937789917, + "reward_std": 0.05261857807636261, + "rewards/cosine_scaled_reward/mean": 0.7018906474113464, + "rewards/cosine_scaled_reward/std": 0.16163213551044464, + "rewards/repetition_penalty_reward/mean": -0.06277747452259064, + "rewards/repetition_penalty_reward/std": 0.04004902392625809, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.98046875, + "rewards/reward_reference/std": 0.13865381479263306, + "step": 126 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3853.0, + "completions/mean_length": 2037.4765625, + "completions/mean_terminated_length": 2013.0672607421875, + "completions/min_length": 1054.0, + "completions/min_terminated_length": 1054.0, + "epoch": 0.027093333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06936460733413696, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 72356207.0, + "reward": 2.4788427352905273, + "reward_std": 0.10106191039085388, + "rewards/cosine_scaled_reward/mean": 0.6478254199028015, + "rewards/cosine_scaled_reward/std": 0.3071100413799286, + "rewards/repetition_penalty_reward/mean": -0.07288884371519089, + "rewards/repetition_penalty_reward/std": 0.04149799793958664, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.91015625, + "rewards/reward_reference/std": 0.2865179479122162, + "step": 127 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.65625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4032.0, + "completions/mean_length": 2047.5234375, + "completions/mean_terminated_length": 1955.5509033203125, + "completions/min_length": 1143.0, + "completions/min_terminated_length": 1143.0, + "epoch": 0.027306666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09897647798061371, + "learning_rate": 1e-06, + "loss": -0.0048, + "num_tokens": 72987537.0, + "reward": 2.4084055423736572, + "reward_std": 0.1814638078212738, + "rewards/cosine_scaled_reward/mean": 0.6032418012619019, + "rewards/cosine_scaled_reward/std": 0.35705065727233887, + "rewards/repetition_penalty_reward/mean": -0.06905508041381836, + "rewards/repetition_penalty_reward/std": 0.048145100474357605, + "rewards/reward_format/mean": 0.987500011920929, + "rewards/reward_format/std": 0.0994100272655487, + "rewards/reward_reference/mean": 0.88671875, + "rewards/reward_reference/std": 0.31755712628364563, + "step": 128 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3861.0, + "completions/mean_length": 2110.09765625, + "completions/mean_terminated_length": 2094.460693359375, + "completions/min_length": 1158.0, + "completions/min_terminated_length": 1158.0, + "epoch": 0.02752, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1392839103937149, + "learning_rate": 1e-06, + "loss": -0.0101, + "num_tokens": 73680226.0, + "reward": 2.4746334552764893, + "reward_std": 0.24597196280956268, + "rewards/cosine_scaled_reward/mean": 0.6423944234848022, + "rewards/cosine_scaled_reward/std": 0.33495256304740906, + "rewards/repetition_penalty_reward/mean": -0.07010472565889359, + "rewards/repetition_penalty_reward/std": 0.03858313709497452, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.90234375, + "rewards/reward_reference/std": 0.29743078351020813, + "step": 129 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.5, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3984.0, + "completions/mean_length": 2141.734375, + "completions/mean_terminated_length": 2011.4500732421875, + "completions/min_length": 1148.0, + "completions/min_terminated_length": 1148.0, + "epoch": 0.027733333333333332, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.093904048204422, + "learning_rate": 1e-06, + "loss": -0.0385, + "num_tokens": 74322770.0, + "reward": 2.463771343231201, + "reward_std": 0.19271329045295715, + "rewards/cosine_scaled_reward/mean": 0.6441590189933777, + "rewards/cosine_scaled_reward/std": 0.3357996940612793, + "rewards/repetition_penalty_reward/mean": -0.07726290076971054, + "rewards/repetition_penalty_reward/std": 0.055103596299886703, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626226335763931, + "rewards/reward_reference/mean": 0.90625, + "rewards/reward_reference/std": 0.2920515835285187, + "step": 130 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.75, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4070.0, + "completions/mean_length": 2108.25, + "completions/mean_terminated_length": 2044.1290283203125, + "completions/min_length": 1217.0, + "completions/min_terminated_length": 1217.0, + "epoch": 0.02794666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10858944058418274, + "learning_rate": 1e-06, + "loss": -0.0266, + "num_tokens": 74983862.0, + "reward": 2.411698818206787, + "reward_std": 0.14864076673984528, + "rewards/cosine_scaled_reward/mean": 0.6128315925598145, + "rewards/cosine_scaled_reward/std": 0.3646450638771057, + "rewards/repetition_penalty_reward/mean": -0.0745701789855957, + "rewards/repetition_penalty_reward/std": 0.0539807491004467, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626226335763931, + "rewards/reward_reference/mean": 0.8828125, + "rewards/reward_reference/std": 0.3222736418247223, + "step": 131 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.78125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3876.0, + "completions/mean_length": 2086.9140625, + "completions/mean_terminated_length": 2030.4337158203125, + "completions/min_length": 1081.0, + "completions/min_terminated_length": 1081.0, + "epoch": 0.02816, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12158370763063431, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 75646960.0, + "reward": 2.420889139175415, + "reward_std": 0.14892278611660004, + "rewards/cosine_scaled_reward/mean": 0.6176682710647583, + "rewards/cosine_scaled_reward/std": 0.35261261463165283, + "rewards/repetition_penalty_reward/mean": -0.07412292063236237, + "rewards/repetition_penalty_reward/std": 0.06017336621880531, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626226335763931, + "rewards/reward_reference/mean": 0.88671875, + "rewards/reward_reference/std": 0.31755712628364563, + "step": 132 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3954.0, + "completions/mean_length": 2100.36328125, + "completions/mean_terminated_length": 2068.6865234375, + "completions/min_length": 1299.0, + "completions/min_terminated_length": 1299.0, + "epoch": 0.028373333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08128384500741959, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 76331945.0, + "reward": 2.5049729347229004, + "reward_std": 0.11164649575948715, + "rewards/cosine_scaled_reward/mean": 0.6589208245277405, + "rewards/cosine_scaled_reward/std": 0.309569776058197, + "rewards/repetition_penalty_reward/mean": -0.06879155337810516, + "rewards/repetition_penalty_reward/std": 0.036213457584381104, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.91796875, + "rewards/reward_reference/std": 0.2749498784542084, + "step": 133 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4003.0, + "completions/mean_length": 2168.25390625, + "completions/mean_terminated_length": 2153.07470703125, + "completions/min_length": 1157.0, + "completions/min_terminated_length": 1157.0, + "epoch": 0.028586666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1047474816441536, + "learning_rate": 1e-06, + "loss": -0.0122, + "num_tokens": 77025026.0, + "reward": 2.452206611633301, + "reward_std": 0.17508813738822937, + "rewards/cosine_scaled_reward/mean": 0.6381878852844238, + "rewards/cosine_scaled_reward/std": 0.35581979155540466, + "rewards/repetition_penalty_reward/mean": -0.06957493722438812, + "rewards/repetition_penalty_reward/std": 0.04151101037859917, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.88671875, + "rewards/reward_reference/std": 0.31755712628364563, + "step": 134 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3802.0, + "completions/mean_length": 2059.671875, + "completions/mean_terminated_length": 2043.6378173828125, + "completions/min_length": 1275.0, + "completions/min_terminated_length": 1275.0, + "epoch": 0.0288, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08779332041740417, + "learning_rate": 1e-06, + "loss": -0.0137, + "num_tokens": 77701418.0, + "reward": 2.5491394996643066, + "reward_std": 0.12299126386642456, + "rewards/cosine_scaled_reward/mean": 0.6745621562004089, + "rewards/cosine_scaled_reward/std": 0.2754439413547516, + "rewards/repetition_penalty_reward/mean": -0.06292291730642319, + "rewards/repetition_penalty_reward/std": 0.04240197688341141, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9375, + "rewards/reward_reference/std": 0.24253563582897186, + "step": 135 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4013.0, + "completions/mean_length": 2068.34765625, + "completions/mean_terminated_length": 2036.162841796875, + "completions/min_length": 1027.0, + "completions/min_terminated_length": 1027.0, + "epoch": 0.029013333333333332, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09798014909029007, + "learning_rate": 1e-06, + "loss": -0.0196, + "num_tokens": 78378615.0, + "reward": 2.6103098392486572, + "reward_std": 0.09224607050418854, + "rewards/cosine_scaled_reward/mean": 0.7064558267593384, + "rewards/cosine_scaled_reward/std": 0.22068408131599426, + "rewards/repetition_penalty_reward/mean": -0.06098976358771324, + "rewards/repetition_penalty_reward/std": 0.04063894599676132, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.96484375, + "rewards/reward_reference/std": 0.18453538417816162, + "step": 136 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -7.0, + "completions/max_length": 3710.0, + "completions/max_terminated_length": 3710.0, + "completions/mean_length": 2071.12109375, + "completions/mean_terminated_length": 2071.12109375, + "completions/min_length": 1181.0, + "completions/min_terminated_length": 1181.0, + "epoch": 0.029226666666666668, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3282462954521179, + "learning_rate": 1e-06, + "loss": 0.0074, + "num_tokens": 79071558.0, + "reward": 2.4916515350341797, + "reward_std": 0.12393692880868912, + "rewards/cosine_scaled_reward/mean": 0.6448882818222046, + "rewards/cosine_scaled_reward/std": 0.3213672935962677, + "rewards/repetition_penalty_reward/mean": -0.06339280307292938, + "rewards/repetition_penalty_reward/std": 0.0333862230181694, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.91015625, + "rewards/reward_reference/std": 0.2865179479122162, + "step": 137 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3987.0, + "completions/mean_length": 2098.19140625, + "completions/mean_terminated_length": 2066.480224609375, + "completions/min_length": 1271.0, + "completions/min_terminated_length": 1271.0, + "epoch": 0.02944, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08934274315834045, + "learning_rate": 1e-06, + "loss": -0.0331, + "num_tokens": 79752351.0, + "reward": 2.5181455612182617, + "reward_std": 0.14886733889579773, + "rewards/cosine_scaled_reward/mean": 0.6674562096595764, + "rewards/cosine_scaled_reward/std": 0.2978929877281189, + "rewards/repetition_penalty_reward/mean": -0.06806077808141708, + "rewards/repetition_penalty_reward/std": 0.04327305778861046, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.921875, + "rewards/reward_reference/std": 0.26889389753341675, + "step": 138 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.6875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3661.0, + "completions/mean_length": 2254.41015625, + "completions/mean_terminated_length": 2179.548583984375, + "completions/min_length": 1266.0, + "completions/min_terminated_length": 1266.0, + "epoch": 0.029653333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12618717551231384, + "learning_rate": 1e-06, + "loss": -0.0466, + "num_tokens": 80450128.0, + "reward": 2.5388686656951904, + "reward_std": 0.24636448919773102, + "rewards/cosine_scaled_reward/mean": 0.6855044960975647, + "rewards/cosine_scaled_reward/std": 0.3177576959133148, + "rewards/repetition_penalty_reward/mean": -0.06929213553667068, + "rewards/repetition_penalty_reward/std": 0.0481431819498539, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.92578125, + "rewards/reward_reference/std": 0.2626400291919708, + "step": 139 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3906.0, + "completions/mean_length": 2127.328125, + "completions/mean_terminated_length": 2096.07958984375, + "completions/min_length": 1090.0, + "completions/min_terminated_length": 1090.0, + "epoch": 0.029866666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11913932859897614, + "learning_rate": 1e-06, + "loss": -0.0033, + "num_tokens": 81140656.0, + "reward": 2.5316383838653564, + "reward_std": 0.18211862444877625, + "rewards/cosine_scaled_reward/mean": 0.673290491104126, + "rewards/cosine_scaled_reward/std": 0.29935595393180847, + "rewards/repetition_penalty_reward/mean": -0.07133965194225311, + "rewards/repetition_penalty_reward/std": 0.04711141809821129, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9296875, + "rewards/reward_reference/std": 0.2561737895011902, + "step": 140 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.75, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4085.0, + "completions/mean_length": 2295.82421875, + "completions/mean_terminated_length": 2237.75390625, + "completions/min_length": 1371.0, + "completions/min_terminated_length": 1371.0, + "epoch": 0.03008, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13442406058311462, + "learning_rate": 1e-06, + "loss": -0.0334, + "num_tokens": 81863259.0, + "reward": 2.4578609466552734, + "reward_std": 0.2603323757648468, + "rewards/cosine_scaled_reward/mean": 0.649109959602356, + "rewards/cosine_scaled_reward/std": 0.3731234669685364, + "rewards/repetition_penalty_reward/mean": -0.07484269142150879, + "rewards/repetition_penalty_reward/std": 0.04426012560725212, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.88671875, + "rewards/reward_reference/std": 0.31755712628364563, + "step": 141 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.8125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4033.0, + "completions/mean_length": 2203.62109375, + "completions/mean_terminated_length": 2158.2041015625, + "completions/min_length": 1266.0, + "completions/min_terminated_length": 1266.0, + "epoch": 0.030293333333333332, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07963822036981583, + "learning_rate": 1e-06, + "loss": -0.0364, + "num_tokens": 82563706.0, + "reward": 2.5777063369750977, + "reward_std": 0.1257934868335724, + "rewards/cosine_scaled_reward/mean": 0.7073277235031128, + "rewards/cosine_scaled_reward/std": 0.26815474033355713, + "rewards/repetition_penalty_reward/mean": -0.06868387758731842, + "rewards/repetition_penalty_reward/std": 0.054385874420404434, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.9453125, + "rewards/reward_reference/std": 0.22781464457511902, + "step": 142 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.78125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4055.0, + "completions/mean_length": 2262.61328125, + "completions/mean_terminated_length": 2211.072265625, + "completions/min_length": 1203.0, + "completions/min_terminated_length": 1203.0, + "epoch": 0.030506666666666668, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10525289922952652, + "learning_rate": 1e-06, + "loss": -0.0235, + "num_tokens": 83278755.0, + "reward": 2.4962220191955566, + "reward_std": 0.18696096539497375, + "rewards/cosine_scaled_reward/mean": 0.6652387380599976, + "rewards/cosine_scaled_reward/std": 0.34593114256858826, + "rewards/repetition_penalty_reward/mean": -0.07136042416095734, + "rewards/repetition_penalty_reward/std": 0.04237104579806328, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.90234375, + "rewards/reward_reference/std": 0.29743078351020813, + "step": 143 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.71875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3829.0, + "completions/mean_length": 2297.6015625, + "completions/mean_terminated_length": 2232.072998046875, + "completions/min_length": 1335.0, + "completions/min_terminated_length": 1335.0, + "epoch": 0.03072, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15209636092185974, + "learning_rate": 1e-06, + "loss": -0.0071, + "num_tokens": 83990089.0, + "reward": 2.5153186321258545, + "reward_std": 0.2199176400899887, + "rewards/cosine_scaled_reward/mean": 0.6713955998420715, + "rewards/cosine_scaled_reward/std": 0.3484877049922943, + "rewards/repetition_penalty_reward/mean": -0.07013943791389465, + "rewards/repetition_penalty_reward/std": 0.0492706373333931, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9140625, + "rewards/reward_reference/std": 0.28082075715065, + "step": 144 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4001.0, + "completions/mean_length": 2090.34765625, + "completions/mean_terminated_length": 2066.565185546875, + "completions/min_length": 1152.0, + "completions/min_terminated_length": 1152.0, + "epoch": 0.030933333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09697643667459488, + "learning_rate": 1e-06, + "loss": 0.0057, + "num_tokens": 84684358.0, + "reward": 2.6115381717681885, + "reward_std": 0.1272924244403839, + "rewards/cosine_scaled_reward/mean": 0.7085531949996948, + "rewards/cosine_scaled_reward/std": 0.22511249780654907, + "rewards/repetition_penalty_reward/mean": -0.06185879185795784, + "rewards/repetition_penalty_reward/std": 0.03542029857635498, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.96484375, + "rewards/reward_reference/std": 0.18453538417816162, + "step": 145 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.71875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4047.0, + "completions/mean_length": 2369.5859375, + "completions/mean_terminated_length": 2306.68017578125, + "completions/min_length": 1360.0, + "completions/min_terminated_length": 1360.0, + "epoch": 0.031146666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08577720075845718, + "learning_rate": 1e-06, + "loss": -0.0259, + "num_tokens": 85418660.0, + "reward": 2.4473724365234375, + "reward_std": 0.10904312133789062, + "rewards/cosine_scaled_reward/mean": 0.6387135982513428, + "rewards/cosine_scaled_reward/std": 0.40183010697364807, + "rewards/repetition_penalty_reward/mean": -0.07024732232093811, + "rewards/repetition_penalty_reward/std": 0.04344237968325615, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.87890625, + "rewards/reward_reference/std": 0.3268752694129944, + "step": 146 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.71875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4066.0, + "completions/mean_length": 2296.45703125, + "completions/mean_terminated_length": 2230.88671875, + "completions/min_length": 1198.0, + "completions/min_terminated_length": 1198.0, + "epoch": 0.03136, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16243378818035126, + "learning_rate": 1e-06, + "loss": -0.0572, + "num_tokens": 86130453.0, + "reward": 2.5525739192962646, + "reward_std": 0.20578095316886902, + "rewards/cosine_scaled_reward/mean": 0.6998293399810791, + "rewards/cosine_scaled_reward/std": 0.31000176072120667, + "rewards/repetition_penalty_reward/mean": -0.0714741051197052, + "rewards/repetition_penalty_reward/std": 0.06012459099292755, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626225590705872, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 147 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3801.0, + "completions/mean_length": 2168.5078125, + "completions/mean_terminated_length": 2153.330810546875, + "completions/min_length": 1034.0, + "completions/min_terminated_length": 1034.0, + "epoch": 0.031573333333333335, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11189655214548111, + "learning_rate": 1e-06, + "loss": -0.0023, + "num_tokens": 86843211.0, + "reward": 2.5102152824401855, + "reward_std": 0.11899837106466293, + "rewards/cosine_scaled_reward/mean": 0.6611608266830444, + "rewards/cosine_scaled_reward/std": 0.3290916681289673, + "rewards/repetition_penalty_reward/mean": -0.0611017569899559, + "rewards/repetition_penalty_reward/std": 0.04228059947490692, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.91015625, + "rewards/reward_reference/std": 0.2865179479122162, + "step": 148 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.71875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4053.0, + "completions/mean_length": 2289.94921875, + "completions/mean_terminated_length": 2224.141845703125, + "completions/min_length": 1175.0, + "completions/min_terminated_length": 1175.0, + "epoch": 0.031786666666666664, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07405582815408707, + "learning_rate": 1e-06, + "loss": -0.0037, + "num_tokens": 87555606.0, + "reward": 2.5253071784973145, + "reward_std": 0.08711743354797363, + "rewards/cosine_scaled_reward/mean": 0.6888059377670288, + "rewards/cosine_scaled_reward/std": 0.32022789120674133, + "rewards/repetition_penalty_reward/mean": -0.07365491986274719, + "rewards/repetition_penalty_reward/std": 0.0674201175570488, + "rewards/reward_format/mean": 0.984375, + "rewards/reward_format/std": 0.11092304438352585, + "rewards/reward_reference/mean": 0.92578125, + "rewards/reward_reference/std": 0.2626400291919708, + "step": 149 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.84375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3995.0, + "completions/mean_length": 2282.83203125, + "completions/mean_terminated_length": 2246.713134765625, + "completions/min_length": 1189.0, + "completions/min_terminated_length": 1189.0, + "epoch": 0.032, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11084163933992386, + "learning_rate": 1e-06, + "loss": -0.0007, + "num_tokens": 88283223.0, + "reward": 2.4095571041107178, + "reward_std": 0.14110702276229858, + "rewards/cosine_scaled_reward/mean": 0.6185789108276367, + "rewards/cosine_scaled_reward/std": 0.40124762058258057, + "rewards/repetition_penalty_reward/mean": -0.06995944678783417, + "rewards/repetition_penalty_reward/std": 0.04662758484482765, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.8671875, + "rewards/reward_reference/std": 0.3400367796421051, + "step": 150 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3901.0, + "completions/mean_length": 2269.77734375, + "completions/mean_terminated_length": 2248.12255859375, + "completions/min_length": 1339.0, + "completions/min_terminated_length": 1339.0, + "epoch": 0.03221333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08746644854545593, + "learning_rate": 1e-06, + "loss": -0.0327, + "num_tokens": 89019246.0, + "reward": 2.6399552822113037, + "reward_std": 0.09949071705341339, + "rewards/cosine_scaled_reward/mean": 0.7395962476730347, + "rewards/cosine_scaled_reward/std": 0.23498032987117767, + "rewards/repetition_penalty_reward/mean": -0.05745348334312439, + "rewards/repetition_penalty_reward/std": 0.02995491400361061, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.9609375, + "rewards/reward_reference/std": 0.19412322342395782, + "step": 151 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3631.0, + "completions/mean_length": 2177.3203125, + "completions/mean_terminated_length": 2162.212646484375, + "completions/min_length": 1313.0, + "completions/min_terminated_length": 1313.0, + "epoch": 0.032426666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06584572046995163, + "learning_rate": 1e-06, + "loss": -0.0111, + "num_tokens": 89737848.0, + "reward": 2.6734888553619385, + "reward_std": 0.07492285966873169, + "rewards/cosine_scaled_reward/mean": 0.7505329847335815, + "rewards/cosine_scaled_reward/std": 0.1640416830778122, + "rewards/repetition_penalty_reward/mean": -0.0614192858338356, + "rewards/repetition_penalty_reward/std": 0.03193984180688858, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.984375, + "rewards/reward_reference/std": 0.12426253408193588, + "step": 152 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.8125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4061.0, + "completions/mean_length": 2339.8046875, + "completions/mean_terminated_length": 2297.656005859375, + "completions/min_length": 1232.0, + "completions/min_terminated_length": 1232.0, + "epoch": 0.03264, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16215871274471283, + "learning_rate": 1e-06, + "loss": -0.046, + "num_tokens": 90474706.0, + "reward": 2.512293815612793, + "reward_std": 0.2166244387626648, + "rewards/cosine_scaled_reward/mean": 0.6809740662574768, + "rewards/cosine_scaled_reward/std": 0.34952786564826965, + "rewards/repetition_penalty_reward/mean": -0.06243017315864563, + "rewards/repetition_penalty_reward/std": 0.040823645889759064, + "rewards/reward_format/mean": 0.987500011920929, + "rewards/reward_format/std": 0.0994100272655487, + "rewards/reward_reference/mean": 0.90625, + "rewards/reward_reference/std": 0.2920515835285187, + "step": 153 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.65625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4094.0, + "completions/mean_length": 2528.88671875, + "completions/mean_terminated_length": 2458.5263671875, + "completions/min_length": 1344.0, + "completions/min_terminated_length": 1344.0, + "epoch": 0.03285333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13708092272281647, + "learning_rate": 1e-06, + "loss": -0.0313, + "num_tokens": 91241253.0, + "reward": 2.446394920349121, + "reward_std": 0.2406942993402481, + "rewards/cosine_scaled_reward/mean": 0.6548641324043274, + "rewards/cosine_scaled_reward/std": 0.42234691977500916, + "rewards/repetition_penalty_reward/mean": -0.07565662264823914, + "rewards/repetition_penalty_reward/std": 0.058208148926496506, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.8671875, + "rewards/reward_reference/std": 0.3400367796421051, + "step": 154 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4036.0, + "completions/mean_length": 2224.78125, + "completions/mean_terminated_length": 2217.443359375, + "completions/min_length": 1331.0, + "completions/min_terminated_length": 1331.0, + "epoch": 0.03306666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11987435817718506, + "learning_rate": 1e-06, + "loss": 0.0056, + "num_tokens": 91966513.0, + "reward": 2.5100762844085693, + "reward_std": 0.1404195874929428, + "rewards/cosine_scaled_reward/mean": 0.6662790775299072, + "rewards/cosine_scaled_reward/std": 0.3378640413284302, + "rewards/repetition_penalty_reward/mean": -0.06635906547307968, + "rewards/repetition_penalty_reward/std": 0.03748669847846031, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.91015625, + "rewards/reward_reference/std": 0.2865179479122162, + "step": 155 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3992.0, + "completions/mean_length": 2387.51171875, + "completions/mean_terminated_length": 2380.81201171875, + "completions/min_length": 1170.0, + "completions/min_terminated_length": 1170.0, + "epoch": 0.03328, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.133942648768425, + "learning_rate": 1e-06, + "loss": -0.0175, + "num_tokens": 92736440.0, + "reward": 2.584214925765991, + "reward_std": 0.1829279661178589, + "rewards/cosine_scaled_reward/mean": 0.7195907235145569, + "rewards/cosine_scaled_reward/std": 0.3137247860431671, + "rewards/repetition_penalty_reward/mean": -0.06115696206688881, + "rewards/repetition_penalty_reward/std": 0.030387477949261665, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.92578125, + "rewards/reward_reference/std": 0.2626400291919708, + "step": 156 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.71875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3841.0, + "completions/mean_length": 2337.01171875, + "completions/mean_terminated_length": 2272.919189453125, + "completions/min_length": 1126.0, + "completions/min_terminated_length": 1126.0, + "epoch": 0.03349333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08026068657636642, + "learning_rate": 1e-06, + "loss": -0.0285, + "num_tokens": 93468063.0, + "reward": 2.5636253356933594, + "reward_std": 0.14338311553001404, + "rewards/cosine_scaled_reward/mean": 0.7090778350830078, + "rewards/cosine_scaled_reward/std": 0.3102107048034668, + "rewards/repetition_penalty_reward/mean": -0.06967125833034515, + "rewards/repetition_penalty_reward/std": 0.04909246787428856, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626226335763931, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 157 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4070.0, + "completions/mean_length": 2347.16015625, + "completions/mean_terminated_length": 2340.302001953125, + "completions/min_length": 1254.0, + "completions/min_terminated_length": 1254.0, + "epoch": 0.03370666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07013080269098282, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 94229528.0, + "reward": 2.6471498012542725, + "reward_std": 0.06843972951173782, + "rewards/cosine_scaled_reward/mean": 0.7538424134254456, + "rewards/cosine_scaled_reward/std": 0.2390926480293274, + "rewards/repetition_penalty_reward/mean": -0.06763020157814026, + "rewards/repetition_penalty_reward/std": 0.04745958000421524, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9609375, + "rewards/reward_reference/std": 0.19412322342395782, + "step": 158 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.78125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4060.0, + "completions/mean_length": 2336.1640625, + "completions/mean_terminated_length": 2286.690673828125, + "completions/min_length": 1311.0, + "completions/min_terminated_length": 1311.0, + "epoch": 0.03392, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11125911772251129, + "learning_rate": 1e-06, + "loss": -0.0059, + "num_tokens": 94959914.0, + "reward": 2.49423885345459, + "reward_std": 0.16092067956924438, + "rewards/cosine_scaled_reward/mean": 0.6674948930740356, + "rewards/cosine_scaled_reward/std": 0.3645523488521576, + "rewards/repetition_penalty_reward/mean": -0.06388123333454132, + "rewards/repetition_penalty_reward/std": 0.03779073432087898, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.890625, + "rewards/reward_reference/std": 0.31272050738334656, + "step": 159 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4075.0, + "completions/mean_length": 2357.67578125, + "completions/mean_terminated_length": 2343.98828125, + "completions/min_length": 1368.0, + "completions/min_terminated_length": 1368.0, + "epoch": 0.034133333333333335, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17198342084884644, + "learning_rate": 1e-06, + "loss": -0.0143, + "num_tokens": 95715799.0, + "reward": 2.5218825340270996, + "reward_std": 0.16397586464881897, + "rewards/cosine_scaled_reward/mean": 0.6893811225891113, + "rewards/cosine_scaled_reward/std": 0.3468390107154846, + "rewards/repetition_penalty_reward/mean": -0.06671740859746933, + "rewards/repetition_penalty_reward/std": 0.036188509315252304, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.90234375, + "rewards/reward_reference/std": 0.29743078351020813, + "step": 160 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3986.0, + "completions/mean_length": 2388.20703125, + "completions/mean_terminated_length": 2381.510009765625, + "completions/min_length": 1381.0, + "completions/min_terminated_length": 1381.0, + "epoch": 0.034346666666666664, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12249568849802017, + "learning_rate": 1e-06, + "loss": -0.0073, + "num_tokens": 96481476.0, + "reward": 2.613097667694092, + "reward_std": 0.16200359165668488, + "rewards/cosine_scaled_reward/mean": 0.7398969531059265, + "rewards/cosine_scaled_reward/std": 0.28102821111679077, + "rewards/repetition_penalty_reward/mean": -0.06508070230484009, + "rewards/repetition_penalty_reward/std": 0.03461037203669548, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.94140625, + "rewards/reward_reference/std": 0.23532284796237946, + "step": 161 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.65625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3821.0, + "completions/mean_length": 2416.1640625, + "completions/mean_terminated_length": 2340.74267578125, + "completions/min_length": 1243.0, + "completions/min_terminated_length": 1243.0, + "epoch": 0.03456, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13304626941680908, + "learning_rate": 1e-06, + "loss": -0.0298, + "num_tokens": 97219774.0, + "reward": 2.577547550201416, + "reward_std": 0.1826041340827942, + "rewards/cosine_scaled_reward/mean": 0.7215290069580078, + "rewards/cosine_scaled_reward/std": 0.31414729356765747, + "rewards/repetition_penalty_reward/mean": -0.07366900146007538, + "rewards/repetition_penalty_reward/std": 0.050914179533720016, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9296875, + "rewards/reward_reference/std": 0.2561737895011902, + "step": 162 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4003.0, + "completions/mean_length": 2342.35546875, + "completions/mean_terminated_length": 2335.478515625, + "completions/min_length": 1415.0, + "completions/min_terminated_length": 1415.0, + "epoch": 0.03477333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08213608711957932, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 97981425.0, + "reward": 2.6511223316192627, + "reward_std": 0.07822506129741669, + "rewards/cosine_scaled_reward/mean": 0.7545344829559326, + "rewards/cosine_scaled_reward/std": 0.23536533117294312, + "rewards/repetition_penalty_reward/mean": -0.06122472137212753, + "rewards/repetition_penalty_reward/std": 0.03276967257261276, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.9609375, + "rewards/reward_reference/std": 0.19412322342395782, + "step": 163 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.65625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3932.0, + "completions/mean_length": 2422.30859375, + "completions/mean_terminated_length": 2347.1630859375, + "completions/min_length": 1465.0, + "completions/min_terminated_length": 1465.0, + "epoch": 0.034986666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1392398625612259, + "learning_rate": 1e-06, + "loss": -0.0218, + "num_tokens": 98721128.0, + "reward": 2.518110513687134, + "reward_std": 0.192726731300354, + "rewards/cosine_scaled_reward/mean": 0.6899570226669312, + "rewards/cosine_scaled_reward/std": 0.3583027422428131, + "rewards/repetition_penalty_reward/mean": -0.07575271278619766, + "rewards/repetition_penalty_reward/std": 0.059990935027599335, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.91015625, + "rewards/reward_reference/std": 0.2865179479122162, + "step": 164 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.8125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3879.0, + "completions/mean_length": 2430.46875, + "completions/mean_terminated_length": 2390.49609375, + "completions/min_length": 1569.0, + "completions/min_terminated_length": 1569.0, + "epoch": 0.0352, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11965960264205933, + "learning_rate": 1e-06, + "loss": -0.0261, + "num_tokens": 99479680.0, + "reward": 2.5446901321411133, + "reward_std": 0.14299365878105164, + "rewards/cosine_scaled_reward/mean": 0.7057998180389404, + "rewards/cosine_scaled_reward/std": 0.34264159202575684, + "rewards/repetition_penalty_reward/mean": -0.07126598805189133, + "rewards/repetition_penalty_reward/std": 0.04200437292456627, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.91015625, + "rewards/reward_reference/std": 0.2865179479122162, + "step": 165 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.59375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4091.0, + "completions/mean_length": 2471.875, + "completions/mean_terminated_length": 2384.987548828125, + "completions/min_length": 1349.0, + "completions/min_terminated_length": 1349.0, + "epoch": 0.03541333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1257084310054779, + "learning_rate": 1e-06, + "loss": -0.0213, + "num_tokens": 100223592.0, + "reward": 2.496830940246582, + "reward_std": 0.1805962324142456, + "rewards/cosine_scaled_reward/mean": 0.6858898997306824, + "rewards/cosine_scaled_reward/std": 0.3756991922855377, + "rewards/repetition_penalty_reward/mean": -0.07812131196260452, + "rewards/repetition_penalty_reward/std": 0.060796987265348434, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626226335763931, + "rewards/reward_reference/mean": 0.8984375, + "rewards/reward_reference/std": 0.3026638329029083, + "step": 166 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.8125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3871.0, + "completions/mean_length": 2479.078125, + "completions/mean_terminated_length": 2440.272216796875, + "completions/min_length": 1397.0, + "completions/min_terminated_length": 1397.0, + "epoch": 0.03562666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18038783967494965, + "learning_rate": 1e-06, + "loss": -0.0203, + "num_tokens": 100991524.0, + "reward": 2.527858018875122, + "reward_std": 0.2230907678604126, + "rewards/cosine_scaled_reward/mean": 0.6958828568458557, + "rewards/cosine_scaled_reward/std": 0.36940014362335205, + "rewards/repetition_penalty_reward/mean": -0.07427485287189484, + "rewards/repetition_penalty_reward/std": 0.047159843146800995, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.90625, + "rewards/reward_reference/std": 0.2920515835285187, + "step": 167 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4073.0, + "completions/mean_length": 2387.18359375, + "completions/mean_terminated_length": 2360.0595703125, + "completions/min_length": 1488.0, + "completions/min_terminated_length": 1488.0, + "epoch": 0.03584, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10775547474622726, + "learning_rate": 1e-06, + "loss": -0.0238, + "num_tokens": 101752855.0, + "reward": 2.671187400817871, + "reward_std": 0.11521877348423004, + "rewards/cosine_scaled_reward/mean": 0.7671756148338318, + "rewards/cosine_scaled_reward/std": 0.2231522649526596, + "rewards/repetition_penalty_reward/mean": -0.06083208695054054, + "rewards/repetition_penalty_reward/std": 0.036698468029499054, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.96484375, + "rewards/reward_reference/std": 0.18453538417816162, + "step": 168 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.84375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4015.0, + "completions/mean_length": 2439.29296875, + "completions/mean_terminated_length": 2406.291015625, + "completions/min_length": 1414.0, + "completions/min_terminated_length": 1414.0, + "epoch": 0.03605333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07064104825258255, + "learning_rate": 1e-06, + "loss": -0.0098, + "num_tokens": 102523330.0, + "reward": 2.598905563354492, + "reward_std": 0.09044383466243744, + "rewards/cosine_scaled_reward/mean": 0.7386330366134644, + "rewards/cosine_scaled_reward/std": 0.2991005480289459, + "rewards/repetition_penalty_reward/mean": -0.0709775984287262, + "rewards/repetition_penalty_reward/std": 0.045941270887851715, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.9375, + "rewards/reward_reference/std": 0.24253563582897186, + "step": 169 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.8125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3997.0, + "completions/mean_length": 2463.8046875, + "completions/mean_terminated_length": 2424.632080078125, + "completions/min_length": 1516.0, + "completions/min_terminated_length": 1516.0, + "epoch": 0.03626666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13372494280338287, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 103297172.0, + "reward": 2.620488405227661, + "reward_std": 0.1414932757616043, + "rewards/cosine_scaled_reward/mean": 0.746385931968689, + "rewards/cosine_scaled_reward/std": 0.2921934127807617, + "rewards/repetition_penalty_reward/mean": -0.06730364263057709, + "rewards/repetition_penalty_reward/std": 0.0357523076236248, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.94140625, + "rewards/reward_reference/std": 0.23532284796237946, + "step": 170 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4062.0, + "completions/mean_length": 2397.1640625, + "completions/mean_terminated_length": 2377.019775390625, + "completions/min_length": 1352.0, + "completions/min_terminated_length": 1352.0, + "epoch": 0.03648, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07134996354579926, + "learning_rate": 1e-06, + "loss": -0.0163, + "num_tokens": 104062430.0, + "reward": 2.5790481567382812, + "reward_std": 0.08886488527059555, + "rewards/cosine_scaled_reward/mean": 0.7279187440872192, + "rewards/cosine_scaled_reward/std": 0.30195677280426025, + "rewards/repetition_penalty_reward/mean": -0.07152681052684784, + "rewards/repetition_penalty_reward/std": 0.03701797500252724, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.92578125, + "rewards/reward_reference/std": 0.2626400291919708, + "step": 171 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4072.0, + "completions/mean_length": 2537.82421875, + "completions/mean_terminated_length": 2519.347900390625, + "completions/min_length": 1523.0, + "completions/min_terminated_length": 1523.0, + "epoch": 0.036693333333333335, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09948837012052536, + "learning_rate": 1e-06, + "loss": -0.0109, + "num_tokens": 104860645.0, + "reward": 2.573436975479126, + "reward_std": 0.12694615125656128, + "rewards/cosine_scaled_reward/mean": 0.7350782155990601, + "rewards/cosine_scaled_reward/std": 0.334741473197937, + "rewards/repetition_penalty_reward/mean": -0.07648499310016632, + "rewards/repetition_penalty_reward/std": 0.03855476155877113, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.91796875, + "rewards/reward_reference/std": 0.2749498784542084, + "step": 172 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3928.0, + "completions/mean_length": 2518.53515625, + "completions/mean_terminated_length": 2493.49609375, + "completions/min_length": 1453.0, + "completions/min_terminated_length": 1453.0, + "epoch": 0.036906666666666664, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11611484736204147, + "learning_rate": 1e-06, + "loss": -0.0052, + "num_tokens": 105656278.0, + "reward": 2.664511203765869, + "reward_std": 0.1288546919822693, + "rewards/cosine_scaled_reward/mean": 0.7776679992675781, + "rewards/cosine_scaled_reward/std": 0.2540041506290436, + "rewards/repetition_penalty_reward/mean": -0.07018814235925674, + "rewards/repetition_penalty_reward/std": 0.03482862934470177, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.95703125, + "rewards/reward_reference/std": 0.20318391919136047, + "step": 173 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.8125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4036.0, + "completions/mean_length": 2552.69921875, + "completions/mean_terminated_length": 2515.66015625, + "completions/min_length": 1630.0, + "completions/min_terminated_length": 1630.0, + "epoch": 0.03712, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07543495297431946, + "learning_rate": 1e-06, + "loss": -0.01, + "num_tokens": 106455677.0, + "reward": 2.617737293243408, + "reward_std": 0.07202796638011932, + "rewards/cosine_scaled_reward/mean": 0.7571834921836853, + "rewards/cosine_scaled_reward/std": 0.30415284633636475, + "rewards/repetition_penalty_reward/mean": -0.07382114231586456, + "rewards/repetition_penalty_reward/std": 0.041181910783052444, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.9375, + "rewards/reward_reference/std": 0.24253563582897186, + "step": 174 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.78125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4072.0, + "completions/mean_length": 2674.3125, + "completions/mean_terminated_length": 2634.34521484375, + "completions/min_length": 1519.0, + "completions/min_terminated_length": 1519.0, + "epoch": 0.037333333333333336, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11848440021276474, + "learning_rate": 1e-06, + "loss": -0.02, + "num_tokens": 107270749.0, + "reward": 2.5728800296783447, + "reward_std": 0.14646834135055542, + "rewards/cosine_scaled_reward/mean": 0.7433780431747437, + "rewards/cosine_scaled_reward/std": 0.3599579930305481, + "rewards/repetition_penalty_reward/mean": -0.07518541812896729, + "rewards/repetition_penalty_reward/std": 0.04847825691103935, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626226335763931, + "rewards/reward_reference/mean": 0.9140625, + "rewards/reward_reference/std": 0.28082075715065, + "step": 175 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4042.0, + "completions/mean_length": 2718.9375, + "completions/mean_terminated_length": 2651.212890625, + "completions/min_length": 1497.0, + "completions/min_terminated_length": 1497.0, + "epoch": 0.037546666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1338227242231369, + "learning_rate": 1e-06, + "loss": -0.0053, + "num_tokens": 108074361.0, + "reward": 2.518850803375244, + "reward_std": 0.18713831901550293, + "rewards/cosine_scaled_reward/mean": 0.7246681451797485, + "rewards/cosine_scaled_reward/std": 0.3936954736709595, + "rewards/repetition_penalty_reward/mean": -0.08628620207309723, + "rewards/repetition_penalty_reward/std": 0.05177297443151474, + "rewards/reward_format/mean": 0.9781249761581421, + "rewards/reward_format/std": 0.13072198629379272, + "rewards/reward_reference/mean": 0.90234375, + "rewards/reward_reference/std": 0.29743078351020813, + "step": 176 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4059.0, + "completions/mean_length": 2679.640625, + "completions/mean_terminated_length": 2657.158935546875, + "completions/min_length": 1473.0, + "completions/min_terminated_length": 1473.0, + "epoch": 0.03776, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12189571559429169, + "learning_rate": 1e-06, + "loss": -0.0022, + "num_tokens": 108901109.0, + "reward": 2.581157684326172, + "reward_std": 0.14895252883434296, + "rewards/cosine_scaled_reward/mean": 0.7483083009719849, + "rewards/cosine_scaled_reward/std": 0.35640743374824524, + "rewards/repetition_penalty_reward/mean": -0.07808814942836761, + "rewards/repetition_penalty_reward/std": 0.033407896757125854, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.9140625, + "rewards/reward_reference/std": 0.28082075715065, + "step": 177 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.6875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4034.0, + "completions/mean_length": 2704.85546875, + "completions/mean_terminated_length": 2648.3046875, + "completions/min_length": 1542.0, + "completions/min_terminated_length": 1542.0, + "epoch": 0.03797333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12708529829978943, + "learning_rate": 1e-06, + "loss": -0.0187, + "num_tokens": 109710816.0, + "reward": 2.556445598602295, + "reward_std": 0.15058737993240356, + "rewards/cosine_scaled_reward/mean": 0.7380162477493286, + "rewards/cosine_scaled_reward/std": 0.37427443265914917, + "rewards/repetition_penalty_reward/mean": -0.0784459114074707, + "rewards/repetition_penalty_reward/std": 0.039408691227436066, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626225590705872, + "rewards/reward_reference/mean": 0.90625, + "rewards/reward_reference/std": 0.2920515835285187, + "step": 178 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3977.0, + "completions/mean_length": 2772.37109375, + "completions/mean_terminated_length": 2707.2744140625, + "completions/min_length": 1277.0, + "completions/min_terminated_length": 1277.0, + "epoch": 0.03818666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1372547298669815, + "learning_rate": 1e-06, + "loss": 0.0033, + "num_tokens": 110533871.0, + "reward": 2.4635605812072754, + "reward_std": 0.17123311758041382, + "rewards/cosine_scaled_reward/mean": 0.6944536566734314, + "rewards/cosine_scaled_reward/std": 0.4396909177303314, + "rewards/repetition_penalty_reward/mean": -0.08401811122894287, + "rewards/repetition_penalty_reward/std": 0.051559146493673325, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718994140625, + "rewards/reward_reference/mean": 0.859375, + "rewards/reward_reference/std": 0.3483152687549591, + "step": 179 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.71875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3997.0, + "completions/mean_length": 2734.51171875, + "completions/mean_terminated_length": 2684.90283203125, + "completions/min_length": 1469.0, + "completions/min_terminated_length": 1469.0, + "epoch": 0.0384, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10627460479736328, + "learning_rate": 1e-06, + "loss": -0.0102, + "num_tokens": 111356254.0, + "reward": 2.5642635822296143, + "reward_std": 0.08983881771564484, + "rewards/cosine_scaled_reward/mean": 0.7460783123970032, + "rewards/cosine_scaled_reward/std": 0.37204012274742126, + "rewards/repetition_penalty_reward/mean": -0.07868963479995728, + "rewards/repetition_penalty_reward/std": 0.041410669684410095, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626226335763931, + "rewards/reward_reference/mean": 0.90625, + "rewards/reward_reference/std": 0.2920515835285187, + "step": 180 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4080.0, + "completions/mean_length": 2769.875, + "completions/mean_terminated_length": 2764.674560546875, + "completions/min_length": 1579.0, + "completions/min_terminated_length": 1579.0, + "epoch": 0.03861333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09451936185359955, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 112228258.0, + "reward": 2.7477917671203613, + "reward_std": 0.08577438443899155, + "rewards/cosine_scaled_reward/mean": 0.843535304069519, + "rewards/cosine_scaled_reward/std": 0.204621359705925, + "rewards/repetition_penalty_reward/mean": -0.0723060667514801, + "rewards/repetition_penalty_reward/std": 0.031312841922044754, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9765625, + "rewards/reward_reference/std": 0.15158477425575256, + "step": 181 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4006.0, + "completions/mean_length": 2760.140625, + "completions/mean_terminated_length": 2694.4423828125, + "completions/min_length": 1352.0, + "completions/min_terminated_length": 1352.0, + "epoch": 0.03882666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13490934669971466, + "learning_rate": 1e-06, + "loss": -0.0398, + "num_tokens": 113043882.0, + "reward": 2.5234622955322266, + "reward_std": 0.19220773875713348, + "rewards/cosine_scaled_reward/mean": 0.7239133715629578, + "rewards/cosine_scaled_reward/std": 0.4048174321651459, + "rewards/repetition_penalty_reward/mean": -0.08404479920864105, + "rewards/repetition_penalty_reward/std": 0.046684637665748596, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.88671875, + "rewards/reward_reference/std": 0.31755712628364563, + "step": 182 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3860.0, + "completions/mean_length": 2787.796875, + "completions/mean_terminated_length": 2772.28466796875, + "completions/min_length": 1674.0, + "completions/min_terminated_length": 1674.0, + "epoch": 0.03904, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14421382546424866, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 113910150.0, + "reward": 2.5739893913269043, + "reward_std": 0.16917826235294342, + "rewards/cosine_scaled_reward/mean": 0.747693657875061, + "rewards/cosine_scaled_reward/std": 0.3862946033477783, + "rewards/repetition_penalty_reward/mean": -0.07995417714118958, + "rewards/repetition_penalty_reward/std": 0.03675444424152374, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.90625, + "rewards/reward_reference/std": 0.2920515835285187, + "step": 183 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.8125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3901.0, + "completions/mean_length": 2733.01171875, + "completions/mean_terminated_length": 2700.300048828125, + "completions/min_length": 1620.0, + "completions/min_terminated_length": 1620.0, + "epoch": 0.039253333333333335, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12685048580169678, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 114744729.0, + "reward": 2.5625431537628174, + "reward_std": 0.1313878297805786, + "rewards/cosine_scaled_reward/mean": 0.7464696168899536, + "rewards/cosine_scaled_reward/std": 0.3718789219856262, + "rewards/repetition_penalty_reward/mean": -0.08080147951841354, + "rewards/repetition_penalty_reward/std": 0.04628358036279678, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626225590705872, + "rewards/reward_reference/mean": 0.90625, + "rewards/reward_reference/std": 0.2920515835285187, + "step": 184 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.84375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4039.0, + "completions/mean_length": 2822.77734375, + "completions/mean_terminated_length": 2797.414306640625, + "completions/min_length": 1797.0, + "completions/min_terminated_length": 1797.0, + "epoch": 0.039466666666666664, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14532560110092163, + "learning_rate": 1e-06, + "loss": -0.0105, + "num_tokens": 115605096.0, + "reward": 2.493752956390381, + "reward_std": 0.18467921018600464, + "rewards/cosine_scaled_reward/mean": 0.7073122262954712, + "rewards/cosine_scaled_reward/std": 0.4398163855075836, + "rewards/repetition_penalty_reward/mean": -0.08074688911437988, + "rewards/repetition_penalty_reward/std": 0.038459036499261856, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.8671875, + "rewards/reward_reference/std": 0.3400367796421051, + "step": 185 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4007.0, + "completions/mean_length": 2905.9609375, + "completions/mean_terminated_length": 2887.071533203125, + "completions/min_length": 1849.0, + "completions/min_terminated_length": 1849.0, + "epoch": 0.03968, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07806304842233658, + "learning_rate": 1e-06, + "loss": -0.008, + "num_tokens": 116500474.0, + "reward": 2.704318046569824, + "reward_std": 0.08798962831497192, + "rewards/cosine_scaled_reward/mean": 0.8352360725402832, + "rewards/cosine_scaled_reward/std": 0.2752145230770111, + "rewards/repetition_penalty_reward/mean": -0.08794920146465302, + "rewards/repetition_penalty_reward/std": 0.032611243426799774, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.95703125, + "rewards/reward_reference/std": 0.20318391919136047, + "step": 186 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3986.0, + "completions/mean_length": 2934.3984375, + "completions/mean_terminated_length": 2915.96044921875, + "completions/min_length": 2009.0, + "completions/min_terminated_length": 2009.0, + "epoch": 0.039893333333333336, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12208955734968185, + "learning_rate": 1e-06, + "loss": -0.0099, + "num_tokens": 117398232.0, + "reward": 2.709160327911377, + "reward_std": 0.12949120998382568, + "rewards/cosine_scaled_reward/mean": 0.8370053172111511, + "rewards/cosine_scaled_reward/std": 0.28149205446243286, + "rewards/repetition_penalty_reward/mean": -0.08487646281719208, + "rewards/repetition_penalty_reward/std": 0.0364808514714241, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.95703125, + "rewards/reward_reference/std": 0.20318391919136047, + "step": 187 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4064.0, + "completions/mean_length": 2816.8046875, + "completions/mean_terminated_length": 2801.636474609375, + "completions/min_length": 1413.0, + "completions/min_terminated_length": 1413.0, + "epoch": 0.040106666666666665, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11472379416227341, + "learning_rate": 1e-06, + "loss": 0.0017, + "num_tokens": 118263698.0, + "reward": 2.5987226963043213, + "reward_std": 0.1316905915737152, + "rewards/cosine_scaled_reward/mean": 0.7706905603408813, + "rewards/cosine_scaled_reward/std": 0.36017245054244995, + "rewards/repetition_penalty_reward/mean": -0.08603024482727051, + "rewards/repetition_penalty_reward/std": 0.030638542026281357, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9140625, + "rewards/reward_reference/std": 0.28082075715065, + "step": 188 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.65625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4018.0, + "completions/mean_length": 3020.15234375, + "completions/mean_terminated_length": 2971.848876953125, + "completions/min_length": 1944.0, + "completions/min_terminated_length": 1944.0, + "epoch": 0.04032, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31249523162841797, + "learning_rate": 1e-06, + "loss": -0.0538, + "num_tokens": 119145317.0, + "reward": 2.6029105186462402, + "reward_std": 0.25811532139778137, + "rewards/cosine_scaled_reward/mean": 0.7884730100631714, + "rewards/cosine_scaled_reward/std": 0.3834865689277649, + "rewards/repetition_penalty_reward/mean": -0.08946871757507324, + "rewards/repetition_penalty_reward/std": 0.042005784809589386, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.91015625, + "rewards/reward_reference/std": 0.2865179479122162, + "step": 189 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.59375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4083.0, + "completions/mean_length": 3139.890625, + "completions/mean_terminated_length": 3088.74072265625, + "completions/min_length": 1696.0, + "completions/min_terminated_length": 1696.0, + "epoch": 0.04053333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05340828001499176, + "learning_rate": 1e-06, + "loss": -0.0061, + "num_tokens": 120058193.0, + "reward": 2.6640408039093018, + "reward_std": 0.06567069888114929, + "rewards/cosine_scaled_reward/mean": 0.8290449380874634, + "rewards/cosine_scaled_reward/std": 0.34697195887565613, + "rewards/repetition_penalty_reward/mean": -0.09156674891710281, + "rewards/repetition_penalty_reward/std": 0.03702438622713089, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.9296875, + "rewards/reward_reference/std": 0.2561737895011902, + "step": 190 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.53125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4088.0, + "completions/mean_length": 3050.08984375, + "completions/mean_terminated_length": 2984.991943359375, + "completions/min_length": 1597.0, + "completions/min_terminated_length": 1597.0, + "epoch": 0.04074666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1327456533908844, + "learning_rate": 1e-06, + "loss": -0.0159, + "num_tokens": 120942404.0, + "reward": 2.5852432250976562, + "reward_std": 0.21544049680233002, + "rewards/cosine_scaled_reward/mean": 0.789260983467102, + "rewards/cosine_scaled_reward/std": 0.3850567936897278, + "rewards/repetition_penalty_reward/mean": -0.09620514512062073, + "rewards/repetition_penalty_reward/std": 0.04582031071186066, + "rewards/reward_format/mean": 0.9781249761581421, + "rewards/reward_format/std": 0.13072198629379272, + "rewards/reward_reference/mean": 0.9140625, + "rewards/reward_reference/std": 0.28082075715065, + "step": 191 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4077.0, + "completions/mean_length": 3017.37890625, + "completions/mean_terminated_length": 2964.331787109375, + "completions/min_length": 1971.0, + "completions/min_terminated_length": 1971.0, + "epoch": 0.04096, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06460432708263397, + "learning_rate": 1e-06, + "loss": -0.0154, + "num_tokens": 121827473.0, + "reward": 2.5872888565063477, + "reward_std": 0.09368535131216049, + "rewards/cosine_scaled_reward/mean": 0.7789064645767212, + "rewards/cosine_scaled_reward/std": 0.3939122259616852, + "rewards/repetition_penalty_reward/mean": -0.0978674441576004, + "rewards/repetition_penalty_reward/std": 0.040292881429195404, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.90625, + "rewards/reward_reference/std": 0.2920515835285187, + "step": 192 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4090.0, + "completions/mean_length": 2939.30859375, + "completions/mean_terminated_length": 2882.421875, + "completions/min_length": 1532.0, + "completions/min_terminated_length": 1532.0, + "epoch": 0.04117333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1415102332830429, + "learning_rate": 1e-06, + "loss": -0.022, + "num_tokens": 122691892.0, + "reward": 2.592499017715454, + "reward_std": 0.16990575194358826, + "rewards/cosine_scaled_reward/mean": 0.7752484083175659, + "rewards/cosine_scaled_reward/std": 0.3793916404247284, + "rewards/repetition_penalty_reward/mean": -0.09681184589862823, + "rewards/repetition_penalty_reward/std": 0.03823775798082352, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9140625, + "rewards/reward_reference/std": 0.28082075715065, + "step": 193 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.71875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4054.0, + "completions/mean_length": 2930.1796875, + "completions/mean_terminated_length": 2887.700439453125, + "completions/min_length": 1627.0, + "completions/min_terminated_length": 1627.0, + "epoch": 0.04138666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14961861073970795, + "learning_rate": 1e-06, + "loss": -0.0091, + "num_tokens": 123563606.0, + "reward": 2.638584613800049, + "reward_std": 0.20664985477924347, + "rewards/cosine_scaled_reward/mean": 0.8019155263900757, + "rewards/cosine_scaled_reward/std": 0.33979693055152893, + "rewards/repetition_penalty_reward/mean": -0.09379956871271133, + "rewards/repetition_penalty_reward/std": 0.03525862842798233, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 194 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.71875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4094.0, + "completions/mean_length": 2985.79296875, + "completions/mean_terminated_length": 2945.340087890625, + "completions/min_length": 1421.0, + "completions/min_terminated_length": 1421.0, + "epoch": 0.0416, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1307501643896103, + "learning_rate": 1e-06, + "loss": -0.019, + "num_tokens": 124465661.0, + "reward": 2.659268856048584, + "reward_std": 0.16081370413303375, + "rewards/cosine_scaled_reward/mean": 0.8172322511672974, + "rewards/cosine_scaled_reward/std": 0.32748943567276, + "rewards/repetition_penalty_reward/mean": -0.09546343237161636, + "rewards/repetition_penalty_reward/std": 0.03714780509471893, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9375, + "rewards/reward_reference/std": 0.24253563582897186, + "step": 195 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4074.0, + "completions/mean_length": 2998.1875, + "completions/mean_terminated_length": 2944.196533203125, + "completions/min_length": 1623.0, + "completions/min_terminated_length": 1623.0, + "epoch": 0.041813333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12217960506677628, + "learning_rate": 1e-06, + "loss": -0.0028, + "num_tokens": 125349101.0, + "reward": 2.6592578887939453, + "reward_std": 0.13490937650203705, + "rewards/cosine_scaled_reward/mean": 0.8189342021942139, + "rewards/cosine_scaled_reward/std": 0.32874596118927, + "rewards/repetition_penalty_reward/mean": -0.09327013045549393, + "rewards/repetition_penalty_reward/std": 0.033582936972379684, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 196 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.75, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4092.0, + "completions/mean_length": 2956.6015625, + "completions/mean_terminated_length": 2919.8466796875, + "completions/min_length": 1686.0, + "completions/min_terminated_length": 1686.0, + "epoch": 0.042026666666666664, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06713753193616867, + "learning_rate": 1e-06, + "loss": -0.0022, + "num_tokens": 126240259.0, + "reward": 2.714076042175293, + "reward_std": 0.09007962793111801, + "rewards/cosine_scaled_reward/mean": 0.8496314287185669, + "rewards/cosine_scaled_reward/std": 0.2547302544116974, + "rewards/repetition_penalty_reward/mean": -0.09649277478456497, + "rewards/repetition_penalty_reward/std": 0.04336199536919594, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9609375, + "rewards/reward_reference/std": 0.19412322342395782, + "step": 197 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.4375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4087.0, + "completions/mean_length": 3068.90625, + "completions/mean_terminated_length": 2991.22705078125, + "completions/min_length": 1184.0, + "completions/min_terminated_length": 1184.0, + "epoch": 0.04224, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09822492301464081, + "learning_rate": 1e-06, + "loss": 0.0095, + "num_tokens": 127112907.0, + "reward": 2.6488893032073975, + "reward_std": 0.14325517416000366, + "rewards/cosine_scaled_reward/mean": 0.8167030215263367, + "rewards/cosine_scaled_reward/std": 0.3480437099933624, + "rewards/repetition_penalty_reward/mean": -0.0975012555718422, + "rewards/repetition_penalty_reward/std": 0.03966366872191429, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9296875, + "rewards/reward_reference/std": 0.2561737895011902, + "step": 198 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.53125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4046.0, + "completions/mean_length": 3024.32421875, + "completions/mean_terminated_length": 2957.62255859375, + "completions/min_length": 1712.0, + "completions/min_terminated_length": 1712.0, + "epoch": 0.042453333333333336, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09099367260932922, + "learning_rate": 1e-06, + "loss": -0.0028, + "num_tokens": 127996106.0, + "reward": 2.632681369781494, + "reward_std": 0.1334916204214096, + "rewards/cosine_scaled_reward/mean": 0.7996670603752136, + "rewards/cosine_scaled_reward/std": 0.3644350469112396, + "rewards/repetition_penalty_reward/mean": -0.08886080980300903, + "rewards/repetition_penalty_reward/std": 0.03873216733336449, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.921875, + "rewards/reward_reference/std": 0.26889389753341675, + "step": 199 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4067.0, + "completions/mean_length": 3094.09375, + "completions/mean_terminated_length": 3044.819580078125, + "completions/min_length": 1948.0, + "completions/min_terminated_length": 1948.0, + "epoch": 0.042666666666666665, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14509928226470947, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 128896898.0, + "reward": 2.5783491134643555, + "reward_std": 0.20983529090881348, + "rewards/cosine_scaled_reward/mean": 0.7796949148178101, + "rewards/cosine_scaled_reward/std": 0.4083874821662903, + "rewards/repetition_penalty_reward/mean": -0.10056446492671967, + "rewards/repetition_penalty_reward/std": 0.04261765629053116, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.90234375, + "rewards/reward_reference/std": 0.29743078351020813, + "step": 200 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.71875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4046.0, + "completions/mean_length": 3027.64453125, + "completions/mean_terminated_length": 2988.716552734375, + "completions/min_length": 1566.0, + "completions/min_terminated_length": 1566.0, + "epoch": 0.04288, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1521781086921692, + "learning_rate": 1e-06, + "loss": -0.0159, + "num_tokens": 129796651.0, + "reward": 2.5637636184692383, + "reward_std": 0.2006940245628357, + "rewards/cosine_scaled_reward/mean": 0.7643671035766602, + "rewards/cosine_scaled_reward/std": 0.41633033752441406, + "rewards/repetition_penalty_reward/mean": -0.09122838079929352, + "rewards/repetition_penalty_reward/std": 0.03793216124176979, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.890625, + "rewards/reward_reference/std": 0.31272050738334656, + "step": 201 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.59375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4049.0, + "completions/mean_length": 3094.3671875, + "completions/mean_terminated_length": 3040.78173828125, + "completions/min_length": 1759.0, + "completions/min_terminated_length": 1759.0, + "epoch": 0.04309333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.039731431752443314, + "learning_rate": 1e-06, + "loss": -0.0152, + "num_tokens": 130699769.0, + "reward": 2.798168182373047, + "reward_std": 0.07617372274398804, + "rewards/cosine_scaled_reward/mean": 0.9041758179664612, + "rewards/cosine_scaled_reward/std": 0.14127732813358307, + "rewards/repetition_penalty_reward/mean": -0.0981951504945755, + "rewards/repetition_penalty_reward/std": 0.03816307336091995, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9921875, + "rewards/reward_reference/std": 0.08821486681699753, + "step": 202 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4048.0, + "completions/mean_length": 2993.23046875, + "completions/mean_terminated_length": 2938.995849609375, + "completions/min_length": 1732.0, + "completions/min_terminated_length": 1732.0, + "epoch": 0.04330666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0980035662651062, + "learning_rate": 1e-06, + "loss": -0.0076, + "num_tokens": 131575184.0, + "reward": 2.4957215785980225, + "reward_std": 0.16899895668029785, + "rewards/cosine_scaled_reward/mean": 0.7244171500205994, + "rewards/cosine_scaled_reward/std": 0.45311957597732544, + "rewards/repetition_penalty_reward/mean": -0.09275795519351959, + "rewards/repetition_penalty_reward/std": 0.048649415373802185, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.8671875, + "rewards/reward_reference/std": 0.3400367796421051, + "step": 203 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.65625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4071.0, + "completions/mean_length": 3021.48828125, + "completions/mean_terminated_length": 2973.244873046875, + "completions/min_length": 1873.0, + "completions/min_terminated_length": 1873.0, + "epoch": 0.04352, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0876871794462204, + "learning_rate": 1e-06, + "loss": -0.016, + "num_tokens": 132464137.0, + "reward": 2.6541526317596436, + "reward_std": 0.10013444721698761, + "rewards/cosine_scaled_reward/mean": 0.8130102157592773, + "rewards/cosine_scaled_reward/std": 0.3439813554286957, + "rewards/repetition_penalty_reward/mean": -0.08854503184556961, + "rewards/repetition_penalty_reward/std": 0.04010245203971863, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9296875, + "rewards/reward_reference/std": 0.2561737895011902, + "step": 204 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4089.0, + "completions/mean_length": 2941.59375, + "completions/mean_terminated_length": 2932.50390625, + "completions/min_length": 1694.0, + "completions/min_terminated_length": 1694.0, + "epoch": 0.04373333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08829687535762787, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 133372257.0, + "reward": 2.6222023963928223, + "reward_std": 0.09712390601634979, + "rewards/cosine_scaled_reward/mean": 0.7839394211769104, + "rewards/cosine_scaled_reward/std": 0.37110278010368347, + "rewards/repetition_penalty_reward/mean": -0.08361180871725082, + "rewards/repetition_penalty_reward/std": 0.036519844084978104, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.921875, + "rewards/reward_reference/std": 0.26889389753341675, + "step": 205 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.8125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4052.0, + "completions/mean_length": 3063.41015625, + "completions/mean_terminated_length": 3038.628173828125, + "completions/min_length": 1935.0, + "completions/min_terminated_length": 1935.0, + "epoch": 0.04394666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1125572919845581, + "learning_rate": 1e-06, + "loss": 0.0027, + "num_tokens": 134293802.0, + "reward": 2.643646478652954, + "reward_std": 0.15754051506519318, + "rewards/cosine_scaled_reward/mean": 0.8129024505615234, + "rewards/cosine_scaled_reward/std": 0.35667291283607483, + "rewards/repetition_penalty_reward/mean": -0.09113100171089172, + "rewards/repetition_penalty_reward/std": 0.04145337641239166, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.921875, + "rewards/reward_reference/std": 0.26889389753341675, + "step": 206 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4067.0, + "completions/mean_length": 3047.96875, + "completions/mean_terminated_length": 2996.426025390625, + "completions/min_length": 1952.0, + "completions/min_terminated_length": 1952.0, + "epoch": 0.04416, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12327634543180466, + "learning_rate": 1e-06, + "loss": -0.0238, + "num_tokens": 135185766.0, + "reward": 2.4939651489257812, + "reward_std": 0.18519936501979828, + "rewards/cosine_scaled_reward/mean": 0.7189508080482483, + "rewards/cosine_scaled_reward/std": 0.4683718979358673, + "rewards/repetition_penalty_reward/mean": -0.08826674520969391, + "rewards/repetition_penalty_reward/std": 0.0476943776011467, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.86328125, + "rewards/reward_reference/std": 0.34422317147254944, + "step": 207 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3996.0, + "completions/mean_length": 2941.19140625, + "completions/mean_terminated_length": 2932.098388671875, + "completions/min_length": 1626.0, + "completions/min_terminated_length": 1626.0, + "epoch": 0.044373333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06048279255628586, + "learning_rate": 1e-06, + "loss": -0.003, + "num_tokens": 136098811.0, + "reward": 2.684300422668457, + "reward_std": 0.07446961104869843, + "rewards/cosine_scaled_reward/mean": 0.8218631744384766, + "rewards/cosine_scaled_reward/std": 0.3134990632534027, + "rewards/repetition_penalty_reward/mean": -0.07896900177001953, + "rewards/repetition_penalty_reward/std": 0.0332108810544014, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.94140625, + "rewards/reward_reference/std": 0.23532284796237946, + "step": 208 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4071.0, + "completions/mean_length": 3074.71484375, + "completions/mean_terminated_length": 2988.165283203125, + "completions/min_length": 1816.0, + "completions/min_terminated_length": 1816.0, + "epoch": 0.04458666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08215809613466263, + "learning_rate": 1e-06, + "loss": -0.0132, + "num_tokens": 136966294.0, + "reward": 2.624622344970703, + "reward_std": 0.1569027155637741, + "rewards/cosine_scaled_reward/mean": 0.8008697628974915, + "rewards/cosine_scaled_reward/std": 0.3725561201572418, + "rewards/repetition_penalty_reward/mean": -0.08796609938144684, + "rewards/repetition_penalty_reward/std": 0.05247509479522705, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.91796875, + "rewards/reward_reference/std": 0.2749498784542084, + "step": 209 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.75, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4015.0, + "completions/mean_length": 2938.37109375, + "completions/mean_terminated_length": 2901.028076171875, + "completions/min_length": 1573.0, + "completions/min_terminated_length": 1573.0, + "epoch": 0.0448, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14607393741607666, + "learning_rate": 1e-06, + "loss": -0.0152, + "num_tokens": 137849509.0, + "reward": 2.6370255947113037, + "reward_std": 0.22698213160037994, + "rewards/cosine_scaled_reward/mean": 0.7916585803031921, + "rewards/cosine_scaled_reward/std": 0.3612592816352844, + "rewards/repetition_penalty_reward/mean": -0.07650791108608246, + "rewards/repetition_penalty_reward/std": 0.042758870869874954, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.921875, + "rewards/reward_reference/std": 0.26889389753341675, + "step": 210 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3943.0, + "completions/mean_length": 2850.15234375, + "completions/mean_terminated_length": 2830.377197265625, + "completions/min_length": 1788.0, + "completions/min_terminated_length": 1788.0, + "epoch": 0.045013333333333336, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12609753012657166, + "learning_rate": 1e-06, + "loss": -0.0013, + "num_tokens": 138723052.0, + "reward": 2.524672031402588, + "reward_std": 0.1426827311515808, + "rewards/cosine_scaled_reward/mean": 0.7225947380065918, + "rewards/cosine_scaled_reward/std": 0.4295172691345215, + "rewards/repetition_penalty_reward/mean": -0.07292264699935913, + "rewards/repetition_penalty_reward/std": 0.03526431694626808, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.875, + "rewards/reward_reference/std": 0.33136674761772156, + "step": 211 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.8125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4091.0, + "completions/mean_length": 2919.5390625, + "completions/mean_terminated_length": 2891.30419921875, + "completions/min_length": 1815.0, + "completions/min_terminated_length": 1815.0, + "epoch": 0.045226666666666665, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1029074564576149, + "learning_rate": 1e-06, + "loss": -0.0146, + "num_tokens": 139603246.0, + "reward": 2.6630499362945557, + "reward_std": 0.13122224807739258, + "rewards/cosine_scaled_reward/mean": 0.8020139932632446, + "rewards/cosine_scaled_reward/std": 0.33735036849975586, + "rewards/repetition_penalty_reward/mean": -0.07255782186985016, + "rewards/repetition_penalty_reward/std": 0.031210284680128098, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 212 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.65625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3999.0, + "completions/mean_length": 2900.22265625, + "completions/mean_terminated_length": 2846.53466796875, + "completions/min_length": 1855.0, + "completions/min_terminated_length": 1855.0, + "epoch": 0.04544, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11880956590175629, + "learning_rate": 1e-06, + "loss": -0.0113, + "num_tokens": 140461603.0, + "reward": 2.6664743423461914, + "reward_std": 0.14175042510032654, + "rewards/cosine_scaled_reward/mean": 0.8035378456115723, + "rewards/cosine_scaled_reward/std": 0.32895320653915405, + "rewards/repetition_penalty_reward/mean": -0.0745634138584137, + "rewards/repetition_penalty_reward/std": 0.035252682864665985, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9375, + "rewards/reward_reference/std": 0.24253563582897186, + "step": 213 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.8125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4029.0, + "completions/mean_length": 2865.43359375, + "completions/mean_terminated_length": 2835.900146484375, + "completions/min_length": 1427.0, + "completions/min_terminated_length": 1427.0, + "epoch": 0.04565333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08788994699716568, + "learning_rate": 1e-06, + "loss": -0.0391, + "num_tokens": 141330550.0, + "reward": 2.6503522396087646, + "reward_std": 0.13490185141563416, + "rewards/cosine_scaled_reward/mean": 0.7931498289108276, + "rewards/cosine_scaled_reward/std": 0.33805233240127563, + "rewards/repetition_penalty_reward/mean": -0.0724850744009018, + "rewards/repetition_penalty_reward/std": 0.02933771163225174, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9296875, + "rewards/reward_reference/std": 0.2561737895011902, + "step": 214 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.8125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4060.0, + "completions/mean_length": 2900.1171875, + "completions/mean_terminated_length": 2871.416015625, + "completions/min_length": 547.0, + "completions/min_terminated_length": 547.0, + "epoch": 0.04586666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12615323066711426, + "learning_rate": 1e-06, + "loss": -0.0042, + "num_tokens": 142217136.0, + "reward": 2.5718753337860107, + "reward_std": 0.15692222118377686, + "rewards/cosine_scaled_reward/mean": 0.7568657398223877, + "rewards/cosine_scaled_reward/std": 0.39952579140663147, + "rewards/repetition_penalty_reward/mean": -0.07483415305614471, + "rewards/repetition_penalty_reward/std": 0.03881849721074104, + "rewards/reward_format/mean": 0.995312511920929, + "rewards/reward_format/std": 0.05581394582986832, + "rewards/reward_reference/mean": 0.89453125, + "rewards/reward_reference/std": 0.3077581524848938, + "step": 215 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.78125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4087.0, + "completions/mean_length": 3043.5234375, + "completions/mean_terminated_length": 3013.935546875, + "completions/min_length": 1927.0, + "completions/min_terminated_length": 1927.0, + "epoch": 0.04608, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12513042986392975, + "learning_rate": 1e-06, + "loss": -0.0171, + "num_tokens": 143130158.0, + "reward": 2.602346658706665, + "reward_std": 0.18451841175556183, + "rewards/cosine_scaled_reward/mean": 0.7779296636581421, + "rewards/cosine_scaled_reward/std": 0.40503278374671936, + "rewards/repetition_penalty_reward/mean": -0.07402050495147705, + "rewards/repetition_penalty_reward/std": 0.030004704371094704, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.8984375, + "rewards/reward_reference/std": 0.3026638329029083, + "step": 216 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.78125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4084.0, + "completions/mean_length": 2995.80078125, + "completions/mean_terminated_length": 2964.871337890625, + "completions/min_length": 1747.0, + "completions/min_terminated_length": 1747.0, + "epoch": 0.04629333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12058738619089127, + "learning_rate": 1e-06, + "loss": -0.0041, + "num_tokens": 144025731.0, + "reward": 2.4607934951782227, + "reward_std": 0.14996573328971863, + "rewards/cosine_scaled_reward/mean": 0.6966145038604736, + "rewards/cosine_scaled_reward/std": 0.48234885931015015, + "rewards/repetition_penalty_reward/mean": -0.08738350868225098, + "rewards/repetition_penalty_reward/std": 0.043269261717796326, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.8515625, + "rewards/reward_reference/std": 0.3562295734882355, + "step": 217 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4089.0, + "completions/mean_length": 3143.25, + "completions/mean_terminated_length": 3062.508544921875, + "completions/min_length": 1899.0, + "completions/min_terminated_length": 1899.0, + "epoch": 0.04650666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06288176029920578, + "learning_rate": 1e-06, + "loss": -0.0037, + "num_tokens": 144920803.0, + "reward": 2.7394394874572754, + "reward_std": 0.09670986980199814, + "rewards/cosine_scaled_reward/mean": 0.8714437484741211, + "rewards/cosine_scaled_reward/std": 0.26286423206329346, + "rewards/repetition_penalty_reward/mean": -0.09059803187847137, + "rewards/repetition_penalty_reward/std": 0.04828861728310585, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.96484375, + "rewards/reward_reference/std": 0.18453538417816162, + "step": 218 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.21875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4054.0, + "completions/mean_length": 3217.70703125, + "completions/mean_terminated_length": 3122.653564453125, + "completions/min_length": 2044.0, + "completions/min_terminated_length": 2044.0, + "epoch": 0.04672, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12472854554653168, + "learning_rate": 1e-06, + "loss": 0.0065, + "num_tokens": 145811928.0, + "reward": 2.718686103820801, + "reward_std": 0.12714329361915588, + "rewards/cosine_scaled_reward/mean": 0.8730841279029846, + "rewards/cosine_scaled_reward/std": 0.28264540433883667, + "rewards/repetition_penalty_reward/mean": -0.1114293783903122, + "rewards/repetition_penalty_reward/std": 0.05651494488120079, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.95703125, + "rewards/reward_reference/std": 0.20318391919136047, + "step": 219 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4087.0, + "completions/mean_length": 3317.3203125, + "completions/mean_terminated_length": 3251.33056640625, + "completions/min_length": 1816.0, + "completions/min_terminated_length": 1816.0, + "epoch": 0.046933333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11304979771375656, + "learning_rate": 1e-06, + "loss": -0.0071, + "num_tokens": 146742942.0, + "reward": 2.702650547027588, + "reward_std": 0.14594215154647827, + "rewards/cosine_scaled_reward/mean": 0.8594841957092285, + "rewards/cosine_scaled_reward/std": 0.33492517471313477, + "rewards/repetition_penalty_reward/mean": -0.09824004769325256, + "rewards/repetition_penalty_reward/std": 0.05412546917796135, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.94140625, + "rewards/reward_reference/std": 0.23532284796237946, + "step": 220 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.40625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4088.0, + "completions/mean_length": 3472.67578125, + "completions/mean_terminated_length": 3317.60498046875, + "completions/min_length": 1775.0, + "completions/min_terminated_length": 1775.0, + "epoch": 0.04714666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13369408249855042, + "learning_rate": 1e-06, + "loss": -0.0467, + "num_tokens": 147579567.0, + "reward": 2.5761454105377197, + "reward_std": 0.23749884963035583, + "rewards/cosine_scaled_reward/mean": 0.798820972442627, + "rewards/cosine_scaled_reward/std": 0.4479582905769348, + "rewards/repetition_penalty_reward/mean": -0.1133006364107132, + "rewards/repetition_penalty_reward/std": 0.05030689388513565, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.890625, + "rewards/reward_reference/std": 0.31272050738334656, + "step": 221 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.53125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4091.0, + "completions/mean_length": 3550.10546875, + "completions/mean_terminated_length": 3427.344482421875, + "completions/min_length": 1707.0, + "completions/min_terminated_length": 1707.0, + "epoch": 0.04736, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22356118261814117, + "learning_rate": 1e-06, + "loss": -0.0519, + "num_tokens": 148456190.0, + "reward": 2.5239830017089844, + "reward_std": 0.34182125329971313, + "rewards/cosine_scaled_reward/mean": 0.7705343961715698, + "rewards/cosine_scaled_reward/std": 0.49305281043052673, + "rewards/repetition_penalty_reward/mean": -0.11373880505561829, + "rewards/repetition_penalty_reward/std": 0.04404772073030472, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.8671875, + "rewards/reward_reference/std": 0.3400367796421051, + "step": 222 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.25, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4077.0, + "completions/mean_length": 3501.703125, + "completions/mean_terminated_length": 3335.2998046875, + "completions/min_length": 2298.0, + "completions/min_terminated_length": 2298.0, + "epoch": 0.047573333333333336, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12591539323329926, + "learning_rate": 1e-06, + "loss": -0.0701, + "num_tokens": 149286418.0, + "reward": 2.5235965251922607, + "reward_std": 0.3133578598499298, + "rewards/cosine_scaled_reward/mean": 0.7684545516967773, + "rewards/cosine_scaled_reward/std": 0.4874875545501709, + "rewards/repetition_penalty_reward/mean": -0.10970175266265869, + "rewards/repetition_penalty_reward/std": 0.05052530765533447, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.87109375, + "rewards/reward_reference/std": 0.33575257658958435, + "step": 223 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.65625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4078.0, + "completions/mean_length": 3428.93359375, + "completions/mean_terminated_length": 3294.267578125, + "completions/min_length": 2225.0, + "completions/min_terminated_length": 2225.0, + "epoch": 0.047786666666666665, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20103350281715393, + "learning_rate": 1e-06, + "loss": -0.0459, + "num_tokens": 150155365.0, + "reward": 2.6452536582946777, + "reward_std": 0.23817402124404907, + "rewards/cosine_scaled_reward/mean": 0.8362963199615479, + "rewards/cosine_scaled_reward/std": 0.39203858375549316, + "rewards/repetition_penalty_reward/mean": -0.10666733235120773, + "rewards/repetition_penalty_reward/std": 0.04530767351388931, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.921875, + "rewards/reward_reference/std": 0.26889389753341675, + "step": 224 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.03125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4067.0, + "completions/mean_length": 3358.8359375, + "completions/mean_terminated_length": 3257.271240234375, + "completions/min_length": 2110.0, + "completions/min_terminated_length": 2110.0, + "epoch": 0.048, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09694981575012207, + "learning_rate": 1e-06, + "loss": 0.0054, + "num_tokens": 151056083.0, + "reward": 2.6646728515625, + "reward_std": 0.1226922944188118, + "rewards/cosine_scaled_reward/mean": 0.8418905735015869, + "rewards/cosine_scaled_reward/std": 0.37268179655075073, + "rewards/repetition_penalty_reward/mean": -0.10299905389547348, + "rewards/repetition_penalty_reward/std": 0.038974642753601074, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.92578125, + "rewards/reward_reference/std": 0.2626400291919708, + "step": 225 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4070.0, + "completions/mean_length": 3305.85546875, + "completions/mean_terminated_length": 3192.977783203125, + "completions/min_length": 2018.0, + "completions/min_terminated_length": 2018.0, + "epoch": 0.04821333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10110696405172348, + "learning_rate": 1e-06, + "loss": -0.0127, + "num_tokens": 151933410.0, + "reward": 2.6624765396118164, + "reward_std": 0.13132601976394653, + "rewards/cosine_scaled_reward/mean": 0.8468172550201416, + "rewards/cosine_scaled_reward/std": 0.35259246826171875, + "rewards/repetition_penalty_reward/mean": -0.11480934172868729, + "rewards/repetition_penalty_reward/std": 0.0483165867626667, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 226 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.34375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4091.0, + "completions/mean_length": 3393.78125, + "completions/mean_terminated_length": 3210.443359375, + "completions/min_length": 1401.0, + "completions/min_terminated_length": 1401.0, + "epoch": 0.048426666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10942267626523972, + "learning_rate": 1e-06, + "loss": -0.0237, + "num_tokens": 152751126.0, + "reward": 2.390054225921631, + "reward_std": 0.24889340996742249, + "rewards/cosine_scaled_reward/mean": 0.6856964826583862, + "rewards/cosine_scaled_reward/std": 0.5464708209037781, + "rewards/repetition_penalty_reward/mean": -0.11595490574836731, + "rewards/repetition_penalty_reward/std": 0.0469362810254097, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.8203125, + "rewards/reward_reference/std": 0.38467901945114136, + "step": 227 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.40625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4083.0, + "completions/mean_length": 3496.51171875, + "completions/mean_terminated_length": 3347.370849609375, + "completions/min_length": 2328.0, + "completions/min_terminated_length": 2328.0, + "epoch": 0.04864, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1789751499891281, + "learning_rate": 1e-06, + "loss": -0.0884, + "num_tokens": 153594401.0, + "reward": 2.5644519329071045, + "reward_std": 0.27266085147857666, + "rewards/cosine_scaled_reward/mean": 0.793789267539978, + "rewards/cosine_scaled_reward/std": 0.4596695303916931, + "rewards/repetition_penalty_reward/mean": -0.11605618894100189, + "rewards/repetition_penalty_reward/std": 0.051774267107248306, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.88671875, + "rewards/reward_reference/std": 0.31755712628364563, + "step": 228 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.84375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4085.0, + "completions/mean_length": 3340.65234375, + "completions/mean_terminated_length": 3213.036376953125, + "completions/min_length": 1993.0, + "completions/min_terminated_length": 1993.0, + "epoch": 0.04885333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11022138595581055, + "learning_rate": 1e-06, + "loss": -0.0348, + "num_tokens": 154459104.0, + "reward": 2.6423568725585938, + "reward_std": 0.20323346555233002, + "rewards/cosine_scaled_reward/mean": 0.8313206434249878, + "rewards/cosine_scaled_reward/std": 0.38284415006637573, + "rewards/repetition_penalty_reward/mean": -0.1108388751745224, + "rewards/repetition_penalty_reward/std": 0.05063518136739731, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.921875, + "rewards/reward_reference/std": 0.26889389753341675, + "step": 229 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4063.0, + "completions/mean_length": 3300.359375, + "completions/mean_terminated_length": 3186.696533203125, + "completions/min_length": 1988.0, + "completions/min_terminated_length": 1988.0, + "epoch": 0.04906666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1534515768289566, + "learning_rate": 1e-06, + "loss": -0.0095, + "num_tokens": 155334276.0, + "reward": 2.536712646484375, + "reward_std": 0.24921995401382446, + "rewards/cosine_scaled_reward/mean": 0.770061194896698, + "rewards/cosine_scaled_reward/std": 0.4599458873271942, + "rewards/repetition_penalty_reward/mean": -0.11225477606058121, + "rewards/repetition_penalty_reward/std": 0.04974426329135895, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.87890625, + "rewards/reward_reference/std": 0.3268752694129944, + "step": 230 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.78125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4070.0, + "completions/mean_length": 3381.6640625, + "completions/mean_terminated_length": 3253.281005859375, + "completions/min_length": 1435.0, + "completions/min_terminated_length": 1435.0, + "epoch": 0.04928, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15593107044696808, + "learning_rate": 1e-06, + "loss": -0.0331, + "num_tokens": 156200458.0, + "reward": 2.547464370727539, + "reward_std": 0.21806156635284424, + "rewards/cosine_scaled_reward/mean": 0.7779920101165771, + "rewards/cosine_scaled_reward/std": 0.46156105399131775, + "rewards/repetition_penalty_reward/mean": -0.11334001272916794, + "rewards/repetition_penalty_reward/std": 0.05135047063231468, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.8828125, + "rewards/reward_reference/std": 0.3222736418247223, + "step": 231 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4095.0, + "completions/mean_length": 3302.9296875, + "completions/mean_terminated_length": 3189.634033203125, + "completions/min_length": 2091.0, + "completions/min_terminated_length": 2091.0, + "epoch": 0.049493333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1349734663963318, + "learning_rate": 1e-06, + "loss": -0.0397, + "num_tokens": 157074428.0, + "reward": 2.4278252124786377, + "reward_std": 0.1906101256608963, + "rewards/cosine_scaled_reward/mean": 0.7002975344657898, + "rewards/cosine_scaled_reward/std": 0.5271919369697571, + "rewards/repetition_penalty_reward/mean": -0.10137848556041718, + "rewards/repetition_penalty_reward/std": 0.05053536593914032, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.83203125, + "rewards/reward_reference/std": 0.3745708465576172, + "step": 232 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.53125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4085.0, + "completions/mean_length": 3317.5859375, + "completions/mean_terminated_length": 3269.13720703125, + "completions/min_length": 1730.0, + "completions/min_terminated_length": 1730.0, + "epoch": 0.04970666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12044618278741837, + "learning_rate": 1e-06, + "loss": -0.019, + "num_tokens": 158039622.0, + "reward": 2.680562734603882, + "reward_std": 0.18000000715255737, + "rewards/cosine_scaled_reward/mean": 0.8430810570716858, + "rewards/cosine_scaled_reward/std": 0.36536091566085815, + "rewards/repetition_penalty_reward/mean": -0.09220585227012634, + "rewards/repetition_penalty_reward/std": 0.03817109391093254, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9296875, + "rewards/reward_reference/std": 0.2561737895011902, + "step": 233 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.21875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4076.0, + "completions/mean_length": 3241.15625, + "completions/mean_terminated_length": 3148.640625, + "completions/min_length": 2108.0, + "completions/min_terminated_length": 2108.0, + "epoch": 0.04992, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10091330856084824, + "learning_rate": 1e-06, + "loss": -0.0177, + "num_tokens": 158919982.0, + "reward": 2.644728899002075, + "reward_std": 0.20279303193092346, + "rewards/cosine_scaled_reward/mean": 0.822848916053772, + "rewards/cosine_scaled_reward/std": 0.3769766688346863, + "rewards/repetition_penalty_reward/mean": -0.09999504685401917, + "rewards/repetition_penalty_reward/std": 0.03829427435994148, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.921875, + "rewards/reward_reference/std": 0.26889389753341675, + "step": 234 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.1875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4095.0, + "completions/mean_length": 3326.4375, + "completions/mean_terminated_length": 3240.195556640625, + "completions/min_length": 2029.0, + "completions/min_terminated_length": 2029.0, + "epoch": 0.050133333333333335, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13295038044452667, + "learning_rate": 1e-06, + "loss": -0.0124, + "num_tokens": 159822851.0, + "reward": 2.5747616291046143, + "reward_std": 0.16040503978729248, + "rewards/cosine_scaled_reward/mean": 0.7831013202667236, + "rewards/cosine_scaled_reward/std": 0.449871689081192, + "rewards/repetition_penalty_reward/mean": -0.09505827724933624, + "rewards/repetition_penalty_reward/std": 0.03566938266158104, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.88671875, + "rewards/reward_reference/std": 0.31755712628364563, + "step": 235 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4082.0, + "completions/mean_length": 3277.328125, + "completions/mean_terminated_length": 3160.375244140625, + "completions/min_length": 1833.0, + "completions/min_terminated_length": 1833.0, + "epoch": 0.050346666666666665, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16465094685554504, + "learning_rate": 1e-06, + "loss": -0.0298, + "num_tokens": 160686123.0, + "reward": 2.4961965084075928, + "reward_std": 0.24923688173294067, + "rewards/cosine_scaled_reward/mean": 0.744510293006897, + "rewards/cosine_scaled_reward/std": 0.4811403155326843, + "rewards/repetition_penalty_reward/mean": -0.0967513769865036, + "rewards/repetition_penalty_reward/std": 0.045376941561698914, + "rewards/reward_format/mean": 0.981249988079071, + "rewards/reward_format/std": 0.12126781791448593, + "rewards/reward_reference/mean": 0.8671875, + "rewards/reward_reference/std": 0.3400367796421051, + "step": 236 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.03125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4076.0, + "completions/mean_length": 3352.30078125, + "completions/mean_terminated_length": 3249.835693359375, + "completions/min_length": 1897.0, + "completions/min_terminated_length": 1897.0, + "epoch": 0.05056, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1568382978439331, + "learning_rate": 1e-06, + "loss": -0.0474, + "num_tokens": 161582056.0, + "reward": 2.635594367980957, + "reward_std": 0.2220080941915512, + "rewards/cosine_scaled_reward/mean": 0.8191026449203491, + "rewards/cosine_scaled_reward/std": 0.4030058979988098, + "rewards/repetition_penalty_reward/mean": -0.09444564580917358, + "rewards/repetition_penalty_reward/std": 0.0460088886320591, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.9140625, + "rewards/reward_reference/std": 0.28082075715065, + "step": 237 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4060.0, + "completions/mean_length": 3279.74609375, + "completions/mean_terminated_length": 3179.50439453125, + "completions/min_length": 1848.0, + "completions/min_terminated_length": 1848.0, + "epoch": 0.05077333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17059451341629028, + "learning_rate": 1e-06, + "loss": -0.0619, + "num_tokens": 162473719.0, + "reward": 2.5931172370910645, + "reward_std": 0.25767046213150024, + "rewards/cosine_scaled_reward/mean": 0.7888476848602295, + "rewards/cosine_scaled_reward/std": 0.4332166910171509, + "rewards/repetition_penalty_reward/mean": -0.09104306995868683, + "rewards/repetition_penalty_reward/std": 0.04912455752491951, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.8984375, + "rewards/reward_reference/std": 0.3026638329029083, + "step": 238 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.34375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4076.0, + "completions/mean_length": 3219.9453125, + "completions/mean_terminated_length": 3141.659423828125, + "completions/min_length": 1931.0, + "completions/min_terminated_length": 1931.0, + "epoch": 0.050986666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10416688770055771, + "learning_rate": 1e-06, + "loss": -0.0127, + "num_tokens": 163372405.0, + "reward": 2.6563687324523926, + "reward_std": 0.12349631637334824, + "rewards/cosine_scaled_reward/mean": 0.8200638294219971, + "rewards/cosine_scaled_reward/std": 0.37859129905700684, + "rewards/repetition_penalty_reward/mean": -0.08557005226612091, + "rewards/repetition_penalty_reward/std": 0.03227576985955238, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.921875, + "rewards/reward_reference/std": 0.26889389753341675, + "step": 239 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4022.0, + "completions/mean_length": 3282.00390625, + "completions/mean_terminated_length": 3148.804443359375, + "completions/min_length": 2080.0, + "completions/min_terminated_length": 2080.0, + "epoch": 0.0512, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1761353760957718, + "learning_rate": 1e-06, + "loss": 0.0055, + "num_tokens": 164225194.0, + "reward": 2.449544906616211, + "reward_std": 0.2723526358604431, + "rewards/cosine_scaled_reward/mean": 0.7084175944328308, + "rewards/cosine_scaled_reward/std": 0.5179768800735474, + "rewards/repetition_penalty_reward/mean": -0.09168502688407898, + "rewards/repetition_penalty_reward/std": 0.03688832372426987, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.8359375, + "rewards/reward_reference/std": 0.3710577189922333, + "step": 240 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.46875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4044.0, + "completions/mean_length": 3087.98046875, + "completions/mean_terminated_length": 3016.2802734375, + "completions/min_length": 2035.0, + "completions/min_terminated_length": 2035.0, + "epoch": 0.05141333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1311665177345276, + "learning_rate": 1e-06, + "loss": 0.0091, + "num_tokens": 165107713.0, + "reward": 2.599454164505005, + "reward_std": 0.16613246500492096, + "rewards/cosine_scaled_reward/mean": 0.7890911102294922, + "rewards/cosine_scaled_reward/std": 0.39621245861053467, + "rewards/repetition_penalty_reward/mean": -0.09198064357042313, + "rewards/repetition_penalty_reward/std": 0.03552812337875366, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.90234375, + "rewards/reward_reference/std": 0.29743078351020813, + "step": 241 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.40625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4096.0, + "completions/mean_length": 3141.2734375, + "completions/mean_terminated_length": 3065.510498046875, + "completions/min_length": 1743.0, + "completions/min_terminated_length": 1743.0, + "epoch": 0.05162666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1170988455414772, + "learning_rate": 1e-06, + "loss": -0.0173, + "num_tokens": 165988723.0, + "reward": 2.692446708679199, + "reward_std": 0.16342981159687042, + "rewards/cosine_scaled_reward/mean": 0.8425164222717285, + "rewards/cosine_scaled_reward/std": 0.3216044306755066, + "rewards/repetition_penalty_reward/mean": -0.09538224339485168, + "rewards/repetition_penalty_reward/std": 0.036083489656448364, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9453125, + "rewards/reward_reference/std": 0.22781464457511902, + "step": 242 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4091.0, + "completions/mean_length": 3263.703125, + "completions/mean_terminated_length": 3127.509033203125, + "completions/min_length": 1848.0, + "completions/min_terminated_length": 1848.0, + "epoch": 0.05184, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14324931800365448, + "learning_rate": 1e-06, + "loss": -0.0426, + "num_tokens": 166838611.0, + "reward": 2.5586345195770264, + "reward_std": 0.2131500393152237, + "rewards/cosine_scaled_reward/mean": 0.7745422124862671, + "rewards/cosine_scaled_reward/std": 0.44403311610221863, + "rewards/repetition_penalty_reward/mean": -0.10028272867202759, + "rewards/repetition_penalty_reward/std": 0.04599667713046074, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.890625, + "rewards/reward_reference/std": 0.31272050738334656, + "step": 243 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.3125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4080.0, + "completions/mean_length": 3275.5390625, + "completions/mean_terminated_length": 3199.14111328125, + "completions/min_length": 1784.0, + "completions/min_terminated_length": 1784.0, + "epoch": 0.05205333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12255389243364334, + "learning_rate": 1e-06, + "loss": -0.0135, + "num_tokens": 167756922.0, + "reward": 2.6074233055114746, + "reward_std": 0.15961477160453796, + "rewards/cosine_scaled_reward/mean": 0.8047130107879639, + "rewards/cosine_scaled_reward/std": 0.4105326533317566, + "rewards/repetition_penalty_reward/mean": -0.0996333584189415, + "rewards/repetition_penalty_reward/std": 0.041832976043224335, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.90234375, + "rewards/reward_reference/std": 0.29743078351020813, + "step": 244 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.09375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4070.0, + "completions/mean_length": 3361.8125, + "completions/mean_terminated_length": 3268.017578125, + "completions/min_length": 2264.0, + "completions/min_terminated_length": 2264.0, + "epoch": 0.05226666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1401897817850113, + "learning_rate": 1e-06, + "loss": -0.0166, + "num_tokens": 168667354.0, + "reward": 2.6775858402252197, + "reward_std": 0.18005535006523132, + "rewards/cosine_scaled_reward/mean": 0.8490869998931885, + "rewards/cosine_scaled_reward/std": 0.3619866967201233, + "rewards/repetition_penalty_reward/mean": -0.1050950288772583, + "rewards/repetition_penalty_reward/std": 0.03700846806168556, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 245 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4075.0, + "completions/mean_length": 3354.1640625, + "completions/mean_terminated_length": 3237.036376953125, + "completions/min_length": 2237.0, + "completions/min_terminated_length": 2237.0, + "epoch": 0.05248, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1567317396402359, + "learning_rate": 1e-06, + "loss": -0.0361, + "num_tokens": 169544251.0, + "reward": 2.568568229675293, + "reward_std": 0.2007070928812027, + "rewards/cosine_scaled_reward/mean": 0.8031135201454163, + "rewards/cosine_scaled_reward/std": 0.4272143840789795, + "rewards/repetition_penalty_reward/mean": -0.12360787391662598, + "rewards/repetition_penalty_reward/std": 0.056523095816373825, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626225590705872, + "rewards/reward_reference/mean": 0.8984375, + "rewards/reward_reference/std": 0.3026638329029083, + "step": 246 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4085.0, + "completions/mean_length": 3360.3984375, + "completions/mean_terminated_length": 3243.8681640625, + "completions/min_length": 1815.0, + "completions/min_terminated_length": 1815.0, + "epoch": 0.052693333333333335, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11087020486593246, + "learning_rate": 1e-06, + "loss": -0.0113, + "num_tokens": 170415658.0, + "reward": 2.5351624488830566, + "reward_std": 0.11044815182685852, + "rewards/cosine_scaled_reward/mean": 0.7818148136138916, + "rewards/cosine_scaled_reward/std": 0.4538746476173401, + "rewards/repetition_penalty_reward/mean": -0.12946489453315735, + "rewards/repetition_penalty_reward/std": 0.04873738810420036, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.8828125, + "rewards/reward_reference/std": 0.3222736418247223, + "step": 247 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.34375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4096.0, + "completions/mean_length": 3427.25, + "completions/mean_terminated_length": 3252.650146484375, + "completions/min_length": 2025.0, + "completions/min_terminated_length": 2025.0, + "epoch": 0.052906666666666664, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15536320209503174, + "learning_rate": 1e-06, + "loss": -0.0183, + "num_tokens": 171231498.0, + "reward": 2.3969178199768066, + "reward_std": 0.2622104287147522, + "rewards/cosine_scaled_reward/mean": 0.712480366230011, + "rewards/cosine_scaled_reward/std": 0.5295530557632446, + "rewards/repetition_penalty_reward/mean": -0.13275012373924255, + "rewards/repetition_penalty_reward/std": 0.042228639125823975, + "rewards/reward_format/mean": 0.981249988079071, + "rewards/reward_format/std": 0.12126781791448593, + "rewards/reward_reference/mean": 0.8359375, + "rewards/reward_reference/std": 0.3710577189922333, + "step": 248 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.75, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4083.0, + "completions/mean_length": 3377.05859375, + "completions/mean_terminated_length": 3244.273193359375, + "completions/min_length": 1885.0, + "completions/min_terminated_length": 1885.0, + "epoch": 0.05312, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13335011899471283, + "learning_rate": 1e-06, + "loss": -0.0273, + "num_tokens": 172091853.0, + "reward": 2.5199947357177734, + "reward_std": 0.16750986874103546, + "rewards/cosine_scaled_reward/mean": 0.7740402817726135, + "rewards/cosine_scaled_reward/std": 0.46254417300224304, + "rewards/repetition_penalty_reward/mean": -0.1329517662525177, + "rewards/repetition_penalty_reward/std": 0.04756597429513931, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.87890625, + "rewards/reward_reference/std": 0.3268752694129944, + "step": 249 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4071.0, + "completions/mean_length": 3280.61328125, + "completions/mean_terminated_length": 3186.118408203125, + "completions/min_length": 2232.0, + "completions/min_terminated_length": 2232.0, + "epoch": 0.05333333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12476290017366409, + "learning_rate": 1e-06, + "loss": -0.0086, + "num_tokens": 172972160.0, + "reward": 2.71635103225708, + "reward_std": 0.13595899939537048, + "rewards/cosine_scaled_reward/mean": 0.8891398906707764, + "rewards/cosine_scaled_reward/std": 0.26229527592658997, + "rewards/repetition_penalty_reward/mean": -0.13138249516487122, + "rewards/repetition_penalty_reward/std": 0.042913768440485, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.96484375, + "rewards/reward_reference/std": 0.18453538417816162, + "step": 250 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.65625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4079.0, + "completions/mean_length": 3309.2578125, + "completions/mean_terminated_length": 3150.431884765625, + "completions/min_length": 1909.0, + "completions/min_terminated_length": 1909.0, + "epoch": 0.053546666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15214867889881134, + "learning_rate": 1e-06, + "loss": -0.0015, + "num_tokens": 173800374.0, + "reward": 2.6647098064422607, + "reward_std": 0.14656221866607666, + "rewards/cosine_scaled_reward/mean": 0.8541313409805298, + "rewards/cosine_scaled_reward/std": 0.33413589000701904, + "rewards/repetition_penalty_reward/mean": -0.13082784414291382, + "rewards/repetition_penalty_reward/std": 0.043466996401548386, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.94140625, + "rewards/reward_reference/std": 0.23532284796237946, + "step": 251 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4065.0, + "completions/mean_length": 3286.8203125, + "completions/mean_terminated_length": 3224.38134765625, + "completions/min_length": 2185.0, + "completions/min_terminated_length": 2185.0, + "epoch": 0.05376, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11460880935192108, + "learning_rate": 1e-06, + "loss": 0.0031, + "num_tokens": 174722612.0, + "reward": 2.7309741973876953, + "reward_std": 0.10704682767391205, + "rewards/cosine_scaled_reward/mean": 0.8857072591781616, + "rewards/cosine_scaled_reward/std": 0.27719277143478394, + "rewards/repetition_penalty_reward/mean": -0.11567074060440063, + "rewards/repetition_penalty_reward/std": 0.030316317453980446, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9609375, + "rewards/reward_reference/std": 0.19412322342395782, + "step": 252 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4085.0, + "completions/mean_length": 3266.65625, + "completions/mean_terminated_length": 3139.581787109375, + "completions/min_length": 2063.0, + "completions/min_terminated_length": 2063.0, + "epoch": 0.05397333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22021500766277313, + "learning_rate": 1e-06, + "loss": -0.0572, + "num_tokens": 175574204.0, + "reward": 2.482029438018799, + "reward_std": 0.24132297933101654, + "rewards/cosine_scaled_reward/mean": 0.7378534078598022, + "rewards/cosine_scaled_reward/std": 0.4845937490463257, + "rewards/repetition_penalty_reward/mean": -0.11910516023635864, + "rewards/repetition_penalty_reward/std": 0.04768124595284462, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.86328125, + "rewards/reward_reference/std": 0.34422317147254944, + "step": 253 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.3125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4095.0, + "completions/mean_length": 3190.765625, + "completions/mean_terminated_length": 3109.50439453125, + "completions/min_length": 2085.0, + "completions/min_terminated_length": 2085.0, + "epoch": 0.05418666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11906640976667404, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 176467160.0, + "reward": 2.697772979736328, + "reward_std": 0.08515664935112, + "rewards/cosine_scaled_reward/mean": 0.8567619323730469, + "rewards/cosine_scaled_reward/std": 0.30950209498405457, + "rewards/repetition_penalty_reward/mean": -0.10820753127336502, + "rewards/repetition_penalty_reward/std": 0.02926632948219776, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.94921875, + "rewards/reward_reference/std": 0.21998079121112823, + "step": 254 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.25, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4084.0, + "completions/mean_length": 3160.6171875, + "completions/mean_terminated_length": 3067.116455078125, + "completions/min_length": 1814.0, + "completions/min_terminated_length": 1814.0, + "epoch": 0.0544, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12647591531276703, + "learning_rate": 1e-06, + "loss": 0.0101, + "num_tokens": 177336651.0, + "reward": 2.4994921684265137, + "reward_std": 0.12857121229171753, + "rewards/cosine_scaled_reward/mean": 0.7419434785842896, + "rewards/cosine_scaled_reward/std": 0.4661872386932373, + "rewards/repetition_penalty_reward/mean": -0.1135449931025505, + "rewards/repetition_penalty_reward/std": 0.03488827869296074, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.87109375, + "rewards/reward_reference/std": 0.33575257658958435, + "step": 255 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.53125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4054.0, + "completions/mean_length": 3018.65234375, + "completions/mean_terminated_length": 2955.219970703125, + "completions/min_length": 2017.0, + "completions/min_terminated_length": 2017.0, + "epoch": 0.05461333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10715894401073456, + "learning_rate": 1e-06, + "loss": -0.0174, + "num_tokens": 178201099.0, + "reward": 2.680346965789795, + "reward_std": 0.10996614396572113, + "rewards/cosine_scaled_reward/mean": 0.8404624462127686, + "rewards/cosine_scaled_reward/std": 0.2958315908908844, + "rewards/repetition_penalty_reward/mean": -0.11011554300785065, + "rewards/repetition_penalty_reward/std": 0.03434314951300621, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.953125, + "rewards/reward_reference/std": 0.21178513765335083, + "step": 256 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.8125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3964.0, + "completions/mean_length": 2943.23828125, + "completions/mean_terminated_length": 2915.572021484375, + "completions/min_length": 1742.0, + "completions/min_terminated_length": 1742.0, + "epoch": 0.05482666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03318123519420624, + "learning_rate": 1e-06, + "loss": -0.0086, + "num_tokens": 179101432.0, + "reward": 2.7122738361358643, + "reward_std": 0.047639843076467514, + "rewards/cosine_scaled_reward/mean": 0.8511331081390381, + "rewards/cosine_scaled_reward/std": 0.25263121724128723, + "rewards/repetition_penalty_reward/mean": -0.10370296239852905, + "rewards/repetition_penalty_reward/std": 0.032866187393665314, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.96484375, + "rewards/reward_reference/std": 0.18453538417816162, + "step": 257 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.59375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3934.0, + "completions/mean_length": 2962.69921875, + "completions/mean_terminated_length": 2906.226318359375, + "completions/min_length": 1761.0, + "completions/min_terminated_length": 1761.0, + "epoch": 0.05504, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14833249151706696, + "learning_rate": 1e-06, + "loss": -0.0222, + "num_tokens": 179978589.0, + "reward": 2.6928648948669434, + "reward_std": 0.11383727192878723, + "rewards/cosine_scaled_reward/mean": 0.8450006246566772, + "rewards/cosine_scaled_reward/std": 0.2710968852043152, + "rewards/repetition_penalty_reward/mean": -0.09744831174612045, + "rewards/repetition_penalty_reward/std": 0.04459652677178383, + "rewards/reward_format/mean": 0.984375, + "rewards/reward_format/std": 0.11092304438352585, + "rewards/reward_reference/mean": 0.9609375, + "rewards/reward_reference/std": 0.19412322342395782, + "step": 258 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.5, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4079.0, + "completions/mean_length": 2981.18359375, + "completions/mean_terminated_length": 2909.14599609375, + "completions/min_length": 2071.0, + "completions/min_terminated_length": 2071.0, + "epoch": 0.055253333333333335, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18252234160900116, + "learning_rate": 1e-06, + "loss": -0.0232, + "num_tokens": 180840680.0, + "reward": 2.587800979614258, + "reward_std": 0.24309676885604858, + "rewards/cosine_scaled_reward/mean": 0.7760927677154541, + "rewards/cosine_scaled_reward/std": 0.3907049298286438, + "rewards/repetition_penalty_reward/mean": -0.098448246717453, + "rewards/repetition_penalty_reward/std": 0.03178011253476143, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.91015625, + "rewards/reward_reference/std": 0.2865179479122162, + "step": 259 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.6875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4031.0, + "completions/mean_length": 2906.25, + "completions/mean_terminated_length": 2859.044677734375, + "completions/min_length": 1778.0, + "completions/min_terminated_length": 1778.0, + "epoch": 0.055466666666666664, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12263341248035431, + "learning_rate": 1e-06, + "loss": -0.0353, + "num_tokens": 181704285.0, + "reward": 2.6258513927459717, + "reward_std": 0.14923956990242004, + "rewards/cosine_scaled_reward/mean": 0.7974638938903809, + "rewards/cosine_scaled_reward/std": 0.343732088804245, + "rewards/repetition_penalty_reward/mean": -0.09817510098218918, + "rewards/repetition_penalty_reward/std": 0.032411616295576096, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.9296875, + "rewards/reward_reference/std": 0.2561737895011902, + "step": 260 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.84375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4029.0, + "completions/mean_length": 3021.8515625, + "completions/mean_terminated_length": 3000.454345703125, + "completions/min_length": 2023.0, + "completions/min_terminated_length": 2023.0, + "epoch": 0.05568, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1144053116440773, + "learning_rate": 1e-06, + "loss": -0.0253, + "num_tokens": 182623943.0, + "reward": 2.600637912750244, + "reward_std": 0.13089832663536072, + "rewards/cosine_scaled_reward/mean": 0.793508768081665, + "rewards/cosine_scaled_reward/std": 0.38059523701667786, + "rewards/repetition_penalty_reward/mean": -0.10380822420120239, + "rewards/repetition_penalty_reward/std": 0.03530118241906166, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.9140625, + "rewards/reward_reference/std": 0.28082075715065, + "step": 261 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.75, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4036.0, + "completions/mean_length": 3025.87890625, + "completions/mean_terminated_length": 2991.35888671875, + "completions/min_length": 1986.0, + "completions/min_terminated_length": 1986.0, + "epoch": 0.05589333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12886103987693787, + "learning_rate": 1e-06, + "loss": -0.0184, + "num_tokens": 183529604.0, + "reward": 2.684856414794922, + "reward_std": 0.12615951895713806, + "rewards/cosine_scaled_reward/mean": 0.8371844291687012, + "rewards/cosine_scaled_reward/std": 0.30617836117744446, + "rewards/repetition_penalty_reward/mean": -0.10154666006565094, + "rewards/repetition_penalty_reward/std": 0.04301191866397858, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.94921875, + "rewards/reward_reference/std": 0.21998079121112823, + "step": 262 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.46875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3973.0, + "completions/mean_length": 3003.31640625, + "completions/mean_terminated_length": 2925.593994140625, + "completions/min_length": 1879.0, + "completions/min_terminated_length": 1879.0, + "epoch": 0.056106666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13606540858745575, + "learning_rate": 1e-06, + "loss": -0.0132, + "num_tokens": 184387913.0, + "reward": 2.5859298706054688, + "reward_std": 0.16840368509292603, + "rewards/cosine_scaled_reward/mean": 0.7858878374099731, + "rewards/cosine_scaled_reward/std": 0.38099440932273865, + "rewards/repetition_penalty_reward/mean": -0.10698917508125305, + "rewards/repetition_penalty_reward/std": 0.041619643568992615, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.91015625, + "rewards/reward_reference/std": 0.2865179479122162, + "step": 263 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.40625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4015.0, + "completions/mean_length": 3049.58203125, + "completions/mean_terminated_length": 2965.69189453125, + "completions/min_length": 1703.0, + "completions/min_terminated_length": 1703.0, + "epoch": 0.05632, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0597972609102726, + "learning_rate": 1e-06, + "loss": 0.0099, + "num_tokens": 185252946.0, + "reward": 2.6976442337036133, + "reward_std": 0.07548146694898605, + "rewards/cosine_scaled_reward/mean": 0.8483776450157166, + "rewards/cosine_scaled_reward/std": 0.28662973642349243, + "rewards/repetition_penalty_reward/mean": -0.10463981330394745, + "rewards/repetition_penalty_reward/std": 0.04801265150308609, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.95703125, + "rewards/reward_reference/std": 0.20318391919136047, + "step": 264 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.25, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4089.0, + "completions/mean_length": 3129.72265625, + "completions/mean_terminated_length": 3029.762939453125, + "completions/min_length": 1824.0, + "completions/min_terminated_length": 1824.0, + "epoch": 0.05653333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1184777244925499, + "learning_rate": 1e-06, + "loss": -0.039, + "num_tokens": 186116847.0, + "reward": 2.613929271697998, + "reward_std": 0.16660256683826447, + "rewards/cosine_scaled_reward/mean": 0.8040981292724609, + "rewards/cosine_scaled_reward/std": 0.38224369287490845, + "rewards/repetition_penalty_reward/mean": -0.09954366832971573, + "rewards/repetition_penalty_reward/std": 0.03987511247396469, + "rewards/reward_format/mean": 0.987500011920929, + "rewards/reward_format/std": 0.0994100272655487, + "rewards/reward_reference/mean": 0.921875, + "rewards/reward_reference/std": 0.26889389753341675, + "step": 265 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.21875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4030.0, + "completions/mean_length": 3116.640625, + "completions/mean_terminated_length": 3010.6494140625, + "completions/min_length": 2146.0, + "completions/min_terminated_length": 2146.0, + "epoch": 0.05674666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17567136883735657, + "learning_rate": 1e-06, + "loss": -0.0435, + "num_tokens": 186971919.0, + "reward": 2.539384365081787, + "reward_std": 0.2967863976955414, + "rewards/cosine_scaled_reward/mean": 0.7633931636810303, + "rewards/cosine_scaled_reward/std": 0.43265610933303833, + "rewards/repetition_penalty_reward/mean": -0.09900867938995361, + "rewards/repetition_penalty_reward/std": 0.03749703988432884, + "rewards/reward_format/mean": 0.984375, + "rewards/reward_format/std": 0.11092304438352585, + "rewards/reward_reference/mean": 0.890625, + "rewards/reward_reference/std": 0.31272050738334656, + "step": 266 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.8125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4048.0, + "completions/mean_length": 3276.57421875, + "completions/mean_terminated_length": 3133.73828125, + "completions/min_length": 1998.0, + "completions/min_terminated_length": 1998.0, + "epoch": 0.05696, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18023541569709778, + "learning_rate": 1e-06, + "loss": -0.0461, + "num_tokens": 187819014.0, + "reward": 2.4331483840942383, + "reward_std": 0.21694278717041016, + "rewards/cosine_scaled_reward/mean": 0.7236498594284058, + "rewards/cosine_scaled_reward/std": 0.5014133453369141, + "rewards/repetition_penalty_reward/mean": -0.11081399023532867, + "rewards/repetition_penalty_reward/std": 0.047140881419181824, + "rewards/reward_format/mean": 0.96875, + "rewards/reward_format/std": 0.15529859066009521, + "rewards/reward_reference/mean": 0.8515625, + "rewards/reward_reference/std": 0.3562295734882355, + "step": 267 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.4375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4082.0, + "completions/mean_length": 3139.97265625, + "completions/mean_terminated_length": 3067.668212890625, + "completions/min_length": 2059.0, + "completions/min_terminated_length": 2059.0, + "epoch": 0.05717333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.126700758934021, + "learning_rate": 1e-06, + "loss": -0.0079, + "num_tokens": 188712451.0, + "reward": 2.656765937805176, + "reward_std": 0.14348021149635315, + "rewards/cosine_scaled_reward/mean": 0.8294387459754944, + "rewards/cosine_scaled_reward/std": 0.34767425060272217, + "rewards/repetition_penalty_reward/mean": -0.10001654922962189, + "rewards/repetition_penalty_reward/std": 0.041904617100954056, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 268 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4094.0, + "completions/mean_length": 3171.8203125, + "completions/mean_terminated_length": 3126.36865234375, + "completions/min_length": 1776.0, + "completions/min_terminated_length": 1776.0, + "epoch": 0.05738666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06981715559959412, + "learning_rate": 1e-06, + "loss": -0.0295, + "num_tokens": 189632833.0, + "reward": 2.670246124267578, + "reward_std": 0.0859508365392685, + "rewards/cosine_scaled_reward/mean": 0.8335663080215454, + "rewards/cosine_scaled_reward/std": 0.34999513626098633, + "rewards/repetition_penalty_reward/mean": -0.09691374003887177, + "rewards/repetition_penalty_reward/std": 0.034196559339761734, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 269 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4049.0, + "completions/mean_length": 3266.21875, + "completions/mean_terminated_length": 3164.31591796875, + "completions/min_length": 1990.0, + "completions/min_terminated_length": 1990.0, + "epoch": 0.0576, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09501556307077408, + "learning_rate": 1e-06, + "loss": -0.0297, + "num_tokens": 190514865.0, + "reward": 2.5719354152679443, + "reward_std": 0.17583732306957245, + "rewards/cosine_scaled_reward/mean": 0.7868057489395142, + "rewards/cosine_scaled_reward/std": 0.43395617604255676, + "rewards/repetition_penalty_reward/mean": -0.10315153002738953, + "rewards/repetition_penalty_reward/std": 0.03619806841015816, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.89453125, + "rewards/reward_reference/std": 0.3077581524848938, + "step": 270 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.40625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4081.0, + "completions/mean_length": 3167.8203125, + "completions/mean_terminated_length": 3093.4091796875, + "completions/min_length": 1871.0, + "completions/min_terminated_length": 1871.0, + "epoch": 0.057813333333333335, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09603821486234665, + "learning_rate": 1e-06, + "loss": -0.0295, + "num_tokens": 191411267.0, + "reward": 2.5477559566497803, + "reward_std": 0.14840057492256165, + "rewards/cosine_scaled_reward/mean": 0.7638838291168213, + "rewards/cosine_scaled_reward/std": 0.4406859874725342, + "rewards/repetition_penalty_reward/mean": -0.09425283968448639, + "rewards/repetition_penalty_reward/std": 0.03563051298260689, + "rewards/reward_format/mean": 0.987500011920929, + "rewards/reward_format/std": 0.0994100272655487, + "rewards/reward_reference/mean": 0.890625, + "rewards/reward_reference/std": 0.31272050738334656, + "step": 271 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4051.0, + "completions/mean_length": 3265.11328125, + "completions/mean_terminated_length": 3129.14990234375, + "completions/min_length": 1960.0, + "completions/min_terminated_length": 1960.0, + "epoch": 0.058026666666666664, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16115225851535797, + "learning_rate": 1e-06, + "loss": -0.032, + "num_tokens": 192265076.0, + "reward": 2.475174903869629, + "reward_std": 0.226247176527977, + "rewards/cosine_scaled_reward/mean": 0.7361939549446106, + "rewards/cosine_scaled_reward/std": 0.4867141842842102, + "rewards/repetition_penalty_reward/mean": -0.10398783534765244, + "rewards/repetition_penalty_reward/std": 0.04068681597709656, + "rewards/reward_format/mean": 0.987500011920929, + "rewards/reward_format/std": 0.0994100272655487, + "rewards/reward_reference/mean": 0.85546875, + "rewards/reward_reference/std": 0.35231640934944153, + "step": 272 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.1875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4043.0, + "completions/mean_length": 3192.22265625, + "completions/mean_terminated_length": 3090.056396484375, + "completions/min_length": 2059.0, + "completions/min_terminated_length": 2059.0, + "epoch": 0.05824, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12524887919425964, + "learning_rate": 1e-06, + "loss": -0.0274, + "num_tokens": 193135969.0, + "reward": 2.5385026931762695, + "reward_std": 0.20201915502548218, + "rewards/cosine_scaled_reward/mean": 0.7618891000747681, + "rewards/cosine_scaled_reward/std": 0.45050087571144104, + "rewards/repetition_penalty_reward/mean": -0.09994899481534958, + "rewards/repetition_penalty_reward/std": 0.033864665776491165, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.8828125, + "rewards/reward_reference/std": 0.3222736418247223, + "step": 273 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.4375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4089.0, + "completions/mean_length": 3212.69140625, + "completions/mean_terminated_length": 3145.88671875, + "completions/min_length": 2237.0, + "completions/min_terminated_length": 2237.0, + "epoch": 0.058453333333333336, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12767969071865082, + "learning_rate": 1e-06, + "loss": -0.0307, + "num_tokens": 194048178.0, + "reward": 2.5636038780212402, + "reward_std": 0.1429387778043747, + "rewards/cosine_scaled_reward/mean": 0.7813065052032471, + "rewards/cosine_scaled_reward/std": 0.4319900572299957, + "rewards/repetition_penalty_reward/mean": -0.09973399341106415, + "rewards/repetition_penalty_reward/std": 0.03400009125471115, + "rewards/reward_format/mean": 0.9874999523162842, + "rewards/reward_format/std": 0.0994100272655487, + "rewards/reward_reference/mean": 0.89453125, + "rewards/reward_reference/std": 0.3077581524848938, + "step": 274 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4085.0, + "completions/mean_length": 3136.1640625, + "completions/mean_terminated_length": 3054.822021484375, + "completions/min_length": 1454.0, + "completions/min_terminated_length": 1454.0, + "epoch": 0.058666666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11141457408666611, + "learning_rate": 1e-06, + "loss": -0.0053, + "num_tokens": 194931672.0, + "reward": 2.6866955757141113, + "reward_std": 0.145915687084198, + "rewards/cosine_scaled_reward/mean": 0.8439056873321533, + "rewards/cosine_scaled_reward/std": 0.31848499178886414, + "rewards/repetition_penalty_reward/mean": -0.09627248346805573, + "rewards/repetition_penalty_reward/std": 0.03544648736715317, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.9453125, + "rewards/reward_reference/std": 0.22781464457511902, + "step": 275 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.4375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4086.0, + "completions/mean_length": 3193.03125, + "completions/mean_terminated_length": 3124.73974609375, + "completions/min_length": 2225.0, + "completions/min_terminated_length": 2225.0, + "epoch": 0.05888, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08281750977039337, + "learning_rate": 1e-06, + "loss": -0.0113, + "num_tokens": 195837212.0, + "reward": 2.665475845336914, + "reward_std": 0.1273091584444046, + "rewards/cosine_scaled_reward/mean": 0.8375442624092102, + "rewards/cosine_scaled_reward/std": 0.34636956453323364, + "rewards/repetition_penalty_reward/mean": -0.09316206723451614, + "rewards/repetition_penalty_reward/std": 0.03389699384570122, + "rewards/reward_format/mean": 0.987500011920929, + "rewards/reward_format/std": 0.0994100272655487, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 276 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.40625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4046.0, + "completions/mean_length": 3165.02734375, + "completions/mean_terminated_length": 3090.392333984375, + "completions/min_length": 2009.0, + "completions/min_terminated_length": 2009.0, + "epoch": 0.05909333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08791069686412811, + "learning_rate": 1e-06, + "loss": -0.0169, + "num_tokens": 196731383.0, + "reward": 2.6439356803894043, + "reward_std": 0.15615218877792358, + "rewards/cosine_scaled_reward/mean": 0.8256608843803406, + "rewards/cosine_scaled_reward/std": 0.3610367774963379, + "rewards/repetition_penalty_reward/mean": -0.09813161194324493, + "rewards/repetition_penalty_reward/std": 0.04111843183636665, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626225590705872, + "rewards/reward_reference/mean": 0.92578125, + "rewards/reward_reference/std": 0.2626400291919708, + "step": 277 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.3125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4079.0, + "completions/mean_length": 3193.63671875, + "completions/mean_terminated_length": 3108.79931640625, + "completions/min_length": 2110.0, + "completions/min_terminated_length": 2110.0, + "epoch": 0.05930666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14001445472240448, + "learning_rate": 1e-06, + "loss": -0.0237, + "num_tokens": 197620078.0, + "reward": 2.425388813018799, + "reward_std": 0.21485967934131622, + "rewards/cosine_scaled_reward/mean": 0.7161268591880798, + "rewards/cosine_scaled_reward/std": 0.4970853924751282, + "rewards/repetition_penalty_reward/mean": -0.11026932299137115, + "rewards/repetition_penalty_reward/std": 0.04518568143248558, + "rewards/reward_format/mean": 0.9718749523162842, + "rewards/reward_format/std": 0.1476283222436905, + "rewards/reward_reference/mean": 0.84765625, + "rewards/reward_reference/std": 0.3600577116012573, + "step": 278 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.21875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4065.0, + "completions/mean_length": 3300.109375, + "completions/mean_terminated_length": 3213.97412109375, + "completions/min_length": 2161.0, + "completions/min_terminated_length": 2161.0, + "epoch": 0.05952, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14491397142410278, + "learning_rate": 1e-06, + "loss": -0.0148, + "num_tokens": 198519118.0, + "reward": 2.5698421001434326, + "reward_std": 0.20768402516841888, + "rewards/cosine_scaled_reward/mean": 0.7817451357841492, + "rewards/cosine_scaled_reward/std": 0.44465169310569763, + "rewards/repetition_penalty_reward/mean": -0.09940297901630402, + "rewards/repetition_penalty_reward/std": 0.037566814571619034, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.890625, + "rewards/reward_reference/std": 0.31272050738334656, + "step": 279 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.34375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4078.0, + "completions/mean_length": 3297.74609375, + "completions/mean_terminated_length": 3226.41259765625, + "completions/min_length": 1910.0, + "completions/min_terminated_length": 1910.0, + "epoch": 0.05973333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13260126113891602, + "learning_rate": 1e-06, + "loss": -0.0481, + "num_tokens": 199439145.0, + "reward": 2.5585880279541016, + "reward_std": 0.18200063705444336, + "rewards/cosine_scaled_reward/mean": 0.7834160327911377, + "rewards/cosine_scaled_reward/std": 0.4448276460170746, + "rewards/repetition_penalty_reward/mean": -0.10217203199863434, + "rewards/repetition_penalty_reward/std": 0.036684300750494, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626226335763931, + "rewards/reward_reference/mean": 0.88671875, + "rewards/reward_reference/std": 0.31755712628364563, + "step": 280 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.0625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4095.0, + "completions/mean_length": 3296.578125, + "completions/mean_terminated_length": 3190.62841796875, + "completions/min_length": 2143.0, + "completions/min_terminated_length": 2143.0, + "epoch": 0.05994666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10680922865867615, + "learning_rate": 1e-06, + "loss": -0.0296, + "num_tokens": 200329419.0, + "reward": 2.635676383972168, + "reward_std": 0.1627238392829895, + "rewards/cosine_scaled_reward/mean": 0.8256530165672302, + "rewards/cosine_scaled_reward/std": 0.3862798810005188, + "rewards/repetition_penalty_reward/mean": -0.09935159981250763, + "rewards/repetition_penalty_reward/std": 0.04032512754201889, + "rewards/reward_format/mean": 0.987500011920929, + "rewards/reward_format/std": 0.0994100272655487, + "rewards/reward_reference/mean": 0.921875, + "rewards/reward_reference/std": 0.26889389753341675, + "step": 281 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.21875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4041.0, + "completions/mean_length": 3258.14453125, + "completions/mean_terminated_length": 3167.467529296875, + "completions/min_length": 1958.0, + "completions/min_terminated_length": 1958.0, + "epoch": 0.06016, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14413586258888245, + "learning_rate": 1e-06, + "loss": -0.0382, + "num_tokens": 201221040.0, + "reward": 2.547389030456543, + "reward_std": 0.22861404716968536, + "rewards/cosine_scaled_reward/mean": 0.7698594331741333, + "rewards/cosine_scaled_reward/std": 0.45155566930770874, + "rewards/repetition_penalty_reward/mean": -0.09981396794319153, + "rewards/repetition_penalty_reward/std": 0.03832467272877693, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626226335763931, + "rewards/reward_reference/mean": 0.88671875, + "rewards/reward_reference/std": 0.31755712628364563, + "step": 282 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.8125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4092.0, + "completions/mean_length": 3319.23046875, + "completions/mean_terminated_length": 3183.830078125, + "completions/min_length": 2119.0, + "completions/min_terminated_length": 2119.0, + "epoch": 0.060373333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11200573295354843, + "learning_rate": 1e-06, + "loss": -0.0452, + "num_tokens": 202074631.0, + "reward": 2.628976345062256, + "reward_std": 0.1832224726676941, + "rewards/cosine_scaled_reward/mean": 0.8138462901115417, + "rewards/cosine_scaled_reward/std": 0.4038219749927521, + "rewards/repetition_penalty_reward/mean": -0.09893252700567245, + "rewards/repetition_penalty_reward/std": 0.034034404903650284, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9140625, + "rewards/reward_reference/std": 0.28082075715065, + "step": 283 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.09375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4091.0, + "completions/mean_length": 3257.13671875, + "completions/mean_terminated_length": 3149.968994140625, + "completions/min_length": 2014.0, + "completions/min_terminated_length": 2014.0, + "epoch": 0.060586666666666664, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09669145196676254, + "learning_rate": 1e-06, + "loss": -0.0382, + "num_tokens": 202950866.0, + "reward": 2.6120972633361816, + "reward_std": 0.13966014981269836, + "rewards/cosine_scaled_reward/mean": 0.819635808467865, + "rewards/cosine_scaled_reward/std": 0.3874451816082001, + "rewards/repetition_penalty_reward/mean": -0.10988226532936096, + "rewards/repetition_penalty_reward/std": 0.0434051938354969, + "rewards/reward_format/mean": 0.984375, + "rewards/reward_format/std": 0.11092304438352585, + "rewards/reward_reference/mean": 0.91796875, + "rewards/reward_reference/std": 0.2749498784542084, + "step": 284 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4027.0, + "completions/mean_length": 3347.43359375, + "completions/mean_terminated_length": 3237.659423828125, + "completions/min_length": 2152.0, + "completions/min_terminated_length": 2152.0, + "epoch": 0.0608, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1725301891565323, + "learning_rate": 1e-06, + "loss": -0.0637, + "num_tokens": 203839344.0, + "reward": 2.493455410003662, + "reward_std": 0.24681459367275238, + "rewards/cosine_scaled_reward/mean": 0.7412490248680115, + "rewards/cosine_scaled_reward/std": 0.4978993535041809, + "rewards/repetition_penalty_reward/mean": -0.10091851651668549, + "rewards/repetition_penalty_reward/std": 0.03664093464612961, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.859375, + "rewards/reward_reference/std": 0.3483152687549591, + "step": 285 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4092.0, + "completions/mean_length": 3269.0703125, + "completions/mean_terminated_length": 3146.69970703125, + "completions/min_length": 2100.0, + "completions/min_terminated_length": 2100.0, + "epoch": 0.061013333333333336, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17975397408008575, + "learning_rate": 1e-06, + "loss": -0.0196, + "num_tokens": 204701962.0, + "reward": 2.495086193084717, + "reward_std": 0.22708861529827118, + "rewards/cosine_scaled_reward/mean": 0.7405046820640564, + "rewards/cosine_scaled_reward/std": 0.48294419050216675, + "rewards/repetition_penalty_reward/mean": -0.1001059040427208, + "rewards/repetition_penalty_reward/std": 0.0333668552339077, + "rewards/reward_format/mean": 0.987500011920929, + "rewards/reward_format/std": 0.0994100272655487, + "rewards/reward_reference/mean": 0.8671875, + "rewards/reward_reference/std": 0.3400367796421051, + "step": 286 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4096.0, + "completions/mean_length": 3339.1640625, + "completions/mean_terminated_length": 3215.318115234375, + "completions/min_length": 2036.0, + "completions/min_terminated_length": 2036.0, + "epoch": 0.061226666666666665, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10416944324970245, + "learning_rate": 1e-06, + "loss": 0.0069, + "num_tokens": 205566048.0, + "reward": 2.5090126991271973, + "reward_std": 0.16860689222812653, + "rewards/cosine_scaled_reward/mean": 0.7532828450202942, + "rewards/cosine_scaled_reward/std": 0.4829765260219574, + "rewards/repetition_penalty_reward/mean": -0.10833275318145752, + "rewards/repetition_penalty_reward/std": 0.04052012786269188, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.8671875, + "rewards/reward_reference/std": 0.3400367796421051, + "step": 287 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.71875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4059.0, + "completions/mean_length": 3386.76171875, + "completions/mean_terminated_length": 3251.51171875, + "completions/min_length": 2190.0, + "completions/min_terminated_length": 2190.0, + "epoch": 0.06144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12626913189888, + "learning_rate": 1e-06, + "loss": -0.0333, + "num_tokens": 206423219.0, + "reward": 2.553403377532959, + "reward_std": 0.2632274925708771, + "rewards/cosine_scaled_reward/mean": 0.7919354438781738, + "rewards/cosine_scaled_reward/std": 0.44670379161834717, + "rewards/repetition_penalty_reward/mean": -0.10728215426206589, + "rewards/repetition_penalty_reward/std": 0.03807291015982628, + "rewards/reward_format/mean": 0.9781249761581421, + "rewards/reward_format/std": 0.13072198629379272, + "rewards/reward_reference/mean": 0.890625, + "rewards/reward_reference/std": 0.31272050738334656, + "step": 288 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.59375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4083.0, + "completions/mean_length": 3415.59765625, + "completions/mean_terminated_length": 3270.48828125, + "completions/min_length": 2249.0, + "completions/min_terminated_length": 2249.0, + "epoch": 0.06165333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14491280913352966, + "learning_rate": 1e-06, + "loss": -0.069, + "num_tokens": 207272948.0, + "reward": 2.448690414428711, + "reward_std": 0.23005658388137817, + "rewards/cosine_scaled_reward/mean": 0.729130208492279, + "rewards/cosine_scaled_reward/std": 0.5144220590591431, + "rewards/repetition_penalty_reward/mean": -0.1124710738658905, + "rewards/repetition_penalty_reward/std": 0.03470207005739212, + "rewards/reward_format/mean": 0.984375, + "rewards/reward_format/std": 0.11092304438352585, + "rewards/reward_reference/mean": 0.84765625, + "rewards/reward_reference/std": 0.3600577116012573, + "step": 289 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4081.0, + "completions/mean_length": 3416.36328125, + "completions/mean_terminated_length": 3308.728759765625, + "completions/min_length": 1827.0, + "completions/min_terminated_length": 1827.0, + "epoch": 0.06186666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16962240636348724, + "learning_rate": 1e-06, + "loss": -0.0594, + "num_tokens": 208159941.0, + "reward": 2.477572202682495, + "reward_std": 0.23482105135917664, + "rewards/cosine_scaled_reward/mean": 0.7438188791275024, + "rewards/cosine_scaled_reward/std": 0.5046337842941284, + "rewards/repetition_penalty_reward/mean": -0.11546549201011658, + "rewards/repetition_penalty_reward/std": 0.041258033365011215, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.85546875, + "rewards/reward_reference/std": 0.35231640934944153, + "step": 290 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4066.0, + "completions/mean_length": 3439.75, + "completions/mean_terminated_length": 3342.636962890625, + "completions/min_length": 2159.0, + "completions/min_terminated_length": 2159.0, + "epoch": 0.06208, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1446293741464615, + "learning_rate": 1e-06, + "loss": -0.0473, + "num_tokens": 209067349.0, + "reward": 2.663702964782715, + "reward_std": 0.18608683347702026, + "rewards/cosine_scaled_reward/mean": 0.8517959117889404, + "rewards/cosine_scaled_reward/std": 0.37194642424583435, + "rewards/repetition_penalty_reward/mean": -0.10840542614459991, + "rewards/repetition_penalty_reward/std": 0.0434514544904232, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626226335763931, + "rewards/reward_reference/mean": 0.9296875, + "rewards/reward_reference/std": 0.2561737895011902, + "step": 291 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4086.0, + "completions/mean_length": 3305.83984375, + "completions/mean_terminated_length": 3188.910400390625, + "completions/min_length": 1947.0, + "completions/min_terminated_length": 1947.0, + "epoch": 0.06229333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11810626089572906, + "learning_rate": 1e-06, + "loss": -0.0112, + "num_tokens": 209936872.0, + "reward": 2.6013426780700684, + "reward_std": 0.17282244563102722, + "rewards/cosine_scaled_reward/mean": 0.8206616640090942, + "rewards/cosine_scaled_reward/std": 0.39270922541618347, + "rewards/repetition_penalty_reward/mean": -0.11541298031806946, + "rewards/repetition_penalty_reward/std": 0.043614938855171204, + "rewards/reward_format/mean": 0.9781249761581421, + "rewards/reward_format/std": 0.13072198629379272, + "rewards/reward_reference/mean": 0.91796875, + "rewards/reward_reference/std": 0.2749498784542084, + "step": 292 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.8125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4083.0, + "completions/mean_length": 3438.2109375, + "completions/mean_terminated_length": 3323.55029296875, + "completions/min_length": 2268.0, + "completions/min_terminated_length": 2268.0, + "epoch": 0.06250666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0373576357960701, + "learning_rate": 1e-06, + "loss": -0.0087, + "num_tokens": 210821630.0, + "reward": 2.6414241790771484, + "reward_std": 0.13541769981384277, + "rewards/cosine_scaled_reward/mean": 0.8415735960006714, + "rewards/cosine_scaled_reward/std": 0.39156049489974976, + "rewards/repetition_penalty_reward/mean": -0.11577431857585907, + "rewards/repetition_penalty_reward/std": 0.03710121288895607, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.921875, + "rewards/reward_reference/std": 0.26889389753341675, + "step": 293 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4088.0, + "completions/mean_length": 3379.16796875, + "completions/mean_terminated_length": 3265.642578125, + "completions/min_length": 2079.0, + "completions/min_terminated_length": 2079.0, + "epoch": 0.06272, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20948846638202667, + "learning_rate": 1e-06, + "loss": -0.0318, + "num_tokens": 211710845.0, + "reward": 2.5950610637664795, + "reward_std": 0.1684325486421585, + "rewards/cosine_scaled_reward/mean": 0.8164889216423035, + "rewards/cosine_scaled_reward/std": 0.4118175208568573, + "rewards/repetition_penalty_reward/mean": -0.12220916152000427, + "rewards/repetition_penalty_reward/std": 0.04563181474804878, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626225590705872, + "rewards/reward_reference/mean": 0.91015625, + "rewards/reward_reference/std": 0.2865179479122162, + "step": 294 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4086.0, + "completions/mean_length": 3329.10546875, + "completions/mean_terminated_length": 3207.651611328125, + "completions/min_length": 2152.0, + "completions/min_terminated_length": 2152.0, + "epoch": 0.06293333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13313370943069458, + "learning_rate": 1e-06, + "loss": -0.0131, + "num_tokens": 212586592.0, + "reward": 2.592970371246338, + "reward_std": 0.2044907510280609, + "rewards/cosine_scaled_reward/mean": 0.8135062456130981, + "rewards/cosine_scaled_reward/std": 0.4072699248790741, + "rewards/repetition_penalty_reward/mean": -0.11819228529930115, + "rewards/repetition_penalty_reward/std": 0.03720817714929581, + "rewards/reward_format/mean": 0.987500011920929, + "rewards/reward_format/std": 0.0994100272655487, + "rewards/reward_reference/mean": 0.91015625, + "rewards/reward_reference/std": 0.2865179479122162, + "step": 295 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.1875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4094.0, + "completions/mean_length": 3270.796875, + "completions/mean_terminated_length": 3177.512939453125, + "completions/min_length": 2274.0, + "completions/min_terminated_length": 2274.0, + "epoch": 0.06314666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1521005481481552, + "learning_rate": 1e-06, + "loss": -0.0386, + "num_tokens": 213475716.0, + "reward": 2.567768096923828, + "reward_std": 0.22235240042209625, + "rewards/cosine_scaled_reward/mean": 0.8015576601028442, + "rewards/cosine_scaled_reward/std": 0.41258522868156433, + "rewards/repetition_penalty_reward/mean": -0.13300836086273193, + "rewards/repetition_penalty_reward/std": 0.03633992001414299, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.90234375, + "rewards/reward_reference/std": 0.29743078351020813, + "step": 296 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.5625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4094.0, + "completions/mean_length": 3497.609375, + "completions/mean_terminated_length": 3366.533447265625, + "completions/min_length": 2249.0, + "completions/min_terminated_length": 2249.0, + "epoch": 0.06336, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14543825387954712, + "learning_rate": 1e-06, + "loss": -0.0242, + "num_tokens": 214346760.0, + "reward": 2.4732470512390137, + "reward_std": 0.29390355944633484, + "rewards/cosine_scaled_reward/mean": 0.7561601400375366, + "rewards/cosine_scaled_reward/std": 0.5014687776565552, + "rewards/repetition_penalty_reward/mean": -0.1297881156206131, + "rewards/repetition_penalty_reward/std": 0.04418834671378136, + "rewards/reward_format/mean": 0.9874999523162842, + "rewards/reward_format/std": 0.0994100272655487, + "rewards/reward_reference/mean": 0.859375, + "rewards/reward_reference/std": 0.3483152687549591, + "step": 297 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.4375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4079.0, + "completions/mean_length": 3238.6484375, + "completions/mean_terminated_length": 3173.806884765625, + "completions/min_length": 2302.0, + "completions/min_terminated_length": 2302.0, + "epoch": 0.06357333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09670107811689377, + "learning_rate": 1e-06, + "loss": -0.0168, + "num_tokens": 215257686.0, + "reward": 2.6410131454467773, + "reward_std": 0.12483850121498108, + "rewards/cosine_scaled_reward/mean": 0.8424481749534607, + "rewards/cosine_scaled_reward/std": 0.34818026423454285, + "rewards/repetition_penalty_reward/mean": -0.13346639275550842, + "rewards/repetition_penalty_reward/std": 0.04153239354491234, + "rewards/reward_format/mean": 0.9984375238418579, + "rewards/reward_format/std": 0.02500000037252903, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 298 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.15625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4068.0, + "completions/mean_length": 3258.67578125, + "completions/mean_terminated_length": 3159.951904296875, + "completions/min_length": 2020.0, + "completions/min_terminated_length": 2020.0, + "epoch": 0.06378666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11681879311800003, + "learning_rate": 1e-06, + "loss": -0.0334, + "num_tokens": 216144543.0, + "reward": 2.4384727478027344, + "reward_std": 0.16123497486114502, + "rewards/cosine_scaled_reward/mean": 0.7305750846862793, + "rewards/cosine_scaled_reward/std": 0.4939228594303131, + "rewards/repetition_penalty_reward/mean": -0.13507118821144104, + "rewards/repetition_penalty_reward/std": 0.048798564821481705, + "rewards/reward_format/mean": 0.987500011920929, + "rewards/reward_format/std": 0.0994100272655487, + "rewards/reward_reference/mean": 0.85546875, + "rewards/reward_reference/std": 0.35231640934944153, + "step": 299 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4086.0, + "completions/mean_length": 3418.9609375, + "completions/mean_terminated_length": 3335.81591796875, + "completions/min_length": 2241.0, + "completions/min_terminated_length": 2241.0, + "epoch": 0.064, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1509273201227188, + "learning_rate": 1e-06, + "loss": -0.0569, + "num_tokens": 217073285.0, + "reward": 2.488239049911499, + "reward_std": 0.19046112895011902, + "rewards/cosine_scaled_reward/mean": 0.7709412574768066, + "rewards/cosine_scaled_reward/std": 0.477975994348526, + "rewards/repetition_penalty_reward/mean": -0.13895225524902344, + "rewards/repetition_penalty_reward/std": 0.041755057871341705, + "rewards/reward_format/mean": 0.9812500476837158, + "rewards/reward_format/std": 0.12126781791448593, + "rewards/reward_reference/mean": 0.875, + "rewards/reward_reference/std": 0.33136674761772156, + "step": 300 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.09375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4081.0, + "completions/mean_length": 3356.453125, + "completions/mean_terminated_length": 3261.973388671875, + "completions/min_length": 2103.0, + "completions/min_terminated_length": 2103.0, + "epoch": 0.06421333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09332457184791565, + "learning_rate": 1e-06, + "loss": -0.0115, + "num_tokens": 217976461.0, + "reward": 2.6025989055633545, + "reward_std": 0.14444968104362488, + "rewards/cosine_scaled_reward/mean": 0.8382408618927002, + "rewards/cosine_scaled_reward/std": 0.3792869448661804, + "rewards/repetition_penalty_reward/mean": -0.14189210534095764, + "rewards/repetition_penalty_reward/std": 0.040102288126945496, + "rewards/reward_format/mean": 0.984375, + "rewards/reward_format/std": 0.11092304438352585, + "rewards/reward_reference/mean": 0.921875, + "rewards/reward_reference/std": 0.26889389753341675, + "step": 301 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.0625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4088.0, + "completions/mean_length": 3446.98046875, + "completions/mean_terminated_length": 3360.827392578125, + "completions/min_length": 2168.0, + "completions/min_terminated_length": 2168.0, + "epoch": 0.06442666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15771138668060303, + "learning_rate": 1e-06, + "loss": -0.0407, + "num_tokens": 218901280.0, + "reward": 2.5199830532073975, + "reward_std": 0.21550403535366058, + "rewards/cosine_scaled_reward/mean": 0.7968882322311401, + "rewards/cosine_scaled_reward/std": 0.4498888850212097, + "rewards/repetition_penalty_reward/mean": -0.15503031015396118, + "rewards/repetition_penalty_reward/std": 0.05109821632504463, + "rewards/reward_format/mean": 0.987500011920929, + "rewards/reward_format/std": 0.0994100272655487, + "rewards/reward_reference/mean": 0.890625, + "rewards/reward_reference/std": 0.31272050738334656, + "step": 302 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.46875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4066.0, + "completions/mean_length": 3348.5390625, + "completions/mean_terminated_length": 3295.372314453125, + "completions/min_length": 2140.0, + "completions/min_terminated_length": 2140.0, + "epoch": 0.06464, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08387938141822815, + "learning_rate": 1e-06, + "loss": -0.025, + "num_tokens": 219845850.0, + "reward": 2.6532342433929443, + "reward_std": 0.11601746082305908, + "rewards/cosine_scaled_reward/mean": 0.8620700836181641, + "rewards/cosine_scaled_reward/std": 0.33811599016189575, + "rewards/repetition_penalty_reward/mean": -0.14711710810661316, + "rewards/repetition_penalty_reward/std": 0.0417744405567646, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.94140625, + "rewards/reward_reference/std": 0.23532284796237946, + "step": 303 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4089.0, + "completions/mean_length": 3422.9453125, + "completions/mean_terminated_length": 3323.345458984375, + "completions/min_length": 2324.0, + "completions/min_terminated_length": 2324.0, + "epoch": 0.06485333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1095823347568512, + "learning_rate": 1e-06, + "loss": -0.0295, + "num_tokens": 220753264.0, + "reward": 2.5606091022491455, + "reward_std": 0.15420199930667877, + "rewards/cosine_scaled_reward/mean": 0.8168035745620728, + "rewards/cosine_scaled_reward/std": 0.4226503074169159, + "rewards/repetition_penalty_reward/mean": -0.15619438886642456, + "rewards/repetition_penalty_reward/std": 0.04338504374027252, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.90625, + "rewards/reward_reference/std": 0.2920515835285187, + "step": 304 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.59375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4086.0, + "completions/mean_length": 3535.171875, + "completions/mean_terminated_length": 3415.564208984375, + "completions/min_length": 2551.0, + "completions/min_terminated_length": 2551.0, + "epoch": 0.06506666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10752750188112259, + "learning_rate": 1e-06, + "loss": -0.0545, + "num_tokens": 221632156.0, + "reward": 2.4782791137695312, + "reward_std": 0.2294086217880249, + "rewards/cosine_scaled_reward/mean": 0.7806603908538818, + "rewards/cosine_scaled_reward/std": 0.48239925503730774, + "rewards/repetition_penalty_reward/mean": -0.17113137245178223, + "rewards/repetition_penalty_reward/std": 0.04546995088458061, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.875, + "rewards/reward_reference/std": 0.33136674761772156, + "step": 305 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4090.0, + "completions/mean_length": 3519.09375, + "completions/mean_terminated_length": 3338.625732421875, + "completions/min_length": 2358.0, + "completions/min_terminated_length": 2358.0, + "epoch": 0.06528, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2055405229330063, + "learning_rate": 1e-06, + "loss": -0.1014, + "num_tokens": 222445968.0, + "reward": 2.464463472366333, + "reward_std": 0.27007126808166504, + "rewards/cosine_scaled_reward/mean": 0.7708021402359009, + "rewards/cosine_scaled_reward/std": 0.4875037968158722, + "rewards/repetition_penalty_reward/mean": -0.17118236422538757, + "rewards/repetition_penalty_reward/std": 0.05047919228672981, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.87109375, + "rewards/reward_reference/std": 0.33575257658958435, + "step": 306 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4091.0, + "completions/mean_length": 3404.953125, + "completions/mean_terminated_length": 3299.1171875, + "completions/min_length": 2167.0, + "completions/min_terminated_length": 2167.0, + "epoch": 0.06549333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0901985913515091, + "learning_rate": 1e-06, + "loss": -0.044, + "num_tokens": 223336840.0, + "reward": 2.4638819694519043, + "reward_std": 0.20657259225845337, + "rewards/cosine_scaled_reward/mean": 0.7703253030776978, + "rewards/cosine_scaled_reward/std": 0.4742421805858612, + "rewards/repetition_penalty_reward/mean": -0.1650371104478836, + "rewards/repetition_penalty_reward/std": 0.04591949284076691, + "rewards/reward_format/mean": 0.9874999523162842, + "rewards/reward_format/std": 0.0994100272655487, + "rewards/reward_reference/mean": 0.87109375, + "rewards/reward_reference/std": 0.33575257658958435, + "step": 307 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.3125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4050.0, + "completions/mean_length": 3365.19140625, + "completions/mean_terminated_length": 3296.483154296875, + "completions/min_length": 2215.0, + "completions/min_terminated_length": 2215.0, + "epoch": 0.06570666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11064548045396805, + "learning_rate": 1e-06, + "loss": -0.0274, + "num_tokens": 224260045.0, + "reward": 2.606769561767578, + "reward_std": 0.12193898856639862, + "rewards/cosine_scaled_reward/mean": 0.8420050740242004, + "rewards/cosine_scaled_reward/std": 0.3759949207305908, + "rewards/repetition_penalty_reward/mean": -0.16101685166358948, + "rewards/repetition_penalty_reward/std": 0.05338205397129059, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.92578125, + "rewards/reward_reference/std": 0.2626400291919708, + "step": 308 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.53125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4080.0, + "completions/mean_length": 3431.328125, + "completions/mean_terminated_length": 3281.8564453125, + "completions/min_length": 2136.0, + "completions/min_terminated_length": 2136.0, + "epoch": 0.06592, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14071083068847656, + "learning_rate": 1e-06, + "loss": -0.0582, + "num_tokens": 225100965.0, + "reward": 2.4305174350738525, + "reward_std": 0.22230038046836853, + "rewards/cosine_scaled_reward/mean": 0.7429267764091492, + "rewards/cosine_scaled_reward/std": 0.5048913955688477, + "rewards/repetition_penalty_reward/mean": -0.1639719009399414, + "rewards/repetition_penalty_reward/std": 0.047285351902246475, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.8515625, + "rewards/reward_reference/std": 0.3562295734882355, + "step": 309 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.03125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4095.0, + "completions/mean_length": 3377.625, + "completions/mean_terminated_length": 3278.64892578125, + "completions/min_length": 2386.0, + "completions/min_terminated_length": 2386.0, + "epoch": 0.06613333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08242145925760269, + "learning_rate": 1e-06, + "loss": -0.0368, + "num_tokens": 226001273.0, + "reward": 2.520883560180664, + "reward_std": 0.16118432581424713, + "rewards/cosine_scaled_reward/mean": 0.7903509736061096, + "rewards/cosine_scaled_reward/std": 0.4486011564731598, + "rewards/repetition_penalty_reward/mean": -0.15306127071380615, + "rewards/repetition_penalty_reward/std": 0.04898226633667946, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.88671875, + "rewards/reward_reference/std": 0.31755712628364563, + "step": 310 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4053.0, + "completions/mean_length": 3350.13671875, + "completions/mean_terminated_length": 3232.013671875, + "completions/min_length": 2259.0, + "completions/min_terminated_length": 2259.0, + "epoch": 0.06634666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1643557995557785, + "learning_rate": 1e-06, + "loss": -0.05, + "num_tokens": 226871648.0, + "reward": 2.4743309020996094, + "reward_std": 0.24664385616779327, + "rewards/cosine_scaled_reward/mean": 0.7613555192947388, + "rewards/cosine_scaled_reward/std": 0.47476089000701904, + "rewards/repetition_penalty_reward/mean": -0.15811839699745178, + "rewards/repetition_penalty_reward/std": 0.054658714681863785, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.87109375, + "rewards/reward_reference/std": 0.33575257658958435, + "step": 311 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4093.0, + "completions/mean_length": 3321.578125, + "completions/mean_terminated_length": 3198.932373046875, + "completions/min_length": 2180.0, + "completions/min_terminated_length": 2180.0, + "epoch": 0.06656, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09745794534683228, + "learning_rate": 1e-06, + "loss": -0.0309, + "num_tokens": 227734296.0, + "reward": 2.3746564388275146, + "reward_std": 0.18470577895641327, + "rewards/cosine_scaled_reward/mean": 0.6990936398506165, + "rewards/cosine_scaled_reward/std": 0.5295417308807373, + "rewards/repetition_penalty_reward/mean": -0.15021856129169464, + "rewards/repetition_penalty_reward/std": 0.05101132020354271, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.83203125, + "rewards/reward_reference/std": 0.3745708465576172, + "step": 312 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4064.0, + "completions/mean_length": 3381.27734375, + "completions/mean_terminated_length": 3271.8154296875, + "completions/min_length": 2172.0, + "completions/min_terminated_length": 2172.0, + "epoch": 0.06677333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06862661987543106, + "learning_rate": 1e-06, + "loss": -0.0099, + "num_tokens": 228628475.0, + "reward": 2.5650148391723633, + "reward_std": 0.10304483771324158, + "rewards/cosine_scaled_reward/mean": 0.8104842901229858, + "rewards/cosine_scaled_reward/std": 0.4223276972770691, + "rewards/repetition_penalty_reward/mean": -0.14546939730644226, + "rewards/repetition_penalty_reward/std": 0.04435879364609718, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.90625, + "rewards/reward_reference/std": 0.2920515835285187, + "step": 313 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.71875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4072.0, + "completions/mean_length": 3382.66015625, + "completions/mean_terminated_length": 3246.6279296875, + "completions/min_length": 2071.0, + "completions/min_terminated_length": 2071.0, + "epoch": 0.06698666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1515108048915863, + "learning_rate": 1e-06, + "loss": -0.0663, + "num_tokens": 229494192.0, + "reward": 2.4176552295684814, + "reward_std": 0.323119193315506, + "rewards/cosine_scaled_reward/mean": 0.7321640849113464, + "rewards/cosine_scaled_reward/std": 0.5087533593177795, + "rewards/repetition_penalty_reward/mean": -0.14654016494750977, + "rewards/repetition_penalty_reward/std": 0.04701191931962967, + "rewards/reward_format/mean": 0.984375, + "rewards/reward_format/std": 0.11092304438352585, + "rewards/reward_reference/mean": 0.84765625, + "rewards/reward_reference/std": 0.3600577116012573, + "step": 314 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.46875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4091.0, + "completions/mean_length": 3217.7421875, + "completions/mean_terminated_length": 3155.27197265625, + "completions/min_length": 2127.0, + "completions/min_terminated_length": 2127.0, + "epoch": 0.0672, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08618131279945374, + "learning_rate": 1e-06, + "loss": -0.0266, + "num_tokens": 230404190.0, + "reward": 2.642916202545166, + "reward_std": 0.12977439165115356, + "rewards/cosine_scaled_reward/mean": 0.8511161804199219, + "rewards/cosine_scaled_reward/std": 0.3265201151371002, + "rewards/repetition_penalty_reward/mean": -0.143356055021286, + "rewards/repetition_penalty_reward/std": 0.046873416751623154, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.94140625, + "rewards/reward_reference/std": 0.23532284796237946, + "step": 315 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.34375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4093.0, + "completions/mean_length": 3232.3515625, + "completions/mean_terminated_length": 3155.17431640625, + "completions/min_length": 2194.0, + "completions/min_terminated_length": 2194.0, + "epoch": 0.06741333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0868743509054184, + "learning_rate": 1e-06, + "loss": -0.0107, + "num_tokens": 231304712.0, + "reward": 2.633847236633301, + "reward_std": 0.17269165813922882, + "rewards/cosine_scaled_reward/mean": 0.8517568707466125, + "rewards/cosine_scaled_reward/std": 0.32880187034606934, + "rewards/repetition_penalty_reward/mean": -0.14994081854820251, + "rewards/repetition_penalty_reward/std": 0.0538417249917984, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626226335763931, + "rewards/reward_reference/mean": 0.94140625, + "rewards/reward_reference/std": 0.23532284796237946, + "step": 316 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.84375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4088.0, + "completions/mean_length": 3367.59765625, + "completions/mean_terminated_length": 3244.5341796875, + "completions/min_length": 2088.0, + "completions/min_terminated_length": 2088.0, + "epoch": 0.06762666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11475294828414917, + "learning_rate": 1e-06, + "loss": -0.0357, + "num_tokens": 232173373.0, + "reward": 2.503598213195801, + "reward_std": 0.21392254531383514, + "rewards/cosine_scaled_reward/mean": 0.7742725610733032, + "rewards/cosine_scaled_reward/std": 0.4625420868396759, + "rewards/repetition_penalty_reward/mean": -0.14333048462867737, + "rewards/repetition_penalty_reward/std": 0.04298888146877289, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.87890625, + "rewards/reward_reference/std": 0.3268752694129944, + "step": 317 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.8125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4039.0, + "completions/mean_length": 3266.171875, + "completions/mean_terminated_length": 3121.522705078125, + "completions/min_length": 1969.0, + "completions/min_terminated_length": 1969.0, + "epoch": 0.06784, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16944526135921478, + "learning_rate": 1e-06, + "loss": -0.0526, + "num_tokens": 233010949.0, + "reward": 2.3641538619995117, + "reward_std": 0.34667205810546875, + "rewards/cosine_scaled_reward/mean": 0.6916285753250122, + "rewards/cosine_scaled_reward/std": 0.5271083116531372, + "rewards/repetition_penalty_reward/mean": -0.14934960007667542, + "rewards/repetition_penalty_reward/std": 0.047472234815359116, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.828125, + "rewards/reward_reference/std": 0.3780108094215393, + "step": 318 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.65625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4087.0, + "completions/mean_length": 3283.04296875, + "completions/mean_terminated_length": 3118.925048828125, + "completions/min_length": 2031.0, + "completions/min_terminated_length": 2031.0, + "epoch": 0.06805333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16193901002407074, + "learning_rate": 1e-06, + "loss": -0.0488, + "num_tokens": 233842544.0, + "reward": 2.514294147491455, + "reward_std": 0.23703011870384216, + "rewards/cosine_scaled_reward/mean": 0.7864575386047363, + "rewards/cosine_scaled_reward/std": 0.42908984422683716, + "rewards/repetition_penalty_reward/mean": -0.15497568249702454, + "rewards/repetition_penalty_reward/std": 0.05239295959472656, + "rewards/reward_format/mean": 0.984375, + "rewards/reward_format/std": 0.11092304438352585, + "rewards/reward_reference/mean": 0.8984375, + "rewards/reward_reference/std": 0.3026638329029083, + "step": 319 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.4375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4085.0, + "completions/mean_length": 3214.52734375, + "completions/mean_terminated_length": 3147.861572265625, + "completions/min_length": 2131.0, + "completions/min_terminated_length": 2131.0, + "epoch": 0.06826666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1349935382604599, + "learning_rate": 1e-06, + "loss": -0.0489, + "num_tokens": 234753935.0, + "reward": 2.569523334503174, + "reward_std": 0.24764062464237213, + "rewards/cosine_scaled_reward/mean": 0.8103736042976379, + "rewards/cosine_scaled_reward/std": 0.39255258440971375, + "rewards/repetition_penalty_reward/mean": -0.15491291880607605, + "rewards/repetition_penalty_reward/std": 0.04374931380152702, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9140625, + "rewards/reward_reference/std": 0.28082075715065, + "step": 320 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.21875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4049.0, + "completions/mean_length": 3301.375, + "completions/mean_terminated_length": 3215.376708984375, + "completions/min_length": 1930.0, + "completions/min_terminated_length": 1930.0, + "epoch": 0.06848, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11944566667079926, + "learning_rate": 1e-06, + "loss": -0.0181, + "num_tokens": 235665967.0, + "reward": 2.621381998062134, + "reward_std": 0.19357608258724213, + "rewards/cosine_scaled_reward/mean": 0.8457991480827332, + "rewards/cosine_scaled_reward/std": 0.35158571600914, + "rewards/repetition_penalty_reward/mean": -0.16191723942756653, + "rewards/repetition_penalty_reward/std": 0.051886800676584244, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9375, + "rewards/reward_reference/std": 0.24253563582897186, + "step": 321 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.0625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4067.0, + "completions/mean_length": 3223.7109375, + "completions/mean_terminated_length": 3107.92041015625, + "completions/min_length": 1928.0, + "completions/min_terminated_length": 1928.0, + "epoch": 0.06869333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12029201537370682, + "learning_rate": 1e-06, + "loss": -0.0333, + "num_tokens": 236527273.0, + "reward": 2.5427041053771973, + "reward_std": 0.19083553552627563, + "rewards/cosine_scaled_reward/mean": 0.7951414585113525, + "rewards/cosine_scaled_reward/std": 0.41195037961006165, + "rewards/repetition_penalty_reward/mean": -0.15478110313415527, + "rewards/repetition_penalty_reward/std": 0.04209055379033089, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.90234375, + "rewards/reward_reference/std": 0.29743078351020813, + "step": 322 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.15625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4068.0, + "completions/mean_length": 3223.125, + "completions/mean_terminated_length": 3120.209716796875, + "completions/min_length": 1631.0, + "completions/min_terminated_length": 1631.0, + "epoch": 0.06890666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10623487830162048, + "learning_rate": 1e-06, + "loss": -0.021, + "num_tokens": 237403709.0, + "reward": 2.5744786262512207, + "reward_std": 0.15794894099235535, + "rewards/cosine_scaled_reward/mean": 0.8062847256660461, + "rewards/cosine_scaled_reward/std": 0.39761340618133545, + "rewards/repetition_penalty_reward/mean": -0.1419624239206314, + "rewards/repetition_penalty_reward/std": 0.045276276767253876, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.91015625, + "rewards/reward_reference/std": 0.2865179479122162, + "step": 323 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.59375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4056.0, + "completions/mean_length": 3123.16796875, + "completions/mean_terminated_length": 3071.123291015625, + "completions/min_length": 1944.0, + "completions/min_terminated_length": 1944.0, + "epoch": 0.06912, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0978594645857811, + "learning_rate": 1e-06, + "loss": -0.0269, + "num_tokens": 238315692.0, + "reward": 2.6401233673095703, + "reward_std": 0.13173067569732666, + "rewards/cosine_scaled_reward/mean": 0.8438149690628052, + "rewards/cosine_scaled_reward/std": 0.3169075548648834, + "rewards/repetition_penalty_reward/mean": -0.14900407195091248, + "rewards/repetition_penalty_reward/std": 0.044161971658468246, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9453125, + "rewards/reward_reference/std": 0.22781464457511902, + "step": 324 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.1875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4085.0, + "completions/mean_length": 3193.15625, + "completions/mean_terminated_length": 3091.095458984375, + "completions/min_length": 1871.0, + "completions/min_terminated_length": 1871.0, + "epoch": 0.06933333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1280767023563385, + "learning_rate": 1e-06, + "loss": -0.0333, + "num_tokens": 239186876.0, + "reward": 2.480194568634033, + "reward_std": 0.1864958554506302, + "rewards/cosine_scaled_reward/mean": 0.748255729675293, + "rewards/cosine_scaled_reward/std": 0.46193239092826843, + "rewards/repetition_penalty_reward/mean": -0.14306114614009857, + "rewards/repetition_penalty_reward/std": 0.0422794334590435, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.875, + "rewards/reward_reference/std": 0.33136674761772156, + "step": 325 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.0625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4089.0, + "completions/mean_length": 3207.65625, + "completions/mean_terminated_length": 3089.734619140625, + "completions/min_length": 1846.0, + "completions/min_terminated_length": 1846.0, + "epoch": 0.06954666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10687022656202316, + "learning_rate": 1e-06, + "loss": -0.0097, + "num_tokens": 240050532.0, + "reward": 2.498737335205078, + "reward_std": 0.19235044717788696, + "rewards/cosine_scaled_reward/mean": 0.7589946389198303, + "rewards/cosine_scaled_reward/std": 0.4520692229270935, + "rewards/repetition_penalty_reward/mean": -0.13916371762752533, + "rewards/repetition_penalty_reward/std": 0.04271097108721733, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.87890625, + "rewards/reward_reference/std": 0.3268752694129944, + "step": 326 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.28125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4095.0, + "completions/mean_length": 3233.9765625, + "completions/mean_terminated_length": 3148.884033203125, + "completions/min_length": 2025.0, + "completions/min_terminated_length": 2025.0, + "epoch": 0.06976, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08872903883457184, + "learning_rate": 1e-06, + "loss": -0.04, + "num_tokens": 240950130.0, + "reward": 2.476565361022949, + "reward_std": 0.15382640063762665, + "rewards/cosine_scaled_reward/mean": 0.7487243413925171, + "rewards/cosine_scaled_reward/std": 0.4689893126487732, + "rewards/repetition_penalty_reward/mean": -0.14012770354747772, + "rewards/repetition_penalty_reward/std": 0.03780807927250862, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.87109375, + "rewards/reward_reference/std": 0.33575257658958435, + "step": 327 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.5625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4078.0, + "completions/mean_length": 3089.98828125, + "completions/mean_terminated_length": 3031.7890625, + "completions/min_length": 1994.0, + "completions/min_terminated_length": 1994.0, + "epoch": 0.06997333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.131632000207901, + "learning_rate": 1e-06, + "loss": -0.008, + "num_tokens": 241849251.0, + "reward": 2.537092685699463, + "reward_std": 0.14982327818870544, + "rewards/cosine_scaled_reward/mean": 0.784229040145874, + "rewards/cosine_scaled_reward/std": 0.4037418067455292, + "rewards/repetition_penalty_reward/mean": -0.14010494947433472, + "rewards/repetition_penalty_reward/std": 0.04123867303133011, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626226335763931, + "rewards/reward_reference/mean": 0.90234375, + "rewards/reward_reference/std": 0.29743078351020813, + "step": 328 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.6875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4085.0, + "completions/mean_length": 2925.15625, + "completions/mean_terminated_length": 2877.560791015625, + "completions/min_length": 1770.0, + "completions/min_terminated_length": 1770.0, + "epoch": 0.07018666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10088398307561874, + "learning_rate": 1e-06, + "loss": -0.0143, + "num_tokens": 242712703.0, + "reward": 2.5291996002197266, + "reward_std": 0.13664719462394714, + "rewards/cosine_scaled_reward/mean": 0.7678295969963074, + "rewards/cosine_scaled_reward/std": 0.38781559467315674, + "rewards/repetition_penalty_reward/mean": -0.13238000869750977, + "rewards/repetition_penalty_reward/std": 0.041910383850336075, + "rewards/reward_format/mean": 0.987500011920929, + "rewards/reward_format/std": 0.0994100272655487, + "rewards/reward_reference/mean": 0.90625, + "rewards/reward_reference/std": 0.2920515835285187, + "step": 329 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.46875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4075.0, + "completions/mean_length": 3088.9453125, + "completions/mean_terminated_length": 3017.313720703125, + "completions/min_length": 1751.0, + "completions/min_terminated_length": 1751.0, + "epoch": 0.0704, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1253613382577896, + "learning_rate": 1e-06, + "loss": -0.0298, + "num_tokens": 243592617.0, + "reward": 2.6539814472198486, + "reward_std": 0.12508264183998108, + "rewards/cosine_scaled_reward/mean": 0.8431792855262756, + "rewards/cosine_scaled_reward/std": 0.3038645088672638, + "rewards/repetition_penalty_reward/mean": -0.13294783234596252, + "rewards/repetition_penalty_reward/std": 0.044496990740299225, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626226335763931, + "rewards/reward_reference/mean": 0.953125, + "rewards/reward_reference/std": 0.21178513765335083, + "step": 330 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.59375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4029.0, + "completions/mean_length": 3092.9296875, + "completions/mean_terminated_length": 3039.267333984375, + "completions/min_length": 1928.0, + "completions/min_terminated_length": 1928.0, + "epoch": 0.07061333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1407911628484726, + "learning_rate": 1e-06, + "loss": -0.0164, + "num_tokens": 244490035.0, + "reward": 2.546139717102051, + "reward_std": 0.24451303482055664, + "rewards/cosine_scaled_reward/mean": 0.7802634239196777, + "rewards/cosine_scaled_reward/std": 0.40869632363319397, + "rewards/repetition_penalty_reward/mean": -0.13646754622459412, + "rewards/repetition_penalty_reward/std": 0.03807799518108368, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.90234375, + "rewards/reward_reference/std": 0.29743078351020813, + "step": 331 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.78125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4026.0, + "completions/mean_length": 3078.32421875, + "completions/mean_terminated_length": 3049.71484375, + "completions/min_length": 2056.0, + "completions/min_terminated_length": 2056.0, + "epoch": 0.07082666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09870472550392151, + "learning_rate": 1e-06, + "loss": -0.0263, + "num_tokens": 245405750.0, + "reward": 2.5187315940856934, + "reward_std": 0.11564701050519943, + "rewards/cosine_scaled_reward/mean": 0.7655521035194397, + "rewards/cosine_scaled_reward/std": 0.4269881546497345, + "rewards/repetition_penalty_reward/mean": -0.1335393190383911, + "rewards/repetition_penalty_reward/std": 0.040202755481004715, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.88671875, + "rewards/reward_reference/std": 0.31755712628364563, + "step": 332 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.09375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4085.0, + "completions/mean_length": 3170.1171875, + "completions/mean_terminated_length": 3051.83251953125, + "completions/min_length": 1803.0, + "completions/min_terminated_length": 1803.0, + "epoch": 0.07104, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13158267736434937, + "learning_rate": 1e-06, + "loss": -0.021, + "num_tokens": 246258068.0, + "reward": 2.388303756713867, + "reward_std": 0.20445595681667328, + "rewards/cosine_scaled_reward/mean": 0.6967047452926636, + "rewards/cosine_scaled_reward/std": 0.50649094581604, + "rewards/repetition_penalty_reward/mean": -0.13730722665786743, + "rewards/repetition_penalty_reward/std": 0.04322898015379906, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.83203125, + "rewards/reward_reference/std": 0.3745708465576172, + "step": 333 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.5, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4076.0, + "completions/mean_length": 2926.36328125, + "completions/mean_terminated_length": 2848.3876953125, + "completions/min_length": 1722.0, + "completions/min_terminated_length": 1722.0, + "epoch": 0.07125333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13790081441402435, + "learning_rate": 1e-06, + "loss": -0.0391, + "num_tokens": 247097269.0, + "reward": 2.519500732421875, + "reward_std": 0.17879009246826172, + "rewards/cosine_scaled_reward/mean": 0.7537988424301147, + "rewards/cosine_scaled_reward/std": 0.4042138457298279, + "rewards/repetition_penalty_reward/mean": -0.13039204478263855, + "rewards/repetition_penalty_reward/std": 0.04133962467312813, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.90234375, + "rewards/reward_reference/std": 0.29743078351020813, + "step": 334 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.40625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4079.0, + "completions/mean_length": 3129.53125, + "completions/mean_terminated_length": 3052.050537109375, + "completions/min_length": 1994.0, + "completions/min_terminated_length": 1994.0, + "epoch": 0.07146666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1301202028989792, + "learning_rate": 1e-06, + "loss": -0.0415, + "num_tokens": 247987881.0, + "reward": 2.525852918624878, + "reward_std": 0.20776943862438202, + "rewards/cosine_scaled_reward/mean": 0.772997260093689, + "rewards/cosine_scaled_reward/std": 0.424231618642807, + "rewards/repetition_penalty_reward/mean": -0.13542558252811432, + "rewards/repetition_penalty_reward/std": 0.03885306045413017, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.89453125, + "rewards/reward_reference/std": 0.3077581524848938, + "step": 335 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.4375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4088.0, + "completions/mean_length": 3204.3828125, + "completions/mean_terminated_length": 3136.94970703125, + "completions/min_length": 1910.0, + "completions/min_terminated_length": 1910.0, + "epoch": 0.07168, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14549526572227478, + "learning_rate": 1e-06, + "loss": -0.0383, + "num_tokens": 248902923.0, + "reward": 2.6701064109802246, + "reward_std": 0.16786515712738037, + "rewards/cosine_scaled_reward/mean": 0.8573368787765503, + "rewards/cosine_scaled_reward/std": 0.31324511766433716, + "rewards/repetition_penalty_reward/mean": -0.1333240270614624, + "rewards/repetition_penalty_reward/std": 0.03887150436639786, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.94921875, + "rewards/reward_reference/std": 0.21998079121112823, + "step": 336 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4078.0, + "completions/mean_length": 3134.62890625, + "completions/mean_terminated_length": 3053.15673828125, + "completions/min_length": 1814.0, + "completions/min_terminated_length": 1814.0, + "epoch": 0.07189333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.114342100918293, + "learning_rate": 1e-06, + "loss": -0.0226, + "num_tokens": 249785728.0, + "reward": 2.490480422973633, + "reward_std": 0.15718211233615875, + "rewards/cosine_scaled_reward/mean": 0.7543493509292603, + "rewards/cosine_scaled_reward/std": 0.44703423976898193, + "rewards/repetition_penalty_reward/mean": -0.13652509450912476, + "rewards/repetition_penalty_reward/std": 0.04738989472389221, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718994140625, + "rewards/reward_reference/mean": 0.87890625, + "rewards/reward_reference/std": 0.3268752694129944, + "step": 337 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.4375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4051.0, + "completions/mean_length": 3048.6015625, + "completions/mean_terminated_length": 2969.38671875, + "completions/min_length": 1985.0, + "completions/min_terminated_length": 1985.0, + "epoch": 0.07210666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14780311286449432, + "learning_rate": 1e-06, + "loss": 0.0154, + "num_tokens": 250653090.0, + "reward": 2.460996150970459, + "reward_std": 0.24623346328735352, + "rewards/cosine_scaled_reward/mean": 0.7283567786216736, + "rewards/cosine_scaled_reward/std": 0.45921623706817627, + "rewards/repetition_penalty_reward/mean": -0.13142311573028564, + "rewards/repetition_penalty_reward/std": 0.0422639399766922, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.8671875, + "rewards/reward_reference/std": 0.3400367796421051, + "step": 338 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.5625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4073.0, + "completions/mean_length": 3052.28125, + "completions/mean_terminated_length": 2991.900634765625, + "completions/min_length": 1768.0, + "completions/min_terminated_length": 1768.0, + "epoch": 0.07232, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15734364092350006, + "learning_rate": 1e-06, + "loss": -0.0126, + "num_tokens": 251538318.0, + "reward": 2.5594005584716797, + "reward_std": 0.19011801481246948, + "rewards/cosine_scaled_reward/mean": 0.7857754826545715, + "rewards/cosine_scaled_reward/std": 0.39223918318748474, + "rewards/repetition_penalty_reward/mean": -0.13028106093406677, + "rewards/repetition_penalty_reward/std": 0.03792421892285347, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.91015625, + "rewards/reward_reference/std": 0.2865179479122162, + "step": 339 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.40625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4084.0, + "completions/mean_length": 3079.796875, + "completions/mean_terminated_length": 2998.328857421875, + "completions/min_length": 1895.0, + "completions/min_terminated_length": 1895.0, + "epoch": 0.07253333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17438679933547974, + "learning_rate": 1e-06, + "loss": -0.0151, + "num_tokens": 252415094.0, + "reward": 2.578650951385498, + "reward_std": 0.22237978875637054, + "rewards/cosine_scaled_reward/mean": 0.7927536368370056, + "rewards/cosine_scaled_reward/std": 0.3868005573749542, + "rewards/repetition_penalty_reward/mean": -0.1320713758468628, + "rewards/repetition_penalty_reward/std": 0.04136000573635101, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.91796875, + "rewards/reward_reference/std": 0.2749498784542084, + "step": 340 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.34375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4073.0, + "completions/mean_length": 3052.14453125, + "completions/mean_terminated_length": 2958.86376953125, + "completions/min_length": 1826.0, + "completions/min_terminated_length": 1826.0, + "epoch": 0.07274666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13218127191066742, + "learning_rate": 1e-06, + "loss": -0.03, + "num_tokens": 253276631.0, + "reward": 2.4949088096618652, + "reward_std": 0.22383946180343628, + "rewards/cosine_scaled_reward/mean": 0.7475392818450928, + "rewards/cosine_scaled_reward/std": 0.4382774531841278, + "rewards/repetition_penalty_reward/mean": -0.12841179966926575, + "rewards/repetition_penalty_reward/std": 0.036752402782440186, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.87890625, + "rewards/reward_reference/std": 0.3268752694129944, + "step": 341 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.28125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4034.0, + "completions/mean_length": 3101.42578125, + "completions/mean_terminated_length": 3003.2490234375, + "completions/min_length": 1932.0, + "completions/min_terminated_length": 1932.0, + "epoch": 0.07296, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16702738404273987, + "learning_rate": 1e-06, + "loss": -0.0343, + "num_tokens": 254126232.0, + "reward": 2.5561437606811523, + "reward_std": 0.23330280184745789, + "rewards/cosine_scaled_reward/mean": 0.7809128761291504, + "rewards/cosine_scaled_reward/std": 0.40656355023384094, + "rewards/repetition_penalty_reward/mean": -0.12789398431777954, + "rewards/repetition_penalty_reward/std": 0.036753393709659576, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.90625, + "rewards/reward_reference/std": 0.2920515835285187, + "step": 342 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.40625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4090.0, + "completions/mean_length": 3152.01953125, + "completions/mean_terminated_length": 3076.341552734375, + "completions/min_length": 1919.0, + "completions/min_terminated_length": 1919.0, + "epoch": 0.07317333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0745634138584137, + "learning_rate": 1e-06, + "loss": -0.0119, + "num_tokens": 255025093.0, + "reward": 2.526113748550415, + "reward_std": 0.09965360909700394, + "rewards/cosine_scaled_reward/mean": 0.7831565737724304, + "rewards/cosine_scaled_reward/std": 0.41442909836769104, + "rewards/repetition_penalty_reward/mean": -0.1328241229057312, + "rewards/repetition_penalty_reward/std": 0.053901638835668564, + "rewards/reward_format/mean": 0.9812500476837158, + "rewards/reward_format/std": 0.12126781791448593, + "rewards/reward_reference/mean": 0.89453125, + "rewards/reward_reference/std": 0.3077581524848938, + "step": 343 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.53125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4067.0, + "completions/mean_length": 3020.8671875, + "completions/mean_terminated_length": 2953.950439453125, + "completions/min_length": 1875.0, + "completions/min_terminated_length": 1875.0, + "epoch": 0.07338666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11070467531681061, + "learning_rate": 1e-06, + "loss": -0.0319, + "num_tokens": 255898239.0, + "reward": 2.5960850715637207, + "reward_std": 0.16182690858840942, + "rewards/cosine_scaled_reward/mean": 0.806800127029419, + "rewards/cosine_scaled_reward/std": 0.3528696298599243, + "rewards/repetition_penalty_reward/mean": -0.13024629652500153, + "rewards/repetition_penalty_reward/std": 0.03998575359582901, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.92578125, + "rewards/reward_reference/std": 0.2626400291919708, + "step": 344 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.1875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4046.0, + "completions/mean_length": 3150.12890625, + "completions/mean_terminated_length": 3043.2041015625, + "completions/min_length": 1923.0, + "completions/min_terminated_length": 1923.0, + "epoch": 0.0736, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1139230728149414, + "learning_rate": 1e-06, + "loss": -0.0359, + "num_tokens": 256756796.0, + "reward": 2.490581512451172, + "reward_std": 0.23320190608501434, + "rewards/cosine_scaled_reward/mean": 0.751013994216919, + "rewards/cosine_scaled_reward/std": 0.45080894231796265, + "rewards/repetition_penalty_reward/mean": -0.13699480891227722, + "rewards/repetition_penalty_reward/std": 0.047232817858457565, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.8828125, + "rewards/reward_reference/std": 0.3222736418247223, + "step": 345 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.15625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4083.0, + "completions/mean_length": 3135.7890625, + "completions/mean_terminated_length": 3022.576416015625, + "completions/min_length": 1962.0, + "completions/min_terminated_length": 1962.0, + "epoch": 0.07381333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13618801534175873, + "learning_rate": 1e-06, + "loss": -0.0365, + "num_tokens": 257608746.0, + "reward": 2.4578542709350586, + "reward_std": 0.18854659795761108, + "rewards/cosine_scaled_reward/mean": 0.7333296537399292, + "rewards/cosine_scaled_reward/std": 0.4674873352050781, + "rewards/repetition_penalty_reward/mean": -0.13641303777694702, + "rewards/repetition_penalty_reward/std": 0.04397253319621086, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.8671875, + "rewards/reward_reference/std": 0.3400367796421051, + "step": 346 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.1875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4084.0, + "completions/mean_length": 3082.71484375, + "completions/mean_terminated_length": 2968.16943359375, + "completions/min_length": 1786.0, + "completions/min_terminated_length": 1786.0, + "epoch": 0.07402666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17331695556640625, + "learning_rate": 1e-06, + "loss": -0.0341, + "num_tokens": 258451761.0, + "reward": 2.4771933555603027, + "reward_std": 0.28758424520492554, + "rewards/cosine_scaled_reward/mean": 0.7422814965248108, + "rewards/cosine_scaled_reward/std": 0.44633299112319946, + "rewards/repetition_penalty_reward/mean": -0.13774433732032776, + "rewards/repetition_penalty_reward/std": 0.03738153725862503, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.87890625, + "rewards/reward_reference/std": 0.3268752694129944, + "step": 347 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4063.0, + "completions/mean_length": 3093.9140625, + "completions/mean_terminated_length": 2970.850830078125, + "completions/min_length": 1671.0, + "completions/min_terminated_length": 1671.0, + "epoch": 0.07424, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1594753861427307, + "learning_rate": 1e-06, + "loss": -0.0315, + "num_tokens": 259287891.0, + "reward": 2.4308295249938965, + "reward_std": 0.21208155155181885, + "rewards/cosine_scaled_reward/mean": 0.7279205322265625, + "rewards/cosine_scaled_reward/std": 0.4628751277923584, + "rewards/repetition_penalty_reward/mean": -0.14005976915359497, + "rewards/repetition_penalty_reward/std": 0.046011701226234436, + "rewards/reward_format/mean": 0.9718749523162842, + "rewards/reward_format/std": 0.1476283222436905, + "rewards/reward_reference/mean": 0.87109375, + "rewards/reward_reference/std": 0.33575257658958435, + "step": 348 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.3125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4085.0, + "completions/mean_length": 3122.5234375, + "completions/mean_terminated_length": 3031.000244140625, + "completions/min_length": 1635.0, + "completions/min_terminated_length": 1635.0, + "epoch": 0.07445333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1562374234199524, + "learning_rate": 1e-06, + "loss": 0.0133, + "num_tokens": 260165049.0, + "reward": 2.5966033935546875, + "reward_std": 0.22522631287574768, + "rewards/cosine_scaled_reward/mean": 0.8105063438415527, + "rewards/cosine_scaled_reward/std": 0.3691033720970154, + "rewards/repetition_penalty_reward/mean": -0.13187175989151, + "rewards/repetition_penalty_reward/std": 0.03818640485405922, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.91796875, + "rewards/reward_reference/std": 0.2749498784542084, + "step": 349 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.21875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4065.0, + "completions/mean_length": 3308.6328125, + "completions/mean_terminated_length": 3223.419921875, + "completions/min_length": 2104.0, + "completions/min_terminated_length": 2104.0, + "epoch": 0.07466666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13112296164035797, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 261085359.0, + "reward": 2.4156880378723145, + "reward_std": 0.11315947771072388, + "rewards/cosine_scaled_reward/mean": 0.711700975894928, + "rewards/cosine_scaled_reward/std": 0.5184807181358337, + "rewards/repetition_penalty_reward/mean": -0.12882539629936218, + "rewards/repetition_penalty_reward/std": 0.03247005119919777, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.8359375, + "rewards/reward_reference/std": 0.3710577189922333, + "step": 350 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.25, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4079.0, + "completions/mean_length": 3232.9296875, + "completions/mean_terminated_length": 3143.646484375, + "completions/min_length": 1802.0, + "completions/min_terminated_length": 1802.0, + "epoch": 0.07488, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18751507997512817, + "learning_rate": 1e-06, + "loss": -0.0467, + "num_tokens": 261980705.0, + "reward": 2.4058680534362793, + "reward_std": 0.25372546911239624, + "rewards/cosine_scaled_reward/mean": 0.7078858017921448, + "rewards/cosine_scaled_reward/std": 0.5087409019470215, + "rewards/repetition_penalty_reward/mean": -0.13873633742332458, + "rewards/repetition_penalty_reward/std": 0.04568910971283913, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.83984375, + "rewards/reward_reference/std": 0.36746934056282043, + "step": 351 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.1875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4085.0, + "completions/mean_length": 3219.00390625, + "completions/mean_terminated_length": 3119.864990234375, + "completions/min_length": 1868.0, + "completions/min_terminated_length": 1868.0, + "epoch": 0.07509333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09587662667036057, + "learning_rate": 1e-06, + "loss": -0.0318, + "num_tokens": 262862994.0, + "reward": 2.5543429851531982, + "reward_std": 0.1828855276107788, + "rewards/cosine_scaled_reward/mean": 0.7893514633178711, + "rewards/cosine_scaled_reward/std": 0.4200114607810974, + "rewards/repetition_penalty_reward/mean": -0.12719596922397614, + "rewards/repetition_penalty_reward/std": 0.042319945991039276, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.8984375, + "rewards/reward_reference/std": 0.3026638329029083, + "step": 352 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.1875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4078.0, + "completions/mean_length": 3099.60546875, + "completions/mean_terminated_length": 2986.969482421875, + "completions/min_length": 1788.0, + "completions/min_terminated_length": 1788.0, + "epoch": 0.07530666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1518796682357788, + "learning_rate": 1e-06, + "loss": -0.0567, + "num_tokens": 263715473.0, + "reward": 2.518235683441162, + "reward_std": 0.22827556729316711, + "rewards/cosine_scaled_reward/mean": 0.7613465189933777, + "rewards/cosine_scaled_reward/std": 0.42951709032058716, + "rewards/repetition_penalty_reward/mean": -0.12357960641384125, + "rewards/repetition_penalty_reward/std": 0.043831564486026764, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.88671875, + "rewards/reward_reference/std": 0.31755712628364563, + "step": 353 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.34375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4073.0, + "completions/mean_length": 3060.171875, + "completions/mean_terminated_length": 2967.6083984375, + "completions/min_length": 1789.0, + "completions/min_terminated_length": 1789.0, + "epoch": 0.07552, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10913080722093582, + "learning_rate": 1e-06, + "loss": -0.0263, + "num_tokens": 264573281.0, + "reward": 2.6362128257751465, + "reward_std": 0.16269515454769135, + "rewards/cosine_scaled_reward/mean": 0.8276014924049377, + "rewards/cosine_scaled_reward/std": 0.3232189416885376, + "rewards/repetition_penalty_reward/mean": -0.12966996431350708, + "rewards/repetition_penalty_reward/std": 0.04010913148522377, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.94140625, + "rewards/reward_reference/std": 0.23532284796237946, + "step": 354 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.1875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4082.0, + "completions/mean_length": 3119.25390625, + "completions/mean_terminated_length": 3008.839111328125, + "completions/min_length": 1870.0, + "completions/min_terminated_length": 1870.0, + "epoch": 0.07573333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1123742014169693, + "learning_rate": 1e-06, + "loss": -0.0572, + "num_tokens": 265430770.0, + "reward": 2.573673725128174, + "reward_std": 0.17991378903388977, + "rewards/cosine_scaled_reward/mean": 0.7907612323760986, + "rewards/cosine_scaled_reward/std": 0.39641907811164856, + "rewards/repetition_penalty_reward/mean": -0.12411877512931824, + "rewards/repetition_penalty_reward/std": 0.039008140563964844, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.91015625, + "rewards/reward_reference/std": 0.2865179479122162, + "step": 355 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.40625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4094.0, + "completions/mean_length": 3150.96875, + "completions/mean_terminated_length": 3075.20654296875, + "completions/min_length": 1805.0, + "completions/min_terminated_length": 1805.0, + "epoch": 0.07594666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1424713432788849, + "learning_rate": 1e-06, + "loss": -0.0155, + "num_tokens": 266325634.0, + "reward": 2.5793204307556152, + "reward_std": 0.15808260440826416, + "rewards/cosine_scaled_reward/mean": 0.8081632852554321, + "rewards/cosine_scaled_reward/std": 0.3798864781856537, + "rewards/repetition_penalty_reward/mean": -0.13743659853935242, + "rewards/repetition_penalty_reward/std": 0.043698474764823914, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626225590705872, + "rewards/reward_reference/mean": 0.91796875, + "rewards/reward_reference/std": 0.2749498784542084, + "step": 356 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4086.0, + "completions/mean_length": 3171.09765625, + "completions/mean_terminated_length": 3057.51318359375, + "completions/min_length": 1760.0, + "completions/min_terminated_length": 1760.0, + "epoch": 0.07616, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15009920299053192, + "learning_rate": 1e-06, + "loss": -0.0319, + "num_tokens": 267183507.0, + "reward": 2.5367894172668457, + "reward_std": 0.2662561535835266, + "rewards/cosine_scaled_reward/mean": 0.7738917469978333, + "rewards/cosine_scaled_reward/std": 0.428524911403656, + "rewards/repetition_penalty_reward/mean": -0.1316334754228592, + "rewards/repetition_penalty_reward/std": 0.04720846191048622, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.89453125, + "rewards/reward_reference/std": 0.3077581524848938, + "step": 357 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.25, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4031.0, + "completions/mean_length": 3140.75, + "completions/mean_terminated_length": 3041.930908203125, + "completions/min_length": 1678.0, + "completions/min_terminated_length": 1678.0, + "epoch": 0.07637333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1082305908203125, + "learning_rate": 1e-06, + "loss": -0.0221, + "num_tokens": 268049203.0, + "reward": 2.462947368621826, + "reward_std": 0.17016665637493134, + "rewards/cosine_scaled_reward/mean": 0.7361332178115845, + "rewards/cosine_scaled_reward/std": 0.4668603539466858, + "rewards/repetition_penalty_reward/mean": -0.13099819421768188, + "rewards/repetition_penalty_reward/std": 0.044717565178871155, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626225590705872, + "rewards/reward_reference/mean": 0.8671875, + "rewards/reward_reference/std": 0.3400367796421051, + "step": 358 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.3125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4075.0, + "completions/mean_length": 3161.84375, + "completions/mean_terminated_length": 3074.017333984375, + "completions/min_length": 1797.0, + "completions/min_terminated_length": 1797.0, + "epoch": 0.07658666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1251969188451767, + "learning_rate": 1e-06, + "loss": -0.047, + "num_tokens": 268935583.0, + "reward": 2.5594985485076904, + "reward_std": 0.1853671371936798, + "rewards/cosine_scaled_reward/mean": 0.7982162237167358, + "rewards/cosine_scaled_reward/std": 0.3940296471118927, + "rewards/repetition_penalty_reward/mean": -0.12934255599975586, + "rewards/repetition_penalty_reward/std": 0.04356498643755913, + "rewards/reward_format/mean": 0.984375, + "rewards/reward_format/std": 0.11092304438352585, + "rewards/reward_reference/mean": 0.90625, + "rewards/reward_reference/std": 0.2920515835285187, + "step": 359 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4094.0, + "completions/mean_length": 3258.12890625, + "completions/mean_terminated_length": 3155.232421875, + "completions/min_length": 1929.0, + "completions/min_terminated_length": 1929.0, + "epoch": 0.0768, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12503835558891296, + "learning_rate": 1e-06, + "loss": -0.0417, + "num_tokens": 269820536.0, + "reward": 2.60491943359375, + "reward_std": 0.19837068021297455, + "rewards/cosine_scaled_reward/mean": 0.8162407875061035, + "rewards/cosine_scaled_reward/std": 0.38866737484931946, + "rewards/repetition_penalty_reward/mean": -0.1292901486158371, + "rewards/repetition_penalty_reward/std": 0.04512140527367592, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.91796875, + "rewards/reward_reference/std": 0.2749498784542084, + "step": 360 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.65625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4088.0, + "completions/mean_length": 3298.13671875, + "completions/mean_terminated_length": 3137.065673828125, + "completions/min_length": 1868.0, + "completions/min_terminated_length": 1868.0, + "epoch": 0.07701333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16601675748825073, + "learning_rate": 1e-06, + "loss": -0.0397, + "num_tokens": 270647575.0, + "reward": 2.546705722808838, + "reward_std": 0.22834451496601105, + "rewards/cosine_scaled_reward/mean": 0.7922992706298828, + "rewards/cosine_scaled_reward/std": 0.4232928454875946, + "rewards/repetition_penalty_reward/mean": -0.12528109550476074, + "rewards/repetition_penalty_reward/std": 0.0411822609603405, + "rewards/reward_format/mean": 0.981249988079071, + "rewards/reward_format/std": 0.12126781791448593, + "rewards/reward_reference/mean": 0.8984375, + "rewards/reward_reference/std": 0.3026638329029083, + "step": 361 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.34375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4080.0, + "completions/mean_length": 3211.41796875, + "completions/mean_terminated_length": 3132.3701171875, + "completions/min_length": 1917.0, + "completions/min_terminated_length": 1917.0, + "epoch": 0.07722666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09665444493293762, + "learning_rate": 1e-06, + "loss": -0.0161, + "num_tokens": 271544550.0, + "reward": 2.56905460357666, + "reward_std": 0.10312886536121368, + "rewards/cosine_scaled_reward/mean": 0.7973703145980835, + "rewards/cosine_scaled_reward/std": 0.4079588055610657, + "rewards/repetition_penalty_reward/mean": -0.12675310671329498, + "rewards/repetition_penalty_reward/std": 0.046840231865644455, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.8984375, + "rewards/reward_reference/std": 0.3026638329029083, + "step": 362 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4089.0, + "completions/mean_length": 3309.27734375, + "completions/mean_terminated_length": 3184.683349609375, + "completions/min_length": 1826.0, + "completions/min_terminated_length": 1826.0, + "epoch": 0.07744, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19870643317699432, + "learning_rate": 1e-06, + "loss": -0.0334, + "num_tokens": 272415185.0, + "reward": 2.4216766357421875, + "reward_std": 0.30303946137428284, + "rewards/cosine_scaled_reward/mean": 0.7191956043243408, + "rewards/cosine_scaled_reward/std": 0.5092467069625854, + "rewards/repetition_penalty_reward/mean": -0.13423755764961243, + "rewards/repetition_penalty_reward/std": 0.04481777176260948, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.83984375, + "rewards/reward_reference/std": 0.36746934056282043, + "step": 363 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.03125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4088.0, + "completions/mean_length": 3166.5546875, + "completions/mean_terminated_length": 3038.497802734375, + "completions/min_length": 1780.0, + "completions/min_terminated_length": 1780.0, + "epoch": 0.07765333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11488021165132523, + "learning_rate": 1e-06, + "loss": -0.0241, + "num_tokens": 273262835.0, + "reward": 2.5682260990142822, + "reward_std": 0.1561729609966278, + "rewards/cosine_scaled_reward/mean": 0.7933893203735352, + "rewards/cosine_scaled_reward/std": 0.3992950916290283, + "rewards/repetition_penalty_reward/mean": -0.1314133107662201, + "rewards/repetition_penalty_reward/std": 0.05208537355065346, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.90625, + "rewards/reward_reference/std": 0.2920515835285187, + "step": 364 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.3125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4062.0, + "completions/mean_length": 3165.7890625, + "completions/mean_terminated_length": 3078.33349609375, + "completions/min_length": 2017.0, + "completions/min_terminated_length": 2017.0, + "epoch": 0.07786666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1785222291946411, + "learning_rate": 1e-06, + "loss": -0.024, + "num_tokens": 274143013.0, + "reward": 2.6224820613861084, + "reward_std": 0.25625211000442505, + "rewards/cosine_scaled_reward/mean": 0.8286576271057129, + "rewards/cosine_scaled_reward/std": 0.3511349558830261, + "rewards/repetition_penalty_reward/mean": -0.1366441696882248, + "rewards/repetition_penalty_reward/std": 0.04617421701550484, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 365 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.59375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4090.0, + "completions/mean_length": 3169.74609375, + "completions/mean_terminated_length": 3120.193359375, + "completions/min_length": 1721.0, + "completions/min_terminated_length": 1721.0, + "epoch": 0.07808, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11180980503559113, + "learning_rate": 1e-06, + "loss": -0.0098, + "num_tokens": 275077732.0, + "reward": 2.5677073001861572, + "reward_std": 0.1280892938375473, + "rewards/cosine_scaled_reward/mean": 0.792206883430481, + "rewards/cosine_scaled_reward/std": 0.40753600001335144, + "rewards/repetition_penalty_reward/mean": -0.12684330344200134, + "rewards/repetition_penalty_reward/std": 0.04561019316315651, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.90234375, + "rewards/reward_reference/std": 0.29743078351020813, + "step": 366 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.0625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4085.0, + "completions/mean_length": 3230.05078125, + "completions/mean_terminated_length": 3115.101806640625, + "completions/min_length": 2011.0, + "completions/min_terminated_length": 2011.0, + "epoch": 0.07829333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15557599067687988, + "learning_rate": 1e-06, + "loss": -0.0496, + "num_tokens": 275945429.0, + "reward": 2.524695873260498, + "reward_std": 0.2952307164669037, + "rewards/cosine_scaled_reward/mean": 0.7712620496749878, + "rewards/cosine_scaled_reward/std": 0.4414149224758148, + "rewards/repetition_penalty_reward/mean": -0.13094106316566467, + "rewards/repetition_penalty_reward/std": 0.04363211616873741, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.890625, + "rewards/reward_reference/std": 0.31272050738334656, + "step": 367 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.5, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4096.0, + "completions/mean_length": 3075.4921875, + "completions/mean_terminated_length": 3007.45849609375, + "completions/min_length": 2109.0, + "completions/min_terminated_length": 2109.0, + "epoch": 0.07850666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10304121673107147, + "learning_rate": 1e-06, + "loss": -0.0082, + "num_tokens": 276824667.0, + "reward": 2.6640024185180664, + "reward_std": 0.15301068127155304, + "rewards/cosine_scaled_reward/mean": 0.8460245132446289, + "rewards/cosine_scaled_reward/std": 0.29841554164886475, + "rewards/repetition_penalty_reward/mean": -0.13124100863933563, + "rewards/repetition_penalty_reward/std": 0.04055340588092804, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.94921875, + "rewards/reward_reference/std": 0.21998079121112823, + "step": 368 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.4375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4096.0, + "completions/mean_length": 3073.58984375, + "completions/mean_terminated_length": 2996.264892578125, + "completions/min_length": 2018.0, + "completions/min_terminated_length": 2018.0, + "epoch": 0.07872, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16008536517620087, + "learning_rate": 1e-06, + "loss": -0.0362, + "num_tokens": 277701158.0, + "reward": 2.486879825592041, + "reward_std": 0.22418928146362305, + "rewards/cosine_scaled_reward/mean": 0.7446539402008057, + "rewards/cosine_scaled_reward/std": 0.4457184672355652, + "rewards/repetition_penalty_reward/mean": -0.13355520367622375, + "rewards/repetition_penalty_reward/std": 0.04534250125288963, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.87890625, + "rewards/reward_reference/std": 0.3268752694129944, + "step": 369 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.6875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4073.0, + "completions/mean_length": 3048.96484375, + "completions/mean_terminated_length": 3006.40234375, + "completions/min_length": 1608.0, + "completions/min_terminated_length": 1608.0, + "epoch": 0.07893333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07023734599351883, + "learning_rate": 1e-06, + "loss": -0.0135, + "num_tokens": 278601961.0, + "reward": 2.585810661315918, + "reward_std": 0.12515553832054138, + "rewards/cosine_scaled_reward/mean": 0.8022688627243042, + "rewards/cosine_scaled_reward/std": 0.36715206503868103, + "rewards/repetition_penalty_reward/mean": -0.13520832359790802, + "rewards/repetition_penalty_reward/std": 0.04460633173584938, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.921875, + "rewards/reward_reference/std": 0.26889389753341675, + "step": 370 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4045.0, + "completions/mean_length": 3106.94921875, + "completions/mean_terminated_length": 3023.13134765625, + "completions/min_length": 1856.0, + "completions/min_terminated_length": 1856.0, + "epoch": 0.07914666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13042612373828888, + "learning_rate": 1e-06, + "loss": -0.0059, + "num_tokens": 279480480.0, + "reward": 2.483642578125, + "reward_std": 0.2134016901254654, + "rewards/cosine_scaled_reward/mean": 0.7506895065307617, + "rewards/cosine_scaled_reward/std": 0.44514280557632446, + "rewards/repetition_penalty_reward/mean": -0.14282819628715515, + "rewards/repetition_penalty_reward/std": 0.04998833313584328, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.87890625, + "rewards/reward_reference/std": 0.3268752694129944, + "step": 371 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.59375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4080.0, + "completions/mean_length": 3048.23828125, + "completions/mean_terminated_length": 2992.18505859375, + "completions/min_length": 2046.0, + "completions/min_terminated_length": 2046.0, + "epoch": 0.07936, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14418748021125793, + "learning_rate": 1e-06, + "loss": -0.0066, + "num_tokens": 280377481.0, + "reward": 2.578580856323242, + "reward_std": 0.20151178538799286, + "rewards/cosine_scaled_reward/mean": 0.8029334545135498, + "rewards/cosine_scaled_reward/std": 0.3652251064777374, + "rewards/repetition_penalty_reward/mean": -0.14700883626937866, + "rewards/repetition_penalty_reward/std": 0.05446892976760864, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.92578125, + "rewards/reward_reference/std": 0.2626400291919708, + "step": 372 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4043.0, + "completions/mean_length": 3071.16015625, + "completions/mean_terminated_length": 3020.758056640625, + "completions/min_length": 1699.0, + "completions/min_terminated_length": 1699.0, + "epoch": 0.07957333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1292157769203186, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 281280130.0, + "reward": 2.559688091278076, + "reward_std": 0.18442192673683167, + "rewards/cosine_scaled_reward/mean": 0.7905406951904297, + "rewards/cosine_scaled_reward/std": 0.39117321372032166, + "rewards/repetition_penalty_reward/mean": -0.14100870490074158, + "rewards/repetition_penalty_reward/std": 0.049596257507801056, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.91015625, + "rewards/reward_reference/std": 0.2865179479122162, + "step": 373 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4083.0, + "completions/mean_length": 3003.04296875, + "completions/mean_terminated_length": 2949.290771484375, + "completions/min_length": 1669.0, + "completions/min_terminated_length": 1669.0, + "epoch": 0.07978666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1299491673707962, + "learning_rate": 1e-06, + "loss": -0.0406, + "num_tokens": 282171045.0, + "reward": 2.5163817405700684, + "reward_std": 0.19801975786685944, + "rewards/cosine_scaled_reward/mean": 0.7635452747344971, + "rewards/cosine_scaled_reward/std": 0.41028037667274475, + "rewards/repetition_penalty_reward/mean": -0.1495072841644287, + "rewards/repetition_penalty_reward/std": 0.05092966929078102, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.90234375, + "rewards/reward_reference/std": 0.29743078351020813, + "step": 374 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.65625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4077.0, + "completions/mean_length": 2962.68359375, + "completions/mean_terminated_length": 2911.7998046875, + "completions/min_length": 2057.0, + "completions/min_terminated_length": 2057.0, + "epoch": 0.08, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12212635576725006, + "learning_rate": 1e-06, + "loss": -0.012, + "num_tokens": 283054620.0, + "reward": 2.554882526397705, + "reward_std": 0.17215952277183533, + "rewards/cosine_scaled_reward/mean": 0.7953221797943115, + "rewards/cosine_scaled_reward/std": 0.3597381114959717, + "rewards/repetition_penalty_reward/mean": -0.15293972194194794, + "rewards/repetition_penalty_reward/std": 0.05043653026223183, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626226335763931, + "rewards/reward_reference/mean": 0.921875, + "rewards/reward_reference/std": 0.26889389753341675, + "step": 375 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.59375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4040.0, + "completions/mean_length": 2971.55859375, + "completions/mean_terminated_length": 2911.4033203125, + "completions/min_length": 1506.0, + "completions/min_terminated_length": 1506.0, + "epoch": 0.08021333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2789333164691925, + "learning_rate": 1e-06, + "loss": -0.0239, + "num_tokens": 283915795.0, + "reward": 2.545088291168213, + "reward_std": 0.16484610736370087, + "rewards/cosine_scaled_reward/mean": 0.7841353416442871, + "rewards/cosine_scaled_reward/std": 0.375940203666687, + "rewards/repetition_penalty_reward/mean": -0.15076559782028198, + "rewards/repetition_penalty_reward/std": 0.05685606971383095, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718994140625, + "rewards/reward_reference/mean": 0.91796875, + "rewards/reward_reference/std": 0.2749498784542084, + "step": 376 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.71875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4034.0, + "completions/mean_length": 3045.81640625, + "completions/mean_terminated_length": 3007.55078125, + "completions/min_length": 1876.0, + "completions/min_terminated_length": 1876.0, + "epoch": 0.08042666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16344138979911804, + "learning_rate": 1e-06, + "loss": -0.0072, + "num_tokens": 284821132.0, + "reward": 2.5898828506469727, + "reward_std": 0.20654936134815216, + "rewards/cosine_scaled_reward/mean": 0.8137679696083069, + "rewards/cosine_scaled_reward/std": 0.35199347138404846, + "rewards/repetition_penalty_reward/mean": -0.14654135704040527, + "rewards/repetition_penalty_reward/std": 0.047246113419532776, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.92578125, + "rewards/reward_reference/std": 0.2626400291919708, + "step": 377 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.8125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4095.0, + "completions/mean_length": 3033.90625, + "completions/mean_terminated_length": 3008.416259765625, + "completions/min_length": 1850.0, + "completions/min_terminated_length": 1850.0, + "epoch": 0.08064, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1035616546869278, + "learning_rate": 1e-06, + "loss": -0.0151, + "num_tokens": 285739580.0, + "reward": 2.6008670330047607, + "reward_std": 0.09359882771968842, + "rewards/cosine_scaled_reward/mean": 0.8161867260932922, + "rewards/cosine_scaled_reward/std": 0.34512314200401306, + "rewards/repetition_penalty_reward/mean": -0.137975811958313, + "rewards/repetition_penalty_reward/std": 0.043298136442899704, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.92578125, + "rewards/reward_reference/std": 0.2626400291919708, + "step": 378 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.5, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4026.0, + "completions/mean_length": 3077.109375, + "completions/mean_terminated_length": 3009.18359375, + "completions/min_length": 1779.0, + "completions/min_terminated_length": 1779.0, + "epoch": 0.08085333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06813926994800568, + "learning_rate": 1e-06, + "loss": -0.0071, + "num_tokens": 286625760.0, + "reward": 2.673771858215332, + "reward_std": 0.09984706342220306, + "rewards/cosine_scaled_reward/mean": 0.8574278354644775, + "rewards/cosine_scaled_reward/std": 0.27632588148117065, + "rewards/repetition_penalty_reward/mean": -0.14068728685379028, + "rewards/repetition_penalty_reward/std": 0.04103941097855568, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.95703125, + "rewards/reward_reference/std": 0.20318391919136047, + "step": 379 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.71875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4087.0, + "completions/mean_length": 2952.99609375, + "completions/mean_terminated_length": 2911.34814453125, + "completions/min_length": 1943.0, + "completions/min_terminated_length": 1943.0, + "epoch": 0.08106666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14166662096977234, + "learning_rate": 1e-06, + "loss": -0.0071, + "num_tokens": 287499027.0, + "reward": 2.5731148719787598, + "reward_std": 0.19441863894462585, + "rewards/cosine_scaled_reward/mean": 0.7902377843856812, + "rewards/cosine_scaled_reward/std": 0.3644700348377228, + "rewards/repetition_penalty_reward/mean": -0.13899797201156616, + "rewards/repetition_penalty_reward/std": 0.050455491989851, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.921875, + "rewards/reward_reference/std": 0.26889389753341675, + "step": 380 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4068.0, + "completions/mean_length": 3016.1484375, + "completions/mean_terminated_length": 2963.040771484375, + "completions/min_length": 1691.0, + "completions/min_terminated_length": 1691.0, + "epoch": 0.08128, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22079779207706451, + "learning_rate": 1e-06, + "loss": -0.0234, + "num_tokens": 288380001.0, + "reward": 2.5718040466308594, + "reward_std": 0.11867351830005646, + "rewards/cosine_scaled_reward/mean": 0.7975947260856628, + "rewards/cosine_scaled_reward/std": 0.3700672686100006, + "rewards/repetition_penalty_reward/mean": -0.13985300064086914, + "rewards/repetition_penalty_reward/std": 0.051185525953769684, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9140625, + "rewards/reward_reference/std": 0.28082075715065, + "step": 381 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.46875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4091.0, + "completions/mean_length": 3013.15234375, + "completions/mean_terminated_length": 2936.129638671875, + "completions/min_length": 1764.0, + "completions/min_terminated_length": 1764.0, + "epoch": 0.08149333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15201827883720398, + "learning_rate": 1e-06, + "loss": 0.0064, + "num_tokens": 289252860.0, + "reward": 2.666290283203125, + "reward_std": 0.17419229447841644, + "rewards/cosine_scaled_reward/mean": 0.8443589210510254, + "rewards/cosine_scaled_reward/std": 0.2822229564189911, + "rewards/repetition_penalty_reward/mean": -0.13509991765022278, + "rewards/repetition_penalty_reward/std": 0.042069874703884125, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.95703125, + "rewards/reward_reference/std": 0.20318391919136047, + "step": 382 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.71875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4084.0, + "completions/mean_length": 3070.90625, + "completions/mean_terminated_length": 3033.5546875, + "completions/min_length": 2053.0, + "completions/min_terminated_length": 2053.0, + "epoch": 0.08170666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13215400278568268, + "learning_rate": 1e-06, + "loss": -0.0289, + "num_tokens": 290172020.0, + "reward": 2.6237735748291016, + "reward_std": 0.19740281999111176, + "rewards/cosine_scaled_reward/mean": 0.8225434422492981, + "rewards/cosine_scaled_reward/std": 0.34391024708747864, + "rewards/repetition_penalty_reward/mean": -0.1323637068271637, + "rewards/repetition_penalty_reward/std": 0.042291488498449326, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 383 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.59375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4096.0, + "completions/mean_length": 2975.4296875, + "completions/mean_terminated_length": 2915.4814453125, + "completions/min_length": 1947.0, + "completions/min_terminated_length": 1947.0, + "epoch": 0.08192, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10557766258716583, + "learning_rate": 1e-06, + "loss": -0.0438, + "num_tokens": 291036618.0, + "reward": 2.6430983543395996, + "reward_std": 0.1428372859954834, + "rewards/cosine_scaled_reward/mean": 0.8273340463638306, + "rewards/cosine_scaled_reward/std": 0.30892860889434814, + "rewards/repetition_penalty_reward/mean": -0.13032954931259155, + "rewards/repetition_penalty_reward/std": 0.04752003774046898, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.94921875, + "rewards/reward_reference/std": 0.21998079121112823, + "step": 384 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.34375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4054.0, + "completions/mean_length": 3062.87890625, + "completions/mean_terminated_length": 2970.557373046875, + "completions/min_length": 1749.0, + "completions/min_terminated_length": 1749.0, + "epoch": 0.08213333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08981072157621384, + "learning_rate": 1e-06, + "loss": -0.0342, + "num_tokens": 291894135.0, + "reward": 2.580354690551758, + "reward_std": 0.17908355593681335, + "rewards/cosine_scaled_reward/mean": 0.7964844107627869, + "rewards/cosine_scaled_reward/std": 0.3772408068180084, + "rewards/repetition_penalty_reward/mean": -0.13487963378429413, + "rewards/repetition_penalty_reward/std": 0.046523887664079666, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.921875, + "rewards/reward_reference/std": 0.26889389753341675, + "step": 385 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.46875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4096.0, + "completions/mean_length": 3078.24609375, + "completions/mean_terminated_length": 3005.853515625, + "completions/min_length": 1897.0, + "completions/min_terminated_length": 1897.0, + "epoch": 0.08234666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11945135146379471, + "learning_rate": 1e-06, + "loss": -0.0369, + "num_tokens": 292775774.0, + "reward": 2.579617500305176, + "reward_std": 0.22266732156276703, + "rewards/cosine_scaled_reward/mean": 0.799963116645813, + "rewards/cosine_scaled_reward/std": 0.377466082572937, + "rewards/repetition_penalty_reward/mean": -0.13128305971622467, + "rewards/repetition_penalty_reward/std": 0.053302206099033356, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.9140625, + "rewards/reward_reference/std": 0.28082075715065, + "step": 386 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.4375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4091.0, + "completions/mean_length": 3053.4765625, + "completions/mean_terminated_length": 2974.63037109375, + "completions/min_length": 2059.0, + "completions/min_terminated_length": 2059.0, + "epoch": 0.08256, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.134053573012352, + "learning_rate": 1e-06, + "loss": -0.0201, + "num_tokens": 293636572.0, + "reward": 2.5145716667175293, + "reward_std": 0.18531867861747742, + "rewards/cosine_scaled_reward/mean": 0.7566462755203247, + "rewards/cosine_scaled_reward/std": 0.42772749066352844, + "rewards/repetition_penalty_reward/mean": -0.12644967436790466, + "rewards/repetition_penalty_reward/std": 0.04349347576498985, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.890625, + "rewards/reward_reference/std": 0.31272050738334656, + "step": 387 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4074.0, + "completions/mean_length": 3031.671875, + "completions/mean_terminated_length": 2979.32763671875, + "completions/min_length": 1821.0, + "completions/min_terminated_length": 1821.0, + "epoch": 0.08277333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1286914199590683, + "learning_rate": 1e-06, + "loss": -0.0467, + "num_tokens": 294520200.0, + "reward": 2.630145788192749, + "reward_std": 0.20631247758865356, + "rewards/cosine_scaled_reward/mean": 0.8196536302566528, + "rewards/cosine_scaled_reward/std": 0.33500272035598755, + "rewards/repetition_penalty_reward/mean": -0.12700791656970978, + "rewards/repetition_penalty_reward/std": 0.0510396771132946, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9375, + "rewards/reward_reference/std": 0.24253563582897186, + "step": 388 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.4375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4095.0, + "completions/mean_length": 3092.9375, + "completions/mean_terminated_length": 3017.07568359375, + "completions/min_length": 1895.0, + "completions/min_terminated_length": 1895.0, + "epoch": 0.08298666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18327857553958893, + "learning_rate": 1e-06, + "loss": -0.0161, + "num_tokens": 295398600.0, + "reward": 2.6226136684417725, + "reward_std": 0.2285882830619812, + "rewards/cosine_scaled_reward/mean": 0.8175864219665527, + "rewards/cosine_scaled_reward/std": 0.35323092341423035, + "rewards/repetition_penalty_reward/mean": -0.12231657654047012, + "rewards/repetition_penalty_reward/std": 0.04444298893213272, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 389 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.53125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4083.0, + "completions/mean_length": 3212.0703125, + "completions/mean_terminated_length": 3157.05419921875, + "completions/min_length": 1879.0, + "completions/min_terminated_length": 1879.0, + "epoch": 0.0832, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10938741266727448, + "learning_rate": 1e-06, + "loss": -0.0044, + "num_tokens": 296325970.0, + "reward": 2.5627360343933105, + "reward_std": 0.16607502102851868, + "rewards/cosine_scaled_reward/mean": 0.7930000424385071, + "rewards/cosine_scaled_reward/std": 0.4169759750366211, + "rewards/repetition_penalty_reward/mean": -0.13260750472545624, + "rewards/repetition_penalty_reward/std": 0.04929269477725029, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.90234375, + "rewards/reward_reference/std": 0.29743078351020813, + "step": 390 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.5, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4092.0, + "completions/mean_length": 3147.953125, + "completions/mean_terminated_length": 3084.750244140625, + "completions/min_length": 1960.0, + "completions/min_terminated_length": 1960.0, + "epoch": 0.08341333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16152621805667877, + "learning_rate": 1e-06, + "loss": -0.0148, + "num_tokens": 297223618.0, + "reward": 2.524066925048828, + "reward_std": 0.1699940264225006, + "rewards/cosine_scaled_reward/mean": 0.767318606376648, + "rewards/cosine_scaled_reward/std": 0.4368087351322174, + "rewards/repetition_penalty_reward/mean": -0.1299704909324646, + "rewards/repetition_penalty_reward/std": 0.04877452179789543, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.88671875, + "rewards/reward_reference/std": 0.31755712628364563, + "step": 391 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.21875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4095.0, + "completions/mean_length": 3236.640625, + "completions/mean_terminated_length": 3143.636474609375, + "completions/min_length": 2051.0, + "completions/min_terminated_length": 2051.0, + "epoch": 0.08362666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12291693687438965, + "learning_rate": 1e-06, + "loss": -0.0316, + "num_tokens": 298113694.0, + "reward": 2.5816733837127686, + "reward_std": 0.18931283056735992, + "rewards/cosine_scaled_reward/mean": 0.8049663305282593, + "rewards/cosine_scaled_reward/std": 0.4033796489238739, + "rewards/repetition_penalty_reward/mean": -0.12641799449920654, + "rewards/repetition_penalty_reward/std": 0.03853068873286247, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.90625, + "rewards/reward_reference/std": 0.2920515835285187, + "step": 392 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.5625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4096.0, + "completions/mean_length": 3092.8359375, + "completions/mean_terminated_length": 3034.801513671875, + "completions/min_length": 1967.0, + "completions/min_terminated_length": 1967.0, + "epoch": 0.08384, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12058772891759872, + "learning_rate": 1e-06, + "loss": 0.0065, + "num_tokens": 299006500.0, + "reward": 2.593824863433838, + "reward_std": 0.1544109284877777, + "rewards/cosine_scaled_reward/mean": 0.8009454011917114, + "rewards/cosine_scaled_reward/std": 0.3817402124404907, + "rewards/repetition_penalty_reward/mean": -0.12196415662765503, + "rewards/repetition_penalty_reward/std": 0.045104991644620895, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.91796875, + "rewards/reward_reference/std": 0.2749498784542084, + "step": 393 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.53125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4095.0, + "completions/mean_length": 3119.30078125, + "completions/mean_terminated_length": 3058.510498046875, + "completions/min_length": 1893.0, + "completions/min_terminated_length": 1893.0, + "epoch": 0.08405333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06525684148073196, + "learning_rate": 1e-06, + "loss": -0.0213, + "num_tokens": 299904713.0, + "reward": 2.593860626220703, + "reward_std": 0.09809240698814392, + "rewards/cosine_scaled_reward/mean": 0.8014312386512756, + "rewards/cosine_scaled_reward/std": 0.38566121459007263, + "rewards/repetition_penalty_reward/mean": -0.12553951144218445, + "rewards/repetition_penalty_reward/std": 0.053088318556547165, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.91796875, + "rewards/reward_reference/std": 0.2749498784542084, + "step": 394 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.59375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4093.0, + "completions/mean_length": 3153.375, + "completions/mean_terminated_length": 3102.9462890625, + "completions/min_length": 1773.0, + "completions/min_terminated_length": 1773.0, + "epoch": 0.08426666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14195215702056885, + "learning_rate": 1e-06, + "loss": -0.021, + "num_tokens": 300820209.0, + "reward": 2.5252044200897217, + "reward_std": 0.17579331994056702, + "rewards/cosine_scaled_reward/mean": 0.7644021511077881, + "rewards/cosine_scaled_reward/std": 0.4409472942352295, + "rewards/repetition_penalty_reward/mean": -0.12279140949249268, + "rewards/repetition_penalty_reward/std": 0.03642672300338745, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.88671875, + "rewards/reward_reference/std": 0.31755712628364563, + "step": 395 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.5625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4094.0, + "completions/mean_length": 3192.4765625, + "completions/mean_terminated_length": 3140.20654296875, + "completions/min_length": 1863.0, + "completions/min_terminated_length": 1863.0, + "epoch": 0.08448, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13026125729084015, + "learning_rate": 1e-06, + "loss": -0.0239, + "num_tokens": 301749675.0, + "reward": 2.576292037963867, + "reward_std": 0.2108149230480194, + "rewards/cosine_scaled_reward/mean": 0.7987449169158936, + "rewards/cosine_scaled_reward/std": 0.4059189260005951, + "rewards/repetition_penalty_reward/mean": -0.12557795643806458, + "rewards/repetition_penalty_reward/std": 0.03922073915600777, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.90625, + "rewards/reward_reference/std": 0.2920515835285187, + "step": 396 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.59375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4066.0, + "completions/mean_length": 3099.82421875, + "completions/mean_terminated_length": 3046.53076171875, + "completions/min_length": 1668.0, + "completions/min_terminated_length": 1668.0, + "epoch": 0.08469333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12833480536937714, + "learning_rate": 1e-06, + "loss": -0.0094, + "num_tokens": 302647086.0, + "reward": 2.6329143047332764, + "reward_std": 0.1479964256286621, + "rewards/cosine_scaled_reward/mean": 0.8212152123451233, + "rewards/cosine_scaled_reward/std": 0.3519037067890167, + "rewards/repetition_penalty_reward/mean": -0.12189459800720215, + "rewards/repetition_penalty_reward/std": 0.04479314014315605, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 397 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.75, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4096.0, + "completions/mean_length": 3087.08203125, + "completions/mean_terminated_length": 3054.5361328125, + "completions/min_length": 1757.0, + "completions/min_terminated_length": 1757.0, + "epoch": 0.08490666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09485691785812378, + "learning_rate": 1e-06, + "loss": 0.015, + "num_tokens": 303571071.0, + "reward": 2.6275954246520996, + "reward_std": 0.11551915854215622, + "rewards/cosine_scaled_reward/mean": 0.8233013153076172, + "rewards/cosine_scaled_reward/std": 0.344821572303772, + "rewards/repetition_penalty_reward/mean": -0.12929978966712952, + "rewards/repetition_penalty_reward/std": 0.04905636981129646, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 398 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.5, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3986.0, + "completions/mean_length": 3061.21484375, + "completions/mean_terminated_length": 2992.229248046875, + "completions/min_length": 1874.0, + "completions/min_terminated_length": 1874.0, + "epoch": 0.08512, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11544538289308548, + "learning_rate": 1e-06, + "loss": -0.0454, + "num_tokens": 304450226.0, + "reward": 2.679389476776123, + "reward_std": 0.20466876029968262, + "rewards/cosine_scaled_reward/mean": 0.8510940074920654, + "rewards/cosine_scaled_reward/std": 0.2858965992927551, + "rewards/repetition_penalty_reward/mean": -0.12873569130897522, + "rewards/repetition_penalty_reward/std": 0.046985287219285965, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.95703125, + "rewards/reward_reference/std": 0.20318391919136047, + "step": 399 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.46875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4010.0, + "completions/mean_length": 3165.70703125, + "completions/mean_terminated_length": 3099.535400390625, + "completions/min_length": 2123.0, + "completions/min_terminated_length": 2123.0, + "epoch": 0.08533333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11605282127857208, + "learning_rate": 1e-06, + "loss": -0.0212, + "num_tokens": 305356367.0, + "reward": 2.6028542518615723, + "reward_std": 0.14681796729564667, + "rewards/cosine_scaled_reward/mean": 0.817654013633728, + "rewards/cosine_scaled_reward/std": 0.37154901027679443, + "rewards/repetition_penalty_reward/mean": -0.13433128595352173, + "rewards/repetition_penalty_reward/std": 0.039992932230234146, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718994140625, + "rewards/reward_reference/mean": 0.92578125, + "rewards/reward_reference/std": 0.2626400291919708, + "step": 400 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4072.0, + "completions/mean_length": 3128.8515625, + "completions/mean_terminated_length": 3113.500244140625, + "completions/min_length": 1981.0, + "completions/min_terminated_length": 1981.0, + "epoch": 0.08554666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1029190719127655, + "learning_rate": 1e-06, + "loss": -0.014, + "num_tokens": 306299541.0, + "reward": 2.6625068187713623, + "reward_std": 0.11479123681783676, + "rewards/cosine_scaled_reward/mean": 0.8496817946434021, + "rewards/cosine_scaled_reward/std": 0.31631430983543396, + "rewards/repetition_penalty_reward/mean": -0.12936252355575562, + "rewards/repetition_penalty_reward/std": 0.03983324021100998, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.9453125, + "rewards/reward_reference/std": 0.22781464457511902, + "step": 401 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4068.0, + "completions/mean_length": 3188.91015625, + "completions/mean_terminated_length": 3112.0380859375, + "completions/min_length": 1893.0, + "completions/min_terminated_length": 1893.0, + "epoch": 0.08576, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15154629945755005, + "learning_rate": 1e-06, + "loss": -0.03, + "num_tokens": 307196046.0, + "reward": 2.497952461242676, + "reward_std": 0.2996126413345337, + "rewards/cosine_scaled_reward/mean": 0.7567933201789856, + "rewards/cosine_scaled_reward/std": 0.4561953842639923, + "rewards/repetition_penalty_reward/mean": -0.13384085893630981, + "rewards/repetition_penalty_reward/std": 0.05501502379775047, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.875, + "rewards/reward_reference/std": 0.33136674761772156, + "step": 402 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.59375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4089.0, + "completions/mean_length": 3156.04296875, + "completions/mean_terminated_length": 3105.757080078125, + "completions/min_length": 2047.0, + "completions/min_terminated_length": 2047.0, + "epoch": 0.08597333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10560856759548187, + "learning_rate": 1e-06, + "loss": -0.0117, + "num_tokens": 308107361.0, + "reward": 2.6281182765960693, + "reward_std": 0.15326428413391113, + "rewards/cosine_scaled_reward/mean": 0.8252940773963928, + "rewards/cosine_scaled_reward/std": 0.35686615109443665, + "rewards/repetition_penalty_reward/mean": -0.1268633008003235, + "rewards/repetition_penalty_reward/std": 0.04566507786512375, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9296875, + "rewards/reward_reference/std": 0.2561737895011902, + "step": 403 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.34375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4053.0, + "completions/mean_length": 3228.66015625, + "completions/mean_terminated_length": 3151.153076171875, + "completions/min_length": 1917.0, + "completions/min_terminated_length": 1917.0, + "epoch": 0.08618666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16378062963485718, + "learning_rate": 1e-06, + "loss": -0.0199, + "num_tokens": 309010934.0, + "reward": 2.417421579360962, + "reward_std": 0.20660261809825897, + "rewards/cosine_scaled_reward/mean": 0.7245820760726929, + "rewards/cosine_scaled_reward/std": 0.4962194561958313, + "rewards/repetition_penalty_reward/mean": -0.1423168182373047, + "rewards/repetition_penalty_reward/std": 0.058403536677360535, + "rewards/reward_format/mean": 0.987500011920929, + "rewards/reward_format/std": 0.0994100272655487, + "rewards/reward_reference/mean": 0.84765625, + "rewards/reward_reference/std": 0.3600577116012573, + "step": 404 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.34375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4065.0, + "completions/mean_length": 3154.10546875, + "completions/mean_terminated_length": 3069.93603515625, + "completions/min_length": 1942.0, + "completions/min_terminated_length": 1942.0, + "epoch": 0.0864, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11778714507818222, + "learning_rate": 1e-06, + "loss": -0.0457, + "num_tokens": 309891297.0, + "reward": 2.558377742767334, + "reward_std": 0.24009853601455688, + "rewards/cosine_scaled_reward/mean": 0.7882504463195801, + "rewards/cosine_scaled_reward/std": 0.40825164318084717, + "rewards/repetition_penalty_reward/mean": -0.1298729032278061, + "rewards/repetition_penalty_reward/std": 0.054273542016744614, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.90625, + "rewards/reward_reference/std": 0.2920515835285187, + "step": 405 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.34375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4047.0, + "completions/mean_length": 3222.48828125, + "completions/mean_terminated_length": 3144.4296875, + "completions/min_length": 2138.0, + "completions/min_terminated_length": 2138.0, + "epoch": 0.08661333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18284331262111664, + "learning_rate": 1e-06, + "loss": -0.0156, + "num_tokens": 310788626.0, + "reward": 2.5298140048980713, + "reward_std": 0.2251349836587906, + "rewards/cosine_scaled_reward/mean": 0.776664674282074, + "rewards/cosine_scaled_reward/std": 0.43793678283691406, + "rewards/repetition_penalty_reward/mean": -0.13435065746307373, + "rewards/repetition_penalty_reward/std": 0.045195017009973526, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.890625, + "rewards/reward_reference/std": 0.31272050738334656, + "step": 406 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.53125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4077.0, + "completions/mean_length": 3215.89453125, + "completions/mean_terminated_length": 3161.116455078125, + "completions/min_length": 1896.0, + "completions/min_terminated_length": 1896.0, + "epoch": 0.08682666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1203547865152359, + "learning_rate": 1e-06, + "loss": -0.011, + "num_tokens": 311714119.0, + "reward": 2.6104822158813477, + "reward_std": 0.14991873502731323, + "rewards/cosine_scaled_reward/mean": 0.8213518857955933, + "rewards/cosine_scaled_reward/std": 0.3793308436870575, + "rewards/repetition_penalty_reward/mean": -0.13274472951889038, + "rewards/repetition_penalty_reward/std": 0.04116351529955864, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.921875, + "rewards/reward_reference/std": 0.26889389753341675, + "step": 407 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.40625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4090.0, + "completions/mean_length": 3208.79296875, + "completions/mean_terminated_length": 3137.66650390625, + "completions/min_length": 1861.0, + "completions/min_terminated_length": 1861.0, + "epoch": 0.08704, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1600077599287033, + "learning_rate": 1e-06, + "loss": 0.0214, + "num_tokens": 312615894.0, + "reward": 2.5284624099731445, + "reward_std": 0.28709399700164795, + "rewards/cosine_scaled_reward/mean": 0.7823736667633057, + "rewards/cosine_scaled_reward/std": 0.4272303581237793, + "rewards/repetition_penalty_reward/mean": -0.13906735181808472, + "rewards/repetition_penalty_reward/std": 0.050136372447013855, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626226335763931, + "rewards/reward_reference/mean": 0.89453125, + "rewards/reward_reference/std": 0.3077581524848938, + "step": 408 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4077.0, + "completions/mean_length": 3203.31640625, + "completions/mean_terminated_length": 3127.665283203125, + "completions/min_length": 2044.0, + "completions/min_terminated_length": 2044.0, + "epoch": 0.08725333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11707901954650879, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 313510083.0, + "reward": 2.570059061050415, + "reward_std": 0.19044455885887146, + "rewards/cosine_scaled_reward/mean": 0.8027787208557129, + "rewards/cosine_scaled_reward/std": 0.39987897872924805, + "rewards/repetition_penalty_reward/mean": -0.13662593066692352, + "rewards/repetition_penalty_reward/std": 0.05554681271314621, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.91015625, + "rewards/reward_reference/std": 0.2865179479122162, + "step": 409 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.4375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4071.0, + "completions/mean_length": 3137.3671875, + "completions/mean_terminated_length": 3064.86572265625, + "completions/min_length": 1734.0, + "completions/min_terminated_length": 1734.0, + "epoch": 0.08746666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14157453179359436, + "learning_rate": 1e-06, + "loss": -0.0103, + "num_tokens": 314396293.0, + "reward": 2.5433688163757324, + "reward_std": 0.20625907182693481, + "rewards/cosine_scaled_reward/mean": 0.7789784669876099, + "rewards/cosine_scaled_reward/std": 0.4177873134613037, + "rewards/repetition_penalty_reward/mean": -0.13795334100723267, + "rewards/repetition_penalty_reward/std": 0.05312160775065422, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.90234375, + "rewards/reward_reference/std": 0.29743078351020813, + "step": 410 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.59375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4058.0, + "completions/mean_length": 3174.421875, + "completions/mean_terminated_length": 3125.119140625, + "completions/min_length": 2025.0, + "completions/min_terminated_length": 2025.0, + "epoch": 0.08768, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1355607509613037, + "learning_rate": 1e-06, + "loss": -0.0076, + "num_tokens": 315326941.0, + "reward": 2.5864953994750977, + "reward_std": 0.20761072635650635, + "rewards/cosine_scaled_reward/mean": 0.8066941499710083, + "rewards/cosine_scaled_reward/std": 0.39073434472084045, + "rewards/repetition_penalty_reward/mean": -0.1311362087726593, + "rewards/repetition_penalty_reward/std": 0.0458197258412838, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.9140625, + "rewards/reward_reference/std": 0.28082075715065, + "step": 411 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.59375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4087.0, + "completions/mean_length": 3194.16796875, + "completions/mean_terminated_length": 3145.921630859375, + "completions/min_length": 2059.0, + "completions/min_terminated_length": 2059.0, + "epoch": 0.08789333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06964617222547531, + "learning_rate": 1e-06, + "loss": -0.0033, + "num_tokens": 316257524.0, + "reward": 2.6956582069396973, + "reward_std": 0.08016351610422134, + "rewards/cosine_scaled_reward/mean": 0.8655639886856079, + "rewards/cosine_scaled_reward/std": 0.2969217598438263, + "rewards/repetition_penalty_reward/mean": -0.12303093820810318, + "rewards/repetition_penalty_reward/std": 0.044344719499349594, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.953125, + "rewards/reward_reference/std": 0.21178513765335083, + "step": 412 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.71875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4088.0, + "completions/mean_length": 3134.00390625, + "completions/mean_terminated_length": 3098.951416015625, + "completions/min_length": 1788.0, + "completions/min_terminated_length": 1788.0, + "epoch": 0.08810666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11384371668100357, + "learning_rate": 1e-06, + "loss": -0.0179, + "num_tokens": 317185773.0, + "reward": 2.6399922370910645, + "reward_std": 0.1448422223329544, + "rewards/cosine_scaled_reward/mean": 0.8305256366729736, + "rewards/cosine_scaled_reward/std": 0.34458062052726746, + "rewards/repetition_penalty_reward/mean": -0.12412697076797485, + "rewards/repetition_penalty_reward/std": 0.0361146442592144, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 413 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.34375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4065.0, + "completions/mean_length": 3201.82421875, + "completions/mean_terminated_length": 3121.9189453125, + "completions/min_length": 1926.0, + "completions/min_terminated_length": 1926.0, + "epoch": 0.08832, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13148584961891174, + "learning_rate": 1e-06, + "loss": -0.0272, + "num_tokens": 318072120.0, + "reward": 2.6402688026428223, + "reward_std": 0.1609860360622406, + "rewards/cosine_scaled_reward/mean": 0.8379240036010742, + "rewards/cosine_scaled_reward/std": 0.3452746570110321, + "rewards/repetition_penalty_reward/mean": -0.12890510261058807, + "rewards/repetition_penalty_reward/std": 0.050265178084373474, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.9375, + "rewards/reward_reference/std": 0.24253563582897186, + "step": 414 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.5, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4064.0, + "completions/mean_length": 3258.58203125, + "completions/mean_terminated_length": 3202.75439453125, + "completions/min_length": 2023.0, + "completions/min_terminated_length": 2023.0, + "epoch": 0.08853333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12287775427103043, + "learning_rate": 1e-06, + "loss": -0.0233, + "num_tokens": 318997017.0, + "reward": 2.6155078411102295, + "reward_std": 0.18347862362861633, + "rewards/cosine_scaled_reward/mean": 0.8230103254318237, + "rewards/cosine_scaled_reward/std": 0.38649454712867737, + "rewards/repetition_penalty_reward/mean": -0.12547120451927185, + "rewards/repetition_penalty_reward/std": 0.041851840913295746, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.91796875, + "rewards/reward_reference/std": 0.2749498784542084, + "step": 415 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.28125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4096.0, + "completions/mean_length": 3299.890625, + "completions/mean_terminated_length": 3221.3046875, + "completions/min_length": 2126.0, + "completions/min_terminated_length": 2126.0, + "epoch": 0.08874666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10421235859394073, + "learning_rate": 1e-06, + "loss": -0.0141, + "num_tokens": 319914633.0, + "reward": 2.5600852966308594, + "reward_std": 0.15454831719398499, + "rewards/cosine_scaled_reward/mean": 0.7934485673904419, + "rewards/cosine_scaled_reward/std": 0.4298205077648163, + "rewards/repetition_penalty_reward/mean": -0.12867553532123566, + "rewards/repetition_penalty_reward/std": 0.05109499394893646, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.8984375, + "rewards/reward_reference/std": 0.3026638329029083, + "step": 416 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.15625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4087.0, + "completions/mean_length": 3273.75390625, + "completions/mean_terminated_length": 3176.807861328125, + "completions/min_length": 2142.0, + "completions/min_terminated_length": 2142.0, + "epoch": 0.08896, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09872046858072281, + "learning_rate": 1e-06, + "loss": -0.007, + "num_tokens": 320801902.0, + "reward": 2.6604604721069336, + "reward_std": 0.17045599222183228, + "rewards/cosine_scaled_reward/mean": 0.8498491644859314, + "rewards/cosine_scaled_reward/std": 0.34010493755340576, + "rewards/repetition_penalty_reward/mean": -0.13079512119293213, + "rewards/repetition_penalty_reward/std": 0.04310750588774681, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.94140625, + "rewards/reward_reference/std": 0.23532284796237946, + "step": 417 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4086.0, + "completions/mean_length": 3320.2265625, + "completions/mean_terminated_length": 3224.9560546875, + "completions/min_length": 1994.0, + "completions/min_terminated_length": 1994.0, + "epoch": 0.08917333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1512015163898468, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 321695904.0, + "reward": 2.4330129623413086, + "reward_std": 0.24782538414001465, + "rewards/cosine_scaled_reward/mean": 0.7282810211181641, + "rewards/cosine_scaled_reward/std": 0.5042746663093567, + "rewards/repetition_penalty_reward/mean": -0.13589312136173248, + "rewards/repetition_penalty_reward/std": 0.05037867650389671, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.84375, + "rewards/reward_reference/std": 0.3638034462928772, + "step": 418 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.3125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4083.0, + "completions/mean_length": 3331.8046875, + "completions/mean_terminated_length": 3259.95751953125, + "completions/min_length": 2094.0, + "completions/min_terminated_length": 2094.0, + "epoch": 0.08938666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08986438810825348, + "learning_rate": 1e-06, + "loss": -0.0071, + "num_tokens": 322614434.0, + "reward": 2.7352826595306396, + "reward_std": 0.13047119975090027, + "rewards/cosine_scaled_reward/mean": 0.8981503844261169, + "rewards/cosine_scaled_reward/std": 0.2551228106021881, + "rewards/repetition_penalty_reward/mean": -0.13161778450012207, + "rewards/repetition_penalty_reward/std": 0.04430250823497772, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.96875, + "rewards/reward_reference/std": 0.17433346807956696, + "step": 419 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4078.0, + "completions/mean_length": 3391.77734375, + "completions/mean_terminated_length": 3305.2939453125, + "completions/min_length": 2039.0, + "completions/min_terminated_length": 2039.0, + "epoch": 0.0896, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12648317217826843, + "learning_rate": 1e-06, + "loss": -0.0324, + "num_tokens": 323522857.0, + "reward": 2.5961928367614746, + "reward_std": 0.212894469499588, + "rewards/cosine_scaled_reward/mean": 0.8207013607025146, + "rewards/cosine_scaled_reward/std": 0.4113256633281708, + "rewards/repetition_penalty_reward/mean": -0.12841475009918213, + "rewards/repetition_penalty_reward/std": 0.04414404183626175, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.91015625, + "rewards/reward_reference/std": 0.2865179479122162, + "step": 420 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.25, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4084.0, + "completions/mean_length": 3354.9453125, + "completions/mean_terminated_length": 3278.284423828125, + "completions/min_length": 2324.0, + "completions/min_terminated_length": 2324.0, + "epoch": 0.08981333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1353956162929535, + "learning_rate": 1e-06, + "loss": 0.0064, + "num_tokens": 324441127.0, + "reward": 2.646284580230713, + "reward_std": 0.15197622776031494, + "rewards/cosine_scaled_reward/mean": 0.8503248691558838, + "rewards/cosine_scaled_reward/std": 0.3605615496635437, + "rewards/repetition_penalty_reward/mean": -0.13138386607170105, + "rewards/repetition_penalty_reward/std": 0.044571150094270706, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 421 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.0625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4084.0, + "completions/mean_length": 3371.5390625, + "completions/mean_terminated_length": 3275.37158203125, + "completions/min_length": 2072.0, + "completions/min_terminated_length": 2072.0, + "epoch": 0.09002666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06844349950551987, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 325350233.0, + "reward": 2.638679027557373, + "reward_std": 0.11508074402809143, + "rewards/cosine_scaled_reward/mean": 0.8407710194587708, + "rewards/cosine_scaled_reward/std": 0.3784065842628479, + "rewards/repetition_penalty_reward/mean": -0.12474842369556427, + "rewards/repetition_penalty_reward/std": 0.04072347283363342, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.92578125, + "rewards/reward_reference/std": 0.2626400291919708, + "step": 422 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.46875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4069.0, + "completions/mean_length": 3297.40234375, + "completions/mean_terminated_length": 3240.59814453125, + "completions/min_length": 2336.0, + "completions/min_terminated_length": 2336.0, + "epoch": 0.09024, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0915771871805191, + "learning_rate": 1e-06, + "loss": -0.0279, + "num_tokens": 326292292.0, + "reward": 2.5805721282958984, + "reward_std": 0.10786420106887817, + "rewards/cosine_scaled_reward/mean": 0.812868595123291, + "rewards/cosine_scaled_reward/std": 0.40911364555358887, + "rewards/repetition_penalty_reward/mean": -0.13542136549949646, + "rewards/repetition_penalty_reward/std": 0.04858802258968353, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.90625, + "rewards/reward_reference/std": 0.2920515835285187, + "step": 423 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.3125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4079.0, + "completions/mean_length": 3348.2578125, + "completions/mean_terminated_length": 3277.95751953125, + "completions/min_length": 2299.0, + "completions/min_terminated_length": 2299.0, + "epoch": 0.09045333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12993597984313965, + "learning_rate": 1e-06, + "loss": -0.0219, + "num_tokens": 327220986.0, + "reward": 2.604248046875, + "reward_std": 0.253852903842926, + "rewards/cosine_scaled_reward/mean": 0.8276076316833496, + "rewards/cosine_scaled_reward/std": 0.3967602252960205, + "rewards/repetition_penalty_reward/mean": -0.12804700434207916, + "rewards/repetition_penalty_reward/std": 0.04108881205320358, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626226335763931, + "rewards/reward_reference/mean": 0.9140625, + "rewards/reward_reference/std": 0.28082075715065, + "step": 424 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4067.0, + "completions/mean_length": 3322.1171875, + "completions/mean_terminated_length": 3203.5947265625, + "completions/min_length": 2081.0, + "completions/min_terminated_length": 2081.0, + "epoch": 0.09066666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13421866297721863, + "learning_rate": 1e-06, + "loss": -0.0247, + "num_tokens": 328094492.0, + "reward": 2.5595273971557617, + "reward_std": 0.2225823998451233, + "rewards/cosine_scaled_reward/mean": 0.794215202331543, + "rewards/cosine_scaled_reward/std": 0.4304060935974121, + "rewards/repetition_penalty_reward/mean": -0.12921899557113647, + "rewards/repetition_penalty_reward/std": 0.05206966772675514, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.89453125, + "rewards/reward_reference/std": 0.3077581524848938, + "step": 425 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.34375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4090.0, + "completions/mean_length": 3315.33984375, + "completions/mean_terminated_length": 3245.57861328125, + "completions/min_length": 2084.0, + "completions/min_terminated_length": 2084.0, + "epoch": 0.09088, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10778502374887466, + "learning_rate": 1e-06, + "loss": -0.0441, + "num_tokens": 329020015.0, + "reward": 2.598879814147949, + "reward_std": 0.19090493023395538, + "rewards/cosine_scaled_reward/mean": 0.823809027671814, + "rewards/cosine_scaled_reward/std": 0.3936808109283447, + "rewards/repetition_penalty_reward/mean": -0.13274173438549042, + "rewards/repetition_penalty_reward/std": 0.04350915551185608, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.9140625, + "rewards/reward_reference/std": 0.28082075715065, + "step": 426 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.03125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4094.0, + "completions/mean_length": 3340.73046875, + "completions/mean_terminated_length": 3236.671142578125, + "completions/min_length": 2181.0, + "completions/min_terminated_length": 2181.0, + "epoch": 0.09109333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1536858230829239, + "learning_rate": 1e-06, + "loss": -0.0536, + "num_tokens": 329909346.0, + "reward": 2.5599093437194824, + "reward_std": 0.26450973749160767, + "rewards/cosine_scaled_reward/mean": 0.7959344387054443, + "rewards/cosine_scaled_reward/std": 0.43361762166023254, + "rewards/repetition_penalty_reward/mean": -0.13055622577667236, + "rewards/repetition_penalty_reward/std": 0.04218065366148949, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.89453125, + "rewards/reward_reference/std": 0.3077581524848938, + "step": 427 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4041.0, + "completions/mean_length": 3372.89453125, + "completions/mean_terminated_length": 3265.887939453125, + "completions/min_length": 2365.0, + "completions/min_terminated_length": 2365.0, + "epoch": 0.09130666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08796551823616028, + "learning_rate": 1e-06, + "loss": -0.0094, + "num_tokens": 330795287.0, + "reward": 2.514080047607422, + "reward_std": 0.1660141944885254, + "rewards/cosine_scaled_reward/mean": 0.7753437757492065, + "rewards/cosine_scaled_reward/std": 0.46698227524757385, + "rewards/repetition_penalty_reward/mean": -0.1370450258255005, + "rewards/repetition_penalty_reward/std": 0.055892687290906906, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.87890625, + "rewards/reward_reference/std": 0.3268752694129944, + "step": 428 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.3125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4090.0, + "completions/mean_length": 3394.15625, + "completions/mean_terminated_length": 3328.171142578125, + "completions/min_length": 2423.0, + "completions/min_terminated_length": 2423.0, + "epoch": 0.09152, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1999899446964264, + "learning_rate": 1e-06, + "loss": -0.0179, + "num_tokens": 331737955.0, + "reward": 2.449087142944336, + "reward_std": 0.2750634551048279, + "rewards/cosine_scaled_reward/mean": 0.7363446950912476, + "rewards/cosine_scaled_reward/std": 0.5115602016448975, + "rewards/repetition_penalty_reward/mean": -0.12944519519805908, + "rewards/repetition_penalty_reward/std": 0.041707854717969894, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626226335763931, + "rewards/reward_reference/mean": 0.8515625, + "rewards/reward_reference/std": 0.3562295734882355, + "step": 429 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.34375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4087.0, + "completions/mean_length": 3360.3046875, + "completions/mean_terminated_length": 3294.5615234375, + "completions/min_length": 2207.0, + "completions/min_terminated_length": 2207.0, + "epoch": 0.09173333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11115996539592743, + "learning_rate": 1e-06, + "loss": -0.0196, + "num_tokens": 332670321.0, + "reward": 2.6029627323150635, + "reward_std": 0.14713042974472046, + "rewards/cosine_scaled_reward/mean": 0.8259959816932678, + "rewards/cosine_scaled_reward/std": 0.39926013350486755, + "rewards/repetition_penalty_reward/mean": -0.13709580898284912, + "rewards/repetition_penalty_reward/std": 0.05000806227326393, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9140625, + "rewards/reward_reference/std": 0.28082075715065, + "step": 430 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.75, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4055.0, + "completions/mean_length": 3402.5, + "completions/mean_terminated_length": 3274.07421875, + "completions/min_length": 2088.0, + "completions/min_terminated_length": 2088.0, + "epoch": 0.09194666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14282870292663574, + "learning_rate": 1e-06, + "loss": -0.0179, + "num_tokens": 333535445.0, + "reward": 2.4626400470733643, + "reward_std": 0.18670928478240967, + "rewards/cosine_scaled_reward/mean": 0.7408066987991333, + "rewards/cosine_scaled_reward/std": 0.5033049583435059, + "rewards/repetition_penalty_reward/mean": -0.13363544642925262, + "rewards/repetition_penalty_reward/std": 0.04304096847772598, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.85546875, + "rewards/reward_reference/std": 0.35231640934944153, + "step": 431 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4066.0, + "completions/mean_length": 3392.3984375, + "completions/mean_terminated_length": 3284.6396484375, + "completions/min_length": 2074.0, + "completions/min_terminated_length": 2074.0, + "epoch": 0.09216, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04934278875589371, + "learning_rate": 1e-06, + "loss": -0.0051, + "num_tokens": 334433423.0, + "reward": 2.657198667526245, + "reward_std": 0.08821704983711243, + "rewards/cosine_scaled_reward/mean": 0.8536912798881531, + "rewards/cosine_scaled_reward/std": 0.3593747615814209, + "rewards/repetition_penalty_reward/mean": -0.12696149945259094, + "rewards/repetition_penalty_reward/std": 0.0377291664481163, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 432 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.8125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4077.0, + "completions/mean_length": 3376.1015625, + "completions/mean_terminated_length": 3250.614501953125, + "completions/min_length": 2119.0, + "completions/min_terminated_length": 2119.0, + "epoch": 0.09237333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1504306197166443, + "learning_rate": 1e-06, + "loss": -0.0269, + "num_tokens": 335293601.0, + "reward": 2.3855834007263184, + "reward_std": 0.20364665985107422, + "rewards/cosine_scaled_reward/mean": 0.6995294094085693, + "rewards/cosine_scaled_reward/std": 0.5381035804748535, + "rewards/repetition_penalty_reward/mean": -0.13816462457180023, + "rewards/repetition_penalty_reward/std": 0.04862834885716438, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.82421875, + "rewards/reward_reference/std": 0.3813795745372772, + "step": 433 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.8125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4086.0, + "completions/mean_length": 3470.88671875, + "completions/mean_terminated_length": 3361.921875, + "completions/min_length": 2334.0, + "completions/min_terminated_length": 2334.0, + "epoch": 0.09258666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10625244677066803, + "learning_rate": 1e-06, + "loss": -0.0035, + "num_tokens": 336194808.0, + "reward": 2.586014747619629, + "reward_std": 0.1232042908668518, + "rewards/cosine_scaled_reward/mean": 0.8126987218856812, + "rewards/cosine_scaled_reward/std": 0.43467089533805847, + "rewards/repetition_penalty_reward/mean": -0.12512150406837463, + "rewards/repetition_penalty_reward/std": 0.0379415787756443, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.8984375, + "rewards/reward_reference/std": 0.3026638329029083, + "step": 434 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.59375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4091.0, + "completions/mean_length": 3534.79296875, + "completions/mean_terminated_length": 3415.1044921875, + "completions/min_length": 2386.0, + "completions/min_terminated_length": 2386.0, + "epoch": 0.0928, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13819520175457, + "learning_rate": 1e-06, + "loss": -0.014, + "num_tokens": 337078519.0, + "reward": 2.671761989593506, + "reward_std": 0.12655839323997498, + "rewards/cosine_scaled_reward/mean": 0.8669031858444214, + "rewards/cosine_scaled_reward/std": 0.3638756275177002, + "rewards/repetition_penalty_reward/mean": -0.1256098747253418, + "rewards/repetition_penalty_reward/std": 0.04022699594497681, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 435 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.6875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4075.0, + "completions/mean_length": 3376.28515625, + "completions/mean_terminated_length": 3235.03271484375, + "completions/min_length": 2154.0, + "completions/min_terminated_length": 2154.0, + "epoch": 0.09301333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16057516634464264, + "learning_rate": 1e-06, + "loss": -0.0324, + "num_tokens": 337933628.0, + "reward": 2.4101791381835938, + "reward_std": 0.22112424671649933, + "rewards/cosine_scaled_reward/mean": 0.7129418849945068, + "rewards/cosine_scaled_reward/std": 0.5243740081787109, + "rewards/repetition_penalty_reward/mean": -0.12385663390159607, + "rewards/repetition_penalty_reward/std": 0.04228730499744415, + "rewards/reward_format/mean": 0.981249988079071, + "rewards/reward_format/std": 0.12126781791448593, + "rewards/reward_reference/mean": 0.83984375, + "rewards/reward_reference/std": 0.36746934056282043, + "step": 436 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4092.0, + "completions/mean_length": 3468.8125, + "completions/mean_terminated_length": 3391.78955078125, + "completions/min_length": 2311.0, + "completions/min_terminated_length": 2311.0, + "epoch": 0.09322666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09278582036495209, + "learning_rate": 1e-06, + "loss": -0.0249, + "num_tokens": 338874828.0, + "reward": 2.712590217590332, + "reward_std": 0.11666490882635117, + "rewards/cosine_scaled_reward/mean": 0.8886799812316895, + "rewards/cosine_scaled_reward/std": 0.3155267834663391, + "rewards/repetition_penalty_reward/mean": -0.1253085881471634, + "rewards/repetition_penalty_reward/std": 0.03902411833405495, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.94921875, + "rewards/reward_reference/std": 0.21998079121112823, + "step": 437 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.5625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4079.0, + "completions/mean_length": 3454.30078125, + "completions/mean_terminated_length": 3313.73828125, + "completions/min_length": 2340.0, + "completions/min_terminated_length": 2340.0, + "epoch": 0.09344, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16439048945903778, + "learning_rate": 1e-06, + "loss": -0.0203, + "num_tokens": 339738661.0, + "reward": 2.550375461578369, + "reward_std": 0.22821134328842163, + "rewards/cosine_scaled_reward/mean": 0.7951457500457764, + "rewards/cosine_scaled_reward/std": 0.4520246088504791, + "rewards/repetition_penalty_reward/mean": -0.12523919343948364, + "rewards/repetition_penalty_reward/std": 0.041806966066360474, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.88671875, + "rewards/reward_reference/std": 0.31755712628364563, + "step": 438 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4093.0, + "completions/mean_length": 3443.12890625, + "completions/mean_terminated_length": 3362.95166015625, + "completions/min_length": 2043.0, + "completions/min_terminated_length": 2043.0, + "epoch": 0.09365333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11715667694807053, + "learning_rate": 1e-06, + "loss": -0.0249, + "num_tokens": 340669206.0, + "reward": 2.6908936500549316, + "reward_std": 0.14655840396881104, + "rewards/cosine_scaled_reward/mean": 0.8710836172103882, + "rewards/cosine_scaled_reward/std": 0.3402611017227173, + "rewards/repetition_penalty_reward/mean": -0.12159596383571625, + "rewards/repetition_penalty_reward/std": 0.043441347777843475, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.94140625, + "rewards/reward_reference/std": 0.23532284796237946, + "step": 439 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.65625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4072.0, + "completions/mean_length": 3457.76171875, + "completions/mean_terminated_length": 3328.91552734375, + "completions/min_length": 2327.0, + "completions/min_terminated_length": 2327.0, + "epoch": 0.09386666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1541212946176529, + "learning_rate": 1e-06, + "loss": -0.0148, + "num_tokens": 341539165.0, + "reward": 2.5942134857177734, + "reward_std": 0.22023794054985046, + "rewards/cosine_scaled_reward/mean": 0.8143563866615295, + "rewards/cosine_scaled_reward/std": 0.42871448397636414, + "rewards/repetition_penalty_reward/mean": -0.11936160922050476, + "rewards/repetition_penalty_reward/std": 0.04116738215088844, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.90234375, + "rewards/reward_reference/std": 0.29743078351020813, + "step": 440 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.0625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4076.0, + "completions/mean_length": 3334.8203125, + "completions/mean_terminated_length": 3233.77880859375, + "completions/min_length": 1930.0, + "completions/min_terminated_length": 1930.0, + "epoch": 0.09408, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1119907945394516, + "learning_rate": 1e-06, + "loss": -0.0044, + "num_tokens": 342422431.0, + "reward": 2.6673107147216797, + "reward_std": 0.16998052597045898, + "rewards/cosine_scaled_reward/mean": 0.8509924411773682, + "rewards/cosine_scaled_reward/std": 0.3510325849056244, + "rewards/repetition_penalty_reward/mean": -0.11805684864521027, + "rewards/repetition_penalty_reward/std": 0.041798681020736694, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.9375, + "rewards/reward_reference/std": 0.24253563582897186, + "step": 441 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.03125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4086.0, + "completions/mean_length": 3551.625, + "completions/mean_terminated_length": 3373.927490234375, + "completions/min_length": 2225.0, + "completions/min_terminated_length": 2225.0, + "epoch": 0.09429333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.155339315533638, + "learning_rate": 1e-06, + "loss": -0.0634, + "num_tokens": 343229003.0, + "reward": 2.412459373474121, + "reward_std": 0.36707803606987, + "rewards/cosine_scaled_reward/mean": 0.7190250158309937, + "rewards/cosine_scaled_reward/std": 0.5402841567993164, + "rewards/repetition_penalty_reward/mean": -0.1323469579219818, + "rewards/repetition_penalty_reward/std": 0.05035141855478287, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.83203125, + "rewards/reward_reference/std": 0.3745708465576172, + "step": 442 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4094.0, + "completions/mean_length": 3416.41015625, + "completions/mean_terminated_length": 3305.204345703125, + "completions/min_length": 1969.0, + "completions/min_terminated_length": 1969.0, + "epoch": 0.09450666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1449149250984192, + "learning_rate": 1e-06, + "loss": -0.0217, + "num_tokens": 344119452.0, + "reward": 2.6318774223327637, + "reward_std": 0.20957745611667633, + "rewards/cosine_scaled_reward/mean": 0.8327311873435974, + "rewards/cosine_scaled_reward/std": 0.39399969577789307, + "rewards/repetition_penalty_reward/mean": -0.11569737643003464, + "rewards/repetition_penalty_reward/std": 0.039580464363098145, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.91796875, + "rewards/reward_reference/std": 0.2749498784542084, + "step": 443 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.59375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4095.0, + "completions/mean_length": 3448.515625, + "completions/mean_terminated_length": 3310.4267578125, + "completions/min_length": 2306.0, + "completions/min_terminated_length": 2306.0, + "epoch": 0.09472, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14609962701797485, + "learning_rate": 1e-06, + "loss": -0.0273, + "num_tokens": 344981920.0, + "reward": 2.466052293777466, + "reward_std": 0.21688896417617798, + "rewards/cosine_scaled_reward/mean": 0.7364804744720459, + "rewards/cosine_scaled_reward/std": 0.5121752619743347, + "rewards/repetition_penalty_reward/mean": -0.11808443069458008, + "rewards/repetition_penalty_reward/std": 0.03446255251765251, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.84765625, + "rewards/reward_reference/std": 0.3600577116012573, + "step": 444 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.09375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4095.0, + "completions/mean_length": 3618.640625, + "completions/mean_terminated_length": 3469.31298828125, + "completions/min_length": 2316.0, + "completions/min_terminated_length": 2316.0, + "epoch": 0.09493333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12748895585536957, + "learning_rate": 1e-06, + "loss": -0.0366, + "num_tokens": 345826308.0, + "reward": 2.4843618869781494, + "reward_std": 0.23571541905403137, + "rewards/cosine_scaled_reward/mean": 0.7587255239486694, + "rewards/cosine_scaled_reward/std": 0.5151540040969849, + "rewards/repetition_penalty_reward/mean": -0.12670737504959106, + "rewards/repetition_penalty_reward/std": 0.04888736084103584, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.85546875, + "rewards/reward_reference/std": 0.35231640934944153, + "step": 445 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4086.0, + "completions/mean_length": 3355.34765625, + "completions/mean_terminated_length": 3249.540283203125, + "completions/min_length": 2222.0, + "completions/min_terminated_length": 2222.0, + "epoch": 0.09514666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12772390246391296, + "learning_rate": 1e-06, + "loss": -0.0216, + "num_tokens": 346719945.0, + "reward": 2.671082019805908, + "reward_std": 0.11974822729825974, + "rewards/cosine_scaled_reward/mean": 0.8572664260864258, + "rewards/cosine_scaled_reward/std": 0.34457895159721375, + "rewards/repetition_penalty_reward/mean": -0.11977824568748474, + "rewards/repetition_penalty_reward/std": 0.044986020773649216, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 446 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.3125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4093.0, + "completions/mean_length": 3482.47265625, + "completions/mean_terminated_length": 3318.46044921875, + "completions/min_length": 2169.0, + "completions/min_terminated_length": 2169.0, + "epoch": 0.09536, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22144626080989838, + "learning_rate": 1e-06, + "loss": 0.0151, + "num_tokens": 347549682.0, + "reward": 2.5736947059631348, + "reward_std": 0.2570610046386719, + "rewards/cosine_scaled_reward/mean": 0.81336510181427, + "rewards/cosine_scaled_reward/std": 0.43019360303878784, + "rewards/repetition_penalty_reward/mean": -0.1318579763174057, + "rewards/repetition_penalty_reward/std": 0.046924762427806854, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.8984375, + "rewards/reward_reference/std": 0.3026638329029083, + "step": 447 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.65625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4067.0, + "completions/mean_length": 3514.81640625, + "completions/mean_terminated_length": 3397.48828125, + "completions/min_length": 2114.0, + "completions/min_terminated_length": 2114.0, + "epoch": 0.09557333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13463345170021057, + "learning_rate": 1e-06, + "loss": -0.0252, + "num_tokens": 348440599.0, + "reward": 2.5912740230560303, + "reward_std": 0.15143904089927673, + "rewards/cosine_scaled_reward/mean": 0.8223309516906738, + "rewards/cosine_scaled_reward/std": 0.4279668629169464, + "rewards/repetition_penalty_reward/mean": -0.12402550876140594, + "rewards/repetition_penalty_reward/std": 0.04709490016102791, + "rewards/reward_format/mean": 0.9906250238418579, + "rewards/reward_format/std": 0.08626226335763931, + "rewards/reward_reference/mean": 0.90234375, + "rewards/reward_reference/std": 0.29743078351020813, + "step": 448 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.71875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4088.0, + "completions/mean_length": 3559.171875, + "completions/mean_terminated_length": 3456.800048828125, + "completions/min_length": 1997.0, + "completions/min_terminated_length": 1997.0, + "epoch": 0.09578666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10657992213964462, + "learning_rate": 1e-06, + "loss": -0.018, + "num_tokens": 349347991.0, + "reward": 2.6517934799194336, + "reward_std": 0.20156292617321014, + "rewards/cosine_scaled_reward/mean": 0.8590583205223083, + "rewards/cosine_scaled_reward/std": 0.38307467103004456, + "rewards/repetition_penalty_reward/mean": -0.13304612040519714, + "rewards/repetition_penalty_reward/std": 0.044906023889780045, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.92578125, + "rewards/reward_reference/std": 0.2626400291919708, + "step": 449 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.0, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4084.0, + "completions/mean_length": 3457.07421875, + "completions/mean_terminated_length": 3365.79931640625, + "completions/min_length": 2277.0, + "completions/min_terminated_length": 2277.0, + "epoch": 0.096, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1106138601899147, + "learning_rate": 1e-06, + "loss": -0.0085, + "num_tokens": 350266990.0, + "reward": 2.6805636882781982, + "reward_std": 0.16519448161125183, + "rewards/cosine_scaled_reward/mean": 0.8684450387954712, + "rewards/cosine_scaled_reward/std": 0.34893450140953064, + "rewards/repetition_penalty_reward/mean": -0.12147516012191772, + "rewards/repetition_penalty_reward/std": 0.03809197247028351, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 450 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.40625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4088.0, + "completions/mean_length": 3519.98046875, + "completions/mean_terminated_length": 3376.677978515625, + "completions/min_length": 2497.0, + "completions/min_terminated_length": 2497.0, + "epoch": 0.09621333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1882275491952896, + "learning_rate": 1e-06, + "loss": -0.0435, + "num_tokens": 351124233.0, + "reward": 2.6901729106903076, + "reward_std": 0.1846616119146347, + "rewards/cosine_scaled_reward/mean": 0.8774366974830627, + "rewards/cosine_scaled_reward/std": 0.3409196436405182, + "rewards/repetition_penalty_reward/mean": -0.1286701261997223, + "rewards/repetition_penalty_reward/std": 0.04067756608128548, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.94140625, + "rewards/reward_reference/std": 0.23532284796237946, + "step": 451 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.4375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4081.0, + "completions/mean_length": 3531.19921875, + "completions/mean_terminated_length": 3394.111572265625, + "completions/min_length": 2322.0, + "completions/min_terminated_length": 2322.0, + "epoch": 0.09642666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13614989817142487, + "learning_rate": 1e-06, + "loss": -0.0721, + "num_tokens": 351979164.0, + "reward": 2.565354108810425, + "reward_std": 0.22787414491176605, + "rewards/cosine_scaled_reward/mean": 0.8094824552536011, + "rewards/cosine_scaled_reward/std": 0.4485189914703369, + "rewards/repetition_penalty_reward/mean": -0.13865964114665985, + "rewards/repetition_penalty_reward/std": 0.04454709589481354, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.89453125, + "rewards/reward_reference/std": 0.3077581524848938, + "step": 452 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4091.0, + "completions/mean_length": 3545.9140625, + "completions/mean_terminated_length": 3405.6962890625, + "completions/min_length": 2251.0, + "completions/min_terminated_length": 2251.0, + "epoch": 0.09664, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15346837043762207, + "learning_rate": 1e-06, + "loss": 0.0119, + "num_tokens": 352838342.0, + "reward": 2.4719109535217285, + "reward_std": 0.26720231771469116, + "rewards/cosine_scaled_reward/mean": 0.7608464956283569, + "rewards/cosine_scaled_reward/std": 0.5024836659431458, + "rewards/repetition_penalty_reward/mean": -0.14206044375896454, + "rewards/repetition_penalty_reward/std": 0.042888157069683075, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.859375, + "rewards/reward_reference/std": 0.3483152687549591, + "step": 453 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.34375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4087.0, + "completions/mean_length": 3482.3359375, + "completions/mean_terminated_length": 3322.1181640625, + "completions/min_length": 1981.0, + "completions/min_terminated_length": 1981.0, + "epoch": 0.09685333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12531593441963196, + "learning_rate": 1e-06, + "loss": 0.0042, + "num_tokens": 353669904.0, + "reward": 2.4537057876586914, + "reward_std": 0.291486918926239, + "rewards/cosine_scaled_reward/mean": 0.747859537601471, + "rewards/cosine_scaled_reward/std": 0.5058099031448364, + "rewards/repetition_penalty_reward/mean": -0.1496226191520691, + "rewards/repetition_penalty_reward/std": 0.04818349331617355, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.85546875, + "rewards/reward_reference/std": 0.35231640934944153, + "step": 454 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.15625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4096.0, + "completions/mean_length": 3583.19921875, + "completions/mean_terminated_length": 3429.619140625, + "completions/min_length": 2455.0, + "completions/min_terminated_length": 2455.0, + "epoch": 0.09706666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1776491403579712, + "learning_rate": 1e-06, + "loss": -0.0616, + "num_tokens": 354501715.0, + "reward": 2.485987901687622, + "reward_std": 0.270699143409729, + "rewards/cosine_scaled_reward/mean": 0.7776684761047363, + "rewards/cosine_scaled_reward/std": 0.48990023136138916, + "rewards/repetition_penalty_reward/mean": -0.1526181846857071, + "rewards/repetition_penalty_reward/std": 0.04843423143029213, + "rewards/reward_format/mean": 0.9937499761581421, + "rewards/reward_format/std": 0.0705718919634819, + "rewards/reward_reference/mean": 0.8671875, + "rewards/reward_reference/std": 0.3400367796421051, + "step": 455 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4094.0, + "completions/mean_length": 3455.9609375, + "completions/mean_terminated_length": 3354.597412109375, + "completions/min_length": 2270.0, + "completions/min_terminated_length": 2270.0, + "epoch": 0.09728, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11334540694952011, + "learning_rate": 1e-06, + "loss": -0.0152, + "num_tokens": 355403069.0, + "reward": 2.5903708934783936, + "reward_std": 0.16302460432052612, + "rewards/cosine_scaled_reward/mean": 0.8281469941139221, + "rewards/cosine_scaled_reward/std": 0.4118127226829529, + "rewards/repetition_penalty_reward/mean": -0.14793241024017334, + "rewards/repetition_penalty_reward/std": 0.04326983541250229, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.91015625, + "rewards/reward_reference/std": 0.2865179479122162, + "step": 456 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4086.0, + "completions/mean_length": 3528.26171875, + "completions/mean_terminated_length": 3383.544189453125, + "completions/min_length": 2283.0, + "completions/min_terminated_length": 2283.0, + "epoch": 0.09749333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12938834726810455, + "learning_rate": 1e-06, + "loss": -0.037, + "num_tokens": 356256492.0, + "reward": 2.552910327911377, + "reward_std": 0.26627570390701294, + "rewards/cosine_scaled_reward/mean": 0.8132476806640625, + "rewards/cosine_scaled_reward/std": 0.4391486346721649, + "rewards/repetition_penalty_reward/mean": -0.15877488255500793, + "rewards/repetition_penalty_reward/std": 0.06030600890517235, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.8984375, + "rewards/reward_reference/std": 0.3026638329029083, + "step": 457 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.75, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4084.0, + "completions/mean_length": 3463.6875, + "completions/mean_terminated_length": 3346.592529296875, + "completions/min_length": 2197.0, + "completions/min_terminated_length": 2197.0, + "epoch": 0.09770666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17623008787631989, + "learning_rate": 1e-06, + "loss": -0.0468, + "num_tokens": 357130112.0, + "reward": 2.471355438232422, + "reward_std": 0.32545268535614014, + "rewards/cosine_scaled_reward/mean": 0.7646437883377075, + "rewards/cosine_scaled_reward/std": 0.48779138922691345, + "rewards/repetition_penalty_reward/mean": -0.160475954413414, + "rewards/repetition_penalty_reward/std": 0.04666691645979881, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.8671875, + "rewards/reward_reference/std": 0.3400367796421051, + "step": 458 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.75, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4095.0, + "completions/mean_length": 3483.28125, + "completions/mean_terminated_length": 3369.81494140625, + "completions/min_length": 2204.0, + "completions/min_terminated_length": 2204.0, + "epoch": 0.09792, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13649289309978485, + "learning_rate": 1e-06, + "loss": -0.0153, + "num_tokens": 358018724.0, + "reward": 2.6438474655151367, + "reward_std": 0.1534462422132492, + "rewards/cosine_scaled_reward/mean": 0.8666955232620239, + "rewards/cosine_scaled_reward/std": 0.3531719744205475, + "rewards/repetition_penalty_reward/mean": -0.15644192695617676, + "rewards/repetition_penalty_reward/std": 0.04818189516663551, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 459 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.40625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4091.0, + "completions/mean_length": 3496.74609375, + "completions/mean_terminated_length": 3347.663330078125, + "completions/min_length": 2117.0, + "completions/min_terminated_length": 2117.0, + "epoch": 0.09813333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1073637306690216, + "learning_rate": 1e-06, + "loss": -0.0394, + "num_tokens": 358869947.0, + "reward": 2.559465169906616, + "reward_std": 0.23732900619506836, + "rewards/cosine_scaled_reward/mean": 0.8185771703720093, + "rewards/cosine_scaled_reward/std": 0.42788994312286377, + "rewards/repetition_penalty_reward/mean": -0.1583307683467865, + "rewards/repetition_penalty_reward/std": 0.06375561654567719, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.90234375, + "rewards/reward_reference/std": 0.29743078351020813, + "step": 460 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4084.0, + "completions/mean_length": 3493.02734375, + "completions/mean_terminated_length": 3367.882080078125, + "completions/min_length": 2337.0, + "completions/min_terminated_length": 2337.0, + "epoch": 0.09834666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13014236092567444, + "learning_rate": 1e-06, + "loss": -0.0377, + "num_tokens": 359758530.0, + "reward": 2.6258108615875244, + "reward_std": 0.15342065691947937, + "rewards/cosine_scaled_reward/mean": 0.8476953506469727, + "rewards/cosine_scaled_reward/std": 0.3844573199748993, + "rewards/repetition_penalty_reward/mean": -0.1437593698501587, + "rewards/repetition_penalty_reward/std": 0.04323843494057655, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.921875, + "rewards/reward_reference/std": 0.26889389753341675, + "step": 461 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4090.0, + "completions/mean_length": 3481.15625, + "completions/mean_terminated_length": 3405.649169921875, + "completions/min_length": 2295.0, + "completions/min_terminated_length": 2295.0, + "epoch": 0.09856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11288342624902725, + "learning_rate": 1e-06, + "loss": -0.0204, + "num_tokens": 360691230.0, + "reward": 2.718127727508545, + "reward_std": 0.125985786318779, + "rewards/cosine_scaled_reward/mean": 0.9069023132324219, + "rewards/cosine_scaled_reward/std": 0.2774904668331146, + "rewards/repetition_penalty_reward/mean": -0.14971214532852173, + "rewards/repetition_penalty_reward/std": 0.03843717649579048, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.9609375, + "rewards/reward_reference/std": 0.19412322342395782, + "step": 462 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.34375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4083.0, + "completions/mean_length": 3515.06640625, + "completions/mean_terminated_length": 3363.39404296875, + "completions/min_length": 2279.0, + "completions/min_terminated_length": 2279.0, + "epoch": 0.09877333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13893939554691315, + "learning_rate": 1e-06, + "loss": 0.0081, + "num_tokens": 361531383.0, + "reward": 2.519890308380127, + "reward_std": 0.2613111436367035, + "rewards/cosine_scaled_reward/mean": 0.7902860641479492, + "rewards/cosine_scaled_reward/std": 0.4658883213996887, + "rewards/repetition_penalty_reward/mean": -0.1532084196805954, + "rewards/repetition_penalty_reward/std": 0.0518529936671257, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.8828125, + "rewards/reward_reference/std": 0.3222736418247223, + "step": 463 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.71875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4090.0, + "completions/mean_length": 3379.55859375, + "completions/mean_terminated_length": 3242.934814453125, + "completions/min_length": 1778.0, + "completions/min_terminated_length": 1778.0, + "epoch": 0.09898666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14748817682266235, + "learning_rate": 1e-06, + "loss": -0.0395, + "num_tokens": 362391402.0, + "reward": 2.564578056335449, + "reward_std": 0.20639969408512115, + "rewards/cosine_scaled_reward/mean": 0.8055846691131592, + "rewards/cosine_scaled_reward/std": 0.4240720868110657, + "rewards/repetition_penalty_reward/mean": -0.14335037767887115, + "rewards/repetition_penalty_reward/std": 0.03977758437395096, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.90234375, + "rewards/reward_reference/std": 0.29743078351020813, + "step": 464 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.21875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4088.0, + "completions/mean_length": 3315.11328125, + "completions/mean_terminated_length": 3230.601806640625, + "completions/min_length": 1942.0, + "completions/min_terminated_length": 1942.0, + "epoch": 0.0992, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09585642069578171, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 363297903.0, + "reward": 2.680453300476074, + "reward_std": 0.13919416069984436, + "rewards/cosine_scaled_reward/mean": 0.8720101118087769, + "rewards/cosine_scaled_reward/std": 0.308190256357193, + "rewards/repetition_penalty_reward/mean": -0.13765066862106323, + "rewards/repetition_penalty_reward/std": 0.0456363707780838, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.94921875, + "rewards/reward_reference/std": 0.21998079121112823, + "step": 465 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.25, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4094.0, + "completions/mean_length": 3323.46484375, + "completions/mean_terminated_length": 3243.54736328125, + "completions/min_length": 1967.0, + "completions/min_terminated_length": 1967.0, + "epoch": 0.09941333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11453945189714432, + "learning_rate": 1e-06, + "loss": -0.0383, + "num_tokens": 364204806.0, + "reward": 2.5500752925872803, + "reward_std": 0.1567349135875702, + "rewards/cosine_scaled_reward/mean": 0.7999416589736938, + "rewards/cosine_scaled_reward/std": 0.4272773563861847, + "rewards/repetition_penalty_reward/mean": -0.145178884267807, + "rewards/repetition_penalty_reward/std": 0.04210899397730827, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.8984375, + "rewards/reward_reference/std": 0.3026638329029083, + "step": 466 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.53125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4092.0, + "completions/mean_length": 3424.1484375, + "completions/mean_terminated_length": 3273.06201171875, + "completions/min_length": 2276.0, + "completions/min_terminated_length": 2276.0, + "epoch": 0.09962666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14379972219467163, + "learning_rate": 1e-06, + "loss": -0.0218, + "num_tokens": 365051776.0, + "reward": 2.4092652797698975, + "reward_std": 0.253988653421402, + "rewards/cosine_scaled_reward/mean": 0.7186833620071411, + "rewards/cosine_scaled_reward/std": 0.5256325602531433, + "rewards/repetition_penalty_reward/mean": -0.1461367905139923, + "rewards/repetition_penalty_reward/std": 0.050273049622774124, + "rewards/reward_format/mean": 0.996874988079071, + "rewards/reward_format/std": 0.05000000447034836, + "rewards/reward_reference/mean": 0.83984375, + "rewards/reward_reference/std": 0.36746934056282043, + "step": 467 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -5.71875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4096.0, + "completions/mean_length": 3387.23046875, + "completions/mean_terminated_length": 3252.06982421875, + "completions/min_length": 2080.0, + "completions/min_terminated_length": 2080.0, + "epoch": 0.09984, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12260466814041138, + "learning_rate": 1e-06, + "loss": -0.022, + "num_tokens": 365908611.0, + "reward": 2.4367387294769287, + "reward_std": 0.1926591992378235, + "rewards/cosine_scaled_reward/mean": 0.7306523323059082, + "rewards/cosine_scaled_reward/std": 0.5114856362342834, + "rewards/repetition_penalty_reward/mean": -0.14156997203826904, + "rewards/repetition_penalty_reward/std": 0.04533535614609718, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.84765625, + "rewards/reward_reference/std": 0.3600577116012573, + "step": 468 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": -6.09375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4086.0, + "completions/mean_length": 3267.3125, + "completions/mean_terminated_length": 3161.44482421875, + "completions/min_length": 1937.0, + "completions/min_terminated_length": 1937.0, + "epoch": 0.10005333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10559126734733582, + "learning_rate": 1e-06, + "loss": -0.0135, + "num_tokens": 366791167.0, + "reward": 2.658348560333252, + "reward_std": 0.13092412054538727, + "rewards/cosine_scaled_reward/mean": 0.8457194566726685, + "rewards/cosine_scaled_reward/std": 0.3431178033351898, + "rewards/repetition_penalty_reward/mean": -0.12096454203128815, + "rewards/repetition_penalty_reward/std": 0.03693599998950958, + "rewards/reward_format/mean": 1.0, + "rewards/reward_format/std": 0.0, + "rewards/reward_reference/mean": 0.93359375, + "rewards/reward_reference/std": 0.24947863817214966, + "step": 469 + } + ], + "logging_steps": 1, + "max_steps": 4688, + "num_input_tokens_seen": 366791167, + "num_train_epochs": 1, + "save_steps": 469, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}