{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.10005333333333333, "eval_steps": 500, "global_step": 469, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3841.0, "completions/mean_length": 1272.1640625, "completions/mean_terminated_length": 1249.9290771484375, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.00021333333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.22031651437282562, "learning_rate": 1e-06, "loss": -0.0145, "num_tokens": 482922.0, "reward": 2.009742021560669, "reward_std": 0.5785077810287476, "rewards/cosine_scaled_reward/mean": 0.3822018802165985, "rewards/cosine_scaled_reward/std": 0.3494165539741516, "rewards/repetition_penalty_reward/mean": -0.060741037130355835, "rewards/repetition_penalty_reward/std": 0.045807842165231705, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.69140625, "rewards/reward_reference/std": 0.46281787753105164, "step": 1 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3649.0, "completions/mean_length": 1335.62890625, "completions/mean_terminated_length": 1302.8973388671875, "completions/min_length": 576.0, "completions/min_terminated_length": 576.0, "epoch": 0.00042666666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.22476382553577423, "learning_rate": 1e-06, "loss": -0.0109, "num_tokens": 974007.0, "reward": 2.014347553253174, "reward_std": 0.6111550331115723, "rewards/cosine_scaled_reward/mean": 0.3857782185077667, "rewards/cosine_scaled_reward/std": 0.35800445079803467, "rewards/repetition_penalty_reward/mean": -0.06752431392669678, "rewards/repetition_penalty_reward/std": 0.05347849428653717, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.69921875, "rewards/reward_reference/std": 0.45949608087539673, "step": 2 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 4086.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 1288.3828125, "completions/mean_terminated_length": 1288.3828125, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.00064, "frac_reward_zero_std": 0.0, "grad_norm": 0.20334957540035248, "learning_rate": 1e-06, "loss": 0.0042, "num_tokens": 1465649.0, "reward": 2.1703991889953613, "reward_std": 0.5016853213310242, "rewards/cosine_scaled_reward/mean": 0.4534637928009033, "rewards/cosine_scaled_reward/std": 0.3181309998035431, "rewards/repetition_penalty_reward/mean": -0.06431479007005692, "rewards/repetition_penalty_reward/std": 0.03901338577270508, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.78125, "rewards/reward_reference/std": 0.41420844197273254, "step": 3 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3700.0, "completions/mean_length": 1424.984375, "completions/mean_terminated_length": 1403.9527587890625, "completions/min_length": 523.0, "completions/min_terminated_length": 523.0, "epoch": 0.0008533333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.19816721975803375, "learning_rate": 1e-06, "loss": -0.0077, "num_tokens": 1986721.0, "reward": 2.2232859134674072, "reward_std": 0.4856342077255249, "rewards/cosine_scaled_reward/mean": 0.48364967107772827, "rewards/cosine_scaled_reward/std": 0.32795608043670654, "rewards/repetition_penalty_reward/mean": -0.07364509999752045, "rewards/repetition_penalty_reward/std": 0.055652402341365814, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.81640625, "rewards/reward_reference/std": 0.387910932302475, "step": 4 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 3592.0, "completions/max_terminated_length": 3592.0, "completions/mean_length": 1338.8125, "completions/mean_terminated_length": 1338.8125, "completions/min_length": 570.0, "completions/min_terminated_length": 570.0, "epoch": 0.0010666666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.13079342246055603, "learning_rate": 1e-06, "loss": -0.0036, "num_tokens": 2486485.0, "reward": 2.3322033882141113, "reward_std": 0.22138240933418274, "rewards/cosine_scaled_reward/mean": 0.5233694314956665, "rewards/cosine_scaled_reward/std": 0.27166542410850525, "rewards/repetition_penalty_reward/mean": -0.06616615504026413, "rewards/repetition_penalty_reward/std": 0.04239708185195923, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.875, "rewards/reward_reference/std": 0.33136674761772156, "step": 5 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3989.0, "completions/mean_length": 1401.1953125, "completions/mean_terminated_length": 1379.976318359375, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 0.00128, "frac_reward_zero_std": 0.0, "grad_norm": 0.11242787539958954, "learning_rate": 1e-06, "loss": -0.0081, "num_tokens": 2997299.0, "reward": 2.324979066848755, "reward_std": 0.17272153496742249, "rewards/cosine_scaled_reward/mean": 0.5257084369659424, "rewards/cosine_scaled_reward/std": 0.2853335738182068, "rewards/repetition_penalty_reward/mean": -0.06869829446077347, "rewards/repetition_penalty_reward/std": 0.05674975365400314, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.87109375, "rewards/reward_reference/std": 0.33575257658958435, "step": 6 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 3755.0, "completions/max_terminated_length": 3755.0, "completions/mean_length": 1290.30859375, "completions/mean_terminated_length": 1290.30859375, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.0014933333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.09726474434137344, "learning_rate": 1e-06, "loss": 0.0066, "num_tokens": 3489290.0, "reward": 2.472348928451538, "reward_std": 0.11173544079065323, "rewards/cosine_scaled_reward/mean": 0.5793841481208801, "rewards/cosine_scaled_reward/std": 0.1878894716501236, "rewards/repetition_penalty_reward/mean": -0.0679725855588913, "rewards/repetition_penalty_reward/std": 0.05107080563902855, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9609375, "rewards/reward_reference/std": 0.19412322342395782, "step": 7 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4009.0, "completions/mean_length": 1331.0390625, "completions/mean_terminated_length": 1320.1961669921875, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 0.0017066666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.10814463347196579, "learning_rate": 1e-06, "loss": -0.0024, "num_tokens": 3989000.0, "reward": 2.4125254154205322, "reward_std": 0.13539515435695648, "rewards/cosine_scaled_reward/mean": 0.5574886202812195, "rewards/cosine_scaled_reward/std": 0.22672739624977112, "rewards/repetition_penalty_reward/mean": -0.06683817505836487, "rewards/repetition_penalty_reward/std": 0.042216457426548004, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.921875, "rewards/reward_reference/std": 0.26889389753341675, "step": 8 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 3870.0, "completions/max_terminated_length": 3870.0, "completions/mean_length": 1249.4140625, "completions/mean_terminated_length": 1249.4140625, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.00192, "frac_reward_zero_std": 0.0, "grad_norm": 0.0961054190993309, "learning_rate": 1e-06, "loss": 0.0157, "num_tokens": 4473702.0, "reward": 2.467017650604248, "reward_std": 0.11271210014820099, "rewards/cosine_scaled_reward/mean": 0.5698755979537964, "rewards/cosine_scaled_reward/std": 0.18185605108737946, "rewards/repetition_penalty_reward/mean": -0.05988934636116028, "rewards/repetition_penalty_reward/std": 0.037777043879032135, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.95703125, "rewards/reward_reference/std": 0.20318391919136047, "step": 9 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 3699.0, "completions/max_terminated_length": 3699.0, "completions/mean_length": 1270.58984375, "completions/mean_terminated_length": 1270.58984375, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.0021333333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.10294565558433533, "learning_rate": 1e-06, "loss": 0.005, "num_tokens": 4969001.0, "reward": 2.4084889888763428, "reward_std": 0.12390808016061783, "rewards/cosine_scaled_reward/mean": 0.543577253818512, "rewards/cosine_scaled_reward/std": 0.2268955409526825, "rewards/repetition_penalty_reward/mean": -0.05696332827210426, "rewards/repetition_penalty_reward/std": 0.033603210002183914, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.921875, "rewards/reward_reference/std": 0.26889389753341675, "step": 10 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3572.0, "completions/mean_length": 1343.6171875, "completions/mean_terminated_length": 1321.94482421875, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.0023466666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.10278859734535217, "learning_rate": 1e-06, "loss": -0.018, "num_tokens": 5464207.0, "reward": 2.426042318344116, "reward_std": 0.14806506037712097, "rewards/cosine_scaled_reward/mean": 0.560874342918396, "rewards/cosine_scaled_reward/std": 0.22602853178977966, "rewards/repetition_penalty_reward/mean": -0.06530068069696426, "rewards/repetition_penalty_reward/std": 0.048898182809352875, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 11 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3606.0, "completions/mean_length": 1317.421875, "completions/mean_terminated_length": 1306.5255126953125, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 0.00256, "frac_reward_zero_std": 0.0, "grad_norm": 0.1048831194639206, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 5956883.0, "reward": 2.4089107513427734, "reward_std": 0.14584854245185852, "rewards/cosine_scaled_reward/mean": 0.5502076148986816, "rewards/cosine_scaled_reward/std": 0.2330145388841629, "rewards/repetition_penalty_reward/mean": -0.06707821041345596, "rewards/repetition_penalty_reward/std": 0.045541103929281235, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.92578125, "rewards/reward_reference/std": 0.2626400291919708, "step": 12 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4063.0, "completions/mean_length": 1363.87109375, "completions/mean_terminated_length": 1320.5040283203125, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.0027733333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.1066160574555397, "learning_rate": 1e-06, "loss": -0.0122, "num_tokens": 6444286.0, "reward": 2.264991044998169, "reward_std": 0.16080796718597412, "rewards/cosine_scaled_reward/mean": 0.4897496700286865, "rewards/cosine_scaled_reward/std": 0.30944791436195374, "rewards/repetition_penalty_reward/mean": -0.07788346707820892, "rewards/repetition_penalty_reward/std": 0.07128535211086273, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.859375, "rewards/reward_reference/std": 0.3483152687549591, "step": 13 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3437.0, "completions/mean_length": 1307.9140625, "completions/mean_terminated_length": 1296.98046875, "completions/min_length": 565.0, "completions/min_terminated_length": 565.0, "epoch": 0.0029866666666666665, "frac_reward_zero_std": 0.0, "grad_norm": 0.1263291835784912, "learning_rate": 1e-06, "loss": 0.0038, "num_tokens": 6929804.0, "reward": 2.425128936767578, "reward_std": 0.18342849612236023, "rewards/cosine_scaled_reward/mean": 0.5549286603927612, "rewards/cosine_scaled_reward/std": 0.22249378263950348, "rewards/repetition_penalty_reward/mean": -0.0633934885263443, "rewards/repetition_penalty_reward/std": 0.0460335873067379, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 14 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4006.0, "completions/mean_length": 1294.0390625, "completions/mean_terminated_length": 1283.051025390625, "completions/min_length": 523.0, "completions/min_terminated_length": 523.0, "epoch": 0.0032, "frac_reward_zero_std": 0.0, "grad_norm": 0.08281808346509933, "learning_rate": 1e-06, "loss": -0.0017, "num_tokens": 7419574.0, "reward": 2.4733660221099854, "reward_std": 0.09956402331590652, "rewards/cosine_scaled_reward/mean": 0.5843473076820374, "rewards/cosine_scaled_reward/std": 0.1703619360923767, "rewards/repetition_penalty_reward/mean": -0.06801241636276245, "rewards/repetition_penalty_reward/std": 0.055640317499637604, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.95703125, "rewards/reward_reference/std": 0.20318391919136047, "step": 15 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 3652.0, "completions/max_terminated_length": 3652.0, "completions/mean_length": 1338.58984375, "completions/mean_terminated_length": 1338.58984375, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "epoch": 0.0034133333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.09150482714176178, "learning_rate": 1e-06, "loss": 0.0128, "num_tokens": 7930145.0, "reward": 2.4295272827148438, "reward_std": 0.1320231556892395, "rewards/cosine_scaled_reward/mean": 0.5594995021820068, "rewards/cosine_scaled_reward/std": 0.22843553125858307, "rewards/repetition_penalty_reward/mean": -0.06356588006019592, "rewards/repetition_penalty_reward/std": 0.044449321925640106, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 16 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 1384.94921875, "completions/mean_terminated_length": 1374.3177490234375, "completions/min_length": 541.0, "completions/min_terminated_length": 541.0, "epoch": 0.0036266666666666665, "frac_reward_zero_std": 0.0, "grad_norm": 0.12217939645051956, "learning_rate": 1e-06, "loss": 0.0034, "num_tokens": 8441464.0, "reward": 2.330857753753662, "reward_std": 0.18522366881370544, "rewards/cosine_scaled_reward/mean": 0.5207034945487976, "rewards/cosine_scaled_reward/std": 0.2845655679702759, "rewards/repetition_penalty_reward/mean": -0.06875194609165192, "rewards/repetition_penalty_reward/std": 0.04453708976507187, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.87890625, "rewards/reward_reference/std": 0.3268752694129944, "step": 17 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3870.0, "completions/mean_length": 1400.40234375, "completions/mean_terminated_length": 1389.8314208984375, "completions/min_length": 638.0, "completions/min_terminated_length": 638.0, "epoch": 0.00384, "frac_reward_zero_std": 0.0, "grad_norm": 0.09449199587106705, "learning_rate": 1e-06, "loss": -0.0024, "num_tokens": 8958591.0, "reward": 2.371668577194214, "reward_std": 0.15123680233955383, "rewards/cosine_scaled_reward/mean": 0.5336047410964966, "rewards/cosine_scaled_reward/std": 0.2732281982898712, "rewards/repetition_penalty_reward/mean": -0.06506102532148361, "rewards/repetition_penalty_reward/std": 0.04685278609395027, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.90625, "rewards/reward_reference/std": 0.2920515835285187, "step": 18 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3348.0, "completions/mean_length": 1337.62109375, "completions/mean_terminated_length": 1304.9130859375, "completions/min_length": 593.0, "completions/min_terminated_length": 593.0, "epoch": 0.004053333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.09695859253406525, "learning_rate": 1e-06, "loss": -0.0154, "num_tokens": 9451434.0, "reward": 2.4069466590881348, "reward_std": 0.11520685255527496, "rewards/cosine_scaled_reward/mean": 0.5477147102355957, "rewards/cosine_scaled_reward/std": 0.24599869549274445, "rewards/repetition_penalty_reward/mean": -0.06733058393001556, "rewards/repetition_penalty_reward/std": 0.055632736533880234, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.9296875, "rewards/reward_reference/std": 0.2561737895011902, "step": 19 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 3706.0, "completions/max_terminated_length": 3706.0, "completions/mean_length": 1414.38671875, "completions/mean_terminated_length": 1414.38671875, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "epoch": 0.004266666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.11205258220434189, "learning_rate": 1e-06, "loss": -0.0116, "num_tokens": 9976625.0, "reward": 2.3993844985961914, "reward_std": 0.15085497498512268, "rewards/cosine_scaled_reward/mean": 0.5560339093208313, "rewards/cosine_scaled_reward/std": 0.25779932737350464, "rewards/repetition_penalty_reward/mean": -0.06680548191070557, "rewards/repetition_penalty_reward/std": 0.0472581572830677, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.91015625, "rewards/reward_reference/std": 0.2865179479122162, "step": 20 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3791.0, "completions/mean_length": 1426.796875, "completions/mean_terminated_length": 1384.4287109375, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 0.00448, "frac_reward_zero_std": 0.0, "grad_norm": 0.09845885634422302, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 10482365.0, "reward": 2.345158338546753, "reward_std": 0.14726582169532776, "rewards/cosine_scaled_reward/mean": 0.5292760133743286, "rewards/cosine_scaled_reward/std": 0.2867335379123688, "rewards/repetition_penalty_reward/mean": -0.07630515843629837, "rewards/repetition_penalty_reward/std": 0.05343927443027496, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.8984375, "rewards/reward_reference/std": 0.3026638329029083, "step": 21 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 3537.0, "completions/max_terminated_length": 3537.0, "completions/mean_length": 1426.203125, "completions/mean_terminated_length": 1426.203125, "completions/min_length": 525.0, "completions/min_terminated_length": 525.0, "epoch": 0.004693333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.1269364356994629, "learning_rate": 1e-06, "loss": 0.0115, "num_tokens": 11005109.0, "reward": 2.360685348510742, "reward_std": 0.18901582062244415, "rewards/cosine_scaled_reward/mean": 0.5373590588569641, "rewards/cosine_scaled_reward/std": 0.278967946767807, "rewards/repetition_penalty_reward/mean": -0.0712050348520279, "rewards/repetition_penalty_reward/std": 0.05502448230981827, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.89453125, "rewards/reward_reference/std": 0.3077581524848938, "step": 22 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.75, "completions/max_length": 4096.0, "completions/max_terminated_length": 4044.0, "completions/mean_length": 1537.12890625, "completions/mean_terminated_length": 1454.5845947265625, "completions/min_length": 592.0, "completions/min_terminated_length": 592.0, "epoch": 0.004906666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.1407482624053955, "learning_rate": 1e-06, "loss": 0.0013, "num_tokens": 11524066.0, "reward": 2.2200188636779785, "reward_std": 0.21890655159950256, "rewards/cosine_scaled_reward/mean": 0.48634082078933716, "rewards/cosine_scaled_reward/std": 0.3508237600326538, "rewards/repetition_penalty_reward/mean": -0.08272843062877655, "rewards/repetition_penalty_reward/std": 0.07371426373720169, "rewards/reward_format/mean": 0.984375, "rewards/reward_format/std": 0.11092304438352585, "rewards/reward_reference/mean": 0.83203125, "rewards/reward_reference/std": 0.3745708465576172, "step": 23 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3893.0, "completions/mean_length": 1413.125, "completions/mean_terminated_length": 1370.539794921875, "completions/min_length": 591.0, "completions/min_terminated_length": 591.0, "epoch": 0.00512, "frac_reward_zero_std": 0.0, "grad_norm": 0.09089359641075134, "learning_rate": 1e-06, "loss": -0.0207, "num_tokens": 12036734.0, "reward": 2.3405649662017822, "reward_std": 0.13450174033641815, "rewards/cosine_scaled_reward/mean": 0.522262454032898, "rewards/cosine_scaled_reward/std": 0.2896619141101837, "rewards/repetition_penalty_reward/mean": -0.06919749081134796, "rewards/repetition_penalty_reward/std": 0.05012732744216919, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.890625, "rewards/reward_reference/std": 0.31272050738334656, "step": 24 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2961.0, "completions/max_terminated_length": 2961.0, "completions/mean_length": 1314.49609375, "completions/mean_terminated_length": 1314.49609375, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 0.005333333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.06929586082696915, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 12535273.0, "reward": 2.4096481800079346, "reward_std": 0.09283026307821274, "rewards/cosine_scaled_reward/mean": 0.5563814640045166, "rewards/cosine_scaled_reward/std": 0.22016239166259766, "rewards/repetition_penalty_reward/mean": -0.06470194458961487, "rewards/repetition_penalty_reward/std": 0.042412176728248596, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.91796875, "rewards/reward_reference/std": 0.2749498784542084, "step": 25 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.78125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3672.0, "completions/mean_length": 1402.0390625, "completions/mean_terminated_length": 1326.30517578125, "completions/min_length": 574.0, "completions/min_terminated_length": 574.0, "epoch": 0.005546666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.10325585305690765, "learning_rate": 1e-06, "loss": -0.0165, "num_tokens": 13034299.0, "reward": 2.3359270095825195, "reward_std": 0.13954925537109375, "rewards/cosine_scaled_reward/mean": 0.5185195803642273, "rewards/cosine_scaled_reward/std": 0.2903260290622711, "rewards/repetition_penalty_reward/mean": -0.07399865984916687, "rewards/repetition_penalty_reward/std": 0.06387010216712952, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.89453125, "rewards/reward_reference/std": 0.3077581524848938, "step": 26 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2552.0, "completions/max_terminated_length": 2552.0, "completions/mean_length": 1295.125, "completions/mean_terminated_length": 1295.125, "completions/min_length": 543.0, "completions/min_terminated_length": 543.0, "epoch": 0.00576, "frac_reward_zero_std": 0.0, "grad_norm": 0.1057233065366745, "learning_rate": 1e-06, "loss": -0.0065, "num_tokens": 13528279.0, "reward": 2.425154447555542, "reward_std": 0.12597481906414032, "rewards/cosine_scaled_reward/mean": 0.5589825510978699, "rewards/cosine_scaled_reward/std": 0.20628339052200317, "rewards/repetition_penalty_reward/mean": -0.05960933491587639, "rewards/repetition_penalty_reward/std": 0.027123799547553062, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.92578125, "rewards/reward_reference/std": 0.2626400291919708, "step": 27 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4035.0, "completions/mean_length": 1445.77734375, "completions/mean_terminated_length": 1435.3843994140625, "completions/min_length": 601.0, "completions/min_terminated_length": 601.0, "epoch": 0.005973333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.13786497712135315, "learning_rate": 1e-06, "loss": 0.0026, "num_tokens": 14062230.0, "reward": 2.406646251678467, "reward_std": 0.22535404562950134, "rewards/cosine_scaled_reward/mean": 0.561261773109436, "rewards/cosine_scaled_reward/std": 0.25999942421913147, "rewards/repetition_penalty_reward/mean": -0.06945927441120148, "rewards/repetition_penalty_reward/std": 0.053039710968732834, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.91796875, "rewards/reward_reference/std": 0.2749498784542084, "step": 28 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4093.0, "completions/mean_length": 1456.67578125, "completions/mean_terminated_length": 1435.8936767578125, "completions/min_length": 607.0, "completions/min_terminated_length": 607.0, "epoch": 0.006186666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.12175247073173523, "learning_rate": 1e-06, "loss": -0.0315, "num_tokens": 14588299.0, "reward": 2.405393123626709, "reward_std": 0.19517257809638977, "rewards/cosine_scaled_reward/mean": 0.5614031553268433, "rewards/cosine_scaled_reward/std": 0.26180750131607056, "rewards/repetition_penalty_reward/mean": -0.07085388153791428, "rewards/repetition_penalty_reward/std": 0.05317673459649086, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.91796875, "rewards/reward_reference/std": 0.2749498784542084, "step": 29 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3379.0, "completions/mean_length": 1438.1015625, "completions/mean_terminated_length": 1417.1732177734375, "completions/min_length": 574.0, "completions/min_terminated_length": 574.0, "epoch": 0.0064, "frac_reward_zero_std": 0.0, "grad_norm": 0.12587440013885498, "learning_rate": 1e-06, "loss": -0.0077, "num_tokens": 15107513.0, "reward": 2.3349556922912598, "reward_std": 0.2020719051361084, "rewards/cosine_scaled_reward/mean": 0.5276631116867065, "rewards/cosine_scaled_reward/std": 0.29128366708755493, "rewards/repetition_penalty_reward/mean": -0.06770722568035126, "rewards/repetition_penalty_reward/std": 0.04827690124511719, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.875, "rewards/reward_reference/std": 0.33136674761772156, "step": 30 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3383.0, "completions/mean_length": 1442.3125, "completions/mean_terminated_length": 1431.906005859375, "completions/min_length": 644.0, "completions/min_terminated_length": 644.0, "epoch": 0.006613333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.1460960954427719, "learning_rate": 1e-06, "loss": -0.0041, "num_tokens": 15636525.0, "reward": 2.3774428367614746, "reward_std": 0.21828415989875793, "rewards/cosine_scaled_reward/mean": 0.5441263914108276, "rewards/cosine_scaled_reward/std": 0.2732025682926178, "rewards/repetition_penalty_reward/mean": -0.06512115895748138, "rewards/repetition_penalty_reward/std": 0.04758916050195694, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.8984375, "rewards/reward_reference/std": 0.3026638329029083, "step": 31 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3752.0, "completions/mean_length": 1484.0546875, "completions/mean_terminated_length": 1463.4881591796875, "completions/min_length": 617.0, "completions/min_terminated_length": 617.0, "epoch": 0.006826666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.14973130822181702, "learning_rate": 1e-06, "loss": -0.0236, "num_tokens": 16167843.0, "reward": 2.349381685256958, "reward_std": 0.24881769716739655, "rewards/cosine_scaled_reward/mean": 0.5373992919921875, "rewards/cosine_scaled_reward/std": 0.2915312647819519, "rewards/repetition_penalty_reward/mean": -0.07161141186952591, "rewards/repetition_penalty_reward/std": 0.054504405707120895, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.88671875, "rewards/reward_reference/std": 0.31755712628364563, "step": 32 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3925.0, "completions/mean_length": 1444.78515625, "completions/mean_terminated_length": 1423.909423828125, "completions/min_length": 515.0, "completions/min_terminated_length": 515.0, "epoch": 0.00704, "frac_reward_zero_std": 0.0, "grad_norm": 0.10494732111692429, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 16693652.0, "reward": 2.421522855758667, "reward_std": 0.1476818323135376, "rewards/cosine_scaled_reward/mean": 0.567467451095581, "rewards/cosine_scaled_reward/std": 0.249455064535141, "rewards/repetition_penalty_reward/mean": -0.07172583043575287, "rewards/repetition_penalty_reward/std": 0.057648930698633194, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.92578125, "rewards/reward_reference/std": 0.2626400291919708, "step": 33 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 3803.0, "completions/max_terminated_length": 3803.0, "completions/mean_length": 1356.015625, "completions/mean_terminated_length": 1356.015625, "completions/min_length": 602.0, "completions/min_terminated_length": 602.0, "epoch": 0.007253333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.09943025559186935, "learning_rate": 1e-06, "loss": -0.001, "num_tokens": 17200788.0, "reward": 2.4666028022766113, "reward_std": 0.14464417099952698, "rewards/cosine_scaled_reward/mean": 0.5810573101043701, "rewards/cosine_scaled_reward/std": 0.1915411502122879, "rewards/repetition_penalty_reward/mean": -0.05976710468530655, "rewards/repetition_penalty_reward/std": 0.03395003080368042, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9453125, "rewards/reward_reference/std": 0.22781464457511902, "step": 34 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3482.0, "completions/mean_length": 1397.02734375, "completions/mean_terminated_length": 1375.775634765625, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 0.007466666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.10014528036117554, "learning_rate": 1e-06, "loss": -0.0077, "num_tokens": 17711819.0, "reward": 2.452289342880249, "reward_std": 0.15739864110946655, "rewards/cosine_scaled_reward/mean": 0.574344277381897, "rewards/cosine_scaled_reward/std": 0.22381597757339478, "rewards/repetition_penalty_reward/mean": -0.06424228847026825, "rewards/repetition_penalty_reward/std": 0.04820853844285011, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.9453125, "rewards/reward_reference/std": 0.22781464457511902, "step": 35 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3419.0, "completions/mean_length": 1457.8203125, "completions/mean_terminated_length": 1437.0472412109375, "completions/min_length": 576.0, "completions/min_terminated_length": 576.0, "epoch": 0.00768, "frac_reward_zero_std": 0.0, "grad_norm": 0.12512792646884918, "learning_rate": 1e-06, "loss": -0.0205, "num_tokens": 18241569.0, "reward": 2.326671600341797, "reward_std": 0.20083504915237427, "rewards/cosine_scaled_reward/mean": 0.5292975306510925, "rewards/cosine_scaled_reward/std": 0.2945072054862976, "rewards/repetition_penalty_reward/mean": -0.074501171708107, "rewards/repetition_penalty_reward/std": 0.055338870733976364, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.875, "rewards/reward_reference/std": 0.33136674761772156, "step": 36 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3618.0, "completions/mean_length": 1373.0703125, "completions/mean_terminated_length": 1362.3922119140625, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 0.007893333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.08265010267496109, "learning_rate": 1e-06, "loss": -0.0097, "num_tokens": 18748431.0, "reward": 2.423821449279785, "reward_std": 0.1160140410065651, "rewards/cosine_scaled_reward/mean": 0.5647482872009277, "rewards/cosine_scaled_reward/std": 0.2289552241563797, "rewards/repetition_penalty_reward/mean": -0.06280169636011124, "rewards/repetition_penalty_reward/std": 0.04026205465197563, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.921875, "rewards/reward_reference/std": 0.26889389753341675, "step": 37 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3864.0, "completions/mean_length": 1464.5546875, "completions/mean_terminated_length": 1443.8345947265625, "completions/min_length": 629.0, "completions/min_terminated_length": 629.0, "epoch": 0.008106666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.21051424741744995, "learning_rate": 1e-06, "loss": -0.0092, "num_tokens": 19270157.0, "reward": 2.389101028442383, "reward_std": 0.2080249786376953, "rewards/cosine_scaled_reward/mean": 0.5605127811431885, "rewards/cosine_scaled_reward/std": 0.25950494408607483, "rewards/repetition_penalty_reward/mean": -0.07141192257404327, "rewards/repetition_penalty_reward/std": 0.05478326603770256, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.90625, "rewards/reward_reference/std": 0.2920515835285187, "step": 38 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 4076.0, "completions/max_terminated_length": 4076.0, "completions/mean_length": 1544.125, "completions/mean_terminated_length": 1544.125, "completions/min_length": 640.0, "completions/min_terminated_length": 640.0, "epoch": 0.00832, "frac_reward_zero_std": 0.0, "grad_norm": 0.13599231839179993, "learning_rate": 1e-06, "loss": 0.0083, "num_tokens": 19814469.0, "reward": 2.3371737003326416, "reward_std": 0.21445316076278687, "rewards/cosine_scaled_reward/mean": 0.5401486158370972, "rewards/cosine_scaled_reward/std": 0.30538463592529297, "rewards/repetition_penalty_reward/mean": -0.07797486335039139, "rewards/repetition_penalty_reward/std": 0.05641409009695053, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.875, "rewards/reward_reference/std": 0.33136674761772156, "step": 39 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3719.0, "completions/mean_length": 1509.828125, "completions/mean_terminated_length": 1489.464599609375, "completions/min_length": 649.0, "completions/min_terminated_length": 649.0, "epoch": 0.008533333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.12129685282707214, "learning_rate": 1e-06, "loss": -0.0166, "num_tokens": 20351553.0, "reward": 2.3992435932159424, "reward_std": 0.21829761564731598, "rewards/cosine_scaled_reward/mean": 0.5619106888771057, "rewards/cosine_scaled_reward/std": 0.2718035876750946, "rewards/repetition_penalty_reward/mean": -0.07282336056232452, "rewards/repetition_penalty_reward/std": 0.0561990961432457, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.91015625, "rewards/reward_reference/std": 0.2865179479122162, "step": 40 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3967.0, "completions/mean_length": 1472.12109375, "completions/mean_terminated_length": 1461.8314208984375, "completions/min_length": 641.0, "completions/min_terminated_length": 641.0, "epoch": 0.008746666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.1293008178472519, "learning_rate": 1e-06, "loss": -0.0028, "num_tokens": 20887544.0, "reward": 2.3384013175964355, "reward_std": 0.20636960864067078, "rewards/cosine_scaled_reward/mean": 0.5350905060768127, "rewards/cosine_scaled_reward/std": 0.29096055030822754, "rewards/repetition_penalty_reward/mean": -0.07168925553560257, "rewards/repetition_penalty_reward/std": 0.04815354570746422, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.875, "rewards/reward_reference/std": 0.33136674761772156, "step": 41 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.78125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 1545.1015625, "completions/mean_terminated_length": 1473.3895263671875, "completions/min_length": 778.0, "completions/min_terminated_length": 778.0, "epoch": 0.00896, "frac_reward_zero_std": 0.0, "grad_norm": 0.1378139704465866, "learning_rate": 1e-06, "loss": 0.0222, "num_tokens": 21419114.0, "reward": 2.301651954650879, "reward_std": 0.1993783414363861, "rewards/cosine_scaled_reward/mean": 0.521456241607666, "rewards/cosine_scaled_reward/std": 0.3168460726737976, "rewards/repetition_penalty_reward/mean": -0.07058563828468323, "rewards/repetition_penalty_reward/std": 0.05316044017672539, "rewards/reward_format/mean": 0.9874999523162842, "rewards/reward_format/std": 0.0994100272655487, "rewards/reward_reference/mean": 0.86328125, "rewards/reward_reference/std": 0.34422317147254944, "step": 42 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3695.0, "completions/mean_length": 1472.265625, "completions/mean_terminated_length": 1461.9765625, "completions/min_length": 690.0, "completions/min_terminated_length": 690.0, "epoch": 0.009173333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.10675688087940216, "learning_rate": 1e-06, "loss": -0.0072, "num_tokens": 21958298.0, "reward": 2.4147229194641113, "reward_std": 0.14962312579154968, "rewards/cosine_scaled_reward/mean": 0.5648523569107056, "rewards/cosine_scaled_reward/std": 0.2577730417251587, "rewards/repetition_penalty_reward/mean": -0.06497295200824738, "rewards/repetition_penalty_reward/std": 0.04444069415330887, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.91796875, "rewards/reward_reference/std": 0.2749498784542084, "step": 43 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3991.0, "completions/mean_length": 1509.57421875, "completions/mean_terminated_length": 1478.9051513671875, "completions/min_length": 543.0, "completions/min_terminated_length": 543.0, "epoch": 0.009386666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.10473625361919403, "learning_rate": 1e-06, "loss": 0.0025, "num_tokens": 22491545.0, "reward": 2.3497352600097656, "reward_std": 0.1567041128873825, "rewards/cosine_scaled_reward/mean": 0.5374966859817505, "rewards/cosine_scaled_reward/std": 0.2960168719291687, "rewards/repetition_penalty_reward/mean": -0.06666764616966248, "rewards/repetition_penalty_reward/std": 0.04307934269309044, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.87890625, "rewards/reward_reference/std": 0.3268752694129944, "step": 44 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.8125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 1599.6640625, "completions/mean_terminated_length": 1539.7520751953125, "completions/min_length": 780.0, "completions/min_terminated_length": 780.0, "epoch": 0.0096, "frac_reward_zero_std": 0.0, "grad_norm": 0.11697933822870255, "learning_rate": 1e-06, "loss": -0.0498, "num_tokens": 23036343.0, "reward": 2.4010181427001953, "reward_std": 0.23659676313400269, "rewards/cosine_scaled_reward/mean": 0.5671864151954651, "rewards/cosine_scaled_reward/std": 0.28708207607269287, "rewards/repetition_penalty_reward/mean": -0.0739808902144432, "rewards/repetition_penalty_reward/std": 0.05830969288945198, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.9140625, "rewards/reward_reference/std": 0.28082075715065, "step": 45 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3781.0, "completions/mean_length": 1523.109375, "completions/mean_terminated_length": 1502.850341796875, "completions/min_length": 728.0, "completions/min_terminated_length": 728.0, "epoch": 0.009813333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.13121408224105835, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 23589571.0, "reward": 2.303783893585205, "reward_std": 0.2267310470342636, "rewards/cosine_scaled_reward/mean": 0.5189235806465149, "rewards/cosine_scaled_reward/std": 0.31321921944618225, "rewards/repetition_penalty_reward/mean": -0.06748341023921967, "rewards/repetition_penalty_reward/std": 0.05285539850592613, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.85546875, "rewards/reward_reference/std": 0.35231640934944153, "step": 46 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 3932.0, "completions/max_terminated_length": 3932.0, "completions/mean_length": 1465.68359375, "completions/mean_terminated_length": 1465.68359375, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.010026666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.12762288749217987, "learning_rate": 1e-06, "loss": 0.0015, "num_tokens": 24123094.0, "reward": 2.360924482345581, "reward_std": 0.1576913297176361, "rewards/cosine_scaled_reward/mean": 0.5412070751190186, "rewards/cosine_scaled_reward/std": 0.279924601316452, "rewards/repetition_penalty_reward/mean": -0.06309500336647034, "rewards/repetition_penalty_reward/std": 0.04309338703751564, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.8828125, "rewards/reward_reference/std": 0.3222736418247223, "step": 47 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 1575.7734375, "completions/mean_terminated_length": 1545.889404296875, "completions/min_length": 760.0, "completions/min_terminated_length": 760.0, "epoch": 0.01024, "frac_reward_zero_std": 0.0, "grad_norm": 0.1287793666124344, "learning_rate": 1e-06, "loss": -0.0152, "num_tokens": 24679392.0, "reward": 2.373932361602783, "reward_std": 0.24167510867118835, "rewards/cosine_scaled_reward/mean": 0.5552927255630493, "rewards/cosine_scaled_reward/std": 0.2919711470603943, "rewards/repetition_penalty_reward/mean": -0.07198527455329895, "rewards/repetition_penalty_reward/std": 0.052775539457798004, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.890625, "rewards/reward_reference/std": 0.31272050738334656, "step": 48 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.71875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3708.0, "completions/mean_length": 1634.5, "completions/mean_terminated_length": 1544.809814453125, "completions/min_length": 644.0, "completions/min_terminated_length": 644.0, "epoch": 0.010453333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.12446154654026031, "learning_rate": 1e-06, "loss": -0.0409, "num_tokens": 25222184.0, "reward": 2.3497486114501953, "reward_std": 0.2241676151752472, "rewards/cosine_scaled_reward/mean": 0.550166130065918, "rewards/cosine_scaled_reward/std": 0.31175705790519714, "rewards/repetition_penalty_reward/mean": -0.0793239176273346, "rewards/repetition_penalty_reward/std": 0.06476996839046478, "rewards/reward_format/mean": 0.984375, "rewards/reward_format/std": 0.11092304438352585, "rewards/reward_reference/mean": 0.89453125, "rewards/reward_reference/std": 0.3077581524848938, "step": 49 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4042.0, "completions/mean_length": 1526.890625, "completions/mean_terminated_length": 1475.713134765625, "completions/min_length": 657.0, "completions/min_terminated_length": 657.0, "epoch": 0.010666666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.09193289279937744, "learning_rate": 1e-06, "loss": -0.011, "num_tokens": 25756788.0, "reward": 2.4244213104248047, "reward_std": 0.15554851293563843, "rewards/cosine_scaled_reward/mean": 0.5812386274337769, "rewards/cosine_scaled_reward/std": 0.25069162249565125, "rewards/repetition_penalty_reward/mean": -0.0724422037601471, "rewards/repetition_penalty_reward/std": 0.05959644913673401, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.921875, "rewards/reward_reference/std": 0.26889389753341675, "step": 50 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4044.0, "completions/mean_length": 1562.3515625, "completions/mean_terminated_length": 1522.135009765625, "completions/min_length": 590.0, "completions/min_terminated_length": 590.0, "epoch": 0.01088, "frac_reward_zero_std": 0.0, "grad_norm": 0.13995690643787384, "learning_rate": 1e-06, "loss": 0.027, "num_tokens": 26302410.0, "reward": 2.387965679168701, "reward_std": 0.20636487007141113, "rewards/cosine_scaled_reward/mean": 0.5577713251113892, "rewards/cosine_scaled_reward/std": 0.2876478433609009, "rewards/repetition_penalty_reward/mean": -0.06589921563863754, "rewards/repetition_penalty_reward/std": 0.049709804356098175, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.90234375, "rewards/reward_reference/std": 0.29743078351020813, "step": 51 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4010.0, "completions/mean_length": 1536.3125, "completions/mean_terminated_length": 1505.9605712890625, "completions/min_length": 626.0, "completions/min_terminated_length": 626.0, "epoch": 0.011093333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.12113416194915771, "learning_rate": 1e-06, "loss": -0.0104, "num_tokens": 26855006.0, "reward": 2.423652410507202, "reward_std": 0.17835842072963715, "rewards/cosine_scaled_reward/mean": 0.5704998970031738, "rewards/cosine_scaled_reward/std": 0.2632061541080475, "rewards/repetition_penalty_reward/mean": -0.06559744477272034, "rewards/repetition_penalty_reward/std": 0.05109817534685135, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.921875, "rewards/reward_reference/std": 0.26889389753341675, "step": 52 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.75, "completions/max_length": 4096.0, "completions/max_terminated_length": 3876.0, "completions/mean_length": 1622.54296875, "completions/mean_terminated_length": 1542.7540283203125, "completions/min_length": 839.0, "completions/min_terminated_length": 839.0, "epoch": 0.011306666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.1375443935394287, "learning_rate": 1e-06, "loss": -0.0355, "num_tokens": 27399657.0, "reward": 2.3556900024414062, "reward_std": 0.13406921923160553, "rewards/cosine_scaled_reward/mean": 0.542635440826416, "rewards/cosine_scaled_reward/std": 0.31481704115867615, "rewards/repetition_penalty_reward/mean": -0.07210142910480499, "rewards/repetition_penalty_reward/std": 0.06866049021482468, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626226335763931, "rewards/reward_reference/mean": 0.89453125, "rewards/reward_reference/std": 0.3077581524848938, "step": 53 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3825.0, "completions/mean_length": 1507.78125, "completions/mean_terminated_length": 1487.401611328125, "completions/min_length": 733.0, "completions/min_terminated_length": 733.0, "epoch": 0.01152, "frac_reward_zero_std": 0.0, "grad_norm": 0.09410291910171509, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 27940741.0, "reward": 2.426107406616211, "reward_std": 0.12273335456848145, "rewards/cosine_scaled_reward/mean": 0.5737569332122803, "rewards/cosine_scaled_reward/std": 0.24874204397201538, "rewards/repetition_penalty_reward/mean": -0.0624934583902359, "rewards/repetition_penalty_reward/std": 0.04799724370241165, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.91796875, "rewards/reward_reference/std": 0.2749498784542084, "step": 54 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3801.0, "completions/mean_length": 1612.08203125, "completions/mean_terminated_length": 1602.34130859375, "completions/min_length": 801.0, "completions/min_terminated_length": 801.0, "epoch": 0.011733333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.12277078628540039, "learning_rate": 1e-06, "loss": 0.0149, "num_tokens": 28519370.0, "reward": 2.3675689697265625, "reward_std": 0.20383627712726593, "rewards/cosine_scaled_reward/mean": 0.5437647104263306, "rewards/cosine_scaled_reward/std": 0.3121073544025421, "rewards/repetition_penalty_reward/mean": -0.06682077050209045, "rewards/repetition_penalty_reward/std": 0.04597178474068642, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.890625, "rewards/reward_reference/std": 0.31272050738334656, "step": 55 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3848.0, "completions/mean_length": 1528.94140625, "completions/mean_terminated_length": 1518.8746337890625, "completions/min_length": 734.0, "completions/min_terminated_length": 734.0, "epoch": 0.011946666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.06064042076468468, "learning_rate": 1e-06, "loss": -0.0061, "num_tokens": 29074019.0, "reward": 2.4705758094787598, "reward_std": 0.09662184119224548, "rewards/cosine_scaled_reward/mean": 0.5923464298248291, "rewards/cosine_scaled_reward/std": 0.23122240602970123, "rewards/repetition_penalty_reward/mean": -0.05927072837948799, "rewards/repetition_penalty_reward/std": 0.036969270557165146, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9375, "rewards/reward_reference/std": 0.24253563582897186, "step": 56 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3724.0, "completions/mean_length": 1532.11328125, "completions/mean_terminated_length": 1501.7115478515625, "completions/min_length": 642.0, "completions/min_terminated_length": 642.0, "epoch": 0.01216, "frac_reward_zero_std": 0.0, "grad_norm": 0.09682509303092957, "learning_rate": 1e-06, "loss": -0.0066, "num_tokens": 29622644.0, "reward": 2.3833389282226562, "reward_std": 0.14281076192855835, "rewards/cosine_scaled_reward/mean": 0.552849531173706, "rewards/cosine_scaled_reward/std": 0.28276121616363525, "rewards/repetition_penalty_reward/mean": -0.06404202431440353, "rewards/repetition_penalty_reward/std": 0.05114806815981865, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.89453125, "rewards/reward_reference/std": 0.3077581524848938, "step": 57 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3964.0, "completions/mean_length": 1569.6953125, "completions/mean_terminated_length": 1549.8031005859375, "completions/min_length": 648.0, "completions/min_terminated_length": 648.0, "epoch": 0.012373333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.10780574381351471, "learning_rate": 1e-06, "loss": 0.0088, "num_tokens": 30178094.0, "reward": 2.3729443550109863, "reward_std": 0.16647489368915558, "rewards/cosine_scaled_reward/mean": 0.5469235181808472, "rewards/cosine_scaled_reward/std": 0.29688480496406555, "rewards/repetition_penalty_reward/mean": -0.06069795787334442, "rewards/repetition_penalty_reward/std": 0.04385991394519806, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.88671875, "rewards/reward_reference/std": 0.31755712628364563, "step": 58 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3892.0, "completions/mean_length": 1562.03515625, "completions/mean_terminated_length": 1552.09814453125, "completions/min_length": 788.0, "completions/min_terminated_length": 788.0, "epoch": 0.012586666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.10283534973859787, "learning_rate": 1e-06, "loss": -0.0042, "num_tokens": 30732855.0, "reward": 2.440593719482422, "reward_std": 0.18287886679172516, "rewards/cosine_scaled_reward/mean": 0.584496796131134, "rewards/cosine_scaled_reward/std": 0.253009557723999, "rewards/repetition_penalty_reward/mean": -0.06265303492546082, "rewards/repetition_penalty_reward/std": 0.04739008843898773, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.921875, "rewards/reward_reference/std": 0.26889389753341675, "step": 59 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4010.0, "completions/mean_length": 1643.4375, "completions/mean_terminated_length": 1604.508056640625, "completions/min_length": 588.0, "completions/min_terminated_length": 588.0, "epoch": 0.0128, "frac_reward_zero_std": 0.0, "grad_norm": 0.10289309918880463, "learning_rate": 1e-06, "loss": -0.0138, "num_tokens": 31302263.0, "reward": 2.4196486473083496, "reward_std": 0.19348298013210297, "rewards/cosine_scaled_reward/mean": 0.578532338142395, "rewards/cosine_scaled_reward/std": 0.28303632140159607, "rewards/repetition_penalty_reward/mean": -0.0659150779247284, "rewards/repetition_penalty_reward/std": 0.048271458595991135, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.91015625, "rewards/reward_reference/std": 0.2865179479122162, "step": 60 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3836.0, "completions/mean_length": 1636.42578125, "completions/mean_terminated_length": 1617.05908203125, "completions/min_length": 696.0, "completions/min_terminated_length": 696.0, "epoch": 0.013013333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.0934101939201355, "learning_rate": 1e-06, "loss": -0.0078, "num_tokens": 31874212.0, "reward": 2.444908380508423, "reward_std": 0.1609368920326233, "rewards/cosine_scaled_reward/mean": 0.5966625213623047, "rewards/cosine_scaled_reward/std": 0.26409921050071716, "rewards/repetition_penalty_reward/mean": -0.07441024482250214, "rewards/repetition_penalty_reward/std": 0.061948809772729874, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.92578125, "rewards/reward_reference/std": 0.2626400291919708, "step": 61 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4018.0, "completions/mean_length": 1661.08203125, "completions/mean_terminated_length": 1622.4326171875, "completions/min_length": 790.0, "completions/min_terminated_length": 790.0, "epoch": 0.013226666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.10913221538066864, "learning_rate": 1e-06, "loss": -0.0188, "num_tokens": 32445621.0, "reward": 2.4171042442321777, "reward_std": 0.18818770349025726, "rewards/cosine_scaled_reward/mean": 0.5740669965744019, "rewards/cosine_scaled_reward/std": 0.2936588227748871, "rewards/repetition_penalty_reward/mean": -0.06790010631084442, "rewards/repetition_penalty_reward/std": 0.05626881867647171, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.9140625, "rewards/reward_reference/std": 0.28082075715065, "step": 62 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3497.0, "completions/mean_length": 1632.52734375, "completions/mean_terminated_length": 1613.1298828125, "completions/min_length": 756.0, "completions/min_terminated_length": 756.0, "epoch": 0.01344, "frac_reward_zero_std": 0.0, "grad_norm": 0.09852778911590576, "learning_rate": 1e-06, "loss": -0.0068, "num_tokens": 33019764.0, "reward": 2.3997602462768555, "reward_std": 0.15767687559127808, "rewards/cosine_scaled_reward/mean": 0.5749585628509521, "rewards/cosine_scaled_reward/std": 0.286191463470459, "rewards/repetition_penalty_reward/mean": -0.06738582998514175, "rewards/repetition_penalty_reward/std": 0.04598398134112358, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.8984375, "rewards/reward_reference/std": 0.3026638329029083, "step": 63 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4005.0, "completions/mean_length": 1688.46484375, "completions/mean_terminated_length": 1659.9171142578125, "completions/min_length": 632.0, "completions/min_terminated_length": 632.0, "epoch": 0.013653333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.11886442452669144, "learning_rate": 1e-06, "loss": -0.0041, "num_tokens": 33606475.0, "reward": 2.3002028465270996, "reward_std": 0.19646117091178894, "rewards/cosine_scaled_reward/mean": 0.5263907313346863, "rewards/cosine_scaled_reward/std": 0.3443312644958496, "rewards/repetition_penalty_reward/mean": -0.07775020599365234, "rewards/repetition_penalty_reward/std": 0.058893799781799316, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.8515625, "rewards/reward_reference/std": 0.3562295734882355, "step": 64 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 4025.0, "completions/max_terminated_length": 4025.0, "completions/mean_length": 1567.79296875, "completions/mean_terminated_length": 1567.79296875, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "epoch": 0.013866666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.09965450316667557, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 34171822.0, "reward": 2.453932285308838, "reward_std": 0.18380096554756165, "rewards/cosine_scaled_reward/mean": 0.5916061997413635, "rewards/cosine_scaled_reward/std": 0.24207937717437744, "rewards/repetition_penalty_reward/mean": -0.06345503032207489, "rewards/repetition_penalty_reward/std": 0.037380099296569824, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.92578125, "rewards/reward_reference/std": 0.2626400291919708, "step": 65 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 4061.0, "completions/max_terminated_length": 4061.0, "completions/mean_length": 1609.9296875, "completions/mean_terminated_length": 1609.9296875, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 0.01408, "frac_reward_zero_std": 0.0, "grad_norm": 0.11172179132699966, "learning_rate": 1e-06, "loss": -0.005, "num_tokens": 34739068.0, "reward": 2.3695924282073975, "reward_std": 0.19150829315185547, "rewards/cosine_scaled_reward/mean": 0.5578655004501343, "rewards/cosine_scaled_reward/std": 0.29703474044799805, "rewards/repetition_penalty_reward/mean": -0.07108558714389801, "rewards/repetition_penalty_reward/std": 0.04733828827738762, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.8828125, "rewards/reward_reference/std": 0.3222736418247223, "step": 66 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.8125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3984.0, "completions/mean_length": 1664.4140625, "completions/mean_terminated_length": 1606.0560302734375, "completions/min_length": 603.0, "completions/min_terminated_length": 603.0, "epoch": 0.014293333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.15400172770023346, "learning_rate": 1e-06, "loss": -0.0638, "num_tokens": 35304030.0, "reward": 2.4119789600372314, "reward_std": 0.2544468939304352, "rewards/cosine_scaled_reward/mean": 0.5801342725753784, "rewards/cosine_scaled_reward/std": 0.28957661986351013, "rewards/repetition_penalty_reward/mean": -0.07284298539161682, "rewards/repetition_penalty_reward/std": 0.07479114085435867, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626226335763931, "rewards/reward_reference/mean": 0.9140625, "rewards/reward_reference/std": 0.28082075715065, "step": 67 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.71875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4025.0, "completions/mean_length": 1690.984375, "completions/mean_terminated_length": 1603.352294921875, "completions/min_length": 663.0, "completions/min_terminated_length": 663.0, "epoch": 0.014506666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.08533817529678345, "learning_rate": 1e-06, "loss": -0.0186, "num_tokens": 35867958.0, "reward": 2.358893394470215, "reward_std": 0.1489824652671814, "rewards/cosine_scaled_reward/mean": 0.556054949760437, "rewards/cosine_scaled_reward/std": 0.3177521228790283, "rewards/repetition_penalty_reward/mean": -0.07684915512800217, "rewards/repetition_penalty_reward/std": 0.07336685806512833, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.8828125, "rewards/reward_reference/std": 0.3222736418247223, "step": 68 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3912.0, "completions/mean_length": 1666.86328125, "completions/mean_terminated_length": 1647.7362060546875, "completions/min_length": 687.0, "completions/min_terminated_length": 687.0, "epoch": 0.01472, "frac_reward_zero_std": 0.0, "grad_norm": 0.12211061269044876, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 36458403.0, "reward": 2.379150390625, "reward_std": 0.22196923196315765, "rewards/cosine_scaled_reward/mean": 0.5638763904571533, "rewards/cosine_scaled_reward/std": 0.30511754751205444, "rewards/repetition_penalty_reward/mean": -0.07222599536180496, "rewards/repetition_penalty_reward/std": 0.04784548282623291, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.890625, "rewards/reward_reference/std": 0.31272050738334656, "step": 69 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3987.0, "completions/mean_length": 1706.54296875, "completions/mean_terminated_length": 1697.172607421875, "completions/min_length": 777.0, "completions/min_terminated_length": 777.0, "epoch": 0.014933333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.08350487053394318, "learning_rate": 1e-06, "loss": -0.0159, "num_tokens": 37059562.0, "reward": 2.497920036315918, "reward_std": 0.12415796518325806, "rewards/cosine_scaled_reward/mean": 0.6232722997665405, "rewards/cosine_scaled_reward/std": 0.24374060332775116, "rewards/repetition_penalty_reward/mean": -0.06675856560468674, "rewards/repetition_penalty_reward/std": 0.04270366206765175, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.94140625, "rewards/reward_reference/std": 0.23532284796237946, "step": 70 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3730.0, "completions/mean_length": 1697.40625, "completions/mean_terminated_length": 1688.0001220703125, "completions/min_length": 882.0, "completions/min_terminated_length": 882.0, "epoch": 0.015146666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.09371346235275269, "learning_rate": 1e-06, "loss": -0.0116, "num_tokens": 37652426.0, "reward": 2.481433868408203, "reward_std": 0.16789013147354126, "rewards/cosine_scaled_reward/mean": 0.6213948726654053, "rewards/cosine_scaled_reward/std": 0.2410319447517395, "rewards/repetition_penalty_reward/mean": -0.07433594018220901, "rewards/repetition_penalty_reward/std": 0.05567564442753792, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.9375, "rewards/reward_reference/std": 0.24253563582897186, "step": 71 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3244.0, "completions/mean_length": 1703.26953125, "completions/mean_terminated_length": 1674.8973388671875, "completions/min_length": 856.0, "completions/min_terminated_length": 856.0, "epoch": 0.01536, "frac_reward_zero_std": 0.0, "grad_norm": 0.08914853632450104, "learning_rate": 1e-06, "loss": -0.0221, "num_tokens": 38244783.0, "reward": 2.4869542121887207, "reward_std": 0.12982487678527832, "rewards/cosine_scaled_reward/mean": 0.6192671060562134, "rewards/cosine_scaled_reward/std": 0.24536055326461792, "rewards/repetition_penalty_reward/mean": -0.06746931374073029, "rewards/repetition_penalty_reward/std": 0.053976256400346756, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.94140625, "rewards/reward_reference/std": 0.23532284796237946, "step": 72 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3789.0, "completions/mean_length": 1795.80859375, "completions/mean_terminated_length": 1768.53369140625, "completions/min_length": 762.0, "completions/min_terminated_length": 762.0, "epoch": 0.015573333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.13626892864704132, "learning_rate": 1e-06, "loss": -0.0101, "num_tokens": 38857138.0, "reward": 2.3470053672790527, "reward_std": 0.2492281198501587, "rewards/cosine_scaled_reward/mean": 0.5566455125808716, "rewards/cosine_scaled_reward/std": 0.3459410071372986, "rewards/repetition_penalty_reward/mean": -0.07760874181985855, "rewards/repetition_penalty_reward/std": 0.054391711950302124, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.87109375, "rewards/reward_reference/std": 0.33575257658958435, "step": 73 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3685.0, "completions/mean_length": 1721.2734375, "completions/mean_terminated_length": 1711.9609375, "completions/min_length": 762.0, "completions/min_terminated_length": 762.0, "epoch": 0.015786666666666668, "frac_reward_zero_std": 0.0, "grad_norm": 0.06626079231500626, "learning_rate": 1e-06, "loss": 0.0066, "num_tokens": 39464616.0, "reward": 2.4405510425567627, "reward_std": 0.11783230304718018, "rewards/cosine_scaled_reward/mean": 0.5952588319778442, "rewards/cosine_scaled_reward/std": 0.2840306758880615, "rewards/repetition_penalty_reward/mean": -0.06486397236585617, "rewards/repetition_penalty_reward/std": 0.04353988170623779, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.91015625, "rewards/reward_reference/std": 0.2865179479122162, "step": 74 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3728.0, "completions/mean_length": 1689.69140625, "completions/mean_terminated_length": 1661.158203125, "completions/min_length": 830.0, "completions/min_terminated_length": 830.0, "epoch": 0.016, "frac_reward_zero_std": 0.0, "grad_norm": 0.1122131422162056, "learning_rate": 1e-06, "loss": -0.0067, "num_tokens": 40046925.0, "reward": 2.3153374195098877, "reward_std": 0.18534180521965027, "rewards/cosine_scaled_reward/mean": 0.5333756804466248, "rewards/cosine_scaled_reward/std": 0.3348003029823303, "rewards/repetition_penalty_reward/mean": -0.06647560000419617, "rewards/repetition_penalty_reward/std": 0.05163053795695305, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.8515625, "rewards/reward_reference/std": 0.3562295734882355, "step": 75 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.78125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 1863.859375, "completions/mean_terminated_length": 1801.1083984375, "completions/min_length": 896.0, "completions/min_terminated_length": 896.0, "epoch": 0.016213333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.0832432359457016, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 40660097.0, "reward": 2.320946216583252, "reward_std": 0.14731647074222565, "rewards/cosine_scaled_reward/mean": 0.5464308857917786, "rewards/cosine_scaled_reward/std": 0.36857691407203674, "rewards/repetition_penalty_reward/mean": -0.07860984653234482, "rewards/repetition_penalty_reward/std": 0.059814877808094025, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.859375, "rewards/reward_reference/std": 0.3483152687549591, "step": 76 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 3703.0, "completions/max_terminated_length": 3703.0, "completions/mean_length": 1677.28515625, "completions/mean_terminated_length": 1677.28515625, "completions/min_length": 679.0, "completions/min_terminated_length": 679.0, "epoch": 0.016426666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.09381895512342453, "learning_rate": 1e-06, "loss": 0.0024, "num_tokens": 41260350.0, "reward": 2.333367347717285, "reward_std": 0.16393491625785828, "rewards/cosine_scaled_reward/mean": 0.5406162738800049, "rewards/cosine_scaled_reward/std": 0.327106237411499, "rewards/repetition_penalty_reward/mean": -0.06271764636039734, "rewards/repetition_penalty_reward/std": 0.03436657041311264, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.85546875, "rewards/reward_reference/std": 0.35231640934944153, "step": 77 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3929.0, "completions/mean_length": 1841.54296875, "completions/mean_terminated_length": 1796.633544921875, "completions/min_length": 1048.0, "completions/min_terminated_length": 1048.0, "epoch": 0.01664, "frac_reward_zero_std": 0.0, "grad_norm": 0.1164449080824852, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 41870129.0, "reward": 2.2937211990356445, "reward_std": 0.2213471531867981, "rewards/cosine_scaled_reward/mean": 0.5263729691505432, "rewards/cosine_scaled_reward/std": 0.37787869572639465, "rewards/repetition_penalty_reward/mean": -0.07718320190906525, "rewards/repetition_penalty_reward/std": 0.052163559943437576, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.84765625, "rewards/reward_reference/std": 0.3600577116012573, "step": 78 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.65625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4027.0, "completions/mean_length": 1882.796875, "completions/mean_terminated_length": 1783.428466796875, "completions/min_length": 822.0, "completions/min_terminated_length": 822.0, "epoch": 0.016853333333333335, "frac_reward_zero_std": 0.0, "grad_norm": 0.11616773903369904, "learning_rate": 1e-06, "loss": -0.0399, "num_tokens": 42464209.0, "reward": 2.329927444458008, "reward_std": 0.2060365378856659, "rewards/cosine_scaled_reward/mean": 0.5598228573799133, "rewards/cosine_scaled_reward/std": 0.36188092827796936, "rewards/repetition_penalty_reward/mean": -0.08770774304866791, "rewards/repetition_penalty_reward/std": 0.07510842382907867, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626225590705872, "rewards/reward_reference/mean": 0.8671875, "rewards/reward_reference/std": 0.3400367796421051, "step": 79 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 3895.0, "completions/max_terminated_length": 3895.0, "completions/mean_length": 1749.42578125, "completions/mean_terminated_length": 1749.42578125, "completions/min_length": 1026.0, "completions/min_terminated_length": 1026.0, "epoch": 0.017066666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.1016409620642662, "learning_rate": 1e-06, "loss": 0.0087, "num_tokens": 43071518.0, "reward": 2.4230754375457764, "reward_std": 0.15485021471977234, "rewards/cosine_scaled_reward/mean": 0.5909368395805359, "rewards/cosine_scaled_reward/std": 0.29618415236473083, "rewards/repetition_penalty_reward/mean": -0.07020512223243713, "rewards/repetition_penalty_reward/std": 0.041945360600948334, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.90234375, "rewards/reward_reference/std": 0.29743078351020813, "step": 80 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3290.0, "completions/mean_length": 1692.96484375, "completions/mean_terminated_length": 1664.470458984375, "completions/min_length": 775.0, "completions/min_terminated_length": 775.0, "epoch": 0.01728, "frac_reward_zero_std": 0.0, "grad_norm": 0.08751388639211655, "learning_rate": 1e-06, "loss": 0.0026, "num_tokens": 43656253.0, "reward": 2.5253281593322754, "reward_std": 0.12936024367809296, "rewards/cosine_scaled_reward/mean": 0.6400842070579529, "rewards/cosine_scaled_reward/std": 0.20249401032924652, "rewards/repetition_penalty_reward/mean": -0.0717872753739357, "rewards/repetition_penalty_reward/std": 0.05126966908574104, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.95703125, "rewards/reward_reference/std": 0.20318391919136047, "step": 81 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3955.0, "completions/mean_length": 1845.78515625, "completions/mean_terminated_length": 1800.960205078125, "completions/min_length": 908.0, "completions/min_terminated_length": 908.0, "epoch": 0.017493333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.12477394193410873, "learning_rate": 1e-06, "loss": -0.0299, "num_tokens": 44268842.0, "reward": 2.382167339324951, "reward_std": 0.20765957236289978, "rewards/cosine_scaled_reward/mean": 0.5757420063018799, "rewards/cosine_scaled_reward/std": 0.3393123149871826, "rewards/repetition_penalty_reward/mean": -0.07794953882694244, "rewards/repetition_penalty_reward/std": 0.05676357075572014, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.890625, "rewards/reward_reference/std": 0.31272050738334656, "step": 82 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3896.0, "completions/mean_length": 1813.8359375, "completions/mean_terminated_length": 1777.6112060546875, "completions/min_length": 1021.0, "completions/min_terminated_length": 1021.0, "epoch": 0.017706666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.11310097575187683, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 44878580.0, "reward": 2.4027891159057617, "reward_std": 0.20823755860328674, "rewards/cosine_scaled_reward/mean": 0.5920209884643555, "rewards/cosine_scaled_reward/std": 0.31304964423179626, "rewards/repetition_penalty_reward/mean": -0.08063797652721405, "rewards/repetition_penalty_reward/std": 0.05689622461795807, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.89453125, "rewards/reward_reference/std": 0.3077581524848938, "step": 83 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4028.0, "completions/mean_length": 1746.9296875, "completions/mean_terminated_length": 1737.7177734375, "completions/min_length": 877.0, "completions/min_terminated_length": 877.0, "epoch": 0.01792, "frac_reward_zero_std": 0.0, "grad_norm": 0.0818057730793953, "learning_rate": 1e-06, "loss": -0.0061, "num_tokens": 45484070.0, "reward": 2.4049665927886963, "reward_std": 0.12297569960355759, "rewards/cosine_scaled_reward/mean": 0.5779396891593933, "rewards/cosine_scaled_reward/std": 0.3099997639656067, "rewards/repetition_penalty_reward/mean": -0.0682855099439621, "rewards/repetition_penalty_reward/std": 0.04200433939695358, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.8984375, "rewards/reward_reference/std": 0.3026638329029083, "step": 84 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3760.0, "completions/mean_length": 1762.9609375, "completions/mean_terminated_length": 1735.2965087890625, "completions/min_length": 901.0, "completions/min_terminated_length": 901.0, "epoch": 0.018133333333333335, "frac_reward_zero_std": 0.0, "grad_norm": 0.07859272509813309, "learning_rate": 1e-06, "loss": -0.0049, "num_tokens": 46083580.0, "reward": 2.47398042678833, "reward_std": 0.1363099366426468, "rewards/cosine_scaled_reward/mean": 0.6111050248146057, "rewards/cosine_scaled_reward/std": 0.27342745661735535, "rewards/repetition_penalty_reward/mean": -0.06681206077337265, "rewards/repetition_penalty_reward/std": 0.050561416894197464, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9296875, "rewards/reward_reference/std": 0.2561737895011902, "step": 85 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3901.0, "completions/mean_length": 1759.5546875, "completions/mean_terminated_length": 1722.4683837890625, "completions/min_length": 825.0, "completions/min_terminated_length": 825.0, "epoch": 0.018346666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.07375761866569519, "learning_rate": 1e-06, "loss": -0.0134, "num_tokens": 46677474.0, "reward": 2.5168585777282715, "reward_std": 0.10227377712726593, "rewards/cosine_scaled_reward/mean": 0.6395881175994873, "rewards/cosine_scaled_reward/std": 0.22985778748989105, "rewards/repetition_penalty_reward/mean": -0.07194818556308746, "rewards/repetition_penalty_reward/std": 0.042965106666088104, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.94921875, "rewards/reward_reference/std": 0.21998079121112823, "step": 86 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3865.0, "completions/mean_length": 1764.23046875, "completions/mean_terminated_length": 1745.8701171875, "completions/min_length": 986.0, "completions/min_terminated_length": 986.0, "epoch": 0.01856, "frac_reward_zero_std": 0.0, "grad_norm": 0.09555403888225555, "learning_rate": 1e-06, "loss": -0.001, "num_tokens": 47287537.0, "reward": 2.4244163036346436, "reward_std": 0.1566460132598877, "rewards/cosine_scaled_reward/mean": 0.5867924690246582, "rewards/cosine_scaled_reward/std": 0.30464106798171997, "rewards/repetition_penalty_reward/mean": -0.07253240048885345, "rewards/repetition_penalty_reward/std": 0.04896243289113045, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.91015625, "rewards/reward_reference/std": 0.2865179479122162, "step": 87 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3812.0, "completions/mean_length": 1809.65625, "completions/mean_terminated_length": 1800.6903076171875, "completions/min_length": 1076.0, "completions/min_terminated_length": 1076.0, "epoch": 0.018773333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.08929698914289474, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 47910045.0, "reward": 2.4683656692504883, "reward_std": 0.13447409868240356, "rewards/cosine_scaled_reward/mean": 0.6193373203277588, "rewards/cosine_scaled_reward/std": 0.2785070240497589, "rewards/repetition_penalty_reward/mean": -0.07284662127494812, "rewards/repetition_penalty_reward/std": 0.04199657589197159, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.921875, "rewards/reward_reference/std": 0.26889389753341675, "step": 88 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4006.0, "completions/mean_length": 1735.84375, "completions/mean_terminated_length": 1717.2598876953125, "completions/min_length": 712.0, "completions/min_terminated_length": 712.0, "epoch": 0.018986666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.05542474985122681, "learning_rate": 1e-06, "loss": -0.0044, "num_tokens": 48511993.0, "reward": 2.4176106452941895, "reward_std": 0.06906415522098541, "rewards/cosine_scaled_reward/mean": 0.5868261456489563, "rewards/cosine_scaled_reward/std": 0.296097069978714, "rewards/repetition_penalty_reward/mean": -0.06765298545360565, "rewards/repetition_penalty_reward/std": 0.03946821764111519, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.8984375, "rewards/reward_reference/std": 0.3026638329029083, "step": 89 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.78125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3982.0, "completions/mean_length": 1745.66796875, "completions/mean_terminated_length": 1679.5943603515625, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 0.0192, "frac_reward_zero_std": 0.0, "grad_norm": 0.09133629500865936, "learning_rate": 1e-06, "loss": -0.011, "num_tokens": 49090192.0, "reward": 2.322629451751709, "reward_std": 0.1549871861934662, "rewards/cosine_scaled_reward/mean": 0.5519901514053345, "rewards/cosine_scaled_reward/std": 0.33279502391815186, "rewards/repetition_penalty_reward/mean": -0.08170440793037415, "rewards/repetition_penalty_reward/std": 0.06908124685287476, "rewards/reward_format/mean": 0.9812500476837158, "rewards/reward_format/std": 0.12126781791448593, "rewards/reward_reference/mean": 0.87109375, "rewards/reward_reference/std": 0.33575257658958435, "step": 90 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.78125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3837.0, "completions/mean_length": 1705.6953125, "completions/mean_terminated_length": 1638.4979248046875, "completions/min_length": 984.0, "completions/min_terminated_length": 984.0, "epoch": 0.019413333333333335, "frac_reward_zero_std": 0.0, "grad_norm": 0.08402293175458908, "learning_rate": 1e-06, "loss": -0.0185, "num_tokens": 49654050.0, "reward": 2.4880495071411133, "reward_std": 0.12586981058120728, "rewards/cosine_scaled_reward/mean": 0.6164243221282959, "rewards/cosine_scaled_reward/std": 0.24689579010009766, "rewards/repetition_penalty_reward/mean": -0.06743744760751724, "rewards/repetition_penalty_reward/std": 0.04767215624451637, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.9453125, "rewards/reward_reference/std": 0.22781464457511902, "step": 91 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 3693.0, "completions/max_terminated_length": 3693.0, "completions/mean_length": 1792.48828125, "completions/mean_terminated_length": 1792.48828125, "completions/min_length": 1058.0, "completions/min_terminated_length": 1058.0, "epoch": 0.019626666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.08743888139724731, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 50275411.0, "reward": 2.3785476684570312, "reward_std": 0.1486774981021881, "rewards/cosine_scaled_reward/mean": 0.5760810971260071, "rewards/cosine_scaled_reward/std": 0.3226264417171478, "rewards/repetition_penalty_reward/mean": -0.07643987238407135, "rewards/repetition_penalty_reward/std": 0.04226500540971756, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.87890625, "rewards/reward_reference/std": 0.3268752694129944, "step": 92 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4028.0, "completions/mean_length": 1894.8125, "completions/mean_terminated_length": 1786.5572509765625, "completions/min_length": 1093.0, "completions/min_terminated_length": 1093.0, "epoch": 0.01984, "frac_reward_zero_std": 0.0, "grad_norm": 0.13492800295352936, "learning_rate": 1e-06, "loss": -0.0355, "num_tokens": 50865851.0, "reward": 2.384434700012207, "reward_std": 0.23413318395614624, "rewards/cosine_scaled_reward/mean": 0.581243634223938, "rewards/cosine_scaled_reward/std": 0.3413110673427582, "rewards/repetition_penalty_reward/mean": -0.0858713760972023, "rewards/repetition_penalty_reward/std": 0.0653069019317627, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626225590705872, "rewards/reward_reference/mean": 0.8984375, "rewards/reward_reference/std": 0.3026638329029083, "step": 93 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3629.0, "completions/mean_length": 1738.40625, "completions/mean_terminated_length": 1719.842529296875, "completions/min_length": 885.0, "completions/min_terminated_length": 885.0, "epoch": 0.020053333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.06449340283870697, "learning_rate": 1e-06, "loss": -0.015, "num_tokens": 51469135.0, "reward": 2.540250778198242, "reward_std": 0.10148729383945465, "rewards/cosine_scaled_reward/mean": 0.6441509127616882, "rewards/cosine_scaled_reward/std": 0.21270470321178436, "rewards/repetition_penalty_reward/mean": -0.06483766436576843, "rewards/repetition_penalty_reward/std": 0.03865836560726166, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9609375, "rewards/reward_reference/std": 0.19412322342395782, "step": 94 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3683.0, "completions/mean_length": 1811.0859375, "completions/mean_terminated_length": 1765.56982421875, "completions/min_length": 1000.0, "completions/min_terminated_length": 1000.0, "epoch": 0.020266666666666665, "frac_reward_zero_std": 0.0, "grad_norm": 0.10099369287490845, "learning_rate": 1e-06, "loss": -0.0201, "num_tokens": 52071265.0, "reward": 2.4201340675354004, "reward_std": 0.1737205535173416, "rewards/cosine_scaled_reward/mean": 0.6000106334686279, "rewards/cosine_scaled_reward/std": 0.3004744052886963, "rewards/repetition_penalty_reward/mean": -0.0798763632774353, "rewards/repetition_penalty_reward/std": 0.06247806176543236, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.90625, "rewards/reward_reference/std": 0.2920515835285187, "step": 95 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3961.0, "completions/mean_length": 1730.5625, "completions/mean_terminated_length": 1721.286376953125, "completions/min_length": 1064.0, "completions/min_terminated_length": 1064.0, "epoch": 0.02048, "frac_reward_zero_std": 0.0, "grad_norm": 0.07633478939533234, "learning_rate": 1e-06, "loss": 0.0119, "num_tokens": 52681929.0, "reward": 2.4455995559692383, "reward_std": 0.1185745969414711, "rewards/cosine_scaled_reward/mean": 0.6002703905105591, "rewards/cosine_scaled_reward/std": 0.2775324583053589, "rewards/repetition_penalty_reward/mean": -0.06873318552970886, "rewards/repetition_penalty_reward/std": 0.042261138558387756, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9140625, "rewards/reward_reference/std": 0.28082075715065, "step": 96 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3986.0, "completions/mean_length": 1756.83203125, "completions/mean_terminated_length": 1747.658935546875, "completions/min_length": 976.0, "completions/min_terminated_length": 976.0, "epoch": 0.020693333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.0499541349709034, "learning_rate": 1e-06, "loss": -0.0041, "num_tokens": 53282854.0, "reward": 2.4268693923950195, "reward_std": 0.07386516034603119, "rewards/cosine_scaled_reward/mean": 0.594407320022583, "rewards/cosine_scaled_reward/std": 0.29383519291877747, "rewards/repetition_penalty_reward/mean": -0.07378794252872467, "rewards/repetition_penalty_reward/std": 0.048145439475774765, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.90625, "rewards/reward_reference/std": 0.2920515835285187, "step": 97 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4080.0, "completions/mean_length": 1892.8359375, "completions/mean_terminated_length": 1848.9482421875, "completions/min_length": 993.0, "completions/min_terminated_length": 993.0, "epoch": 0.020906666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.11263293027877808, "learning_rate": 1e-06, "loss": 0.0019, "num_tokens": 53905152.0, "reward": 2.305772542953491, "reward_std": 0.18838255107402802, "rewards/cosine_scaled_reward/mean": 0.5442582368850708, "rewards/cosine_scaled_reward/std": 0.3724890649318695, "rewards/repetition_penalty_reward/mean": -0.07129818201065063, "rewards/repetition_penalty_reward/std": 0.04757782071828842, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.8359375, "rewards/reward_reference/std": 0.3710577189922333, "step": 98 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4051.0, "completions/mean_length": 1890.90625, "completions/mean_terminated_length": 1846.9801025390625, "completions/min_length": 971.0, "completions/min_terminated_length": 971.0, "epoch": 0.02112, "frac_reward_zero_std": 0.0, "grad_norm": 0.11732571572065353, "learning_rate": 1e-06, "loss": -0.0177, "num_tokens": 54530496.0, "reward": 2.391127109527588, "reward_std": 0.19609452784061432, "rewards/cosine_scaled_reward/mean": 0.5890634059906006, "rewards/cosine_scaled_reward/std": 0.3334295153617859, "rewards/repetition_penalty_reward/mean": -0.07684235274791718, "rewards/repetition_penalty_reward/std": 0.04716531187295914, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.87890625, "rewards/reward_reference/std": 0.3268752694129944, "step": 99 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4045.0, "completions/mean_length": 1772.96484375, "completions/mean_terminated_length": 1745.4190673828125, "completions/min_length": 1082.0, "completions/min_terminated_length": 1082.0, "epoch": 0.021333333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.08264286071062088, "learning_rate": 1e-06, "loss": -0.0155, "num_tokens": 55142359.0, "reward": 2.398106575012207, "reward_std": 0.10862451791763306, "rewards/cosine_scaled_reward/mean": 0.5802006721496582, "rewards/cosine_scaled_reward/std": 0.3117934763431549, "rewards/repetition_penalty_reward/mean": -0.06959399580955505, "rewards/repetition_penalty_reward/std": 0.04771846532821655, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.890625, "rewards/reward_reference/std": 0.31272050738334656, "step": 100 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.8125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4033.0, "completions/mean_length": 1838.60546875, "completions/mean_terminated_length": 1784.4281005859375, "completions/min_length": 1020.0, "completions/min_terminated_length": 1020.0, "epoch": 0.021546666666666665, "frac_reward_zero_std": 0.0, "grad_norm": 0.10220247507095337, "learning_rate": 1e-06, "loss": -0.0234, "num_tokens": 55750342.0, "reward": 2.4891393184661865, "reward_std": 0.1632259488105774, "rewards/cosine_scaled_reward/mean": 0.628940224647522, "rewards/cosine_scaled_reward/std": 0.27056002616882324, "rewards/repetition_penalty_reward/mean": -0.07339469343423843, "rewards/repetition_penalty_reward/std": 0.051973793655633926, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 101 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3805.0, "completions/mean_length": 1867.9296875, "completions/mean_terminated_length": 1859.1922607421875, "completions/min_length": 1127.0, "completions/min_terminated_length": 1127.0, "epoch": 0.02176, "frac_reward_zero_std": 0.0, "grad_norm": 0.11397871375083923, "learning_rate": 1e-06, "loss": 0.0026, "num_tokens": 56386668.0, "reward": 2.396754264831543, "reward_std": 0.1740192025899887, "rewards/cosine_scaled_reward/mean": 0.5868169069290161, "rewards/cosine_scaled_reward/std": 0.33226945996284485, "rewards/repetition_penalty_reward/mean": -0.06975027173757553, "rewards/repetition_penalty_reward/std": 0.057229943573474884, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.8828125, "rewards/reward_reference/std": 0.3222736418247223, "step": 102 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 4009.0, "completions/max_terminated_length": 4009.0, "completions/mean_length": 1682.66015625, "completions/mean_terminated_length": 1682.66015625, "completions/min_length": 997.0, "completions/min_terminated_length": 997.0, "epoch": 0.021973333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.056475766003131866, "learning_rate": 1e-06, "loss": 0.0015, "num_tokens": 56986793.0, "reward": 2.5267632007598877, "reward_std": 0.06429925560951233, "rewards/cosine_scaled_reward/mean": 0.6376233100891113, "rewards/cosine_scaled_reward/std": 0.20191699266433716, "rewards/repetition_penalty_reward/mean": -0.0639849454164505, "rewards/repetition_penalty_reward/std": 0.03479979932308197, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.953125, "rewards/reward_reference/std": 0.21178513765335083, "step": 103 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3586.0, "completions/mean_length": 1783.94921875, "completions/mean_terminated_length": 1765.744140625, "completions/min_length": 980.0, "completions/min_terminated_length": 980.0, "epoch": 0.022186666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.11576827615499496, "learning_rate": 1e-06, "loss": 0.003, "num_tokens": 57592760.0, "reward": 2.4529004096984863, "reward_std": 0.18349312245845795, "rewards/cosine_scaled_reward/mean": 0.6015920639038086, "rewards/cosine_scaled_reward/std": 0.2936531603336334, "rewards/repetition_penalty_reward/mean": -0.06275419145822525, "rewards/repetition_penalty_reward/std": 0.03655506670475006, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9140625, "rewards/reward_reference/std": 0.28082075715065, "step": 104 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3947.0, "completions/mean_length": 1912.76171875, "completions/mean_terminated_length": 1878.1072998046875, "completions/min_length": 943.0, "completions/min_terminated_length": 943.0, "epoch": 0.0224, "frac_reward_zero_std": 0.0, "grad_norm": 0.10496613383293152, "learning_rate": 1e-06, "loss": -0.0234, "num_tokens": 58232699.0, "reward": 2.2054691314697266, "reward_std": 0.20665660500526428, "rewards/cosine_scaled_reward/mean": 0.48843634128570557, "rewards/cosine_scaled_reward/std": 0.4195135831832886, "rewards/repetition_penalty_reward/mean": -0.06968589127063751, "rewards/repetition_penalty_reward/std": 0.044393159449100494, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.79296875, "rewards/reward_reference/std": 0.40597182512283325, "step": 105 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3891.0, "completions/mean_length": 1855.42578125, "completions/mean_terminated_length": 1810.7928466796875, "completions/min_length": 947.0, "completions/min_terminated_length": 947.0, "epoch": 0.022613333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.07337377220392227, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 58844188.0, "reward": 2.4211018085479736, "reward_std": 0.12404131889343262, "rewards/cosine_scaled_reward/mean": 0.5939671993255615, "rewards/cosine_scaled_reward/std": 0.32044410705566406, "rewards/repetition_penalty_reward/mean": -0.06817790120840073, "rewards/repetition_penalty_reward/std": 0.045555904507637024, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.8984375, "rewards/reward_reference/std": 0.3026638329029083, "step": 106 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4044.0, "completions/mean_length": 1896.109375, "completions/mean_terminated_length": 1861.1905517578125, "completions/min_length": 1049.0, "completions/min_terminated_length": 1049.0, "epoch": 0.022826666666666665, "frac_reward_zero_std": 0.0, "grad_norm": 0.10826458781957626, "learning_rate": 1e-06, "loss": 0.0048, "num_tokens": 59486044.0, "reward": 2.4502131938934326, "reward_std": 0.17930112779140472, "rewards/cosine_scaled_reward/mean": 0.6160438656806946, "rewards/cosine_scaled_reward/std": 0.30508628487586975, "rewards/repetition_penalty_reward/mean": -0.0728619247674942, "rewards/repetition_penalty_reward/std": 0.05683686584234238, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.91015625, "rewards/reward_reference/std": 0.2865179479122162, "step": 107 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3477.0, "completions/mean_length": 1825.83984375, "completions/mean_terminated_length": 1798.9210205078125, "completions/min_length": 1036.0, "completions/min_terminated_length": 1036.0, "epoch": 0.02304, "frac_reward_zero_std": 0.0, "grad_norm": 0.09361043572425842, "learning_rate": 1e-06, "loss": -0.0163, "num_tokens": 60105751.0, "reward": 2.4900074005126953, "reward_std": 0.12909862399101257, "rewards/cosine_scaled_reward/mean": 0.6301261186599731, "rewards/cosine_scaled_reward/std": 0.26686570048332214, "rewards/repetition_penalty_reward/mean": -0.0705873891711235, "rewards/repetition_penalty_reward/std": 0.043489061295986176, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 108 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3810.0, "completions/mean_length": 1801.35546875, "completions/mean_terminated_length": 1792.35693359375, "completions/min_length": 972.0, "completions/min_terminated_length": 972.0, "epoch": 0.023253333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.09894856065511703, "learning_rate": 1e-06, "loss": -0.0122, "num_tokens": 60725642.0, "reward": 2.4845056533813477, "reward_std": 0.14575153589248657, "rewards/cosine_scaled_reward/mean": 0.6300021409988403, "rewards/cosine_scaled_reward/std": 0.2584840953350067, "rewards/repetition_penalty_reward/mean": -0.06815264374017715, "rewards/repetition_penalty_reward/std": 0.049226224422454834, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.92578125, "rewards/reward_reference/std": 0.2626400291919708, "step": 109 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3955.0, "completions/mean_length": 1938.015625, "completions/mean_terminated_length": 1895.0279541015625, "completions/min_length": 1074.0, "completions/min_terminated_length": 1074.0, "epoch": 0.023466666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.08501613140106201, "learning_rate": 1e-06, "loss": 0.0038, "num_tokens": 61361878.0, "reward": 2.5716586112976074, "reward_std": 0.13666792213916779, "rewards/cosine_scaled_reward/mean": 0.6835469007492065, "rewards/cosine_scaled_reward/std": 0.21558046340942383, "rewards/repetition_penalty_reward/mean": -0.07048188149929047, "rewards/repetition_penalty_reward/std": 0.040720950812101364, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718994140625, "rewards/reward_reference/mean": 0.96484375, "rewards/reward_reference/std": 0.18453538417816162, "step": 110 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4026.0, "completions/mean_length": 1827.046875, "completions/mean_terminated_length": 1791.0318603515625, "completions/min_length": 955.0, "completions/min_terminated_length": 955.0, "epoch": 0.02368, "frac_reward_zero_std": 0.0, "grad_norm": 0.08992420881986618, "learning_rate": 1e-06, "loss": -0.0133, "num_tokens": 61965338.0, "reward": 2.503587007522583, "reward_std": 0.15698347985744476, "rewards/cosine_scaled_reward/mean": 0.635814905166626, "rewards/cosine_scaled_reward/std": 0.2587208151817322, "rewards/repetition_penalty_reward/mean": -0.06582161784172058, "rewards/repetition_penalty_reward/std": 0.04158446192741394, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 111 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4043.0, "completions/mean_length": 1858.609375, "completions/mean_terminated_length": 1832.0791015625, "completions/min_length": 857.0, "completions/min_terminated_length": 857.0, "epoch": 0.023893333333333332, "frac_reward_zero_std": 0.0, "grad_norm": 0.09149877727031708, "learning_rate": 1e-06, "loss": -0.0306, "num_tokens": 62585210.0, "reward": 2.4405405521392822, "reward_std": 0.13705675303936005, "rewards/cosine_scaled_reward/mean": 0.6155904531478882, "rewards/cosine_scaled_reward/std": 0.295393705368042, "rewards/repetition_penalty_reward/mean": -0.06801863014698029, "rewards/repetition_penalty_reward/std": 0.04448341205716133, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626226335763931, "rewards/reward_reference/mean": 0.90234375, "rewards/reward_reference/std": 0.29743078351020813, "step": 112 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3933.0, "completions/mean_length": 1910.171875, "completions/mean_terminated_length": 1866.6295166015625, "completions/min_length": 1114.0, "completions/min_terminated_length": 1114.0, "epoch": 0.024106666666666665, "frac_reward_zero_std": 0.0, "grad_norm": 0.08117534965276718, "learning_rate": 1e-06, "loss": 0.0109, "num_tokens": 63218522.0, "reward": 2.3720033168792725, "reward_std": 0.11090590059757233, "rewards/cosine_scaled_reward/mean": 0.5723944902420044, "rewards/cosine_scaled_reward/std": 0.35445037484169006, "rewards/repetition_penalty_reward/mean": -0.07539094239473343, "rewards/repetition_penalty_reward/std": 0.0551920086145401, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.875, "rewards/reward_reference/std": 0.33136674761772156, "step": 113 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 3800.0, "completions/max_terminated_length": 3800.0, "completions/mean_length": 1847.49609375, "completions/mean_terminated_length": 1847.49609375, "completions/min_length": 1090.0, "completions/min_terminated_length": 1090.0, "epoch": 0.02432, "frac_reward_zero_std": 0.0, "grad_norm": 0.09835077077150345, "learning_rate": 1e-06, "loss": -0.0039, "num_tokens": 63849337.0, "reward": 2.4445981979370117, "reward_std": 0.15487414598464966, "rewards/cosine_scaled_reward/mean": 0.6096133589744568, "rewards/cosine_scaled_reward/std": 0.2992246448993683, "rewards/repetition_penalty_reward/mean": -0.06735902279615402, "rewards/repetition_penalty_reward/std": 0.039419885724782944, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.90234375, "rewards/reward_reference/std": 0.29743078351020813, "step": 114 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4093.0, "completions/mean_length": 1888.6796875, "completions/mean_terminated_length": 1871.2991943359375, "completions/min_length": 995.0, "completions/min_terminated_length": 995.0, "epoch": 0.024533333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.09056826680898666, "learning_rate": 1e-06, "loss": 0.0046, "num_tokens": 64491495.0, "reward": 2.4354710578918457, "reward_std": 0.1214878261089325, "rewards/cosine_scaled_reward/mean": 0.6106761693954468, "rewards/cosine_scaled_reward/std": 0.309811532497406, "rewards/repetition_penalty_reward/mean": -0.07051754742860794, "rewards/repetition_penalty_reward/std": 0.049914922565221786, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.8984375, "rewards/reward_reference/std": 0.3026638329029083, "step": 115 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.8125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4029.0, "completions/mean_length": 2027.52734375, "completions/mean_terminated_length": 1977.884033203125, "completions/min_length": 973.0, "completions/min_terminated_length": 973.0, "epoch": 0.024746666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.09796389192342758, "learning_rate": 1e-06, "loss": -0.0353, "num_tokens": 65146530.0, "reward": 2.429624080657959, "reward_std": 0.19394034147262573, "rewards/cosine_scaled_reward/mean": 0.6130001544952393, "rewards/cosine_scaled_reward/std": 0.34257784485816956, "rewards/repetition_penalty_reward/mean": -0.07400095462799072, "rewards/repetition_penalty_reward/std": 0.041537486016750336, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.890625, "rewards/reward_reference/std": 0.31272050738334656, "step": 116 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3967.0, "completions/mean_length": 1915.296875, "completions/mean_terminated_length": 1871.856689453125, "completions/min_length": 1020.0, "completions/min_terminated_length": 1020.0, "epoch": 0.02496, "frac_reward_zero_std": 0.0, "grad_norm": 0.09959474205970764, "learning_rate": 1e-06, "loss": -0.0367, "num_tokens": 65778562.0, "reward": 2.3682241439819336, "reward_std": 0.15466037392616272, "rewards/cosine_scaled_reward/mean": 0.5779094696044922, "rewards/cosine_scaled_reward/std": 0.35092681646347046, "rewards/repetition_penalty_reward/mean": -0.06984167546033859, "rewards/repetition_penalty_reward/std": 0.05132605880498886, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.86328125, "rewards/reward_reference/std": 0.34422317147254944, "step": 117 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.78125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3970.0, "completions/mean_length": 1996.67578125, "completions/mean_terminated_length": 1937.6585693359375, "completions/min_length": 1097.0, "completions/min_terminated_length": 1097.0, "epoch": 0.025173333333333332, "frac_reward_zero_std": 0.0, "grad_norm": 0.10874690860509872, "learning_rate": 1e-06, "loss": -0.0386, "num_tokens": 66428271.0, "reward": 2.454425573348999, "reward_std": 0.18959057331085205, "rewards/cosine_scaled_reward/mean": 0.6289983987808228, "rewards/cosine_scaled_reward/std": 0.317272812128067, "rewards/repetition_penalty_reward/mean": -0.0847291648387909, "rewards/repetition_penalty_reward/std": 0.05817628279328346, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.91015625, "rewards/reward_reference/std": 0.2865179479122162, "step": 118 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3950.0, "completions/mean_length": 1973.50390625, "completions/mean_terminated_length": 1948.3360595703125, "completions/min_length": 1139.0, "completions/min_terminated_length": 1139.0, "epoch": 0.025386666666666665, "frac_reward_zero_std": 0.0, "grad_norm": 0.0805424302816391, "learning_rate": 1e-06, "loss": 0.0047, "num_tokens": 67085776.0, "reward": 2.535606861114502, "reward_std": 0.12788008153438568, "rewards/cosine_scaled_reward/mean": 0.6682774424552917, "rewards/cosine_scaled_reward/std": 0.2557551860809326, "rewards/repetition_penalty_reward/mean": -0.07095189392566681, "rewards/repetition_penalty_reward/std": 0.04125396907329559, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.94140625, "rewards/reward_reference/std": 0.23532284796237946, "step": 119 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3879.0, "completions/mean_length": 1930.96484375, "completions/mean_terminated_length": 1905.2926025390625, "completions/min_length": 1056.0, "completions/min_terminated_length": 1056.0, "epoch": 0.0256, "frac_reward_zero_std": 0.0, "grad_norm": 0.11947871744632721, "learning_rate": 1e-06, "loss": 0.0029, "num_tokens": 67731251.0, "reward": 2.4403347969055176, "reward_std": 0.18954503536224365, "rewards/cosine_scaled_reward/mean": 0.6084376573562622, "rewards/cosine_scaled_reward/std": 0.32468897104263306, "rewards/repetition_penalty_reward/mean": -0.07044674456119537, "rewards/repetition_penalty_reward/std": 0.043707504868507385, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.90234375, "rewards/reward_reference/std": 0.29743078351020813, "step": 120 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.65625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3712.0, "completions/mean_length": 2024.26953125, "completions/mean_terminated_length": 1931.2529296875, "completions/min_length": 1157.0, "completions/min_terminated_length": 1157.0, "epoch": 0.025813333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.07135090231895447, "learning_rate": 1e-06, "loss": -0.0286, "num_tokens": 68365360.0, "reward": 2.4914162158966064, "reward_std": 0.11930274963378906, "rewards/cosine_scaled_reward/mean": 0.6463751792907715, "rewards/cosine_scaled_reward/std": 0.30041638016700745, "rewards/repetition_penalty_reward/mean": -0.07214657962322235, "rewards/repetition_penalty_reward/std": 0.05392837896943092, "rewards/reward_format/mean": 0.987500011920929, "rewards/reward_format/std": 0.0994100272655487, "rewards/reward_reference/mean": 0.9296875, "rewards/reward_reference/std": 0.2561737895011902, "step": 121 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4022.0, "completions/mean_length": 2024.44140625, "completions/mean_terminated_length": 1991.5596923828125, "completions/min_length": 1027.0, "completions/min_terminated_length": 1027.0, "epoch": 0.026026666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.12721113860607147, "learning_rate": 1e-06, "loss": -0.0302, "num_tokens": 69023685.0, "reward": 2.404798984527588, "reward_std": 0.2287992238998413, "rewards/cosine_scaled_reward/mean": 0.6008037328720093, "rewards/cosine_scaled_reward/std": 0.3563878536224365, "rewards/repetition_penalty_reward/mean": -0.07569223642349243, "rewards/repetition_penalty_reward/std": 0.05869562551379204, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.8828125, "rewards/reward_reference/std": 0.3222736418247223, "step": 122 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.78125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3926.0, "completions/mean_length": 2080.6953125, "completions/mean_terminated_length": 2024.0400390625, "completions/min_length": 1153.0, "completions/min_terminated_length": 1153.0, "epoch": 0.02624, "frac_reward_zero_std": 0.0, "grad_norm": 0.07020558416843414, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 69699179.0, "reward": 2.508188009262085, "reward_std": 0.09734837710857391, "rewards/cosine_scaled_reward/mean": 0.6614010334014893, "rewards/cosine_scaled_reward/std": 0.29930007457733154, "rewards/repetition_penalty_reward/mean": -0.0758691132068634, "rewards/repetition_penalty_reward/std": 0.05258987843990326, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.92578125, "rewards/reward_reference/std": 0.2626400291919708, "step": 123 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3513.0, "completions/mean_length": 2013.26171875, "completions/mean_terminated_length": 2005.09423828125, "completions/min_length": 987.0, "completions/min_terminated_length": 987.0, "epoch": 0.026453333333333332, "frac_reward_zero_std": 0.0, "grad_norm": 0.10724949091672897, "learning_rate": 1e-06, "loss": -0.0031, "num_tokens": 70370350.0, "reward": 2.5433502197265625, "reward_std": 0.1528834104537964, "rewards/cosine_scaled_reward/mean": 0.6737625598907471, "rewards/cosine_scaled_reward/std": 0.2616587281227112, "rewards/repetition_penalty_reward/mean": -0.06791241466999054, "rewards/repetition_penalty_reward/std": 0.04298345372080803, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9375, "rewards/reward_reference/std": 0.24253563582897186, "step": 124 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.78125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3957.0, "completions/mean_length": 2100.45703125, "completions/mean_terminated_length": 2044.3572998046875, "completions/min_length": 1012.0, "completions/min_terminated_length": 1012.0, "epoch": 0.02666666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.10530390590429306, "learning_rate": 1e-06, "loss": 0.0058, "num_tokens": 71047395.0, "reward": 2.407808780670166, "reward_std": 0.19165164232254028, "rewards/cosine_scaled_reward/mean": 0.6080106496810913, "rewards/cosine_scaled_reward/std": 0.36637604236602783, "rewards/repetition_penalty_reward/mean": -0.0798891931772232, "rewards/repetition_penalty_reward/std": 0.052827976644039154, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.8828125, "rewards/reward_reference/std": 0.3222736418247223, "step": 125 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3566.0, "completions/mean_length": 1920.1484375, "completions/mean_terminated_length": 1911.6158447265625, "completions/min_length": 1080.0, "completions/min_terminated_length": 1080.0, "epoch": 0.02688, "frac_reward_zero_std": 0.0, "grad_norm": 0.026686642318964005, "learning_rate": 1e-06, "loss": -0.0103, "num_tokens": 71693881.0, "reward": 2.619581937789917, "reward_std": 0.05261857807636261, "rewards/cosine_scaled_reward/mean": 0.7018906474113464, "rewards/cosine_scaled_reward/std": 0.16163213551044464, "rewards/repetition_penalty_reward/mean": -0.06277747452259064, "rewards/repetition_penalty_reward/std": 0.04004902392625809, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.98046875, "rewards/reward_reference/std": 0.13865381479263306, "step": 126 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3853.0, "completions/mean_length": 2037.4765625, "completions/mean_terminated_length": 2013.0672607421875, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "epoch": 0.027093333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.06936460733413696, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 72356207.0, "reward": 2.4788427352905273, "reward_std": 0.10106191039085388, "rewards/cosine_scaled_reward/mean": 0.6478254199028015, "rewards/cosine_scaled_reward/std": 0.3071100413799286, "rewards/repetition_penalty_reward/mean": -0.07288884371519089, "rewards/repetition_penalty_reward/std": 0.04149799793958664, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.91015625, "rewards/reward_reference/std": 0.2865179479122162, "step": 127 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.65625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4032.0, "completions/mean_length": 2047.5234375, "completions/mean_terminated_length": 1955.5509033203125, "completions/min_length": 1143.0, "completions/min_terminated_length": 1143.0, "epoch": 0.027306666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.09897647798061371, "learning_rate": 1e-06, "loss": -0.0048, "num_tokens": 72987537.0, "reward": 2.4084055423736572, "reward_std": 0.1814638078212738, "rewards/cosine_scaled_reward/mean": 0.6032418012619019, "rewards/cosine_scaled_reward/std": 0.35705065727233887, "rewards/repetition_penalty_reward/mean": -0.06905508041381836, "rewards/repetition_penalty_reward/std": 0.048145100474357605, "rewards/reward_format/mean": 0.987500011920929, "rewards/reward_format/std": 0.0994100272655487, "rewards/reward_reference/mean": 0.88671875, "rewards/reward_reference/std": 0.31755712628364563, "step": 128 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3861.0, "completions/mean_length": 2110.09765625, "completions/mean_terminated_length": 2094.460693359375, "completions/min_length": 1158.0, "completions/min_terminated_length": 1158.0, "epoch": 0.02752, "frac_reward_zero_std": 0.0, "grad_norm": 0.1392839103937149, "learning_rate": 1e-06, "loss": -0.0101, "num_tokens": 73680226.0, "reward": 2.4746334552764893, "reward_std": 0.24597196280956268, "rewards/cosine_scaled_reward/mean": 0.6423944234848022, "rewards/cosine_scaled_reward/std": 0.33495256304740906, "rewards/repetition_penalty_reward/mean": -0.07010472565889359, "rewards/repetition_penalty_reward/std": 0.03858313709497452, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.90234375, "rewards/reward_reference/std": 0.29743078351020813, "step": 129 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.5, "completions/max_length": 4096.0, "completions/max_terminated_length": 3984.0, "completions/mean_length": 2141.734375, "completions/mean_terminated_length": 2011.4500732421875, "completions/min_length": 1148.0, "completions/min_terminated_length": 1148.0, "epoch": 0.027733333333333332, "frac_reward_zero_std": 0.0, "grad_norm": 0.093904048204422, "learning_rate": 1e-06, "loss": -0.0385, "num_tokens": 74322770.0, "reward": 2.463771343231201, "reward_std": 0.19271329045295715, "rewards/cosine_scaled_reward/mean": 0.6441590189933777, "rewards/cosine_scaled_reward/std": 0.3357996940612793, "rewards/repetition_penalty_reward/mean": -0.07726290076971054, "rewards/repetition_penalty_reward/std": 0.055103596299886703, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626226335763931, "rewards/reward_reference/mean": 0.90625, "rewards/reward_reference/std": 0.2920515835285187, "step": 130 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.75, "completions/max_length": 4096.0, "completions/max_terminated_length": 4070.0, "completions/mean_length": 2108.25, "completions/mean_terminated_length": 2044.1290283203125, "completions/min_length": 1217.0, "completions/min_terminated_length": 1217.0, "epoch": 0.02794666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.10858944058418274, "learning_rate": 1e-06, "loss": -0.0266, "num_tokens": 74983862.0, "reward": 2.411698818206787, "reward_std": 0.14864076673984528, "rewards/cosine_scaled_reward/mean": 0.6128315925598145, "rewards/cosine_scaled_reward/std": 0.3646450638771057, "rewards/repetition_penalty_reward/mean": -0.0745701789855957, "rewards/repetition_penalty_reward/std": 0.0539807491004467, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626226335763931, "rewards/reward_reference/mean": 0.8828125, "rewards/reward_reference/std": 0.3222736418247223, "step": 131 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.78125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3876.0, "completions/mean_length": 2086.9140625, "completions/mean_terminated_length": 2030.4337158203125, "completions/min_length": 1081.0, "completions/min_terminated_length": 1081.0, "epoch": 0.02816, "frac_reward_zero_std": 0.0, "grad_norm": 0.12158370763063431, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 75646960.0, "reward": 2.420889139175415, "reward_std": 0.14892278611660004, "rewards/cosine_scaled_reward/mean": 0.6176682710647583, "rewards/cosine_scaled_reward/std": 0.35261261463165283, "rewards/repetition_penalty_reward/mean": -0.07412292063236237, "rewards/repetition_penalty_reward/std": 0.06017336621880531, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626226335763931, "rewards/reward_reference/mean": 0.88671875, "rewards/reward_reference/std": 0.31755712628364563, "step": 132 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3954.0, "completions/mean_length": 2100.36328125, "completions/mean_terminated_length": 2068.6865234375, "completions/min_length": 1299.0, "completions/min_terminated_length": 1299.0, "epoch": 0.028373333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.08128384500741959, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 76331945.0, "reward": 2.5049729347229004, "reward_std": 0.11164649575948715, "rewards/cosine_scaled_reward/mean": 0.6589208245277405, "rewards/cosine_scaled_reward/std": 0.309569776058197, "rewards/repetition_penalty_reward/mean": -0.06879155337810516, "rewards/repetition_penalty_reward/std": 0.036213457584381104, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.91796875, "rewards/reward_reference/std": 0.2749498784542084, "step": 133 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4003.0, "completions/mean_length": 2168.25390625, "completions/mean_terminated_length": 2153.07470703125, "completions/min_length": 1157.0, "completions/min_terminated_length": 1157.0, "epoch": 0.028586666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.1047474816441536, "learning_rate": 1e-06, "loss": -0.0122, "num_tokens": 77025026.0, "reward": 2.452206611633301, "reward_std": 0.17508813738822937, "rewards/cosine_scaled_reward/mean": 0.6381878852844238, "rewards/cosine_scaled_reward/std": 0.35581979155540466, "rewards/repetition_penalty_reward/mean": -0.06957493722438812, "rewards/repetition_penalty_reward/std": 0.04151101037859917, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.88671875, "rewards/reward_reference/std": 0.31755712628364563, "step": 134 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3802.0, "completions/mean_length": 2059.671875, "completions/mean_terminated_length": 2043.6378173828125, "completions/min_length": 1275.0, "completions/min_terminated_length": 1275.0, "epoch": 0.0288, "frac_reward_zero_std": 0.0, "grad_norm": 0.08779332041740417, "learning_rate": 1e-06, "loss": -0.0137, "num_tokens": 77701418.0, "reward": 2.5491394996643066, "reward_std": 0.12299126386642456, "rewards/cosine_scaled_reward/mean": 0.6745621562004089, "rewards/cosine_scaled_reward/std": 0.2754439413547516, "rewards/repetition_penalty_reward/mean": -0.06292291730642319, "rewards/repetition_penalty_reward/std": 0.04240197688341141, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9375, "rewards/reward_reference/std": 0.24253563582897186, "step": 135 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4013.0, "completions/mean_length": 2068.34765625, "completions/mean_terminated_length": 2036.162841796875, "completions/min_length": 1027.0, "completions/min_terminated_length": 1027.0, "epoch": 0.029013333333333332, "frac_reward_zero_std": 0.0, "grad_norm": 0.09798014909029007, "learning_rate": 1e-06, "loss": -0.0196, "num_tokens": 78378615.0, "reward": 2.6103098392486572, "reward_std": 0.09224607050418854, "rewards/cosine_scaled_reward/mean": 0.7064558267593384, "rewards/cosine_scaled_reward/std": 0.22068408131599426, "rewards/repetition_penalty_reward/mean": -0.06098976358771324, "rewards/repetition_penalty_reward/std": 0.04063894599676132, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.96484375, "rewards/reward_reference/std": 0.18453538417816162, "step": 136 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 3710.0, "completions/max_terminated_length": 3710.0, "completions/mean_length": 2071.12109375, "completions/mean_terminated_length": 2071.12109375, "completions/min_length": 1181.0, "completions/min_terminated_length": 1181.0, "epoch": 0.029226666666666668, "frac_reward_zero_std": 0.0, "grad_norm": 0.3282462954521179, "learning_rate": 1e-06, "loss": 0.0074, "num_tokens": 79071558.0, "reward": 2.4916515350341797, "reward_std": 0.12393692880868912, "rewards/cosine_scaled_reward/mean": 0.6448882818222046, "rewards/cosine_scaled_reward/std": 0.3213672935962677, "rewards/repetition_penalty_reward/mean": -0.06339280307292938, "rewards/repetition_penalty_reward/std": 0.0333862230181694, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.91015625, "rewards/reward_reference/std": 0.2865179479122162, "step": 137 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3987.0, "completions/mean_length": 2098.19140625, "completions/mean_terminated_length": 2066.480224609375, "completions/min_length": 1271.0, "completions/min_terminated_length": 1271.0, "epoch": 0.02944, "frac_reward_zero_std": 0.0, "grad_norm": 0.08934274315834045, "learning_rate": 1e-06, "loss": -0.0331, "num_tokens": 79752351.0, "reward": 2.5181455612182617, "reward_std": 0.14886733889579773, "rewards/cosine_scaled_reward/mean": 0.6674562096595764, "rewards/cosine_scaled_reward/std": 0.2978929877281189, "rewards/repetition_penalty_reward/mean": -0.06806077808141708, "rewards/repetition_penalty_reward/std": 0.04327305778861046, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.921875, "rewards/reward_reference/std": 0.26889389753341675, "step": 138 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.6875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3661.0, "completions/mean_length": 2254.41015625, "completions/mean_terminated_length": 2179.548583984375, "completions/min_length": 1266.0, "completions/min_terminated_length": 1266.0, "epoch": 0.029653333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.12618717551231384, "learning_rate": 1e-06, "loss": -0.0466, "num_tokens": 80450128.0, "reward": 2.5388686656951904, "reward_std": 0.24636448919773102, "rewards/cosine_scaled_reward/mean": 0.6855044960975647, "rewards/cosine_scaled_reward/std": 0.3177576959133148, "rewards/repetition_penalty_reward/mean": -0.06929213553667068, "rewards/repetition_penalty_reward/std": 0.0481431819498539, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.92578125, "rewards/reward_reference/std": 0.2626400291919708, "step": 139 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3906.0, "completions/mean_length": 2127.328125, "completions/mean_terminated_length": 2096.07958984375, "completions/min_length": 1090.0, "completions/min_terminated_length": 1090.0, "epoch": 0.029866666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.11913932859897614, "learning_rate": 1e-06, "loss": -0.0033, "num_tokens": 81140656.0, "reward": 2.5316383838653564, "reward_std": 0.18211862444877625, "rewards/cosine_scaled_reward/mean": 0.673290491104126, "rewards/cosine_scaled_reward/std": 0.29935595393180847, "rewards/repetition_penalty_reward/mean": -0.07133965194225311, "rewards/repetition_penalty_reward/std": 0.04711141809821129, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9296875, "rewards/reward_reference/std": 0.2561737895011902, "step": 140 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.75, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 2295.82421875, "completions/mean_terminated_length": 2237.75390625, "completions/min_length": 1371.0, "completions/min_terminated_length": 1371.0, "epoch": 0.03008, "frac_reward_zero_std": 0.0, "grad_norm": 0.13442406058311462, "learning_rate": 1e-06, "loss": -0.0334, "num_tokens": 81863259.0, "reward": 2.4578609466552734, "reward_std": 0.2603323757648468, "rewards/cosine_scaled_reward/mean": 0.649109959602356, "rewards/cosine_scaled_reward/std": 0.3731234669685364, "rewards/repetition_penalty_reward/mean": -0.07484269142150879, "rewards/repetition_penalty_reward/std": 0.04426012560725212, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.88671875, "rewards/reward_reference/std": 0.31755712628364563, "step": 141 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.8125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4033.0, "completions/mean_length": 2203.62109375, "completions/mean_terminated_length": 2158.2041015625, "completions/min_length": 1266.0, "completions/min_terminated_length": 1266.0, "epoch": 0.030293333333333332, "frac_reward_zero_std": 0.0, "grad_norm": 0.07963822036981583, "learning_rate": 1e-06, "loss": -0.0364, "num_tokens": 82563706.0, "reward": 2.5777063369750977, "reward_std": 0.1257934868335724, "rewards/cosine_scaled_reward/mean": 0.7073277235031128, "rewards/cosine_scaled_reward/std": 0.26815474033355713, "rewards/repetition_penalty_reward/mean": -0.06868387758731842, "rewards/repetition_penalty_reward/std": 0.054385874420404434, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.9453125, "rewards/reward_reference/std": 0.22781464457511902, "step": 142 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.78125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4055.0, "completions/mean_length": 2262.61328125, "completions/mean_terminated_length": 2211.072265625, "completions/min_length": 1203.0, "completions/min_terminated_length": 1203.0, "epoch": 0.030506666666666668, "frac_reward_zero_std": 0.0, "grad_norm": 0.10525289922952652, "learning_rate": 1e-06, "loss": -0.0235, "num_tokens": 83278755.0, "reward": 2.4962220191955566, "reward_std": 0.18696096539497375, "rewards/cosine_scaled_reward/mean": 0.6652387380599976, "rewards/cosine_scaled_reward/std": 0.34593114256858826, "rewards/repetition_penalty_reward/mean": -0.07136042416095734, "rewards/repetition_penalty_reward/std": 0.04237104579806328, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.90234375, "rewards/reward_reference/std": 0.29743078351020813, "step": 143 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.71875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3829.0, "completions/mean_length": 2297.6015625, "completions/mean_terminated_length": 2232.072998046875, "completions/min_length": 1335.0, "completions/min_terminated_length": 1335.0, "epoch": 0.03072, "frac_reward_zero_std": 0.0, "grad_norm": 0.15209636092185974, "learning_rate": 1e-06, "loss": -0.0071, "num_tokens": 83990089.0, "reward": 2.5153186321258545, "reward_std": 0.2199176400899887, "rewards/cosine_scaled_reward/mean": 0.6713955998420715, "rewards/cosine_scaled_reward/std": 0.3484877049922943, "rewards/repetition_penalty_reward/mean": -0.07013943791389465, "rewards/repetition_penalty_reward/std": 0.0492706373333931, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9140625, "rewards/reward_reference/std": 0.28082075715065, "step": 144 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4001.0, "completions/mean_length": 2090.34765625, "completions/mean_terminated_length": 2066.565185546875, "completions/min_length": 1152.0, "completions/min_terminated_length": 1152.0, "epoch": 0.030933333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.09697643667459488, "learning_rate": 1e-06, "loss": 0.0057, "num_tokens": 84684358.0, "reward": 2.6115381717681885, "reward_std": 0.1272924244403839, "rewards/cosine_scaled_reward/mean": 0.7085531949996948, "rewards/cosine_scaled_reward/std": 0.22511249780654907, "rewards/repetition_penalty_reward/mean": -0.06185879185795784, "rewards/repetition_penalty_reward/std": 0.03542029857635498, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.96484375, "rewards/reward_reference/std": 0.18453538417816162, "step": 145 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.71875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4047.0, "completions/mean_length": 2369.5859375, "completions/mean_terminated_length": 2306.68017578125, "completions/min_length": 1360.0, "completions/min_terminated_length": 1360.0, "epoch": 0.031146666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.08577720075845718, "learning_rate": 1e-06, "loss": -0.0259, "num_tokens": 85418660.0, "reward": 2.4473724365234375, "reward_std": 0.10904312133789062, "rewards/cosine_scaled_reward/mean": 0.6387135982513428, "rewards/cosine_scaled_reward/std": 0.40183010697364807, "rewards/repetition_penalty_reward/mean": -0.07024732232093811, "rewards/repetition_penalty_reward/std": 0.04344237968325615, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.87890625, "rewards/reward_reference/std": 0.3268752694129944, "step": 146 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.71875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4066.0, "completions/mean_length": 2296.45703125, "completions/mean_terminated_length": 2230.88671875, "completions/min_length": 1198.0, "completions/min_terminated_length": 1198.0, "epoch": 0.03136, "frac_reward_zero_std": 0.0, "grad_norm": 0.16243378818035126, "learning_rate": 1e-06, "loss": -0.0572, "num_tokens": 86130453.0, "reward": 2.5525739192962646, "reward_std": 0.20578095316886902, "rewards/cosine_scaled_reward/mean": 0.6998293399810791, "rewards/cosine_scaled_reward/std": 0.31000176072120667, "rewards/repetition_penalty_reward/mean": -0.0714741051197052, "rewards/repetition_penalty_reward/std": 0.06012459099292755, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626225590705872, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 147 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3801.0, "completions/mean_length": 2168.5078125, "completions/mean_terminated_length": 2153.330810546875, "completions/min_length": 1034.0, "completions/min_terminated_length": 1034.0, "epoch": 0.031573333333333335, "frac_reward_zero_std": 0.0, "grad_norm": 0.11189655214548111, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 86843211.0, "reward": 2.5102152824401855, "reward_std": 0.11899837106466293, "rewards/cosine_scaled_reward/mean": 0.6611608266830444, "rewards/cosine_scaled_reward/std": 0.3290916681289673, "rewards/repetition_penalty_reward/mean": -0.0611017569899559, "rewards/repetition_penalty_reward/std": 0.04228059947490692, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.91015625, "rewards/reward_reference/std": 0.2865179479122162, "step": 148 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.71875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4053.0, "completions/mean_length": 2289.94921875, "completions/mean_terminated_length": 2224.141845703125, "completions/min_length": 1175.0, "completions/min_terminated_length": 1175.0, "epoch": 0.031786666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 0.07405582815408707, "learning_rate": 1e-06, "loss": -0.0037, "num_tokens": 87555606.0, "reward": 2.5253071784973145, "reward_std": 0.08711743354797363, "rewards/cosine_scaled_reward/mean": 0.6888059377670288, "rewards/cosine_scaled_reward/std": 0.32022789120674133, "rewards/repetition_penalty_reward/mean": -0.07365491986274719, "rewards/repetition_penalty_reward/std": 0.0674201175570488, "rewards/reward_format/mean": 0.984375, "rewards/reward_format/std": 0.11092304438352585, "rewards/reward_reference/mean": 0.92578125, "rewards/reward_reference/std": 0.2626400291919708, "step": 149 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3995.0, "completions/mean_length": 2282.83203125, "completions/mean_terminated_length": 2246.713134765625, "completions/min_length": 1189.0, "completions/min_terminated_length": 1189.0, "epoch": 0.032, "frac_reward_zero_std": 0.0, "grad_norm": 0.11084163933992386, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 88283223.0, "reward": 2.4095571041107178, "reward_std": 0.14110702276229858, "rewards/cosine_scaled_reward/mean": 0.6185789108276367, "rewards/cosine_scaled_reward/std": 0.40124762058258057, "rewards/repetition_penalty_reward/mean": -0.06995944678783417, "rewards/repetition_penalty_reward/std": 0.04662758484482765, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.8671875, "rewards/reward_reference/std": 0.3400367796421051, "step": 150 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3901.0, "completions/mean_length": 2269.77734375, "completions/mean_terminated_length": 2248.12255859375, "completions/min_length": 1339.0, "completions/min_terminated_length": 1339.0, "epoch": 0.03221333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.08746644854545593, "learning_rate": 1e-06, "loss": -0.0327, "num_tokens": 89019246.0, "reward": 2.6399552822113037, "reward_std": 0.09949071705341339, "rewards/cosine_scaled_reward/mean": 0.7395962476730347, "rewards/cosine_scaled_reward/std": 0.23498032987117767, "rewards/repetition_penalty_reward/mean": -0.05745348334312439, "rewards/repetition_penalty_reward/std": 0.02995491400361061, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.9609375, "rewards/reward_reference/std": 0.19412322342395782, "step": 151 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3631.0, "completions/mean_length": 2177.3203125, "completions/mean_terminated_length": 2162.212646484375, "completions/min_length": 1313.0, "completions/min_terminated_length": 1313.0, "epoch": 0.032426666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.06584572046995163, "learning_rate": 1e-06, "loss": -0.0111, "num_tokens": 89737848.0, "reward": 2.6734888553619385, "reward_std": 0.07492285966873169, "rewards/cosine_scaled_reward/mean": 0.7505329847335815, "rewards/cosine_scaled_reward/std": 0.1640416830778122, "rewards/repetition_penalty_reward/mean": -0.0614192858338356, "rewards/repetition_penalty_reward/std": 0.03193984180688858, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.984375, "rewards/reward_reference/std": 0.12426253408193588, "step": 152 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.8125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4061.0, "completions/mean_length": 2339.8046875, "completions/mean_terminated_length": 2297.656005859375, "completions/min_length": 1232.0, "completions/min_terminated_length": 1232.0, "epoch": 0.03264, "frac_reward_zero_std": 0.0, "grad_norm": 0.16215871274471283, "learning_rate": 1e-06, "loss": -0.046, "num_tokens": 90474706.0, "reward": 2.512293815612793, "reward_std": 0.2166244387626648, "rewards/cosine_scaled_reward/mean": 0.6809740662574768, "rewards/cosine_scaled_reward/std": 0.34952786564826965, "rewards/repetition_penalty_reward/mean": -0.06243017315864563, "rewards/repetition_penalty_reward/std": 0.040823645889759064, "rewards/reward_format/mean": 0.987500011920929, "rewards/reward_format/std": 0.0994100272655487, "rewards/reward_reference/mean": 0.90625, "rewards/reward_reference/std": 0.2920515835285187, "step": 153 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.65625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 2528.88671875, "completions/mean_terminated_length": 2458.5263671875, "completions/min_length": 1344.0, "completions/min_terminated_length": 1344.0, "epoch": 0.03285333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.13708092272281647, "learning_rate": 1e-06, "loss": -0.0313, "num_tokens": 91241253.0, "reward": 2.446394920349121, "reward_std": 0.2406942993402481, "rewards/cosine_scaled_reward/mean": 0.6548641324043274, "rewards/cosine_scaled_reward/std": 0.42234691977500916, "rewards/repetition_penalty_reward/mean": -0.07565662264823914, "rewards/repetition_penalty_reward/std": 0.058208148926496506, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.8671875, "rewards/reward_reference/std": 0.3400367796421051, "step": 154 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4036.0, "completions/mean_length": 2224.78125, "completions/mean_terminated_length": 2217.443359375, "completions/min_length": 1331.0, "completions/min_terminated_length": 1331.0, "epoch": 0.03306666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.11987435817718506, "learning_rate": 1e-06, "loss": 0.0056, "num_tokens": 91966513.0, "reward": 2.5100762844085693, "reward_std": 0.1404195874929428, "rewards/cosine_scaled_reward/mean": 0.6662790775299072, "rewards/cosine_scaled_reward/std": 0.3378640413284302, "rewards/repetition_penalty_reward/mean": -0.06635906547307968, "rewards/repetition_penalty_reward/std": 0.03748669847846031, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.91015625, "rewards/reward_reference/std": 0.2865179479122162, "step": 155 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3992.0, "completions/mean_length": 2387.51171875, "completions/mean_terminated_length": 2380.81201171875, "completions/min_length": 1170.0, "completions/min_terminated_length": 1170.0, "epoch": 0.03328, "frac_reward_zero_std": 0.0, "grad_norm": 0.133942648768425, "learning_rate": 1e-06, "loss": -0.0175, "num_tokens": 92736440.0, "reward": 2.584214925765991, "reward_std": 0.1829279661178589, "rewards/cosine_scaled_reward/mean": 0.7195907235145569, "rewards/cosine_scaled_reward/std": 0.3137247860431671, "rewards/repetition_penalty_reward/mean": -0.06115696206688881, "rewards/repetition_penalty_reward/std": 0.030387477949261665, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.92578125, "rewards/reward_reference/std": 0.2626400291919708, "step": 156 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.71875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3841.0, "completions/mean_length": 2337.01171875, "completions/mean_terminated_length": 2272.919189453125, "completions/min_length": 1126.0, "completions/min_terminated_length": 1126.0, "epoch": 0.03349333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.08026068657636642, "learning_rate": 1e-06, "loss": -0.0285, "num_tokens": 93468063.0, "reward": 2.5636253356933594, "reward_std": 0.14338311553001404, "rewards/cosine_scaled_reward/mean": 0.7090778350830078, "rewards/cosine_scaled_reward/std": 0.3102107048034668, "rewards/repetition_penalty_reward/mean": -0.06967125833034515, "rewards/repetition_penalty_reward/std": 0.04909246787428856, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626226335763931, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 157 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4070.0, "completions/mean_length": 2347.16015625, "completions/mean_terminated_length": 2340.302001953125, "completions/min_length": 1254.0, "completions/min_terminated_length": 1254.0, "epoch": 0.03370666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.07013080269098282, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 94229528.0, "reward": 2.6471498012542725, "reward_std": 0.06843972951173782, "rewards/cosine_scaled_reward/mean": 0.7538424134254456, "rewards/cosine_scaled_reward/std": 0.2390926480293274, "rewards/repetition_penalty_reward/mean": -0.06763020157814026, "rewards/repetition_penalty_reward/std": 0.04745958000421524, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9609375, "rewards/reward_reference/std": 0.19412322342395782, "step": 158 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.78125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4060.0, "completions/mean_length": 2336.1640625, "completions/mean_terminated_length": 2286.690673828125, "completions/min_length": 1311.0, "completions/min_terminated_length": 1311.0, "epoch": 0.03392, "frac_reward_zero_std": 0.0, "grad_norm": 0.11125911772251129, "learning_rate": 1e-06, "loss": -0.0059, "num_tokens": 94959914.0, "reward": 2.49423885345459, "reward_std": 0.16092067956924438, "rewards/cosine_scaled_reward/mean": 0.6674948930740356, "rewards/cosine_scaled_reward/std": 0.3645523488521576, "rewards/repetition_penalty_reward/mean": -0.06388123333454132, "rewards/repetition_penalty_reward/std": 0.03779073432087898, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.890625, "rewards/reward_reference/std": 0.31272050738334656, "step": 159 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4075.0, "completions/mean_length": 2357.67578125, "completions/mean_terminated_length": 2343.98828125, "completions/min_length": 1368.0, "completions/min_terminated_length": 1368.0, "epoch": 0.034133333333333335, "frac_reward_zero_std": 0.0, "grad_norm": 0.17198342084884644, "learning_rate": 1e-06, "loss": -0.0143, "num_tokens": 95715799.0, "reward": 2.5218825340270996, "reward_std": 0.16397586464881897, "rewards/cosine_scaled_reward/mean": 0.6893811225891113, "rewards/cosine_scaled_reward/std": 0.3468390107154846, "rewards/repetition_penalty_reward/mean": -0.06671740859746933, "rewards/repetition_penalty_reward/std": 0.036188509315252304, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.90234375, "rewards/reward_reference/std": 0.29743078351020813, "step": 160 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3986.0, "completions/mean_length": 2388.20703125, "completions/mean_terminated_length": 2381.510009765625, "completions/min_length": 1381.0, "completions/min_terminated_length": 1381.0, "epoch": 0.034346666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 0.12249568849802017, "learning_rate": 1e-06, "loss": -0.0073, "num_tokens": 96481476.0, "reward": 2.613097667694092, "reward_std": 0.16200359165668488, "rewards/cosine_scaled_reward/mean": 0.7398969531059265, "rewards/cosine_scaled_reward/std": 0.28102821111679077, "rewards/repetition_penalty_reward/mean": -0.06508070230484009, "rewards/repetition_penalty_reward/std": 0.03461037203669548, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.94140625, "rewards/reward_reference/std": 0.23532284796237946, "step": 161 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.65625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3821.0, "completions/mean_length": 2416.1640625, "completions/mean_terminated_length": 2340.74267578125, "completions/min_length": 1243.0, "completions/min_terminated_length": 1243.0, "epoch": 0.03456, "frac_reward_zero_std": 0.0, "grad_norm": 0.13304626941680908, "learning_rate": 1e-06, "loss": -0.0298, "num_tokens": 97219774.0, "reward": 2.577547550201416, "reward_std": 0.1826041340827942, "rewards/cosine_scaled_reward/mean": 0.7215290069580078, "rewards/cosine_scaled_reward/std": 0.31414729356765747, "rewards/repetition_penalty_reward/mean": -0.07366900146007538, "rewards/repetition_penalty_reward/std": 0.050914179533720016, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9296875, "rewards/reward_reference/std": 0.2561737895011902, "step": 162 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4003.0, "completions/mean_length": 2342.35546875, "completions/mean_terminated_length": 2335.478515625, "completions/min_length": 1415.0, "completions/min_terminated_length": 1415.0, "epoch": 0.03477333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.08213608711957932, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 97981425.0, "reward": 2.6511223316192627, "reward_std": 0.07822506129741669, "rewards/cosine_scaled_reward/mean": 0.7545344829559326, "rewards/cosine_scaled_reward/std": 0.23536533117294312, "rewards/repetition_penalty_reward/mean": -0.06122472137212753, "rewards/repetition_penalty_reward/std": 0.03276967257261276, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.9609375, "rewards/reward_reference/std": 0.19412322342395782, "step": 163 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.65625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3932.0, "completions/mean_length": 2422.30859375, "completions/mean_terminated_length": 2347.1630859375, "completions/min_length": 1465.0, "completions/min_terminated_length": 1465.0, "epoch": 0.034986666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.1392398625612259, "learning_rate": 1e-06, "loss": -0.0218, "num_tokens": 98721128.0, "reward": 2.518110513687134, "reward_std": 0.192726731300354, "rewards/cosine_scaled_reward/mean": 0.6899570226669312, "rewards/cosine_scaled_reward/std": 0.3583027422428131, "rewards/repetition_penalty_reward/mean": -0.07575271278619766, "rewards/repetition_penalty_reward/std": 0.059990935027599335, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.91015625, "rewards/reward_reference/std": 0.2865179479122162, "step": 164 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.8125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3879.0, "completions/mean_length": 2430.46875, "completions/mean_terminated_length": 2390.49609375, "completions/min_length": 1569.0, "completions/min_terminated_length": 1569.0, "epoch": 0.0352, "frac_reward_zero_std": 0.0, "grad_norm": 0.11965960264205933, "learning_rate": 1e-06, "loss": -0.0261, "num_tokens": 99479680.0, "reward": 2.5446901321411133, "reward_std": 0.14299365878105164, "rewards/cosine_scaled_reward/mean": 0.7057998180389404, "rewards/cosine_scaled_reward/std": 0.34264159202575684, "rewards/repetition_penalty_reward/mean": -0.07126598805189133, "rewards/repetition_penalty_reward/std": 0.04200437292456627, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.91015625, "rewards/reward_reference/std": 0.2865179479122162, "step": 165 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.59375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 2471.875, "completions/mean_terminated_length": 2384.987548828125, "completions/min_length": 1349.0, "completions/min_terminated_length": 1349.0, "epoch": 0.03541333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.1257084310054779, "learning_rate": 1e-06, "loss": -0.0213, "num_tokens": 100223592.0, "reward": 2.496830940246582, "reward_std": 0.1805962324142456, "rewards/cosine_scaled_reward/mean": 0.6858898997306824, "rewards/cosine_scaled_reward/std": 0.3756991922855377, "rewards/repetition_penalty_reward/mean": -0.07812131196260452, "rewards/repetition_penalty_reward/std": 0.060796987265348434, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626226335763931, "rewards/reward_reference/mean": 0.8984375, "rewards/reward_reference/std": 0.3026638329029083, "step": 166 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.8125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3871.0, "completions/mean_length": 2479.078125, "completions/mean_terminated_length": 2440.272216796875, "completions/min_length": 1397.0, "completions/min_terminated_length": 1397.0, "epoch": 0.03562666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.18038783967494965, "learning_rate": 1e-06, "loss": -0.0203, "num_tokens": 100991524.0, "reward": 2.527858018875122, "reward_std": 0.2230907678604126, "rewards/cosine_scaled_reward/mean": 0.6958828568458557, "rewards/cosine_scaled_reward/std": 0.36940014362335205, "rewards/repetition_penalty_reward/mean": -0.07427485287189484, "rewards/repetition_penalty_reward/std": 0.047159843146800995, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.90625, "rewards/reward_reference/std": 0.2920515835285187, "step": 167 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4073.0, "completions/mean_length": 2387.18359375, "completions/mean_terminated_length": 2360.0595703125, "completions/min_length": 1488.0, "completions/min_terminated_length": 1488.0, "epoch": 0.03584, "frac_reward_zero_std": 0.0, "grad_norm": 0.10775547474622726, "learning_rate": 1e-06, "loss": -0.0238, "num_tokens": 101752855.0, "reward": 2.671187400817871, "reward_std": 0.11521877348423004, "rewards/cosine_scaled_reward/mean": 0.7671756148338318, "rewards/cosine_scaled_reward/std": 0.2231522649526596, "rewards/repetition_penalty_reward/mean": -0.06083208695054054, "rewards/repetition_penalty_reward/std": 0.036698468029499054, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.96484375, "rewards/reward_reference/std": 0.18453538417816162, "step": 168 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4015.0, "completions/mean_length": 2439.29296875, "completions/mean_terminated_length": 2406.291015625, "completions/min_length": 1414.0, "completions/min_terminated_length": 1414.0, "epoch": 0.03605333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.07064104825258255, "learning_rate": 1e-06, "loss": -0.0098, "num_tokens": 102523330.0, "reward": 2.598905563354492, "reward_std": 0.09044383466243744, "rewards/cosine_scaled_reward/mean": 0.7386330366134644, "rewards/cosine_scaled_reward/std": 0.2991005480289459, "rewards/repetition_penalty_reward/mean": -0.0709775984287262, "rewards/repetition_penalty_reward/std": 0.045941270887851715, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.9375, "rewards/reward_reference/std": 0.24253563582897186, "step": 169 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.8125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3997.0, "completions/mean_length": 2463.8046875, "completions/mean_terminated_length": 2424.632080078125, "completions/min_length": 1516.0, "completions/min_terminated_length": 1516.0, "epoch": 0.03626666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.13372494280338287, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 103297172.0, "reward": 2.620488405227661, "reward_std": 0.1414932757616043, "rewards/cosine_scaled_reward/mean": 0.746385931968689, "rewards/cosine_scaled_reward/std": 0.2921934127807617, "rewards/repetition_penalty_reward/mean": -0.06730364263057709, "rewards/repetition_penalty_reward/std": 0.0357523076236248, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.94140625, "rewards/reward_reference/std": 0.23532284796237946, "step": 170 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4062.0, "completions/mean_length": 2397.1640625, "completions/mean_terminated_length": 2377.019775390625, "completions/min_length": 1352.0, "completions/min_terminated_length": 1352.0, "epoch": 0.03648, "frac_reward_zero_std": 0.0, "grad_norm": 0.07134996354579926, "learning_rate": 1e-06, "loss": -0.0163, "num_tokens": 104062430.0, "reward": 2.5790481567382812, "reward_std": 0.08886488527059555, "rewards/cosine_scaled_reward/mean": 0.7279187440872192, "rewards/cosine_scaled_reward/std": 0.30195677280426025, "rewards/repetition_penalty_reward/mean": -0.07152681052684784, "rewards/repetition_penalty_reward/std": 0.03701797500252724, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.92578125, "rewards/reward_reference/std": 0.2626400291919708, "step": 171 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4072.0, "completions/mean_length": 2537.82421875, "completions/mean_terminated_length": 2519.347900390625, "completions/min_length": 1523.0, "completions/min_terminated_length": 1523.0, "epoch": 0.036693333333333335, "frac_reward_zero_std": 0.0, "grad_norm": 0.09948837012052536, "learning_rate": 1e-06, "loss": -0.0109, "num_tokens": 104860645.0, "reward": 2.573436975479126, "reward_std": 0.12694615125656128, "rewards/cosine_scaled_reward/mean": 0.7350782155990601, "rewards/cosine_scaled_reward/std": 0.334741473197937, "rewards/repetition_penalty_reward/mean": -0.07648499310016632, "rewards/repetition_penalty_reward/std": 0.03855476155877113, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.91796875, "rewards/reward_reference/std": 0.2749498784542084, "step": 172 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3928.0, "completions/mean_length": 2518.53515625, "completions/mean_terminated_length": 2493.49609375, "completions/min_length": 1453.0, "completions/min_terminated_length": 1453.0, "epoch": 0.036906666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 0.11611484736204147, "learning_rate": 1e-06, "loss": -0.0052, "num_tokens": 105656278.0, "reward": 2.664511203765869, "reward_std": 0.1288546919822693, "rewards/cosine_scaled_reward/mean": 0.7776679992675781, "rewards/cosine_scaled_reward/std": 0.2540041506290436, "rewards/repetition_penalty_reward/mean": -0.07018814235925674, "rewards/repetition_penalty_reward/std": 0.03482862934470177, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.95703125, "rewards/reward_reference/std": 0.20318391919136047, "step": 173 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.8125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4036.0, "completions/mean_length": 2552.69921875, "completions/mean_terminated_length": 2515.66015625, "completions/min_length": 1630.0, "completions/min_terminated_length": 1630.0, "epoch": 0.03712, "frac_reward_zero_std": 0.0, "grad_norm": 0.07543495297431946, "learning_rate": 1e-06, "loss": -0.01, "num_tokens": 106455677.0, "reward": 2.617737293243408, "reward_std": 0.07202796638011932, "rewards/cosine_scaled_reward/mean": 0.7571834921836853, "rewards/cosine_scaled_reward/std": 0.30415284633636475, "rewards/repetition_penalty_reward/mean": -0.07382114231586456, "rewards/repetition_penalty_reward/std": 0.041181910783052444, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.9375, "rewards/reward_reference/std": 0.24253563582897186, "step": 174 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.78125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4072.0, "completions/mean_length": 2674.3125, "completions/mean_terminated_length": 2634.34521484375, "completions/min_length": 1519.0, "completions/min_terminated_length": 1519.0, "epoch": 0.037333333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 0.11848440021276474, "learning_rate": 1e-06, "loss": -0.02, "num_tokens": 107270749.0, "reward": 2.5728800296783447, "reward_std": 0.14646834135055542, "rewards/cosine_scaled_reward/mean": 0.7433780431747437, "rewards/cosine_scaled_reward/std": 0.3599579930305481, "rewards/repetition_penalty_reward/mean": -0.07518541812896729, "rewards/repetition_penalty_reward/std": 0.04847825691103935, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626226335763931, "rewards/reward_reference/mean": 0.9140625, "rewards/reward_reference/std": 0.28082075715065, "step": 175 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4042.0, "completions/mean_length": 2718.9375, "completions/mean_terminated_length": 2651.212890625, "completions/min_length": 1497.0, "completions/min_terminated_length": 1497.0, "epoch": 0.037546666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.1338227242231369, "learning_rate": 1e-06, "loss": -0.0053, "num_tokens": 108074361.0, "reward": 2.518850803375244, "reward_std": 0.18713831901550293, "rewards/cosine_scaled_reward/mean": 0.7246681451797485, "rewards/cosine_scaled_reward/std": 0.3936954736709595, "rewards/repetition_penalty_reward/mean": -0.08628620207309723, "rewards/repetition_penalty_reward/std": 0.05177297443151474, "rewards/reward_format/mean": 0.9781249761581421, "rewards/reward_format/std": 0.13072198629379272, "rewards/reward_reference/mean": 0.90234375, "rewards/reward_reference/std": 0.29743078351020813, "step": 176 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4059.0, "completions/mean_length": 2679.640625, "completions/mean_terminated_length": 2657.158935546875, "completions/min_length": 1473.0, "completions/min_terminated_length": 1473.0, "epoch": 0.03776, "frac_reward_zero_std": 0.0, "grad_norm": 0.12189571559429169, "learning_rate": 1e-06, "loss": -0.0022, "num_tokens": 108901109.0, "reward": 2.581157684326172, "reward_std": 0.14895252883434296, "rewards/cosine_scaled_reward/mean": 0.7483083009719849, "rewards/cosine_scaled_reward/std": 0.35640743374824524, "rewards/repetition_penalty_reward/mean": -0.07808814942836761, "rewards/repetition_penalty_reward/std": 0.033407896757125854, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.9140625, "rewards/reward_reference/std": 0.28082075715065, "step": 177 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.6875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4034.0, "completions/mean_length": 2704.85546875, "completions/mean_terminated_length": 2648.3046875, "completions/min_length": 1542.0, "completions/min_terminated_length": 1542.0, "epoch": 0.03797333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.12708529829978943, "learning_rate": 1e-06, "loss": -0.0187, "num_tokens": 109710816.0, "reward": 2.556445598602295, "reward_std": 0.15058737993240356, "rewards/cosine_scaled_reward/mean": 0.7380162477493286, "rewards/cosine_scaled_reward/std": 0.37427443265914917, "rewards/repetition_penalty_reward/mean": -0.0784459114074707, "rewards/repetition_penalty_reward/std": 0.039408691227436066, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626225590705872, "rewards/reward_reference/mean": 0.90625, "rewards/reward_reference/std": 0.2920515835285187, "step": 178 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3977.0, "completions/mean_length": 2772.37109375, "completions/mean_terminated_length": 2707.2744140625, "completions/min_length": 1277.0, "completions/min_terminated_length": 1277.0, "epoch": 0.03818666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.1372547298669815, "learning_rate": 1e-06, "loss": 0.0033, "num_tokens": 110533871.0, "reward": 2.4635605812072754, "reward_std": 0.17123311758041382, "rewards/cosine_scaled_reward/mean": 0.6944536566734314, "rewards/cosine_scaled_reward/std": 0.4396909177303314, "rewards/repetition_penalty_reward/mean": -0.08401811122894287, "rewards/repetition_penalty_reward/std": 0.051559146493673325, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718994140625, "rewards/reward_reference/mean": 0.859375, "rewards/reward_reference/std": 0.3483152687549591, "step": 179 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.71875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3997.0, "completions/mean_length": 2734.51171875, "completions/mean_terminated_length": 2684.90283203125, "completions/min_length": 1469.0, "completions/min_terminated_length": 1469.0, "epoch": 0.0384, "frac_reward_zero_std": 0.0, "grad_norm": 0.10627460479736328, "learning_rate": 1e-06, "loss": -0.0102, "num_tokens": 111356254.0, "reward": 2.5642635822296143, "reward_std": 0.08983881771564484, "rewards/cosine_scaled_reward/mean": 0.7460783123970032, "rewards/cosine_scaled_reward/std": 0.37204012274742126, "rewards/repetition_penalty_reward/mean": -0.07868963479995728, "rewards/repetition_penalty_reward/std": 0.041410669684410095, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626226335763931, "rewards/reward_reference/mean": 0.90625, "rewards/reward_reference/std": 0.2920515835285187, "step": 180 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4080.0, "completions/mean_length": 2769.875, "completions/mean_terminated_length": 2764.674560546875, "completions/min_length": 1579.0, "completions/min_terminated_length": 1579.0, "epoch": 0.03861333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.09451936185359955, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 112228258.0, "reward": 2.7477917671203613, "reward_std": 0.08577438443899155, "rewards/cosine_scaled_reward/mean": 0.843535304069519, "rewards/cosine_scaled_reward/std": 0.204621359705925, "rewards/repetition_penalty_reward/mean": -0.0723060667514801, "rewards/repetition_penalty_reward/std": 0.031312841922044754, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9765625, "rewards/reward_reference/std": 0.15158477425575256, "step": 181 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4006.0, "completions/mean_length": 2760.140625, "completions/mean_terminated_length": 2694.4423828125, "completions/min_length": 1352.0, "completions/min_terminated_length": 1352.0, "epoch": 0.03882666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.13490934669971466, "learning_rate": 1e-06, "loss": -0.0398, "num_tokens": 113043882.0, "reward": 2.5234622955322266, "reward_std": 0.19220773875713348, "rewards/cosine_scaled_reward/mean": 0.7239133715629578, "rewards/cosine_scaled_reward/std": 0.4048174321651459, "rewards/repetition_penalty_reward/mean": -0.08404479920864105, "rewards/repetition_penalty_reward/std": 0.046684637665748596, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.88671875, "rewards/reward_reference/std": 0.31755712628364563, "step": 182 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3860.0, "completions/mean_length": 2787.796875, "completions/mean_terminated_length": 2772.28466796875, "completions/min_length": 1674.0, "completions/min_terminated_length": 1674.0, "epoch": 0.03904, "frac_reward_zero_std": 0.0, "grad_norm": 0.14421382546424866, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 113910150.0, "reward": 2.5739893913269043, "reward_std": 0.16917826235294342, "rewards/cosine_scaled_reward/mean": 0.747693657875061, "rewards/cosine_scaled_reward/std": 0.3862946033477783, "rewards/repetition_penalty_reward/mean": -0.07995417714118958, "rewards/repetition_penalty_reward/std": 0.03675444424152374, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.90625, "rewards/reward_reference/std": 0.2920515835285187, "step": 183 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.8125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3901.0, "completions/mean_length": 2733.01171875, "completions/mean_terminated_length": 2700.300048828125, "completions/min_length": 1620.0, "completions/min_terminated_length": 1620.0, "epoch": 0.039253333333333335, "frac_reward_zero_std": 0.0, "grad_norm": 0.12685048580169678, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 114744729.0, "reward": 2.5625431537628174, "reward_std": 0.1313878297805786, "rewards/cosine_scaled_reward/mean": 0.7464696168899536, "rewards/cosine_scaled_reward/std": 0.3718789219856262, "rewards/repetition_penalty_reward/mean": -0.08080147951841354, "rewards/repetition_penalty_reward/std": 0.04628358036279678, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626225590705872, "rewards/reward_reference/mean": 0.90625, "rewards/reward_reference/std": 0.2920515835285187, "step": 184 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4039.0, "completions/mean_length": 2822.77734375, "completions/mean_terminated_length": 2797.414306640625, "completions/min_length": 1797.0, "completions/min_terminated_length": 1797.0, "epoch": 0.039466666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 0.14532560110092163, "learning_rate": 1e-06, "loss": -0.0105, "num_tokens": 115605096.0, "reward": 2.493752956390381, "reward_std": 0.18467921018600464, "rewards/cosine_scaled_reward/mean": 0.7073122262954712, "rewards/cosine_scaled_reward/std": 0.4398163855075836, "rewards/repetition_penalty_reward/mean": -0.08074688911437988, "rewards/repetition_penalty_reward/std": 0.038459036499261856, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.8671875, "rewards/reward_reference/std": 0.3400367796421051, "step": 185 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4007.0, "completions/mean_length": 2905.9609375, "completions/mean_terminated_length": 2887.071533203125, "completions/min_length": 1849.0, "completions/min_terminated_length": 1849.0, "epoch": 0.03968, "frac_reward_zero_std": 0.0, "grad_norm": 0.07806304842233658, "learning_rate": 1e-06, "loss": -0.008, "num_tokens": 116500474.0, "reward": 2.704318046569824, "reward_std": 0.08798962831497192, "rewards/cosine_scaled_reward/mean": 0.8352360725402832, "rewards/cosine_scaled_reward/std": 0.2752145230770111, "rewards/repetition_penalty_reward/mean": -0.08794920146465302, "rewards/repetition_penalty_reward/std": 0.032611243426799774, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.95703125, "rewards/reward_reference/std": 0.20318391919136047, "step": 186 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3986.0, "completions/mean_length": 2934.3984375, "completions/mean_terminated_length": 2915.96044921875, "completions/min_length": 2009.0, "completions/min_terminated_length": 2009.0, "epoch": 0.039893333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 0.12208955734968185, "learning_rate": 1e-06, "loss": -0.0099, "num_tokens": 117398232.0, "reward": 2.709160327911377, "reward_std": 0.12949120998382568, "rewards/cosine_scaled_reward/mean": 0.8370053172111511, "rewards/cosine_scaled_reward/std": 0.28149205446243286, "rewards/repetition_penalty_reward/mean": -0.08487646281719208, "rewards/repetition_penalty_reward/std": 0.0364808514714241, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.95703125, "rewards/reward_reference/std": 0.20318391919136047, "step": 187 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4064.0, "completions/mean_length": 2816.8046875, "completions/mean_terminated_length": 2801.636474609375, "completions/min_length": 1413.0, "completions/min_terminated_length": 1413.0, "epoch": 0.040106666666666665, "frac_reward_zero_std": 0.0, "grad_norm": 0.11472379416227341, "learning_rate": 1e-06, "loss": 0.0017, "num_tokens": 118263698.0, "reward": 2.5987226963043213, "reward_std": 0.1316905915737152, "rewards/cosine_scaled_reward/mean": 0.7706905603408813, "rewards/cosine_scaled_reward/std": 0.36017245054244995, "rewards/repetition_penalty_reward/mean": -0.08603024482727051, "rewards/repetition_penalty_reward/std": 0.030638542026281357, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9140625, "rewards/reward_reference/std": 0.28082075715065, "step": 188 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.65625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4018.0, "completions/mean_length": 3020.15234375, "completions/mean_terminated_length": 2971.848876953125, "completions/min_length": 1944.0, "completions/min_terminated_length": 1944.0, "epoch": 0.04032, "frac_reward_zero_std": 0.0, "grad_norm": 0.31249523162841797, "learning_rate": 1e-06, "loss": -0.0538, "num_tokens": 119145317.0, "reward": 2.6029105186462402, "reward_std": 0.25811532139778137, "rewards/cosine_scaled_reward/mean": 0.7884730100631714, "rewards/cosine_scaled_reward/std": 0.3834865689277649, "rewards/repetition_penalty_reward/mean": -0.08946871757507324, "rewards/repetition_penalty_reward/std": 0.042005784809589386, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.91015625, "rewards/reward_reference/std": 0.2865179479122162, "step": 189 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.59375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4083.0, "completions/mean_length": 3139.890625, "completions/mean_terminated_length": 3088.74072265625, "completions/min_length": 1696.0, "completions/min_terminated_length": 1696.0, "epoch": 0.04053333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.05340828001499176, "learning_rate": 1e-06, "loss": -0.0061, "num_tokens": 120058193.0, "reward": 2.6640408039093018, "reward_std": 0.06567069888114929, "rewards/cosine_scaled_reward/mean": 0.8290449380874634, "rewards/cosine_scaled_reward/std": 0.34697195887565613, "rewards/repetition_penalty_reward/mean": -0.09156674891710281, "rewards/repetition_penalty_reward/std": 0.03702438622713089, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.9296875, "rewards/reward_reference/std": 0.2561737895011902, "step": 190 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.53125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 3050.08984375, "completions/mean_terminated_length": 2984.991943359375, "completions/min_length": 1597.0, "completions/min_terminated_length": 1597.0, "epoch": 0.04074666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.1327456533908844, "learning_rate": 1e-06, "loss": -0.0159, "num_tokens": 120942404.0, "reward": 2.5852432250976562, "reward_std": 0.21544049680233002, "rewards/cosine_scaled_reward/mean": 0.789260983467102, "rewards/cosine_scaled_reward/std": 0.3850567936897278, "rewards/repetition_penalty_reward/mean": -0.09620514512062073, "rewards/repetition_penalty_reward/std": 0.04582031071186066, "rewards/reward_format/mean": 0.9781249761581421, "rewards/reward_format/std": 0.13072198629379272, "rewards/reward_reference/mean": 0.9140625, "rewards/reward_reference/std": 0.28082075715065, "step": 191 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4077.0, "completions/mean_length": 3017.37890625, "completions/mean_terminated_length": 2964.331787109375, "completions/min_length": 1971.0, "completions/min_terminated_length": 1971.0, "epoch": 0.04096, "frac_reward_zero_std": 0.0, "grad_norm": 0.06460432708263397, "learning_rate": 1e-06, "loss": -0.0154, "num_tokens": 121827473.0, "reward": 2.5872888565063477, "reward_std": 0.09368535131216049, "rewards/cosine_scaled_reward/mean": 0.7789064645767212, "rewards/cosine_scaled_reward/std": 0.3939122259616852, "rewards/repetition_penalty_reward/mean": -0.0978674441576004, "rewards/repetition_penalty_reward/std": 0.040292881429195404, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.90625, "rewards/reward_reference/std": 0.2920515835285187, "step": 192 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4090.0, "completions/mean_length": 2939.30859375, "completions/mean_terminated_length": 2882.421875, "completions/min_length": 1532.0, "completions/min_terminated_length": 1532.0, "epoch": 0.04117333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.1415102332830429, "learning_rate": 1e-06, "loss": -0.022, "num_tokens": 122691892.0, "reward": 2.592499017715454, "reward_std": 0.16990575194358826, "rewards/cosine_scaled_reward/mean": 0.7752484083175659, "rewards/cosine_scaled_reward/std": 0.3793916404247284, "rewards/repetition_penalty_reward/mean": -0.09681184589862823, "rewards/repetition_penalty_reward/std": 0.03823775798082352, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9140625, "rewards/reward_reference/std": 0.28082075715065, "step": 193 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.71875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4054.0, "completions/mean_length": 2930.1796875, "completions/mean_terminated_length": 2887.700439453125, "completions/min_length": 1627.0, "completions/min_terminated_length": 1627.0, "epoch": 0.04138666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.14961861073970795, "learning_rate": 1e-06, "loss": -0.0091, "num_tokens": 123563606.0, "reward": 2.638584613800049, "reward_std": 0.20664985477924347, "rewards/cosine_scaled_reward/mean": 0.8019155263900757, "rewards/cosine_scaled_reward/std": 0.33979693055152893, "rewards/repetition_penalty_reward/mean": -0.09379956871271133, "rewards/repetition_penalty_reward/std": 0.03525862842798233, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 194 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.71875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 2985.79296875, "completions/mean_terminated_length": 2945.340087890625, "completions/min_length": 1421.0, "completions/min_terminated_length": 1421.0, "epoch": 0.0416, "frac_reward_zero_std": 0.0, "grad_norm": 0.1307501643896103, "learning_rate": 1e-06, "loss": -0.019, "num_tokens": 124465661.0, "reward": 2.659268856048584, "reward_std": 0.16081370413303375, "rewards/cosine_scaled_reward/mean": 0.8172322511672974, "rewards/cosine_scaled_reward/std": 0.32748943567276, "rewards/repetition_penalty_reward/mean": -0.09546343237161636, "rewards/repetition_penalty_reward/std": 0.03714780509471893, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9375, "rewards/reward_reference/std": 0.24253563582897186, "step": 195 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4074.0, "completions/mean_length": 2998.1875, "completions/mean_terminated_length": 2944.196533203125, "completions/min_length": 1623.0, "completions/min_terminated_length": 1623.0, "epoch": 0.041813333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.12217960506677628, "learning_rate": 1e-06, "loss": -0.0028, "num_tokens": 125349101.0, "reward": 2.6592578887939453, "reward_std": 0.13490937650203705, "rewards/cosine_scaled_reward/mean": 0.8189342021942139, "rewards/cosine_scaled_reward/std": 0.32874596118927, "rewards/repetition_penalty_reward/mean": -0.09327013045549393, "rewards/repetition_penalty_reward/std": 0.033582936972379684, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 196 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.75, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 2956.6015625, "completions/mean_terminated_length": 2919.8466796875, "completions/min_length": 1686.0, "completions/min_terminated_length": 1686.0, "epoch": 0.042026666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 0.06713753193616867, "learning_rate": 1e-06, "loss": -0.0022, "num_tokens": 126240259.0, "reward": 2.714076042175293, "reward_std": 0.09007962793111801, "rewards/cosine_scaled_reward/mean": 0.8496314287185669, "rewards/cosine_scaled_reward/std": 0.2547302544116974, "rewards/repetition_penalty_reward/mean": -0.09649277478456497, "rewards/repetition_penalty_reward/std": 0.04336199536919594, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9609375, "rewards/reward_reference/std": 0.19412322342395782, "step": 197 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.4375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 3068.90625, "completions/mean_terminated_length": 2991.22705078125, "completions/min_length": 1184.0, "completions/min_terminated_length": 1184.0, "epoch": 0.04224, "frac_reward_zero_std": 0.0, "grad_norm": 0.09822492301464081, "learning_rate": 1e-06, "loss": 0.0095, "num_tokens": 127112907.0, "reward": 2.6488893032073975, "reward_std": 0.14325517416000366, "rewards/cosine_scaled_reward/mean": 0.8167030215263367, "rewards/cosine_scaled_reward/std": 0.3480437099933624, "rewards/repetition_penalty_reward/mean": -0.0975012555718422, "rewards/repetition_penalty_reward/std": 0.03966366872191429, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9296875, "rewards/reward_reference/std": 0.2561737895011902, "step": 198 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.53125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4046.0, "completions/mean_length": 3024.32421875, "completions/mean_terminated_length": 2957.62255859375, "completions/min_length": 1712.0, "completions/min_terminated_length": 1712.0, "epoch": 0.042453333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 0.09099367260932922, "learning_rate": 1e-06, "loss": -0.0028, "num_tokens": 127996106.0, "reward": 2.632681369781494, "reward_std": 0.1334916204214096, "rewards/cosine_scaled_reward/mean": 0.7996670603752136, "rewards/cosine_scaled_reward/std": 0.3644350469112396, "rewards/repetition_penalty_reward/mean": -0.08886080980300903, "rewards/repetition_penalty_reward/std": 0.03873216733336449, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.921875, "rewards/reward_reference/std": 0.26889389753341675, "step": 199 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4067.0, "completions/mean_length": 3094.09375, "completions/mean_terminated_length": 3044.819580078125, "completions/min_length": 1948.0, "completions/min_terminated_length": 1948.0, "epoch": 0.042666666666666665, "frac_reward_zero_std": 0.0, "grad_norm": 0.14509928226470947, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 128896898.0, "reward": 2.5783491134643555, "reward_std": 0.20983529090881348, "rewards/cosine_scaled_reward/mean": 0.7796949148178101, "rewards/cosine_scaled_reward/std": 0.4083874821662903, "rewards/repetition_penalty_reward/mean": -0.10056446492671967, "rewards/repetition_penalty_reward/std": 0.04261765629053116, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.90234375, "rewards/reward_reference/std": 0.29743078351020813, "step": 200 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.71875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4046.0, "completions/mean_length": 3027.64453125, "completions/mean_terminated_length": 2988.716552734375, "completions/min_length": 1566.0, "completions/min_terminated_length": 1566.0, "epoch": 0.04288, "frac_reward_zero_std": 0.0, "grad_norm": 0.1521781086921692, "learning_rate": 1e-06, "loss": -0.0159, "num_tokens": 129796651.0, "reward": 2.5637636184692383, "reward_std": 0.2006940245628357, "rewards/cosine_scaled_reward/mean": 0.7643671035766602, "rewards/cosine_scaled_reward/std": 0.41633033752441406, "rewards/repetition_penalty_reward/mean": -0.09122838079929352, "rewards/repetition_penalty_reward/std": 0.03793216124176979, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.890625, "rewards/reward_reference/std": 0.31272050738334656, "step": 201 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.59375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4049.0, "completions/mean_length": 3094.3671875, "completions/mean_terminated_length": 3040.78173828125, "completions/min_length": 1759.0, "completions/min_terminated_length": 1759.0, "epoch": 0.04309333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.039731431752443314, "learning_rate": 1e-06, "loss": -0.0152, "num_tokens": 130699769.0, "reward": 2.798168182373047, "reward_std": 0.07617372274398804, "rewards/cosine_scaled_reward/mean": 0.9041758179664612, "rewards/cosine_scaled_reward/std": 0.14127732813358307, "rewards/repetition_penalty_reward/mean": -0.0981951504945755, "rewards/repetition_penalty_reward/std": 0.03816307336091995, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9921875, "rewards/reward_reference/std": 0.08821486681699753, "step": 202 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4048.0, "completions/mean_length": 2993.23046875, "completions/mean_terminated_length": 2938.995849609375, "completions/min_length": 1732.0, "completions/min_terminated_length": 1732.0, "epoch": 0.04330666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.0980035662651062, "learning_rate": 1e-06, "loss": -0.0076, "num_tokens": 131575184.0, "reward": 2.4957215785980225, "reward_std": 0.16899895668029785, "rewards/cosine_scaled_reward/mean": 0.7244171500205994, "rewards/cosine_scaled_reward/std": 0.45311957597732544, "rewards/repetition_penalty_reward/mean": -0.09275795519351959, "rewards/repetition_penalty_reward/std": 0.048649415373802185, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.8671875, "rewards/reward_reference/std": 0.3400367796421051, "step": 203 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.65625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4071.0, "completions/mean_length": 3021.48828125, "completions/mean_terminated_length": 2973.244873046875, "completions/min_length": 1873.0, "completions/min_terminated_length": 1873.0, "epoch": 0.04352, "frac_reward_zero_std": 0.0, "grad_norm": 0.0876871794462204, "learning_rate": 1e-06, "loss": -0.016, "num_tokens": 132464137.0, "reward": 2.6541526317596436, "reward_std": 0.10013444721698761, "rewards/cosine_scaled_reward/mean": 0.8130102157592773, "rewards/cosine_scaled_reward/std": 0.3439813554286957, "rewards/repetition_penalty_reward/mean": -0.08854503184556961, "rewards/repetition_penalty_reward/std": 0.04010245203971863, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9296875, "rewards/reward_reference/std": 0.2561737895011902, "step": 204 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 2941.59375, "completions/mean_terminated_length": 2932.50390625, "completions/min_length": 1694.0, "completions/min_terminated_length": 1694.0, "epoch": 0.04373333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.08829687535762787, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 133372257.0, "reward": 2.6222023963928223, "reward_std": 0.09712390601634979, "rewards/cosine_scaled_reward/mean": 0.7839394211769104, "rewards/cosine_scaled_reward/std": 0.37110278010368347, "rewards/repetition_penalty_reward/mean": -0.08361180871725082, "rewards/repetition_penalty_reward/std": 0.036519844084978104, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.921875, "rewards/reward_reference/std": 0.26889389753341675, "step": 205 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.8125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4052.0, "completions/mean_length": 3063.41015625, "completions/mean_terminated_length": 3038.628173828125, "completions/min_length": 1935.0, "completions/min_terminated_length": 1935.0, "epoch": 0.04394666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.1125572919845581, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 134293802.0, "reward": 2.643646478652954, "reward_std": 0.15754051506519318, "rewards/cosine_scaled_reward/mean": 0.8129024505615234, "rewards/cosine_scaled_reward/std": 0.35667291283607483, "rewards/repetition_penalty_reward/mean": -0.09113100171089172, "rewards/repetition_penalty_reward/std": 0.04145337641239166, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.921875, "rewards/reward_reference/std": 0.26889389753341675, "step": 206 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4067.0, "completions/mean_length": 3047.96875, "completions/mean_terminated_length": 2996.426025390625, "completions/min_length": 1952.0, "completions/min_terminated_length": 1952.0, "epoch": 0.04416, "frac_reward_zero_std": 0.0, "grad_norm": 0.12327634543180466, "learning_rate": 1e-06, "loss": -0.0238, "num_tokens": 135185766.0, "reward": 2.4939651489257812, "reward_std": 0.18519936501979828, "rewards/cosine_scaled_reward/mean": 0.7189508080482483, "rewards/cosine_scaled_reward/std": 0.4683718979358673, "rewards/repetition_penalty_reward/mean": -0.08826674520969391, "rewards/repetition_penalty_reward/std": 0.0476943776011467, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.86328125, "rewards/reward_reference/std": 0.34422317147254944, "step": 207 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3996.0, "completions/mean_length": 2941.19140625, "completions/mean_terminated_length": 2932.098388671875, "completions/min_length": 1626.0, "completions/min_terminated_length": 1626.0, "epoch": 0.044373333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.06048279255628586, "learning_rate": 1e-06, "loss": -0.003, "num_tokens": 136098811.0, "reward": 2.684300422668457, "reward_std": 0.07446961104869843, "rewards/cosine_scaled_reward/mean": 0.8218631744384766, "rewards/cosine_scaled_reward/std": 0.3134990632534027, "rewards/repetition_penalty_reward/mean": -0.07896900177001953, "rewards/repetition_penalty_reward/std": 0.0332108810544014, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.94140625, "rewards/reward_reference/std": 0.23532284796237946, "step": 208 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4071.0, "completions/mean_length": 3074.71484375, "completions/mean_terminated_length": 2988.165283203125, "completions/min_length": 1816.0, "completions/min_terminated_length": 1816.0, "epoch": 0.04458666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.08215809613466263, "learning_rate": 1e-06, "loss": -0.0132, "num_tokens": 136966294.0, "reward": 2.624622344970703, "reward_std": 0.1569027155637741, "rewards/cosine_scaled_reward/mean": 0.8008697628974915, "rewards/cosine_scaled_reward/std": 0.3725561201572418, "rewards/repetition_penalty_reward/mean": -0.08796609938144684, "rewards/repetition_penalty_reward/std": 0.05247509479522705, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.91796875, "rewards/reward_reference/std": 0.2749498784542084, "step": 209 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.75, "completions/max_length": 4096.0, "completions/max_terminated_length": 4015.0, "completions/mean_length": 2938.37109375, "completions/mean_terminated_length": 2901.028076171875, "completions/min_length": 1573.0, "completions/min_terminated_length": 1573.0, "epoch": 0.0448, "frac_reward_zero_std": 0.0, "grad_norm": 0.14607393741607666, "learning_rate": 1e-06, "loss": -0.0152, "num_tokens": 137849509.0, "reward": 2.6370255947113037, "reward_std": 0.22698213160037994, "rewards/cosine_scaled_reward/mean": 0.7916585803031921, "rewards/cosine_scaled_reward/std": 0.3612592816352844, "rewards/repetition_penalty_reward/mean": -0.07650791108608246, "rewards/repetition_penalty_reward/std": 0.042758870869874954, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.921875, "rewards/reward_reference/std": 0.26889389753341675, "step": 210 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3943.0, "completions/mean_length": 2850.15234375, "completions/mean_terminated_length": 2830.377197265625, "completions/min_length": 1788.0, "completions/min_terminated_length": 1788.0, "epoch": 0.045013333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 0.12609753012657166, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 138723052.0, "reward": 2.524672031402588, "reward_std": 0.1426827311515808, "rewards/cosine_scaled_reward/mean": 0.7225947380065918, "rewards/cosine_scaled_reward/std": 0.4295172691345215, "rewards/repetition_penalty_reward/mean": -0.07292264699935913, "rewards/repetition_penalty_reward/std": 0.03526431694626808, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.875, "rewards/reward_reference/std": 0.33136674761772156, "step": 211 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.8125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 2919.5390625, "completions/mean_terminated_length": 2891.30419921875, "completions/min_length": 1815.0, "completions/min_terminated_length": 1815.0, "epoch": 0.045226666666666665, "frac_reward_zero_std": 0.0, "grad_norm": 0.1029074564576149, "learning_rate": 1e-06, "loss": -0.0146, "num_tokens": 139603246.0, "reward": 2.6630499362945557, "reward_std": 0.13122224807739258, "rewards/cosine_scaled_reward/mean": 0.8020139932632446, "rewards/cosine_scaled_reward/std": 0.33735036849975586, "rewards/repetition_penalty_reward/mean": -0.07255782186985016, "rewards/repetition_penalty_reward/std": 0.031210284680128098, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 212 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.65625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3999.0, "completions/mean_length": 2900.22265625, "completions/mean_terminated_length": 2846.53466796875, "completions/min_length": 1855.0, "completions/min_terminated_length": 1855.0, "epoch": 0.04544, "frac_reward_zero_std": 0.0, "grad_norm": 0.11880956590175629, "learning_rate": 1e-06, "loss": -0.0113, "num_tokens": 140461603.0, "reward": 2.6664743423461914, "reward_std": 0.14175042510032654, "rewards/cosine_scaled_reward/mean": 0.8035378456115723, "rewards/cosine_scaled_reward/std": 0.32895320653915405, "rewards/repetition_penalty_reward/mean": -0.0745634138584137, "rewards/repetition_penalty_reward/std": 0.035252682864665985, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9375, "rewards/reward_reference/std": 0.24253563582897186, "step": 213 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.8125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4029.0, "completions/mean_length": 2865.43359375, "completions/mean_terminated_length": 2835.900146484375, "completions/min_length": 1427.0, "completions/min_terminated_length": 1427.0, "epoch": 0.04565333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.08788994699716568, "learning_rate": 1e-06, "loss": -0.0391, "num_tokens": 141330550.0, "reward": 2.6503522396087646, "reward_std": 0.13490185141563416, "rewards/cosine_scaled_reward/mean": 0.7931498289108276, "rewards/cosine_scaled_reward/std": 0.33805233240127563, "rewards/repetition_penalty_reward/mean": -0.0724850744009018, "rewards/repetition_penalty_reward/std": 0.02933771163225174, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9296875, "rewards/reward_reference/std": 0.2561737895011902, "step": 214 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.8125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4060.0, "completions/mean_length": 2900.1171875, "completions/mean_terminated_length": 2871.416015625, "completions/min_length": 547.0, "completions/min_terminated_length": 547.0, "epoch": 0.04586666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.12615323066711426, "learning_rate": 1e-06, "loss": -0.0042, "num_tokens": 142217136.0, "reward": 2.5718753337860107, "reward_std": 0.15692222118377686, "rewards/cosine_scaled_reward/mean": 0.7568657398223877, "rewards/cosine_scaled_reward/std": 0.39952579140663147, "rewards/repetition_penalty_reward/mean": -0.07483415305614471, "rewards/repetition_penalty_reward/std": 0.03881849721074104, "rewards/reward_format/mean": 0.995312511920929, "rewards/reward_format/std": 0.05581394582986832, "rewards/reward_reference/mean": 0.89453125, "rewards/reward_reference/std": 0.3077581524848938, "step": 215 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.78125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 3043.5234375, "completions/mean_terminated_length": 3013.935546875, "completions/min_length": 1927.0, "completions/min_terminated_length": 1927.0, "epoch": 0.04608, "frac_reward_zero_std": 0.0, "grad_norm": 0.12513042986392975, "learning_rate": 1e-06, "loss": -0.0171, "num_tokens": 143130158.0, "reward": 2.602346658706665, "reward_std": 0.18451841175556183, "rewards/cosine_scaled_reward/mean": 0.7779296636581421, "rewards/cosine_scaled_reward/std": 0.40503278374671936, "rewards/repetition_penalty_reward/mean": -0.07402050495147705, "rewards/repetition_penalty_reward/std": 0.030004704371094704, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.8984375, "rewards/reward_reference/std": 0.3026638329029083, "step": 216 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.78125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 2995.80078125, "completions/mean_terminated_length": 2964.871337890625, "completions/min_length": 1747.0, "completions/min_terminated_length": 1747.0, "epoch": 0.04629333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.12058738619089127, "learning_rate": 1e-06, "loss": -0.0041, "num_tokens": 144025731.0, "reward": 2.4607934951782227, "reward_std": 0.14996573328971863, "rewards/cosine_scaled_reward/mean": 0.6966145038604736, "rewards/cosine_scaled_reward/std": 0.48234885931015015, "rewards/repetition_penalty_reward/mean": -0.08738350868225098, "rewards/repetition_penalty_reward/std": 0.043269261717796326, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.8515625, "rewards/reward_reference/std": 0.3562295734882355, "step": 217 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 3143.25, "completions/mean_terminated_length": 3062.508544921875, "completions/min_length": 1899.0, "completions/min_terminated_length": 1899.0, "epoch": 0.04650666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.06288176029920578, "learning_rate": 1e-06, "loss": -0.0037, "num_tokens": 144920803.0, "reward": 2.7394394874572754, "reward_std": 0.09670986980199814, "rewards/cosine_scaled_reward/mean": 0.8714437484741211, "rewards/cosine_scaled_reward/std": 0.26286423206329346, "rewards/repetition_penalty_reward/mean": -0.09059803187847137, "rewards/repetition_penalty_reward/std": 0.04828861728310585, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.96484375, "rewards/reward_reference/std": 0.18453538417816162, "step": 218 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.21875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4054.0, "completions/mean_length": 3217.70703125, "completions/mean_terminated_length": 3122.653564453125, "completions/min_length": 2044.0, "completions/min_terminated_length": 2044.0, "epoch": 0.04672, "frac_reward_zero_std": 0.0, "grad_norm": 0.12472854554653168, "learning_rate": 1e-06, "loss": 0.0065, "num_tokens": 145811928.0, "reward": 2.718686103820801, "reward_std": 0.12714329361915588, "rewards/cosine_scaled_reward/mean": 0.8730841279029846, "rewards/cosine_scaled_reward/std": 0.28264540433883667, "rewards/repetition_penalty_reward/mean": -0.1114293783903122, "rewards/repetition_penalty_reward/std": 0.05651494488120079, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.95703125, "rewards/reward_reference/std": 0.20318391919136047, "step": 219 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 3317.3203125, "completions/mean_terminated_length": 3251.33056640625, "completions/min_length": 1816.0, "completions/min_terminated_length": 1816.0, "epoch": 0.046933333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.11304979771375656, "learning_rate": 1e-06, "loss": -0.0071, "num_tokens": 146742942.0, "reward": 2.702650547027588, "reward_std": 0.14594215154647827, "rewards/cosine_scaled_reward/mean": 0.8594841957092285, "rewards/cosine_scaled_reward/std": 0.33492517471313477, "rewards/repetition_penalty_reward/mean": -0.09824004769325256, "rewards/repetition_penalty_reward/std": 0.05412546917796135, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.94140625, "rewards/reward_reference/std": 0.23532284796237946, "step": 220 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.40625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 3472.67578125, "completions/mean_terminated_length": 3317.60498046875, "completions/min_length": 1775.0, "completions/min_terminated_length": 1775.0, "epoch": 0.04714666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.13369408249855042, "learning_rate": 1e-06, "loss": -0.0467, "num_tokens": 147579567.0, "reward": 2.5761454105377197, "reward_std": 0.23749884963035583, "rewards/cosine_scaled_reward/mean": 0.798820972442627, "rewards/cosine_scaled_reward/std": 0.4479582905769348, "rewards/repetition_penalty_reward/mean": -0.1133006364107132, "rewards/repetition_penalty_reward/std": 0.05030689388513565, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.890625, "rewards/reward_reference/std": 0.31272050738334656, "step": 221 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.53125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 3550.10546875, "completions/mean_terminated_length": 3427.344482421875, "completions/min_length": 1707.0, "completions/min_terminated_length": 1707.0, "epoch": 0.04736, "frac_reward_zero_std": 0.0, "grad_norm": 0.22356118261814117, "learning_rate": 1e-06, "loss": -0.0519, "num_tokens": 148456190.0, "reward": 2.5239830017089844, "reward_std": 0.34182125329971313, "rewards/cosine_scaled_reward/mean": 0.7705343961715698, "rewards/cosine_scaled_reward/std": 0.49305281043052673, "rewards/repetition_penalty_reward/mean": -0.11373880505561829, "rewards/repetition_penalty_reward/std": 0.04404772073030472, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.8671875, "rewards/reward_reference/std": 0.3400367796421051, "step": 222 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.25, "completions/max_length": 4096.0, "completions/max_terminated_length": 4077.0, "completions/mean_length": 3501.703125, "completions/mean_terminated_length": 3335.2998046875, "completions/min_length": 2298.0, "completions/min_terminated_length": 2298.0, "epoch": 0.047573333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 0.12591539323329926, "learning_rate": 1e-06, "loss": -0.0701, "num_tokens": 149286418.0, "reward": 2.5235965251922607, "reward_std": 0.3133578598499298, "rewards/cosine_scaled_reward/mean": 0.7684545516967773, "rewards/cosine_scaled_reward/std": 0.4874875545501709, "rewards/repetition_penalty_reward/mean": -0.10970175266265869, "rewards/repetition_penalty_reward/std": 0.05052530765533447, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.87109375, "rewards/reward_reference/std": 0.33575257658958435, "step": 223 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.65625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4078.0, "completions/mean_length": 3428.93359375, "completions/mean_terminated_length": 3294.267578125, "completions/min_length": 2225.0, "completions/min_terminated_length": 2225.0, "epoch": 0.047786666666666665, "frac_reward_zero_std": 0.0, "grad_norm": 0.20103350281715393, "learning_rate": 1e-06, "loss": -0.0459, "num_tokens": 150155365.0, "reward": 2.6452536582946777, "reward_std": 0.23817402124404907, "rewards/cosine_scaled_reward/mean": 0.8362963199615479, "rewards/cosine_scaled_reward/std": 0.39203858375549316, "rewards/repetition_penalty_reward/mean": -0.10666733235120773, "rewards/repetition_penalty_reward/std": 0.04530767351388931, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.921875, "rewards/reward_reference/std": 0.26889389753341675, "step": 224 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4067.0, "completions/mean_length": 3358.8359375, "completions/mean_terminated_length": 3257.271240234375, "completions/min_length": 2110.0, "completions/min_terminated_length": 2110.0, "epoch": 0.048, "frac_reward_zero_std": 0.0, "grad_norm": 0.09694981575012207, "learning_rate": 1e-06, "loss": 0.0054, "num_tokens": 151056083.0, "reward": 2.6646728515625, "reward_std": 0.1226922944188118, "rewards/cosine_scaled_reward/mean": 0.8418905735015869, "rewards/cosine_scaled_reward/std": 0.37268179655075073, "rewards/repetition_penalty_reward/mean": -0.10299905389547348, "rewards/repetition_penalty_reward/std": 0.038974642753601074, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.92578125, "rewards/reward_reference/std": 0.2626400291919708, "step": 225 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 4096.0, "completions/max_terminated_length": 4070.0, "completions/mean_length": 3305.85546875, "completions/mean_terminated_length": 3192.977783203125, "completions/min_length": 2018.0, "completions/min_terminated_length": 2018.0, "epoch": 0.04821333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.10110696405172348, "learning_rate": 1e-06, "loss": -0.0127, "num_tokens": 151933410.0, "reward": 2.6624765396118164, "reward_std": 0.13132601976394653, "rewards/cosine_scaled_reward/mean": 0.8468172550201416, "rewards/cosine_scaled_reward/std": 0.35259246826171875, "rewards/repetition_penalty_reward/mean": -0.11480934172868729, "rewards/repetition_penalty_reward/std": 0.0483165867626667, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 226 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.34375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 3393.78125, "completions/mean_terminated_length": 3210.443359375, "completions/min_length": 1401.0, "completions/min_terminated_length": 1401.0, "epoch": 0.048426666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.10942267626523972, "learning_rate": 1e-06, "loss": -0.0237, "num_tokens": 152751126.0, "reward": 2.390054225921631, "reward_std": 0.24889340996742249, "rewards/cosine_scaled_reward/mean": 0.6856964826583862, "rewards/cosine_scaled_reward/std": 0.5464708209037781, "rewards/repetition_penalty_reward/mean": -0.11595490574836731, "rewards/repetition_penalty_reward/std": 0.0469362810254097, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.8203125, "rewards/reward_reference/std": 0.38467901945114136, "step": 227 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.40625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4083.0, "completions/mean_length": 3496.51171875, "completions/mean_terminated_length": 3347.370849609375, "completions/min_length": 2328.0, "completions/min_terminated_length": 2328.0, "epoch": 0.04864, "frac_reward_zero_std": 0.0, "grad_norm": 0.1789751499891281, "learning_rate": 1e-06, "loss": -0.0884, "num_tokens": 153594401.0, "reward": 2.5644519329071045, "reward_std": 0.27266085147857666, "rewards/cosine_scaled_reward/mean": 0.793789267539978, "rewards/cosine_scaled_reward/std": 0.4596695303916931, "rewards/repetition_penalty_reward/mean": -0.11605618894100189, "rewards/repetition_penalty_reward/std": 0.051774267107248306, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.88671875, "rewards/reward_reference/std": 0.31755712628364563, "step": 228 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.84375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 3340.65234375, "completions/mean_terminated_length": 3213.036376953125, "completions/min_length": 1993.0, "completions/min_terminated_length": 1993.0, "epoch": 0.04885333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.11022138595581055, "learning_rate": 1e-06, "loss": -0.0348, "num_tokens": 154459104.0, "reward": 2.6423568725585938, "reward_std": 0.20323346555233002, "rewards/cosine_scaled_reward/mean": 0.8313206434249878, "rewards/cosine_scaled_reward/std": 0.38284415006637573, "rewards/repetition_penalty_reward/mean": -0.1108388751745224, "rewards/repetition_penalty_reward/std": 0.05063518136739731, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.921875, "rewards/reward_reference/std": 0.26889389753341675, "step": 229 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 4096.0, "completions/max_terminated_length": 4063.0, "completions/mean_length": 3300.359375, "completions/mean_terminated_length": 3186.696533203125, "completions/min_length": 1988.0, "completions/min_terminated_length": 1988.0, "epoch": 0.04906666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.1534515768289566, "learning_rate": 1e-06, "loss": -0.0095, "num_tokens": 155334276.0, "reward": 2.536712646484375, "reward_std": 0.24921995401382446, "rewards/cosine_scaled_reward/mean": 0.770061194896698, "rewards/cosine_scaled_reward/std": 0.4599458873271942, "rewards/repetition_penalty_reward/mean": -0.11225477606058121, "rewards/repetition_penalty_reward/std": 0.04974426329135895, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.87890625, "rewards/reward_reference/std": 0.3268752694129944, "step": 230 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.78125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4070.0, "completions/mean_length": 3381.6640625, "completions/mean_terminated_length": 3253.281005859375, "completions/min_length": 1435.0, "completions/min_terminated_length": 1435.0, "epoch": 0.04928, "frac_reward_zero_std": 0.0, "grad_norm": 0.15593107044696808, "learning_rate": 1e-06, "loss": -0.0331, "num_tokens": 156200458.0, "reward": 2.547464370727539, "reward_std": 0.21806156635284424, "rewards/cosine_scaled_reward/mean": 0.7779920101165771, "rewards/cosine_scaled_reward/std": 0.46156105399131775, "rewards/repetition_penalty_reward/mean": -0.11334001272916794, "rewards/repetition_penalty_reward/std": 0.05135047063231468, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.8828125, "rewards/reward_reference/std": 0.3222736418247223, "step": 231 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 3302.9296875, "completions/mean_terminated_length": 3189.634033203125, "completions/min_length": 2091.0, "completions/min_terminated_length": 2091.0, "epoch": 0.049493333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.1349734663963318, "learning_rate": 1e-06, "loss": -0.0397, "num_tokens": 157074428.0, "reward": 2.4278252124786377, "reward_std": 0.1906101256608963, "rewards/cosine_scaled_reward/mean": 0.7002975344657898, "rewards/cosine_scaled_reward/std": 0.5271919369697571, "rewards/repetition_penalty_reward/mean": -0.10137848556041718, "rewards/repetition_penalty_reward/std": 0.05053536593914032, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.83203125, "rewards/reward_reference/std": 0.3745708465576172, "step": 232 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.53125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 3317.5859375, "completions/mean_terminated_length": 3269.13720703125, "completions/min_length": 1730.0, "completions/min_terminated_length": 1730.0, "epoch": 0.04970666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.12044618278741837, "learning_rate": 1e-06, "loss": -0.019, "num_tokens": 158039622.0, "reward": 2.680562734603882, "reward_std": 0.18000000715255737, "rewards/cosine_scaled_reward/mean": 0.8430810570716858, "rewards/cosine_scaled_reward/std": 0.36536091566085815, "rewards/repetition_penalty_reward/mean": -0.09220585227012634, "rewards/repetition_penalty_reward/std": 0.03817109391093254, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9296875, "rewards/reward_reference/std": 0.2561737895011902, "step": 233 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.21875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4076.0, "completions/mean_length": 3241.15625, "completions/mean_terminated_length": 3148.640625, "completions/min_length": 2108.0, "completions/min_terminated_length": 2108.0, "epoch": 0.04992, "frac_reward_zero_std": 0.0, "grad_norm": 0.10091330856084824, "learning_rate": 1e-06, "loss": -0.0177, "num_tokens": 158919982.0, "reward": 2.644728899002075, "reward_std": 0.20279303193092346, "rewards/cosine_scaled_reward/mean": 0.822848916053772, "rewards/cosine_scaled_reward/std": 0.3769766688346863, "rewards/repetition_penalty_reward/mean": -0.09999504685401917, "rewards/repetition_penalty_reward/std": 0.03829427435994148, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.921875, "rewards/reward_reference/std": 0.26889389753341675, "step": 234 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.1875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 3326.4375, "completions/mean_terminated_length": 3240.195556640625, "completions/min_length": 2029.0, "completions/min_terminated_length": 2029.0, "epoch": 0.050133333333333335, "frac_reward_zero_std": 0.0, "grad_norm": 0.13295038044452667, "learning_rate": 1e-06, "loss": -0.0124, "num_tokens": 159822851.0, "reward": 2.5747616291046143, "reward_std": 0.16040503978729248, "rewards/cosine_scaled_reward/mean": 0.7831013202667236, "rewards/cosine_scaled_reward/std": 0.449871689081192, "rewards/repetition_penalty_reward/mean": -0.09505827724933624, "rewards/repetition_penalty_reward/std": 0.03566938266158104, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.88671875, "rewards/reward_reference/std": 0.31755712628364563, "step": 235 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 4096.0, "completions/max_terminated_length": 4082.0, "completions/mean_length": 3277.328125, "completions/mean_terminated_length": 3160.375244140625, "completions/min_length": 1833.0, "completions/min_terminated_length": 1833.0, "epoch": 0.050346666666666665, "frac_reward_zero_std": 0.0, "grad_norm": 0.16465094685554504, "learning_rate": 1e-06, "loss": -0.0298, "num_tokens": 160686123.0, "reward": 2.4961965084075928, "reward_std": 0.24923688173294067, "rewards/cosine_scaled_reward/mean": 0.744510293006897, "rewards/cosine_scaled_reward/std": 0.4811403155326843, "rewards/repetition_penalty_reward/mean": -0.0967513769865036, "rewards/repetition_penalty_reward/std": 0.045376941561698914, "rewards/reward_format/mean": 0.981249988079071, "rewards/reward_format/std": 0.12126781791448593, "rewards/reward_reference/mean": 0.8671875, "rewards/reward_reference/std": 0.3400367796421051, "step": 236 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4076.0, "completions/mean_length": 3352.30078125, "completions/mean_terminated_length": 3249.835693359375, "completions/min_length": 1897.0, "completions/min_terminated_length": 1897.0, "epoch": 0.05056, "frac_reward_zero_std": 0.0, "grad_norm": 0.1568382978439331, "learning_rate": 1e-06, "loss": -0.0474, "num_tokens": 161582056.0, "reward": 2.635594367980957, "reward_std": 0.2220080941915512, "rewards/cosine_scaled_reward/mean": 0.8191026449203491, "rewards/cosine_scaled_reward/std": 0.4030058979988098, "rewards/repetition_penalty_reward/mean": -0.09444564580917358, "rewards/repetition_penalty_reward/std": 0.0460088886320591, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.9140625, "rewards/reward_reference/std": 0.28082075715065, "step": 237 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4060.0, "completions/mean_length": 3279.74609375, "completions/mean_terminated_length": 3179.50439453125, "completions/min_length": 1848.0, "completions/min_terminated_length": 1848.0, "epoch": 0.05077333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.17059451341629028, "learning_rate": 1e-06, "loss": -0.0619, "num_tokens": 162473719.0, "reward": 2.5931172370910645, "reward_std": 0.25767046213150024, "rewards/cosine_scaled_reward/mean": 0.7888476848602295, "rewards/cosine_scaled_reward/std": 0.4332166910171509, "rewards/repetition_penalty_reward/mean": -0.09104306995868683, "rewards/repetition_penalty_reward/std": 0.04912455752491951, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.8984375, "rewards/reward_reference/std": 0.3026638329029083, "step": 238 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.34375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4076.0, "completions/mean_length": 3219.9453125, "completions/mean_terminated_length": 3141.659423828125, "completions/min_length": 1931.0, "completions/min_terminated_length": 1931.0, "epoch": 0.050986666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.10416688770055771, "learning_rate": 1e-06, "loss": -0.0127, "num_tokens": 163372405.0, "reward": 2.6563687324523926, "reward_std": 0.12349631637334824, "rewards/cosine_scaled_reward/mean": 0.8200638294219971, "rewards/cosine_scaled_reward/std": 0.37859129905700684, "rewards/repetition_penalty_reward/mean": -0.08557005226612091, "rewards/repetition_penalty_reward/std": 0.03227576985955238, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.921875, "rewards/reward_reference/std": 0.26889389753341675, "step": 239 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4022.0, "completions/mean_length": 3282.00390625, "completions/mean_terminated_length": 3148.804443359375, "completions/min_length": 2080.0, "completions/min_terminated_length": 2080.0, "epoch": 0.0512, "frac_reward_zero_std": 0.0, "grad_norm": 0.1761353760957718, "learning_rate": 1e-06, "loss": 0.0055, "num_tokens": 164225194.0, "reward": 2.449544906616211, "reward_std": 0.2723526358604431, "rewards/cosine_scaled_reward/mean": 0.7084175944328308, "rewards/cosine_scaled_reward/std": 0.5179768800735474, "rewards/repetition_penalty_reward/mean": -0.09168502688407898, "rewards/repetition_penalty_reward/std": 0.03688832372426987, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.8359375, "rewards/reward_reference/std": 0.3710577189922333, "step": 240 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.46875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4044.0, "completions/mean_length": 3087.98046875, "completions/mean_terminated_length": 3016.2802734375, "completions/min_length": 2035.0, "completions/min_terminated_length": 2035.0, "epoch": 0.05141333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.1311665177345276, "learning_rate": 1e-06, "loss": 0.0091, "num_tokens": 165107713.0, "reward": 2.599454164505005, "reward_std": 0.16613246500492096, "rewards/cosine_scaled_reward/mean": 0.7890911102294922, "rewards/cosine_scaled_reward/std": 0.39621245861053467, "rewards/repetition_penalty_reward/mean": -0.09198064357042313, "rewards/repetition_penalty_reward/std": 0.03552812337875366, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.90234375, "rewards/reward_reference/std": 0.29743078351020813, "step": 241 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.40625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 3141.2734375, "completions/mean_terminated_length": 3065.510498046875, "completions/min_length": 1743.0, "completions/min_terminated_length": 1743.0, "epoch": 0.05162666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.1170988455414772, "learning_rate": 1e-06, "loss": -0.0173, "num_tokens": 165988723.0, "reward": 2.692446708679199, "reward_std": 0.16342981159687042, "rewards/cosine_scaled_reward/mean": 0.8425164222717285, "rewards/cosine_scaled_reward/std": 0.3216044306755066, "rewards/repetition_penalty_reward/mean": -0.09538224339485168, "rewards/repetition_penalty_reward/std": 0.036083489656448364, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9453125, "rewards/reward_reference/std": 0.22781464457511902, "step": 242 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 3263.703125, "completions/mean_terminated_length": 3127.509033203125, "completions/min_length": 1848.0, "completions/min_terminated_length": 1848.0, "epoch": 0.05184, "frac_reward_zero_std": 0.0, "grad_norm": 0.14324931800365448, "learning_rate": 1e-06, "loss": -0.0426, "num_tokens": 166838611.0, "reward": 2.5586345195770264, "reward_std": 0.2131500393152237, "rewards/cosine_scaled_reward/mean": 0.7745422124862671, "rewards/cosine_scaled_reward/std": 0.44403311610221863, "rewards/repetition_penalty_reward/mean": -0.10028272867202759, "rewards/repetition_penalty_reward/std": 0.04599667713046074, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.890625, "rewards/reward_reference/std": 0.31272050738334656, "step": 243 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.3125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4080.0, "completions/mean_length": 3275.5390625, "completions/mean_terminated_length": 3199.14111328125, "completions/min_length": 1784.0, "completions/min_terminated_length": 1784.0, "epoch": 0.05205333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.12255389243364334, "learning_rate": 1e-06, "loss": -0.0135, "num_tokens": 167756922.0, "reward": 2.6074233055114746, "reward_std": 0.15961477160453796, "rewards/cosine_scaled_reward/mean": 0.8047130107879639, "rewards/cosine_scaled_reward/std": 0.4105326533317566, "rewards/repetition_penalty_reward/mean": -0.0996333584189415, "rewards/repetition_penalty_reward/std": 0.041832976043224335, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.90234375, "rewards/reward_reference/std": 0.29743078351020813, "step": 244 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4070.0, "completions/mean_length": 3361.8125, "completions/mean_terminated_length": 3268.017578125, "completions/min_length": 2264.0, "completions/min_terminated_length": 2264.0, "epoch": 0.05226666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.1401897817850113, "learning_rate": 1e-06, "loss": -0.0166, "num_tokens": 168667354.0, "reward": 2.6775858402252197, "reward_std": 0.18005535006523132, "rewards/cosine_scaled_reward/mean": 0.8490869998931885, "rewards/cosine_scaled_reward/std": 0.3619866967201233, "rewards/repetition_penalty_reward/mean": -0.1050950288772583, "rewards/repetition_penalty_reward/std": 0.03700846806168556, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 245 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4075.0, "completions/mean_length": 3354.1640625, "completions/mean_terminated_length": 3237.036376953125, "completions/min_length": 2237.0, "completions/min_terminated_length": 2237.0, "epoch": 0.05248, "frac_reward_zero_std": 0.0, "grad_norm": 0.1567317396402359, "learning_rate": 1e-06, "loss": -0.0361, "num_tokens": 169544251.0, "reward": 2.568568229675293, "reward_std": 0.2007070928812027, "rewards/cosine_scaled_reward/mean": 0.8031135201454163, "rewards/cosine_scaled_reward/std": 0.4272143840789795, "rewards/repetition_penalty_reward/mean": -0.12360787391662598, "rewards/repetition_penalty_reward/std": 0.056523095816373825, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626225590705872, "rewards/reward_reference/mean": 0.8984375, "rewards/reward_reference/std": 0.3026638329029083, "step": 246 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 3360.3984375, "completions/mean_terminated_length": 3243.8681640625, "completions/min_length": 1815.0, "completions/min_terminated_length": 1815.0, "epoch": 0.052693333333333335, "frac_reward_zero_std": 0.0, "grad_norm": 0.11087020486593246, "learning_rate": 1e-06, "loss": -0.0113, "num_tokens": 170415658.0, "reward": 2.5351624488830566, "reward_std": 0.11044815182685852, "rewards/cosine_scaled_reward/mean": 0.7818148136138916, "rewards/cosine_scaled_reward/std": 0.4538746476173401, "rewards/repetition_penalty_reward/mean": -0.12946489453315735, "rewards/repetition_penalty_reward/std": 0.04873738810420036, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.8828125, "rewards/reward_reference/std": 0.3222736418247223, "step": 247 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.34375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 3427.25, "completions/mean_terminated_length": 3252.650146484375, "completions/min_length": 2025.0, "completions/min_terminated_length": 2025.0, "epoch": 0.052906666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 0.15536320209503174, "learning_rate": 1e-06, "loss": -0.0183, "num_tokens": 171231498.0, "reward": 2.3969178199768066, "reward_std": 0.2622104287147522, "rewards/cosine_scaled_reward/mean": 0.712480366230011, "rewards/cosine_scaled_reward/std": 0.5295530557632446, "rewards/repetition_penalty_reward/mean": -0.13275012373924255, "rewards/repetition_penalty_reward/std": 0.042228639125823975, "rewards/reward_format/mean": 0.981249988079071, "rewards/reward_format/std": 0.12126781791448593, "rewards/reward_reference/mean": 0.8359375, "rewards/reward_reference/std": 0.3710577189922333, "step": 248 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.75, "completions/max_length": 4096.0, "completions/max_terminated_length": 4083.0, "completions/mean_length": 3377.05859375, "completions/mean_terminated_length": 3244.273193359375, "completions/min_length": 1885.0, "completions/min_terminated_length": 1885.0, "epoch": 0.05312, "frac_reward_zero_std": 0.0, "grad_norm": 0.13335011899471283, "learning_rate": 1e-06, "loss": -0.0273, "num_tokens": 172091853.0, "reward": 2.5199947357177734, "reward_std": 0.16750986874103546, "rewards/cosine_scaled_reward/mean": 0.7740402817726135, "rewards/cosine_scaled_reward/std": 0.46254417300224304, "rewards/repetition_penalty_reward/mean": -0.1329517662525177, "rewards/repetition_penalty_reward/std": 0.04756597429513931, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.87890625, "rewards/reward_reference/std": 0.3268752694129944, "step": 249 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4071.0, "completions/mean_length": 3280.61328125, "completions/mean_terminated_length": 3186.118408203125, "completions/min_length": 2232.0, "completions/min_terminated_length": 2232.0, "epoch": 0.05333333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.12476290017366409, "learning_rate": 1e-06, "loss": -0.0086, "num_tokens": 172972160.0, "reward": 2.71635103225708, "reward_std": 0.13595899939537048, "rewards/cosine_scaled_reward/mean": 0.8891398906707764, "rewards/cosine_scaled_reward/std": 0.26229527592658997, "rewards/repetition_penalty_reward/mean": -0.13138249516487122, "rewards/repetition_penalty_reward/std": 0.042913768440485, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.96484375, "rewards/reward_reference/std": 0.18453538417816162, "step": 250 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.65625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4079.0, "completions/mean_length": 3309.2578125, "completions/mean_terminated_length": 3150.431884765625, "completions/min_length": 1909.0, "completions/min_terminated_length": 1909.0, "epoch": 0.053546666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.15214867889881134, "learning_rate": 1e-06, "loss": -0.0015, "num_tokens": 173800374.0, "reward": 2.6647098064422607, "reward_std": 0.14656221866607666, "rewards/cosine_scaled_reward/mean": 0.8541313409805298, "rewards/cosine_scaled_reward/std": 0.33413589000701904, "rewards/repetition_penalty_reward/mean": -0.13082784414291382, "rewards/repetition_penalty_reward/std": 0.043466996401548386, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.94140625, "rewards/reward_reference/std": 0.23532284796237946, "step": 251 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4065.0, "completions/mean_length": 3286.8203125, "completions/mean_terminated_length": 3224.38134765625, "completions/min_length": 2185.0, "completions/min_terminated_length": 2185.0, "epoch": 0.05376, "frac_reward_zero_std": 0.0, "grad_norm": 0.11460880935192108, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 174722612.0, "reward": 2.7309741973876953, "reward_std": 0.10704682767391205, "rewards/cosine_scaled_reward/mean": 0.8857072591781616, "rewards/cosine_scaled_reward/std": 0.27719277143478394, "rewards/repetition_penalty_reward/mean": -0.11567074060440063, "rewards/repetition_penalty_reward/std": 0.030316317453980446, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9609375, "rewards/reward_reference/std": 0.19412322342395782, "step": 252 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 3266.65625, "completions/mean_terminated_length": 3139.581787109375, "completions/min_length": 2063.0, "completions/min_terminated_length": 2063.0, "epoch": 0.05397333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.22021500766277313, "learning_rate": 1e-06, "loss": -0.0572, "num_tokens": 175574204.0, "reward": 2.482029438018799, "reward_std": 0.24132297933101654, "rewards/cosine_scaled_reward/mean": 0.7378534078598022, "rewards/cosine_scaled_reward/std": 0.4845937490463257, "rewards/repetition_penalty_reward/mean": -0.11910516023635864, "rewards/repetition_penalty_reward/std": 0.04768124595284462, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.86328125, "rewards/reward_reference/std": 0.34422317147254944, "step": 253 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.3125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 3190.765625, "completions/mean_terminated_length": 3109.50439453125, "completions/min_length": 2085.0, "completions/min_terminated_length": 2085.0, "epoch": 0.05418666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.11906640976667404, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 176467160.0, "reward": 2.697772979736328, "reward_std": 0.08515664935112, "rewards/cosine_scaled_reward/mean": 0.8567619323730469, "rewards/cosine_scaled_reward/std": 0.30950209498405457, "rewards/repetition_penalty_reward/mean": -0.10820753127336502, "rewards/repetition_penalty_reward/std": 0.02926632948219776, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.94921875, "rewards/reward_reference/std": 0.21998079121112823, "step": 254 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.25, "completions/max_length": 4096.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 3160.6171875, "completions/mean_terminated_length": 3067.116455078125, "completions/min_length": 1814.0, "completions/min_terminated_length": 1814.0, "epoch": 0.0544, "frac_reward_zero_std": 0.0, "grad_norm": 0.12647591531276703, "learning_rate": 1e-06, "loss": 0.0101, "num_tokens": 177336651.0, "reward": 2.4994921684265137, "reward_std": 0.12857121229171753, "rewards/cosine_scaled_reward/mean": 0.7419434785842896, "rewards/cosine_scaled_reward/std": 0.4661872386932373, "rewards/repetition_penalty_reward/mean": -0.1135449931025505, "rewards/repetition_penalty_reward/std": 0.03488827869296074, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.87109375, "rewards/reward_reference/std": 0.33575257658958435, "step": 255 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.53125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4054.0, "completions/mean_length": 3018.65234375, "completions/mean_terminated_length": 2955.219970703125, "completions/min_length": 2017.0, "completions/min_terminated_length": 2017.0, "epoch": 0.05461333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.10715894401073456, "learning_rate": 1e-06, "loss": -0.0174, "num_tokens": 178201099.0, "reward": 2.680346965789795, "reward_std": 0.10996614396572113, "rewards/cosine_scaled_reward/mean": 0.8404624462127686, "rewards/cosine_scaled_reward/std": 0.2958315908908844, "rewards/repetition_penalty_reward/mean": -0.11011554300785065, "rewards/repetition_penalty_reward/std": 0.03434314951300621, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.953125, "rewards/reward_reference/std": 0.21178513765335083, "step": 256 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.8125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3964.0, "completions/mean_length": 2943.23828125, "completions/mean_terminated_length": 2915.572021484375, "completions/min_length": 1742.0, "completions/min_terminated_length": 1742.0, "epoch": 0.05482666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.03318123519420624, "learning_rate": 1e-06, "loss": -0.0086, "num_tokens": 179101432.0, "reward": 2.7122738361358643, "reward_std": 0.047639843076467514, "rewards/cosine_scaled_reward/mean": 0.8511331081390381, "rewards/cosine_scaled_reward/std": 0.25263121724128723, "rewards/repetition_penalty_reward/mean": -0.10370296239852905, "rewards/repetition_penalty_reward/std": 0.032866187393665314, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.96484375, "rewards/reward_reference/std": 0.18453538417816162, "step": 257 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.59375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3934.0, "completions/mean_length": 2962.69921875, "completions/mean_terminated_length": 2906.226318359375, "completions/min_length": 1761.0, "completions/min_terminated_length": 1761.0, "epoch": 0.05504, "frac_reward_zero_std": 0.0, "grad_norm": 0.14833249151706696, "learning_rate": 1e-06, "loss": -0.0222, "num_tokens": 179978589.0, "reward": 2.6928648948669434, "reward_std": 0.11383727192878723, "rewards/cosine_scaled_reward/mean": 0.8450006246566772, "rewards/cosine_scaled_reward/std": 0.2710968852043152, "rewards/repetition_penalty_reward/mean": -0.09744831174612045, "rewards/repetition_penalty_reward/std": 0.04459652677178383, "rewards/reward_format/mean": 0.984375, "rewards/reward_format/std": 0.11092304438352585, "rewards/reward_reference/mean": 0.9609375, "rewards/reward_reference/std": 0.19412322342395782, "step": 258 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.5, "completions/max_length": 4096.0, "completions/max_terminated_length": 4079.0, "completions/mean_length": 2981.18359375, "completions/mean_terminated_length": 2909.14599609375, "completions/min_length": 2071.0, "completions/min_terminated_length": 2071.0, "epoch": 0.055253333333333335, "frac_reward_zero_std": 0.0, "grad_norm": 0.18252234160900116, "learning_rate": 1e-06, "loss": -0.0232, "num_tokens": 180840680.0, "reward": 2.587800979614258, "reward_std": 0.24309676885604858, "rewards/cosine_scaled_reward/mean": 0.7760927677154541, "rewards/cosine_scaled_reward/std": 0.3907049298286438, "rewards/repetition_penalty_reward/mean": -0.098448246717453, "rewards/repetition_penalty_reward/std": 0.03178011253476143, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.91015625, "rewards/reward_reference/std": 0.2865179479122162, "step": 259 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.6875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4031.0, "completions/mean_length": 2906.25, "completions/mean_terminated_length": 2859.044677734375, "completions/min_length": 1778.0, "completions/min_terminated_length": 1778.0, "epoch": 0.055466666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 0.12263341248035431, "learning_rate": 1e-06, "loss": -0.0353, "num_tokens": 181704285.0, "reward": 2.6258513927459717, "reward_std": 0.14923956990242004, "rewards/cosine_scaled_reward/mean": 0.7974638938903809, "rewards/cosine_scaled_reward/std": 0.343732088804245, "rewards/repetition_penalty_reward/mean": -0.09817510098218918, "rewards/repetition_penalty_reward/std": 0.032411616295576096, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.9296875, "rewards/reward_reference/std": 0.2561737895011902, "step": 260 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4029.0, "completions/mean_length": 3021.8515625, "completions/mean_terminated_length": 3000.454345703125, "completions/min_length": 2023.0, "completions/min_terminated_length": 2023.0, "epoch": 0.05568, "frac_reward_zero_std": 0.0, "grad_norm": 0.1144053116440773, "learning_rate": 1e-06, "loss": -0.0253, "num_tokens": 182623943.0, "reward": 2.600637912750244, "reward_std": 0.13089832663536072, "rewards/cosine_scaled_reward/mean": 0.793508768081665, "rewards/cosine_scaled_reward/std": 0.38059523701667786, "rewards/repetition_penalty_reward/mean": -0.10380822420120239, "rewards/repetition_penalty_reward/std": 0.03530118241906166, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.9140625, "rewards/reward_reference/std": 0.28082075715065, "step": 261 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.75, "completions/max_length": 4096.0, "completions/max_terminated_length": 4036.0, "completions/mean_length": 3025.87890625, "completions/mean_terminated_length": 2991.35888671875, "completions/min_length": 1986.0, "completions/min_terminated_length": 1986.0, "epoch": 0.05589333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.12886103987693787, "learning_rate": 1e-06, "loss": -0.0184, "num_tokens": 183529604.0, "reward": 2.684856414794922, "reward_std": 0.12615951895713806, "rewards/cosine_scaled_reward/mean": 0.8371844291687012, "rewards/cosine_scaled_reward/std": 0.30617836117744446, "rewards/repetition_penalty_reward/mean": -0.10154666006565094, "rewards/repetition_penalty_reward/std": 0.04301191866397858, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.94921875, "rewards/reward_reference/std": 0.21998079121112823, "step": 262 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.46875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3973.0, "completions/mean_length": 3003.31640625, "completions/mean_terminated_length": 2925.593994140625, "completions/min_length": 1879.0, "completions/min_terminated_length": 1879.0, "epoch": 0.056106666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.13606540858745575, "learning_rate": 1e-06, "loss": -0.0132, "num_tokens": 184387913.0, "reward": 2.5859298706054688, "reward_std": 0.16840368509292603, "rewards/cosine_scaled_reward/mean": 0.7858878374099731, "rewards/cosine_scaled_reward/std": 0.38099440932273865, "rewards/repetition_penalty_reward/mean": -0.10698917508125305, "rewards/repetition_penalty_reward/std": 0.041619643568992615, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.91015625, "rewards/reward_reference/std": 0.2865179479122162, "step": 263 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.40625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4015.0, "completions/mean_length": 3049.58203125, "completions/mean_terminated_length": 2965.69189453125, "completions/min_length": 1703.0, "completions/min_terminated_length": 1703.0, "epoch": 0.05632, "frac_reward_zero_std": 0.0, "grad_norm": 0.0597972609102726, "learning_rate": 1e-06, "loss": 0.0099, "num_tokens": 185252946.0, "reward": 2.6976442337036133, "reward_std": 0.07548146694898605, "rewards/cosine_scaled_reward/mean": 0.8483776450157166, "rewards/cosine_scaled_reward/std": 0.28662973642349243, "rewards/repetition_penalty_reward/mean": -0.10463981330394745, "rewards/repetition_penalty_reward/std": 0.04801265150308609, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.95703125, "rewards/reward_reference/std": 0.20318391919136047, "step": 264 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.25, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 3129.72265625, "completions/mean_terminated_length": 3029.762939453125, "completions/min_length": 1824.0, "completions/min_terminated_length": 1824.0, "epoch": 0.05653333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.1184777244925499, "learning_rate": 1e-06, "loss": -0.039, "num_tokens": 186116847.0, "reward": 2.613929271697998, "reward_std": 0.16660256683826447, "rewards/cosine_scaled_reward/mean": 0.8040981292724609, "rewards/cosine_scaled_reward/std": 0.38224369287490845, "rewards/repetition_penalty_reward/mean": -0.09954366832971573, "rewards/repetition_penalty_reward/std": 0.03987511247396469, "rewards/reward_format/mean": 0.987500011920929, "rewards/reward_format/std": 0.0994100272655487, "rewards/reward_reference/mean": 0.921875, "rewards/reward_reference/std": 0.26889389753341675, "step": 265 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.21875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4030.0, "completions/mean_length": 3116.640625, "completions/mean_terminated_length": 3010.6494140625, "completions/min_length": 2146.0, "completions/min_terminated_length": 2146.0, "epoch": 0.05674666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.17567136883735657, "learning_rate": 1e-06, "loss": -0.0435, "num_tokens": 186971919.0, "reward": 2.539384365081787, "reward_std": 0.2967863976955414, "rewards/cosine_scaled_reward/mean": 0.7633931636810303, "rewards/cosine_scaled_reward/std": 0.43265610933303833, "rewards/repetition_penalty_reward/mean": -0.09900867938995361, "rewards/repetition_penalty_reward/std": 0.03749703988432884, "rewards/reward_format/mean": 0.984375, "rewards/reward_format/std": 0.11092304438352585, "rewards/reward_reference/mean": 0.890625, "rewards/reward_reference/std": 0.31272050738334656, "step": 266 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.8125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4048.0, "completions/mean_length": 3276.57421875, "completions/mean_terminated_length": 3133.73828125, "completions/min_length": 1998.0, "completions/min_terminated_length": 1998.0, "epoch": 0.05696, "frac_reward_zero_std": 0.0, "grad_norm": 0.18023541569709778, "learning_rate": 1e-06, "loss": -0.0461, "num_tokens": 187819014.0, "reward": 2.4331483840942383, "reward_std": 0.21694278717041016, "rewards/cosine_scaled_reward/mean": 0.7236498594284058, "rewards/cosine_scaled_reward/std": 0.5014133453369141, "rewards/repetition_penalty_reward/mean": -0.11081399023532867, "rewards/repetition_penalty_reward/std": 0.047140881419181824, "rewards/reward_format/mean": 0.96875, "rewards/reward_format/std": 0.15529859066009521, "rewards/reward_reference/mean": 0.8515625, "rewards/reward_reference/std": 0.3562295734882355, "step": 267 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.4375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4082.0, "completions/mean_length": 3139.97265625, "completions/mean_terminated_length": 3067.668212890625, "completions/min_length": 2059.0, "completions/min_terminated_length": 2059.0, "epoch": 0.05717333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.126700758934021, "learning_rate": 1e-06, "loss": -0.0079, "num_tokens": 188712451.0, "reward": 2.656765937805176, "reward_std": 0.14348021149635315, "rewards/cosine_scaled_reward/mean": 0.8294387459754944, "rewards/cosine_scaled_reward/std": 0.34767425060272217, "rewards/repetition_penalty_reward/mean": -0.10001654922962189, "rewards/repetition_penalty_reward/std": 0.041904617100954056, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 268 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 3171.8203125, "completions/mean_terminated_length": 3126.36865234375, "completions/min_length": 1776.0, "completions/min_terminated_length": 1776.0, "epoch": 0.05738666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.06981715559959412, "learning_rate": 1e-06, "loss": -0.0295, "num_tokens": 189632833.0, "reward": 2.670246124267578, "reward_std": 0.0859508365392685, "rewards/cosine_scaled_reward/mean": 0.8335663080215454, "rewards/cosine_scaled_reward/std": 0.34999513626098633, "rewards/repetition_penalty_reward/mean": -0.09691374003887177, "rewards/repetition_penalty_reward/std": 0.034196559339761734, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 269 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4049.0, "completions/mean_length": 3266.21875, "completions/mean_terminated_length": 3164.31591796875, "completions/min_length": 1990.0, "completions/min_terminated_length": 1990.0, "epoch": 0.0576, "frac_reward_zero_std": 0.0, "grad_norm": 0.09501556307077408, "learning_rate": 1e-06, "loss": -0.0297, "num_tokens": 190514865.0, "reward": 2.5719354152679443, "reward_std": 0.17583732306957245, "rewards/cosine_scaled_reward/mean": 0.7868057489395142, "rewards/cosine_scaled_reward/std": 0.43395617604255676, "rewards/repetition_penalty_reward/mean": -0.10315153002738953, "rewards/repetition_penalty_reward/std": 0.03619806841015816, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.89453125, "rewards/reward_reference/std": 0.3077581524848938, "step": 270 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.40625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4081.0, "completions/mean_length": 3167.8203125, "completions/mean_terminated_length": 3093.4091796875, "completions/min_length": 1871.0, "completions/min_terminated_length": 1871.0, "epoch": 0.057813333333333335, "frac_reward_zero_std": 0.0, "grad_norm": 0.09603821486234665, "learning_rate": 1e-06, "loss": -0.0295, "num_tokens": 191411267.0, "reward": 2.5477559566497803, "reward_std": 0.14840057492256165, "rewards/cosine_scaled_reward/mean": 0.7638838291168213, "rewards/cosine_scaled_reward/std": 0.4406859874725342, "rewards/repetition_penalty_reward/mean": -0.09425283968448639, "rewards/repetition_penalty_reward/std": 0.03563051298260689, "rewards/reward_format/mean": 0.987500011920929, "rewards/reward_format/std": 0.0994100272655487, "rewards/reward_reference/mean": 0.890625, "rewards/reward_reference/std": 0.31272050738334656, "step": 271 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4051.0, "completions/mean_length": 3265.11328125, "completions/mean_terminated_length": 3129.14990234375, "completions/min_length": 1960.0, "completions/min_terminated_length": 1960.0, "epoch": 0.058026666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 0.16115225851535797, "learning_rate": 1e-06, "loss": -0.032, "num_tokens": 192265076.0, "reward": 2.475174903869629, "reward_std": 0.226247176527977, "rewards/cosine_scaled_reward/mean": 0.7361939549446106, "rewards/cosine_scaled_reward/std": 0.4867141842842102, "rewards/repetition_penalty_reward/mean": -0.10398783534765244, "rewards/repetition_penalty_reward/std": 0.04068681597709656, "rewards/reward_format/mean": 0.987500011920929, "rewards/reward_format/std": 0.0994100272655487, "rewards/reward_reference/mean": 0.85546875, "rewards/reward_reference/std": 0.35231640934944153, "step": 272 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.1875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4043.0, "completions/mean_length": 3192.22265625, "completions/mean_terminated_length": 3090.056396484375, "completions/min_length": 2059.0, "completions/min_terminated_length": 2059.0, "epoch": 0.05824, "frac_reward_zero_std": 0.0, "grad_norm": 0.12524887919425964, "learning_rate": 1e-06, "loss": -0.0274, "num_tokens": 193135969.0, "reward": 2.5385026931762695, "reward_std": 0.20201915502548218, "rewards/cosine_scaled_reward/mean": 0.7618891000747681, "rewards/cosine_scaled_reward/std": 0.45050087571144104, "rewards/repetition_penalty_reward/mean": -0.09994899481534958, "rewards/repetition_penalty_reward/std": 0.033864665776491165, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.8828125, "rewards/reward_reference/std": 0.3222736418247223, "step": 273 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.4375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 3212.69140625, "completions/mean_terminated_length": 3145.88671875, "completions/min_length": 2237.0, "completions/min_terminated_length": 2237.0, "epoch": 0.058453333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 0.12767969071865082, "learning_rate": 1e-06, "loss": -0.0307, "num_tokens": 194048178.0, "reward": 2.5636038780212402, "reward_std": 0.1429387778043747, "rewards/cosine_scaled_reward/mean": 0.7813065052032471, "rewards/cosine_scaled_reward/std": 0.4319900572299957, "rewards/repetition_penalty_reward/mean": -0.09973399341106415, "rewards/repetition_penalty_reward/std": 0.03400009125471115, "rewards/reward_format/mean": 0.9874999523162842, "rewards/reward_format/std": 0.0994100272655487, "rewards/reward_reference/mean": 0.89453125, "rewards/reward_reference/std": 0.3077581524848938, "step": 274 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 3136.1640625, "completions/mean_terminated_length": 3054.822021484375, "completions/min_length": 1454.0, "completions/min_terminated_length": 1454.0, "epoch": 0.058666666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.11141457408666611, "learning_rate": 1e-06, "loss": -0.0053, "num_tokens": 194931672.0, "reward": 2.6866955757141113, "reward_std": 0.145915687084198, "rewards/cosine_scaled_reward/mean": 0.8439056873321533, "rewards/cosine_scaled_reward/std": 0.31848499178886414, "rewards/repetition_penalty_reward/mean": -0.09627248346805573, "rewards/repetition_penalty_reward/std": 0.03544648736715317, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.9453125, "rewards/reward_reference/std": 0.22781464457511902, "step": 275 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.4375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 3193.03125, "completions/mean_terminated_length": 3124.73974609375, "completions/min_length": 2225.0, "completions/min_terminated_length": 2225.0, "epoch": 0.05888, "frac_reward_zero_std": 0.0, "grad_norm": 0.08281750977039337, "learning_rate": 1e-06, "loss": -0.0113, "num_tokens": 195837212.0, "reward": 2.665475845336914, "reward_std": 0.1273091584444046, "rewards/cosine_scaled_reward/mean": 0.8375442624092102, "rewards/cosine_scaled_reward/std": 0.34636956453323364, "rewards/repetition_penalty_reward/mean": -0.09316206723451614, "rewards/repetition_penalty_reward/std": 0.03389699384570122, "rewards/reward_format/mean": 0.987500011920929, "rewards/reward_format/std": 0.0994100272655487, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 276 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.40625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4046.0, "completions/mean_length": 3165.02734375, "completions/mean_terminated_length": 3090.392333984375, "completions/min_length": 2009.0, "completions/min_terminated_length": 2009.0, "epoch": 0.05909333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.08791069686412811, "learning_rate": 1e-06, "loss": -0.0169, "num_tokens": 196731383.0, "reward": 2.6439356803894043, "reward_std": 0.15615218877792358, "rewards/cosine_scaled_reward/mean": 0.8256608843803406, "rewards/cosine_scaled_reward/std": 0.3610367774963379, "rewards/repetition_penalty_reward/mean": -0.09813161194324493, "rewards/repetition_penalty_reward/std": 0.04111843183636665, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626225590705872, "rewards/reward_reference/mean": 0.92578125, "rewards/reward_reference/std": 0.2626400291919708, "step": 277 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.3125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4079.0, "completions/mean_length": 3193.63671875, "completions/mean_terminated_length": 3108.79931640625, "completions/min_length": 2110.0, "completions/min_terminated_length": 2110.0, "epoch": 0.05930666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.14001445472240448, "learning_rate": 1e-06, "loss": -0.0237, "num_tokens": 197620078.0, "reward": 2.425388813018799, "reward_std": 0.21485967934131622, "rewards/cosine_scaled_reward/mean": 0.7161268591880798, "rewards/cosine_scaled_reward/std": 0.4970853924751282, "rewards/repetition_penalty_reward/mean": -0.11026932299137115, "rewards/repetition_penalty_reward/std": 0.04518568143248558, "rewards/reward_format/mean": 0.9718749523162842, "rewards/reward_format/std": 0.1476283222436905, "rewards/reward_reference/mean": 0.84765625, "rewards/reward_reference/std": 0.3600577116012573, "step": 278 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.21875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4065.0, "completions/mean_length": 3300.109375, "completions/mean_terminated_length": 3213.97412109375, "completions/min_length": 2161.0, "completions/min_terminated_length": 2161.0, "epoch": 0.05952, "frac_reward_zero_std": 0.0, "grad_norm": 0.14491397142410278, "learning_rate": 1e-06, "loss": -0.0148, "num_tokens": 198519118.0, "reward": 2.5698421001434326, "reward_std": 0.20768402516841888, "rewards/cosine_scaled_reward/mean": 0.7817451357841492, "rewards/cosine_scaled_reward/std": 0.44465169310569763, "rewards/repetition_penalty_reward/mean": -0.09940297901630402, "rewards/repetition_penalty_reward/std": 0.037566814571619034, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.890625, "rewards/reward_reference/std": 0.31272050738334656, "step": 279 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.34375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4078.0, "completions/mean_length": 3297.74609375, "completions/mean_terminated_length": 3226.41259765625, "completions/min_length": 1910.0, "completions/min_terminated_length": 1910.0, "epoch": 0.05973333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.13260126113891602, "learning_rate": 1e-06, "loss": -0.0481, "num_tokens": 199439145.0, "reward": 2.5585880279541016, "reward_std": 0.18200063705444336, "rewards/cosine_scaled_reward/mean": 0.7834160327911377, "rewards/cosine_scaled_reward/std": 0.4448276460170746, "rewards/repetition_penalty_reward/mean": -0.10217203199863434, "rewards/repetition_penalty_reward/std": 0.036684300750494, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626226335763931, "rewards/reward_reference/mean": 0.88671875, "rewards/reward_reference/std": 0.31755712628364563, "step": 280 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 3296.578125, "completions/mean_terminated_length": 3190.62841796875, "completions/min_length": 2143.0, "completions/min_terminated_length": 2143.0, "epoch": 0.05994666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.10680922865867615, "learning_rate": 1e-06, "loss": -0.0296, "num_tokens": 200329419.0, "reward": 2.635676383972168, "reward_std": 0.1627238392829895, "rewards/cosine_scaled_reward/mean": 0.8256530165672302, "rewards/cosine_scaled_reward/std": 0.3862798810005188, "rewards/repetition_penalty_reward/mean": -0.09935159981250763, "rewards/repetition_penalty_reward/std": 0.04032512754201889, "rewards/reward_format/mean": 0.987500011920929, "rewards/reward_format/std": 0.0994100272655487, "rewards/reward_reference/mean": 0.921875, "rewards/reward_reference/std": 0.26889389753341675, "step": 281 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.21875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4041.0, "completions/mean_length": 3258.14453125, "completions/mean_terminated_length": 3167.467529296875, "completions/min_length": 1958.0, "completions/min_terminated_length": 1958.0, "epoch": 0.06016, "frac_reward_zero_std": 0.0, "grad_norm": 0.14413586258888245, "learning_rate": 1e-06, "loss": -0.0382, "num_tokens": 201221040.0, "reward": 2.547389030456543, "reward_std": 0.22861404716968536, "rewards/cosine_scaled_reward/mean": 0.7698594331741333, "rewards/cosine_scaled_reward/std": 0.45155566930770874, "rewards/repetition_penalty_reward/mean": -0.09981396794319153, "rewards/repetition_penalty_reward/std": 0.03832467272877693, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626226335763931, "rewards/reward_reference/mean": 0.88671875, "rewards/reward_reference/std": 0.31755712628364563, "step": 282 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.8125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 3319.23046875, "completions/mean_terminated_length": 3183.830078125, "completions/min_length": 2119.0, "completions/min_terminated_length": 2119.0, "epoch": 0.060373333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.11200573295354843, "learning_rate": 1e-06, "loss": -0.0452, "num_tokens": 202074631.0, "reward": 2.628976345062256, "reward_std": 0.1832224726676941, "rewards/cosine_scaled_reward/mean": 0.8138462901115417, "rewards/cosine_scaled_reward/std": 0.4038219749927521, "rewards/repetition_penalty_reward/mean": -0.09893252700567245, "rewards/repetition_penalty_reward/std": 0.034034404903650284, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9140625, "rewards/reward_reference/std": 0.28082075715065, "step": 283 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 3257.13671875, "completions/mean_terminated_length": 3149.968994140625, "completions/min_length": 2014.0, "completions/min_terminated_length": 2014.0, "epoch": 0.060586666666666664, "frac_reward_zero_std": 0.0, "grad_norm": 0.09669145196676254, "learning_rate": 1e-06, "loss": -0.0382, "num_tokens": 202950866.0, "reward": 2.6120972633361816, "reward_std": 0.13966014981269836, "rewards/cosine_scaled_reward/mean": 0.819635808467865, "rewards/cosine_scaled_reward/std": 0.3874451816082001, "rewards/repetition_penalty_reward/mean": -0.10988226532936096, "rewards/repetition_penalty_reward/std": 0.0434051938354969, "rewards/reward_format/mean": 0.984375, "rewards/reward_format/std": 0.11092304438352585, "rewards/reward_reference/mean": 0.91796875, "rewards/reward_reference/std": 0.2749498784542084, "step": 284 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4027.0, "completions/mean_length": 3347.43359375, "completions/mean_terminated_length": 3237.659423828125, "completions/min_length": 2152.0, "completions/min_terminated_length": 2152.0, "epoch": 0.0608, "frac_reward_zero_std": 0.0, "grad_norm": 0.1725301891565323, "learning_rate": 1e-06, "loss": -0.0637, "num_tokens": 203839344.0, "reward": 2.493455410003662, "reward_std": 0.24681459367275238, "rewards/cosine_scaled_reward/mean": 0.7412490248680115, "rewards/cosine_scaled_reward/std": 0.4978993535041809, "rewards/repetition_penalty_reward/mean": -0.10091851651668549, "rewards/repetition_penalty_reward/std": 0.03664093464612961, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.859375, "rewards/reward_reference/std": 0.3483152687549591, "step": 285 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 3269.0703125, "completions/mean_terminated_length": 3146.69970703125, "completions/min_length": 2100.0, "completions/min_terminated_length": 2100.0, "epoch": 0.061013333333333336, "frac_reward_zero_std": 0.0, "grad_norm": 0.17975397408008575, "learning_rate": 1e-06, "loss": -0.0196, "num_tokens": 204701962.0, "reward": 2.495086193084717, "reward_std": 0.22708861529827118, "rewards/cosine_scaled_reward/mean": 0.7405046820640564, "rewards/cosine_scaled_reward/std": 0.48294419050216675, "rewards/repetition_penalty_reward/mean": -0.1001059040427208, "rewards/repetition_penalty_reward/std": 0.0333668552339077, "rewards/reward_format/mean": 0.987500011920929, "rewards/reward_format/std": 0.0994100272655487, "rewards/reward_reference/mean": 0.8671875, "rewards/reward_reference/std": 0.3400367796421051, "step": 286 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 3339.1640625, "completions/mean_terminated_length": 3215.318115234375, "completions/min_length": 2036.0, "completions/min_terminated_length": 2036.0, "epoch": 0.061226666666666665, "frac_reward_zero_std": 0.0, "grad_norm": 0.10416944324970245, "learning_rate": 1e-06, "loss": 0.0069, "num_tokens": 205566048.0, "reward": 2.5090126991271973, "reward_std": 0.16860689222812653, "rewards/cosine_scaled_reward/mean": 0.7532828450202942, "rewards/cosine_scaled_reward/std": 0.4829765260219574, "rewards/repetition_penalty_reward/mean": -0.10833275318145752, "rewards/repetition_penalty_reward/std": 0.04052012786269188, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.8671875, "rewards/reward_reference/std": 0.3400367796421051, "step": 287 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.71875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4059.0, "completions/mean_length": 3386.76171875, "completions/mean_terminated_length": 3251.51171875, "completions/min_length": 2190.0, "completions/min_terminated_length": 2190.0, "epoch": 0.06144, "frac_reward_zero_std": 0.0, "grad_norm": 0.12626913189888, "learning_rate": 1e-06, "loss": -0.0333, "num_tokens": 206423219.0, "reward": 2.553403377532959, "reward_std": 0.2632274925708771, "rewards/cosine_scaled_reward/mean": 0.7919354438781738, "rewards/cosine_scaled_reward/std": 0.44670379161834717, "rewards/repetition_penalty_reward/mean": -0.10728215426206589, "rewards/repetition_penalty_reward/std": 0.03807291015982628, "rewards/reward_format/mean": 0.9781249761581421, "rewards/reward_format/std": 0.13072198629379272, "rewards/reward_reference/mean": 0.890625, "rewards/reward_reference/std": 0.31272050738334656, "step": 288 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.59375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4083.0, "completions/mean_length": 3415.59765625, "completions/mean_terminated_length": 3270.48828125, "completions/min_length": 2249.0, "completions/min_terminated_length": 2249.0, "epoch": 0.06165333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.14491280913352966, "learning_rate": 1e-06, "loss": -0.069, "num_tokens": 207272948.0, "reward": 2.448690414428711, "reward_std": 0.23005658388137817, "rewards/cosine_scaled_reward/mean": 0.729130208492279, "rewards/cosine_scaled_reward/std": 0.5144220590591431, "rewards/repetition_penalty_reward/mean": -0.1124710738658905, "rewards/repetition_penalty_reward/std": 0.03470207005739212, "rewards/reward_format/mean": 0.984375, "rewards/reward_format/std": 0.11092304438352585, "rewards/reward_reference/mean": 0.84765625, "rewards/reward_reference/std": 0.3600577116012573, "step": 289 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4081.0, "completions/mean_length": 3416.36328125, "completions/mean_terminated_length": 3308.728759765625, "completions/min_length": 1827.0, "completions/min_terminated_length": 1827.0, "epoch": 0.06186666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.16962240636348724, "learning_rate": 1e-06, "loss": -0.0594, "num_tokens": 208159941.0, "reward": 2.477572202682495, "reward_std": 0.23482105135917664, "rewards/cosine_scaled_reward/mean": 0.7438188791275024, "rewards/cosine_scaled_reward/std": 0.5046337842941284, "rewards/repetition_penalty_reward/mean": -0.11546549201011658, "rewards/repetition_penalty_reward/std": 0.041258033365011215, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.85546875, "rewards/reward_reference/std": 0.35231640934944153, "step": 290 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4066.0, "completions/mean_length": 3439.75, "completions/mean_terminated_length": 3342.636962890625, "completions/min_length": 2159.0, "completions/min_terminated_length": 2159.0, "epoch": 0.06208, "frac_reward_zero_std": 0.0, "grad_norm": 0.1446293741464615, "learning_rate": 1e-06, "loss": -0.0473, "num_tokens": 209067349.0, "reward": 2.663702964782715, "reward_std": 0.18608683347702026, "rewards/cosine_scaled_reward/mean": 0.8517959117889404, "rewards/cosine_scaled_reward/std": 0.37194642424583435, "rewards/repetition_penalty_reward/mean": -0.10840542614459991, "rewards/repetition_penalty_reward/std": 0.0434514544904232, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626226335763931, "rewards/reward_reference/mean": 0.9296875, "rewards/reward_reference/std": 0.2561737895011902, "step": 291 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 3305.83984375, "completions/mean_terminated_length": 3188.910400390625, "completions/min_length": 1947.0, "completions/min_terminated_length": 1947.0, "epoch": 0.06229333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.11810626089572906, "learning_rate": 1e-06, "loss": -0.0112, "num_tokens": 209936872.0, "reward": 2.6013426780700684, "reward_std": 0.17282244563102722, "rewards/cosine_scaled_reward/mean": 0.8206616640090942, "rewards/cosine_scaled_reward/std": 0.39270922541618347, "rewards/repetition_penalty_reward/mean": -0.11541298031806946, "rewards/repetition_penalty_reward/std": 0.043614938855171204, "rewards/reward_format/mean": 0.9781249761581421, "rewards/reward_format/std": 0.13072198629379272, "rewards/reward_reference/mean": 0.91796875, "rewards/reward_reference/std": 0.2749498784542084, "step": 292 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.8125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4083.0, "completions/mean_length": 3438.2109375, "completions/mean_terminated_length": 3323.55029296875, "completions/min_length": 2268.0, "completions/min_terminated_length": 2268.0, "epoch": 0.06250666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.0373576357960701, "learning_rate": 1e-06, "loss": -0.0087, "num_tokens": 210821630.0, "reward": 2.6414241790771484, "reward_std": 0.13541769981384277, "rewards/cosine_scaled_reward/mean": 0.8415735960006714, "rewards/cosine_scaled_reward/std": 0.39156049489974976, "rewards/repetition_penalty_reward/mean": -0.11577431857585907, "rewards/repetition_penalty_reward/std": 0.03710121288895607, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.921875, "rewards/reward_reference/std": 0.26889389753341675, "step": 293 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 3379.16796875, "completions/mean_terminated_length": 3265.642578125, "completions/min_length": 2079.0, "completions/min_terminated_length": 2079.0, "epoch": 0.06272, "frac_reward_zero_std": 0.0, "grad_norm": 0.20948846638202667, "learning_rate": 1e-06, "loss": -0.0318, "num_tokens": 211710845.0, "reward": 2.5950610637664795, "reward_std": 0.1684325486421585, "rewards/cosine_scaled_reward/mean": 0.8164889216423035, "rewards/cosine_scaled_reward/std": 0.4118175208568573, "rewards/repetition_penalty_reward/mean": -0.12220916152000427, "rewards/repetition_penalty_reward/std": 0.04563181474804878, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626225590705872, "rewards/reward_reference/mean": 0.91015625, "rewards/reward_reference/std": 0.2865179479122162, "step": 294 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 3329.10546875, "completions/mean_terminated_length": 3207.651611328125, "completions/min_length": 2152.0, "completions/min_terminated_length": 2152.0, "epoch": 0.06293333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.13313370943069458, "learning_rate": 1e-06, "loss": -0.0131, "num_tokens": 212586592.0, "reward": 2.592970371246338, "reward_std": 0.2044907510280609, "rewards/cosine_scaled_reward/mean": 0.8135062456130981, "rewards/cosine_scaled_reward/std": 0.4072699248790741, "rewards/repetition_penalty_reward/mean": -0.11819228529930115, "rewards/repetition_penalty_reward/std": 0.03720817714929581, "rewards/reward_format/mean": 0.987500011920929, "rewards/reward_format/std": 0.0994100272655487, "rewards/reward_reference/mean": 0.91015625, "rewards/reward_reference/std": 0.2865179479122162, "step": 295 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.1875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 3270.796875, "completions/mean_terminated_length": 3177.512939453125, "completions/min_length": 2274.0, "completions/min_terminated_length": 2274.0, "epoch": 0.06314666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.1521005481481552, "learning_rate": 1e-06, "loss": -0.0386, "num_tokens": 213475716.0, "reward": 2.567768096923828, "reward_std": 0.22235240042209625, "rewards/cosine_scaled_reward/mean": 0.8015576601028442, "rewards/cosine_scaled_reward/std": 0.41258522868156433, "rewards/repetition_penalty_reward/mean": -0.13300836086273193, "rewards/repetition_penalty_reward/std": 0.03633992001414299, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.90234375, "rewards/reward_reference/std": 0.29743078351020813, "step": 296 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.5625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 3497.609375, "completions/mean_terminated_length": 3366.533447265625, "completions/min_length": 2249.0, "completions/min_terminated_length": 2249.0, "epoch": 0.06336, "frac_reward_zero_std": 0.0, "grad_norm": 0.14543825387954712, "learning_rate": 1e-06, "loss": -0.0242, "num_tokens": 214346760.0, "reward": 2.4732470512390137, "reward_std": 0.29390355944633484, "rewards/cosine_scaled_reward/mean": 0.7561601400375366, "rewards/cosine_scaled_reward/std": 0.5014687776565552, "rewards/repetition_penalty_reward/mean": -0.1297881156206131, "rewards/repetition_penalty_reward/std": 0.04418834671378136, "rewards/reward_format/mean": 0.9874999523162842, "rewards/reward_format/std": 0.0994100272655487, "rewards/reward_reference/mean": 0.859375, "rewards/reward_reference/std": 0.3483152687549591, "step": 297 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.4375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4079.0, "completions/mean_length": 3238.6484375, "completions/mean_terminated_length": 3173.806884765625, "completions/min_length": 2302.0, "completions/min_terminated_length": 2302.0, "epoch": 0.06357333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.09670107811689377, "learning_rate": 1e-06, "loss": -0.0168, "num_tokens": 215257686.0, "reward": 2.6410131454467773, "reward_std": 0.12483850121498108, "rewards/cosine_scaled_reward/mean": 0.8424481749534607, "rewards/cosine_scaled_reward/std": 0.34818026423454285, "rewards/repetition_penalty_reward/mean": -0.13346639275550842, "rewards/repetition_penalty_reward/std": 0.04153239354491234, "rewards/reward_format/mean": 0.9984375238418579, "rewards/reward_format/std": 0.02500000037252903, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 298 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.15625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4068.0, "completions/mean_length": 3258.67578125, "completions/mean_terminated_length": 3159.951904296875, "completions/min_length": 2020.0, "completions/min_terminated_length": 2020.0, "epoch": 0.06378666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.11681879311800003, "learning_rate": 1e-06, "loss": -0.0334, "num_tokens": 216144543.0, "reward": 2.4384727478027344, "reward_std": 0.16123497486114502, "rewards/cosine_scaled_reward/mean": 0.7305750846862793, "rewards/cosine_scaled_reward/std": 0.4939228594303131, "rewards/repetition_penalty_reward/mean": -0.13507118821144104, "rewards/repetition_penalty_reward/std": 0.048798564821481705, "rewards/reward_format/mean": 0.987500011920929, "rewards/reward_format/std": 0.0994100272655487, "rewards/reward_reference/mean": 0.85546875, "rewards/reward_reference/std": 0.35231640934944153, "step": 299 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 3418.9609375, "completions/mean_terminated_length": 3335.81591796875, "completions/min_length": 2241.0, "completions/min_terminated_length": 2241.0, "epoch": 0.064, "frac_reward_zero_std": 0.0, "grad_norm": 0.1509273201227188, "learning_rate": 1e-06, "loss": -0.0569, "num_tokens": 217073285.0, "reward": 2.488239049911499, "reward_std": 0.19046112895011902, "rewards/cosine_scaled_reward/mean": 0.7709412574768066, "rewards/cosine_scaled_reward/std": 0.477975994348526, "rewards/repetition_penalty_reward/mean": -0.13895225524902344, "rewards/repetition_penalty_reward/std": 0.041755057871341705, "rewards/reward_format/mean": 0.9812500476837158, "rewards/reward_format/std": 0.12126781791448593, "rewards/reward_reference/mean": 0.875, "rewards/reward_reference/std": 0.33136674761772156, "step": 300 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4081.0, "completions/mean_length": 3356.453125, "completions/mean_terminated_length": 3261.973388671875, "completions/min_length": 2103.0, "completions/min_terminated_length": 2103.0, "epoch": 0.06421333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.09332457184791565, "learning_rate": 1e-06, "loss": -0.0115, "num_tokens": 217976461.0, "reward": 2.6025989055633545, "reward_std": 0.14444968104362488, "rewards/cosine_scaled_reward/mean": 0.8382408618927002, "rewards/cosine_scaled_reward/std": 0.3792869448661804, "rewards/repetition_penalty_reward/mean": -0.14189210534095764, "rewards/repetition_penalty_reward/std": 0.040102288126945496, "rewards/reward_format/mean": 0.984375, "rewards/reward_format/std": 0.11092304438352585, "rewards/reward_reference/mean": 0.921875, "rewards/reward_reference/std": 0.26889389753341675, "step": 301 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 3446.98046875, "completions/mean_terminated_length": 3360.827392578125, "completions/min_length": 2168.0, "completions/min_terminated_length": 2168.0, "epoch": 0.06442666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.15771138668060303, "learning_rate": 1e-06, "loss": -0.0407, "num_tokens": 218901280.0, "reward": 2.5199830532073975, "reward_std": 0.21550403535366058, "rewards/cosine_scaled_reward/mean": 0.7968882322311401, "rewards/cosine_scaled_reward/std": 0.4498888850212097, "rewards/repetition_penalty_reward/mean": -0.15503031015396118, "rewards/repetition_penalty_reward/std": 0.05109821632504463, "rewards/reward_format/mean": 0.987500011920929, "rewards/reward_format/std": 0.0994100272655487, "rewards/reward_reference/mean": 0.890625, "rewards/reward_reference/std": 0.31272050738334656, "step": 302 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.46875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4066.0, "completions/mean_length": 3348.5390625, "completions/mean_terminated_length": 3295.372314453125, "completions/min_length": 2140.0, "completions/min_terminated_length": 2140.0, "epoch": 0.06464, "frac_reward_zero_std": 0.0, "grad_norm": 0.08387938141822815, "learning_rate": 1e-06, "loss": -0.025, "num_tokens": 219845850.0, "reward": 2.6532342433929443, "reward_std": 0.11601746082305908, "rewards/cosine_scaled_reward/mean": 0.8620700836181641, "rewards/cosine_scaled_reward/std": 0.33811599016189575, "rewards/repetition_penalty_reward/mean": -0.14711710810661316, "rewards/repetition_penalty_reward/std": 0.0417744405567646, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.94140625, "rewards/reward_reference/std": 0.23532284796237946, "step": 303 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 3422.9453125, "completions/mean_terminated_length": 3323.345458984375, "completions/min_length": 2324.0, "completions/min_terminated_length": 2324.0, "epoch": 0.06485333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.1095823347568512, "learning_rate": 1e-06, "loss": -0.0295, "num_tokens": 220753264.0, "reward": 2.5606091022491455, "reward_std": 0.15420199930667877, "rewards/cosine_scaled_reward/mean": 0.8168035745620728, "rewards/cosine_scaled_reward/std": 0.4226503074169159, "rewards/repetition_penalty_reward/mean": -0.15619438886642456, "rewards/repetition_penalty_reward/std": 0.04338504374027252, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.90625, "rewards/reward_reference/std": 0.2920515835285187, "step": 304 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.59375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 3535.171875, "completions/mean_terminated_length": 3415.564208984375, "completions/min_length": 2551.0, "completions/min_terminated_length": 2551.0, "epoch": 0.06506666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.10752750188112259, "learning_rate": 1e-06, "loss": -0.0545, "num_tokens": 221632156.0, "reward": 2.4782791137695312, "reward_std": 0.2294086217880249, "rewards/cosine_scaled_reward/mean": 0.7806603908538818, "rewards/cosine_scaled_reward/std": 0.48239925503730774, "rewards/repetition_penalty_reward/mean": -0.17113137245178223, "rewards/repetition_penalty_reward/std": 0.04546995088458061, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.875, "rewards/reward_reference/std": 0.33136674761772156, "step": 305 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4090.0, "completions/mean_length": 3519.09375, "completions/mean_terminated_length": 3338.625732421875, "completions/min_length": 2358.0, "completions/min_terminated_length": 2358.0, "epoch": 0.06528, "frac_reward_zero_std": 0.0, "grad_norm": 0.2055405229330063, "learning_rate": 1e-06, "loss": -0.1014, "num_tokens": 222445968.0, "reward": 2.464463472366333, "reward_std": 0.27007126808166504, "rewards/cosine_scaled_reward/mean": 0.7708021402359009, "rewards/cosine_scaled_reward/std": 0.4875037968158722, "rewards/repetition_penalty_reward/mean": -0.17118236422538757, "rewards/repetition_penalty_reward/std": 0.05047919228672981, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.87109375, "rewards/reward_reference/std": 0.33575257658958435, "step": 306 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 3404.953125, "completions/mean_terminated_length": 3299.1171875, "completions/min_length": 2167.0, "completions/min_terminated_length": 2167.0, "epoch": 0.06549333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.0901985913515091, "learning_rate": 1e-06, "loss": -0.044, "num_tokens": 223336840.0, "reward": 2.4638819694519043, "reward_std": 0.20657259225845337, "rewards/cosine_scaled_reward/mean": 0.7703253030776978, "rewards/cosine_scaled_reward/std": 0.4742421805858612, "rewards/repetition_penalty_reward/mean": -0.1650371104478836, "rewards/repetition_penalty_reward/std": 0.04591949284076691, "rewards/reward_format/mean": 0.9874999523162842, "rewards/reward_format/std": 0.0994100272655487, "rewards/reward_reference/mean": 0.87109375, "rewards/reward_reference/std": 0.33575257658958435, "step": 307 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.3125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4050.0, "completions/mean_length": 3365.19140625, "completions/mean_terminated_length": 3296.483154296875, "completions/min_length": 2215.0, "completions/min_terminated_length": 2215.0, "epoch": 0.06570666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.11064548045396805, "learning_rate": 1e-06, "loss": -0.0274, "num_tokens": 224260045.0, "reward": 2.606769561767578, "reward_std": 0.12193898856639862, "rewards/cosine_scaled_reward/mean": 0.8420050740242004, "rewards/cosine_scaled_reward/std": 0.3759949207305908, "rewards/repetition_penalty_reward/mean": -0.16101685166358948, "rewards/repetition_penalty_reward/std": 0.05338205397129059, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.92578125, "rewards/reward_reference/std": 0.2626400291919708, "step": 308 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.53125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4080.0, "completions/mean_length": 3431.328125, "completions/mean_terminated_length": 3281.8564453125, "completions/min_length": 2136.0, "completions/min_terminated_length": 2136.0, "epoch": 0.06592, "frac_reward_zero_std": 0.0, "grad_norm": 0.14071083068847656, "learning_rate": 1e-06, "loss": -0.0582, "num_tokens": 225100965.0, "reward": 2.4305174350738525, "reward_std": 0.22230038046836853, "rewards/cosine_scaled_reward/mean": 0.7429267764091492, "rewards/cosine_scaled_reward/std": 0.5048913955688477, "rewards/repetition_penalty_reward/mean": -0.1639719009399414, "rewards/repetition_penalty_reward/std": 0.047285351902246475, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.8515625, "rewards/reward_reference/std": 0.3562295734882355, "step": 309 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 3377.625, "completions/mean_terminated_length": 3278.64892578125, "completions/min_length": 2386.0, "completions/min_terminated_length": 2386.0, "epoch": 0.06613333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.08242145925760269, "learning_rate": 1e-06, "loss": -0.0368, "num_tokens": 226001273.0, "reward": 2.520883560180664, "reward_std": 0.16118432581424713, "rewards/cosine_scaled_reward/mean": 0.7903509736061096, "rewards/cosine_scaled_reward/std": 0.4486011564731598, "rewards/repetition_penalty_reward/mean": -0.15306127071380615, "rewards/repetition_penalty_reward/std": 0.04898226633667946, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.88671875, "rewards/reward_reference/std": 0.31755712628364563, "step": 310 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4053.0, "completions/mean_length": 3350.13671875, "completions/mean_terminated_length": 3232.013671875, "completions/min_length": 2259.0, "completions/min_terminated_length": 2259.0, "epoch": 0.06634666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.1643557995557785, "learning_rate": 1e-06, "loss": -0.05, "num_tokens": 226871648.0, "reward": 2.4743309020996094, "reward_std": 0.24664385616779327, "rewards/cosine_scaled_reward/mean": 0.7613555192947388, "rewards/cosine_scaled_reward/std": 0.47476089000701904, "rewards/repetition_penalty_reward/mean": -0.15811839699745178, "rewards/repetition_penalty_reward/std": 0.054658714681863785, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.87109375, "rewards/reward_reference/std": 0.33575257658958435, "step": 311 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4093.0, "completions/mean_length": 3321.578125, "completions/mean_terminated_length": 3198.932373046875, "completions/min_length": 2180.0, "completions/min_terminated_length": 2180.0, "epoch": 0.06656, "frac_reward_zero_std": 0.0, "grad_norm": 0.09745794534683228, "learning_rate": 1e-06, "loss": -0.0309, "num_tokens": 227734296.0, "reward": 2.3746564388275146, "reward_std": 0.18470577895641327, "rewards/cosine_scaled_reward/mean": 0.6990936398506165, "rewards/cosine_scaled_reward/std": 0.5295417308807373, "rewards/repetition_penalty_reward/mean": -0.15021856129169464, "rewards/repetition_penalty_reward/std": 0.05101132020354271, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.83203125, "rewards/reward_reference/std": 0.3745708465576172, "step": 312 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4064.0, "completions/mean_length": 3381.27734375, "completions/mean_terminated_length": 3271.8154296875, "completions/min_length": 2172.0, "completions/min_terminated_length": 2172.0, "epoch": 0.06677333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.06862661987543106, "learning_rate": 1e-06, "loss": -0.0099, "num_tokens": 228628475.0, "reward": 2.5650148391723633, "reward_std": 0.10304483771324158, "rewards/cosine_scaled_reward/mean": 0.8104842901229858, "rewards/cosine_scaled_reward/std": 0.4223276972770691, "rewards/repetition_penalty_reward/mean": -0.14546939730644226, "rewards/repetition_penalty_reward/std": 0.04435879364609718, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.90625, "rewards/reward_reference/std": 0.2920515835285187, "step": 313 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.71875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4072.0, "completions/mean_length": 3382.66015625, "completions/mean_terminated_length": 3246.6279296875, "completions/min_length": 2071.0, "completions/min_terminated_length": 2071.0, "epoch": 0.06698666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.1515108048915863, "learning_rate": 1e-06, "loss": -0.0663, "num_tokens": 229494192.0, "reward": 2.4176552295684814, "reward_std": 0.323119193315506, "rewards/cosine_scaled_reward/mean": 0.7321640849113464, "rewards/cosine_scaled_reward/std": 0.5087533593177795, "rewards/repetition_penalty_reward/mean": -0.14654016494750977, "rewards/repetition_penalty_reward/std": 0.04701191931962967, "rewards/reward_format/mean": 0.984375, "rewards/reward_format/std": 0.11092304438352585, "rewards/reward_reference/mean": 0.84765625, "rewards/reward_reference/std": 0.3600577116012573, "step": 314 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.46875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 3217.7421875, "completions/mean_terminated_length": 3155.27197265625, "completions/min_length": 2127.0, "completions/min_terminated_length": 2127.0, "epoch": 0.0672, "frac_reward_zero_std": 0.0, "grad_norm": 0.08618131279945374, "learning_rate": 1e-06, "loss": -0.0266, "num_tokens": 230404190.0, "reward": 2.642916202545166, "reward_std": 0.12977439165115356, "rewards/cosine_scaled_reward/mean": 0.8511161804199219, "rewards/cosine_scaled_reward/std": 0.3265201151371002, "rewards/repetition_penalty_reward/mean": -0.143356055021286, "rewards/repetition_penalty_reward/std": 0.046873416751623154, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.94140625, "rewards/reward_reference/std": 0.23532284796237946, "step": 315 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.34375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4093.0, "completions/mean_length": 3232.3515625, "completions/mean_terminated_length": 3155.17431640625, "completions/min_length": 2194.0, "completions/min_terminated_length": 2194.0, "epoch": 0.06741333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.0868743509054184, "learning_rate": 1e-06, "loss": -0.0107, "num_tokens": 231304712.0, "reward": 2.633847236633301, "reward_std": 0.17269165813922882, "rewards/cosine_scaled_reward/mean": 0.8517568707466125, "rewards/cosine_scaled_reward/std": 0.32880187034606934, "rewards/repetition_penalty_reward/mean": -0.14994081854820251, "rewards/repetition_penalty_reward/std": 0.0538417249917984, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626226335763931, "rewards/reward_reference/mean": 0.94140625, "rewards/reward_reference/std": 0.23532284796237946, "step": 316 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.84375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 3367.59765625, "completions/mean_terminated_length": 3244.5341796875, "completions/min_length": 2088.0, "completions/min_terminated_length": 2088.0, "epoch": 0.06762666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.11475294828414917, "learning_rate": 1e-06, "loss": -0.0357, "num_tokens": 232173373.0, "reward": 2.503598213195801, "reward_std": 0.21392254531383514, "rewards/cosine_scaled_reward/mean": 0.7742725610733032, "rewards/cosine_scaled_reward/std": 0.4625420868396759, "rewards/repetition_penalty_reward/mean": -0.14333048462867737, "rewards/repetition_penalty_reward/std": 0.04298888146877289, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.87890625, "rewards/reward_reference/std": 0.3268752694129944, "step": 317 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.8125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4039.0, "completions/mean_length": 3266.171875, "completions/mean_terminated_length": 3121.522705078125, "completions/min_length": 1969.0, "completions/min_terminated_length": 1969.0, "epoch": 0.06784, "frac_reward_zero_std": 0.0, "grad_norm": 0.16944526135921478, "learning_rate": 1e-06, "loss": -0.0526, "num_tokens": 233010949.0, "reward": 2.3641538619995117, "reward_std": 0.34667205810546875, "rewards/cosine_scaled_reward/mean": 0.6916285753250122, "rewards/cosine_scaled_reward/std": 0.5271083116531372, "rewards/repetition_penalty_reward/mean": -0.14934960007667542, "rewards/repetition_penalty_reward/std": 0.047472234815359116, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.828125, "rewards/reward_reference/std": 0.3780108094215393, "step": 318 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.65625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 3283.04296875, "completions/mean_terminated_length": 3118.925048828125, "completions/min_length": 2031.0, "completions/min_terminated_length": 2031.0, "epoch": 0.06805333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.16193901002407074, "learning_rate": 1e-06, "loss": -0.0488, "num_tokens": 233842544.0, "reward": 2.514294147491455, "reward_std": 0.23703011870384216, "rewards/cosine_scaled_reward/mean": 0.7864575386047363, "rewards/cosine_scaled_reward/std": 0.42908984422683716, "rewards/repetition_penalty_reward/mean": -0.15497568249702454, "rewards/repetition_penalty_reward/std": 0.05239295959472656, "rewards/reward_format/mean": 0.984375, "rewards/reward_format/std": 0.11092304438352585, "rewards/reward_reference/mean": 0.8984375, "rewards/reward_reference/std": 0.3026638329029083, "step": 319 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.4375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 3214.52734375, "completions/mean_terminated_length": 3147.861572265625, "completions/min_length": 2131.0, "completions/min_terminated_length": 2131.0, "epoch": 0.06826666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.1349935382604599, "learning_rate": 1e-06, "loss": -0.0489, "num_tokens": 234753935.0, "reward": 2.569523334503174, "reward_std": 0.24764062464237213, "rewards/cosine_scaled_reward/mean": 0.8103736042976379, "rewards/cosine_scaled_reward/std": 0.39255258440971375, "rewards/repetition_penalty_reward/mean": -0.15491291880607605, "rewards/repetition_penalty_reward/std": 0.04374931380152702, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9140625, "rewards/reward_reference/std": 0.28082075715065, "step": 320 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.21875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4049.0, "completions/mean_length": 3301.375, "completions/mean_terminated_length": 3215.376708984375, "completions/min_length": 1930.0, "completions/min_terminated_length": 1930.0, "epoch": 0.06848, "frac_reward_zero_std": 0.0, "grad_norm": 0.11944566667079926, "learning_rate": 1e-06, "loss": -0.0181, "num_tokens": 235665967.0, "reward": 2.621381998062134, "reward_std": 0.19357608258724213, "rewards/cosine_scaled_reward/mean": 0.8457991480827332, "rewards/cosine_scaled_reward/std": 0.35158571600914, "rewards/repetition_penalty_reward/mean": -0.16191723942756653, "rewards/repetition_penalty_reward/std": 0.051886800676584244, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9375, "rewards/reward_reference/std": 0.24253563582897186, "step": 321 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4067.0, "completions/mean_length": 3223.7109375, "completions/mean_terminated_length": 3107.92041015625, "completions/min_length": 1928.0, "completions/min_terminated_length": 1928.0, "epoch": 0.06869333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.12029201537370682, "learning_rate": 1e-06, "loss": -0.0333, "num_tokens": 236527273.0, "reward": 2.5427041053771973, "reward_std": 0.19083553552627563, "rewards/cosine_scaled_reward/mean": 0.7951414585113525, "rewards/cosine_scaled_reward/std": 0.41195037961006165, "rewards/repetition_penalty_reward/mean": -0.15478110313415527, "rewards/repetition_penalty_reward/std": 0.04209055379033089, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.90234375, "rewards/reward_reference/std": 0.29743078351020813, "step": 322 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.15625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4068.0, "completions/mean_length": 3223.125, "completions/mean_terminated_length": 3120.209716796875, "completions/min_length": 1631.0, "completions/min_terminated_length": 1631.0, "epoch": 0.06890666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.10623487830162048, "learning_rate": 1e-06, "loss": -0.021, "num_tokens": 237403709.0, "reward": 2.5744786262512207, "reward_std": 0.15794894099235535, "rewards/cosine_scaled_reward/mean": 0.8062847256660461, "rewards/cosine_scaled_reward/std": 0.39761340618133545, "rewards/repetition_penalty_reward/mean": -0.1419624239206314, "rewards/repetition_penalty_reward/std": 0.045276276767253876, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.91015625, "rewards/reward_reference/std": 0.2865179479122162, "step": 323 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.59375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4056.0, "completions/mean_length": 3123.16796875, "completions/mean_terminated_length": 3071.123291015625, "completions/min_length": 1944.0, "completions/min_terminated_length": 1944.0, "epoch": 0.06912, "frac_reward_zero_std": 0.0, "grad_norm": 0.0978594645857811, "learning_rate": 1e-06, "loss": -0.0269, "num_tokens": 238315692.0, "reward": 2.6401233673095703, "reward_std": 0.13173067569732666, "rewards/cosine_scaled_reward/mean": 0.8438149690628052, "rewards/cosine_scaled_reward/std": 0.3169075548648834, "rewards/repetition_penalty_reward/mean": -0.14900407195091248, "rewards/repetition_penalty_reward/std": 0.044161971658468246, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9453125, "rewards/reward_reference/std": 0.22781464457511902, "step": 324 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.1875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 3193.15625, "completions/mean_terminated_length": 3091.095458984375, "completions/min_length": 1871.0, "completions/min_terminated_length": 1871.0, "epoch": 0.06933333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.1280767023563385, "learning_rate": 1e-06, "loss": -0.0333, "num_tokens": 239186876.0, "reward": 2.480194568634033, "reward_std": 0.1864958554506302, "rewards/cosine_scaled_reward/mean": 0.748255729675293, "rewards/cosine_scaled_reward/std": 0.46193239092826843, "rewards/repetition_penalty_reward/mean": -0.14306114614009857, "rewards/repetition_penalty_reward/std": 0.0422794334590435, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.875, "rewards/reward_reference/std": 0.33136674761772156, "step": 325 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 3207.65625, "completions/mean_terminated_length": 3089.734619140625, "completions/min_length": 1846.0, "completions/min_terminated_length": 1846.0, "epoch": 0.06954666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.10687022656202316, "learning_rate": 1e-06, "loss": -0.0097, "num_tokens": 240050532.0, "reward": 2.498737335205078, "reward_std": 0.19235044717788696, "rewards/cosine_scaled_reward/mean": 0.7589946389198303, "rewards/cosine_scaled_reward/std": 0.4520692229270935, "rewards/repetition_penalty_reward/mean": -0.13916371762752533, "rewards/repetition_penalty_reward/std": 0.04271097108721733, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.87890625, "rewards/reward_reference/std": 0.3268752694129944, "step": 326 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.28125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 3233.9765625, "completions/mean_terminated_length": 3148.884033203125, "completions/min_length": 2025.0, "completions/min_terminated_length": 2025.0, "epoch": 0.06976, "frac_reward_zero_std": 0.0, "grad_norm": 0.08872903883457184, "learning_rate": 1e-06, "loss": -0.04, "num_tokens": 240950130.0, "reward": 2.476565361022949, "reward_std": 0.15382640063762665, "rewards/cosine_scaled_reward/mean": 0.7487243413925171, "rewards/cosine_scaled_reward/std": 0.4689893126487732, "rewards/repetition_penalty_reward/mean": -0.14012770354747772, "rewards/repetition_penalty_reward/std": 0.03780807927250862, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.87109375, "rewards/reward_reference/std": 0.33575257658958435, "step": 327 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.5625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4078.0, "completions/mean_length": 3089.98828125, "completions/mean_terminated_length": 3031.7890625, "completions/min_length": 1994.0, "completions/min_terminated_length": 1994.0, "epoch": 0.06997333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.131632000207901, "learning_rate": 1e-06, "loss": -0.008, "num_tokens": 241849251.0, "reward": 2.537092685699463, "reward_std": 0.14982327818870544, "rewards/cosine_scaled_reward/mean": 0.784229040145874, "rewards/cosine_scaled_reward/std": 0.4037418067455292, "rewards/repetition_penalty_reward/mean": -0.14010494947433472, "rewards/repetition_penalty_reward/std": 0.04123867303133011, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626226335763931, "rewards/reward_reference/mean": 0.90234375, "rewards/reward_reference/std": 0.29743078351020813, "step": 328 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.6875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 2925.15625, "completions/mean_terminated_length": 2877.560791015625, "completions/min_length": 1770.0, "completions/min_terminated_length": 1770.0, "epoch": 0.07018666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.10088398307561874, "learning_rate": 1e-06, "loss": -0.0143, "num_tokens": 242712703.0, "reward": 2.5291996002197266, "reward_std": 0.13664719462394714, "rewards/cosine_scaled_reward/mean": 0.7678295969963074, "rewards/cosine_scaled_reward/std": 0.38781559467315674, "rewards/repetition_penalty_reward/mean": -0.13238000869750977, "rewards/repetition_penalty_reward/std": 0.041910383850336075, "rewards/reward_format/mean": 0.987500011920929, "rewards/reward_format/std": 0.0994100272655487, "rewards/reward_reference/mean": 0.90625, "rewards/reward_reference/std": 0.2920515835285187, "step": 329 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.46875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4075.0, "completions/mean_length": 3088.9453125, "completions/mean_terminated_length": 3017.313720703125, "completions/min_length": 1751.0, "completions/min_terminated_length": 1751.0, "epoch": 0.0704, "frac_reward_zero_std": 0.0, "grad_norm": 0.1253613382577896, "learning_rate": 1e-06, "loss": -0.0298, "num_tokens": 243592617.0, "reward": 2.6539814472198486, "reward_std": 0.12508264183998108, "rewards/cosine_scaled_reward/mean": 0.8431792855262756, "rewards/cosine_scaled_reward/std": 0.3038645088672638, "rewards/repetition_penalty_reward/mean": -0.13294783234596252, "rewards/repetition_penalty_reward/std": 0.044496990740299225, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626226335763931, "rewards/reward_reference/mean": 0.953125, "rewards/reward_reference/std": 0.21178513765335083, "step": 330 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.59375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4029.0, "completions/mean_length": 3092.9296875, "completions/mean_terminated_length": 3039.267333984375, "completions/min_length": 1928.0, "completions/min_terminated_length": 1928.0, "epoch": 0.07061333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.1407911628484726, "learning_rate": 1e-06, "loss": -0.0164, "num_tokens": 244490035.0, "reward": 2.546139717102051, "reward_std": 0.24451303482055664, "rewards/cosine_scaled_reward/mean": 0.7802634239196777, "rewards/cosine_scaled_reward/std": 0.40869632363319397, "rewards/repetition_penalty_reward/mean": -0.13646754622459412, "rewards/repetition_penalty_reward/std": 0.03807799518108368, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.90234375, "rewards/reward_reference/std": 0.29743078351020813, "step": 331 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.78125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4026.0, "completions/mean_length": 3078.32421875, "completions/mean_terminated_length": 3049.71484375, "completions/min_length": 2056.0, "completions/min_terminated_length": 2056.0, "epoch": 0.07082666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.09870472550392151, "learning_rate": 1e-06, "loss": -0.0263, "num_tokens": 245405750.0, "reward": 2.5187315940856934, "reward_std": 0.11564701050519943, "rewards/cosine_scaled_reward/mean": 0.7655521035194397, "rewards/cosine_scaled_reward/std": 0.4269881546497345, "rewards/repetition_penalty_reward/mean": -0.1335393190383911, "rewards/repetition_penalty_reward/std": 0.040202755481004715, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.88671875, "rewards/reward_reference/std": 0.31755712628364563, "step": 332 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 3170.1171875, "completions/mean_terminated_length": 3051.83251953125, "completions/min_length": 1803.0, "completions/min_terminated_length": 1803.0, "epoch": 0.07104, "frac_reward_zero_std": 0.0, "grad_norm": 0.13158267736434937, "learning_rate": 1e-06, "loss": -0.021, "num_tokens": 246258068.0, "reward": 2.388303756713867, "reward_std": 0.20445595681667328, "rewards/cosine_scaled_reward/mean": 0.6967047452926636, "rewards/cosine_scaled_reward/std": 0.50649094581604, "rewards/repetition_penalty_reward/mean": -0.13730722665786743, "rewards/repetition_penalty_reward/std": 0.04322898015379906, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.83203125, "rewards/reward_reference/std": 0.3745708465576172, "step": 333 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.5, "completions/max_length": 4096.0, "completions/max_terminated_length": 4076.0, "completions/mean_length": 2926.36328125, "completions/mean_terminated_length": 2848.3876953125, "completions/min_length": 1722.0, "completions/min_terminated_length": 1722.0, "epoch": 0.07125333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.13790081441402435, "learning_rate": 1e-06, "loss": -0.0391, "num_tokens": 247097269.0, "reward": 2.519500732421875, "reward_std": 0.17879009246826172, "rewards/cosine_scaled_reward/mean": 0.7537988424301147, "rewards/cosine_scaled_reward/std": 0.4042138457298279, "rewards/repetition_penalty_reward/mean": -0.13039204478263855, "rewards/repetition_penalty_reward/std": 0.04133962467312813, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.90234375, "rewards/reward_reference/std": 0.29743078351020813, "step": 334 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.40625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4079.0, "completions/mean_length": 3129.53125, "completions/mean_terminated_length": 3052.050537109375, "completions/min_length": 1994.0, "completions/min_terminated_length": 1994.0, "epoch": 0.07146666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.1301202028989792, "learning_rate": 1e-06, "loss": -0.0415, "num_tokens": 247987881.0, "reward": 2.525852918624878, "reward_std": 0.20776943862438202, "rewards/cosine_scaled_reward/mean": 0.772997260093689, "rewards/cosine_scaled_reward/std": 0.424231618642807, "rewards/repetition_penalty_reward/mean": -0.13542558252811432, "rewards/repetition_penalty_reward/std": 0.03885306045413017, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.89453125, "rewards/reward_reference/std": 0.3077581524848938, "step": 335 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.4375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 3204.3828125, "completions/mean_terminated_length": 3136.94970703125, "completions/min_length": 1910.0, "completions/min_terminated_length": 1910.0, "epoch": 0.07168, "frac_reward_zero_std": 0.0, "grad_norm": 0.14549526572227478, "learning_rate": 1e-06, "loss": -0.0383, "num_tokens": 248902923.0, "reward": 2.6701064109802246, "reward_std": 0.16786515712738037, "rewards/cosine_scaled_reward/mean": 0.8573368787765503, "rewards/cosine_scaled_reward/std": 0.31324511766433716, "rewards/repetition_penalty_reward/mean": -0.1333240270614624, "rewards/repetition_penalty_reward/std": 0.03887150436639786, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.94921875, "rewards/reward_reference/std": 0.21998079121112823, "step": 336 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4078.0, "completions/mean_length": 3134.62890625, "completions/mean_terminated_length": 3053.15673828125, "completions/min_length": 1814.0, "completions/min_terminated_length": 1814.0, "epoch": 0.07189333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.114342100918293, "learning_rate": 1e-06, "loss": -0.0226, "num_tokens": 249785728.0, "reward": 2.490480422973633, "reward_std": 0.15718211233615875, "rewards/cosine_scaled_reward/mean": 0.7543493509292603, "rewards/cosine_scaled_reward/std": 0.44703423976898193, "rewards/repetition_penalty_reward/mean": -0.13652509450912476, "rewards/repetition_penalty_reward/std": 0.04738989472389221, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718994140625, "rewards/reward_reference/mean": 0.87890625, "rewards/reward_reference/std": 0.3268752694129944, "step": 337 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.4375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4051.0, "completions/mean_length": 3048.6015625, "completions/mean_terminated_length": 2969.38671875, "completions/min_length": 1985.0, "completions/min_terminated_length": 1985.0, "epoch": 0.07210666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.14780311286449432, "learning_rate": 1e-06, "loss": 0.0154, "num_tokens": 250653090.0, "reward": 2.460996150970459, "reward_std": 0.24623346328735352, "rewards/cosine_scaled_reward/mean": 0.7283567786216736, "rewards/cosine_scaled_reward/std": 0.45921623706817627, "rewards/repetition_penalty_reward/mean": -0.13142311573028564, "rewards/repetition_penalty_reward/std": 0.0422639399766922, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.8671875, "rewards/reward_reference/std": 0.3400367796421051, "step": 338 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.5625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4073.0, "completions/mean_length": 3052.28125, "completions/mean_terminated_length": 2991.900634765625, "completions/min_length": 1768.0, "completions/min_terminated_length": 1768.0, "epoch": 0.07232, "frac_reward_zero_std": 0.0, "grad_norm": 0.15734364092350006, "learning_rate": 1e-06, "loss": -0.0126, "num_tokens": 251538318.0, "reward": 2.5594005584716797, "reward_std": 0.19011801481246948, "rewards/cosine_scaled_reward/mean": 0.7857754826545715, "rewards/cosine_scaled_reward/std": 0.39223918318748474, "rewards/repetition_penalty_reward/mean": -0.13028106093406677, "rewards/repetition_penalty_reward/std": 0.03792421892285347, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.91015625, "rewards/reward_reference/std": 0.2865179479122162, "step": 339 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.40625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 3079.796875, "completions/mean_terminated_length": 2998.328857421875, "completions/min_length": 1895.0, "completions/min_terminated_length": 1895.0, "epoch": 0.07253333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.17438679933547974, "learning_rate": 1e-06, "loss": -0.0151, "num_tokens": 252415094.0, "reward": 2.578650951385498, "reward_std": 0.22237978875637054, "rewards/cosine_scaled_reward/mean": 0.7927536368370056, "rewards/cosine_scaled_reward/std": 0.3868005573749542, "rewards/repetition_penalty_reward/mean": -0.1320713758468628, "rewards/repetition_penalty_reward/std": 0.04136000573635101, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.91796875, "rewards/reward_reference/std": 0.2749498784542084, "step": 340 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.34375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4073.0, "completions/mean_length": 3052.14453125, "completions/mean_terminated_length": 2958.86376953125, "completions/min_length": 1826.0, "completions/min_terminated_length": 1826.0, "epoch": 0.07274666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.13218127191066742, "learning_rate": 1e-06, "loss": -0.03, "num_tokens": 253276631.0, "reward": 2.4949088096618652, "reward_std": 0.22383946180343628, "rewards/cosine_scaled_reward/mean": 0.7475392818450928, "rewards/cosine_scaled_reward/std": 0.4382774531841278, "rewards/repetition_penalty_reward/mean": -0.12841179966926575, "rewards/repetition_penalty_reward/std": 0.036752402782440186, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.87890625, "rewards/reward_reference/std": 0.3268752694129944, "step": 341 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.28125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4034.0, "completions/mean_length": 3101.42578125, "completions/mean_terminated_length": 3003.2490234375, "completions/min_length": 1932.0, "completions/min_terminated_length": 1932.0, "epoch": 0.07296, "frac_reward_zero_std": 0.0, "grad_norm": 0.16702738404273987, "learning_rate": 1e-06, "loss": -0.0343, "num_tokens": 254126232.0, "reward": 2.5561437606811523, "reward_std": 0.23330280184745789, "rewards/cosine_scaled_reward/mean": 0.7809128761291504, "rewards/cosine_scaled_reward/std": 0.40656355023384094, "rewards/repetition_penalty_reward/mean": -0.12789398431777954, "rewards/repetition_penalty_reward/std": 0.036753393709659576, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.90625, "rewards/reward_reference/std": 0.2920515835285187, "step": 342 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.40625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4090.0, "completions/mean_length": 3152.01953125, "completions/mean_terminated_length": 3076.341552734375, "completions/min_length": 1919.0, "completions/min_terminated_length": 1919.0, "epoch": 0.07317333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.0745634138584137, "learning_rate": 1e-06, "loss": -0.0119, "num_tokens": 255025093.0, "reward": 2.526113748550415, "reward_std": 0.09965360909700394, "rewards/cosine_scaled_reward/mean": 0.7831565737724304, "rewards/cosine_scaled_reward/std": 0.41442909836769104, "rewards/repetition_penalty_reward/mean": -0.1328241229057312, "rewards/repetition_penalty_reward/std": 0.053901638835668564, "rewards/reward_format/mean": 0.9812500476837158, "rewards/reward_format/std": 0.12126781791448593, "rewards/reward_reference/mean": 0.89453125, "rewards/reward_reference/std": 0.3077581524848938, "step": 343 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.53125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4067.0, "completions/mean_length": 3020.8671875, "completions/mean_terminated_length": 2953.950439453125, "completions/min_length": 1875.0, "completions/min_terminated_length": 1875.0, "epoch": 0.07338666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.11070467531681061, "learning_rate": 1e-06, "loss": -0.0319, "num_tokens": 255898239.0, "reward": 2.5960850715637207, "reward_std": 0.16182690858840942, "rewards/cosine_scaled_reward/mean": 0.806800127029419, "rewards/cosine_scaled_reward/std": 0.3528696298599243, "rewards/repetition_penalty_reward/mean": -0.13024629652500153, "rewards/repetition_penalty_reward/std": 0.03998575359582901, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.92578125, "rewards/reward_reference/std": 0.2626400291919708, "step": 344 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.1875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4046.0, "completions/mean_length": 3150.12890625, "completions/mean_terminated_length": 3043.2041015625, "completions/min_length": 1923.0, "completions/min_terminated_length": 1923.0, "epoch": 0.0736, "frac_reward_zero_std": 0.0, "grad_norm": 0.1139230728149414, "learning_rate": 1e-06, "loss": -0.0359, "num_tokens": 256756796.0, "reward": 2.490581512451172, "reward_std": 0.23320190608501434, "rewards/cosine_scaled_reward/mean": 0.751013994216919, "rewards/cosine_scaled_reward/std": 0.45080894231796265, "rewards/repetition_penalty_reward/mean": -0.13699480891227722, "rewards/repetition_penalty_reward/std": 0.047232817858457565, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.8828125, "rewards/reward_reference/std": 0.3222736418247223, "step": 345 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.15625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4083.0, "completions/mean_length": 3135.7890625, "completions/mean_terminated_length": 3022.576416015625, "completions/min_length": 1962.0, "completions/min_terminated_length": 1962.0, "epoch": 0.07381333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.13618801534175873, "learning_rate": 1e-06, "loss": -0.0365, "num_tokens": 257608746.0, "reward": 2.4578542709350586, "reward_std": 0.18854659795761108, "rewards/cosine_scaled_reward/mean": 0.7333296537399292, "rewards/cosine_scaled_reward/std": 0.4674873352050781, "rewards/repetition_penalty_reward/mean": -0.13641303777694702, "rewards/repetition_penalty_reward/std": 0.04397253319621086, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.8671875, "rewards/reward_reference/std": 0.3400367796421051, "step": 346 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.1875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 3082.71484375, "completions/mean_terminated_length": 2968.16943359375, "completions/min_length": 1786.0, "completions/min_terminated_length": 1786.0, "epoch": 0.07402666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.17331695556640625, "learning_rate": 1e-06, "loss": -0.0341, "num_tokens": 258451761.0, "reward": 2.4771933555603027, "reward_std": 0.28758424520492554, "rewards/cosine_scaled_reward/mean": 0.7422814965248108, "rewards/cosine_scaled_reward/std": 0.44633299112319946, "rewards/repetition_penalty_reward/mean": -0.13774433732032776, "rewards/repetition_penalty_reward/std": 0.03738153725862503, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.87890625, "rewards/reward_reference/std": 0.3268752694129944, "step": 347 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4063.0, "completions/mean_length": 3093.9140625, "completions/mean_terminated_length": 2970.850830078125, "completions/min_length": 1671.0, "completions/min_terminated_length": 1671.0, "epoch": 0.07424, "frac_reward_zero_std": 0.0, "grad_norm": 0.1594753861427307, "learning_rate": 1e-06, "loss": -0.0315, "num_tokens": 259287891.0, "reward": 2.4308295249938965, "reward_std": 0.21208155155181885, "rewards/cosine_scaled_reward/mean": 0.7279205322265625, "rewards/cosine_scaled_reward/std": 0.4628751277923584, "rewards/repetition_penalty_reward/mean": -0.14005976915359497, "rewards/repetition_penalty_reward/std": 0.046011701226234436, "rewards/reward_format/mean": 0.9718749523162842, "rewards/reward_format/std": 0.1476283222436905, "rewards/reward_reference/mean": 0.87109375, "rewards/reward_reference/std": 0.33575257658958435, "step": 348 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.3125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 3122.5234375, "completions/mean_terminated_length": 3031.000244140625, "completions/min_length": 1635.0, "completions/min_terminated_length": 1635.0, "epoch": 0.07445333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.1562374234199524, "learning_rate": 1e-06, "loss": 0.0133, "num_tokens": 260165049.0, "reward": 2.5966033935546875, "reward_std": 0.22522631287574768, "rewards/cosine_scaled_reward/mean": 0.8105063438415527, "rewards/cosine_scaled_reward/std": 0.3691033720970154, "rewards/repetition_penalty_reward/mean": -0.13187175989151, "rewards/repetition_penalty_reward/std": 0.03818640485405922, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.91796875, "rewards/reward_reference/std": 0.2749498784542084, "step": 349 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.21875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4065.0, "completions/mean_length": 3308.6328125, "completions/mean_terminated_length": 3223.419921875, "completions/min_length": 2104.0, "completions/min_terminated_length": 2104.0, "epoch": 0.07466666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.13112296164035797, "learning_rate": 1e-06, "loss": 0.0019, "num_tokens": 261085359.0, "reward": 2.4156880378723145, "reward_std": 0.11315947771072388, "rewards/cosine_scaled_reward/mean": 0.711700975894928, "rewards/cosine_scaled_reward/std": 0.5184807181358337, "rewards/repetition_penalty_reward/mean": -0.12882539629936218, "rewards/repetition_penalty_reward/std": 0.03247005119919777, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.8359375, "rewards/reward_reference/std": 0.3710577189922333, "step": 350 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.25, "completions/max_length": 4096.0, "completions/max_terminated_length": 4079.0, "completions/mean_length": 3232.9296875, "completions/mean_terminated_length": 3143.646484375, "completions/min_length": 1802.0, "completions/min_terminated_length": 1802.0, "epoch": 0.07488, "frac_reward_zero_std": 0.0, "grad_norm": 0.18751507997512817, "learning_rate": 1e-06, "loss": -0.0467, "num_tokens": 261980705.0, "reward": 2.4058680534362793, "reward_std": 0.25372546911239624, "rewards/cosine_scaled_reward/mean": 0.7078858017921448, "rewards/cosine_scaled_reward/std": 0.5087409019470215, "rewards/repetition_penalty_reward/mean": -0.13873633742332458, "rewards/repetition_penalty_reward/std": 0.04568910971283913, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.83984375, "rewards/reward_reference/std": 0.36746934056282043, "step": 351 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.1875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 3219.00390625, "completions/mean_terminated_length": 3119.864990234375, "completions/min_length": 1868.0, "completions/min_terminated_length": 1868.0, "epoch": 0.07509333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.09587662667036057, "learning_rate": 1e-06, "loss": -0.0318, "num_tokens": 262862994.0, "reward": 2.5543429851531982, "reward_std": 0.1828855276107788, "rewards/cosine_scaled_reward/mean": 0.7893514633178711, "rewards/cosine_scaled_reward/std": 0.4200114607810974, "rewards/repetition_penalty_reward/mean": -0.12719596922397614, "rewards/repetition_penalty_reward/std": 0.042319945991039276, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.8984375, "rewards/reward_reference/std": 0.3026638329029083, "step": 352 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.1875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4078.0, "completions/mean_length": 3099.60546875, "completions/mean_terminated_length": 2986.969482421875, "completions/min_length": 1788.0, "completions/min_terminated_length": 1788.0, "epoch": 0.07530666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.1518796682357788, "learning_rate": 1e-06, "loss": -0.0567, "num_tokens": 263715473.0, "reward": 2.518235683441162, "reward_std": 0.22827556729316711, "rewards/cosine_scaled_reward/mean": 0.7613465189933777, "rewards/cosine_scaled_reward/std": 0.42951709032058716, "rewards/repetition_penalty_reward/mean": -0.12357960641384125, "rewards/repetition_penalty_reward/std": 0.043831564486026764, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.88671875, "rewards/reward_reference/std": 0.31755712628364563, "step": 353 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.34375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4073.0, "completions/mean_length": 3060.171875, "completions/mean_terminated_length": 2967.6083984375, "completions/min_length": 1789.0, "completions/min_terminated_length": 1789.0, "epoch": 0.07552, "frac_reward_zero_std": 0.0, "grad_norm": 0.10913080722093582, "learning_rate": 1e-06, "loss": -0.0263, "num_tokens": 264573281.0, "reward": 2.6362128257751465, "reward_std": 0.16269515454769135, "rewards/cosine_scaled_reward/mean": 0.8276014924049377, "rewards/cosine_scaled_reward/std": 0.3232189416885376, "rewards/repetition_penalty_reward/mean": -0.12966996431350708, "rewards/repetition_penalty_reward/std": 0.04010913148522377, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.94140625, "rewards/reward_reference/std": 0.23532284796237946, "step": 354 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.1875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4082.0, "completions/mean_length": 3119.25390625, "completions/mean_terminated_length": 3008.839111328125, "completions/min_length": 1870.0, "completions/min_terminated_length": 1870.0, "epoch": 0.07573333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.1123742014169693, "learning_rate": 1e-06, "loss": -0.0572, "num_tokens": 265430770.0, "reward": 2.573673725128174, "reward_std": 0.17991378903388977, "rewards/cosine_scaled_reward/mean": 0.7907612323760986, "rewards/cosine_scaled_reward/std": 0.39641907811164856, "rewards/repetition_penalty_reward/mean": -0.12411877512931824, "rewards/repetition_penalty_reward/std": 0.039008140563964844, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.91015625, "rewards/reward_reference/std": 0.2865179479122162, "step": 355 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.40625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 3150.96875, "completions/mean_terminated_length": 3075.20654296875, "completions/min_length": 1805.0, "completions/min_terminated_length": 1805.0, "epoch": 0.07594666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.1424713432788849, "learning_rate": 1e-06, "loss": -0.0155, "num_tokens": 266325634.0, "reward": 2.5793204307556152, "reward_std": 0.15808260440826416, "rewards/cosine_scaled_reward/mean": 0.8081632852554321, "rewards/cosine_scaled_reward/std": 0.3798864781856537, "rewards/repetition_penalty_reward/mean": -0.13743659853935242, "rewards/repetition_penalty_reward/std": 0.043698474764823914, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626225590705872, "rewards/reward_reference/mean": 0.91796875, "rewards/reward_reference/std": 0.2749498784542084, "step": 356 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 3171.09765625, "completions/mean_terminated_length": 3057.51318359375, "completions/min_length": 1760.0, "completions/min_terminated_length": 1760.0, "epoch": 0.07616, "frac_reward_zero_std": 0.0, "grad_norm": 0.15009920299053192, "learning_rate": 1e-06, "loss": -0.0319, "num_tokens": 267183507.0, "reward": 2.5367894172668457, "reward_std": 0.2662561535835266, "rewards/cosine_scaled_reward/mean": 0.7738917469978333, "rewards/cosine_scaled_reward/std": 0.428524911403656, "rewards/repetition_penalty_reward/mean": -0.1316334754228592, "rewards/repetition_penalty_reward/std": 0.04720846191048622, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.89453125, "rewards/reward_reference/std": 0.3077581524848938, "step": 357 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.25, "completions/max_length": 4096.0, "completions/max_terminated_length": 4031.0, "completions/mean_length": 3140.75, "completions/mean_terminated_length": 3041.930908203125, "completions/min_length": 1678.0, "completions/min_terminated_length": 1678.0, "epoch": 0.07637333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.1082305908203125, "learning_rate": 1e-06, "loss": -0.0221, "num_tokens": 268049203.0, "reward": 2.462947368621826, "reward_std": 0.17016665637493134, "rewards/cosine_scaled_reward/mean": 0.7361332178115845, "rewards/cosine_scaled_reward/std": 0.4668603539466858, "rewards/repetition_penalty_reward/mean": -0.13099819421768188, "rewards/repetition_penalty_reward/std": 0.044717565178871155, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626225590705872, "rewards/reward_reference/mean": 0.8671875, "rewards/reward_reference/std": 0.3400367796421051, "step": 358 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.3125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4075.0, "completions/mean_length": 3161.84375, "completions/mean_terminated_length": 3074.017333984375, "completions/min_length": 1797.0, "completions/min_terminated_length": 1797.0, "epoch": 0.07658666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.1251969188451767, "learning_rate": 1e-06, "loss": -0.047, "num_tokens": 268935583.0, "reward": 2.5594985485076904, "reward_std": 0.1853671371936798, "rewards/cosine_scaled_reward/mean": 0.7982162237167358, "rewards/cosine_scaled_reward/std": 0.3940296471118927, "rewards/repetition_penalty_reward/mean": -0.12934255599975586, "rewards/repetition_penalty_reward/std": 0.04356498643755913, "rewards/reward_format/mean": 0.984375, "rewards/reward_format/std": 0.11092304438352585, "rewards/reward_reference/mean": 0.90625, "rewards/reward_reference/std": 0.2920515835285187, "step": 359 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 3258.12890625, "completions/mean_terminated_length": 3155.232421875, "completions/min_length": 1929.0, "completions/min_terminated_length": 1929.0, "epoch": 0.0768, "frac_reward_zero_std": 0.0, "grad_norm": 0.12503835558891296, "learning_rate": 1e-06, "loss": -0.0417, "num_tokens": 269820536.0, "reward": 2.60491943359375, "reward_std": 0.19837068021297455, "rewards/cosine_scaled_reward/mean": 0.8162407875061035, "rewards/cosine_scaled_reward/std": 0.38866737484931946, "rewards/repetition_penalty_reward/mean": -0.1292901486158371, "rewards/repetition_penalty_reward/std": 0.04512140527367592, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.91796875, "rewards/reward_reference/std": 0.2749498784542084, "step": 360 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.65625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 3298.13671875, "completions/mean_terminated_length": 3137.065673828125, "completions/min_length": 1868.0, "completions/min_terminated_length": 1868.0, "epoch": 0.07701333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.16601675748825073, "learning_rate": 1e-06, "loss": -0.0397, "num_tokens": 270647575.0, "reward": 2.546705722808838, "reward_std": 0.22834451496601105, "rewards/cosine_scaled_reward/mean": 0.7922992706298828, "rewards/cosine_scaled_reward/std": 0.4232928454875946, "rewards/repetition_penalty_reward/mean": -0.12528109550476074, "rewards/repetition_penalty_reward/std": 0.0411822609603405, "rewards/reward_format/mean": 0.981249988079071, "rewards/reward_format/std": 0.12126781791448593, "rewards/reward_reference/mean": 0.8984375, "rewards/reward_reference/std": 0.3026638329029083, "step": 361 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.34375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4080.0, "completions/mean_length": 3211.41796875, "completions/mean_terminated_length": 3132.3701171875, "completions/min_length": 1917.0, "completions/min_terminated_length": 1917.0, "epoch": 0.07722666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.09665444493293762, "learning_rate": 1e-06, "loss": -0.0161, "num_tokens": 271544550.0, "reward": 2.56905460357666, "reward_std": 0.10312886536121368, "rewards/cosine_scaled_reward/mean": 0.7973703145980835, "rewards/cosine_scaled_reward/std": 0.4079588055610657, "rewards/repetition_penalty_reward/mean": -0.12675310671329498, "rewards/repetition_penalty_reward/std": 0.046840231865644455, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.8984375, "rewards/reward_reference/std": 0.3026638329029083, "step": 362 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 3309.27734375, "completions/mean_terminated_length": 3184.683349609375, "completions/min_length": 1826.0, "completions/min_terminated_length": 1826.0, "epoch": 0.07744, "frac_reward_zero_std": 0.0, "grad_norm": 0.19870643317699432, "learning_rate": 1e-06, "loss": -0.0334, "num_tokens": 272415185.0, "reward": 2.4216766357421875, "reward_std": 0.30303946137428284, "rewards/cosine_scaled_reward/mean": 0.7191956043243408, "rewards/cosine_scaled_reward/std": 0.5092467069625854, "rewards/repetition_penalty_reward/mean": -0.13423755764961243, "rewards/repetition_penalty_reward/std": 0.04481777176260948, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.83984375, "rewards/reward_reference/std": 0.36746934056282043, "step": 363 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 3166.5546875, "completions/mean_terminated_length": 3038.497802734375, "completions/min_length": 1780.0, "completions/min_terminated_length": 1780.0, "epoch": 0.07765333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.11488021165132523, "learning_rate": 1e-06, "loss": -0.0241, "num_tokens": 273262835.0, "reward": 2.5682260990142822, "reward_std": 0.1561729609966278, "rewards/cosine_scaled_reward/mean": 0.7933893203735352, "rewards/cosine_scaled_reward/std": 0.3992950916290283, "rewards/repetition_penalty_reward/mean": -0.1314133107662201, "rewards/repetition_penalty_reward/std": 0.05208537355065346, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.90625, "rewards/reward_reference/std": 0.2920515835285187, "step": 364 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.3125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4062.0, "completions/mean_length": 3165.7890625, "completions/mean_terminated_length": 3078.33349609375, "completions/min_length": 2017.0, "completions/min_terminated_length": 2017.0, "epoch": 0.07786666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.1785222291946411, "learning_rate": 1e-06, "loss": -0.024, "num_tokens": 274143013.0, "reward": 2.6224820613861084, "reward_std": 0.25625211000442505, "rewards/cosine_scaled_reward/mean": 0.8286576271057129, "rewards/cosine_scaled_reward/std": 0.3511349558830261, "rewards/repetition_penalty_reward/mean": -0.1366441696882248, "rewards/repetition_penalty_reward/std": 0.04617421701550484, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 365 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.59375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4090.0, "completions/mean_length": 3169.74609375, "completions/mean_terminated_length": 3120.193359375, "completions/min_length": 1721.0, "completions/min_terminated_length": 1721.0, "epoch": 0.07808, "frac_reward_zero_std": 0.0, "grad_norm": 0.11180980503559113, "learning_rate": 1e-06, "loss": -0.0098, "num_tokens": 275077732.0, "reward": 2.5677073001861572, "reward_std": 0.1280892938375473, "rewards/cosine_scaled_reward/mean": 0.792206883430481, "rewards/cosine_scaled_reward/std": 0.40753600001335144, "rewards/repetition_penalty_reward/mean": -0.12684330344200134, "rewards/repetition_penalty_reward/std": 0.04561019316315651, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.90234375, "rewards/reward_reference/std": 0.29743078351020813, "step": 366 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 3230.05078125, "completions/mean_terminated_length": 3115.101806640625, "completions/min_length": 2011.0, "completions/min_terminated_length": 2011.0, "epoch": 0.07829333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.15557599067687988, "learning_rate": 1e-06, "loss": -0.0496, "num_tokens": 275945429.0, "reward": 2.524695873260498, "reward_std": 0.2952307164669037, "rewards/cosine_scaled_reward/mean": 0.7712620496749878, "rewards/cosine_scaled_reward/std": 0.4414149224758148, "rewards/repetition_penalty_reward/mean": -0.13094106316566467, "rewards/repetition_penalty_reward/std": 0.04363211616873741, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.890625, "rewards/reward_reference/std": 0.31272050738334656, "step": 367 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.5, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 3075.4921875, "completions/mean_terminated_length": 3007.45849609375, "completions/min_length": 2109.0, "completions/min_terminated_length": 2109.0, "epoch": 0.07850666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.10304121673107147, "learning_rate": 1e-06, "loss": -0.0082, "num_tokens": 276824667.0, "reward": 2.6640024185180664, "reward_std": 0.15301068127155304, "rewards/cosine_scaled_reward/mean": 0.8460245132446289, "rewards/cosine_scaled_reward/std": 0.29841554164886475, "rewards/repetition_penalty_reward/mean": -0.13124100863933563, "rewards/repetition_penalty_reward/std": 0.04055340588092804, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.94921875, "rewards/reward_reference/std": 0.21998079121112823, "step": 368 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.4375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 3073.58984375, "completions/mean_terminated_length": 2996.264892578125, "completions/min_length": 2018.0, "completions/min_terminated_length": 2018.0, "epoch": 0.07872, "frac_reward_zero_std": 0.0, "grad_norm": 0.16008536517620087, "learning_rate": 1e-06, "loss": -0.0362, "num_tokens": 277701158.0, "reward": 2.486879825592041, "reward_std": 0.22418928146362305, "rewards/cosine_scaled_reward/mean": 0.7446539402008057, "rewards/cosine_scaled_reward/std": 0.4457184672355652, "rewards/repetition_penalty_reward/mean": -0.13355520367622375, "rewards/repetition_penalty_reward/std": 0.04534250125288963, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.87890625, "rewards/reward_reference/std": 0.3268752694129944, "step": 369 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.6875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4073.0, "completions/mean_length": 3048.96484375, "completions/mean_terminated_length": 3006.40234375, "completions/min_length": 1608.0, "completions/min_terminated_length": 1608.0, "epoch": 0.07893333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.07023734599351883, "learning_rate": 1e-06, "loss": -0.0135, "num_tokens": 278601961.0, "reward": 2.585810661315918, "reward_std": 0.12515553832054138, "rewards/cosine_scaled_reward/mean": 0.8022688627243042, "rewards/cosine_scaled_reward/std": 0.36715206503868103, "rewards/repetition_penalty_reward/mean": -0.13520832359790802, "rewards/repetition_penalty_reward/std": 0.04460633173584938, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.921875, "rewards/reward_reference/std": 0.26889389753341675, "step": 370 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4045.0, "completions/mean_length": 3106.94921875, "completions/mean_terminated_length": 3023.13134765625, "completions/min_length": 1856.0, "completions/min_terminated_length": 1856.0, "epoch": 0.07914666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.13042612373828888, "learning_rate": 1e-06, "loss": -0.0059, "num_tokens": 279480480.0, "reward": 2.483642578125, "reward_std": 0.2134016901254654, "rewards/cosine_scaled_reward/mean": 0.7506895065307617, "rewards/cosine_scaled_reward/std": 0.44514280557632446, "rewards/repetition_penalty_reward/mean": -0.14282819628715515, "rewards/repetition_penalty_reward/std": 0.04998833313584328, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.87890625, "rewards/reward_reference/std": 0.3268752694129944, "step": 371 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.59375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4080.0, "completions/mean_length": 3048.23828125, "completions/mean_terminated_length": 2992.18505859375, "completions/min_length": 2046.0, "completions/min_terminated_length": 2046.0, "epoch": 0.07936, "frac_reward_zero_std": 0.0, "grad_norm": 0.14418748021125793, "learning_rate": 1e-06, "loss": -0.0066, "num_tokens": 280377481.0, "reward": 2.578580856323242, "reward_std": 0.20151178538799286, "rewards/cosine_scaled_reward/mean": 0.8029334545135498, "rewards/cosine_scaled_reward/std": 0.3652251064777374, "rewards/repetition_penalty_reward/mean": -0.14700883626937866, "rewards/repetition_penalty_reward/std": 0.05446892976760864, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.92578125, "rewards/reward_reference/std": 0.2626400291919708, "step": 372 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4043.0, "completions/mean_length": 3071.16015625, "completions/mean_terminated_length": 3020.758056640625, "completions/min_length": 1699.0, "completions/min_terminated_length": 1699.0, "epoch": 0.07957333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.1292157769203186, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 281280130.0, "reward": 2.559688091278076, "reward_std": 0.18442192673683167, "rewards/cosine_scaled_reward/mean": 0.7905406951904297, "rewards/cosine_scaled_reward/std": 0.39117321372032166, "rewards/repetition_penalty_reward/mean": -0.14100870490074158, "rewards/repetition_penalty_reward/std": 0.049596257507801056, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.91015625, "rewards/reward_reference/std": 0.2865179479122162, "step": 373 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4083.0, "completions/mean_length": 3003.04296875, "completions/mean_terminated_length": 2949.290771484375, "completions/min_length": 1669.0, "completions/min_terminated_length": 1669.0, "epoch": 0.07978666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.1299491673707962, "learning_rate": 1e-06, "loss": -0.0406, "num_tokens": 282171045.0, "reward": 2.5163817405700684, "reward_std": 0.19801975786685944, "rewards/cosine_scaled_reward/mean": 0.7635452747344971, "rewards/cosine_scaled_reward/std": 0.41028037667274475, "rewards/repetition_penalty_reward/mean": -0.1495072841644287, "rewards/repetition_penalty_reward/std": 0.05092966929078102, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.90234375, "rewards/reward_reference/std": 0.29743078351020813, "step": 374 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.65625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4077.0, "completions/mean_length": 2962.68359375, "completions/mean_terminated_length": 2911.7998046875, "completions/min_length": 2057.0, "completions/min_terminated_length": 2057.0, "epoch": 0.08, "frac_reward_zero_std": 0.0, "grad_norm": 0.12212635576725006, "learning_rate": 1e-06, "loss": -0.012, "num_tokens": 283054620.0, "reward": 2.554882526397705, "reward_std": 0.17215952277183533, "rewards/cosine_scaled_reward/mean": 0.7953221797943115, "rewards/cosine_scaled_reward/std": 0.3597381114959717, "rewards/repetition_penalty_reward/mean": -0.15293972194194794, "rewards/repetition_penalty_reward/std": 0.05043653026223183, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626226335763931, "rewards/reward_reference/mean": 0.921875, "rewards/reward_reference/std": 0.26889389753341675, "step": 375 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.59375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4040.0, "completions/mean_length": 2971.55859375, "completions/mean_terminated_length": 2911.4033203125, "completions/min_length": 1506.0, "completions/min_terminated_length": 1506.0, "epoch": 0.08021333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.2789333164691925, "learning_rate": 1e-06, "loss": -0.0239, "num_tokens": 283915795.0, "reward": 2.545088291168213, "reward_std": 0.16484610736370087, "rewards/cosine_scaled_reward/mean": 0.7841353416442871, "rewards/cosine_scaled_reward/std": 0.375940203666687, "rewards/repetition_penalty_reward/mean": -0.15076559782028198, "rewards/repetition_penalty_reward/std": 0.05685606971383095, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718994140625, "rewards/reward_reference/mean": 0.91796875, "rewards/reward_reference/std": 0.2749498784542084, "step": 376 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.71875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4034.0, "completions/mean_length": 3045.81640625, "completions/mean_terminated_length": 3007.55078125, "completions/min_length": 1876.0, "completions/min_terminated_length": 1876.0, "epoch": 0.08042666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.16344138979911804, "learning_rate": 1e-06, "loss": -0.0072, "num_tokens": 284821132.0, "reward": 2.5898828506469727, "reward_std": 0.20654936134815216, "rewards/cosine_scaled_reward/mean": 0.8137679696083069, "rewards/cosine_scaled_reward/std": 0.35199347138404846, "rewards/repetition_penalty_reward/mean": -0.14654135704040527, "rewards/repetition_penalty_reward/std": 0.047246113419532776, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.92578125, "rewards/reward_reference/std": 0.2626400291919708, "step": 377 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.8125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 3033.90625, "completions/mean_terminated_length": 3008.416259765625, "completions/min_length": 1850.0, "completions/min_terminated_length": 1850.0, "epoch": 0.08064, "frac_reward_zero_std": 0.0, "grad_norm": 0.1035616546869278, "learning_rate": 1e-06, "loss": -0.0151, "num_tokens": 285739580.0, "reward": 2.6008670330047607, "reward_std": 0.09359882771968842, "rewards/cosine_scaled_reward/mean": 0.8161867260932922, "rewards/cosine_scaled_reward/std": 0.34512314200401306, "rewards/repetition_penalty_reward/mean": -0.137975811958313, "rewards/repetition_penalty_reward/std": 0.043298136442899704, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.92578125, "rewards/reward_reference/std": 0.2626400291919708, "step": 378 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.5, "completions/max_length": 4096.0, "completions/max_terminated_length": 4026.0, "completions/mean_length": 3077.109375, "completions/mean_terminated_length": 3009.18359375, "completions/min_length": 1779.0, "completions/min_terminated_length": 1779.0, "epoch": 0.08085333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.06813926994800568, "learning_rate": 1e-06, "loss": -0.0071, "num_tokens": 286625760.0, "reward": 2.673771858215332, "reward_std": 0.09984706342220306, "rewards/cosine_scaled_reward/mean": 0.8574278354644775, "rewards/cosine_scaled_reward/std": 0.27632588148117065, "rewards/repetition_penalty_reward/mean": -0.14068728685379028, "rewards/repetition_penalty_reward/std": 0.04103941097855568, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.95703125, "rewards/reward_reference/std": 0.20318391919136047, "step": 379 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.71875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 2952.99609375, "completions/mean_terminated_length": 2911.34814453125, "completions/min_length": 1943.0, "completions/min_terminated_length": 1943.0, "epoch": 0.08106666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.14166662096977234, "learning_rate": 1e-06, "loss": -0.0071, "num_tokens": 287499027.0, "reward": 2.5731148719787598, "reward_std": 0.19441863894462585, "rewards/cosine_scaled_reward/mean": 0.7902377843856812, "rewards/cosine_scaled_reward/std": 0.3644700348377228, "rewards/repetition_penalty_reward/mean": -0.13899797201156616, "rewards/repetition_penalty_reward/std": 0.050455491989851, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.921875, "rewards/reward_reference/std": 0.26889389753341675, "step": 380 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4068.0, "completions/mean_length": 3016.1484375, "completions/mean_terminated_length": 2963.040771484375, "completions/min_length": 1691.0, "completions/min_terminated_length": 1691.0, "epoch": 0.08128, "frac_reward_zero_std": 0.0, "grad_norm": 0.22079779207706451, "learning_rate": 1e-06, "loss": -0.0234, "num_tokens": 288380001.0, "reward": 2.5718040466308594, "reward_std": 0.11867351830005646, "rewards/cosine_scaled_reward/mean": 0.7975947260856628, "rewards/cosine_scaled_reward/std": 0.3700672686100006, "rewards/repetition_penalty_reward/mean": -0.13985300064086914, "rewards/repetition_penalty_reward/std": 0.051185525953769684, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9140625, "rewards/reward_reference/std": 0.28082075715065, "step": 381 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.46875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 3013.15234375, "completions/mean_terminated_length": 2936.129638671875, "completions/min_length": 1764.0, "completions/min_terminated_length": 1764.0, "epoch": 0.08149333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.15201827883720398, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 289252860.0, "reward": 2.666290283203125, "reward_std": 0.17419229447841644, "rewards/cosine_scaled_reward/mean": 0.8443589210510254, "rewards/cosine_scaled_reward/std": 0.2822229564189911, "rewards/repetition_penalty_reward/mean": -0.13509991765022278, "rewards/repetition_penalty_reward/std": 0.042069874703884125, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.95703125, "rewards/reward_reference/std": 0.20318391919136047, "step": 382 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.71875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 3070.90625, "completions/mean_terminated_length": 3033.5546875, "completions/min_length": 2053.0, "completions/min_terminated_length": 2053.0, "epoch": 0.08170666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.13215400278568268, "learning_rate": 1e-06, "loss": -0.0289, "num_tokens": 290172020.0, "reward": 2.6237735748291016, "reward_std": 0.19740281999111176, "rewards/cosine_scaled_reward/mean": 0.8225434422492981, "rewards/cosine_scaled_reward/std": 0.34391024708747864, "rewards/repetition_penalty_reward/mean": -0.1323637068271637, "rewards/repetition_penalty_reward/std": 0.042291488498449326, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 383 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.59375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 2975.4296875, "completions/mean_terminated_length": 2915.4814453125, "completions/min_length": 1947.0, "completions/min_terminated_length": 1947.0, "epoch": 0.08192, "frac_reward_zero_std": 0.0, "grad_norm": 0.10557766258716583, "learning_rate": 1e-06, "loss": -0.0438, "num_tokens": 291036618.0, "reward": 2.6430983543395996, "reward_std": 0.1428372859954834, "rewards/cosine_scaled_reward/mean": 0.8273340463638306, "rewards/cosine_scaled_reward/std": 0.30892860889434814, "rewards/repetition_penalty_reward/mean": -0.13032954931259155, "rewards/repetition_penalty_reward/std": 0.04752003774046898, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.94921875, "rewards/reward_reference/std": 0.21998079121112823, "step": 384 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.34375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4054.0, "completions/mean_length": 3062.87890625, "completions/mean_terminated_length": 2970.557373046875, "completions/min_length": 1749.0, "completions/min_terminated_length": 1749.0, "epoch": 0.08213333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.08981072157621384, "learning_rate": 1e-06, "loss": -0.0342, "num_tokens": 291894135.0, "reward": 2.580354690551758, "reward_std": 0.17908355593681335, "rewards/cosine_scaled_reward/mean": 0.7964844107627869, "rewards/cosine_scaled_reward/std": 0.3772408068180084, "rewards/repetition_penalty_reward/mean": -0.13487963378429413, "rewards/repetition_penalty_reward/std": 0.046523887664079666, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.921875, "rewards/reward_reference/std": 0.26889389753341675, "step": 385 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.46875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 3078.24609375, "completions/mean_terminated_length": 3005.853515625, "completions/min_length": 1897.0, "completions/min_terminated_length": 1897.0, "epoch": 0.08234666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.11945135146379471, "learning_rate": 1e-06, "loss": -0.0369, "num_tokens": 292775774.0, "reward": 2.579617500305176, "reward_std": 0.22266732156276703, "rewards/cosine_scaled_reward/mean": 0.799963116645813, "rewards/cosine_scaled_reward/std": 0.377466082572937, "rewards/repetition_penalty_reward/mean": -0.13128305971622467, "rewards/repetition_penalty_reward/std": 0.053302206099033356, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.9140625, "rewards/reward_reference/std": 0.28082075715065, "step": 386 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.4375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 3053.4765625, "completions/mean_terminated_length": 2974.63037109375, "completions/min_length": 2059.0, "completions/min_terminated_length": 2059.0, "epoch": 0.08256, "frac_reward_zero_std": 0.0, "grad_norm": 0.134053573012352, "learning_rate": 1e-06, "loss": -0.0201, "num_tokens": 293636572.0, "reward": 2.5145716667175293, "reward_std": 0.18531867861747742, "rewards/cosine_scaled_reward/mean": 0.7566462755203247, "rewards/cosine_scaled_reward/std": 0.42772749066352844, "rewards/repetition_penalty_reward/mean": -0.12644967436790466, "rewards/repetition_penalty_reward/std": 0.04349347576498985, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.890625, "rewards/reward_reference/std": 0.31272050738334656, "step": 387 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4074.0, "completions/mean_length": 3031.671875, "completions/mean_terminated_length": 2979.32763671875, "completions/min_length": 1821.0, "completions/min_terminated_length": 1821.0, "epoch": 0.08277333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.1286914199590683, "learning_rate": 1e-06, "loss": -0.0467, "num_tokens": 294520200.0, "reward": 2.630145788192749, "reward_std": 0.20631247758865356, "rewards/cosine_scaled_reward/mean": 0.8196536302566528, "rewards/cosine_scaled_reward/std": 0.33500272035598755, "rewards/repetition_penalty_reward/mean": -0.12700791656970978, "rewards/repetition_penalty_reward/std": 0.0510396771132946, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9375, "rewards/reward_reference/std": 0.24253563582897186, "step": 388 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.4375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 3092.9375, "completions/mean_terminated_length": 3017.07568359375, "completions/min_length": 1895.0, "completions/min_terminated_length": 1895.0, "epoch": 0.08298666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.18327857553958893, "learning_rate": 1e-06, "loss": -0.0161, "num_tokens": 295398600.0, "reward": 2.6226136684417725, "reward_std": 0.2285882830619812, "rewards/cosine_scaled_reward/mean": 0.8175864219665527, "rewards/cosine_scaled_reward/std": 0.35323092341423035, "rewards/repetition_penalty_reward/mean": -0.12231657654047012, "rewards/repetition_penalty_reward/std": 0.04444298893213272, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 389 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.53125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4083.0, "completions/mean_length": 3212.0703125, "completions/mean_terminated_length": 3157.05419921875, "completions/min_length": 1879.0, "completions/min_terminated_length": 1879.0, "epoch": 0.0832, "frac_reward_zero_std": 0.0, "grad_norm": 0.10938741266727448, "learning_rate": 1e-06, "loss": -0.0044, "num_tokens": 296325970.0, "reward": 2.5627360343933105, "reward_std": 0.16607502102851868, "rewards/cosine_scaled_reward/mean": 0.7930000424385071, "rewards/cosine_scaled_reward/std": 0.4169759750366211, "rewards/repetition_penalty_reward/mean": -0.13260750472545624, "rewards/repetition_penalty_reward/std": 0.04929269477725029, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.90234375, "rewards/reward_reference/std": 0.29743078351020813, "step": 390 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.5, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 3147.953125, "completions/mean_terminated_length": 3084.750244140625, "completions/min_length": 1960.0, "completions/min_terminated_length": 1960.0, "epoch": 0.08341333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.16152621805667877, "learning_rate": 1e-06, "loss": -0.0148, "num_tokens": 297223618.0, "reward": 2.524066925048828, "reward_std": 0.1699940264225006, "rewards/cosine_scaled_reward/mean": 0.767318606376648, "rewards/cosine_scaled_reward/std": 0.4368087351322174, "rewards/repetition_penalty_reward/mean": -0.1299704909324646, "rewards/repetition_penalty_reward/std": 0.04877452179789543, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.88671875, "rewards/reward_reference/std": 0.31755712628364563, "step": 391 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.21875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 3236.640625, "completions/mean_terminated_length": 3143.636474609375, "completions/min_length": 2051.0, "completions/min_terminated_length": 2051.0, "epoch": 0.08362666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.12291693687438965, "learning_rate": 1e-06, "loss": -0.0316, "num_tokens": 298113694.0, "reward": 2.5816733837127686, "reward_std": 0.18931283056735992, "rewards/cosine_scaled_reward/mean": 0.8049663305282593, "rewards/cosine_scaled_reward/std": 0.4033796489238739, "rewards/repetition_penalty_reward/mean": -0.12641799449920654, "rewards/repetition_penalty_reward/std": 0.03853068873286247, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.90625, "rewards/reward_reference/std": 0.2920515835285187, "step": 392 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.5625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 3092.8359375, "completions/mean_terminated_length": 3034.801513671875, "completions/min_length": 1967.0, "completions/min_terminated_length": 1967.0, "epoch": 0.08384, "frac_reward_zero_std": 0.0, "grad_norm": 0.12058772891759872, "learning_rate": 1e-06, "loss": 0.0065, "num_tokens": 299006500.0, "reward": 2.593824863433838, "reward_std": 0.1544109284877777, "rewards/cosine_scaled_reward/mean": 0.8009454011917114, "rewards/cosine_scaled_reward/std": 0.3817402124404907, "rewards/repetition_penalty_reward/mean": -0.12196415662765503, "rewards/repetition_penalty_reward/std": 0.045104991644620895, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.91796875, "rewards/reward_reference/std": 0.2749498784542084, "step": 393 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.53125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 3119.30078125, "completions/mean_terminated_length": 3058.510498046875, "completions/min_length": 1893.0, "completions/min_terminated_length": 1893.0, "epoch": 0.08405333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.06525684148073196, "learning_rate": 1e-06, "loss": -0.0213, "num_tokens": 299904713.0, "reward": 2.593860626220703, "reward_std": 0.09809240698814392, "rewards/cosine_scaled_reward/mean": 0.8014312386512756, "rewards/cosine_scaled_reward/std": 0.38566121459007263, "rewards/repetition_penalty_reward/mean": -0.12553951144218445, "rewards/repetition_penalty_reward/std": 0.053088318556547165, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.91796875, "rewards/reward_reference/std": 0.2749498784542084, "step": 394 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.59375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4093.0, "completions/mean_length": 3153.375, "completions/mean_terminated_length": 3102.9462890625, "completions/min_length": 1773.0, "completions/min_terminated_length": 1773.0, "epoch": 0.08426666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.14195215702056885, "learning_rate": 1e-06, "loss": -0.021, "num_tokens": 300820209.0, "reward": 2.5252044200897217, "reward_std": 0.17579331994056702, "rewards/cosine_scaled_reward/mean": 0.7644021511077881, "rewards/cosine_scaled_reward/std": 0.4409472942352295, "rewards/repetition_penalty_reward/mean": -0.12279140949249268, "rewards/repetition_penalty_reward/std": 0.03642672300338745, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.88671875, "rewards/reward_reference/std": 0.31755712628364563, "step": 395 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.5625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 3192.4765625, "completions/mean_terminated_length": 3140.20654296875, "completions/min_length": 1863.0, "completions/min_terminated_length": 1863.0, "epoch": 0.08448, "frac_reward_zero_std": 0.0, "grad_norm": 0.13026125729084015, "learning_rate": 1e-06, "loss": -0.0239, "num_tokens": 301749675.0, "reward": 2.576292037963867, "reward_std": 0.2108149230480194, "rewards/cosine_scaled_reward/mean": 0.7987449169158936, "rewards/cosine_scaled_reward/std": 0.4059189260005951, "rewards/repetition_penalty_reward/mean": -0.12557795643806458, "rewards/repetition_penalty_reward/std": 0.03922073915600777, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.90625, "rewards/reward_reference/std": 0.2920515835285187, "step": 396 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.59375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4066.0, "completions/mean_length": 3099.82421875, "completions/mean_terminated_length": 3046.53076171875, "completions/min_length": 1668.0, "completions/min_terminated_length": 1668.0, "epoch": 0.08469333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.12833480536937714, "learning_rate": 1e-06, "loss": -0.0094, "num_tokens": 302647086.0, "reward": 2.6329143047332764, "reward_std": 0.1479964256286621, "rewards/cosine_scaled_reward/mean": 0.8212152123451233, "rewards/cosine_scaled_reward/std": 0.3519037067890167, "rewards/repetition_penalty_reward/mean": -0.12189459800720215, "rewards/repetition_penalty_reward/std": 0.04479314014315605, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 397 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.75, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 3087.08203125, "completions/mean_terminated_length": 3054.5361328125, "completions/min_length": 1757.0, "completions/min_terminated_length": 1757.0, "epoch": 0.08490666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.09485691785812378, "learning_rate": 1e-06, "loss": 0.015, "num_tokens": 303571071.0, "reward": 2.6275954246520996, "reward_std": 0.11551915854215622, "rewards/cosine_scaled_reward/mean": 0.8233013153076172, "rewards/cosine_scaled_reward/std": 0.344821572303772, "rewards/repetition_penalty_reward/mean": -0.12929978966712952, "rewards/repetition_penalty_reward/std": 0.04905636981129646, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 398 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.5, "completions/max_length": 4096.0, "completions/max_terminated_length": 3986.0, "completions/mean_length": 3061.21484375, "completions/mean_terminated_length": 2992.229248046875, "completions/min_length": 1874.0, "completions/min_terminated_length": 1874.0, "epoch": 0.08512, "frac_reward_zero_std": 0.0, "grad_norm": 0.11544538289308548, "learning_rate": 1e-06, "loss": -0.0454, "num_tokens": 304450226.0, "reward": 2.679389476776123, "reward_std": 0.20466876029968262, "rewards/cosine_scaled_reward/mean": 0.8510940074920654, "rewards/cosine_scaled_reward/std": 0.2858965992927551, "rewards/repetition_penalty_reward/mean": -0.12873569130897522, "rewards/repetition_penalty_reward/std": 0.046985287219285965, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.95703125, "rewards/reward_reference/std": 0.20318391919136047, "step": 399 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.46875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4010.0, "completions/mean_length": 3165.70703125, "completions/mean_terminated_length": 3099.535400390625, "completions/min_length": 2123.0, "completions/min_terminated_length": 2123.0, "epoch": 0.08533333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.11605282127857208, "learning_rate": 1e-06, "loss": -0.0212, "num_tokens": 305356367.0, "reward": 2.6028542518615723, "reward_std": 0.14681796729564667, "rewards/cosine_scaled_reward/mean": 0.817654013633728, "rewards/cosine_scaled_reward/std": 0.37154901027679443, "rewards/repetition_penalty_reward/mean": -0.13433128595352173, "rewards/repetition_penalty_reward/std": 0.039992932230234146, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718994140625, "rewards/reward_reference/mean": 0.92578125, "rewards/reward_reference/std": 0.2626400291919708, "step": 400 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4072.0, "completions/mean_length": 3128.8515625, "completions/mean_terminated_length": 3113.500244140625, "completions/min_length": 1981.0, "completions/min_terminated_length": 1981.0, "epoch": 0.08554666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.1029190719127655, "learning_rate": 1e-06, "loss": -0.014, "num_tokens": 306299541.0, "reward": 2.6625068187713623, "reward_std": 0.11479123681783676, "rewards/cosine_scaled_reward/mean": 0.8496817946434021, "rewards/cosine_scaled_reward/std": 0.31631430983543396, "rewards/repetition_penalty_reward/mean": -0.12936252355575562, "rewards/repetition_penalty_reward/std": 0.03983324021100998, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.9453125, "rewards/reward_reference/std": 0.22781464457511902, "step": 401 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4068.0, "completions/mean_length": 3188.91015625, "completions/mean_terminated_length": 3112.0380859375, "completions/min_length": 1893.0, "completions/min_terminated_length": 1893.0, "epoch": 0.08576, "frac_reward_zero_std": 0.0, "grad_norm": 0.15154629945755005, "learning_rate": 1e-06, "loss": -0.03, "num_tokens": 307196046.0, "reward": 2.497952461242676, "reward_std": 0.2996126413345337, "rewards/cosine_scaled_reward/mean": 0.7567933201789856, "rewards/cosine_scaled_reward/std": 0.4561953842639923, "rewards/repetition_penalty_reward/mean": -0.13384085893630981, "rewards/repetition_penalty_reward/std": 0.05501502379775047, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.875, "rewards/reward_reference/std": 0.33136674761772156, "step": 402 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.59375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 3156.04296875, "completions/mean_terminated_length": 3105.757080078125, "completions/min_length": 2047.0, "completions/min_terminated_length": 2047.0, "epoch": 0.08597333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.10560856759548187, "learning_rate": 1e-06, "loss": -0.0117, "num_tokens": 308107361.0, "reward": 2.6281182765960693, "reward_std": 0.15326428413391113, "rewards/cosine_scaled_reward/mean": 0.8252940773963928, "rewards/cosine_scaled_reward/std": 0.35686615109443665, "rewards/repetition_penalty_reward/mean": -0.1268633008003235, "rewards/repetition_penalty_reward/std": 0.04566507786512375, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9296875, "rewards/reward_reference/std": 0.2561737895011902, "step": 403 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.34375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4053.0, "completions/mean_length": 3228.66015625, "completions/mean_terminated_length": 3151.153076171875, "completions/min_length": 1917.0, "completions/min_terminated_length": 1917.0, "epoch": 0.08618666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.16378062963485718, "learning_rate": 1e-06, "loss": -0.0199, "num_tokens": 309010934.0, "reward": 2.417421579360962, "reward_std": 0.20660261809825897, "rewards/cosine_scaled_reward/mean": 0.7245820760726929, "rewards/cosine_scaled_reward/std": 0.4962194561958313, "rewards/repetition_penalty_reward/mean": -0.1423168182373047, "rewards/repetition_penalty_reward/std": 0.058403536677360535, "rewards/reward_format/mean": 0.987500011920929, "rewards/reward_format/std": 0.0994100272655487, "rewards/reward_reference/mean": 0.84765625, "rewards/reward_reference/std": 0.3600577116012573, "step": 404 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.34375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4065.0, "completions/mean_length": 3154.10546875, "completions/mean_terminated_length": 3069.93603515625, "completions/min_length": 1942.0, "completions/min_terminated_length": 1942.0, "epoch": 0.0864, "frac_reward_zero_std": 0.0, "grad_norm": 0.11778714507818222, "learning_rate": 1e-06, "loss": -0.0457, "num_tokens": 309891297.0, "reward": 2.558377742767334, "reward_std": 0.24009853601455688, "rewards/cosine_scaled_reward/mean": 0.7882504463195801, "rewards/cosine_scaled_reward/std": 0.40825164318084717, "rewards/repetition_penalty_reward/mean": -0.1298729032278061, "rewards/repetition_penalty_reward/std": 0.054273542016744614, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.90625, "rewards/reward_reference/std": 0.2920515835285187, "step": 405 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.34375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4047.0, "completions/mean_length": 3222.48828125, "completions/mean_terminated_length": 3144.4296875, "completions/min_length": 2138.0, "completions/min_terminated_length": 2138.0, "epoch": 0.08661333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.18284331262111664, "learning_rate": 1e-06, "loss": -0.0156, "num_tokens": 310788626.0, "reward": 2.5298140048980713, "reward_std": 0.2251349836587906, "rewards/cosine_scaled_reward/mean": 0.776664674282074, "rewards/cosine_scaled_reward/std": 0.43793678283691406, "rewards/repetition_penalty_reward/mean": -0.13435065746307373, "rewards/repetition_penalty_reward/std": 0.045195017009973526, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.890625, "rewards/reward_reference/std": 0.31272050738334656, "step": 406 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.53125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4077.0, "completions/mean_length": 3215.89453125, "completions/mean_terminated_length": 3161.116455078125, "completions/min_length": 1896.0, "completions/min_terminated_length": 1896.0, "epoch": 0.08682666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.1203547865152359, "learning_rate": 1e-06, "loss": -0.011, "num_tokens": 311714119.0, "reward": 2.6104822158813477, "reward_std": 0.14991873502731323, "rewards/cosine_scaled_reward/mean": 0.8213518857955933, "rewards/cosine_scaled_reward/std": 0.3793308436870575, "rewards/repetition_penalty_reward/mean": -0.13274472951889038, "rewards/repetition_penalty_reward/std": 0.04116351529955864, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.921875, "rewards/reward_reference/std": 0.26889389753341675, "step": 407 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.40625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4090.0, "completions/mean_length": 3208.79296875, "completions/mean_terminated_length": 3137.66650390625, "completions/min_length": 1861.0, "completions/min_terminated_length": 1861.0, "epoch": 0.08704, "frac_reward_zero_std": 0.0, "grad_norm": 0.1600077599287033, "learning_rate": 1e-06, "loss": 0.0214, "num_tokens": 312615894.0, "reward": 2.5284624099731445, "reward_std": 0.28709399700164795, "rewards/cosine_scaled_reward/mean": 0.7823736667633057, "rewards/cosine_scaled_reward/std": 0.4272303581237793, "rewards/repetition_penalty_reward/mean": -0.13906735181808472, "rewards/repetition_penalty_reward/std": 0.050136372447013855, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626226335763931, "rewards/reward_reference/mean": 0.89453125, "rewards/reward_reference/std": 0.3077581524848938, "step": 408 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4077.0, "completions/mean_length": 3203.31640625, "completions/mean_terminated_length": 3127.665283203125, "completions/min_length": 2044.0, "completions/min_terminated_length": 2044.0, "epoch": 0.08725333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.11707901954650879, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 313510083.0, "reward": 2.570059061050415, "reward_std": 0.19044455885887146, "rewards/cosine_scaled_reward/mean": 0.8027787208557129, "rewards/cosine_scaled_reward/std": 0.39987897872924805, "rewards/repetition_penalty_reward/mean": -0.13662593066692352, "rewards/repetition_penalty_reward/std": 0.05554681271314621, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.91015625, "rewards/reward_reference/std": 0.2865179479122162, "step": 409 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.4375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4071.0, "completions/mean_length": 3137.3671875, "completions/mean_terminated_length": 3064.86572265625, "completions/min_length": 1734.0, "completions/min_terminated_length": 1734.0, "epoch": 0.08746666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.14157453179359436, "learning_rate": 1e-06, "loss": -0.0103, "num_tokens": 314396293.0, "reward": 2.5433688163757324, "reward_std": 0.20625907182693481, "rewards/cosine_scaled_reward/mean": 0.7789784669876099, "rewards/cosine_scaled_reward/std": 0.4177873134613037, "rewards/repetition_penalty_reward/mean": -0.13795334100723267, "rewards/repetition_penalty_reward/std": 0.05312160775065422, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.90234375, "rewards/reward_reference/std": 0.29743078351020813, "step": 410 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.59375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4058.0, "completions/mean_length": 3174.421875, "completions/mean_terminated_length": 3125.119140625, "completions/min_length": 2025.0, "completions/min_terminated_length": 2025.0, "epoch": 0.08768, "frac_reward_zero_std": 0.0, "grad_norm": 0.1355607509613037, "learning_rate": 1e-06, "loss": -0.0076, "num_tokens": 315326941.0, "reward": 2.5864953994750977, "reward_std": 0.20761072635650635, "rewards/cosine_scaled_reward/mean": 0.8066941499710083, "rewards/cosine_scaled_reward/std": 0.39073434472084045, "rewards/repetition_penalty_reward/mean": -0.1311362087726593, "rewards/repetition_penalty_reward/std": 0.0458197258412838, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.9140625, "rewards/reward_reference/std": 0.28082075715065, "step": 411 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.59375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 3194.16796875, "completions/mean_terminated_length": 3145.921630859375, "completions/min_length": 2059.0, "completions/min_terminated_length": 2059.0, "epoch": 0.08789333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.06964617222547531, "learning_rate": 1e-06, "loss": -0.0033, "num_tokens": 316257524.0, "reward": 2.6956582069396973, "reward_std": 0.08016351610422134, "rewards/cosine_scaled_reward/mean": 0.8655639886856079, "rewards/cosine_scaled_reward/std": 0.2969217598438263, "rewards/repetition_penalty_reward/mean": -0.12303093820810318, "rewards/repetition_penalty_reward/std": 0.044344719499349594, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.953125, "rewards/reward_reference/std": 0.21178513765335083, "step": 412 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.71875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 3134.00390625, "completions/mean_terminated_length": 3098.951416015625, "completions/min_length": 1788.0, "completions/min_terminated_length": 1788.0, "epoch": 0.08810666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.11384371668100357, "learning_rate": 1e-06, "loss": -0.0179, "num_tokens": 317185773.0, "reward": 2.6399922370910645, "reward_std": 0.1448422223329544, "rewards/cosine_scaled_reward/mean": 0.8305256366729736, "rewards/cosine_scaled_reward/std": 0.34458062052726746, "rewards/repetition_penalty_reward/mean": -0.12412697076797485, "rewards/repetition_penalty_reward/std": 0.0361146442592144, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 413 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.34375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4065.0, "completions/mean_length": 3201.82421875, "completions/mean_terminated_length": 3121.9189453125, "completions/min_length": 1926.0, "completions/min_terminated_length": 1926.0, "epoch": 0.08832, "frac_reward_zero_std": 0.0, "grad_norm": 0.13148584961891174, "learning_rate": 1e-06, "loss": -0.0272, "num_tokens": 318072120.0, "reward": 2.6402688026428223, "reward_std": 0.1609860360622406, "rewards/cosine_scaled_reward/mean": 0.8379240036010742, "rewards/cosine_scaled_reward/std": 0.3452746570110321, "rewards/repetition_penalty_reward/mean": -0.12890510261058807, "rewards/repetition_penalty_reward/std": 0.050265178084373474, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.9375, "rewards/reward_reference/std": 0.24253563582897186, "step": 414 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.5, "completions/max_length": 4096.0, "completions/max_terminated_length": 4064.0, "completions/mean_length": 3258.58203125, "completions/mean_terminated_length": 3202.75439453125, "completions/min_length": 2023.0, "completions/min_terminated_length": 2023.0, "epoch": 0.08853333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.12287775427103043, "learning_rate": 1e-06, "loss": -0.0233, "num_tokens": 318997017.0, "reward": 2.6155078411102295, "reward_std": 0.18347862362861633, "rewards/cosine_scaled_reward/mean": 0.8230103254318237, "rewards/cosine_scaled_reward/std": 0.38649454712867737, "rewards/repetition_penalty_reward/mean": -0.12547120451927185, "rewards/repetition_penalty_reward/std": 0.041851840913295746, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.91796875, "rewards/reward_reference/std": 0.2749498784542084, "step": 415 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.28125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 3299.890625, "completions/mean_terminated_length": 3221.3046875, "completions/min_length": 2126.0, "completions/min_terminated_length": 2126.0, "epoch": 0.08874666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.10421235859394073, "learning_rate": 1e-06, "loss": -0.0141, "num_tokens": 319914633.0, "reward": 2.5600852966308594, "reward_std": 0.15454831719398499, "rewards/cosine_scaled_reward/mean": 0.7934485673904419, "rewards/cosine_scaled_reward/std": 0.4298205077648163, "rewards/repetition_penalty_reward/mean": -0.12867553532123566, "rewards/repetition_penalty_reward/std": 0.05109499394893646, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.8984375, "rewards/reward_reference/std": 0.3026638329029083, "step": 416 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.15625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 3273.75390625, "completions/mean_terminated_length": 3176.807861328125, "completions/min_length": 2142.0, "completions/min_terminated_length": 2142.0, "epoch": 0.08896, "frac_reward_zero_std": 0.0, "grad_norm": 0.09872046858072281, "learning_rate": 1e-06, "loss": -0.007, "num_tokens": 320801902.0, "reward": 2.6604604721069336, "reward_std": 0.17045599222183228, "rewards/cosine_scaled_reward/mean": 0.8498491644859314, "rewards/cosine_scaled_reward/std": 0.34010493755340576, "rewards/repetition_penalty_reward/mean": -0.13079512119293213, "rewards/repetition_penalty_reward/std": 0.04310750588774681, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.94140625, "rewards/reward_reference/std": 0.23532284796237946, "step": 417 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 3320.2265625, "completions/mean_terminated_length": 3224.9560546875, "completions/min_length": 1994.0, "completions/min_terminated_length": 1994.0, "epoch": 0.08917333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.1512015163898468, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 321695904.0, "reward": 2.4330129623413086, "reward_std": 0.24782538414001465, "rewards/cosine_scaled_reward/mean": 0.7282810211181641, "rewards/cosine_scaled_reward/std": 0.5042746663093567, "rewards/repetition_penalty_reward/mean": -0.13589312136173248, "rewards/repetition_penalty_reward/std": 0.05037867650389671, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.84375, "rewards/reward_reference/std": 0.3638034462928772, "step": 418 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.3125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4083.0, "completions/mean_length": 3331.8046875, "completions/mean_terminated_length": 3259.95751953125, "completions/min_length": 2094.0, "completions/min_terminated_length": 2094.0, "epoch": 0.08938666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.08986438810825348, "learning_rate": 1e-06, "loss": -0.0071, "num_tokens": 322614434.0, "reward": 2.7352826595306396, "reward_std": 0.13047119975090027, "rewards/cosine_scaled_reward/mean": 0.8981503844261169, "rewards/cosine_scaled_reward/std": 0.2551228106021881, "rewards/repetition_penalty_reward/mean": -0.13161778450012207, "rewards/repetition_penalty_reward/std": 0.04430250823497772, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.96875, "rewards/reward_reference/std": 0.17433346807956696, "step": 419 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4078.0, "completions/mean_length": 3391.77734375, "completions/mean_terminated_length": 3305.2939453125, "completions/min_length": 2039.0, "completions/min_terminated_length": 2039.0, "epoch": 0.0896, "frac_reward_zero_std": 0.0, "grad_norm": 0.12648317217826843, "learning_rate": 1e-06, "loss": -0.0324, "num_tokens": 323522857.0, "reward": 2.5961928367614746, "reward_std": 0.212894469499588, "rewards/cosine_scaled_reward/mean": 0.8207013607025146, "rewards/cosine_scaled_reward/std": 0.4113256633281708, "rewards/repetition_penalty_reward/mean": -0.12841475009918213, "rewards/repetition_penalty_reward/std": 0.04414404183626175, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.91015625, "rewards/reward_reference/std": 0.2865179479122162, "step": 420 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.25, "completions/max_length": 4096.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 3354.9453125, "completions/mean_terminated_length": 3278.284423828125, "completions/min_length": 2324.0, "completions/min_terminated_length": 2324.0, "epoch": 0.08981333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.1353956162929535, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 324441127.0, "reward": 2.646284580230713, "reward_std": 0.15197622776031494, "rewards/cosine_scaled_reward/mean": 0.8503248691558838, "rewards/cosine_scaled_reward/std": 0.3605615496635437, "rewards/repetition_penalty_reward/mean": -0.13138386607170105, "rewards/repetition_penalty_reward/std": 0.044571150094270706, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 421 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 3371.5390625, "completions/mean_terminated_length": 3275.37158203125, "completions/min_length": 2072.0, "completions/min_terminated_length": 2072.0, "epoch": 0.09002666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.06844349950551987, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 325350233.0, "reward": 2.638679027557373, "reward_std": 0.11508074402809143, "rewards/cosine_scaled_reward/mean": 0.8407710194587708, "rewards/cosine_scaled_reward/std": 0.3784065842628479, "rewards/repetition_penalty_reward/mean": -0.12474842369556427, "rewards/repetition_penalty_reward/std": 0.04072347283363342, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.92578125, "rewards/reward_reference/std": 0.2626400291919708, "step": 422 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.46875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4069.0, "completions/mean_length": 3297.40234375, "completions/mean_terminated_length": 3240.59814453125, "completions/min_length": 2336.0, "completions/min_terminated_length": 2336.0, "epoch": 0.09024, "frac_reward_zero_std": 0.0, "grad_norm": 0.0915771871805191, "learning_rate": 1e-06, "loss": -0.0279, "num_tokens": 326292292.0, "reward": 2.5805721282958984, "reward_std": 0.10786420106887817, "rewards/cosine_scaled_reward/mean": 0.812868595123291, "rewards/cosine_scaled_reward/std": 0.40911364555358887, "rewards/repetition_penalty_reward/mean": -0.13542136549949646, "rewards/repetition_penalty_reward/std": 0.04858802258968353, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.90625, "rewards/reward_reference/std": 0.2920515835285187, "step": 423 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.3125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4079.0, "completions/mean_length": 3348.2578125, "completions/mean_terminated_length": 3277.95751953125, "completions/min_length": 2299.0, "completions/min_terminated_length": 2299.0, "epoch": 0.09045333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.12993597984313965, "learning_rate": 1e-06, "loss": -0.0219, "num_tokens": 327220986.0, "reward": 2.604248046875, "reward_std": 0.253852903842926, "rewards/cosine_scaled_reward/mean": 0.8276076316833496, "rewards/cosine_scaled_reward/std": 0.3967602252960205, "rewards/repetition_penalty_reward/mean": -0.12804700434207916, "rewards/repetition_penalty_reward/std": 0.04108881205320358, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626226335763931, "rewards/reward_reference/mean": 0.9140625, "rewards/reward_reference/std": 0.28082075715065, "step": 424 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4067.0, "completions/mean_length": 3322.1171875, "completions/mean_terminated_length": 3203.5947265625, "completions/min_length": 2081.0, "completions/min_terminated_length": 2081.0, "epoch": 0.09066666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.13421866297721863, "learning_rate": 1e-06, "loss": -0.0247, "num_tokens": 328094492.0, "reward": 2.5595273971557617, "reward_std": 0.2225823998451233, "rewards/cosine_scaled_reward/mean": 0.794215202331543, "rewards/cosine_scaled_reward/std": 0.4304060935974121, "rewards/repetition_penalty_reward/mean": -0.12921899557113647, "rewards/repetition_penalty_reward/std": 0.05206966772675514, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.89453125, "rewards/reward_reference/std": 0.3077581524848938, "step": 425 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.34375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4090.0, "completions/mean_length": 3315.33984375, "completions/mean_terminated_length": 3245.57861328125, "completions/min_length": 2084.0, "completions/min_terminated_length": 2084.0, "epoch": 0.09088, "frac_reward_zero_std": 0.0, "grad_norm": 0.10778502374887466, "learning_rate": 1e-06, "loss": -0.0441, "num_tokens": 329020015.0, "reward": 2.598879814147949, "reward_std": 0.19090493023395538, "rewards/cosine_scaled_reward/mean": 0.823809027671814, "rewards/cosine_scaled_reward/std": 0.3936808109283447, "rewards/repetition_penalty_reward/mean": -0.13274173438549042, "rewards/repetition_penalty_reward/std": 0.04350915551185608, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.9140625, "rewards/reward_reference/std": 0.28082075715065, "step": 426 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 3340.73046875, "completions/mean_terminated_length": 3236.671142578125, "completions/min_length": 2181.0, "completions/min_terminated_length": 2181.0, "epoch": 0.09109333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.1536858230829239, "learning_rate": 1e-06, "loss": -0.0536, "num_tokens": 329909346.0, "reward": 2.5599093437194824, "reward_std": 0.26450973749160767, "rewards/cosine_scaled_reward/mean": 0.7959344387054443, "rewards/cosine_scaled_reward/std": 0.43361762166023254, "rewards/repetition_penalty_reward/mean": -0.13055622577667236, "rewards/repetition_penalty_reward/std": 0.04218065366148949, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.89453125, "rewards/reward_reference/std": 0.3077581524848938, "step": 427 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.96875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4041.0, "completions/mean_length": 3372.89453125, "completions/mean_terminated_length": 3265.887939453125, "completions/min_length": 2365.0, "completions/min_terminated_length": 2365.0, "epoch": 0.09130666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.08796551823616028, "learning_rate": 1e-06, "loss": -0.0094, "num_tokens": 330795287.0, "reward": 2.514080047607422, "reward_std": 0.1660141944885254, "rewards/cosine_scaled_reward/mean": 0.7753437757492065, "rewards/cosine_scaled_reward/std": 0.46698227524757385, "rewards/repetition_penalty_reward/mean": -0.1370450258255005, "rewards/repetition_penalty_reward/std": 0.055892687290906906, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.87890625, "rewards/reward_reference/std": 0.3268752694129944, "step": 428 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.3125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4090.0, "completions/mean_length": 3394.15625, "completions/mean_terminated_length": 3328.171142578125, "completions/min_length": 2423.0, "completions/min_terminated_length": 2423.0, "epoch": 0.09152, "frac_reward_zero_std": 0.0, "grad_norm": 0.1999899446964264, "learning_rate": 1e-06, "loss": -0.0179, "num_tokens": 331737955.0, "reward": 2.449087142944336, "reward_std": 0.2750634551048279, "rewards/cosine_scaled_reward/mean": 0.7363446950912476, "rewards/cosine_scaled_reward/std": 0.5115602016448975, "rewards/repetition_penalty_reward/mean": -0.12944519519805908, "rewards/repetition_penalty_reward/std": 0.041707854717969894, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626226335763931, "rewards/reward_reference/mean": 0.8515625, "rewards/reward_reference/std": 0.3562295734882355, "step": 429 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.34375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 3360.3046875, "completions/mean_terminated_length": 3294.5615234375, "completions/min_length": 2207.0, "completions/min_terminated_length": 2207.0, "epoch": 0.09173333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.11115996539592743, "learning_rate": 1e-06, "loss": -0.0196, "num_tokens": 332670321.0, "reward": 2.6029627323150635, "reward_std": 0.14713042974472046, "rewards/cosine_scaled_reward/mean": 0.8259959816932678, "rewards/cosine_scaled_reward/std": 0.39926013350486755, "rewards/repetition_penalty_reward/mean": -0.13709580898284912, "rewards/repetition_penalty_reward/std": 0.05000806227326393, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9140625, "rewards/reward_reference/std": 0.28082075715065, "step": 430 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.75, "completions/max_length": 4096.0, "completions/max_terminated_length": 4055.0, "completions/mean_length": 3402.5, "completions/mean_terminated_length": 3274.07421875, "completions/min_length": 2088.0, "completions/min_terminated_length": 2088.0, "epoch": 0.09194666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.14282870292663574, "learning_rate": 1e-06, "loss": -0.0179, "num_tokens": 333535445.0, "reward": 2.4626400470733643, "reward_std": 0.18670928478240967, "rewards/cosine_scaled_reward/mean": 0.7408066987991333, "rewards/cosine_scaled_reward/std": 0.5033049583435059, "rewards/repetition_penalty_reward/mean": -0.13363544642925262, "rewards/repetition_penalty_reward/std": 0.04304096847772598, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.85546875, "rewards/reward_reference/std": 0.35231640934944153, "step": 431 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.9375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4066.0, "completions/mean_length": 3392.3984375, "completions/mean_terminated_length": 3284.6396484375, "completions/min_length": 2074.0, "completions/min_terminated_length": 2074.0, "epoch": 0.09216, "frac_reward_zero_std": 0.0, "grad_norm": 0.04934278875589371, "learning_rate": 1e-06, "loss": -0.0051, "num_tokens": 334433423.0, "reward": 2.657198667526245, "reward_std": 0.08821704983711243, "rewards/cosine_scaled_reward/mean": 0.8536912798881531, "rewards/cosine_scaled_reward/std": 0.3593747615814209, "rewards/repetition_penalty_reward/mean": -0.12696149945259094, "rewards/repetition_penalty_reward/std": 0.0377291664481163, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 432 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.8125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4077.0, "completions/mean_length": 3376.1015625, "completions/mean_terminated_length": 3250.614501953125, "completions/min_length": 2119.0, "completions/min_terminated_length": 2119.0, "epoch": 0.09237333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.1504306197166443, "learning_rate": 1e-06, "loss": -0.0269, "num_tokens": 335293601.0, "reward": 2.3855834007263184, "reward_std": 0.20364665985107422, "rewards/cosine_scaled_reward/mean": 0.6995294094085693, "rewards/cosine_scaled_reward/std": 0.5381035804748535, "rewards/repetition_penalty_reward/mean": -0.13816462457180023, "rewards/repetition_penalty_reward/std": 0.04862834885716438, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.82421875, "rewards/reward_reference/std": 0.3813795745372772, "step": 433 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.8125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 3470.88671875, "completions/mean_terminated_length": 3361.921875, "completions/min_length": 2334.0, "completions/min_terminated_length": 2334.0, "epoch": 0.09258666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.10625244677066803, "learning_rate": 1e-06, "loss": -0.0035, "num_tokens": 336194808.0, "reward": 2.586014747619629, "reward_std": 0.1232042908668518, "rewards/cosine_scaled_reward/mean": 0.8126987218856812, "rewards/cosine_scaled_reward/std": 0.43467089533805847, "rewards/repetition_penalty_reward/mean": -0.12512150406837463, "rewards/repetition_penalty_reward/std": 0.0379415787756443, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.8984375, "rewards/reward_reference/std": 0.3026638329029083, "step": 434 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.59375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 3534.79296875, "completions/mean_terminated_length": 3415.1044921875, "completions/min_length": 2386.0, "completions/min_terminated_length": 2386.0, "epoch": 0.0928, "frac_reward_zero_std": 0.0, "grad_norm": 0.13819520175457, "learning_rate": 1e-06, "loss": -0.014, "num_tokens": 337078519.0, "reward": 2.671761989593506, "reward_std": 0.12655839323997498, "rewards/cosine_scaled_reward/mean": 0.8669031858444214, "rewards/cosine_scaled_reward/std": 0.3638756275177002, "rewards/repetition_penalty_reward/mean": -0.1256098747253418, "rewards/repetition_penalty_reward/std": 0.04022699594497681, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 435 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.6875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4075.0, "completions/mean_length": 3376.28515625, "completions/mean_terminated_length": 3235.03271484375, "completions/min_length": 2154.0, "completions/min_terminated_length": 2154.0, "epoch": 0.09301333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.16057516634464264, "learning_rate": 1e-06, "loss": -0.0324, "num_tokens": 337933628.0, "reward": 2.4101791381835938, "reward_std": 0.22112424671649933, "rewards/cosine_scaled_reward/mean": 0.7129418849945068, "rewards/cosine_scaled_reward/std": 0.5243740081787109, "rewards/repetition_penalty_reward/mean": -0.12385663390159607, "rewards/repetition_penalty_reward/std": 0.04228730499744415, "rewards/reward_format/mean": 0.981249988079071, "rewards/reward_format/std": 0.12126781791448593, "rewards/reward_reference/mean": 0.83984375, "rewards/reward_reference/std": 0.36746934056282043, "step": 436 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 3468.8125, "completions/mean_terminated_length": 3391.78955078125, "completions/min_length": 2311.0, "completions/min_terminated_length": 2311.0, "epoch": 0.09322666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.09278582036495209, "learning_rate": 1e-06, "loss": -0.0249, "num_tokens": 338874828.0, "reward": 2.712590217590332, "reward_std": 0.11666490882635117, "rewards/cosine_scaled_reward/mean": 0.8886799812316895, "rewards/cosine_scaled_reward/std": 0.3155267834663391, "rewards/repetition_penalty_reward/mean": -0.1253085881471634, "rewards/repetition_penalty_reward/std": 0.03902411833405495, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.94921875, "rewards/reward_reference/std": 0.21998079121112823, "step": 437 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.5625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4079.0, "completions/mean_length": 3454.30078125, "completions/mean_terminated_length": 3313.73828125, "completions/min_length": 2340.0, "completions/min_terminated_length": 2340.0, "epoch": 0.09344, "frac_reward_zero_std": 0.0, "grad_norm": 0.16439048945903778, "learning_rate": 1e-06, "loss": -0.0203, "num_tokens": 339738661.0, "reward": 2.550375461578369, "reward_std": 0.22821134328842163, "rewards/cosine_scaled_reward/mean": 0.7951457500457764, "rewards/cosine_scaled_reward/std": 0.4520246088504791, "rewards/repetition_penalty_reward/mean": -0.12523919343948364, "rewards/repetition_penalty_reward/std": 0.041806966066360474, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.88671875, "rewards/reward_reference/std": 0.31755712628364563, "step": 438 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4093.0, "completions/mean_length": 3443.12890625, "completions/mean_terminated_length": 3362.95166015625, "completions/min_length": 2043.0, "completions/min_terminated_length": 2043.0, "epoch": 0.09365333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.11715667694807053, "learning_rate": 1e-06, "loss": -0.0249, "num_tokens": 340669206.0, "reward": 2.6908936500549316, "reward_std": 0.14655840396881104, "rewards/cosine_scaled_reward/mean": 0.8710836172103882, "rewards/cosine_scaled_reward/std": 0.3402611017227173, "rewards/repetition_penalty_reward/mean": -0.12159596383571625, "rewards/repetition_penalty_reward/std": 0.043441347777843475, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.94140625, "rewards/reward_reference/std": 0.23532284796237946, "step": 439 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.65625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4072.0, "completions/mean_length": 3457.76171875, "completions/mean_terminated_length": 3328.91552734375, "completions/min_length": 2327.0, "completions/min_terminated_length": 2327.0, "epoch": 0.09386666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.1541212946176529, "learning_rate": 1e-06, "loss": -0.0148, "num_tokens": 341539165.0, "reward": 2.5942134857177734, "reward_std": 0.22023794054985046, "rewards/cosine_scaled_reward/mean": 0.8143563866615295, "rewards/cosine_scaled_reward/std": 0.42871448397636414, "rewards/repetition_penalty_reward/mean": -0.11936160922050476, "rewards/repetition_penalty_reward/std": 0.04116738215088844, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.90234375, "rewards/reward_reference/std": 0.29743078351020813, "step": 440 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4076.0, "completions/mean_length": 3334.8203125, "completions/mean_terminated_length": 3233.77880859375, "completions/min_length": 1930.0, "completions/min_terminated_length": 1930.0, "epoch": 0.09408, "frac_reward_zero_std": 0.0, "grad_norm": 0.1119907945394516, "learning_rate": 1e-06, "loss": -0.0044, "num_tokens": 342422431.0, "reward": 2.6673107147216797, "reward_std": 0.16998052597045898, "rewards/cosine_scaled_reward/mean": 0.8509924411773682, "rewards/cosine_scaled_reward/std": 0.3510325849056244, "rewards/repetition_penalty_reward/mean": -0.11805684864521027, "rewards/repetition_penalty_reward/std": 0.041798681020736694, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.9375, "rewards/reward_reference/std": 0.24253563582897186, "step": 441 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 3551.625, "completions/mean_terminated_length": 3373.927490234375, "completions/min_length": 2225.0, "completions/min_terminated_length": 2225.0, "epoch": 0.09429333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.155339315533638, "learning_rate": 1e-06, "loss": -0.0634, "num_tokens": 343229003.0, "reward": 2.412459373474121, "reward_std": 0.36707803606987, "rewards/cosine_scaled_reward/mean": 0.7190250158309937, "rewards/cosine_scaled_reward/std": 0.5402841567993164, "rewards/repetition_penalty_reward/mean": -0.1323469579219818, "rewards/repetition_penalty_reward/std": 0.05035141855478287, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.83203125, "rewards/reward_reference/std": 0.3745708465576172, "step": 442 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 3416.41015625, "completions/mean_terminated_length": 3305.204345703125, "completions/min_length": 1969.0, "completions/min_terminated_length": 1969.0, "epoch": 0.09450666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.1449149250984192, "learning_rate": 1e-06, "loss": -0.0217, "num_tokens": 344119452.0, "reward": 2.6318774223327637, "reward_std": 0.20957745611667633, "rewards/cosine_scaled_reward/mean": 0.8327311873435974, "rewards/cosine_scaled_reward/std": 0.39399969577789307, "rewards/repetition_penalty_reward/mean": -0.11569737643003464, "rewards/repetition_penalty_reward/std": 0.039580464363098145, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.91796875, "rewards/reward_reference/std": 0.2749498784542084, "step": 443 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.59375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 3448.515625, "completions/mean_terminated_length": 3310.4267578125, "completions/min_length": 2306.0, "completions/min_terminated_length": 2306.0, "epoch": 0.09472, "frac_reward_zero_std": 0.0, "grad_norm": 0.14609962701797485, "learning_rate": 1e-06, "loss": -0.0273, "num_tokens": 344981920.0, "reward": 2.466052293777466, "reward_std": 0.21688896417617798, "rewards/cosine_scaled_reward/mean": 0.7364804744720459, "rewards/cosine_scaled_reward/std": 0.5121752619743347, "rewards/repetition_penalty_reward/mean": -0.11808443069458008, "rewards/repetition_penalty_reward/std": 0.03446255251765251, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.84765625, "rewards/reward_reference/std": 0.3600577116012573, "step": 444 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 3618.640625, "completions/mean_terminated_length": 3469.31298828125, "completions/min_length": 2316.0, "completions/min_terminated_length": 2316.0, "epoch": 0.09493333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.12748895585536957, "learning_rate": 1e-06, "loss": -0.0366, "num_tokens": 345826308.0, "reward": 2.4843618869781494, "reward_std": 0.23571541905403137, "rewards/cosine_scaled_reward/mean": 0.7587255239486694, "rewards/cosine_scaled_reward/std": 0.5151540040969849, "rewards/repetition_penalty_reward/mean": -0.12670737504959106, "rewards/repetition_penalty_reward/std": 0.04888736084103584, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.85546875, "rewards/reward_reference/std": 0.35231640934944153, "step": 445 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 3355.34765625, "completions/mean_terminated_length": 3249.540283203125, "completions/min_length": 2222.0, "completions/min_terminated_length": 2222.0, "epoch": 0.09514666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.12772390246391296, "learning_rate": 1e-06, "loss": -0.0216, "num_tokens": 346719945.0, "reward": 2.671082019805908, "reward_std": 0.11974822729825974, "rewards/cosine_scaled_reward/mean": 0.8572664260864258, "rewards/cosine_scaled_reward/std": 0.34457895159721375, "rewards/repetition_penalty_reward/mean": -0.11977824568748474, "rewards/repetition_penalty_reward/std": 0.044986020773649216, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 446 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.3125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4093.0, "completions/mean_length": 3482.47265625, "completions/mean_terminated_length": 3318.46044921875, "completions/min_length": 2169.0, "completions/min_terminated_length": 2169.0, "epoch": 0.09536, "frac_reward_zero_std": 0.0, "grad_norm": 0.22144626080989838, "learning_rate": 1e-06, "loss": 0.0151, "num_tokens": 347549682.0, "reward": 2.5736947059631348, "reward_std": 0.2570610046386719, "rewards/cosine_scaled_reward/mean": 0.81336510181427, "rewards/cosine_scaled_reward/std": 0.43019360303878784, "rewards/repetition_penalty_reward/mean": -0.1318579763174057, "rewards/repetition_penalty_reward/std": 0.046924762427806854, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.8984375, "rewards/reward_reference/std": 0.3026638329029083, "step": 447 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.65625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4067.0, "completions/mean_length": 3514.81640625, "completions/mean_terminated_length": 3397.48828125, "completions/min_length": 2114.0, "completions/min_terminated_length": 2114.0, "epoch": 0.09557333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.13463345170021057, "learning_rate": 1e-06, "loss": -0.0252, "num_tokens": 348440599.0, "reward": 2.5912740230560303, "reward_std": 0.15143904089927673, "rewards/cosine_scaled_reward/mean": 0.8223309516906738, "rewards/cosine_scaled_reward/std": 0.4279668629169464, "rewards/repetition_penalty_reward/mean": -0.12402550876140594, "rewards/repetition_penalty_reward/std": 0.04709490016102791, "rewards/reward_format/mean": 0.9906250238418579, "rewards/reward_format/std": 0.08626226335763931, "rewards/reward_reference/mean": 0.90234375, "rewards/reward_reference/std": 0.29743078351020813, "step": 448 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.71875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 3559.171875, "completions/mean_terminated_length": 3456.800048828125, "completions/min_length": 1997.0, "completions/min_terminated_length": 1997.0, "epoch": 0.09578666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.10657992213964462, "learning_rate": 1e-06, "loss": -0.018, "num_tokens": 349347991.0, "reward": 2.6517934799194336, "reward_std": 0.20156292617321014, "rewards/cosine_scaled_reward/mean": 0.8590583205223083, "rewards/cosine_scaled_reward/std": 0.38307467103004456, "rewards/repetition_penalty_reward/mean": -0.13304612040519714, "rewards/repetition_penalty_reward/std": 0.044906023889780045, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.92578125, "rewards/reward_reference/std": 0.2626400291919708, "step": 449 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.0, "completions/max_length": 4096.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 3457.07421875, "completions/mean_terminated_length": 3365.79931640625, "completions/min_length": 2277.0, "completions/min_terminated_length": 2277.0, "epoch": 0.096, "frac_reward_zero_std": 0.0, "grad_norm": 0.1106138601899147, "learning_rate": 1e-06, "loss": -0.0085, "num_tokens": 350266990.0, "reward": 2.6805636882781982, "reward_std": 0.16519448161125183, "rewards/cosine_scaled_reward/mean": 0.8684450387954712, "rewards/cosine_scaled_reward/std": 0.34893450140953064, "rewards/repetition_penalty_reward/mean": -0.12147516012191772, "rewards/repetition_penalty_reward/std": 0.03809197247028351, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 450 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.40625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 3519.98046875, "completions/mean_terminated_length": 3376.677978515625, "completions/min_length": 2497.0, "completions/min_terminated_length": 2497.0, "epoch": 0.09621333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.1882275491952896, "learning_rate": 1e-06, "loss": -0.0435, "num_tokens": 351124233.0, "reward": 2.6901729106903076, "reward_std": 0.1846616119146347, "rewards/cosine_scaled_reward/mean": 0.8774366974830627, "rewards/cosine_scaled_reward/std": 0.3409196436405182, "rewards/repetition_penalty_reward/mean": -0.1286701261997223, "rewards/repetition_penalty_reward/std": 0.04067756608128548, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.94140625, "rewards/reward_reference/std": 0.23532284796237946, "step": 451 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.4375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4081.0, "completions/mean_length": 3531.19921875, "completions/mean_terminated_length": 3394.111572265625, "completions/min_length": 2322.0, "completions/min_terminated_length": 2322.0, "epoch": 0.09642666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.13614989817142487, "learning_rate": 1e-06, "loss": -0.0721, "num_tokens": 351979164.0, "reward": 2.565354108810425, "reward_std": 0.22787414491176605, "rewards/cosine_scaled_reward/mean": 0.8094824552536011, "rewards/cosine_scaled_reward/std": 0.4485189914703369, "rewards/repetition_penalty_reward/mean": -0.13865964114665985, "rewards/repetition_penalty_reward/std": 0.04454709589481354, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.89453125, "rewards/reward_reference/std": 0.3077581524848938, "step": 452 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 3545.9140625, "completions/mean_terminated_length": 3405.6962890625, "completions/min_length": 2251.0, "completions/min_terminated_length": 2251.0, "epoch": 0.09664, "frac_reward_zero_std": 0.0, "grad_norm": 0.15346837043762207, "learning_rate": 1e-06, "loss": 0.0119, "num_tokens": 352838342.0, "reward": 2.4719109535217285, "reward_std": 0.26720231771469116, "rewards/cosine_scaled_reward/mean": 0.7608464956283569, "rewards/cosine_scaled_reward/std": 0.5024836659431458, "rewards/repetition_penalty_reward/mean": -0.14206044375896454, "rewards/repetition_penalty_reward/std": 0.042888157069683075, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.859375, "rewards/reward_reference/std": 0.3483152687549591, "step": 453 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.34375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 3482.3359375, "completions/mean_terminated_length": 3322.1181640625, "completions/min_length": 1981.0, "completions/min_terminated_length": 1981.0, "epoch": 0.09685333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.12531593441963196, "learning_rate": 1e-06, "loss": 0.0042, "num_tokens": 353669904.0, "reward": 2.4537057876586914, "reward_std": 0.291486918926239, "rewards/cosine_scaled_reward/mean": 0.747859537601471, "rewards/cosine_scaled_reward/std": 0.5058099031448364, "rewards/repetition_penalty_reward/mean": -0.1496226191520691, "rewards/repetition_penalty_reward/std": 0.04818349331617355, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.85546875, "rewards/reward_reference/std": 0.35231640934944153, "step": 454 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.15625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 3583.19921875, "completions/mean_terminated_length": 3429.619140625, "completions/min_length": 2455.0, "completions/min_terminated_length": 2455.0, "epoch": 0.09706666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.1776491403579712, "learning_rate": 1e-06, "loss": -0.0616, "num_tokens": 354501715.0, "reward": 2.485987901687622, "reward_std": 0.270699143409729, "rewards/cosine_scaled_reward/mean": 0.7776684761047363, "rewards/cosine_scaled_reward/std": 0.48990023136138916, "rewards/repetition_penalty_reward/mean": -0.1526181846857071, "rewards/repetition_penalty_reward/std": 0.04843423143029213, "rewards/reward_format/mean": 0.9937499761581421, "rewards/reward_format/std": 0.0705718919634819, "rewards/reward_reference/mean": 0.8671875, "rewards/reward_reference/std": 0.3400367796421051, "step": 455 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.90625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 3455.9609375, "completions/mean_terminated_length": 3354.597412109375, "completions/min_length": 2270.0, "completions/min_terminated_length": 2270.0, "epoch": 0.09728, "frac_reward_zero_std": 0.0, "grad_norm": 0.11334540694952011, "learning_rate": 1e-06, "loss": -0.0152, "num_tokens": 355403069.0, "reward": 2.5903708934783936, "reward_std": 0.16302460432052612, "rewards/cosine_scaled_reward/mean": 0.8281469941139221, "rewards/cosine_scaled_reward/std": 0.4118127226829529, "rewards/repetition_penalty_reward/mean": -0.14793241024017334, "rewards/repetition_penalty_reward/std": 0.04326983541250229, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.91015625, "rewards/reward_reference/std": 0.2865179479122162, "step": 456 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 3528.26171875, "completions/mean_terminated_length": 3383.544189453125, "completions/min_length": 2283.0, "completions/min_terminated_length": 2283.0, "epoch": 0.09749333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.12938834726810455, "learning_rate": 1e-06, "loss": -0.037, "num_tokens": 356256492.0, "reward": 2.552910327911377, "reward_std": 0.26627570390701294, "rewards/cosine_scaled_reward/mean": 0.8132476806640625, "rewards/cosine_scaled_reward/std": 0.4391486346721649, "rewards/repetition_penalty_reward/mean": -0.15877488255500793, "rewards/repetition_penalty_reward/std": 0.06030600890517235, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.8984375, "rewards/reward_reference/std": 0.3026638329029083, "step": 457 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.75, "completions/max_length": 4096.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 3463.6875, "completions/mean_terminated_length": 3346.592529296875, "completions/min_length": 2197.0, "completions/min_terminated_length": 2197.0, "epoch": 0.09770666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.17623008787631989, "learning_rate": 1e-06, "loss": -0.0468, "num_tokens": 357130112.0, "reward": 2.471355438232422, "reward_std": 0.32545268535614014, "rewards/cosine_scaled_reward/mean": 0.7646437883377075, "rewards/cosine_scaled_reward/std": 0.48779138922691345, "rewards/repetition_penalty_reward/mean": -0.160475954413414, "rewards/repetition_penalty_reward/std": 0.04666691645979881, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.8671875, "rewards/reward_reference/std": 0.3400367796421051, "step": 458 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.75, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 3483.28125, "completions/mean_terminated_length": 3369.81494140625, "completions/min_length": 2204.0, "completions/min_terminated_length": 2204.0, "epoch": 0.09792, "frac_reward_zero_std": 0.0, "grad_norm": 0.13649289309978485, "learning_rate": 1e-06, "loss": -0.0153, "num_tokens": 358018724.0, "reward": 2.6438474655151367, "reward_std": 0.1534462422132492, "rewards/cosine_scaled_reward/mean": 0.8666955232620239, "rewards/cosine_scaled_reward/std": 0.3531719744205475, "rewards/repetition_penalty_reward/mean": -0.15644192695617676, "rewards/repetition_penalty_reward/std": 0.04818189516663551, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 459 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.40625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 3496.74609375, "completions/mean_terminated_length": 3347.663330078125, "completions/min_length": 2117.0, "completions/min_terminated_length": 2117.0, "epoch": 0.09813333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.1073637306690216, "learning_rate": 1e-06, "loss": -0.0394, "num_tokens": 358869947.0, "reward": 2.559465169906616, "reward_std": 0.23732900619506836, "rewards/cosine_scaled_reward/mean": 0.8185771703720093, "rewards/cosine_scaled_reward/std": 0.42788994312286377, "rewards/repetition_penalty_reward/mean": -0.1583307683467865, "rewards/repetition_penalty_reward/std": 0.06375561654567719, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.90234375, "rewards/reward_reference/std": 0.29743078351020813, "step": 460 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 3493.02734375, "completions/mean_terminated_length": 3367.882080078125, "completions/min_length": 2337.0, "completions/min_terminated_length": 2337.0, "epoch": 0.09834666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.13014236092567444, "learning_rate": 1e-06, "loss": -0.0377, "num_tokens": 359758530.0, "reward": 2.6258108615875244, "reward_std": 0.15342065691947937, "rewards/cosine_scaled_reward/mean": 0.8476953506469727, "rewards/cosine_scaled_reward/std": 0.3844573199748993, "rewards/repetition_penalty_reward/mean": -0.1437593698501587, "rewards/repetition_penalty_reward/std": 0.04323843494057655, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.921875, "rewards/reward_reference/std": 0.26889389753341675, "step": 461 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4090.0, "completions/mean_length": 3481.15625, "completions/mean_terminated_length": 3405.649169921875, "completions/min_length": 2295.0, "completions/min_terminated_length": 2295.0, "epoch": 0.09856, "frac_reward_zero_std": 0.0, "grad_norm": 0.11288342624902725, "learning_rate": 1e-06, "loss": -0.0204, "num_tokens": 360691230.0, "reward": 2.718127727508545, "reward_std": 0.125985786318779, "rewards/cosine_scaled_reward/mean": 0.9069023132324219, "rewards/cosine_scaled_reward/std": 0.2774904668331146, "rewards/repetition_penalty_reward/mean": -0.14971214532852173, "rewards/repetition_penalty_reward/std": 0.03843717649579048, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.9609375, "rewards/reward_reference/std": 0.19412322342395782, "step": 462 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.34375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4083.0, "completions/mean_length": 3515.06640625, "completions/mean_terminated_length": 3363.39404296875, "completions/min_length": 2279.0, "completions/min_terminated_length": 2279.0, "epoch": 0.09877333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.13893939554691315, "learning_rate": 1e-06, "loss": 0.0081, "num_tokens": 361531383.0, "reward": 2.519890308380127, "reward_std": 0.2613111436367035, "rewards/cosine_scaled_reward/mean": 0.7902860641479492, "rewards/cosine_scaled_reward/std": 0.4658883213996887, "rewards/repetition_penalty_reward/mean": -0.1532084196805954, "rewards/repetition_penalty_reward/std": 0.0518529936671257, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.8828125, "rewards/reward_reference/std": 0.3222736418247223, "step": 463 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.71875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4090.0, "completions/mean_length": 3379.55859375, "completions/mean_terminated_length": 3242.934814453125, "completions/min_length": 1778.0, "completions/min_terminated_length": 1778.0, "epoch": 0.09898666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.14748817682266235, "learning_rate": 1e-06, "loss": -0.0395, "num_tokens": 362391402.0, "reward": 2.564578056335449, "reward_std": 0.20639969408512115, "rewards/cosine_scaled_reward/mean": 0.8055846691131592, "rewards/cosine_scaled_reward/std": 0.4240720868110657, "rewards/repetition_penalty_reward/mean": -0.14335037767887115, "rewards/repetition_penalty_reward/std": 0.03977758437395096, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.90234375, "rewards/reward_reference/std": 0.29743078351020813, "step": 464 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.21875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 3315.11328125, "completions/mean_terminated_length": 3230.601806640625, "completions/min_length": 1942.0, "completions/min_terminated_length": 1942.0, "epoch": 0.0992, "frac_reward_zero_std": 0.0, "grad_norm": 0.09585642069578171, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 363297903.0, "reward": 2.680453300476074, "reward_std": 0.13919416069984436, "rewards/cosine_scaled_reward/mean": 0.8720101118087769, "rewards/cosine_scaled_reward/std": 0.308190256357193, "rewards/repetition_penalty_reward/mean": -0.13765066862106323, "rewards/repetition_penalty_reward/std": 0.0456363707780838, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.94921875, "rewards/reward_reference/std": 0.21998079121112823, "step": 465 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.25, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 3323.46484375, "completions/mean_terminated_length": 3243.54736328125, "completions/min_length": 1967.0, "completions/min_terminated_length": 1967.0, "epoch": 0.09941333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.11453945189714432, "learning_rate": 1e-06, "loss": -0.0383, "num_tokens": 364204806.0, "reward": 2.5500752925872803, "reward_std": 0.1567349135875702, "rewards/cosine_scaled_reward/mean": 0.7999416589736938, "rewards/cosine_scaled_reward/std": 0.4272773563861847, "rewards/repetition_penalty_reward/mean": -0.145178884267807, "rewards/repetition_penalty_reward/std": 0.04210899397730827, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.8984375, "rewards/reward_reference/std": 0.3026638329029083, "step": 466 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.53125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 3424.1484375, "completions/mean_terminated_length": 3273.06201171875, "completions/min_length": 2276.0, "completions/min_terminated_length": 2276.0, "epoch": 0.09962666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 0.14379972219467163, "learning_rate": 1e-06, "loss": -0.0218, "num_tokens": 365051776.0, "reward": 2.4092652797698975, "reward_std": 0.253988653421402, "rewards/cosine_scaled_reward/mean": 0.7186833620071411, "rewards/cosine_scaled_reward/std": 0.5256325602531433, "rewards/repetition_penalty_reward/mean": -0.1461367905139923, "rewards/repetition_penalty_reward/std": 0.050273049622774124, "rewards/reward_format/mean": 0.996874988079071, "rewards/reward_format/std": 0.05000000447034836, "rewards/reward_reference/mean": 0.83984375, "rewards/reward_reference/std": 0.36746934056282043, "step": 467 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -5.71875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 3387.23046875, "completions/mean_terminated_length": 3252.06982421875, "completions/min_length": 2080.0, "completions/min_terminated_length": 2080.0, "epoch": 0.09984, "frac_reward_zero_std": 0.0, "grad_norm": 0.12260466814041138, "learning_rate": 1e-06, "loss": -0.022, "num_tokens": 365908611.0, "reward": 2.4367387294769287, "reward_std": 0.1926591992378235, "rewards/cosine_scaled_reward/mean": 0.7306523323059082, "rewards/cosine_scaled_reward/std": 0.5114856362342834, "rewards/repetition_penalty_reward/mean": -0.14156997203826904, "rewards/repetition_penalty_reward/std": 0.04533535614609718, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.84765625, "rewards/reward_reference/std": 0.3600577116012573, "step": 468 }, { "clip_ratio": 0.0, "completions/clipped_ratio": -6.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 3267.3125, "completions/mean_terminated_length": 3161.44482421875, "completions/min_length": 1937.0, "completions/min_terminated_length": 1937.0, "epoch": 0.10005333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.10559126734733582, "learning_rate": 1e-06, "loss": -0.0135, "num_tokens": 366791167.0, "reward": 2.658348560333252, "reward_std": 0.13092412054538727, "rewards/cosine_scaled_reward/mean": 0.8457194566726685, "rewards/cosine_scaled_reward/std": 0.3431178033351898, "rewards/repetition_penalty_reward/mean": -0.12096454203128815, "rewards/repetition_penalty_reward/std": 0.03693599998950958, "rewards/reward_format/mean": 1.0, "rewards/reward_format/std": 0.0, "rewards/reward_reference/mean": 0.93359375, "rewards/reward_reference/std": 0.24947863817214966, "step": 469 } ], "logging_steps": 1, "max_steps": 4688, "num_input_tokens_seen": 366791167, "num_train_epochs": 1, "save_steps": 469, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }