diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -2,7 +2,7 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.36, + "epoch": 0.18, "eval_steps": 500, "global_step": 450, "is_hyper_param_search": false, @@ -11,4512 +11,4512 @@ "log_history": [ { "clip_ratio": 0.0, - "completion_length": 328.68490409851074, - "epoch": 0.0016, - "grad_norm": 3.912095069885254, + "completion_length": 294.14584255218506, + "epoch": 0.0008, + "grad_norm": 3.741899013519287, "kl": 0.0, - "learning_rate": 2.222222222222222e-08, - "loss": 0.5058, - "num_tokens": 224519.0, - "reward": -0.46514276787638664, - "reward_std": 0.04444521979894489, - "rewards/SMILES_validity_reward": -0.59166669100523, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.007812500232830644, - "rewards/reasoning_steps_reward": 0.0434027785086073, - "rewards/repetition_penalty_reward": -0.07269583910237998, - "rewards/smiles_len_reward": -0.003906250116415322, + "learning_rate": 7.142857142857142e-08, + "loss": 0.1235, + "num_tokens": 77390.0, + "reward": -0.44646846456453204, + "reward_std": 0.05219822392973583, + "rewards/SMILES_validity_reward": -0.5666666901670396, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.08333333698101342, + "rewards/repetition_penalty_reward": -0.06572717940434813, + "rewards/smiles_len_reward": -0.015625000465661287, "rewards/tag_count_reward": 0.0, "step": 2 }, { "clip_ratio": 0.0, - "completion_length": 307.9791736602783, - "epoch": 0.0032, - "grad_norm": 4.011274814605713, - "kl": 0.0010592937469482422, - "learning_rate": 4.444444444444444e-08, - "loss": 0.5611, - "num_tokens": 441087.0, - "reward": -0.46445096656680107, - "reward_std": 0.046506868267897516, - "rewards/SMILES_validity_reward": -0.587500024586916, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.0052083334885537624, - "rewards/reasoning_steps_reward": 0.029513889166992158, - "rewards/repetition_penalty_reward": -0.06998793449020013, - "rewards/smiles_len_reward": -0.007812500232830644, - "rewards/tag_count_reward": 0.0006510416860692203, + "completion_length": 314.82292652130127, + "epoch": 0.0016, + "grad_norm": 3.488884925842285, + "kl": 0.0005727410316467285, + "learning_rate": 1.4285714285714285e-07, + "loss": 0.1119, + "num_tokens": 156765.0, + "reward": -0.45767286978662014, + "reward_std": 0.04655311269743834, + "rewards/SMILES_validity_reward": -0.5833333563059568, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.05902777938172221, + "rewards/repetition_penalty_reward": -0.04981988831423223, + "rewards/smiles_len_reward": -0.0052083334885537624, + "rewards/tag_count_reward": 0.0026041667442768812, "step": 4 }, { "clip_ratio": 0.0, - "completion_length": 294.9869842529297, - "epoch": 0.0048, - "grad_norm": 2.4506795406341553, - "kl": 0.0011034011840820312, - "learning_rate": 6.666666666666667e-08, - "loss": 0.3471, - "num_tokens": 652666.0, - "reward": -0.4590358715504408, - "reward_std": 0.058758297411259264, - "rewards/SMILES_validity_reward": -0.583333358168602, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.007812500232830644, - "rewards/reasoning_steps_reward": 0.0442708347691223, - "rewards/repetition_penalty_reward": -0.0656199580989778, - "rewards/smiles_len_reward": -0.009114583604969084, + "completion_length": 329.7812581062317, + "epoch": 0.0024, + "grad_norm": 5.260374069213867, + "kl": 0.0007290840148925781, + "learning_rate": 2.1428571428571426e-07, + "loss": 0.1747, + "num_tokens": 237576.0, + "reward": -0.4684038292616606, + "reward_std": 0.017661503254203126, + "rewards/SMILES_validity_reward": -0.6000000238418579, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.07291666837409139, + "rewards/repetition_penalty_reward": -0.05695587984519079, + "rewards/smiles_len_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 6 }, { "clip_ratio": 0.0, - "completion_length": 286.95313262939453, - "epoch": 0.0064, - "grad_norm": 4.508991241455078, - "kl": 0.001209259033203125, - "learning_rate": 8.888888888888888e-08, - "loss": 0.2718, - "num_tokens": 861160.0, - "reward": -0.4643410835415125, - "reward_std": 0.044357261242112145, - "rewards/SMILES_validity_reward": -0.59166669100523, + "completion_length": 227.34375858306885, + "epoch": 0.0032, + "grad_norm": 4.617069244384766, + "kl": 0.0006937384605407715, + "learning_rate": 2.857142857142857e-07, + "loss": 0.1561, + "num_tokens": 308553.0, + "reward": -0.4692091681063175, + "reward_std": 0.013479128843755461, + "rewards/SMILES_validity_reward": -0.6000000238418579, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.010416666977107525, - "rewards/reasoning_steps_reward": 0.034722222946584225, - "rewards/repetition_penalty_reward": -0.06381095026154071, - "rewards/smiles_len_reward": -0.006510416860692203, - "rewards/tag_count_reward": 0.0026041666860692203, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.04513889038935304, + "rewards/repetition_penalty_reward": -0.037231406051432714, + "rewards/smiles_len_reward": 0.0, + "rewards/tag_count_reward": 0.0, "step": 8 }, { "clip_ratio": 0.0, - "completion_length": 278.7838635444641, - "epoch": 0.008, - "grad_norm": 3.3212454319000244, - "kl": 0.0011568069458007812, - "learning_rate": 1.111111111111111e-07, - "loss": 0.3496, - "num_tokens": 1066517.0, - "reward": -0.4583498015999794, - "reward_std": 0.06702683249022812, - "rewards/SMILES_validity_reward": -0.583333358168602, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.015625000465661287, - "rewards/reasoning_steps_reward": 0.026909722946584225, - "rewards/repetition_penalty_reward": -0.0630204735789448, - "rewards/smiles_len_reward": -0.01092973274353426, - "rewards/tag_count_reward": 0.0, + "completion_length": 290.1770935058594, + "epoch": 0.004, + "grad_norm": 3.2441182136535645, + "kl": 0.0006629228591918945, + "learning_rate": 3.5714285714285716e-07, + "loss": 0.0965, + "num_tokens": 385562.0, + "reward": -0.46172465570271015, + "reward_std": 0.031036741798743606, + "rewards/SMILES_validity_reward": -0.6000000238418579, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.010416666977107525, + "rewards/reasoning_steps_reward": 0.11111111380159855, + "rewards/repetition_penalty_reward": -0.05700426420662552, + "rewards/smiles_len_reward": -0.0052083334885537624, + "rewards/tag_count_reward": 0.0026041667442768812, "step": 10 }, { "clip_ratio": 0.0, - "completion_length": 379.56511402130127, - "epoch": 0.0096, - "grad_norm": 2.4336678981781006, - "kl": 0.0010323524475097656, - "learning_rate": 1.3333333333333334e-07, - "loss": 0.2837, - "num_tokens": 1310574.0, - "reward": -0.461483683437109, - "reward_std": 0.04768646776210517, - "rewards/SMILES_validity_reward": -0.587500024586916, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.0052083334885537624, - "rewards/reasoning_steps_reward": 0.07031250116415322, - "rewards/repetition_penalty_reward": -0.08111370843835175, - "rewards/smiles_len_reward": -0.007812500232830644, - "rewards/tag_count_reward": 0.0006510416860692203, + "completion_length": 316.9166741371155, + "epoch": 0.0048, + "grad_norm": 4.713688850402832, + "kl": 0.0006747245788574219, + "learning_rate": 4.285714285714285e-07, + "loss": 0.1268, + "num_tokens": 465138.0, + "reward": -0.4592954497784376, + "reward_std": 0.041717870262800716, + "rewards/SMILES_validity_reward": -0.5833333563059568, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.027777778217568994, + "rewards/repetition_penalty_reward": -0.053024730485049076, + "rewards/smiles_len_reward": -0.0052083334885537624, + "rewards/tag_count_reward": 0.02083333395421505, "step": 12 }, { "clip_ratio": 0.0, - "completion_length": 372.9218864440918, - "epoch": 0.0112, - "grad_norm": 2.4363439083099365, - "kl": 0.0010030269622802734, - "learning_rate": 1.5555555555555556e-07, - "loss": 0.5532, - "num_tokens": 1552080.0, - "reward": -0.4688709732145071, - "reward_std": 0.03987264301395044, - "rewards/SMILES_validity_reward": -0.59166669100523, + "completion_length": 291.4479274749756, + "epoch": 0.0056, + "grad_norm": 3.528653621673584, + "kl": 0.0005544424057006836, + "learning_rate": 5e-07, + "loss": 0.0348, + "num_tokens": 542269.0, + "reward": -0.46717050671577454, + "reward_std": 0.020807951921597123, + "rewards/SMILES_validity_reward": -0.6000000238418579, "rewards/cosine_scaled_reward": -0.4999987781047821, "rewards/format_reward": 0.0, - "rewards/reasoning_steps_reward": 0.04427083511836827, - "rewards/repetition_penalty_reward": -0.08675744908396155, + "rewards/reasoning_steps_reward": 0.07986111333593726, + "rewards/repetition_penalty_reward": -0.05156701215310022, "rewards/smiles_len_reward": -0.0052083334885537624, - "rewards/tag_count_reward": 0.0006510416860692203, + "rewards/tag_count_reward": 0.0052083334885537624, "step": 14 }, { "clip_ratio": 0.0, - "completion_length": 296.2369899749756, - "epoch": 0.0128, - "grad_norm": 3.011733293533325, - "kl": 0.0010938644409179688, - "learning_rate": 1.7777777777777776e-07, - "loss": 0.4054, - "num_tokens": 1764139.0, - "reward": -0.4688110891729593, - "reward_std": 0.03221098065841943, - "rewards/SMILES_validity_reward": -0.5958333574235439, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.0052083334885537624, - "rewards/reasoning_steps_reward": 0.03906250122236088, - "rewards/repetition_penalty_reward": -0.06805969902779907, - "rewards/smiles_len_reward": -0.003906250116415322, - "rewards/tag_count_reward": 0.0, + "completion_length": 268.8750042915344, + "epoch": 0.0064, + "grad_norm": 3.8491642475128174, + "kl": 0.0007927417755126953, + "learning_rate": 4.999740409224932e-07, + "loss": 0.139, + "num_tokens": 617233.0, + "reward": -0.4442032668739557, + "reward_std": 0.06988084255863214, + "rewards/SMILES_validity_reward": -0.5666666887700558, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.07986111240461469, + "rewards/repetition_penalty_reward": -0.047415557550266385, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.0026041667442768812, "step": 16 }, { "clip_ratio": 0.0, - "completion_length": 327.020845413208, - "epoch": 0.0144, - "grad_norm": 2.7837798595428467, - "kl": 0.0011706352233886719, - "learning_rate": 2e-07, - "loss": 0.4862, - "num_tokens": 1988019.0, - "reward": -0.47093865275382996, - "reward_std": 0.026894541137153283, - "rewards/SMILES_validity_reward": -0.5958333574235439, + "completion_length": 267.59375953674316, + "epoch": 0.0072, + "grad_norm": 4.2977681159973145, + "kl": 0.0007317066192626953, + "learning_rate": 4.998961690809627e-07, + "loss": 0.038, + "num_tokens": 692074.0, + "reward": -0.4525126740336418, + "reward_std": 0.05021150383981876, + "rewards/SMILES_validity_reward": -0.5833333563059568, "rewards/cosine_scaled_reward": -0.4999987781047821, "rewards/format_reward": 0.0, - "rewards/reasoning_steps_reward": 0.03732639015652239, - "rewards/repetition_penalty_reward": -0.07132309174630791, + "rewards/reasoning_steps_reward": 0.11458333558402956, + "rewards/repetition_penalty_reward": -0.053773476160131395, "rewards/smiles_len_reward": -0.0052083334885537624, - "rewards/tag_count_reward": 0.0006510416860692203, + "rewards/tag_count_reward": 0.0026041667442768812, "step": 18 }, { "clip_ratio": 0.0, - "completion_length": 358.48177909851074, - "epoch": 0.016, - "grad_norm": 2.6016883850097656, - "kl": 0.0010716915130615234, - "learning_rate": 2.222222222222222e-07, - "loss": 0.5166, - "num_tokens": 2223980.0, - "reward": -0.4630376435816288, - "reward_std": 0.05124675569823012, - "rewards/SMILES_validity_reward": -0.587500024586916, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.007812500232830644, - "rewards/reasoning_steps_reward": 0.03819444519467652, - "rewards/repetition_penalty_reward": -0.07320236065424979, - "rewards/smiles_len_reward": -0.006957859538488265, - "rewards/tag_count_reward": 0.0006510416860692203, + "completion_length": 263.87500870227814, + "epoch": 0.008, + "grad_norm": 3.244373321533203, + "kl": 0.0007187128067016602, + "learning_rate": 4.997664006472578e-07, + "loss": 0.0803, + "num_tokens": 766558.0, + "reward": -0.46448596753180027, + "reward_std": 0.02012665472284425, + "rewards/SMILES_validity_reward": -0.6000000238418579, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.09375000186264515, + "rewards/repetition_penalty_reward": -0.04381883708992973, + "rewards/smiles_len_reward": 0.0, + "rewards/tag_count_reward": 0.0052083334885537624, "step": 20 }, { "clip_ratio": 0.0, - "completion_length": 375.92709732055664, - "epoch": 0.0176, - "grad_norm": 2.962827682495117, - "kl": 0.0009744167327880859, - "learning_rate": 2.4444444444444445e-07, - "loss": 0.6561, - "num_tokens": 2466640.0, - "reward": -0.4662686139345169, - "reward_std": 0.046040316578000784, - "rewards/SMILES_validity_reward": -0.59166669100523, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.007812500232830644, - "rewards/reasoning_steps_reward": 0.04079861199716106, - "rewards/repetition_penalty_reward": -0.07874601823277771, - "rewards/smiles_len_reward": -0.007812500232830644, - "rewards/tag_count_reward": 0.0013020833721384406, + "completion_length": 249.27083921432495, + "epoch": 0.0088, + "grad_norm": 4.975526332855225, + "kl": 0.001035928726196289, + "learning_rate": 4.995847625707292e-07, + "loss": 0.0001, + "num_tokens": 839640.0, + "reward": -0.4649157803505659, + "reward_std": 0.019527284224750474, + "rewards/SMILES_validity_reward": -0.6000000238418579, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.1006944456603378, + "rewards/repetition_penalty_reward": -0.03943644787068479, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.0052083334885537624, "step": 22 }, { "clip_ratio": 0.0, - "completion_length": 355.31772327423096, - "epoch": 0.0192, - "grad_norm": 3.939411163330078, - "kl": 0.0012691020965576172, - "learning_rate": 2.6666666666666667e-07, - "loss": 0.3223, - "num_tokens": 2701386.0, - "reward": -0.46141638420522213, - "reward_std": 0.06114580819848925, - "rewards/SMILES_validity_reward": -0.583333358168602, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.0052083334885537624, - "rewards/reasoning_steps_reward": 0.047743057599291205, - "rewards/repetition_penalty_reward": -0.08709702454507351, - "rewards/smiles_len_reward": -0.007102272938936949, + "completion_length": 290.88542461395264, + "epoch": 0.0096, + "grad_norm": 4.3505401611328125, + "kl": 0.0008978843688964844, + "learning_rate": 4.993512925726318e-07, + "loss": 0.1512, + "num_tokens": 916717.0, + "reward": -0.4671813119202852, + "reward_std": 0.018046872515697032, + "rewards/SMILES_validity_reward": -0.6000000238418579, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.07638889132067561, + "rewards/repetition_penalty_reward": -0.04820289777126163, + "rewards/smiles_len_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 24 }, { "clip_ratio": 0.0, - "completion_length": 409.07032585144043, - "epoch": 0.0208, - "grad_norm": 1.7252240180969238, - "kl": 0.0011508464813232422, - "learning_rate": 2.8888888888888885e-07, - "loss": 0.2639, - "num_tokens": 2956773.0, - "reward": -0.4627871084958315, - "reward_std": 0.05538264673668891, - "rewards/SMILES_validity_reward": -0.587500024586916, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.0026041667442768812, - "rewards/reasoning_steps_reward": 0.07204861339414492, - "rewards/repetition_penalty_reward": -0.08872260828502476, - "rewards/smiles_len_reward": -0.007812500232830644, - "rewards/tag_count_reward": 0.0013020833721384406, + "completion_length": 328.51042556762695, + "epoch": 0.0104, + "grad_norm": 4.017609596252441, + "kl": 0.0010578632354736328, + "learning_rate": 4.990660391382923e-07, + "loss": -0.0418, + "num_tokens": 997406.0, + "reward": -0.46294988691806793, + "reward_std": 0.03339050061185844, + "rewards/SMILES_validity_reward": -0.6000000238418579, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.010416666977107525, + "rewards/reasoning_steps_reward": 0.1041666695382446, + "rewards/repetition_penalty_reward": -0.06491638510487974, + "rewards/smiles_len_reward": -0.0052083334885537624, + "rewards/tag_count_reward": 0.0052083334885537624, "step": 26 }, { "clip_ratio": 0.0, - "completion_length": 342.5130281448364, - "epoch": 0.0224, - "grad_norm": 3.2929608821868896, - "kl": 0.0012526512145996094, - "learning_rate": 3.111111111111111e-07, - "loss": 0.2808, - "num_tokens": 3186602.0, - "reward": -0.4632618837058544, - "reward_std": 0.04645203723339364, - "rewards/SMILES_validity_reward": -0.59166669100523, + "completion_length": 266.8333377838135, + "epoch": 0.0112, + "grad_norm": 5.310181617736816, + "kl": 0.0015530586242675781, + "learning_rate": 4.987290615070384e-07, + "loss": 0.2128, + "num_tokens": 1072174.0, + "reward": -0.4547987822443247, + "reward_std": 0.04966713820613222, + "rewards/SMILES_validity_reward": -0.5833333563059568, "rewards/cosine_scaled_reward": -0.4999987781047821, "rewards/format_reward": 0.010416666977107525, - "rewards/reasoning_steps_reward": 0.06597222399432212, - "rewards/repetition_penalty_reward": -0.07775854406645522, + "rewards/reasoning_steps_reward": 0.0659722238779068, + "rewards/repetition_penalty_reward": -0.056669211451662704, "rewards/smiles_len_reward": -0.010416666977107525, - "rewards/tag_count_reward": 0.0, + "rewards/tag_count_reward": 0.0052083334885537624, "step": 28 }, { "clip_ratio": 0.0, - "completion_length": 317.0312604904175, - "epoch": 0.024, - "grad_norm": 3.8771326541900635, - "kl": 0.0015463829040527344, - "learning_rate": 3.333333333333333e-07, - "loss": 0.457, - "num_tokens": 3406646.0, - "reward": -0.4599665645509958, - "reward_std": 0.05891757505014539, - "rewards/SMILES_validity_reward": -0.583333358168602, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.0052083334885537624, - "rewards/reasoning_steps_reward": 0.03906250064028427, - "rewards/repetition_penalty_reward": -0.06320823275018483, - "rewards/smiles_len_reward": -0.007812500232830644, - "rewards/tag_count_reward": 0.0, + "completion_length": 268.0312581062317, + "epoch": 0.012, + "grad_norm": 5.723649978637695, + "kl": 0.0017752647399902344, + "learning_rate": 4.983404296598978e-07, + "loss": -0.0013, + "num_tokens": 1147057.0, + "reward": -0.45876461267471313, + "reward_std": 0.039007039871648885, + "rewards/SMILES_validity_reward": -0.6000000238418579, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.1284722255077213, + "rewards/repetition_penalty_reward": -0.055181623916723765, + "rewards/smiles_len_reward": -0.03645833441987634, + "rewards/tag_count_reward": 0.013020833721384406, "step": 30 }, { "clip_ratio": 0.0, - "completion_length": 262.46615171432495, - "epoch": 0.0256, - "grad_norm": 4.99418306350708, - "kl": 0.0019631385803222656, - "learning_rate": 3.5555555555555553e-07, - "loss": 0.4474, - "num_tokens": 3605737.0, - "reward": -0.45110186748206615, - "reward_std": 0.08477685746038333, - "rewards/SMILES_validity_reward": -0.5763021092861891, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.020833333721384406, - "rewards/reasoning_steps_reward": 0.03906250145519152, - "rewards/repetition_penalty_reward": -0.06543084210716188, - "rewards/smiles_len_reward": -0.014338664012029767, - "rewards/tag_count_reward": 0.0013020833721384406, + "completion_length": 208.0520886182785, + "epoch": 0.0128, + "grad_norm": 4.876113414764404, + "kl": 0.0021886825561523438, + "learning_rate": 4.979002243050646e-07, + "loss": 0.0591, + "num_tokens": 1216182.0, + "reward": -0.4663495346903801, + "reward_std": 0.01864687688066624, + "rewards/SMILES_validity_reward": -0.6000000238418579, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.0833333358168602, + "rewards/repetition_penalty_reward": -0.04422530399460811, + "rewards/smiles_len_reward": -0.0052083334885537624, + "rewards/tag_count_reward": 0.0026041667442768812, "step": 32 }, { "clip_ratio": 0.0, - "completion_length": 331.69532203674316, - "epoch": 0.0272, - "grad_norm": 3.533830165863037, - "kl": 0.0023183822631835938, - "learning_rate": 3.7777777777777775e-07, - "loss": 0.594, - "num_tokens": 3831412.0, - "reward": -0.4669134635478258, - "reward_std": 0.0388556448451709, - "rewards/SMILES_validity_reward": -0.59166669100523, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.0026041667442768812, - "rewards/reasoning_steps_reward": 0.052083334478084, - "rewards/repetition_penalty_reward": -0.07955215487163514, - "rewards/smiles_len_reward": -0.009114583604969084, - "rewards/tag_count_reward": 0.0013020833721384406, + "completion_length": 332.57292652130127, + "epoch": 0.0136, + "grad_norm": 4.4281511306762695, + "kl": 0.0018911361694335938, + "learning_rate": 4.974085368611381e-07, + "loss": 0.0615, + "num_tokens": 1297261.0, + "reward": -0.45041274279356003, + "reward_std": 0.05302129179472104, + "rewards/SMILES_validity_reward": -0.5833333563059568, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.07291666767559946, + "rewards/repetition_penalty_reward": -0.04839907615678385, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.007812500232830644, "step": 34 }, { "clip_ratio": 0.0, - "completion_length": 254.330735206604, - "epoch": 0.0288, - "grad_norm": 5.293888092041016, - "kl": 0.0037059783935546875, - "learning_rate": 4e-07, - "loss": 0.4527, - "num_tokens": 4027379.0, - "reward": -0.4606757778674364, - "reward_std": 0.055119884957093745, - "rewards/SMILES_validity_reward": -0.583333358168602, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.0026041667442768812, - "rewards/reasoning_steps_reward": 0.03385416738456115, - "rewards/repetition_penalty_reward": -0.05780024908017367, - "rewards/smiles_len_reward": -0.008593750302679837, - "rewards/tag_count_reward": 0.0013020833721384406, + "completion_length": 227.93750929832458, + "epoch": 0.0144, + "grad_norm": 6.5882248878479, + "kl": 0.0024008750915527344, + "learning_rate": 4.968654694381379e-07, + "loss": 0.0836, + "num_tokens": 1368295.0, + "reward": -0.4686909671872854, + "reward_std": 0.013386835009441711, + "rewards/SMILES_validity_reward": -0.6000000238418579, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.03819444542750716, + "rewards/repetition_penalty_reward": -0.03552157600643113, + "rewards/smiles_len_reward": -0.0052083334885537624, + "rewards/tag_count_reward": 0.015625000465661287, "step": 36 }, { "clip_ratio": 0.0, - "completion_length": 344.2604250907898, - "epoch": 0.0304, - "grad_norm": 2.2371666431427, - "kl": 0.003749847412109375, - "learning_rate": 4.222222222222222e-07, - "loss": 0.1871, - "num_tokens": 4257879.0, - "reward": -0.4692084323614836, - "reward_std": 0.0285613224550616, + "completion_length": 233.9479250907898, + "epoch": 0.0152, + "grad_norm": 5.5114569664001465, + "kl": 0.0028159618377685547, + "learning_rate": 4.962711348162987e-07, + "loss": -0.024, + "num_tokens": 1439906.0, + "reward": -0.4651654362678528, + "reward_std": 0.025145536681520753, "rewards/SMILES_validity_reward": -0.6000000238418579, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.007812500232830644, - "rewards/reasoning_steps_reward": 0.06597222341224551, - "rewards/repetition_penalty_reward": -0.07172926003113389, - "rewards/smiles_len_reward": -0.011718750349245965, - "rewards/tag_count_reward": 0.001953125058207661, + "rewards/format_reward": 0.010416666977107525, + "rewards/reasoning_steps_reward": 0.06597222411073744, + "rewards/repetition_penalty_reward": -0.048877436958719045, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.010416666744276881, "step": 38 }, { "clip_ratio": 0.0, - "completion_length": 339.0260524749756, - "epoch": 0.032, - "grad_norm": 3.276158094406128, - "kl": 0.0050830841064453125, - "learning_rate": 4.444444444444444e-07, - "loss": 0.3235, - "num_tokens": 4486369.0, - "reward": -0.4574108552187681, - "reward_std": 0.0688521406846121, - "rewards/SMILES_validity_reward": -0.583333358168602, + "completion_length": 246.72917079925537, + "epoch": 0.016, + "grad_norm": 4.999541282653809, + "kl": 0.0029511451721191406, + "learning_rate": 4.956256564226487e-07, + "loss": -0.0201, + "num_tokens": 1512744.0, + "reward": -0.4640405811369419, + "reward_std": 0.024847639388099196, + "rewards/SMILES_validity_reward": -0.6000000238418579, "rewards/cosine_scaled_reward": -0.4999987781047821, "rewards/format_reward": 0.010416666977107525, - "rewards/reasoning_steps_reward": 0.05989583517657593, - "rewards/repetition_penalty_reward": -0.07410931127378717, - "rewards/smiles_len_reward": -0.011718750349245965, - "rewards/tag_count_reward": 0.003906250116415322, + "rewards/reasoning_steps_reward": 0.07291666837409139, + "rewards/repetition_penalty_reward": -0.04978172353003174, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.015625000465661287, "step": 40 }, { "clip_ratio": 0.0, - "completion_length": 253.33333778381348, - "epoch": 0.0336, - "grad_norm": 3.6556997299194336, - "kl": 0.006580352783203125, - "learning_rate": 4.6666666666666666e-07, - "loss": 0.5006, - "num_tokens": 4681953.0, - "reward": -0.46202344447374344, - "reward_std": 0.052374251157743856, - "rewards/SMILES_validity_reward": -0.587500024586916, + "completion_length": 230.2604217529297, + "epoch": 0.0168, + "grad_norm": 4.764357089996338, + "kl": 0.0031957626342773438, + "learning_rate": 4.949291683053768e-07, + "loss": -0.1051, + "num_tokens": 1584001.0, + "reward": -0.4276710934937, + "reward_std": 0.10604065289953724, + "rewards/SMILES_validity_reward": -0.5500000212341547, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.010416666977107525, - "rewards/reasoning_steps_reward": 0.0269097225391306, - "rewards/repetition_penalty_reward": -0.0574314376572147, - "rewards/smiles_len_reward": -0.009114583604969084, - "rewards/tag_count_reward": 0.0006510416860692203, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.12500000349245965, + "rewards/repetition_penalty_reward": -0.038690910892910324, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.0026041667442768812, "step": 42 }, { "clip_ratio": 0.0, - "completion_length": 303.24480056762695, - "epoch": 0.0352, - "grad_norm": 4.723124027252197, - "kl": 0.009726524353027344, - "learning_rate": 4.888888888888889e-07, - "loss": 0.619, - "num_tokens": 4896703.0, - "reward": -0.460586316883564, - "reward_std": 0.05454236414516345, - "rewards/SMILES_validity_reward": -0.583333358168602, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.007812500232830644, - "rewards/reasoning_steps_reward": 0.034722223004791886, - "rewards/repetition_penalty_reward": -0.07157582964282483, - "rewards/smiles_len_reward": -0.009114583604969084, - "rewards/tag_count_reward": 0.0, + "completion_length": 162.5000033378601, + "epoch": 0.0176, + "grad_norm": 5.813767433166504, + "kl": 0.0059719085693359375, + "learning_rate": 4.941818151059955e-07, + "loss": 0.0334, + "num_tokens": 1648753.0, + "reward": -0.4659581948071718, + "reward_std": 0.017741082614520565, + "rewards/SMILES_validity_reward": -0.6000000238418579, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.07638889085501432, + "rewards/repetition_penalty_reward": -0.033367487922078, + "rewards/smiles_len_reward": -0.0052083334885537624, + "rewards/tag_count_reward": 0.0026041667442768812, "step": 44 }, { "clip_ratio": 0.0, - "completion_length": 330.3046941757202, - "epoch": 0.0368, - "grad_norm": 3.3449621200561523, - "kl": 0.016521453857421875, - "learning_rate": 4.999924786199418e-07, - "loss": 0.4126, - "num_tokens": 5121844.0, - "reward": -0.4563949555158615, - "reward_std": 0.07530709472484887, - "rewards/SMILES_validity_reward": -0.579166691750288, + "completion_length": 244.57292413711548, + "epoch": 0.0184, + "grad_norm": 3.382213830947876, + "kl": 0.005543231964111328, + "learning_rate": 4.933837520293017e-07, + "loss": 0.1422, + "num_tokens": 1721384.0, + "reward": -0.4495168598368764, + "reward_std": 0.0532101350290759, + "rewards/SMILES_validity_reward": -0.5833333563059568, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.010416666977107525, - "rewards/reasoning_steps_reward": 0.03993055655155331, - "rewards/repetition_penalty_reward": -0.07119860564125702, + "rewards/format_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.07638889062218368, + "rewards/repetition_penalty_reward": -0.04291246473439969, "rewards/smiles_len_reward": -0.010416666977107525, - "rewards/tag_count_reward": 0.0006510416860692203, + "rewards/tag_count_reward": 0.0026041667442768812, "step": 46 }, { "clip_ratio": 0.0, - "completion_length": 261.0156307220459, - "epoch": 0.0384, - "grad_norm": 5.478763103485107, - "kl": 0.02780914306640625, - "learning_rate": 4.999323102948654e-07, - "loss": 0.345, - "num_tokens": 5320378.0, - "reward": -0.4597903210669756, - "reward_std": 0.04729547622264363, - "rewards/SMILES_validity_reward": -0.587500024586916, + "completion_length": 250.81250488758087, + "epoch": 0.0192, + "grad_norm": 3.784503936767578, + "kl": 0.005627632141113281, + "learning_rate": 4.925351448111454e-07, + "loss": 0.0332, + "num_tokens": 1794614.0, + "reward": -0.4602485578507185, + "reward_std": 0.03407564776716754, + "rewards/SMILES_validity_reward": -0.6000000238418579, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.010416666977107525, - "rewards/reasoning_steps_reward": 0.04166666732635349, - "rewards/repetition_penalty_reward": -0.05049094167770818, - "rewards/smiles_len_reward": -0.007829760666936636, - "rewards/tag_count_reward": 0.0, + "rewards/format_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.09722222457639873, + "rewards/repetition_penalty_reward": -0.04918775928672403, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.007812500232830644, "step": 48 }, { "clip_ratio": 0.0, - "completion_length": 257.11719608306885, - "epoch": 0.04, - "grad_norm": 4.790470600128174, - "kl": 0.0424652099609375, - "learning_rate": 4.998119881260575e-07, - "loss": 0.3691, - "num_tokens": 5517415.0, - "reward": -0.4537742752581835, - "reward_std": 0.06936266523553059, - "rewards/SMILES_validity_reward": -0.579166691750288, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.013020833721384406, - "rewards/reasoning_steps_reward": 0.04861111240461469, - "rewards/repetition_penalty_reward": -0.05909780884394422, - "rewards/smiles_len_reward": -0.01345486135687679, - "rewards/tag_count_reward": 0.0013020833721384406, + "completion_length": 210.15625739097595, + "epoch": 0.02, + "grad_norm": 4.50042200088501, + "kl": 0.009564399719238281, + "learning_rate": 4.91636169684011e-07, + "loss": 0.0973, + "num_tokens": 1863941.0, + "reward": -0.4667747635394335, + "reward_std": 0.015594157837767852, + "rewards/SMILES_validity_reward": -0.6000000238418579, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.07638889038935304, + "rewards/repetition_penalty_reward": -0.04413735281559639, + "rewards/smiles_len_reward": 0.0, + "rewards/tag_count_reward": 0.0, "step": 50 }, { "clip_ratio": 0.0, - "completion_length": 253.916672706604, - "epoch": 0.0416, - "grad_norm": 8.038966178894043, - "kl": 0.0573883056640625, - "learning_rate": 4.996315410727229e-07, - "loss": 0.4202, - "num_tokens": 5713223.0, - "reward": -0.4641662258654833, - "reward_std": 0.038869212061399594, - "rewards/SMILES_validity_reward": -0.59166669100523, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.0052083334885537624, - "rewards/reasoning_steps_reward": 0.03993055736646056, - "rewards/repetition_penalty_reward": -0.052296683279564604, - "rewards/smiles_len_reward": -0.0052083334885537624, - "rewards/tag_count_reward": 0.001953125058207661, + "completion_length": 209.0416705608368, + "epoch": 0.0208, + "grad_norm": 3.6077358722686768, + "kl": 0.010011672973632812, + "learning_rate": 4.906870133404186e-07, + "loss": 0.1673, + "num_tokens": 1933161.0, + "reward": -0.4403855809941888, + "reward_std": 0.07992339776683366, + "rewards/SMILES_validity_reward": -0.5666666887700558, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.04513888992369175, + "rewards/repetition_penalty_reward": -0.03441216413921211, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.0, "step": 52 }, { "clip_ratio": 0.0, - "completion_length": 205.43489933013916, - "epoch": 0.0432, - "grad_norm": 6.251274108886719, - "kl": 0.0938720703125, - "learning_rate": 4.99391012564956e-07, - "loss": 0.4254, - "num_tokens": 5890414.0, - "reward": -0.448165163397789, - "reward_std": 0.09143485396634787, - "rewards/SMILES_validity_reward": -0.57083335891366, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.013020833721384406, - "rewards/reasoning_steps_reward": 0.032118056842591614, - "rewards/repetition_penalty_reward": -0.04787064273841679, - "rewards/smiles_len_reward": -0.011733278282918036, + "completion_length": 207.75000667572021, + "epoch": 0.0216, + "grad_norm": 4.577792644500732, + "kl": 0.011927604675292969, + "learning_rate": 4.896878728941531e-07, + "loss": 0.0505, + "num_tokens": 2002257.0, + "reward": -0.4528631530702114, + "reward_std": 0.042765807069372386, + "rewards/SMILES_validity_reward": -0.5833333563059568, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.08680555806495249, + "rewards/repetition_penalty_reward": -0.029500389107852243, + "rewards/smiles_len_reward": -0.0052083334885537624, "rewards/tag_count_reward": 0.0026041667442768812, "step": 54 }, { "clip_ratio": 0.0, - "completion_length": 229.63281869888306, - "epoch": 0.0448, - "grad_norm": 4.371728897094727, - "kl": 0.1113739013671875, - "learning_rate": 4.990904604932884e-07, - "loss": 0.1543, - "num_tokens": 6076897.0, - "reward": -0.4571216255426407, - "reward_std": 0.060153877711854875, - "rewards/SMILES_validity_reward": -0.583333358168602, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.007812500232830644, - "rewards/reasoning_steps_reward": 0.046875001688022166, - "rewards/repetition_penalty_reward": -0.049048895947635174, - "rewards/smiles_len_reward": -0.011751507059670985, - "rewards/tag_count_reward": 0.0026041666860692203, + "completion_length": 180.7500035762787, + "epoch": 0.0224, + "grad_norm": 6.483789443969727, + "kl": 0.013889312744140625, + "learning_rate": 4.886389558393284e-07, + "loss": 0.1321, + "num_tokens": 2068761.0, + "reward": -0.4464299641549587, + "reward_std": 0.07374286159756593, + "rewards/SMILES_validity_reward": -0.5666666887700558, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.010416666977107525, + "rewards/reasoning_steps_reward": 0.02083333395421505, + "rewards/repetition_penalty_reward": -0.03148799104383215, + "rewards/smiles_len_reward": -0.026041667442768812, + "rewards/tag_count_reward": 0.007812500232830644, "step": 56 }, { "clip_ratio": 0.0, - "completion_length": 258.5156297683716, - "epoch": 0.0464, - "grad_norm": 6.046833038330078, - "kl": 0.1477203369140625, - "learning_rate": 4.987299571947553e-07, - "loss": 0.3914, - "num_tokens": 6274471.0, - "reward": -0.4582570604979992, - "reward_std": 0.05744875143864192, - "rewards/SMILES_validity_reward": -0.583333358168602, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.007812500232830644, - "rewards/reasoning_steps_reward": 0.03645833377959207, - "rewards/repetition_penalty_reward": -0.052860176598187536, - "rewards/smiles_len_reward": -0.0075757576851174235, - "rewards/tag_count_reward": 0.0013020833721384406, + "completion_length": 229.35417246818542, + "epoch": 0.0232, + "grad_norm": 3.902653932571411, + "kl": 0.013338088989257812, + "learning_rate": 4.875404800072976e-07, + "loss": -0.0142, + "num_tokens": 2139931.0, + "reward": -0.45120589807629585, + "reward_std": 0.04398537143060821, + "rewards/SMILES_validity_reward": -0.5833333563059568, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.10763889225199819, + "rewards/repetition_penalty_reward": -0.031157015706412494, + "rewards/smiles_len_reward": -0.0052083334885537624, + "rewards/tag_count_reward": 0.0, "step": 58 }, { "clip_ratio": 0.0, - "completion_length": 220.3307328224182, - "epoch": 0.048, - "grad_norm": 3.264477014541626, - "kl": 0.1656341552734375, - "learning_rate": 4.983095894354857e-07, - "loss": 0.3925, - "num_tokens": 6457382.0, - "reward": -0.4511374644935131, - "reward_std": 0.08122232253663242, - "rewards/SMILES_validity_reward": -0.575000025331974, + "completion_length": 197.83333957195282, + "epoch": 0.024, + "grad_norm": 4.851426601409912, + "kl": 0.02254486083984375, + "learning_rate": 4.86392673521415e-07, + "loss": 0.3276, + "num_tokens": 2208075.0, + "reward": -0.469699963927269, + "reward_std": 0.015661527457268676, + "rewards/SMILES_validity_reward": -0.6000000238418579, "rewards/cosine_scaled_reward": -0.4999987781047821, "rewards/format_reward": 0.010416666977107525, - "rewards/reasoning_steps_reward": 0.04079861118225381, - "rewards/repetition_penalty_reward": -0.0453608765383251, - "rewards/smiles_len_reward": -0.014365261304192245, - "rewards/tag_count_reward": 0.0013020833721384406, + "rewards/reasoning_steps_reward": 0.017361111473292112, + "rewards/repetition_penalty_reward": -0.04040325006644707, + "rewards/smiles_len_reward": -0.0052083334885537624, + "rewards/tag_count_reward": 0.0, "step": 60 }, { "clip_ratio": 0.0, - "completion_length": 255.14323806762695, - "epoch": 0.0496, - "grad_norm": 4.010850429534912, - "kl": 0.195281982421875, - "learning_rate": 4.978294583898195e-07, - "loss": 0.6237, - "num_tokens": 6653661.0, - "reward": -0.45640311203897, - "reward_std": 0.07789475272875279, - "rewards/SMILES_validity_reward": -0.5804687775671482, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.013020833721384406, - "rewards/reasoning_steps_reward": 0.03211805649334565, - "rewards/repetition_penalty_reward": -0.0569572810200043, - "rewards/smiles_len_reward": -0.016927083604969084, - "rewards/tag_count_reward": 0.001953125058207661, + "completion_length": 174.0625078678131, + "epoch": 0.0248, + "grad_norm": 4.745796203613281, + "kl": 0.026813507080078125, + "learning_rate": 4.851957747496606e-07, + "loss": 0.0778, + "num_tokens": 2273937.0, + "reward": -0.46857298351824284, + "reward_std": 0.016187770637770882, + "rewards/SMILES_validity_reward": -0.6000000238418579, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.05208333418704569, + "rewards/repetition_penalty_reward": -0.030001492603332736, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.0026041667442768812, "step": 62 }, { "clip_ratio": 0.0, - "completion_length": 200.16927552223206, - "epoch": 0.0512, - "grad_norm": 5.870493412017822, - "kl": 0.6861572265625, - "learning_rate": 4.972896796159568e-07, - "loss": 0.1471, - "num_tokens": 6828830.0, - "reward": -0.4545394852757454, - "reward_std": 0.06280390484607778, - "rewards/SMILES_validity_reward": -0.583333358168602, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.013020833721384406, - "rewards/reasoning_steps_reward": 0.04687500145519152, - "rewards/repetition_penalty_reward": -0.04258153022965416, - "rewards/smiles_len_reward": -0.009324597000158974, - "rewards/tag_count_reward": 0.003906249941792339, + "completion_length": 135.04167068004608, + "epoch": 0.0256, + "grad_norm": 6.314318656921387, + "kl": 0.023064613342285156, + "learning_rate": 4.839500322551386e-07, + "loss": 0.0664, + "num_tokens": 2336053.0, + "reward": -0.4527385290712118, + "reward_std": 0.0479089121290599, + "rewards/SMILES_validity_reward": -0.5833333563059568, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.010416666977107525, + "rewards/reasoning_steps_reward": 0.05208333441987634, + "rewards/repetition_penalty_reward": -0.022177818129421212, + "rewards/smiles_len_reward": -0.0052083334885537624, + "rewards/tag_count_reward": 0.0, "step": 64 }, { "clip_ratio": 0.0, - "completion_length": 168.05990076065063, - "epoch": 0.0528, - "grad_norm": 9.24417781829834, - "kl": 1.529541015625, - "learning_rate": 4.966903830281448e-07, - "loss": 0.5681, - "num_tokens": 6991669.0, - "reward": -0.46538511849939823, - "reward_std": 0.03016815922455862, - "rewards/SMILES_validity_reward": -0.5958333574235439, + "completion_length": 172.0937558412552, + "epoch": 0.0264, + "grad_norm": 6.215081214904785, + "kl": 0.0323638916015625, + "learning_rate": 4.826557047444563e-07, + "loss": 0.1401, + "num_tokens": 2401726.0, + "reward": -0.46973296627402306, + "reward_std": 0.012096146092517301, + "rewards/SMILES_validity_reward": -0.6000000238418579, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.010416666977107525, - "rewards/reasoning_steps_reward": 0.026909722830168903, - "rewards/repetition_penalty_reward": -0.03530300591955893, - "rewards/smiles_len_reward": -0.007828538189642131, - "rewards/tag_count_reward": 0.001953125058207661, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.04166666814126074, + "rewards/repetition_penalty_reward": -0.03378878638613969, + "rewards/smiles_len_reward": -0.0052083334885537624, + "rewards/tag_count_reward": 0.0, "step": 66 }, { "clip_ratio": 0.0, - "completion_length": 163.63021278381348, - "epoch": 0.0544, - "grad_norm": 7.270186901092529, - "kl": 5.887939453125, - "learning_rate": 4.960317128654107e-07, - "loss": 0.4877, - "num_tokens": 7152807.0, - "reward": -0.44715421833097935, - "reward_std": 0.07873522458248772, - "rewards/SMILES_validity_reward": -0.575000025331974, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.02343750069849193, - "rewards/reasoning_steps_reward": 0.029513889516238123, - "rewards/repetition_penalty_reward": -0.03269748640013859, - "rewards/smiles_len_reward": -0.015625000349245965, - "rewards/tag_count_reward": 0.001953125058207661, + "completion_length": 115.77083659172058, + "epoch": 0.0272, + "grad_norm": 5.195652961730957, + "kl": 0.04044342041015625, + "learning_rate": 4.813130610139993e-07, + "loss": 0.1761, + "num_tokens": 2461992.0, + "reward": -0.4498000517487526, + "reward_std": 0.06054470940580359, + "rewards/SMILES_validity_reward": -0.5666666887700558, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.0034722222480922937, + "rewards/repetition_penalty_reward": -0.016577735252212733, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.0026041667442768812, "step": 68 }, { "clip_ratio": 0.0, - "completion_length": 117.88281559944153, - "epoch": 0.056, - "grad_norm": 21.904245376586914, - "kl": 13.5673828125, - "learning_rate": 4.953138276568461e-07, - "loss": 0.3171, - "num_tokens": 7296378.0, - "reward": -0.45215606689453125, - "reward_std": 0.06071481961407699, - "rewards/SMILES_validity_reward": -0.579166691750288, + "completion_length": 214.18750643730164, + "epoch": 0.028, + "grad_norm": 5.4179182052612305, + "kl": 0.028334617614746094, + "learning_rate": 4.799223798941089e-07, + "loss": 0.0926, + "num_tokens": 2531706.0, + "reward": -0.456182649359107, + "reward_std": 0.04664806976506952, + "rewards/SMILES_validity_reward": -0.5833333563059568, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.010416666977107525, - "rewards/reasoning_steps_reward": 0.03559027874143794, - "rewards/repetition_penalty_reward": -0.023167401901446283, - "rewards/smiles_len_reward": -0.011718750349245965, - "rewards/tag_count_reward": 0.0006510416860692203, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.08333333558402956, + "rewards/repetition_penalty_reward": -0.046202394034480676, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.0, "step": 70 }, { "clip_ratio": 0.0, - "completion_length": 155.77083659172058, - "epoch": 0.0576, - "grad_norm": 9.583633422851562, - "kl": 14.696044921875, - "learning_rate": 4.945369001834514e-07, - "loss": 0.2378, - "num_tokens": 7454498.0, - "reward": -0.44742128998041153, - "reward_std": 0.0873336758086225, - "rewards/SMILES_validity_reward": -0.57083335891366, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.013020833721384406, - "rewards/reasoning_steps_reward": 0.026909722946584225, - "rewards/repetition_penalty_reward": -0.030680728232255206, - "rewards/smiles_len_reward": -0.014322917093522847, - "rewards/tag_count_reward": 0.0006510416860692203, + "completion_length": 137.51041901111603, + "epoch": 0.0288, + "grad_norm": 4.121431827545166, + "kl": 0.051410675048828125, + "learning_rate": 4.78483950191177e-07, + "loss": -0.0549, + "num_tokens": 2594059.0, + "reward": -0.4664184954017401, + "reward_std": 0.013460547677823342, + "rewards/SMILES_validity_reward": -0.6000000238418579, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.0625000016298145, + "rewards/repetition_penalty_reward": -0.026685798191465437, + "rewards/smiles_len_reward": -0.0052083334885537624, + "rewards/tag_count_reward": 0.0052083334885537624, "step": 72 }, { "clip_ratio": 0.0, - "completion_length": 177.04167103767395, - "epoch": 0.0592, - "grad_norm": 23.011547088623047, - "kl": 23.6376953125, - "learning_rate": 4.937011174365514e-07, - "loss": 0.2045, - "num_tokens": 7620786.0, - "reward": -0.4481741450726986, - "reward_std": 0.07758373257820494, - "rewards/SMILES_validity_reward": -0.575000025331974, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.01822916720993817, - "rewards/reasoning_steps_reward": 0.040798612229991704, - "rewards/repetition_penalty_reward": -0.03660338817280717, - "rewards/smiles_len_reward": -0.015625000349245965, - "rewards/tag_count_reward": 0.0, + "completion_length": 184.61458683013916, + "epoch": 0.0296, + "grad_norm": 5.257875442504883, + "kl": 0.050022125244140625, + "learning_rate": 4.769980706276687e-07, + "loss": 0.0014, + "num_tokens": 2660934.0, + "reward": -0.4514905624091625, + "reward_std": 0.053132650078623556, + "rewards/SMILES_validity_reward": -0.5833333563059568, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.010416666977107525, + "rewards/reasoning_steps_reward": 0.07986111333593726, + "rewards/repetition_penalty_reward": -0.04268423894245643, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.010416666977107525, "step": 74 }, { "clip_ratio": 0.0, - "completion_length": 144.41146230697632, - "epoch": 0.0608, - "grad_norm": 6.287862777709961, - "kl": 8.23095703125, - "learning_rate": 4.928066805727901e-07, - "loss": 0.2391, - "num_tokens": 7774544.0, - "reward": -0.45081394724547863, - "reward_std": 0.07222701624414185, - "rewards/SMILES_validity_reward": -0.579166691750288, + "completion_length": 112.86458694934845, + "epoch": 0.0304, + "grad_norm": 4.131363868713379, + "kl": 0.058177947998046875, + "learning_rate": 4.7546504978008595e-07, + "loss": 0.1018, + "num_tokens": 2720921.0, + "reward": -0.46908595971763134, + "reward_std": 0.0056425063230562955, + "rewards/SMILES_validity_reward": -0.6000000238418579, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.02083333395421505, - "rewards/reasoning_steps_reward": 0.02690972329583019, - "rewards/repetition_penalty_reward": -0.03107271766930353, - "rewards/smiles_len_reward": -0.01231060631107539, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.020833333721384406, + "rewards/repetition_penalty_reward": -0.011693818843923509, + "rewards/smiles_len_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 76 }, { "clip_ratio": 0.0, - "completion_length": 166.8411512374878, - "epoch": 0.0624, - "grad_norm": 6.666407585144043, - "kl": 7.9921875, - "learning_rate": 4.918538048657159e-07, - "loss": 0.45, - "num_tokens": 7936915.0, - "reward": -0.4527244567871094, - "reward_std": 0.06426695434493013, - "rewards/SMILES_validity_reward": -0.579166691750288, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.013020833721384406, - "rewards/reasoning_steps_reward": 0.031250000873114914, - "rewards/repetition_penalty_reward": -0.03297455201391131, - "rewards/smiles_len_reward": -0.013020833721384406, - "rewards/tag_count_reward": 0.0026041666860692203, + "completion_length": 119.19792026281357, + "epoch": 0.0312, + "grad_norm": 5.673096656799316, + "kl": 0.06604766845703125, + "learning_rate": 4.738852060148848e-07, + "loss": -0.087, + "num_tokens": 2781516.0, + "reward": -0.4646188300102949, + "reward_std": 0.014277806236350443, + "rewards/SMILES_validity_reward": -0.6000000238418579, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.06250000139698386, + "rewards/repetition_penalty_reward": -0.013897485987399705, + "rewards/smiles_len_reward": 0.0, + "rewards/tag_count_reward": 0.0052083334885537624, "step": 78 }, { "clip_ratio": 0.0, - "completion_length": 134.31771183013916, - "epoch": 0.064, - "grad_norm": 8.43528938293457, - "kl": 5.4541015625, - "learning_rate": 4.908427196539701e-07, - "loss": 0.146, - "num_tokens": 8086797.0, - "reward": -0.44427087157964706, - "reward_std": 0.08865486389186117, - "rewards/SMILES_validity_reward": -0.575000025331974, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.033854167675599456, - "rewards/reasoning_steps_reward": 0.026041667093522847, - "rewards/repetition_penalty_reward": -0.026433447477757, - "rewards/smiles_len_reward": -0.01953125058207661, - "rewards/tag_count_reward": 0.0006510416860692203, + "completion_length": 106.760418176651, + "epoch": 0.032, + "grad_norm": 5.955003261566162, + "kl": 0.0661163330078125, + "learning_rate": 4.722588674223593e-07, + "loss": 0.0259, + "num_tokens": 2840917.0, + "reward": -0.46621129661798477, + "reward_std": 0.01656446641209186, + "rewards/SMILES_validity_reward": -0.6000000238418579, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.010416666977107525, + "rewards/reasoning_steps_reward": 0.03819444519467652, + "rewards/repetition_penalty_reward": -0.02374570633401163, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.0026041667442768812, "step": 80 }, { "clip_ratio": 0.0, - "completion_length": 162.90104603767395, - "epoch": 0.0656, - "grad_norm": 8.4097900390625, - "kl": 4.1484375, - "learning_rate": 4.897736682860885e-07, - "loss": 0.192, - "num_tokens": 8247655.0, - "reward": -0.45605068281292915, - "reward_std": 0.059727372688939795, - "rewards/SMILES_validity_reward": -0.587500024586916, + "completion_length": 143.8645884990692, + "epoch": 0.0328, + "grad_norm": 5.669166564941406, + "kl": 0.05596160888671875, + "learning_rate": 4.70586371748506e-07, + "loss": 0.0061, + "num_tokens": 2903880.0, + "reward": -0.4342301404103637, + "reward_std": 0.08722544111515163, + "rewards/SMILES_validity_reward": -0.5666666887700558, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.02083333395421505, - "rewards/reasoning_steps_reward": 0.03906250145519152, - "rewards/repetition_penalty_reward": -0.034038055848213844, - "rewards/smiles_len_reward": -0.01618303614668548, - "rewards/tag_count_reward": 0.0006510416860692203, + "rewards/format_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.041666667675599456, + "rewards/repetition_penalty_reward": -0.02146888236165978, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.0, "step": 82 }, { "clip_ratio": 0.0, - "completion_length": 110.7239625453949, - "epoch": 0.0672, - "grad_norm": 10.938655853271484, - "kl": 3.68115234375, - "learning_rate": 4.88646908061933e-07, - "loss": 0.0666, - "num_tokens": 8388477.0, - "reward": -0.44183752313256264, - "reward_std": 0.08375674461422022, - "rewards/SMILES_validity_reward": -0.5750000234693289, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.039062500931322575, - "rewards/reasoning_steps_reward": 0.03038194531109184, - "rewards/repetition_penalty_reward": -0.02076093477808172, - "rewards/smiles_len_reward": -0.023439725977368653, - "rewards/tag_count_reward": 0.0032552084303461015, + "completion_length": 152.95833730697632, + "epoch": 0.0336, + "grad_norm": 5.404734134674072, + "kl": 0.07171249389648438, + "learning_rate": 4.6886806632488363e-07, + "loss": 0.1132, + "num_tokens": 2967716.0, + "reward": -0.47036008536815643, + "reward_std": 0.013487992946465965, + "rewards/SMILES_validity_reward": -0.6000000238418579, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.027777778450399637, + "rewards/repetition_penalty_reward": -0.02617112870211713, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.0052083334885537624, "step": 84 }, { "clip_ratio": 0.0, - "completion_length": 128.8776068687439, - "epoch": 0.0688, - "grad_norm": 34.703914642333984, - "kl": 6.35546875, - "learning_rate": 4.874627101707643e-07, - "loss": 0.424, - "num_tokens": 8536270.0, - "reward": -0.4417836368083954, - "reward_std": 0.10183166417846223, - "rewards/SMILES_validity_reward": -0.57083335891366, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.03385416744276881, - "rewards/reasoning_steps_reward": 0.019965278392191976, - "rewards/repetition_penalty_reward": -0.028516643127659336, - "rewards/smiles_len_reward": -0.019572260905988514, - "rewards/tag_count_reward": 0.00455729168606922, + "completion_length": 132.33333587646484, + "epoch": 0.0344, + "grad_norm": 5.923622131347656, + "kl": 0.0746612548828125, + "learning_rate": 4.6710430799648143e-07, + "loss": 0.1332, + "num_tokens": 3029572.0, + "reward": -0.4556627534329891, + "reward_std": 0.04119024146348238, + "rewards/SMILES_validity_reward": -0.5833333563059568, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.010416666977107525, + "rewards/reasoning_steps_reward": 0.027777778450399637, + "rewards/repetition_penalty_reward": -0.021906152804149315, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.0, "step": 86 }, { "clip_ratio": 0.0, - "completion_length": 153.65365076065063, - "epoch": 0.0704, - "grad_norm": 9.4151029586792, - "kl": 3.0634765625, - "learning_rate": 4.86221359625972e-07, - "loss": 0.1732, - "num_tokens": 8693577.0, - "reward": -0.43013707362115383, - "reward_std": 0.12213981148670428, - "rewards/SMILES_validity_reward": -0.5651041902601719, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.05989583348855376, - "rewards/reasoning_steps_reward": 0.03038194525288418, - "rewards/repetition_penalty_reward": -0.0299287144880509, - "rewards/smiles_len_reward": -0.026434181840159, - "rewards/tag_count_reward": 0.0006510416860692203, + "completion_length": 124.13541960716248, + "epoch": 0.0352, + "grad_norm": 5.512033939361572, + "kl": 0.091949462890625, + "learning_rate": 4.652954630476127e-07, + "loss": 0.1804, + "num_tokens": 3090641.0, + "reward": -0.47078334353864193, + "reward_std": 0.007824531650840072, + "rewards/SMILES_validity_reward": -0.6000000238418579, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.017361111473292112, + "rewards/repetition_penalty_reward": -0.025195291149429977, + "rewards/smiles_len_reward": 0.0, + "rewards/tag_count_reward": 0.0, "step": 88 }, { "clip_ratio": 0.0, - "completion_length": 140.5572965145111, - "epoch": 0.072, - "grad_norm": 7.3330078125, - "kl": 2.5065155029296875, - "learning_rate": 4.849231551964771e-07, - "loss": 0.1509, - "num_tokens": 8845855.0, - "reward": -0.432578993961215, - "reward_std": 0.13273732305970043, - "rewards/SMILES_validity_reward": -0.5583333596587181, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.033854167675599456, - "rewards/reasoning_steps_reward": 0.03559027856681496, - "rewards/repetition_penalty_reward": -0.03383599827066064, - "rewards/smiles_len_reward": -0.021425190032459795, - "rewards/tag_count_reward": 0.0006510416860692203, + "completion_length": 84.52083671092987, + "epoch": 0.036, + "grad_norm": 3.839015007019043, + "kl": 0.09572601318359375, + "learning_rate": 4.6344190712584713e-07, + "loss": 0.0499, + "num_tokens": 3147907.0, + "reward": -0.4675292409956455, + "reward_std": 0.008711494119779672, + "rewards/SMILES_validity_reward": -0.6000000238418579, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.03472222317941487, + "rewards/repetition_penalty_reward": -0.010015465391916223, + "rewards/smiles_len_reward": 0.0, + "rewards/tag_count_reward": 0.0, "step": 90 }, { "clip_ratio": 0.0, - "completion_length": 137.8619828224182, - "epoch": 0.0736, - "grad_norm": 7.545994281768799, - "kl": 2.7392578125, - "learning_rate": 4.835684093348244e-07, - "loss": 0.2127, - "num_tokens": 8997098.0, - "reward": -0.4239292126148939, - "reward_std": 0.1364244522410445, - "rewards/SMILES_validity_reward": -0.5541666932404041, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.05468750069849193, - "rewards/reasoning_steps_reward": 0.031250000989530236, - "rewards/repetition_penalty_reward": -0.028238558385055512, - "rewards/smiles_len_reward": -0.031106452457606792, - "rewards/tag_count_reward": 0.003906250116415322, + "completion_length": 93.9270852804184, + "epoch": 0.0368, + "grad_norm": 3.3888838291168213, + "kl": 0.116241455078125, + "learning_rate": 4.615440251639995e-07, + "loss": -0.0578, + "num_tokens": 3206076.0, + "reward": -0.4662612807005644, + "reward_std": 0.01008661445666803, + "rewards/SMILES_validity_reward": -0.6000000238418579, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.05208333465270698, + "rewards/repetition_penalty_reward": -0.00948869220155757, + "rewards/smiles_len_reward": -0.0052083334885537624, + "rewards/tag_count_reward": 0.0, "step": 92 }, { "clip_ratio": 0.0, - "completion_length": 162.50000405311584, - "epoch": 0.0752, - "grad_norm": 12.123269081115723, - "kl": 3.155029296875, - "learning_rate": 4.821574481019811e-07, - "loss": 0.2579, - "num_tokens": 9157802.0, - "reward": -0.4132191240787506, - "reward_std": 0.1652318238047883, - "rewards/SMILES_validity_reward": -0.541666692122817, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.06510416837409139, - "rewards/reasoning_steps_reward": 0.028645834245253354, - "rewards/repetition_penalty_reward": -0.029634669452207163, - "rewards/smiles_len_reward": -0.03615117573644966, - "rewards/tag_count_reward": 0.0013020833721384406, + "completion_length": 106.22916853427887, + "epoch": 0.0376, + "grad_norm": 3.9039151668548584, + "kl": 0.122039794921875, + "learning_rate": 4.596022113001894e-07, + "loss": -0.0583, + "num_tokens": 3265426.0, + "reward": -0.4652919042855501, + "reward_std": 0.013997638350701891, + "rewards/SMILES_validity_reward": -0.6000000238418579, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.05902777914889157, + "rewards/repetition_penalty_reward": -0.019760221184697002, + "rewards/smiles_len_reward": 0.0, + "rewards/tag_count_reward": 0.0078125, "step": 94 }, { "clip_ratio": 0.0, - "completion_length": 151.7630271911621, - "epoch": 0.0768, - "grad_norm": 10.459805488586426, - "kl": 3.26171875, - "learning_rate": 4.806906110888606e-07, - "loss": 0.2427, - "num_tokens": 9314383.0, - "reward": -0.39828602597117424, - "reward_std": 0.20478773265494965, - "rewards/SMILES_validity_reward": -0.5208333618938923, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.06770833465270698, - "rewards/reasoning_steps_reward": 0.02777777868323028, - "rewards/repetition_penalty_reward": -0.03425826533930376, - "rewards/smiles_len_reward": -0.03888054273556918, - "rewards/tag_count_reward": 0.0052083334303461015, + "completion_length": 91.39583814144135, + "epoch": 0.0384, + "grad_norm": 6.249267101287842, + "kl": 0.128753662109375, + "learning_rate": 4.576168687959895e-07, + "loss": 0.0203, + "num_tokens": 3323352.0, + "reward": -0.4529070910066366, + "reward_std": 0.0469353504377068, + "rewards/SMILES_validity_reward": -0.5833333563059568, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.027777778217568994, + "rewards/repetition_penalty_reward": -0.01257872223504819, + "rewards/smiles_len_reward": -0.026041667442768812, + "rewards/tag_count_reward": 0.0026041667442768812, "step": 96 }, { "clip_ratio": 0.0, - "completion_length": 127.87500381469727, - "epoch": 0.0784, - "grad_norm": 11.642253875732422, - "kl": 3.9642333984375, - "learning_rate": 4.791682513345892e-07, - "loss": 0.2138, - "num_tokens": 9461791.0, - "reward": -0.36608864995650947, - "reward_std": 0.26178658893331885, - "rewards/SMILES_validity_reward": -0.4875000258907676, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.1015625016298145, - "rewards/reasoning_steps_reward": 0.01388888928340748, - "rewards/repetition_penalty_reward": -0.02420425981108565, - "rewards/smiles_len_reward": -0.04796776862349361, + "completion_length": 110.9479192495346, + "epoch": 0.0392, + "grad_norm": 6.070929050445557, + "kl": 0.122650146484375, + "learning_rate": 4.555884099526793e-07, + "loss": 0.0258, + "num_tokens": 3383155.0, + "reward": -0.439573897048831, + "reward_std": 0.07561502030876, + "rewards/SMILES_validity_reward": -0.5666666887700558, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.03819444542750716, + "rewards/repetition_penalty_reward": -0.019350912392837927, + "rewards/smiles_len_reward": -0.015625000465661287, "rewards/tag_count_reward": 0.0052083334885537624, "step": 98 }, { "clip_ratio": 0.0, - "completion_length": 139.2734419107437, - "epoch": 0.08, - "grad_norm": 11.359210014343262, - "kl": 3.97216796875, - "learning_rate": 4.775907352415367e-07, - "loss": -0.1606, - "num_tokens": 9613576.0, - "reward": -0.3407728634774685, - "reward_std": 0.3080316074192524, - "rewards/SMILES_validity_reward": -0.4625000227242708, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.1223958341870457, - "rewards/reasoning_steps_reward": 0.04253472259733826, - "rewards/repetition_penalty_reward": -0.02961380738997832, - "rewards/smiles_len_reward": -0.058150356577243656, - "rewards/tag_count_reward": 0.007812499941792339, + "completion_length": 107.42708563804626, + "epoch": 0.04, + "grad_norm": 6.182693004608154, + "kl": 0.1309814453125, + "learning_rate": 4.5351725602562174e-07, + "loss": 0.0129, + "num_tokens": 3442620.0, + "reward": -0.465021351352334, + "reward_std": 0.018675321242881182, + "rewards/SMILES_validity_reward": -0.6000000238418579, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.010416666977107525, + "rewards/reasoning_steps_reward": 0.0416666679084301, + "rewards/repetition_penalty_reward": -0.020526854172203457, + "rewards/smiles_len_reward": -0.0052083334885537624, + "rewards/tag_count_reward": 0.0026041667442768812, "step": 100 }, { "clip_ratio": 0.0, - "completion_length": 121.23177433013916, - "epoch": 0.0816, - "grad_norm": 12.774822235107422, - "kl": 4.29443359375, - "learning_rate": 4.759584424871301e-07, - "loss": 0.0665, - "num_tokens": 9758433.0, - "reward": -0.30280456133186817, - "reward_std": 0.37466976934229024, - "rewards/SMILES_validity_reward": -0.4083333518356085, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.1354166658129543, - "rewards/reasoning_steps_reward": 0.01822916720993817, - "rewards/repetition_penalty_reward": -0.02551854120247299, - "rewards/smiles_len_reward": -0.07518413302022964, - "rewards/tag_count_reward": 0.006510416802484542, + "completion_length": 106.75000429153442, + "epoch": 0.0408, + "grad_norm": 4.344884395599365, + "kl": 0.119537353515625, + "learning_rate": 4.514038371367791e-07, + "loss": 0.1713, + "num_tokens": 3502020.0, + "reward": -0.4704951122403145, + "reward_std": 0.0033068859920604154, + "rewards/SMILES_validity_reward": -0.6000000238418579, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.0069444444961845875, + "rewards/repetition_penalty_reward": -0.011896428157342598, + "rewards/smiles_len_reward": 0.0, + "rewards/tag_count_reward": 0.0, "step": 102 }, { "clip_ratio": 0.0, - "completion_length": 120.58854365348816, - "epoch": 0.0832, - "grad_norm": 21.751882553100586, - "kl": 8.2001953125, - "learning_rate": 4.742717659324733e-07, - "loss": 0.0722, - "num_tokens": 9903043.0, - "reward": -0.21644436661154032, - "reward_std": 0.4530010260641575, - "rewards/SMILES_validity_reward": -0.31666667945683, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.21874999906867743, - "rewards/reasoning_steps_reward": 0.014756944845430553, - "rewards/repetition_penalty_reward": -0.026267023400578182, - "rewards/smiles_len_reward": -0.09707507817074656, - "rewards/tag_count_reward": 0.004557291744276881, + "completion_length": 65.79166901111603, + "epoch": 0.0416, + "grad_norm": 4.156596660614014, + "kl": 0.143524169921875, + "learning_rate": 4.4924859218538936e-07, + "loss": -0.0151, + "num_tokens": 3557488.0, + "reward": -0.4587331060320139, + "reward_std": 0.02372456392913591, + "rewards/SMILES_validity_reward": -0.6000000238418579, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.031250000931322575, + "rewards/reasoning_steps_reward": 0.031250000931322575, + "rewards/repetition_penalty_reward": -0.007123569957911968, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.0052083334885537624, "step": 104 }, { "clip_ratio": 0.0, - "completion_length": 94.33333432674408, - "epoch": 0.0848, - "grad_norm": 9.819295883178711, - "kl": 9.8955078125, - "learning_rate": 4.7253111152779233e-07, - "loss": -0.1048, - "num_tokens": 10037571.0, - "reward": -0.25656710658222437, - "reward_std": 0.38735912647098303, - "rewards/SMILES_validity_reward": -0.37916668597608805, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.22135416697710752, - "rewards/reasoning_steps_reward": 0.015625000291038305, - "rewards/repetition_penalty_reward": -0.01776839853846468, - "rewards/smiles_len_reward": -0.0903514064848423, - "rewards/tag_count_reward": 0.0169270834303461, + "completion_length": 113.0312557220459, + "epoch": 0.0424, + "grad_norm": 5.499018669128418, + "kl": 0.11688232421875, + "learning_rate": 4.470519687568185e-07, + "loss": 0.0769, + "num_tokens": 3617491.0, + "reward": -0.46708683855831623, + "reward_std": 0.011621264442510437, + "rewards/SMILES_validity_reward": -0.6000000238418579, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.04513889015652239, + "rewards/repetition_penalty_reward": -0.0160081133217318, + "rewards/smiles_len_reward": 0.0, + "rewards/tag_count_reward": 0.0, "step": 106 }, { "clip_ratio": 0.0, - "completion_length": 136.5937523841858, - "epoch": 0.0864, - "grad_norm": 9.594542503356934, - "kl": 8.150390625, - "learning_rate": 4.707368982147317e-07, - "loss": 0.1126, - "num_tokens": 10188327.0, - "reward": -0.19759700493887067, - "reward_std": 0.49018072336912155, - "rewards/SMILES_validity_reward": -0.3083333484828472, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.25781250069849193, - "rewards/reasoning_steps_reward": 0.02170138928340748, - "rewards/repetition_penalty_reward": -0.027601693116594106, - "rewards/smiles_len_reward": -0.09363839635625482, - "rewards/tag_count_reward": 0.008463541802484542, + "completion_length": 103.69792056083679, + "epoch": 0.0432, + "grad_norm": 2.9395508766174316, + "kl": 0.1288299560546875, + "learning_rate": 4.4481442302960923e-07, + "loss": -0.0733, + "num_tokens": 3676598.0, + "reward": -0.4545927792787552, + "reward_std": 0.03454983566189185, + "rewards/SMILES_validity_reward": -0.5833333563059568, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.045138889690861106, + "rewards/repetition_penalty_reward": -0.01033839286537841, + "rewards/smiles_len_reward": -0.0052083334885537624, + "rewards/tag_count_reward": 0.0078125, "step": 108 }, { "clip_ratio": 0.0, - "completion_length": 127.23177528381348, - "epoch": 0.088, - "grad_norm": 100.94241333007812, - "kl": 19.6396484375, - "learning_rate": 4.688895578255227e-07, - "loss": 0.0951, - "num_tokens": 10335488.0, - "reward": -0.058026916813105345, - "reward_std": 0.596280463039875, - "rewards/SMILES_validity_reward": -0.14583334388832253, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.35416666977107525, - "rewards/reasoning_steps_reward": 0.01822916720993817, - "rewards/repetition_penalty_reward": -0.022731273456884082, - "rewards/smiles_len_reward": -0.1311065279878676, - "rewards/tag_count_reward": 0.013671875058207661, + "completion_length": 90.92708659172058, + "epoch": 0.044, + "grad_norm": 4.17449426651001, + "kl": 0.1243133544921875, + "learning_rate": 4.4253641968074505e-07, + "loss": 0.0349, + "num_tokens": 3734479.0, + "reward": -0.4687845651060343, + "reward_std": 0.0076231868952163495, + "rewards/SMILES_validity_reward": -0.6000000238418579, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.03125000069849193, + "rewards/repetition_penalty_reward": -0.011283979780273512, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.0026041667442768812, "step": 110 }, { "clip_ratio": 0.0, - "completion_length": 132.6119830608368, - "epoch": 0.0896, - "grad_norm": 12.545600891113281, - "kl": 8.9892578125, - "learning_rate": 4.6698953497905016e-07, - "loss": 0.132, - "num_tokens": 10484715.0, - "reward": 0.05423457216238603, - "reward_std": 0.6229321677237749, - "rewards/SMILES_validity_reward": -0.013802092677603184, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.427083327434957, - "rewards/reasoning_steps_reward": 0.019965278333984315, - "rewards/repetition_penalty_reward": -0.028624614773434587, - "rewards/smiles_len_reward": -0.15316252200864255, - "rewards/tag_count_reward": 0.01953125005820766, + "completion_length": 129.86458790302277, + "epoch": 0.0448, + "grad_norm": 4.147829055786133, + "kl": 0.10516357421875, + "learning_rate": 4.402184317891501e-07, + "loss": -0.0648, + "num_tokens": 3796098.0, + "reward": -0.45566146075725555, + "reward_std": 0.04058586481369275, + "rewards/SMILES_validity_reward": -0.5833333563059568, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.04861111217178404, + "rewards/repetition_penalty_reward": -0.014080686727538705, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.0026041667442768812, "step": 112 }, { "clip_ratio": 0.0, - "completion_length": 114.6510443687439, - "epoch": 0.0912, - "grad_norm": 11.581257820129395, - "kl": 7.5224609375, - "learning_rate": 4.650372869738414e-07, - "loss": 0.1721, - "num_tokens": 10627045.0, - "reward": -0.007128866913262755, - "reward_std": 0.5819757748395205, - "rewards/SMILES_validity_reward": -0.1005208427086473, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.4166666669771075, - "rewards/reasoning_steps_reward": 0.02343750069849193, - "rewards/repetition_penalty_reward": -0.01863024538033642, - "rewards/smiles_len_reward": -0.14393543626647443, - "rewards/tag_count_reward": 0.021484375349245965, + "completion_length": 137.92708706855774, + "epoch": 0.0456, + "grad_norm": 8.529430389404297, + "kl": 0.115264892578125, + "learning_rate": 4.37860940737443e-07, + "loss": -0.1091, + "num_tokens": 3858491.0, + "reward": -0.4289623722434044, + "reward_std": 0.09471672396102804, + "rewards/SMILES_validity_reward": -0.5666666887700558, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.052083334885537624, + "rewards/reasoning_steps_reward": 0.04861111263744533, + "rewards/repetition_penalty_reward": -0.01740230650466401, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.0052083334885537624, "step": 114 }, { "clip_ratio": 0.0, - "completion_length": 190.92187976837158, - "epoch": 0.0928, - "grad_norm": 7.184658050537109, - "kl": 6.5302734375, - "learning_rate": 4.630332836780028e-07, - "loss": 0.1909, - "num_tokens": 10798663.0, - "reward": 0.17218821711139753, - "reward_std": 0.6334065981209278, - "rewards/SMILES_validity_reward": 0.11249999608844519, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.5260416707023978, - "rewards/reasoning_steps_reward": 0.028645834128838032, - "rewards/repetition_penalty_reward": -0.039949697558768094, - "rewards/smiles_len_reward": -0.15913273417390883, - "rewards/tag_count_reward": 0.026692708721384406, + "completion_length": 63.87500262260437, + "epoch": 0.0464, + "grad_norm": 9.70301342010498, + "kl": 0.1183013916015625, + "learning_rate": 4.354644361119671e-07, + "loss": 0.0586, + "num_tokens": 3913775.0, + "reward": -0.4454812277108431, + "reward_std": 0.0836324201318348, + "rewards/SMILES_validity_reward": -0.571875024586916, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.003771467447222676, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.0, "step": 116 }, { "clip_ratio": 0.0, - "completion_length": 171.47656559944153, - "epoch": 0.0944, - "grad_norm": 14.1889009475708, - "kl": 6.49609375, - "learning_rate": 4.609780074161327e-07, - "loss": 0.3119, - "num_tokens": 10962814.0, - "reward": 0.28294606506824493, - "reward_std": 0.6309285741299391, - "rewards/SMILES_validity_reward": 0.2489583333954215, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.6015625055879354, - "rewards/reasoning_steps_reward": 0.010416666977107525, - "rewards/repetition_penalty_reward": -0.032547464230447076, - "rewards/smiles_len_reward": -0.21143039967864752, - "rewards/tag_count_reward": 0.015624999825377017, + "completion_length": 90.41666865348816, + "epoch": 0.0472, + "grad_norm": 7.300216197967529, + "kl": 0.114227294921875, + "learning_rate": 4.3302941560111716e-07, + "loss": -0.0711, + "num_tokens": 3971607.0, + "reward": -0.45174217596650124, + "reward_std": 0.04705615748389391, + "rewards/SMILES_validity_reward": -0.5833333563059568, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.010416666977107525, + "rewards/reasoning_steps_reward": 0.05208333441987634, + "rewards/repetition_penalty_reward": -0.017422605837055016, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.010416666977107525, "step": 118 }, { "clip_ratio": 0.0, - "completion_length": 142.70573377609253, - "epoch": 0.096, - "grad_norm": 42.98564147949219, - "kl": 24.6396484375, - "learning_rate": 4.588719528532341e-07, - "loss": 0.4959, - "num_tokens": 11115917.0, - "reward": 0.437305249273777, - "reward_std": 0.6359960846602917, - "rewards/SMILES_validity_reward": 0.41953125642612576, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.6875000037252903, - "rewards/reasoning_steps_reward": 0.026909722946584225, - "rewards/repetition_penalty_reward": -0.02915750685497187, - "rewards/smiles_len_reward": -0.15191438651527278, - "rewards/tag_count_reward": 0.02799479162786156, + "completion_length": 120.2187534570694, + "epoch": 0.048, + "grad_norm": 7.793805122375488, + "kl": 0.114410400390625, + "learning_rate": 4.3055638489198236e-07, + "loss": -0.0088, + "num_tokens": 4032300.0, + "reward": -0.46079872362315655, + "reward_std": 0.022261769554461353, + "rewards/SMILES_validity_reward": -0.6000000238418579, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.04861111240461469, + "rewards/repetition_penalty_reward": -0.01649503846419975, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.007812500232830644, "step": 120 }, { "clip_ratio": 0.0, - "completion_length": 125.51042151451111, - "epoch": 0.0976, - "grad_norm": 13.0149564743042, - "kl": 10.384765625, - "learning_rate": 4.567156268756593e-07, - "loss": 0.3887, - "num_tokens": 11262417.0, - "reward": 0.42353246640414, - "reward_std": 0.595534335821867, - "rewards/SMILES_validity_reward": 0.39244792703539133, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.7057291679084301, - "rewards/reasoning_steps_reward": 0.015625000465661287, - "rewards/repetition_penalty_reward": -0.025808534323005006, - "rewards/smiles_len_reward": -0.13964920805301517, - "rewards/tag_count_reward": 0.020833333837799728, + "completion_length": 112.37500369548798, + "epoch": 0.0488, + "grad_norm": 5.610187530517578, + "kl": 0.097442626953125, + "learning_rate": 4.280458575653296e-07, + "loss": 0.0941, + "num_tokens": 4092240.0, + "reward": -0.45120035484433174, + "reward_std": 0.04598578553486732, + "rewards/SMILES_validity_reward": -0.5833333563059568, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.03472222317941487, + "rewards/repetition_penalty_reward": -0.015476670654607005, + "rewards/smiles_len_reward": -0.026041667442768812, + "rewards/tag_count_reward": 0.015624999767169356, "step": 122 }, { "clip_ratio": 0.0, - "completion_length": 129.255211353302, - "epoch": 0.0992, - "grad_norm": 9.103986740112305, - "kl": 36.1201171875, - "learning_rate": 4.5450954846911195e-07, - "loss": 0.4231, - "num_tokens": 11410355.0, - "reward": 0.545548053458333, - "reward_std": 0.5675202906131744, - "rewards/SMILES_validity_reward": 0.5458333436399698, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.7734375037252903, - "rewards/reasoning_steps_reward": 0.015625000465661287, - "rewards/repetition_penalty_reward": -0.025838642206508666, - "rewards/smiles_len_reward": -0.19758821406867355, - "rewards/tag_count_reward": 0.022135417093522847, + "completion_length": 92.35416865348816, + "epoch": 0.0496, + "grad_norm": 5.858239650726318, + "kl": 0.114837646484375, + "learning_rate": 4.2549835498894665e-07, + "loss": -0.0426, + "num_tokens": 4150258.0, + "reward": -0.44725739397108555, + "reward_std": 0.054796224998426624, + "rewards/SMILES_validity_reward": -0.5833333563059568, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.031250000931322575, + "rewards/reasoning_steps_reward": 0.041666667675599456, + "rewards/repetition_penalty_reward": -0.011637324947514571, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.007812500232830644, "step": 124 }, { "clip_ratio": 0.0, - "completion_length": 121.94531488418579, - "epoch": 0.1008, - "grad_norm": 10.777301788330078, - "kl": 12.24609375, - "learning_rate": 4.5225424859373684e-07, - "loss": 0.3693, - "num_tokens": 11555486.0, - "reward": 0.5835141399875283, - "reward_std": 0.5180866969749331, - "rewards/SMILES_validity_reward": 0.5916666835546494, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.7656250018626451, - "rewards/reasoning_steps_reward": 0.00954861135687679, - "rewards/repetition_penalty_reward": -0.02219348722428549, - "rewards/smiles_len_reward": -0.1044283655937761, - "rewards/tag_count_reward": 0.013671875349245965, + "completion_length": 145.7083374261856, + "epoch": 0.0504, + "grad_norm": 4.300314426422119, + "kl": 0.092559814453125, + "learning_rate": 4.229144062093679e-07, + "loss": -0.067, + "num_tokens": 4213398.0, + "reward": -0.4413000885397196, + "reward_std": 0.07192132115596905, + "rewards/SMILES_validity_reward": -0.5666666887700558, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.010416666977107525, + "rewards/reasoning_steps_reward": 0.04861111217178404, + "rewards/repetition_penalty_reward": -0.018383800808805972, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.0078125, "step": 126 }, { "clip_ratio": 0.0, - "completion_length": 105.60416841506958, - "epoch": 0.1024, - "grad_norm": 30.928424835205078, - "kl": 55.1669921875, - "learning_rate": 4.4995027005632896e-07, - "loss": 0.3305, - "num_tokens": 11694342.0, - "reward": 0.5743140410631895, - "reward_std": 0.5184614229947329, - "rewards/SMILES_validity_reward": 0.5606770901940763, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8125000037252903, - "rewards/reasoning_steps_reward": 0.006076389050576836, - "rewards/repetition_penalty_reward": -0.015417461221659323, - "rewards/smiles_len_reward": -0.12278014622279443, - "rewards/tag_count_reward": 0.013020833546761423, + "completion_length": 124.31250333786011, + "epoch": 0.0512, + "grad_norm": 9.1367769241333, + "kl": 0.102203369140625, + "learning_rate": 4.2029454784200675e-07, + "loss": -0.0685, + "num_tokens": 4274484.0, + "reward": -0.45298329181969166, + "reward_std": 0.05172415471315617, + "rewards/SMILES_validity_reward": -0.5833333563059568, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.010416666977107525, + "rewards/reasoning_steps_reward": 0.052083334885537624, + "rewards/repetition_penalty_reward": -0.016812930087326095, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.007812500232830644, "step": 128 }, { "clip_ratio": 0.0, - "completion_length": 134.28646278381348, - "epoch": 0.104, - "grad_norm": 13111.33984375, - "kl": 1170.328125, - "learning_rate": 4.475981673796898e-07, - "loss": 1.5062, - "num_tokens": 11844212.0, - "reward": 0.6496439315378666, - "reward_std": 0.5050632283091545, - "rewards/SMILES_validity_reward": 0.6666666734963655, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8177083320915699, - "rewards/reasoning_steps_reward": 0.01822916720993817, - "rewards/repetition_penalty_reward": -0.023118784243706614, - "rewards/smiles_len_reward": -0.1405994631932117, - "rewards/tag_count_reward": 0.02213541668606922, + "completion_length": 116.47917032241821, + "epoch": 0.052, + "grad_norm": 6.13424825668335, + "kl": 0.106658935546875, + "learning_rate": 4.1763932395971433e-07, + "loss": -0.0741, + "num_tokens": 4334818.0, + "reward": -0.43802906200289726, + "reward_std": 0.08000406731298426, + "rewards/SMILES_validity_reward": -0.5666666887700558, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.010416666977107525, + "rewards/reasoning_steps_reward": 0.07986111333593726, + "rewards/repetition_penalty_reward": -0.016923470440815436, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.0078125, "step": 130 }, { "clip_ratio": 0.0, - "completion_length": 102.26823282241821, - "epoch": 0.1056, - "grad_norm": 12.344482421875, - "kl": 18.35546875, - "learning_rate": 4.451985066691648e-07, - "loss": 0.415, - "num_tokens": 11981787.0, - "reward": 0.5884968402533559, - "reward_std": 0.5006589200347662, - "rewards/SMILES_validity_reward": 0.5854166727513075, + "completion_length": 150.53125488758087, + "epoch": 0.0528, + "grad_norm": 8.036478996276855, + "kl": 0.09299468994140625, + "learning_rate": 4.1494928597979117e-07, + "loss": -0.1005, + "num_tokens": 4398421.0, + "reward": -0.43908108957111835, + "reward_std": 0.0789313922796282, + "rewards/SMILES_validity_reward": -0.5666666887700558, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8125, - "rewards/reasoning_steps_reward": 0.017361111589707434, - "rewards/repetition_penalty_reward": -0.01754250284648151, - "rewards/smiles_len_reward": -0.16914823453407735, - "rewards/tag_count_reward": 0.018880208721384406, + "rewards/format_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.05902777914889157, + "rewards/repetition_penalty_reward": -0.022235317796003073, + "rewards/smiles_len_reward": -0.026041667442768812, + "rewards/tag_count_reward": 0.0026041667442768812, "step": 132 }, { "clip_ratio": 0.0, - "completion_length": 84.96875262260437, - "epoch": 0.1072, - "grad_norm": 21.06283950805664, - "kl": 43.158203125, - "learning_rate": 4.4275186547639267e-07, - "loss": 0.2883, - "num_tokens": 12112719.0, - "reward": 0.624729085713625, - "reward_std": 0.5171774514019489, - "rewards/SMILES_validity_reward": 0.6250000149011612, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8177083432674408, - "rewards/reasoning_steps_reward": 0.010416666977107525, - "rewards/repetition_penalty_reward": -0.01814899359305855, - "rewards/smiles_len_reward": -0.09458748006727546, - "rewards/tag_count_reward": 0.021484375174622983, + "completion_length": 150.88542008399963, + "epoch": 0.0536, + "grad_norm": 6.453376293182373, + "kl": 0.0865478515625, + "learning_rate": 4.122249925494726e-07, + "loss": 0.0604, + "num_tokens": 4462058.0, + "reward": -0.4095195522531867, + "reward_std": 0.10667452452980797, + "rewards/SMILES_validity_reward": -0.5166666880249977, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.010416666977107525, + "rewards/reasoning_steps_reward": 0.041666667675599456, + "rewards/repetition_penalty_reward": -0.025404679967323318, + "rewards/smiles_len_reward": -0.03645833395421505, + "rewards/tag_count_reward": 0.010416666977107525, "step": 134 }, { "clip_ratio": 0.0, - "completion_length": 93.87500214576721, - "epoch": 0.1088, - "grad_norm": 15.506207466125488, - "kl": 16.0703125, - "learning_rate": 4.4025883266030014e-07, - "loss": 0.3161, - "num_tokens": 12247071.0, - "reward": 0.6261179409921169, - "reward_std": 0.5333305615931749, - "rewards/SMILES_validity_reward": 0.6458333414047956, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.7968749925494194, - "rewards/reasoning_steps_reward": 0.01388888928340748, - "rewards/repetition_penalty_reward": -0.019531703477696283, - "rewards/smiles_len_reward": -0.17263225640635937, - "rewards/tag_count_reward": 0.027994792035315186, + "completion_length": 96.39583551883698, + "epoch": 0.0544, + "grad_norm": 8.152026176452637, + "kl": 0.1044464111328125, + "learning_rate": 4.094670094299131e-07, + "loss": -0.0086, + "num_tokens": 4520464.0, + "reward": -0.45102883502840996, + "reward_std": 0.0681979734054039, + "rewards/SMILES_validity_reward": -0.588541692122817, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.04513889015652239, + "rewards/repetition_penalty_reward": -0.024178073908842634, + "rewards/smiles_len_reward": -0.03645833441987634, + "rewards/tag_count_reward": 0.0, "step": 136 }, { "clip_ratio": 0.0, - "completion_length": 95.94010603427887, - "epoch": 0.1104, - "grad_norm": 43.307518005371094, - "kl": 30.0234375, - "learning_rate": 4.377200082453748e-07, - "loss": 0.2749, - "num_tokens": 12382216.0, - "reward": 0.6953225135803223, - "reward_std": 0.4491841895505786, - "rewards/SMILES_validity_reward": 0.7208333406597376, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8411458320915699, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.013212446501711383, - "rewards/smiles_len_reward": -0.1262718337820843, - "rewards/tag_count_reward": 0.015625000407453626, + "completion_length": 112.1979204416275, + "epoch": 0.0552, + "grad_norm": 4.7342000007629395, + "kl": 0.08402252197265625, + "learning_rate": 4.066759093786931e-07, + "loss": 0.096, + "num_tokens": 4580387.0, + "reward": -0.4373946124687791, + "reward_std": 0.07959776032657828, + "rewards/SMILES_validity_reward": -0.5666666887700558, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.031250000931322575, + "rewards/reasoning_steps_reward": 0.03819444542750716, + "rewards/repetition_penalty_reward": -0.0209956017206423, + "rewards/smiles_len_reward": -0.03125000046566129, + "rewards/tag_count_reward": 0.013020833721384406, "step": 138 }, { "clip_ratio": 0.0, - "completion_length": 102.95573198795319, - "epoch": 0.112, - "grad_norm": 11.107709884643555, - "kl": 17.333984375, - "learning_rate": 4.3513600327725117e-07, - "loss": 0.2443, - "num_tokens": 12520055.0, - "reward": 0.6400888189673424, - "reward_std": 0.5148144848644733, - "rewards/SMILES_validity_reward": 0.6583333387970924, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8125, - "rewards/reasoning_steps_reward": 0.006944444612599909, - "rewards/repetition_penalty_reward": -0.01866041096400295, - "rewards/smiles_len_reward": -0.15211048838682473, - "rewards/tag_count_reward": 0.018880208837799728, + "completion_length": 115.69791889190674, + "epoch": 0.056, + "grad_norm": 6.3178863525390625, + "kl": 0.0914154052734375, + "learning_rate": 4.038522720308732e-07, + "loss": -0.0181, + "num_tokens": 4640646.0, + "reward": -0.39627854742866475, + "reward_std": 0.10544795080204494, + "rewards/SMILES_validity_reward": -0.5166666894219816, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.05555555666796863, + "rewards/repetition_penalty_reward": -0.018862670767703094, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.013020833721384406, "step": 140 }, { "clip_ratio": 0.0, - "completion_length": 104.8984397649765, - "epoch": 0.1136, - "grad_norm": 15.497629165649414, - "kl": 12.576171875, - "learning_rate": 4.3250743967564364e-07, - "loss": 0.3195, - "num_tokens": 12658640.0, - "reward": 0.6857658997178078, - "reward_std": 0.4671833934262395, - "rewards/SMILES_validity_reward": 0.7041666712611914, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8359375, - "rewards/reasoning_steps_reward": 0.006944444612599909, - "rewards/repetition_penalty_reward": -0.012855285956902662, - "rewards/smiles_len_reward": -0.0961968683404848, - "rewards/tag_count_reward": 0.022786458488553762, + "completion_length": 125.94791948795319, + "epoch": 0.0568, + "grad_norm": 7.152596950531006, + "kl": 0.10680389404296875, + "learning_rate": 4.009966837786194e-07, + "loss": 0.0079, + "num_tokens": 4701889.0, + "reward": -0.45083487406373024, + "reward_std": 0.05493604081857484, + "rewards/SMILES_validity_reward": -0.5833333563059568, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.055555556900799274, + "rewards/repetition_penalty_reward": -0.0222385291417595, + "rewards/smiles_len_reward": -0.026041667442768812, + "rewards/tag_count_reward": 0.0052083334885537624, "step": 142 }, { "clip_ratio": 0.0, - "completion_length": 95.03906536102295, - "epoch": 0.1152, - "grad_norm": 752693.875, - "kl": 64275.8251953125, - "learning_rate": 4.2983495008466273e-07, - "loss": 64.4247, - "num_tokens": 12793439.0, - "reward": 0.6533464230597019, - "reward_std": 0.49899647012352943, - "rewards/SMILES_validity_reward": 0.658333346247673, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8489583358168602, - "rewards/reasoning_steps_reward": 0.0026041667442768812, - "rewards/repetition_penalty_reward": -0.014190958245308138, - "rewards/smiles_len_reward": -0.13880437827901915, - "rewards/tag_count_reward": 0.028645833546761423, + "completion_length": 170.20833539962769, + "epoch": 0.0576, + "grad_norm": 5.538232803344727, + "kl": 0.09381866455078125, + "learning_rate": 3.981097376494259e-07, + "loss": -0.1594, + "num_tokens": 4767381.0, + "reward": -0.4077454451471567, + "reward_std": 0.15711192146409303, + "rewards/SMILES_validity_reward": -0.5333333536982536, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.06944444682449102, + "rewards/repetition_penalty_reward": -0.025545662763761356, + "rewards/smiles_len_reward": -0.0416666679084301, + "rewards/tag_count_reward": 0.02864583395421505, "step": 144 }, { "clip_ratio": 0.0, - "completion_length": 85.64583575725555, - "epoch": 0.1168, - "grad_norm": 16.18448829650879, - "kl": 19.9951171875, - "learning_rate": 4.2711917772054997e-07, - "loss": 0.2634, - "num_tokens": 12924631.0, - "reward": 0.7568464614450932, - "reward_std": 0.4012842336669564, - "rewards/SMILES_validity_reward": 0.7791666649281979, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8723958395421505, - "rewards/reasoning_steps_reward": 0.01388888928340748, - "rewards/repetition_penalty_reward": -0.01767248242686037, - "rewards/smiles_len_reward": -0.018638497567735612, - "rewards/tag_count_reward": 0.01953125005820766, + "completion_length": 98.79166948795319, + "epoch": 0.0584, + "grad_norm": 5.379575729370117, + "kl": 0.1139678955078125, + "learning_rate": 3.951920331829592e-07, + "loss": 0.0907, + "num_tokens": 4826017.0, + "reward": -0.44052676670253277, + "reward_std": 0.07575159638145124, + "rewards/SMILES_validity_reward": -0.5666666887700558, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.031250000931322575, + "rewards/reasoning_steps_reward": 0.0069444444961845875, + "rewards/repetition_penalty_reward": -0.013254568788397592, + "rewards/smiles_len_reward": -0.031250000931322575, + "rewards/tag_count_reward": 0.0052083334885537624, "step": 146 }, { "clip_ratio": 0.0, - "completion_length": 83.21614944934845, - "epoch": 0.1184, - "grad_norm": 31.727739334106445, - "kl": 22.85546875, - "learning_rate": 4.2436077621686784e-07, - "loss": 0.2287, - "num_tokens": 13054890.0, - "reward": 0.6930093914270401, - "reward_std": 0.41775880940258503, - "rewards/SMILES_validity_reward": 0.6924479249864817, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8697916679084301, - "rewards/reasoning_steps_reward": 0.004340277868323028, - "rewards/repetition_penalty_reward": -0.014475565858447226, - "rewards/smiles_len_reward": -0.041021980927325785, - "rewards/tag_count_reward": 0.02473958331393078, + "completion_length": 115.91666913032532, + "epoch": 0.0592, + "grad_norm": 5.238061904907227, + "kl": 0.0884857177734375, + "learning_rate": 3.922441763065506e-07, + "loss": -0.0319, + "num_tokens": 4886297.0, + "reward": -0.4026124000083655, + "reward_std": 0.12813380375882844, + "rewards/SMILES_validity_reward": -0.5333333550952375, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.07291666883975267, + "rewards/reasoning_steps_reward": 0.024305556202307343, + "rewards/repetition_penalty_reward": -0.0176178389810957, + "rewards/smiles_len_reward": -0.0416666679084301, + "rewards/tag_count_reward": 0.02343750069849193, "step": 148 }, { "clip_ratio": 0.0, - "completion_length": 109.81771111488342, - "epoch": 0.12, - "grad_norm": 19.164730072021484, - "kl": 52.619140625, - "learning_rate": 4.2156040946718343e-07, - "loss": 0.5484, - "num_tokens": 13195364.0, - "reward": 0.6934168599545956, - "reward_std": 0.45168319437652826, - "rewards/SMILES_validity_reward": 0.7140625007450581, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8463541679084301, - "rewards/reasoning_steps_reward": 0.010416666977107525, - "rewards/repetition_penalty_reward": -0.02252123405196471, - "rewards/smiles_len_reward": -0.12117591802962124, - "rewards/tag_count_reward": 0.029947916918899864, + "completion_length": 81.14583551883698, + "epoch": 0.06, + "grad_norm": 9.144115447998047, + "kl": 0.10345458984375, + "learning_rate": 3.8926677920936093e-07, + "loss": 0.0174, + "num_tokens": 4943239.0, + "reward": -0.4181996285915375, + "reward_std": 0.12576395435098675, + "rewards/SMILES_validity_reward": -0.5500000212341547, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.06250000186264515, + "rewards/reasoning_steps_reward": 0.02083333395421505, + "rewards/repetition_penalty_reward": -0.014288760961790103, + "rewards/smiles_len_reward": -0.03645833441987634, + "rewards/tag_count_reward": 0.010416666977107525, "step": 150 }, { "clip_ratio": 0.0, - "completion_length": 77.8828147649765, - "epoch": 0.1216, - "grad_norm": 14.251289367675781, - "kl": 23.25, - "learning_rate": 4.187187514652819e-07, - "loss": 0.282, - "num_tokens": 13323575.0, - "reward": 0.7740835659205914, - "reward_std": 0.40268346946686506, - "rewards/SMILES_validity_reward": 0.7986979261040688, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8984374962747097, - "rewards/reasoning_steps_reward": 0.011284722539130598, - "rewards/repetition_penalty_reward": -0.010957858150504762, - "rewards/smiles_len_reward": -0.06391943350899965, - "rewards/tag_count_reward": 0.018229166860692203, + "completion_length": 126.05208683013916, + "epoch": 0.0608, + "grad_norm": 6.60600471496582, + "kl": 0.105255126953125, + "learning_rate": 3.862604602152464e-07, + "loss": -0.1057, + "num_tokens": 5004492.0, + "reward": -0.4281045887619257, + "reward_std": 0.09912917913607089, + "rewards/SMILES_validity_reward": -0.5666666887700558, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.10416667023673654, + "rewards/repetition_penalty_reward": -0.02531764625746291, + "rewards/smiles_len_reward": -0.026041667442768812, + "rewards/tag_count_reward": 0.007812500232830644, "step": 152 }, { "clip_ratio": 0.0, - "completion_length": 91.5494817495346, - "epoch": 0.1232, - "grad_norm": 103.39234161376953, - "kl": 59.921875, - "learning_rate": 4.158364861429493e-07, - "loss": 0.3783, - "num_tokens": 13457034.0, - "reward": 0.658618837594986, - "reward_std": 0.5097680818289518, - "rewards/SMILES_validity_reward": 0.6708333306014538, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8463541679084301, - "rewards/reasoning_steps_reward": 0.010416666977107525, - "rewards/repetition_penalty_reward": -0.015188703837338835, - "rewards/smiles_len_reward": -0.1719313338799111, - "rewards/tag_count_reward": 0.02799479168606922, + "completion_length": 138.1041705608368, + "epoch": 0.0616, + "grad_norm": 8.543445587158203, + "kl": 0.08979034423828125, + "learning_rate": 3.8322584365434934e-07, + "loss": -0.0847, + "num_tokens": 5066902.0, + "reward": -0.4112464152276516, + "reward_std": 0.1456986589182634, + "rewards/SMILES_validity_reward": -0.5333333536982536, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.052083334885537624, + "rewards/reasoning_steps_reward": 0.03819444542750716, + "rewards/repetition_penalty_reward": -0.02149280160665512, + "rewards/smiles_len_reward": -0.06250000186264515, + "rewards/tag_count_reward": 0.010416666744276881, "step": 154 }, { "clip_ratio": 0.0, - "completion_length": 108.07552409172058, - "epoch": 0.1248, - "grad_norm": 460.58990478515625, - "kl": 540.94921875, - "learning_rate": 4.129143072053638e-07, - "loss": 0.8874, - "num_tokens": 13596839.0, - "reward": 0.6298879142850637, - "reward_std": 0.5001714690588415, - "rewards/SMILES_validity_reward": 0.6023437635740265, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.856770820915699, - "rewards/reasoning_steps_reward": 0.0190972225391306, - "rewards/repetition_penalty_reward": -0.01785970525816083, - "rewards/smiles_len_reward": -0.013817999046295881, - "rewards/tag_count_reward": 0.024739583488553762, + "completion_length": 179.20833945274353, + "epoch": 0.0624, + "grad_norm": 7.902252197265625, + "kl": 0.11370849609375, + "learning_rate": 3.8016355973344173e-07, + "loss": -0.1289, + "num_tokens": 5133258.0, + "reward": -0.40283326152712107, + "reward_std": 0.13701566337113036, + "rewards/SMILES_validity_reward": -0.5333333550952375, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.06250000186264515, + "rewards/reasoning_steps_reward": 0.0902777798473835, + "rewards/repetition_penalty_reward": -0.025902962646796368, + "rewards/smiles_len_reward": -0.06250000186264515, + "rewards/tag_count_reward": 0.015625000465661287, "step": 156 }, { "clip_ratio": 0.0, - "completion_length": 65.27604413032532, - "epoch": 0.1264, - "grad_norm": 72.43608093261719, - "kl": 25.7138671875, - "learning_rate": 4.0995291796413365e-07, - "loss": 0.2613, - "num_tokens": 13720209.0, - "reward": 0.7709857225418091, - "reward_std": 0.3671606592833996, - "rewards/SMILES_validity_reward": 0.804166667163372, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8854166716337204, - "rewards/reasoning_steps_reward": 0.0026041667442768812, - "rewards/repetition_penalty_reward": -0.009442822036362486, - "rewards/smiles_len_reward": -0.0869512411300093, - "rewards/tag_count_reward": 0.018229167035315186, + "completion_length": 183.17708587646484, + "epoch": 0.0632, + "grad_norm": 5.164620399475098, + "kl": 0.089508056640625, + "learning_rate": 3.7707424440504863e-07, + "loss": 0.0345, + "num_tokens": 5199995.0, + "reward": -0.36004658944148105, + "reward_std": 0.23757518313504988, + "rewards/SMILES_validity_reward": -0.4833333524875343, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.09375000279396772, + "rewards/reasoning_steps_reward": 0.08333333511836827, + "rewards/repetition_penalty_reward": -0.03744592401199043, + "rewards/smiles_len_reward": -0.06250000139698386, + "rewards/tag_count_reward": 0.018229166977107525, "step": 158 }, { "clip_ratio": 0.0, - "completion_length": 69.42969000339508, - "epoch": 0.128, - "grad_norm": 15.941231727600098, - "kl": 17.091796875, - "learning_rate": 4.0695303116802467e-07, - "loss": 0.2727, - "num_tokens": 13845174.0, - "reward": 0.7843712531030178, - "reward_std": 0.3681061351671815, - "rewards/SMILES_validity_reward": 0.8041666708886623, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9140624962747097, - "rewards/reasoning_steps_reward": 0.004340277868323028, - "rewards/repetition_penalty_reward": -0.011173382534252596, - "rewards/smiles_len_reward": -0.03773681813618168, - "rewards/tag_count_reward": 0.016927083546761423, + "completion_length": 107.97916984558105, + "epoch": 0.064, + "grad_norm": 5.103909015655518, + "kl": 0.102081298828125, + "learning_rate": 3.739585392353787e-07, + "loss": -0.1095, + "num_tokens": 5259513.0, + "reward": -0.37965655652806163, + "reward_std": 0.18476388306226, + "rewards/SMILES_validity_reward": -0.5000000200234354, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0729166679084301, + "rewards/reasoning_steps_reward": 0.03819444542750716, + "rewards/repetition_penalty_reward": -0.011844190346891992, + "rewards/smiles_len_reward": -0.05208333441987634, + "rewards/tag_count_reward": 0.010416666977107525, "step": 160 }, { "clip_ratio": 0.0, - "completion_length": 80.86198151111603, - "epoch": 0.1296, - "grad_norm": 21.729921340942383, - "kl": 19.8984375, - "learning_rate": 4.039153688314145e-07, - "loss": 0.2752, - "num_tokens": 13974529.0, - "reward": 0.7434104010462761, - "reward_std": 0.4333901312202215, - "rewards/SMILES_validity_reward": 0.7666666693985462, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8958333283662796, - "rewards/reasoning_steps_reward": 0.013020833721384406, - "rewards/repetition_penalty_reward": -0.011836088739073602, - "rewards/smiles_len_reward": -0.14598823036067188, - "rewards/tag_count_reward": 0.024739583663176745, + "completion_length": 133.76042115688324, + "epoch": 0.0648, + "grad_norm": 10.518568992614746, + "kl": 0.1388397216796875, + "learning_rate": 3.7081709127108767e-07, + "loss": -0.0174, + "num_tokens": 5321506.0, + "reward": -0.3726199440425262, + "reward_std": 0.1925114190817112, + "rewards/SMILES_validity_reward": -0.5166666875593364, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.13541666977107525, + "rewards/reasoning_steps_reward": 0.08333333465270698, + "rewards/repetition_penalty_reward": -0.03140865158638917, + "rewards/smiles_len_reward": -0.08854166837409139, + "rewards/tag_count_reward": 0.020833333488553762, "step": 162 }, { "clip_ratio": 0.0, - "completion_length": 91.11719036102295, - "epoch": 0.1312, - "grad_norm": 42.10722351074219, - "kl": 109.4140625, - "learning_rate": 4.008406620605189e-07, - "loss": 0.4418, - "num_tokens": 14107822.0, - "reward": 0.7064198963344097, - "reward_std": 0.45633639767766, - "rewards/SMILES_validity_reward": 0.7291666679084301, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8515625, - "rewards/reasoning_steps_reward": 0.0164930559694767, - "rewards/repetition_penalty_reward": -0.014415540204936406, - "rewards/smiles_len_reward": -0.11886932025663555, - "rewards/tag_count_reward": 0.02213541668606922, + "completion_length": 121.31250333786011, + "epoch": 0.0656, + "grad_norm": 13.173834800720215, + "kl": 0.153289794921875, + "learning_rate": 3.6765055290490513e-07, + "loss": 0.0415, + "num_tokens": 5382304.0, + "reward": -0.3137630212586373, + "reward_std": 0.2859425003753131, + "rewards/SMILES_validity_reward": -0.4333333526737988, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.14583333674818277, + "rewards/reasoning_steps_reward": 0.05208333465270698, + "rewards/repetition_penalty_reward": -0.023568583332234994, + "rewards/smiles_len_reward": -0.10416666930541396, + "rewards/tag_count_reward": 0.03385416744276881, "step": 164 }, { "clip_ratio": 0.0, - "completion_length": 60.77864742279053, - "epoch": 0.1328, - "grad_norm": 220.8507537841797, - "kl": 65.47265625, - "learning_rate": 3.977296508774278e-07, - "loss": 0.3117, - "num_tokens": 14229465.0, - "reward": 0.7945839650928974, - "reward_std": 0.3342463602311909, - "rewards/SMILES_validity_reward": 0.8333333358168602, + "completion_length": 97.02083551883698, + "epoch": 0.0664, + "grad_norm": 8.68799877166748, + "kl": 0.1730804443359375, + "learning_rate": 3.644595817401501e-07, + "loss": -0.0409, + "num_tokens": 5440770.0, + "reward": -0.27725822292268276, + "reward_std": 0.28745744668412954, + "rewards/SMILES_validity_reward": -0.38333335146307945, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8932291679084301, - "rewards/reasoning_steps_reward": 0.008680555794853717, - "rewards/repetition_penalty_reward": -0.006365777313476428, - "rewards/smiles_len_reward": -0.09033061633817852, - "rewards/tag_count_reward": 0.020833333488553762, + "rewards/format_reward": 0.1458333358168602, + "rewards/reasoning_steps_reward": 0.03472222341224551, + "rewards/repetition_penalty_reward": -0.009388637714437209, + "rewards/smiles_len_reward": -0.06770833535119891, + "rewards/tag_count_reward": 0.015625000232830644, "step": 166 }, { "clip_ratio": 0.0, - "completion_length": 97.78646194934845, - "epoch": 0.1344, - "grad_norm": 13.82856559753418, - "kl": 27.58203125, - "learning_rate": 3.945830840419966e-07, - "loss": 0.3424, - "num_tokens": 14365319.0, - "reward": 0.7539720423519611, - "reward_std": 0.42417754977941513, - "rewards/SMILES_validity_reward": 0.7723958343267441, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.890625, - "rewards/reasoning_steps_reward": 0.013020833721384406, - "rewards/repetition_penalty_reward": -0.009425983691471629, - "rewards/smiles_len_reward": -0.0685632707318291, - "rewards/tag_count_reward": 0.026041667093522847, + "completion_length": 121.968754529953, + "epoch": 0.0672, + "grad_norm": 11.079466819763184, + "kl": 0.1530609130859375, + "learning_rate": 3.6124484045416483e-07, + "loss": 0.0816, + "num_tokens": 5501631.0, + "reward": -0.25446695898426697, + "reward_std": 0.32407376743503846, + "rewards/SMILES_validity_reward": -0.383333352394402, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.21875000465661287, + "rewards/reasoning_steps_reward": 0.03472222317941487, + "rewards/repetition_penalty_reward": -0.013246808550320566, + "rewards/smiles_len_reward": -0.07812500232830644, + "rewards/tag_count_reward": 0.039062499767169356, "step": 168 }, { "clip_ratio": 0.0, - "completion_length": 79.54427301883698, - "epoch": 0.136, - "grad_norm": 31.95520782470703, - "kl": 27.662109375, - "learning_rate": 3.9140171887163466e-07, - "loss": 0.2672, - "num_tokens": 14494168.0, - "reward": 0.7264284733682871, - "reward_std": 0.4606617968529463, - "rewards/SMILES_validity_reward": 0.7416666690260172, + "completion_length": 153.052086353302, + "epoch": 0.068, + "grad_norm": 8.2036714553833, + "kl": 0.171844482421875, + "learning_rate": 3.580069966606949e-07, + "loss": -0.1069, + "num_tokens": 5565476.0, + "reward": -0.17629894718993455, + "reward_std": 0.47014068998396397, + "rewards/SMILES_validity_reward": -0.2833333467133343, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8749999925494194, - "rewards/reasoning_steps_reward": 0.010416666977107525, - "rewards/repetition_penalty_reward": -0.011190719229489332, - "rewards/smiles_len_reward": -0.07765073655173182, - "rewards/tag_count_reward": 0.026041666453238577, + "rewards/format_reward": 0.26041666977107525, + "rewards/reasoning_steps_reward": 0.052083334885537624, + "rewards/repetition_penalty_reward": -0.03746950729691889, + "rewards/smiles_len_reward": -0.13541666883975267, + "rewards/tag_count_reward": 0.05989583395421505, "step": 170 }, { "clip_ratio": 0.0, - "completion_length": 86.70573210716248, - "epoch": 0.1376, - "grad_norm": 111.74293518066406, - "kl": 73.783203125, - "learning_rate": 3.8818632105903315e-07, - "loss": 0.351, - "num_tokens": 14625767.0, - "reward": 0.7140463925898075, - "reward_std": 0.3952889391221106, - "rewards/SMILES_validity_reward": 0.7299479208886623, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8723958283662796, - "rewards/reasoning_steps_reward": 0.010416666977107525, - "rewards/repetition_penalty_reward": -0.011792463701567613, - "rewards/smiles_len_reward": -0.10516674036625773, - "rewards/tag_count_reward": 0.020182292093522847, + "completion_length": 130.13542079925537, + "epoch": 0.0688, + "grad_norm": 12.656828880310059, + "kl": 0.278564453125, + "learning_rate": 3.547467227712444e-07, + "loss": -0.0281, + "num_tokens": 5627121.0, + "reward": -0.14627268894400913, + "reward_std": 0.501163106149761, + "rewards/SMILES_validity_reward": -0.2333333482965827, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.23958333674818277, + "rewards/reasoning_steps_reward": 0.031250000931322575, + "rewards/repetition_penalty_reward": -0.022102794086094946, + "rewards/smiles_len_reward": -0.09375000232830644, + "rewards/tag_count_reward": 0.03645833348855376, "step": 172 }, { "clip_ratio": 0.0, - "completion_length": 62.22135591506958, - "epoch": 0.1392, - "grad_norm": 31.038663864135742, - "kl": 34.14453125, - "learning_rate": 3.849376644878782e-07, - "loss": 0.3066, - "num_tokens": 14747964.0, - "reward": 0.7357414551079273, - "reward_std": 0.4182268213480711, - "rewards/SMILES_validity_reward": 0.7749999836087227, + "completion_length": 136.18750500679016, + "epoch": 0.0696, + "grad_norm": 8.628015518188477, + "kl": 0.2374114990234375, + "learning_rate": 3.5146469585543386e-07, + "loss": 0.0807, + "num_tokens": 5689347.0, + "reward": -0.11965863051591441, + "reward_std": 0.4950985286559444, + "rewards/SMILES_validity_reward": -0.21666668402031064, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8541666679084301, - "rewards/reasoning_steps_reward": 0.013020833721384406, - "rewards/repetition_penalty_reward": -0.008280414986074902, - "rewards/smiles_len_reward": -0.16086881840601563, - "rewards/tag_count_reward": 0.026041666395030916, + "rewards/format_reward": 0.3020833367481828, + "rewards/reasoning_steps_reward": 0.031250000931322575, + "rewards/repetition_penalty_reward": -0.015858068407396786, + "rewards/smiles_len_reward": -0.11979166977107525, + "rewards/tag_count_reward": 0.01822916720993817, "step": 174 }, { "clip_ratio": 0.0, - "completion_length": 92.81250166893005, - "epoch": 0.1408, - "grad_norm": 17.010215759277344, - "kl": 70.134765625, - "learning_rate": 3.8165653104659185e-07, - "loss": 0.3149, - "num_tokens": 14881908.0, - "reward": 0.7312784865498543, - "reward_std": 0.42725326027721167, - "rewards/SMILES_validity_reward": 0.7666666693985462, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8489583358168602, - "rewards/reasoning_steps_reward": 0.015625000465661287, - "rewards/repetition_penalty_reward": -0.0110986630897969, - "rewards/smiles_len_reward": -0.13132599194068462, - "rewards/tag_count_reward": 0.026041667151730508, + "completion_length": 109.19791996479034, + "epoch": 0.0704, + "grad_norm": 9.866938591003418, + "kl": 0.29345703125, + "learning_rate": 3.481615975003922e-07, + "loss": -0.0523, + "num_tokens": 5748982.0, + "reward": -0.16320179387548706, + "reward_std": 0.46159220617846586, + "rewards/SMILES_validity_reward": -0.25520835211500525, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.2500000074505806, + "rewards/reasoning_steps_reward": 0.0243055559694767, + "rewards/repetition_penalty_reward": -0.01569942681817338, + "rewards/smiles_len_reward": -0.140625003259629, + "rewards/tag_count_reward": 0.03645833395421505, "step": 176 }, { "clip_ratio": 0.0, - "completion_length": 66.02083623409271, - "epoch": 0.1424, - "grad_norm": 14.404475212097168, - "kl": 24.978515625, - "learning_rate": 3.783437104401469e-07, - "loss": 0.4138, - "num_tokens": 15005564.0, - "reward": 0.749178359284997, - "reward_std": 0.3878423860296607, - "rewards/SMILES_validity_reward": 0.7875000014901161, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8697916679084301, - "rewards/reasoning_steps_reward": 0.0, - "rewards/repetition_penalty_reward": -0.0074081632774323225, - "rewards/smiles_len_reward": -0.14612202369607985, - "rewards/tag_count_reward": 0.02343749994179234, + "completion_length": 141.4479193687439, + "epoch": 0.0712, + "grad_norm": 7.250740051269531, + "kl": 0.243499755859375, + "learning_rate": 3.448381136692089e-07, + "loss": 0.0395, + "num_tokens": 5811713.0, + "reward": -0.009984066942706704, + "reward_std": 0.5580916641047224, + "rewards/SMILES_validity_reward": -0.10520835034549236, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.40625000838190317, + "rewards/reasoning_steps_reward": 0.0416666679084301, + "rewards/repetition_penalty_reward": -0.03265416223439388, + "rewards/smiles_len_reward": -0.13541666883975267, + "rewards/tag_count_reward": 0.04427083348855376, "step": 178 }, { "clip_ratio": 0.0, - "completion_length": 77.119793176651, - "epoch": 0.144, - "grad_norm": 11.405951499938965, - "kl": 19.92578125, - "learning_rate": 3.75e-07, - "loss": 0.3072, - "num_tokens": 15133482.0, - "reward": 0.7756290249526501, - "reward_std": 0.37124256137758493, - "rewards/SMILES_validity_reward": 0.8140624910593033, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.890625, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.010999515707226237, - "rewards/smiles_len_reward": -0.13752924266736954, - "rewards/tag_count_reward": 0.02669270831393078, + "completion_length": 140.208336353302, + "epoch": 0.072, + "grad_norm": 13.146002769470215, + "kl": 0.4273834228515625, + "learning_rate": 3.4149493455847897e-07, + "loss": 0.0279, + "num_tokens": 5874325.0, + "reward": -0.06133632222190499, + "reward_std": 0.522064303047955, + "rewards/SMILES_validity_reward": -0.1666666786186397, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.3854166707023978, + "rewards/reasoning_steps_reward": 0.013888889225199819, + "rewards/repetition_penalty_reward": -0.043919739313423634, + "rewards/smiles_len_reward": -0.1302083362825215, + "rewards/tag_count_reward": 0.05729166814126074, "step": 180 }, { "clip_ratio": 0.0, - "completion_length": 79.77864706516266, - "epoch": 0.1456, - "grad_norm": 48.71555709838867, - "kl": 52.099609375, - "learning_rate": 3.7162620449218993e-07, - "loss": 0.2348, - "num_tokens": 15262421.0, - "reward": 0.7475567385554314, - "reward_std": 0.4051980022341013, - "rewards/SMILES_validity_reward": 0.7708333358168602, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8880208395421505, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.015756858985696454, - "rewards/smiles_len_reward": -0.0959165629465133, - "rewards/tag_count_reward": 0.019531250349245965, + "completion_length": 106.71875250339508, + "epoch": 0.0728, + "grad_norm": 10.344687461853027, + "kl": 0.3924560546875, + "learning_rate": 3.3813275445496766e-07, + "loss": -0.0579, + "num_tokens": 5933722.0, + "reward": 0.10793787596048787, + "reward_std": 0.6260992332245223, + "rewards/SMILES_validity_reward": 0.011458318680524826, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.5312500074505806, + "rewards/reasoning_steps_reward": 0.038194445660337806, + "rewards/repetition_penalty_reward": -0.020795845077373087, + "rewards/smiles_len_reward": -0.15104166930541396, + "rewards/tag_count_reward": 0.039062500931322575, "step": 182 }, { "clip_ratio": 0.0, - "completion_length": 60.70573079586029, - "epoch": 0.1472, - "grad_norm": 29.34798240661621, - "kl": 37.0546875, - "learning_rate": 3.682231359236459e-07, - "loss": 0.206, - "num_tokens": 15384036.0, - "reward": 0.7746610157191753, - "reward_std": 0.37258596147876233, - "rewards/SMILES_validity_reward": 0.8041666634380817, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8854166604578495, - "rewards/reasoning_steps_reward": 0.006944444612599909, - "rewards/repetition_penalty_reward": -0.008209791714762105, - "rewards/smiles_len_reward": -0.06293306592851877, - "rewards/tag_count_reward": 0.025390625349245965, + "completion_length": 129.43750321865082, + "epoch": 0.0736, + "grad_norm": 11.922599792480469, + "kl": 0.5929412841796875, + "learning_rate": 3.347522715914262e-07, + "loss": 0.0735, + "num_tokens": 5995300.0, + "reward": 0.14200845459708944, + "reward_std": 0.6097154561430216, + "rewards/SMILES_validity_reward": 0.06666666129603982, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.5208333395421505, + "rewards/reasoning_steps_reward": 0.0034722222480922937, + "rewards/repetition_penalty_reward": -0.021409569744719192, + "rewards/smiles_len_reward": -0.12500000279396772, + "rewards/tag_count_reward": 0.03385416744276881, "step": 184 }, { "clip_ratio": 0.0, - "completion_length": 93.895836353302, - "epoch": 0.1488, - "grad_norm": 11.991008758544922, - "kl": 19.599609375, - "learning_rate": 3.647916133467529e-07, - "loss": 0.4281, - "num_tokens": 15518396.0, - "reward": 0.7007387336343527, - "reward_std": 0.4086402766406536, - "rewards/SMILES_validity_reward": 0.7023437605239451, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8880208320915699, - "rewards/reasoning_steps_reward": 0.016493055794853717, - "rewards/repetition_penalty_reward": -0.022340198534948286, - "rewards/smiles_len_reward": -0.09132412448525429, - "rewards/tag_count_reward": 0.024088542093522847, + "completion_length": 72.20833587646484, + "epoch": 0.0744, + "grad_norm": 13.126518249511719, + "kl": 0.7501220703125, + "learning_rate": 3.313541880015877e-07, + "loss": 0.0348, + "num_tokens": 6051384.0, + "reward": 0.2482103147485759, + "reward_std": 0.6659532003104687, + "rewards/SMILES_validity_reward": 0.14999999664723873, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.6562500074505806, + "rewards/reasoning_steps_reward": 0.027777778450399637, + "rewards/repetition_penalty_reward": -0.009738226246554404, + "rewards/smiles_len_reward": -0.11979166930541396, + "rewards/tag_count_reward": 0.06510416744276881, "step": 186 }, { "clip_ratio": 0.0, - "completion_length": 69.02083575725555, - "epoch": 0.1504, - "grad_norm": 13.47666072845459, - "kl": 14.310546875, - "learning_rate": 3.6133246266222233e-07, - "loss": 0.1876, - "num_tokens": 15643204.0, - "reward": 0.7891622483730316, - "reward_std": 0.3716621574712917, - "rewards/SMILES_validity_reward": 0.8083333224058151, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9036458283662796, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.01272648310259683, - "rewards/smiles_len_reward": 0.004477886424865574, - "rewards/tag_count_reward": 0.022786458604969084, + "completion_length": 83.80208563804626, + "epoch": 0.0752, + "grad_norm": 11.092013359069824, + "kl": 0.70703125, + "learning_rate": 3.279392093743747e-07, + "loss": 0.0463, + "num_tokens": 6108581.0, + "reward": 0.36337129410821944, + "reward_std": 0.6607330553233624, + "rewards/SMILES_validity_reward": 0.3333333395421505, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.6250000055879354, + "rewards/reasoning_steps_reward": 0.010416666744276881, + "rewards/repetition_penalty_reward": -0.012121445353841409, + "rewards/smiles_len_reward": -0.10937500232830644, + "rewards/tag_count_reward": 0.03645833418704569, "step": 188 }, { "clip_ratio": 0.0, - "completion_length": 104.91406571865082, - "epoch": 0.152, - "grad_norm": 32.838348388671875, - "kl": 41.697265625, - "learning_rate": 3.5784651642031337e-07, - "loss": 0.4292, - "num_tokens": 15781795.0, - "reward": 0.7256705239415169, - "reward_std": 0.43252733163535595, - "rewards/SMILES_validity_reward": 0.7500000074505806, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8697916679084301, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.014169132729875855, - "rewards/smiles_len_reward": -0.12496000179089606, - "rewards/tag_count_reward": 0.028645833488553762, + "completion_length": 85.01041889190674, + "epoch": 0.076, + "grad_norm": 14.43812370300293, + "kl": 2.8946533203125, + "learning_rate": 3.245080449073459e-07, + "loss": 0.0825, + "num_tokens": 6165894.0, + "reward": 0.35271822242066264, + "reward_std": 0.6430098190903664, + "rewards/SMILES_validity_reward": 0.28333333088085055, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.7083333358168602, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.008235604094807059, + "rewards/smiles_len_reward": -0.1145833358168602, + "rewards/tag_count_reward": 0.03125000046566129, "step": 190 }, { "clip_ratio": 0.0, - "completion_length": 76.00000166893005, - "epoch": 0.1536, - "grad_norm": 104.36994171142578, - "kl": 43.689453125, - "learning_rate": 3.5433461362045447e-07, - "loss": 0.2651, - "num_tokens": 15909283.0, - "reward": 0.6896458622068167, - "reward_std": 0.4807543084025383, - "rewards/SMILES_validity_reward": 0.6973958415910602, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8906250037252903, - "rewards/reasoning_steps_reward": 0.0052083334885537624, - "rewards/repetition_penalty_reward": -0.01001048629404977, - "rewards/smiles_len_reward": -0.1771257909713313, - "rewards/tag_count_reward": 0.02473958331393078, + "completion_length": 92.05208504199982, + "epoch": 0.0768, + "grad_norm": 12.526323318481445, + "kl": 0.8994140625, + "learning_rate": 3.210614071594162e-07, + "loss": -0.0314, + "num_tokens": 6223883.0, + "reward": 0.4647933058440685, + "reward_std": 0.5719528524205089, + "rewards/SMILES_validity_reward": 0.43333333311602473, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.71875, + "rewards/reasoning_steps_reward": 0.020833333488553762, + "rewards/repetition_penalty_reward": -0.013005562825128436, + "rewards/smiles_len_reward": -0.09375000279396772, + "rewards/tag_count_reward": 0.04427083418704569, "step": 192 }, { "clip_ratio": 0.0, - "completion_length": 45.68489694595337, - "epoch": 0.1552, - "grad_norm": 10673.4697265625, - "kl": 2005.744140625, - "learning_rate": 3.507975995093125e-07, - "loss": 2.1771, - "num_tokens": 16025130.0, - "reward": 0.757711049169302, - "reward_std": 0.42240715958178043, - "rewards/SMILES_validity_reward": 0.7874999903142452, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8932291679084301, - "rewards/reasoning_steps_reward": 0.006944444612599909, - "rewards/repetition_penalty_reward": -0.005861769714101683, - "rewards/smiles_len_reward": -0.1369942625751719, - "rewards/tag_count_reward": 0.020833333663176745, + "completion_length": 120.45833647251129, + "epoch": 0.0776, + "grad_norm": 10.05866813659668, + "kl": 1.5377197265625, + "learning_rate": 3.1760001190287695e-07, + "loss": 0.056, + "num_tokens": 6284599.0, + "reward": 0.5278361081145704, + "reward_std": 0.5975875779986382, + "rewards/SMILES_validity_reward": 0.5000000088475645, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.7708333283662796, + "rewards/reasoning_steps_reward": 0.0069444444961845875, + "rewards/repetition_penalty_reward": -0.03327204444212839, + "rewards/smiles_len_reward": -0.06770833535119891, + "rewards/tag_count_reward": 0.05989583441987634, "step": 194 }, { "clip_ratio": 0.0, - "completion_length": 65.72916793823242, - "epoch": 0.1568, - "grad_norm": 173.10513305664062, - "kl": 54.318359375, - "learning_rate": 3.472363253773584e-07, - "loss": 0.2984, - "num_tokens": 16148674.0, - "reward": 0.752436138689518, - "reward_std": 0.4110519029200077, - "rewards/SMILES_validity_reward": 0.7916666641831398, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8697916641831398, - "rewards/reasoning_steps_reward": 0.015625000465661287, - "rewards/repetition_penalty_reward": -0.010170710236707237, - "rewards/smiles_len_reward": -0.16338579345028847, - "rewards/tag_count_reward": 0.031250000407453626, + "completion_length": 91.43750321865082, + "epoch": 0.0784, + "grad_norm": 1621.8258056640625, + "kl": 232.828125, + "learning_rate": 3.141245779747502e-07, + "loss": 0.2962, + "num_tokens": 6342529.0, + "reward": 0.5469494787976146, + "reward_std": 0.5157439392060041, + "rewards/SMILES_validity_reward": 0.5000000055879354, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8124999981373549, + "rewards/reasoning_steps_reward": 0.031250000931322575, + "rewards/repetition_penalty_reward": -0.012277308269403875, + "rewards/smiles_len_reward": -0.06250000186264515, + "rewards/tag_count_reward": 0.07552083465270698, "step": 196 }, { "clip_ratio": 0.0, - "completion_length": 60.609376072883606, - "epoch": 0.1584, - "grad_norm": 76.87126159667969, - "kl": 45.49609375, - "learning_rate": 3.43651648353978e-07, - "loss": 0.2331, - "num_tokens": 16270252.0, - "reward": 0.7394071221351624, - "reward_std": 0.40953583153896034, - "rewards/SMILES_validity_reward": 0.7583333291113377, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9010416641831398, - "rewards/reasoning_steps_reward": 0.0026041667442768812, - "rewards/repetition_penalty_reward": -0.00891629023681162, - "rewards/smiles_len_reward": -0.13125849929929245, - "rewards/tag_count_reward": 0.020182292035315186, + "completion_length": 94.01041972637177, + "epoch": 0.0792, + "grad_norm": 18.095582962036133, + "kl": 9.997314453125, + "learning_rate": 3.106358271275056e-07, + "loss": 0.056, + "num_tokens": 6400706.0, + "reward": 0.5480379769578576, + "reward_std": 0.5921958331018686, + "rewards/SMILES_validity_reward": 0.5166666777804494, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.7812500037252903, + "rewards/reasoning_steps_reward": 0.013888889225199819, + "rewards/repetition_penalty_reward": -0.00955194083508104, + "rewards/smiles_len_reward": -0.07291666837409139, + "rewards/tag_count_reward": 0.08854166744276881, "step": 198 }, { "clip_ratio": 0.0, - "completion_length": 74.92968940734863, - "epoch": 0.16, - "grad_norm": 247.7176513671875, - "kl": 203.548828125, - "learning_rate": 3.400444312011776e-07, - "loss": 0.5934, - "num_tokens": 16397329.0, - "reward": 0.7175582200288773, - "reward_std": 0.4524127524346113, - "rewards/SMILES_validity_reward": 0.7416666746139526, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8567708283662796, - "rewards/reasoning_steps_reward": 0.014756944845430553, - "rewards/repetition_penalty_reward": -0.0132746260278509, - "rewards/smiles_len_reward": -0.1087138393195346, - "rewards/tag_count_reward": 0.02083333337213844, + "completion_length": 65.22916853427887, + "epoch": 0.08, + "grad_norm": 16.6898250579834, + "kl": 2.7962646484375, + "learning_rate": 3.0713448387917227e-07, + "loss": 0.013, + "num_tokens": 6456120.0, + "reward": 0.6376010160893202, + "reward_std": 0.48966592181750457, + "rewards/SMILES_validity_reward": 0.6333333402872086, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.017361111473292112, + "rewards/repetition_penalty_reward": -0.01218536806118209, + "rewards/smiles_len_reward": -0.04687500139698386, + "rewards/tag_count_reward": 0.04687500116415322, "step": 200 }, { "clip_ratio": 0.0, - "completion_length": 89.00260639190674, - "epoch": 0.1616, - "grad_norm": 12.647216796875, - "kl": 16.201171875, - "learning_rate": 3.3641554210593414e-07, - "loss": 0.3755, - "num_tokens": 16529810.0, - "reward": 0.7455911412835121, - "reward_std": 0.41086752247065306, - "rewards/SMILES_validity_reward": 0.7708333283662796, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8828125, - "rewards/reasoning_steps_reward": 0.01649305602768436, - "rewards/repetition_penalty_reward": -0.014058286946237786, - "rewards/smiles_len_reward": -0.1200922247953713, - "rewards/tag_count_reward": 0.02929687494179234, + "completion_length": 82.77083480358124, + "epoch": 0.0808, + "grad_norm": 12.10589599609375, + "kl": 1.421875, + "learning_rate": 3.0362127536287636e-07, + "loss": 0.0907, + "num_tokens": 6513218.0, + "reward": 0.7045793011784554, + "reward_std": 0.3827786281472072, + "rewards/SMILES_validity_reward": 0.7166666761040688, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8437499925494194, + "rewards/reasoning_steps_reward": 0.02083333395421505, + "rewards/repetition_penalty_reward": -0.009937431663274765, + "rewards/smiles_len_reward": -0.06250000186264515, + "rewards/tag_count_reward": 0.04947916720993817, "step": 202 }, { "clip_ratio": 0.0, - "completion_length": 84.34896171092987, - "epoch": 0.1632, - "grad_norm": 36.01606750488281, - "kl": 30.4951171875, - "learning_rate": 3.327658544712395e-07, - "loss": 0.3301, - "num_tokens": 16660504.0, - "reward": 0.7541028931736946, - "reward_std": 0.3762020096182823, - "rewards/SMILES_validity_reward": 0.7666666731238365, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8984375074505806, - "rewards/reasoning_steps_reward": 0.0052083334885537624, - "rewards/repetition_penalty_reward": -0.015571234718663618, - "rewards/smiles_len_reward": -0.03337510232813656, - "rewards/tag_count_reward": 0.022786458663176745, + "completion_length": 47.45833492279053, + "epoch": 0.0816, + "grad_norm": 16.462682723999023, + "kl": 1.3189697265625, + "learning_rate": 3.0009693117583523e-07, + "loss": 0.0142, + "num_tokens": 6566926.0, + "reward": 0.6629129499197006, + "reward_std": 0.4532448905520141, + "rewards/SMILES_validity_reward": 0.633333345875144, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9062499925494194, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.0021217708999756724, + "rewards/smiles_len_reward": -0.06250000186264515, + "rewards/tag_count_reward": 0.0416666679084301, "step": 204 }, { "clip_ratio": 0.0, - "completion_length": 70.80729401111603, - "epoch": 0.1648, - "grad_norm": 11.157661437988281, - "kl": 41.3486328125, - "learning_rate": 3.290962467058891e-07, - "loss": 0.2184, - "num_tokens": 16785998.0, - "reward": 0.7728025019168854, - "reward_std": 0.35571749578230083, - "rewards/SMILES_validity_reward": 0.7958333231508732, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9166666679084301, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.008943471177190077, - "rewards/smiles_len_reward": -0.10535037610679865, - "rewards/tag_count_reward": 0.013671875291038305, + "completion_length": 53.000001430511475, + "epoch": 0.0824, + "grad_norm": 11.786378860473633, + "kl": 1.1549072265625, + "learning_rate": 2.965621832278401e-07, + "loss": 0.0448, + "num_tokens": 6621166.0, + "reward": 0.7589017078280449, + "reward_std": 0.4288171596126631, + "rewards/SMILES_validity_reward": 0.7666666805744171, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9062499925494194, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.006817507412051782, + "rewards/smiles_len_reward": -0.03645833441987634, + "rewards/tag_count_reward": 0.03645833418704569, "step": 206 }, { "clip_ratio": 0.0, - "completion_length": 77.96875286102295, - "epoch": 0.1664, - "grad_norm": 13.703250885009766, - "kl": 242.390625, - "learning_rate": 3.2540760201306637e-07, - "loss": 0.6453, - "num_tokens": 16914242.0, - "reward": 0.6993302181363106, - "reward_std": 0.4618812408298254, - "rewards/SMILES_validity_reward": 0.7166666649281979, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8776041641831398, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.008216387410357129, - "rewards/smiles_len_reward": -0.17790959577541798, - "rewards/tag_count_reward": 0.022135416860692203, + "completion_length": 69.76041889190674, + "epoch": 0.0832, + "grad_norm": 17.44184112548828, + "kl": 4.21337890625, + "learning_rate": 2.9301776558925875e-07, + "loss": 0.0582, + "num_tokens": 6677015.0, + "reward": 0.7213433459401131, + "reward_std": 0.4034923327853903, + "rewards/SMILES_validity_reward": 0.7166666742414236, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9062499888241291, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.011567762594495434, + "rewards/smiles_len_reward": -0.0416666679084301, + "rewards/tag_count_reward": 0.031249999767169356, "step": 208 }, { "clip_ratio": 0.0, - "completion_length": 86.28645968437195, - "epoch": 0.168, - "grad_norm": 15.79922866821289, - "kl": 40.01171875, - "learning_rate": 3.2170080817777257e-07, - "loss": 0.3946, - "num_tokens": 17045680.0, - "reward": 0.7615203447639942, - "reward_std": 0.39564547780901194, - "rewards/SMILES_validity_reward": 0.802864570170641, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.859375, - "rewards/reasoning_steps_reward": 0.01388888928340748, - "rewards/repetition_penalty_reward": -0.016861008152773138, - "rewards/smiles_len_reward": -0.09562780696433038, - "rewards/tag_count_reward": 0.015625000174622983, + "completion_length": 55.36458468437195, + "epoch": 0.084, + "grad_norm": 9.37515926361084, + "kl": 2.1453857421875, + "learning_rate": 2.894644143385885e-07, + "loss": 0.0234, + "num_tokens": 6731482.0, + "reward": 0.7111485507339239, + "reward_std": 0.4254695660783909, + "rewards/SMILES_validity_reward": 0.7166666761040688, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8541666679084301, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.0015365626823040657, + "rewards/smiles_len_reward": -0.031250000931322575, + "rewards/tag_count_reward": 0.05468750116415322, "step": 210 }, { "clip_ratio": 0.0, - "completion_length": 68.33333587646484, - "epoch": 0.1696, - "grad_norm": 20.75408172607422, - "kl": 50.240234375, - "learning_rate": 3.1797675735315454e-07, - "loss": 0.3075, - "num_tokens": 17170224.0, - "reward": 0.7155029028654099, - "reward_std": 0.456816378980875, - "rewards/SMILES_validity_reward": 0.7375000007450581, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8645833358168602, - "rewards/reasoning_steps_reward": 0.01822916720993817, - "rewards/repetition_penalty_reward": -0.009517593272903468, - "rewards/smiles_len_reward": -0.132069039857015, - "rewards/tag_count_reward": 0.022135417035315186, + "completion_length": 65.73958432674408, + "epoch": 0.0848, + "grad_norm": 16.811309814453125, + "kl": 3.41455078125, + "learning_rate": 2.859028674095937e-07, + "loss": 0.0406, + "num_tokens": 6786945.0, + "reward": 0.7542203683406115, + "reward_std": 0.39707838895265013, + "rewards/SMILES_validity_reward": 0.7666666749864817, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8958333320915699, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.019776776432991028, + "rewards/smiles_len_reward": -0.031250000931322575, + "rewards/tag_count_reward": 0.03906250069849193, "step": 212 }, { "clip_ratio": 0.0, - "completion_length": 62.45833611488342, - "epoch": 0.1712, - "grad_norm": 40.99763488769531, - "kl": 36.8671875, - "learning_rate": 3.142363458457805e-07, - "loss": 0.2656, - "num_tokens": 17292512.0, - "reward": 0.734221912920475, - "reward_std": 0.4476381931453943, - "rewards/SMILES_validity_reward": 0.7541666738688946, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8776041679084301, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.009483299305429682, - "rewards/smiles_len_reward": -0.10389782977290452, - "rewards/tag_count_reward": 0.035807291977107525, + "completion_length": 47.010417461395264, + "epoch": 0.0856, + "grad_norm": 9.183852195739746, + "kl": 1.72039794921875, + "learning_rate": 2.823338644380566e-07, + "loss": 0.0129, + "num_tokens": 6840610.0, + "reward": 0.8614385165274143, + "reward_std": 0.19072945657535456, + "rewards/SMILES_validity_reward": 0.8833333402872086, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9687499962747097, + "rewards/reasoning_steps_reward": 0.0069444444961845875, + "rewards/repetition_penalty_reward": -0.0029773336136713624, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.0416666679084301, "step": 214 }, { "clip_ratio": 0.0, - "completion_length": 93.15885698795319, - "epoch": 0.1728, - "grad_norm": 34.548885345458984, - "kl": 26.62890625, - "learning_rate": 3.104804738999169e-07, - "loss": 0.4481, - "num_tokens": 17426589.0, - "reward": 0.6837027445435524, - "reward_std": 0.47833400405943394, - "rewards/SMILES_validity_reward": 0.7000000104308128, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8515624962747097, - "rewards/reasoning_steps_reward": 0.010416666977107525, - "rewards/repetition_penalty_reward": -0.013724267948418856, - "rewards/smiles_len_reward": -0.12997856584843248, - "rewards/tag_count_reward": 0.015625000291038305, + "completion_length": 40.33333420753479, + "epoch": 0.0864, + "grad_norm": 12.742477416992188, + "kl": 2.152099609375, + "learning_rate": 2.7875814660817504e-07, + "loss": 0.0156, + "num_tokens": 6893634.0, + "reward": 0.8851364441215992, + "reward_std": 0.17163086455548182, + "rewards/SMILES_validity_reward": 0.9166666716337204, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9687499962747097, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.0028036211151629686, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.05208333395421505, "step": 216 }, { "clip_ratio": 0.0, - "completion_length": 70.51823091506958, - "epoch": 0.1744, - "grad_norm": 26.471229553222656, - "kl": 275.8046875, - "learning_rate": 3.067100454808567e-07, - "loss": 0.5704, - "num_tokens": 17551972.0, - "reward": 0.7661529034376144, - "reward_std": 0.41659063287079334, - "rewards/SMILES_validity_reward": 0.7916666604578495, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8854166679084301, - "rewards/reasoning_steps_reward": 0.013020833721384406, - "rewards/repetition_penalty_reward": -0.011230084801354678, - "rewards/smiles_len_reward": -0.059663921245373785, - "rewards/tag_count_reward": 0.021484375349245965, + "completion_length": 49.218750953674316, + "epoch": 0.0872, + "grad_norm": 13.92532730102539, + "kl": 2.0343017578125, + "learning_rate": 2.751764564986396e-07, + "loss": 0.0608, + "num_tokens": 6947511.0, + "reward": 0.7820510920137167, + "reward_std": 0.30305373720329953, + "rewards/SMILES_validity_reward": 0.8000000081956387, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9062499925494194, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.0008443895567324944, + "rewards/smiles_len_reward": -0.0416666679084301, + "rewards/tag_count_reward": 0.04427083395421505, "step": 218 }, { "clip_ratio": 0.0, - "completion_length": 81.42968988418579, - "epoch": 0.176, - "grad_norm": 13.7344970703125, - "kl": 37.447265625, - "learning_rate": 3.029259680573527e-07, - "loss": 0.4229, - "num_tokens": 17681545.0, - "reward": 0.7183681428432465, - "reward_std": 0.3968039182946086, - "rewards/SMILES_validity_reward": 0.7177083268761635, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9088541641831398, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.007644086541404249, - "rewards/smiles_len_reward": -0.08979546726914123, - "rewards/tag_count_reward": 0.0227864584303461, + "completion_length": 47.88541829586029, + "epoch": 0.088, + "grad_norm": 15.1016206741333, + "kl": 5.68994140625, + "learning_rate": 2.715895379284194e-07, + "loss": 0.0248, + "num_tokens": 7001260.0, + "reward": 0.7979313395917416, + "reward_std": 0.287646692362614, + "rewards/SMILES_validity_reward": 0.8166666738688946, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9270833246409893, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.01079208473674953, + "rewards/smiles_len_reward": -0.026041667442768812, + "rewards/tag_count_reward": 0.018229166977107525, "step": 220 }, { "clip_ratio": 0.0, - "completion_length": 75.84896063804626, - "epoch": 0.1776, - "grad_norm": 19.092857360839844, - "kl": 29.484375, - "learning_rate": 2.991291523832075e-07, - "loss": 0.3437, - "num_tokens": 17808975.0, - "reward": 0.7603292763233185, - "reward_std": 0.4128862749785185, - "rewards/SMILES_validity_reward": 0.7833333425223827, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8932291716337204, - "rewards/reasoning_steps_reward": 0.0052083334885537624, - "rewards/repetition_penalty_reward": -0.009071222404600121, - "rewards/smiles_len_reward": -0.07800182851497084, - "rewards/tag_count_reward": 0.022135417093522847, + "completion_length": 53.968751072883606, + "epoch": 0.0888, + "grad_norm": 14.147798538208008, + "kl": 1.9619140625, + "learning_rate": 2.6799813580229174e-07, + "loss": 0.0081, + "num_tokens": 7055593.0, + "reward": 0.8779301084578037, + "reward_std": 0.15663258303538896, + "rewards/SMILES_validity_reward": 0.9166666716337204, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9479166604578495, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.0019502766517689452, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.03645833418704569, "step": 222 }, { "clip_ratio": 0.0, - "completion_length": 69.0208352804184, - "epoch": 0.1792, - "grad_norm": 26.781389236450195, - "kl": 29.21875, - "learning_rate": 2.953205122780729e-07, - "loss": 0.3597, - "num_tokens": 17933783.0, - "reward": 0.7867212891578674, - "reward_std": 0.37980251759290695, - "rewards/SMILES_validity_reward": 0.7999999932944775, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9140624925494194, - "rewards/reasoning_steps_reward": 0.013020833721384406, - "rewards/repetition_penalty_reward": -0.011719140768036596, - "rewards/smiles_len_reward": 0.007446364965289831, - "rewards/tag_count_reward": 0.016276041860692203, + "completion_length": 60.437501311302185, + "epoch": 0.0896, + "grad_norm": 10.820075988769531, + "kl": 8.6739501953125, + "learning_rate": 2.6440299595614606e-07, + "loss": 0.0341, + "num_tokens": 7110547.0, + "reward": 0.7884009815752506, + "reward_std": 0.3595552204642445, + "rewards/SMILES_validity_reward": 0.8166666757315397, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8854166567325592, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.012345630770141724, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.04427083441987634, "step": 224 }, { "clip_ratio": 0.0, - "completion_length": 80.48698103427887, - "epoch": 0.1808, - "grad_norm": 24.122013092041016, - "kl": 94.544921875, - "learning_rate": 2.9150096440751103e-07, - "loss": 0.4924, - "num_tokens": 18062994.0, - "reward": 0.6973020005971193, - "reward_std": 0.45596596598625183, - "rewards/SMILES_validity_reward": 0.7166666742414236, + "completion_length": 60.79166853427887, + "epoch": 0.0904, + "grad_norm": 8.08423900604248, + "kl": 3.1444091796875, + "learning_rate": 2.6080486500209347e-07, + "loss": 0.04, + "num_tokens": 7165535.0, + "reward": 0.7426584232598543, + "reward_std": 0.2605252732464578, + "rewards/SMILES_validity_reward": 0.723958333954215, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8541666679084301, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.013226932846009731, - "rewards/smiles_len_reward": -0.12482209294103086, - "rewards/tag_count_reward": 0.02408854174427688, + "rewards/format_reward": 0.9374999962747097, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.0031045216892380267, + "rewards/smiles_len_reward": -0.026041667442768812, + "rewards/tag_count_reward": 0.06510416767559946, "step": 226 }, { "clip_ratio": 0.0, - "completion_length": 62.888023257255554, - "epoch": 0.1824, - "grad_norm": 3758.826171875, - "kl": 1040.8896484375, - "learning_rate": 2.8767142806237077e-07, - "loss": 1.3245, - "num_tokens": 18185447.0, - "reward": 0.7444559372961521, - "reward_std": 0.410754032433033, - "rewards/SMILES_validity_reward": 0.7791666686534882, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8723958283662796, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.009680914790806128, - "rewards/smiles_len_reward": -0.14576094248332083, - "rewards/tag_count_reward": 0.020833333604969084, + "completion_length": 37.729167222976685, + "epoch": 0.0912, + "grad_norm": 268.18206787109375, + "kl": 43.56005859375, + "learning_rate": 2.572044901734166e-07, + "loss": 0.0062, + "num_tokens": 7218309.0, + "reward": 0.7288062907755375, + "reward_std": 0.28436800511553884, + "rewards/SMILES_validity_reward": 0.7020833417773247, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9479166641831398, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.0015219146735034883, + "rewards/smiles_len_reward": -0.031250000931322575, + "rewards/tag_count_reward": 0.06250000046566129, "step": 228 }, { "clip_ratio": 0.0, - "completion_length": 65.07031440734863, - "epoch": 0.184, - "grad_norm": 1915.1563720703125, - "kl": 376.4609375, - "learning_rate": 2.838328249375328e-07, - "loss": 0.6467, - "num_tokens": 18308738.0, - "reward": 0.7443150542676449, - "reward_std": 0.42556965351104736, - "rewards/SMILES_validity_reward": 0.7778645865619183, + "completion_length": 38.33333432674408, + "epoch": 0.092, + "grad_norm": 12.932578086853027, + "kl": 2.032470703125, + "learning_rate": 2.536026191693893e-07, + "loss": 0.0224, + "num_tokens": 7271141.0, + "reward": 0.8507591001689434, + "reward_std": 0.25005635869456455, + "rewards/SMILES_validity_reward": 0.8666666746139526, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8723958320915699, - "rewards/reasoning_steps_reward": 0.014756944787222892, - "rewards/repetition_penalty_reward": -0.010134508582268609, - "rewards/smiles_len_reward": -0.1464992203982547, - "rewards/tag_count_reward": 0.022786458546761423, + "rewards/format_reward": 0.9687499962747097, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.0017853352474048734, + "rewards/smiles_len_reward": -0.0052083334885537624, + "rewards/tag_count_reward": 0.0416666679084301, "step": 230 }, { "clip_ratio": 0.0, - "completion_length": 58.880210161209106, - "epoch": 0.1856, - "grad_norm": 18.341007232666016, - "kl": 31.94921875, - "learning_rate": 2.7998607891007493e-07, - "loss": 0.3038, - "num_tokens": 18429652.0, - "reward": 0.7311629764735699, - "reward_std": 0.43252694979310036, - "rewards/SMILES_validity_reward": 0.7375000026077032, + "completion_length": 59.989585518836975, + "epoch": 0.0928, + "grad_norm": 77.25758361816406, + "kl": 25.74658203125, + "learning_rate": 2.5e-07, + "loss": 0.098, + "num_tokens": 7326052.0, + "reward": 0.8249511867761612, + "reward_std": 0.275203651515767, + "rewards/SMILES_validity_reward": 0.8500000089406967, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8984375, - "rewards/reasoning_steps_reward": 0.004340277868323028, - "rewards/repetition_penalty_reward": -0.006652665741057717, - "rewards/smiles_len_reward": -0.06340290373191237, - "rewards/tag_count_reward": 0.019531250349245965, + "rewards/format_reward": 0.9166666604578495, + "rewards/reasoning_steps_reward": 0.010416666744276881, + "rewards/repetition_penalty_reward": -0.005176817707251757, + "rewards/smiles_len_reward": -0.026041667442768812, + "rewards/tag_count_reward": 0.07031250093132257, "step": 232 }, { "clip_ratio": 0.0, - "completion_length": 73.97135615348816, - "epoch": 0.1872, - "grad_norm": 50.981346130371094, - "kl": 101.208984375, - "learning_rate": 2.761321158169134e-07, - "loss": 0.3981, - "num_tokens": 18556361.0, - "reward": 0.7673307359218597, - "reward_std": 0.3982690507546067, - "rewards/SMILES_validity_reward": 0.7791666723787785, + "completion_length": 42.56250071525574, + "epoch": 0.0936, + "grad_norm": 12.249787330627441, + "kl": 14.455810546875, + "learning_rate": 2.4639738083061073e-07, + "loss": -0.0119, + "num_tokens": 7379290.0, + "reward": 0.7708266153931618, + "reward_std": 0.23000530700664967, + "rewards/SMILES_validity_reward": 0.7520833369344473, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9036458395421505, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.009968915626814123, - "rewards/smiles_len_reward": -0.01503224135376513, - "rewards/tag_count_reward": 0.025390624825377017, + "rewards/format_reward": 0.9479166604578495, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.0016309260972775519, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.12239583325572312, "step": 234 }, { "clip_ratio": 0.0, - "completion_length": 80.88281464576721, - "epoch": 0.1888, - "grad_norm": 282.7349548339844, - "kl": 101.9609375, - "learning_rate": 2.722718632319716e-07, - "loss": 0.4886, - "num_tokens": 18685724.0, - "reward": 0.7614529021084309, - "reward_std": 0.3986309599131346, - "rewards/SMILES_validity_reward": 0.7874999977648258, + "completion_length": 48.91666758060455, + "epoch": 0.0944, + "grad_norm": 25.05546760559082, + "kl": 10.76171875, + "learning_rate": 2.4279550982658345e-07, + "loss": 0.0693, + "num_tokens": 7433138.0, + "reward": 0.7949926014989614, + "reward_std": 0.28037455228331964, + "rewards/SMILES_validity_reward": 0.8166666748002172, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8854166679084301, - "rewards/reasoning_steps_reward": 0.013020833488553762, - "rewards/repetition_penalty_reward": -0.013385478279815288, - "rewards/smiles_len_reward": -0.07403978041838855, - "rewards/tag_count_reward": 0.020182291977107525, + "rewards/format_reward": 0.9062499962747097, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.011533659446286038, + "rewards/smiles_len_reward": -0.026041667442768812, + "rewards/tag_count_reward": 0.05208333395421505, "step": 236 }, { "clip_ratio": 0.0, - "completion_length": 74.22656440734863, - "epoch": 0.1904, - "grad_norm": 1830.841552734375, - "kl": 231.3828125, - "learning_rate": 2.684062502429312e-07, - "loss": 0.415, - "num_tokens": 18812531.0, - "reward": 0.7645077109336853, - "reward_std": 0.37579927399929147, - "rewards/SMILES_validity_reward": 0.779166666790843, + "completion_length": 59.07291853427887, + "epoch": 0.0952, + "grad_norm": 8.763030052185059, + "kl": 1.9202880859375, + "learning_rate": 2.3919513499790646e-07, + "loss": 0.1179, + "num_tokens": 7487961.0, + "reward": 0.8150318637490273, + "reward_std": 0.3204096044646576, + "rewards/SMILES_validity_reward": 0.8333333432674408, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9036458358168602, - "rewards/reasoning_steps_reward": 0.015625000465661287, - "rewards/repetition_penalty_reward": -0.009541891871776897, - "rewards/smiles_len_reward": -0.04434057860635221, - "rewards/tag_count_reward": 0.018229166977107525, + "rewards/format_reward": 0.9166666641831398, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.011140864342451096, + "rewards/smiles_len_reward": -0.031250000931322575, + "rewards/tag_count_reward": 0.10937500186264515, "step": 238 }, { "clip_ratio": 0.0, - "completion_length": 86.03646087646484, - "epoch": 0.192, - "grad_norm": 82.80435943603516, - "kl": 793.9921875, - "learning_rate": 2.6453620722761895e-07, - "loss": 1.1515, - "num_tokens": 18943873.0, - "reward": 0.7398635447025299, - "reward_std": 0.4153846015688032, - "rewards/SMILES_validity_reward": 0.7500000111758709, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8776041641831398, - "rewards/reasoning_steps_reward": 0.010416666744276881, - "rewards/repetition_penalty_reward": -0.014263476856285706, - "rewards/smiles_len_reward": -0.009628391126170754, - "rewards/tag_count_reward": 0.029296875232830644, + "completion_length": 71.48958492279053, + "epoch": 0.096, + "grad_norm": 8.304740905761719, + "kl": 2.7021484375, + "learning_rate": 2.3559700404385394e-07, + "loss": 0.0967, + "num_tokens": 7543976.0, + "reward": 0.8159580808132887, + "reward_std": 0.276808429392986, + "rewards/SMILES_validity_reward": 0.8500000070780516, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8854166641831398, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.0065662864144542255, + "rewards/smiles_len_reward": -0.026041667442768812, + "rewards/tag_count_reward": 0.08593750256113708, "step": 240 }, { "clip_ratio": 0.0, - "completion_length": 61.32031428813934, - "epoch": 0.1936, - "grad_norm": 17.316062927246094, - "kl": 18.3447265625, - "learning_rate": 2.6066266563008265e-07, - "loss": 0.3039, - "num_tokens": 19065724.0, - "reward": 0.8148761950433254, - "reward_std": 0.3233891185373068, - "rewards/SMILES_validity_reward": 0.8541666530072689, + "completion_length": 69.59375298023224, + "epoch": 0.0968, + "grad_norm": 11.63044548034668, + "kl": 1.60302734375, + "learning_rate": 2.3200186419770823e-07, + "loss": 0.064, + "num_tokens": 7599809.0, + "reward": 0.8393127992749214, + "reward_std": 0.272423698563216, + "rewards/SMILES_validity_reward": 0.8666666746139526, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9036458358168602, + "rewards/format_reward": 0.9374999925494194, "rewards/reasoning_steps_reward": 0.0, - "rewards/repetition_penalty_reward": -0.008308633463457227, - "rewards/smiles_len_reward": -0.05777434818446636, - "rewards/tag_count_reward": 0.024739583139307797, + "rewards/repetition_penalty_reward": -0.004269022305379622, + "rewards/smiles_len_reward": -0.031250000931322575, + "rewards/tag_count_reward": 0.049479167675599456, "step": 242 }, { "clip_ratio": 0.0, - "completion_length": 79.41146039962769, - "epoch": 0.1952, - "grad_norm": 939.9111938476562, - "kl": 117.7265625, - "learning_rate": 2.567865577364107e-07, - "loss": 0.2587, - "num_tokens": 19194522.0, - "reward": 0.7968677990138531, - "reward_std": 0.331410052604042, - "rewards/SMILES_validity_reward": 0.824999988079071, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9192708320915699, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.010457739825142198, - "rewards/smiles_len_reward": -0.08623012714087963, - "rewards/tag_count_reward": 0.024739583546761423, + "completion_length": 55.96875178813934, + "epoch": 0.0976, + "grad_norm": 9.17193603515625, + "kl": 2.159423828125, + "learning_rate": 2.284104620715807e-07, + "loss": -0.0007, + "num_tokens": 7654334.0, + "reward": 0.7993007898330688, + "reward_std": 0.13739498698851094, + "rewards/SMILES_validity_reward": 0.8020833358168602, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9374999925494194, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.004389131907373667, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.08072916767559946, "step": 244 }, { "clip_ratio": 0.0, - "completion_length": 73.66927254199982, - "epoch": 0.1968, - "grad_norm": 14.630553245544434, - "kl": 34.8125, - "learning_rate": 2.5290881645034926e-07, - "loss": 0.3309, - "num_tokens": 19321115.0, - "reward": 0.7642820924520493, - "reward_std": 0.3856325391680002, - "rewards/SMILES_validity_reward": 0.7958333306014538, + "completion_length": 54.104168176651, + "epoch": 0.0984, + "grad_norm": 13.933065414428711, + "kl": 4.1160888671875, + "learning_rate": 2.2482354350136043e-07, + "loss": -0.052, + "num_tokens": 7708680.0, + "reward": 0.8486762568354607, + "reward_std": 0.256513943313621, + "rewards/SMILES_validity_reward": 0.8666666746139526, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8802083283662796, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.008856045191350859, - "rewards/smiles_len_reward": -0.09168360711191781, - "rewards/tag_count_reward": 0.024088542093522847, + "rewards/format_reward": 0.9374999925494194, + "rewards/reasoning_steps_reward": 0.0034722222480922937, + "rewards/repetition_penalty_reward": -0.0026484212721697986, + "rewards/smiles_len_reward": -0.026041667442768812, + "rewards/tag_count_reward": 0.13281250023283064, "step": 246 }, { "clip_ratio": 0.0, - "completion_length": 79.70312690734863, - "epoch": 0.1984, - "grad_norm": 21.64442253112793, - "kl": 28.0859375, - "learning_rate": 2.4903037506876995e-07, - "loss": 0.3431, - "num_tokens": 19450025.0, - "reward": 0.7640567347407341, - "reward_std": 0.41116272285580635, - "rewards/SMILES_validity_reward": 0.7874999940395355, + "completion_length": 56.34375178813934, + "epoch": 0.0992, + "grad_norm": 11.744587898254395, + "kl": 8.788330078125, + "learning_rate": 2.2124185339182496e-07, + "loss": 0.0587, + "num_tokens": 7763241.0, + "reward": 0.8349000215530396, + "reward_std": 0.28379016066901386, + "rewards/SMILES_validity_reward": 0.8666666746139526, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8776041716337204, - "rewards/reasoning_steps_reward": 0.018229167151730508, - "rewards/repetition_penalty_reward": -0.013350201916182414, - "rewards/smiles_len_reward": -0.03371383191552013, - "rewards/tag_count_reward": 0.02408854162786156, + "rewards/format_reward": 0.9062499962747097, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.0015219679335132241, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.08593750023283064, "step": 248 }, { "clip_ratio": 0.0, - "completion_length": 65.33593893051147, - "epoch": 0.2, - "grad_norm": 14.254769325256348, - "kl": 357.6015625, - "learning_rate": 2.4515216705704393e-07, - "loss": 0.4718, - "num_tokens": 19573418.0, - "reward": 0.7403161786496639, - "reward_std": 0.38923288183286786, - "rewards/SMILES_validity_reward": 0.7708333358168602, + "completion_length": 56.43750286102295, + "epoch": 0.1, + "grad_norm": 15.010062217712402, + "kl": 4.990234375, + "learning_rate": 2.1766613556194344e-07, + "loss": 0.0464, + "num_tokens": 7817811.0, + "reward": 0.8480795957148075, + "reward_std": 0.25808324449462816, + "rewards/SMILES_validity_reward": 0.8833333402872086, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8619791716337204, - "rewards/reasoning_steps_reward": 0.012152778101153672, - "rewards/repetition_penalty_reward": -0.009566615204676054, - "rewards/smiles_len_reward": -0.11114423366962001, - "rewards/tag_count_reward": 0.02994791674427688, + "rewards/format_reward": 0.9166666641831398, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.009830351526034065, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.07812500093132257, "step": 250 }, { "clip_ratio": 0.0, - "completion_length": 70.15625238418579, - "epoch": 0.2016, - "grad_norm": 20.493144989013672, - "kl": 32.86328125, - "learning_rate": 2.412751258243748e-07, - "loss": 0.382, - "num_tokens": 19698662.0, - "reward": 0.7679505236446857, - "reward_std": 0.40248001366853714, - "rewards/SMILES_validity_reward": 0.8041666522622108, + "completion_length": 45.85416758060455, + "epoch": 0.1008, + "grad_norm": 41.247093200683594, + "kl": 19.83984375, + "learning_rate": 2.1409713259040628e-07, + "loss": 0.0715, + "num_tokens": 7871365.0, + "reward": 0.8904657512903214, + "reward_std": 0.1805916415178217, + "rewards/SMILES_validity_reward": 0.9333333373069763, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.890625, - "rewards/reasoning_steps_reward": 0.011284722539130598, - "rewards/repetition_penalty_reward": -0.010105125089467037, - "rewards/smiles_len_reward": -0.14680567476898432, - "rewards/tag_count_reward": 0.024088542035315186, + "rewards/format_reward": 0.9166666604578495, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.014093825098825619, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.14583333558402956, "step": 252 }, { "clip_ratio": 0.0, - "completion_length": 63.04687678813934, - "epoch": 0.2032, - "grad_norm": 29.680883407592773, - "kl": 32.193359375, - "learning_rate": 2.37400184499145e-07, - "loss": 0.2616, - "num_tokens": 19821176.0, - "reward": 0.7311373427510262, - "reward_std": 0.43501078244298697, - "rewards/SMILES_validity_reward": 0.7458333410322666, + "completion_length": 59.23958468437195, + "epoch": 0.1016, + "grad_norm": 13.48940372467041, + "kl": 20.977294921875, + "learning_rate": 2.105355856614115e-07, + "loss": 0.0556, + "num_tokens": 7926204.0, + "reward": 0.7457279786467552, + "reward_std": 0.2488641007221304, + "rewards/SMILES_validity_reward": 0.7354166712611914, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8802083320915699, - "rewards/reasoning_steps_reward": 0.006944444612599909, - "rewards/repetition_penalty_reward": -0.008689612010130077, - "rewards/smiles_len_reward": -0.07047638797666878, - "rewards/tag_count_reward": 0.022135416918899864, + "rewards/format_reward": 0.9062499962747097, + "rewards/reasoning_steps_reward": 0.013888889225199819, + "rewards/repetition_penalty_reward": -0.004005965311080217, + "rewards/smiles_len_reward": -0.03645833441987634, + "rewards/tag_count_reward": 0.1171875, "step": 254 }, { "clip_ratio": 0.0, - "completion_length": 61.67187702655792, - "epoch": 0.2048, - "grad_norm": 31.821918487548828, - "kl": 74.078125, - "learning_rate": 2.3352827570433033e-07, - "loss": 0.4004, - "num_tokens": 19943162.0, - "reward": 0.8020155876874924, - "reward_std": 0.3569780103280209, - "rewards/SMILES_validity_reward": 0.8291666656732559, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9166666679084301, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.006827858622273197, - "rewards/smiles_len_reward": -0.06168944027740508, - "rewards/tag_count_reward": 0.026692708197515458, + "completion_length": 49.5208340883255, + "epoch": 0.1024, + "grad_norm": 11.00932502746582, + "kl": 3.1722412109375, + "learning_rate": 2.069822344107413e-07, + "loss": 0.0286, + "num_tokens": 7980110.0, + "reward": 0.8951140381395817, + "reward_std": 0.16878563386853784, + "rewards/SMILES_validity_reward": 0.9333333373069763, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9374999925494194, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.009277544566430151, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.12500000069849193, "step": 256 }, { "clip_ratio": 0.0, - "completion_length": 75.89583539962769, - "epoch": 0.2064, - "grad_norm": 24.872432708740234, - "kl": 34.111328125, - "learning_rate": 2.2966033133303545e-07, - "loss": 0.4437, - "num_tokens": 20070610.0, - "reward": 0.7651942074298859, - "reward_std": 0.3582833812106401, - "rewards/SMILES_validity_reward": 0.7874999977648258, + "completion_length": 42.94791805744171, + "epoch": 0.1032, + "grad_norm": 30.966983795166016, + "kl": 6.7318115234375, + "learning_rate": 2.034378167721599e-07, + "loss": 0.0322, + "num_tokens": 8033385.0, + "reward": 0.8276818729937077, + "reward_std": 0.30462851375341415, + "rewards/SMILES_validity_reward": 0.8500000089406967, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.890625, - "rewards/reasoning_steps_reward": 0.013020833721384406, - "rewards/repetition_penalty_reward": -0.010585774871287867, - "rewards/smiles_len_reward": -0.06807235861197114, - "rewards/tag_count_reward": 0.03320312488358468, + "rewards/format_reward": 0.8958333246409893, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.0013075313763692975, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.15625, "step": 258 }, { "clip_ratio": 0.0, - "completion_length": 61.50781464576721, - "epoch": 0.208, - "grad_norm": 41.98968505859375, - "kl": 44.3046875, - "learning_rate": 2.2579728232420523e-07, - "loss": 0.2895, - "num_tokens": 20192533.0, - "reward": 0.7789809294044971, - "reward_std": 0.3466755224435474, - "rewards/SMILES_validity_reward": 0.8028645887970924, + "completion_length": 49.312501192092896, + "epoch": 0.104, + "grad_norm": 14.884501457214355, + "kl": 382.56396484375, + "learning_rate": 1.9990306882416485e-07, + "loss": 0.3722, + "num_tokens": 8087271.0, + "reward": 0.8172137457877398, + "reward_std": 0.259702468290925, + "rewards/SMILES_validity_reward": 0.8333333358168602, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9088541604578495, - "rewards/reasoning_steps_reward": 0.006076389050576836, - "rewards/repetition_penalty_reward": -0.010937447841570247, - "rewards/smiles_len_reward": -0.06887253125751158, - "rewards/tag_count_reward": 0.016927083604969084, + "rewards/format_reward": 0.9062499962747097, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.002343074476812035, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.13802083441987634, "step": 260 }, { "clip_ratio": 0.0, - "completion_length": 66.78906500339508, - "epoch": 0.2096, - "grad_norm": 18.29418182373047, - "kl": 34.1484375, - "learning_rate": 2.2194005843856633e-07, - "loss": 0.4393, - "num_tokens": 20316484.0, - "reward": 0.8505359329283237, - "reward_std": 0.2659579182509333, - "rewards/SMILES_validity_reward": 0.8916666619479656, + "completion_length": 52.562500953674316, + "epoch": 0.1048, + "grad_norm": 42.34991455078125, + "kl": 13.2562255859375, + "learning_rate": 1.9637872463712362e-07, + "loss": -0.0042, + "num_tokens": 8141469.0, + "reward": 0.8395539987832308, + "reward_std": 0.25605778163298965, + "rewards/SMILES_validity_reward": 0.8614583387970924, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9375000037252903, - "rewards/reasoning_steps_reward": 0.0017361111240461469, - "rewards/repetition_penalty_reward": -0.009237999664037488, - "rewards/smiles_len_reward": -0.05497855134308338, - "rewards/tag_count_reward": 0.013671875291038305, + "rewards/format_reward": 0.9166666604578495, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.0018571128166513517, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.12760416674427688, "step": 262 }, { "clip_ratio": 0.0, - "completion_length": 50.89583480358124, - "epoch": 0.2112, - "grad_norm": 26.120685577392578, - "kl": 45.78125, - "learning_rate": 2.1808958803485133e-07, - "loss": 0.2691, - "num_tokens": 20434332.0, - "reward": 0.7770359516143799, - "reward_std": 0.37100684829056263, - "rewards/SMILES_validity_reward": 0.7999999988824129, + "completion_length": 68.10416960716248, + "epoch": 0.1056, + "grad_norm": 15.944575309753418, + "kl": 7.031494140625, + "learning_rate": 1.9286551612082773e-07, + "loss": 0.0701, + "num_tokens": 8197159.0, + "reward": 0.7490371651947498, + "reward_std": 0.2526437883498147, + "rewards/SMILES_validity_reward": 0.7354166731238365, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9010416679084301, + "rewards/format_reward": 0.9062499962747097, "rewards/reasoning_steps_reward": 0.013888889225199819, - "rewards/repetition_penalty_reward": -0.0058435348473722115, - "rewards/smiles_len_reward": -0.0629474117886275, - "rewards/tag_count_reward": 0.022135416802484542, + "rewards/repetition_penalty_reward": -0.017789341538446024, + "rewards/smiles_len_reward": -0.026041667442768812, + "rewards/tag_count_reward": 0.15364583395421505, "step": 264 }, { "clip_ratio": 0.0, - "completion_length": 68.60416793823242, - "epoch": 0.2128, - "grad_norm": 50.12458419799805, - "kl": 44.2275390625, - "learning_rate": 2.1424679784636144e-07, - "loss": 0.3092, - "num_tokens": 20558980.0, - "reward": 0.7655416280031204, - "reward_std": 0.3876127991534304, - "rewards/SMILES_validity_reward": 0.7833333350718021, + "completion_length": 51.593751192092896, + "epoch": 0.1064, + "grad_norm": 15.867863655090332, + "kl": 2011.879150390625, + "learning_rate": 1.8936417287249446e-07, + "loss": 2.0318, + "num_tokens": 8251264.0, + "reward": 0.8784895241260529, + "reward_std": 0.19250585057307035, + "rewards/SMILES_validity_reward": 0.9166666716337204, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9010416716337204, - "rewards/reasoning_steps_reward": 0.014756944845430553, - "rewards/repetition_penalty_reward": -0.013187804717745166, - "rewards/smiles_len_reward": -0.061909247655421495, - "rewards/tag_count_reward": 0.029296875407453626, + "rewards/format_reward": 0.9270833320915699, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.006772592620109208, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.11458333465270698, "step": 266 }, { "clip_ratio": 0.0, - "completion_length": 72.49218928813934, - "epoch": 0.2144, - "grad_norm": 2464.454833984375, - "kl": 425.138671875, - "learning_rate": 2.104126127579193e-07, - "loss": 0.8155, - "num_tokens": 20685121.0, - "reward": 0.7608797401189804, - "reward_std": 0.3963581267744303, - "rewards/SMILES_validity_reward": 0.783333346247673, + "completion_length": 73.71875202655792, + "epoch": 0.1072, + "grad_norm": 17.923137664794922, + "kl": 5.484130859375, + "learning_rate": 1.8587542202524985e-07, + "loss": 0.1186, + "num_tokens": 8307493.0, + "reward": 0.8079846333712339, + "reward_std": 0.3059961917460896, + "rewards/SMILES_validity_reward": 0.8333333414047956, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9010416641831398, - "rewards/reasoning_steps_reward": 0.006076389050576836, - "rewards/repetition_penalty_reward": -0.011656314345600549, - "rewards/smiles_len_reward": -0.09486878104507923, - "rewards/tag_count_reward": 0.022786458488553762, + "rewards/format_reward": 0.8749999925494194, + "rewards/reasoning_steps_reward": 0.013888889225199819, + "rewards/repetition_penalty_reward": -0.012168978486442938, + "rewards/smiles_len_reward": -0.0416666679084301, + "rewards/tag_count_reward": 0.16145833441987634, "step": 268 }, { "clip_ratio": 0.0, - "completion_length": 45.796876311302185, - "epoch": 0.216, - "grad_norm": 90.27828216552734, - "kl": 113.61328125, - "learning_rate": 2.065879555832674e-07, - "loss": 0.3303, - "num_tokens": 20801011.0, - "reward": 0.7906807139515877, - "reward_std": 0.347005927702412, - "rewards/SMILES_validity_reward": 0.8291666619479656, + "completion_length": 44.98958456516266, + "epoch": 0.108, + "grad_norm": 13.717728614807129, + "kl": 193.90045166015625, + "learning_rate": 1.82399988097123e-07, + "loss": 0.172, + "num_tokens": 8360964.0, + "reward": 0.8983778692781925, + "reward_std": 0.15485689049819484, + "rewards/SMILES_validity_reward": 0.9333333373069763, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8880208358168602, - "rewards/reasoning_steps_reward": 0.0052083334885537624, - "rewards/repetition_penalty_reward": -0.004367547942820238, - "rewards/smiles_len_reward": -0.08830558031331748, - "rewards/tag_count_reward": 0.02604166674427688, + "rewards/format_reward": 0.9375000037252903, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.0026809167611645535, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.15625000023283064, "step": 270 }, { "clip_ratio": 0.0, - "completion_length": 85.78125309944153, - "epoch": 0.2176, - "grad_norm": 390.4526672363281, - "kl": 193.759765625, - "learning_rate": 2.0277374684296498e-07, - "loss": 0.512, - "num_tokens": 20932255.0, - "reward": 0.7888578772544861, - "reward_std": 0.3733808258548379, - "rewards/SMILES_validity_reward": 0.8223958350718021, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9114583395421505, - "rewards/reasoning_steps_reward": 0.012152778101153672, - "rewards/repetition_penalty_reward": -0.013028325862251222, - "rewards/smiles_len_reward": -0.1238281219266355, - "rewards/tag_count_reward": 0.02213541662786156, + "completion_length": 33.04166758060455, + "epoch": 0.1088, + "grad_norm": 17.957460403442383, + "kl": 26.424072265625, + "learning_rate": 1.7893859284058378e-07, + "loss": 0.0096, + "num_tokens": 8413288.0, + "reward": 0.9149191044270992, + "reward_std": 0.12943543260917068, + "rewards/SMILES_validity_reward": 0.9500000029802322, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9583333283662796, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.0008101852145045996, + "rewards/smiles_len_reward": 0.0, + "rewards/tag_count_reward": 0.12500000069849193, "step": 272 }, { "clip_ratio": 0.0, - "completion_length": 69.94010603427887, - "epoch": 0.2192, - "grad_norm": 23.130889892578125, - "kl": 126.322265625, - "learning_rate": 1.989709045428361e-07, - "loss": 0.3458, - "num_tokens": 21057416.0, - "reward": 0.7640821002423763, - "reward_std": 0.36372836004011333, - "rewards/SMILES_validity_reward": 0.7916666716337204, + "completion_length": 62.18750083446503, + "epoch": 0.1096, + "grad_norm": 25.32672119140625, + "kl": 17.261962890625, + "learning_rate": 1.7549195509265407e-07, + "loss": 0.0172, + "num_tokens": 8468410.0, + "reward": 0.8183866981416941, + "reward_std": 0.28250074468087405, + "rewards/SMILES_validity_reward": 0.8500000070780516, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8958333358168602, - "rewards/reasoning_steps_reward": 0.013020833488553762, - "rewards/repetition_penalty_reward": -0.009005672072817106, - "rewards/smiles_len_reward": -0.11449750722385943, - "rewards/tag_count_reward": 0.022135417035315186, + "rewards/format_reward": 0.8749999925494194, + "rewards/reasoning_steps_reward": 0.0069444444961845875, + "rewards/repetition_penalty_reward": -0.010057752471766435, + "rewards/smiles_len_reward": -0.03645833441987634, + "rewards/tag_count_reward": 0.1484375016298145, "step": 274 }, { "clip_ratio": 0.0, - "completion_length": 86.97916913032532, - "epoch": 0.2208, - "grad_norm": 133.2987823486328, - "kl": 152.8232421875, - "learning_rate": 1.9518034395302412e-07, - "loss": 0.5016, - "num_tokens": 21189120.0, - "reward": 0.7536205910146236, - "reward_std": 0.39099146984517574, - "rewards/SMILES_validity_reward": 0.7999999970197678, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8645833283662796, - "rewards/reasoning_steps_reward": 0.013020833721384406, - "rewards/repetition_penalty_reward": -0.01477565044478979, - "rewards/smiles_len_reward": -0.1902956496924162, - "rewards/tag_count_reward": 0.03450520854676142, + "completion_length": 51.82291805744171, + "epoch": 0.1104, + "grad_norm": 13.775248527526855, + "kl": 12.86962890625, + "learning_rate": 1.7206079062562536e-07, + "loss": -0.031, + "num_tokens": 8522537.0, + "reward": 0.8880144283175468, + "reward_std": 0.19194586825324222, + "rewards/SMILES_validity_reward": 0.9166666716337204, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9479166604578495, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.00266948455828242, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.13802083488553762, "step": 276 }, { "clip_ratio": 0.0, - "completion_length": 51.25260508060455, - "epoch": 0.2224, - "grad_norm": 67.20164489746094, - "kl": 61.5078125, - "learning_rate": 1.9140297738770385e-07, - "loss": 0.2463, - "num_tokens": 21307105.0, - "reward": 0.7786005400121212, - "reward_std": 0.3619860680773854, - "rewards/SMILES_validity_reward": 0.8124999925494194, + "completion_length": 50.718751668930054, + "epoch": 0.1112, + "grad_norm": 15.227043151855469, + "kl": 6.1226806640625, + "learning_rate": 1.6864581199841226e-07, + "loss": 0.0581, + "num_tokens": 8576558.0, + "reward": 0.8818176276981831, + "reward_std": 0.20186374394688755, + "rewards/SMILES_validity_reward": 0.9000000059604645, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9010416679084301, - "rewards/reasoning_steps_reward": 0.0052083334885537624, - "rewards/repetition_penalty_reward": -0.0060174524296598975, - "rewards/smiles_len_reward": -0.12334280030336231, - "rewards/tag_count_reward": 0.019531250116415322, + "rewards/format_reward": 0.9479166604578495, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.005262482751277275, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.19531250302679837, "step": 278 }, { "clip_ratio": 0.0, - "completion_length": 59.000001668930054, - "epoch": 0.224, - "grad_norm": 18.651592254638672, - "kl": 22.283203125, - "learning_rate": 1.8763971398550467e-07, - "loss": 0.2241, - "num_tokens": 21428065.0, - "reward": 0.8106764741241932, - "reward_std": 0.32357010687701404, - "rewards/SMILES_validity_reward": 0.8458333313465118, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9218749962747097, - "rewards/reasoning_steps_reward": 0.0034722223062999547, - "rewards/repetition_penalty_reward": -0.006691267819405766, - "rewards/smiles_len_reward": -0.09079861175268888, - "rewards/tag_count_reward": 0.014322916802484542, + "completion_length": 53.6250022649765, + "epoch": 0.112, + "grad_norm": 15.401267051696777, + "kl": 45.770263671875, + "learning_rate": 1.6524772840857388e-07, + "loss": 0.0846, + "num_tokens": 8630858.0, + "reward": 0.8197834901511669, + "reward_std": 0.32834628655109555, + "rewards/SMILES_validity_reward": 0.8166666757315397, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9479166604578495, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.003207936490071006, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.16145833395421505, "step": 280 }, { "clip_ratio": 0.0, - "completion_length": 73.3333351612091, - "epoch": 0.2256, - "grad_norm": 14.318089485168457, - "kl": 27.7333984375, - "learning_rate": 1.8389145949069951e-07, - "loss": 0.3572, - "num_tokens": 21554529.0, - "reward": 0.7339997342787683, - "reward_std": 0.3634449951350689, - "rewards/SMILES_validity_reward": 0.7398437573574483, + "completion_length": 34.51041769981384, + "epoch": 0.1128, + "grad_norm": 26.33031463623047, + "kl": 28.7257080078125, + "learning_rate": 1.6186724554503237e-07, + "loss": 0.0075, + "num_tokens": 8683323.0, + "reward": 0.9235701858997345, + "reward_std": 0.1269298658007756, + "rewards/SMILES_validity_reward": 0.9500000029802322, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8932291604578495, - "rewards/reasoning_steps_reward": 0.017361111706122756, - "rewards/repetition_penalty_reward": -0.011701545292453375, - "rewards/smiles_len_reward": -0.04183520987862721, - "rewards/tag_count_reward": 0.017578125232830644, + "rewards/format_reward": 0.9687499962747097, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.0028410269296728075, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.19791666651144624, "step": 282 }, { "clip_ratio": 0.0, - "completion_length": 47.28385519981384, - "epoch": 0.2272, - "grad_norm": 29.62630844116211, - "kl": 34.2265625, - "learning_rate": 1.8015911603520893e-07, - "loss": 0.2096, - "num_tokens": 21670990.0, - "reward": 0.7384370751678944, - "reward_std": 0.40297506004571915, - "rewards/SMILES_validity_reward": 0.7666666675359011, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8697916623204947, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.004413491224113386, - "rewards/smiles_len_reward": -0.12501907243859023, - "rewards/tag_count_reward": 0.029947916860692203, + "completion_length": 39.593750953674316, + "epoch": 0.1136, + "grad_norm": 20.33456802368164, + "kl": 40.810302734375, + "learning_rate": 1.5850506544152103e-07, + "loss": 0.0284, + "num_tokens": 8736276.0, + "reward": 0.8979808054864407, + "reward_std": 0.15943958633579314, + "rewards/SMILES_validity_reward": 0.9166666716337204, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9583333320915699, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.001964016948477365, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.1953125004656613, "step": 284 }, { "clip_ratio": 0.0, - "completion_length": 66.74218964576721, - "epoch": 0.2288, - "grad_norm": 26.962491989135742, - "kl": 73.96875, - "learning_rate": 1.764435819214762e-07, - "loss": 0.3738, - "num_tokens": 21794923.0, - "reward": 0.7910585440695286, - "reward_std": 0.34452163241803646, - "rewards/SMILES_validity_reward": 0.8291666731238365, + "completion_length": 41.322917342185974, + "epoch": 0.1144, + "grad_norm": 18.296335220336914, + "kl": 10.76171875, + "learning_rate": 1.5516188633079107e-07, + "loss": 0.0069, + "num_tokens": 8789395.0, + "reward": 0.8857920374721289, + "reward_std": 0.17018686461960897, + "rewards/SMILES_validity_reward": 0.9166666716337204, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8880208283662796, - "rewards/reasoning_steps_reward": 0.010416666977107525, - "rewards/repetition_penalty_reward": -0.006722177873598412, - "rewards/smiles_len_reward": -0.08803215471561998, - "rewards/tag_count_reward": 0.026692708721384406, + "rewards/format_reward": 0.9166666604578495, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.009268370922654867, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.21093750023283064, "step": 286 }, { "clip_ratio": 0.0, - "completion_length": 71.90625166893005, - "epoch": 0.2304, - "grad_norm": 3684.165283203125, - "kl": 391.78515625, - "learning_rate": 1.7274575140626315e-07, - "loss": 0.7691, - "num_tokens": 21920839.0, - "reward": 0.7605861239135265, - "reward_std": 0.38395687006413937, - "rewards/SMILES_validity_reward": 0.7916666604578495, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8854166716337204, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.010926677110546734, - "rewards/smiles_len_reward": -0.11563519097398967, - "rewards/tag_count_reward": 0.026692708139307797, + "completion_length": 59.20833480358124, + "epoch": 0.1152, + "grad_norm": 11.021564483642578, + "kl": 40.314453125, + "learning_rate": 1.5183840249960784e-07, + "loss": 0.0934, + "num_tokens": 8844231.0, + "reward": 0.8584657311439514, + "reward_std": 0.23017027985770255, + "rewards/SMILES_validity_reward": 0.8666666727513075, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9375, + "rewards/reasoning_steps_reward": 0.017361111473292112, + "rewards/repetition_penalty_reward": -0.007184128480730578, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.21093749930150807, "step": 288 }, { "clip_ratio": 0.0, - "completion_length": 66.61718928813934, - "epoch": 0.232, - "grad_norm": 30.921920776367188, - "kl": 51.79296875, - "learning_rate": 1.6906651448541976e-07, - "loss": 0.4191, - "num_tokens": 22044724.0, - "reward": 0.7936124540865421, - "reward_std": 0.3641815240844153, - "rewards/SMILES_validity_reward": 0.8195312321186066, + "completion_length": 52.385417222976685, + "epoch": 0.116, + "grad_norm": 64.6243896484375, + "kl": 17.584716796875, + "learning_rate": 1.4853530414456612e-07, + "loss": 0.0556, + "num_tokens": 8898412.0, + "reward": 0.8611728437244892, + "reward_std": 0.2143052279134281, + "rewards/SMILES_validity_reward": 0.8833333365619183, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9192708283662796, - "rewards/reasoning_steps_reward": 0.004340277868323028, - "rewards/repetition_penalty_reward": -0.009806223220948596, - "rewards/smiles_len_reward": -0.07182217901572585, - "rewards/tag_count_reward": 0.018880208779592067, + "rewards/format_reward": 0.9166666641831398, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.01952289731707424, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.21354166534729302, "step": 290 }, { "clip_ratio": 0.0, - "completion_length": 51.83854305744171, - "epoch": 0.2336, - "grad_norm": 68.02233123779297, - "kl": 50.3828125, - "learning_rate": 1.6540675667967973e-07, - "loss": 0.2395, - "num_tokens": 22162934.0, - "reward": 0.8059075474739075, - "reward_std": 0.34976634243503213, - "rewards/SMILES_validity_reward": 0.8416666649281979, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9088541641831398, - "rewards/reasoning_steps_reward": 0.0026041667442768812, - "rewards/repetition_penalty_reward": -0.0050892977487819735, - "rewards/smiles_len_reward": -0.0794562753289938, - "rewards/tag_count_reward": 0.02278645889600739, + "completion_length": 56.11458492279053, + "epoch": 0.1168, + "grad_norm": 34.26335906982422, + "kl": 36.561767578125, + "learning_rate": 1.4525327722875568e-07, + "loss": 0.0401, + "num_tokens": 8952951.0, + "reward": 0.8726859986782074, + "reward_std": 0.2261066745268181, + "rewards/SMILES_validity_reward": 0.9000000059604645, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9166666641831398, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.008037056235480122, + "rewards/smiles_len_reward": -0.026041667442768812, + "rewards/tag_count_reward": 0.2109374995343387, "step": 292 }, { "clip_ratio": 0.0, - "completion_length": 47.04166758060455, - "epoch": 0.2352, - "grad_norm": 17.918827056884766, - "kl": 37.42578125, - "learning_rate": 1.617673588215328e-07, - "loss": 0.2869, - "num_tokens": 22279302.0, - "reward": 0.7713445201516151, - "reward_std": 0.3772635292261839, - "rewards/SMILES_validity_reward": 0.8083333261311054, + "completion_length": 97.10416972637177, + "epoch": 0.1176, + "grad_norm": 45.39623260498047, + "kl": 15.76025390625, + "learning_rate": 1.4199300333930515e-07, + "loss": 0.0902, + "num_tokens": 9011425.0, + "reward": 0.7532959198579192, + "reward_std": 0.36259316198993474, + "rewards/SMILES_validity_reward": 0.7666666712611914, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8932291641831398, - "rewards/reasoning_steps_reward": 0.00954861135687679, - "rewards/repetition_penalty_reward": -0.006463400637585437, - "rewards/smiles_len_reward": -0.15175057773012668, - "rewards/tag_count_reward": 0.02408854174427688, + "rewards/format_reward": 0.8229166604578495, + "rewards/reasoning_steps_reward": 0.017361111473292112, + "rewards/repetition_penalty_reward": -0.004715499031590298, + "rewards/smiles_len_reward": -0.046875000931322575, + "rewards/tag_count_reward": 0.23177083185873926, "step": 294 }, { "clip_ratio": 0.0, - "completion_length": 52.083335638046265, - "epoch": 0.2368, - "grad_norm": 64.83023834228516, - "kl": 49.0859375, - "learning_rate": 1.5814919684322542e-07, - "loss": 0.2833, - "num_tokens": 22397606.0, - "reward": 0.7741575352847576, - "reward_std": 0.4162686008712626, - "rewards/SMILES_validity_reward": 0.7833333313465118, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.90625, - "rewards/reasoning_steps_reward": 0.006076388992369175, - "rewards/repetition_penalty_reward": -0.0056419622706016526, - "rewards/smiles_len_reward": 0.018222944694571197, - "rewards/tag_count_reward": 0.020833333546761423, + "completion_length": 48.62500178813934, + "epoch": 0.1184, + "grad_norm": 12.377687454223633, + "kl": 6.2955322265625, + "learning_rate": 1.3875515954583523e-07, + "loss": 0.0367, + "num_tokens": 9065245.0, + "reward": 0.8059816248714924, + "reward_std": 0.1653146606986411, + "rewards/SMILES_validity_reward": 0.7854166682809591, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9479166641831398, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.0032058514771051705, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.2369791674427688, "step": 296 }, { "clip_ratio": 0.0, - "completion_length": 58.859376311302185, - "epoch": 0.2384, - "grad_norm": 173.7949981689453, - "kl": 106.66796875, - "learning_rate": 1.5455314156594123e-07, - "loss": 0.346, - "num_tokens": 22518512.0, - "reward": 0.8143112808465958, - "reward_std": 0.3242040954064578, - "rewards/SMILES_validity_reward": 0.8374999910593033, + "completion_length": 46.04166805744171, + "epoch": 0.1192, + "grad_norm": 12.048561096191406, + "kl": 7.322998046875, + "learning_rate": 1.3554041825985e-07, + "loss": -0.0214, + "num_tokens": 9118817.0, + "reward": 0.8983332067728043, + "reward_std": 0.13631578732747585, + "rewards/SMILES_validity_reward": 0.9166666679084301, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9244791716337204, - "rewards/reasoning_steps_reward": 0.0026041667442768812, - "rewards/repetition_penalty_reward": -0.006993362576395157, - "rewards/smiles_len_reward": -0.009921115823090076, - "rewards/tag_count_reward": 0.021484375116415322, + "rewards/format_reward": 0.9583333320915699, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.0036484165393630974, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.21093750093132257, "step": 298 }, { "clip_ratio": 0.0, - "completion_length": 61.15364754199982, - "epoch": 0.24, - "grad_norm": 400.5851135253906, - "kl": 67.044921875, - "learning_rate": 1.5098005849021078e-07, - "loss": 0.3079, - "num_tokens": 22640299.0, - "reward": 0.7607803493738174, - "reward_std": 0.36718062963336706, - "rewards/SMILES_validity_reward": 0.7874999903142452, + "completion_length": 62.3750022649765, + "epoch": 0.12, + "grad_norm": 23.098913192749023, + "kl": 142.0855712890625, + "learning_rate": 1.323494470950949e-07, + "loss": 0.2089, + "num_tokens": 9173957.0, + "reward": 0.8859845735132694, + "reward_std": 0.20868572185281664, + "rewards/SMILES_validity_reward": 0.9166666716337204, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8828124962747097, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.005829951467603678, - "rewards/smiles_len_reward": -0.06878954637795687, - "rewards/tag_count_reward": 0.013671875174622983, + "rewards/format_reward": 0.9166666604578495, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.002134755515726283, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.2057291674427688, "step": 300 }, { "clip_ratio": 0.0, - "completion_length": 59.927085638046265, - "epoch": 0.2416, - "grad_norm": 44.80502700805664, - "kl": 45.6953125, - "learning_rate": 1.47430807587603e-07, - "loss": 0.3873, - "num_tokens": 22761615.0, - "reward": 0.8036354631185532, - "reward_std": 0.3190372730605304, - "rewards/SMILES_validity_reward": 0.8333333283662796, + "completion_length": 56.55208492279053, + "epoch": 0.1208, + "grad_norm": 19.67671012878418, + "kl": 7.3123779296875, + "learning_rate": 1.2918290872891236e-07, + "loss": 0.0589, + "num_tokens": 9228538.0, + "reward": 0.8848470933735371, + "reward_std": 0.213736486970447, + "rewards/SMILES_validity_reward": 0.9000000059604645, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9114583358168602, - "rewards/reasoning_steps_reward": 0.006076389050576836, - "rewards/repetition_penalty_reward": -0.007518649843405001, - "rewards/smiles_len_reward": -0.04488650645362213, - "rewards/tag_count_reward": 0.014973958430346102, + "rewards/format_reward": 0.9479166604578495, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.003613690787460655, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.21354166697710752, "step": 302 }, { "clip_ratio": 0.0, - "completion_length": 49.257814168930054, - "epoch": 0.2432, - "grad_norm": 61.11827087402344, - "kl": 51.85546875, - "learning_rate": 1.4390624309374617e-07, - "loss": 0.2636, - "num_tokens": 22878834.0, - "reward": 0.7895913496613503, - "reward_std": 0.3571253365371376, - "rewards/SMILES_validity_reward": 0.816666666418314, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8958333358168602, - "rewards/reasoning_steps_reward": 0.012152778101153672, - "rewards/repetition_penalty_reward": -0.006857625812699553, - "rewards/smiles_len_reward": -0.038289141783025116, - "rewards/tag_count_reward": 0.024739583081100136, + "completion_length": 70.11458575725555, + "epoch": 0.1216, + "grad_norm": 6248.3740234375, + "kl": 1183.59375, + "learning_rate": 1.260414607646213e-07, + "loss": 1.1913, + "num_tokens": 9284421.0, + "reward": 0.9230890087783337, + "reward_std": 0.13145335903391242, + "rewards/SMILES_validity_reward": 0.9500000029802322, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9374999962747097, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.005048613820690662, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.28385416651144624, "step": 304 }, { "clip_ratio": 0.0, - "completion_length": 44.632813930511475, - "epoch": 0.2448, - "grad_norm": 24.276023864746094, - "kl": 101.8251953125, - "learning_rate": 1.404072133027306e-07, - "loss": 0.3328, - "num_tokens": 22994277.0, - "reward": 0.7833369635045528, - "reward_std": 0.3747501680627465, - "rewards/SMILES_validity_reward": 0.7999999858438969, + "completion_length": 78.30208432674408, + "epoch": 0.1224, + "grad_norm": 22.36941146850586, + "kl": 9.85986328125, + "learning_rate": 1.2292575559495143e-07, + "loss": 0.0109, + "num_tokens": 9341090.0, + "reward": 0.9161470979452133, + "reward_std": 0.16511259728576988, + "rewards/SMILES_validity_reward": 0.9333333373069763, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9192708358168602, - "rewards/reasoning_steps_reward": 0.00954861135687679, - "rewards/repetition_penalty_reward": -0.005959605041425675, - "rewards/smiles_len_reward": -0.05147076665889472, - "rewards/tag_count_reward": 0.0234375, + "rewards/format_reward": 0.9479166641831398, + "rewards/reasoning_steps_reward": 0.0034722222480922937, + "rewards/repetition_penalty_reward": -0.005544163461308926, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.3020833330228925, "step": 306 }, { "clip_ratio": 0.0, - "completion_length": 66.4192727804184, - "epoch": 0.2464, - "grad_norm": 19.5562686920166, - "kl": 92.298828125, - "learning_rate": 1.369345603629406e-07, - "loss": 0.4215, - "num_tokens": 23118086.0, - "reward": 0.807420376688242, - "reward_std": 0.3090350958518684, - "rewards/SMILES_validity_reward": 0.8374999985098839, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9218749962747097, - "rewards/reasoning_steps_reward": 0.010416666977107525, - "rewards/repetition_penalty_reward": -0.01095377303136047, - "rewards/smiles_len_reward": -0.07096354046370834, - "rewards/tag_count_reward": 0.017578125349245965, + "completion_length": 59.104168176651, + "epoch": 0.1232, + "grad_norm": 33.61720275878906, + "kl": 31.44189453125, + "learning_rate": 1.1983644026655835e-07, + "loss": 0.0418, + "num_tokens": 9395916.0, + "reward": 0.8362351339310408, + "reward_std": 0.2932694805203937, + "rewards/SMILES_validity_reward": 0.8333333432674408, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9374999962747097, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.002233220911875833, + "rewards/smiles_len_reward": -0.026041667442768812, + "rewards/tag_count_reward": 0.24479166604578495, "step": 308 }, { "clip_ratio": 0.0, - "completion_length": 57.330730676651, - "epoch": 0.248, - "grad_norm": 23.46638298034668, - "kl": 61.33984375, - "learning_rate": 1.3348912007436536e-07, - "loss": 0.3241, - "num_tokens": 23238405.0, - "reward": 0.7636997401714325, - "reward_std": 0.40307603124529123, - "rewards/SMILES_validity_reward": 0.7791666612029076, + "completion_length": 44.29166758060455, + "epoch": 0.124, + "grad_norm": 15.68494987487793, + "kl": 8.397705078125, + "learning_rate": 1.1677415634565066e-07, + "loss": 0.0084, + "num_tokens": 9449320.0, + "reward": 0.8074449226260185, + "reward_std": 0.175186739070341, + "rewards/SMILES_validity_reward": 0.7854166682809591, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8958333320915699, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.010418418491099146, - "rewards/smiles_len_reward": -0.020944936492014676, - "rewards/tag_count_reward": 0.018880208197515458, + "rewards/format_reward": 0.9374999962747097, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.001593647408299148, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.2812500037252903, "step": 310 }, { "clip_ratio": 0.0, - "completion_length": 58.80208468437195, - "epoch": 0.2496, - "grad_norm": 811.1200561523438, - "kl": 195.423828125, - "learning_rate": 1.3007172168743852e-07, - "loss": 0.4265, - "num_tokens": 23359289.0, - "reward": 0.7152650374919176, - "reward_std": 0.4054424315690994, - "rewards/SMILES_validity_reward": 0.6997395902872086, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9270833283662796, - "rewards/reasoning_steps_reward": 0.009548611473292112, - "rewards/repetition_penalty_reward": -0.012816645510611124, - "rewards/smiles_len_reward": -0.04499440788640641, - "rewards/tag_count_reward": 0.021484375116415322, + "completion_length": 54.87500178813934, + "epoch": 0.1248, + "grad_norm": 50.4384765625, + "kl": 22.9573974609375, + "learning_rate": 1.1373953978475353e-07, + "loss": 0.0812, + "num_tokens": 9503740.0, + "reward": 0.9299623742699623, + "reward_std": 0.10308579099364579, + "rewards/SMILES_validity_reward": 0.9500000029802322, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9479166679084301, + "rewards/reasoning_steps_reward": 0.017361111473292112, + "rewards/repetition_penalty_reward": -0.00315533101093024, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.3020833311602473, "step": 312 }, { "clip_ratio": 0.0, - "completion_length": 59.476564049720764, - "epoch": 0.2512, - "grad_norm": 25.647239685058594, - "kl": 42.7890625, - "learning_rate": 1.2668318770345368e-07, - "loss": 0.3482, - "num_tokens": 23480432.0, - "reward": 0.7882367707788944, - "reward_std": 0.3668895438313484, - "rewards/SMILES_validity_reward": 0.820833332836628, + "completion_length": 77.62500095367432, + "epoch": 0.1256, + "grad_norm": 14.502692222595215, + "kl": 13.63623046875, + "learning_rate": 1.1073322079063913e-07, + "loss": 0.0621, + "num_tokens": 9560344.0, + "reward": 0.8834315277636051, + "reward_std": 0.24336056731408462, + "rewards/SMILES_validity_reward": 0.9000000059604645, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8984374925494194, - "rewards/reasoning_steps_reward": 0.0052083334885537624, - "rewards/repetition_penalty_reward": -0.0075505238128243946, - "rewards/smiles_len_reward": -0.08182770665735006, - "rewards/tag_count_reward": 0.025390625116415322, + "rewards/format_reward": 0.9166666604578495, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.009956820620573126, + "rewards/smiles_len_reward": -0.026041667442768812, + "rewards/tag_count_reward": 0.3098958358168602, "step": 314 }, { "clip_ratio": 0.0, - "completion_length": 53.632813930511475, - "epoch": 0.2528, - "grad_norm": 57.68588638305664, - "kl": 97.1669921875, - "learning_rate": 1.233243336766044e-07, - "loss": 0.4193, - "num_tokens": 23599331.0, - "reward": 0.7653166949748993, - "reward_std": 0.3775314458180219, - "rewards/SMILES_validity_reward": 0.7874999977648258, + "completion_length": 64.33333539962769, + "epoch": 0.1264, + "grad_norm": 12.168563842773438, + "kl": 8.08837890625, + "learning_rate": 1.0775582369344946e-07, + "loss": 0.0469, + "num_tokens": 9615672.0, + "reward": 0.8775706607848406, + "reward_std": 0.17083232302684337, + "rewards/SMILES_validity_reward": 0.900000000372529, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9036458395421505, - "rewards/reasoning_steps_reward": 0.008680555736646056, - "rewards/repetition_penalty_reward": -0.008220537267334294, - "rewards/smiles_len_reward": -0.08570567087735981, - "rewards/tag_count_reward": 0.014973958488553762, + "rewards/format_reward": 0.9166666604578495, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.011273908166913316, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.25260416651144624, "step": 316 }, { "clip_ratio": 0.0, - "completion_length": 67.55208587646484, - "epoch": 0.2544, - "grad_norm": 63.47262191772461, - "kl": 39.439453125, - "learning_rate": 1.1999596801769616e-07, - "loss": 0.2992, - "num_tokens": 23723575.0, - "reward": 0.7604156136512756, - "reward_std": 0.3610593224875629, - "rewards/SMILES_validity_reward": 0.7791666686534882, + "completion_length": 48.479168176651, + "epoch": 0.1272, + "grad_norm": 51.8846321105957, + "kl": 17.612548828125, + "learning_rate": 1.0480796681704077e-07, + "loss": 0.0651, + "num_tokens": 9669478.0, + "reward": 0.896970622241497, + "reward_std": 0.15544916148064658, + "rewards/SMILES_validity_reward": 0.9166666697710752, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9244791641831398, - "rewards/reasoning_steps_reward": 0.0026041666860692203, - "rewards/repetition_penalty_reward": -0.00864661141531542, - "rewards/smiles_len_reward": -0.13107849314110354, - "rewards/tag_count_reward": 0.013671875174622983, + "rewards/format_reward": 0.9270833283662796, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.012065992181305774, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.2994791651144624, "step": 318 }, { "clip_ratio": 0.0, - "completion_length": 57.58594036102295, - "epoch": 0.256, - "grad_norm": 26.219541549682617, - "kl": 45.1484375, - "learning_rate": 1.1669889179957723e-07, - "loss": 0.1946, - "num_tokens": 23843992.0, - "reward": 0.7795398309826851, - "reward_std": 0.37162910774350166, - "rewards/SMILES_validity_reward": 0.7999999895691872, + "completion_length": 40.91666841506958, + "epoch": 0.128, + "grad_norm": 10.52868938446045, + "kl": 17.12109375, + "learning_rate": 1.018902623505741e-07, + "loss": -0.0259, + "num_tokens": 9722558.0, + "reward": 0.8453612364828587, + "reward_std": 0.27566418022615835, + "rewards/SMILES_validity_reward": 0.8500000070780516, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9036458320915699, - "rewards/reasoning_steps_reward": 0.0034722222480922937, - "rewards/repetition_penalty_reward": -0.008394229185796576, - "rewards/smiles_len_reward": -0.037310975953005254, - "rewards/tag_count_reward": 0.026692708139307797, + "rewards/format_reward": 0.9270833283662796, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.001597268710611388, + "rewards/smiles_len_reward": -0.031250000931322575, + "rewards/tag_count_reward": 0.2552083320915699, "step": 320 }, { "clip_ratio": 0.0, - "completion_length": 61.520835518836975, - "epoch": 0.2576, - "grad_norm": 419.2340393066406, - "kl": 158.7578125, - "learning_rate": 1.1343389856433658e-07, - "loss": 0.4102, - "num_tokens": 23965920.0, - "reward": 0.7558124400675297, - "reward_std": 0.42608391866087914, - "rewards/SMILES_validity_reward": 0.7791666612029076, + "completion_length": 46.979167222976685, + "epoch": 0.1288, + "grad_norm": 13.913373947143555, + "kl": 7.58154296875, + "learning_rate": 9.900331622138063e-08, + "loss": -0.0041, + "num_tokens": 9776220.0, + "reward": 0.9387280717492104, + "reward_std": 0.11939947289647534, + "rewards/SMILES_validity_reward": 0.9500000029802322, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8984375037252903, - "rewards/reasoning_steps_reward": 0.0034722223062999547, - "rewards/repetition_penalty_reward": -0.007730142860964406, - "rewards/smiles_len_reward": -0.1092335598077625, - "rewards/tag_count_reward": 0.022135416511446238, + "rewards/format_reward": 0.9791666641831398, + "rewards/reasoning_steps_reward": 0.0069444444961845875, + "rewards/repetition_penalty_reward": -0.004039903054945171, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.3124999958090484, "step": 322 }, { "clip_ratio": 0.0, - "completion_length": 50.750001668930054, - "epoch": 0.2592, - "grad_norm": 55.14189910888672, - "kl": 44.15234375, - "learning_rate": 1.1020177413231332e-07, - "loss": 0.2614, - "num_tokens": 24083712.0, - "reward": 0.7523438110947609, - "reward_std": 0.40479825623333454, - "rewards/SMILES_validity_reward": 0.7874999903142452, + "completion_length": 46.88541793823242, + "epoch": 0.1296, + "grad_norm": 13.716207504272461, + "kl": 13.66259765625, + "learning_rate": 9.614772796912681e-08, + "loss": 0.0425, + "num_tokens": 9829873.0, + "reward": 0.8513182587921619, + "reward_std": 0.2821584241464734, + "rewards/SMILES_validity_reward": 0.8666666727513075, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8723958358168602, - "rewards/reasoning_steps_reward": 0.006076389050576836, - "rewards/repetition_penalty_reward": -0.00841052423857036, - "rewards/smiles_len_reward": -0.1234477300895378, - "rewards/tag_count_reward": 0.019531250232830644, + "rewards/format_reward": 0.9062499888241291, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.0040062003245111555, + "rewards/smiles_len_reward": -0.026041667442768812, + "rewards/tag_count_reward": 0.2578125037252903, "step": 324 }, { "clip_ratio": 0.0, - "completion_length": 54.93489706516266, - "epoch": 0.2608, - "grad_norm": 71.32405090332031, - "kl": 60.220703125, - "learning_rate": 1.070032964129654e-07, - "loss": 0.291, - "num_tokens": 24203111.0, - "reward": 0.7721843849867582, - "reward_std": 0.33104856190038845, - "rewards/SMILES_validity_reward": 0.8041666690260172, + "completion_length": 43.86458432674408, + "epoch": 0.1304, + "grad_norm": 13.790403366088867, + "kl": 10.61859130859375, + "learning_rate": 9.332409062130686e-08, + "loss": 0.0133, + "num_tokens": 9883236.0, + "reward": 0.8828205205500126, + "reward_std": 0.21784398559248075, + "rewards/SMILES_validity_reward": 0.9000000040978193, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8958333246409893, - "rewards/reasoning_steps_reward": 0.008680555794853717, - "rewards/repetition_penalty_reward": -0.011790495191235095, - "rewards/smiles_len_reward": -0.10733911380521022, - "rewards/tag_count_reward": 0.015625000291038305, + "rewards/format_reward": 0.9166666641831398, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.003046113546588458, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.2864583348855376, "step": 326 }, { "clip_ratio": 0.0, - "completion_length": 44.559897661209106, - "epoch": 0.2624, - "grad_norm": 17.008113861083984, - "kl": 19.951171875, - "learning_rate": 1.0383923521764174e-07, - "loss": 0.2228, - "num_tokens": 24318526.0, - "reward": 0.7702805139124393, - "reward_std": 0.38822865672409534, - "rewards/SMILES_validity_reward": 0.7999999895691872, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9036458320915699, - "rewards/reasoning_steps_reward": 0.0052083334885537624, - "rewards/repetition_penalty_reward": -0.004773934269906022, - "rewards/smiles_len_reward": -0.12223973160143942, - "rewards/tag_count_reward": 0.013671875291038305, + "completion_length": 54.968750953674316, + "epoch": 0.1312, + "grad_norm": 20.811195373535156, + "kl": 10.6666259765625, + "learning_rate": 9.053299057008699e-08, + "loss": 0.067, + "num_tokens": 9937665.0, + "reward": 0.8830222934484482, + "reward_std": 0.2312118139816448, + "rewards/SMILES_validity_reward": 0.8833333384245634, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9479166604578495, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.006757562659913674, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.3203125020954758, "step": 328 }, { "clip_ratio": 0.0, - "completion_length": 77.84635615348816, - "epoch": 0.264, - "grad_norm": 36.427547454833984, - "kl": 43.140625, - "learning_rate": 1.007103520743035e-07, - "loss": 0.354, - "num_tokens": 24446723.0, - "reward": 0.7763010747730732, - "reward_std": 0.3733202526345849, - "rewards/SMILES_validity_reward": 0.7875000052154064, + "completion_length": 59.46875238418579, + "epoch": 0.132, + "grad_norm": 11.719626426696777, + "kl": 10.68682861328125, + "learning_rate": 8.777500745052743e-08, + "loss": 0.0699, + "num_tokens": 9992526.0, + "reward": 0.8845588695257902, + "reward_std": 0.21500852517783642, + "rewards/SMILES_validity_reward": 0.9000000059604645, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9296874925494194, - "rewards/reasoning_steps_reward": 0.0026041667442768812, - "rewards/repetition_penalty_reward": -0.006564084775163792, - "rewards/smiles_len_reward": -0.053473433246836066, - "rewards/tag_count_reward": 0.018880208779592067, + "rewards/format_reward": 0.9270833283662796, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.006495966023067012, + "rewards/smiles_len_reward": -0.026041667442768812, + "rewards/tag_count_reward": 0.296875, "step": 330 }, { "clip_ratio": 0.0, - "completion_length": 63.32552206516266, - "epoch": 0.2656, - "grad_norm": 13.066360473632812, - "kl": 40.861328125, - "learning_rate": 9.761740004423926e-08, - "loss": 0.2626, - "num_tokens": 24569344.0, - "reward": 0.7705531045794487, - "reward_std": 0.3602237828890793, - "rewards/SMILES_validity_reward": 0.808333333581686, + "completion_length": 67.85416865348816, + "epoch": 0.1328, + "grad_norm": 18.956579208374023, + "kl": 11.107421875, + "learning_rate": 8.505071402020892e-08, + "loss": 0.0666, + "num_tokens": 10048192.0, + "reward": 0.8577948585152626, + "reward_std": 0.25394871982280165, + "rewards/SMILES_validity_reward": 0.8666666746139526, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.890625, - "rewards/reasoning_steps_reward": 0.013020833721384406, - "rewards/repetition_penalty_reward": -0.006140822355519049, - "rewards/smiles_len_reward": -0.15825135062914342, - "rewards/tag_count_reward": 0.026692708604969084, + "rewards/format_reward": 0.9166666567325592, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.017365171763231046, + "rewards/smiles_len_reward": -0.026041667442768812, + "rewards/tag_count_reward": 0.30468749813735485, "step": 332 }, { "clip_ratio": 0.0, - "completion_length": 47.304688930511475, - "epoch": 0.2672, - "grad_norm": 7936.54248046875, - "kl": 849.38671875, - "learning_rate": 9.45611235408178e-08, - "loss": 1.071, - "num_tokens": 24685813.0, - "reward": 0.8130536079406738, - "reward_std": 0.34073650278151035, - "rewards/SMILES_validity_reward": 0.8374999985098839, + "completion_length": 55.46875178813934, + "epoch": 0.1336, + "grad_norm": 19.334678649902344, + "kl": 17.1290283203125, + "learning_rate": 8.236067604028562e-08, + "loss": -0.0002, + "num_tokens": 10102669.0, + "reward": 0.9023284614086151, + "reward_std": 0.16440001456066966, + "rewards/SMILES_validity_reward": 0.9166666716337204, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9166666604578495, + "rewards/format_reward": 0.9375, "rewards/reasoning_steps_reward": 0.010416666977107525, - "rewards/repetition_penalty_reward": -0.006224363439287117, - "rewards/smiles_len_reward": -0.01024607045110315, - "rewards/tag_count_reward": 0.024088542035315186, + "rewards/repetition_penalty_reward": -0.005362560274079442, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.3098958330228925, "step": 334 }, { "clip_ratio": 0.0, - "completion_length": 56.476564049720764, - "epoch": 0.2688, - "grad_norm": 18.579252243041992, - "kl": 74.7998046875, - "learning_rate": 9.15422581503224e-08, - "loss": 0.4535, - "num_tokens": 24805804.0, - "reward": 0.7905747666954994, - "reward_std": 0.34068747609853745, - "rewards/SMILES_validity_reward": 0.8333333171904087, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8880208320915699, - "rewards/reasoning_steps_reward": 0.013020833721384406, - "rewards/repetition_penalty_reward": -0.00672910911089275, - "rewards/smiles_len_reward": -0.11942530050873756, - "rewards/tag_count_reward": 0.02148437505820766, + "completion_length": 40.61458444595337, + "epoch": 0.1344, + "grad_norm": 22.786741256713867, + "kl": 20.921875, + "learning_rate": 7.970545215799327e-08, + "loss": -0.0046, + "num_tokens": 10155720.0, + "reward": 0.8433664571493864, + "reward_std": 0.28101494431030005, + "rewards/SMILES_validity_reward": 0.850000006146729, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9166666567325592, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.003316073620226234, + "rewards/smiles_len_reward": -0.03645833441987634, + "rewards/tag_count_reward": 0.27343749767169356, "step": 336 }, { "clip_ratio": 0.0, - "completion_length": 44.60677218437195, - "epoch": 0.2704, - "grad_norm": 20.473526000976562, - "kl": 47.2734375, - "learning_rate": 8.856153045490947e-08, - "loss": 0.3034, - "num_tokens": 24921237.0, - "reward": 0.7792329639196396, - "reward_std": 0.3795014023198746, - "rewards/SMILES_validity_reward": 0.7958333231508732, + "completion_length": 62.38541805744171, + "epoch": 0.1352, + "grad_norm": 19.285070419311523, + "kl": 15.4423828125, + "learning_rate": 7.708559379063204e-08, + "loss": -0.0467, + "num_tokens": 10210861.0, + "reward": 0.8785608969628811, + "reward_std": 0.20795345888473094, + "rewards/SMILES_validity_reward": 0.8833333384245634, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.90625, - "rewards/reasoning_steps_reward": 0.0052083334885537624, - "rewards/repetition_penalty_reward": -0.005042356933699921, - "rewards/smiles_len_reward": -0.018905383301898837, - "rewards/tag_count_reward": 0.021484375407453626, + "rewards/format_reward": 0.9479166641831398, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.004496572495554574, + "rewards/smiles_len_reward": -0.026041667442768812, + "rewards/tag_count_reward": 0.2786458367481828, "step": 338 }, { "clip_ratio": 0.0, - "completion_length": 76.48437809944153, - "epoch": 0.272, - "grad_norm": 147.6204071044922, - "kl": 92.5546875, - "learning_rate": 8.561965785773412e-08, - "loss": 0.3352, - "num_tokens": 25048911.0, - "reward": 0.7174020754173398, - "reward_std": 0.3969442341476679, - "rewards/SMILES_validity_reward": 0.7190104154869914, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8828125037252903, - "rewards/reasoning_steps_reward": 0.01388888928340748, - "rewards/repetition_penalty_reward": -0.007572865069960244, - "rewards/smiles_len_reward": -0.03659327526111156, - "rewards/tag_count_reward": 0.022786458488553762, + "completion_length": 64.51041853427887, + "epoch": 0.136, + "grad_norm": 12.174135208129883, + "kl": 4.55029296875, + "learning_rate": 7.45016450110534e-08, + "loss": 0.0704, + "num_tokens": 10266206.0, + "reward": 0.9585046321153641, + "reward_std": 0.08200875978218392, + "rewards/SMILES_validity_reward": 0.9833333343267441, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9479166604578495, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.009225877125572879, + "rewards/smiles_len_reward": 0.0, + "rewards/tag_count_reward": 0.3671874990686774, "step": 340 }, { "clip_ratio": 0.0, - "completion_length": 45.588543176651, - "epoch": 0.2736, - "grad_norm": 469.4862060546875, - "kl": 140.984375, - "learning_rate": 8.271734841028552e-08, - "loss": 0.3789, - "num_tokens": 25164721.0, - "reward": 0.7690527178347111, - "reward_std": 0.39881330635398626, - "rewards/SMILES_validity_reward": 0.8041666597127914, + "completion_length": 46.33333504199982, + "epoch": 0.1368, + "grad_norm": 14.097783088684082, + "kl": 17.8291015625, + "learning_rate": 7.195414243467029e-08, + "loss": -0.0138, + "num_tokens": 10319806.0, + "reward": 0.8139520492404699, + "reward_std": 0.19127862562891096, + "rewards/SMILES_validity_reward": 0.7854166701436043, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.890625, - "rewards/reasoning_steps_reward": 0.0026041667442768812, - "rewards/repetition_penalty_reward": -0.005571819157921709, - "rewards/smiles_len_reward": -0.12382401619106531, - "rewards/tag_count_reward": 0.016276041918899864, + "rewards/format_reward": 0.9583333283662796, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.001626729812414851, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.2890624972060323, "step": 342 }, { "clip_ratio": 0.0, - "completion_length": 48.276043176651, - "epoch": 0.2752, - "grad_norm": 190.9840545654297, - "kl": 74.7578125, - "learning_rate": 7.985530064197241e-08, - "loss": 0.3261, - "num_tokens": 25281563.0, - "reward": 0.7844961099326611, - "reward_std": 0.3285376951098442, - "rewards/SMILES_validity_reward": 0.8249999992549419, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8906249925494194, - "rewards/reasoning_steps_reward": 0.014756944845430553, - "rewards/repetition_penalty_reward": -0.005333821703970898, - "rewards/smiles_len_reward": -0.13086939207278192, - "rewards/tag_count_reward": 0.019531250291038305, + "completion_length": 45.479167342185974, + "epoch": 0.1376, + "grad_norm": 20.031068801879883, + "kl": 6.23046875, + "learning_rate": 6.944361510801763e-08, + "loss": 0.0154, + "num_tokens": 10373324.0, + "reward": 0.9225666001439095, + "reward_std": 0.13941001996863633, + "rewards/SMILES_validity_reward": 0.9500000029802322, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9166666641831398, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.005064387529273517, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.3411458320915699, "step": 344 }, { "clip_ratio": 0.0, - "completion_length": 70.25000202655792, - "epoch": 0.2768, - "grad_norm": 23.501083374023438, - "kl": 51.0078125, - "learning_rate": 7.703420339200101e-08, - "loss": 0.3129, - "num_tokens": 25406843.0, - "reward": 0.7355504985898733, - "reward_std": 0.369172902777791, - "rewards/SMILES_validity_reward": 0.748177076689899, + "completion_length": 58.312501668930054, + "epoch": 0.1384, + "grad_norm": 11.070231437683105, + "kl": 9.16748046875, + "learning_rate": 6.697058439888283e-08, + "loss": 0.0035, + "num_tokens": 10428074.0, + "reward": 0.9040811881422997, + "reward_std": 0.20653777720872313, + "rewards/SMILES_validity_reward": 0.9166666716337204, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8984375037252903, - "rewards/reasoning_steps_reward": 0.0026041667442768812, - "rewards/repetition_penalty_reward": -0.01213008257218462, - "rewards/smiles_len_reward": -0.08640275153447874, - "rewards/tag_count_reward": 0.018880208546761423, + "rewards/format_reward": 0.9374999925494194, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.008668714319355786, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.3411458311602473, "step": 346 }, { "clip_ratio": 0.0, - "completion_length": 56.47135639190674, - "epoch": 0.2784, - "grad_norm": 26.730709075927734, - "kl": 42.05859375, - "learning_rate": 7.425473564358456e-08, - "loss": 0.2728, - "num_tokens": 25526832.0, - "reward": 0.8217011019587517, - "reward_std": 0.27569763401697855, - "rewards/SMILES_validity_reward": 0.8624999970197678, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9140624962747097, - "rewards/reasoning_steps_reward": 0.010416666977107525, - "rewards/repetition_penalty_reward": -0.006716858119034441, - "rewards/smiles_len_reward": -0.0911170897888951, - "rewards/tag_count_reward": 0.024739583663176745, + "completion_length": 38.60416758060455, + "epoch": 0.1392, + "grad_norm": 17.219825744628906, + "kl": 11.9794921875, + "learning_rate": 6.453556388803288e-08, + "loss": -0.0453, + "num_tokens": 10480932.0, + "reward": 0.9238246716558933, + "reward_std": 0.161299129773397, + "rewards/SMILES_validity_reward": 0.9333333373069763, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9583333283662796, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.0008169934735633433, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.3463541669771075, "step": 348 }, { "clip_ratio": 0.0, - "completion_length": 60.69791901111603, - "epoch": 0.28, - "grad_norm": 98.43875885009766, - "kl": 61.244140625, - "learning_rate": 7.151756636052527e-08, - "loss": 0.3109, - "num_tokens": 25648444.0, - "reward": 0.7451902097091079, - "reward_std": 0.35876176378224045, - "rewards/SMILES_validity_reward": 0.7536458186805248, + "completion_length": 39.60416758060455, + "epoch": 0.14, + "grad_norm": 41.384090423583984, + "kl": 24.66162109375, + "learning_rate": 6.213905926255697e-08, + "loss": 0.0626, + "num_tokens": 10533886.0, + "reward": 0.8788461312651634, + "reward_std": 0.22121598839294165, + "rewards/SMILES_validity_reward": 0.8833333402872086, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9140624925494194, - "rewards/reasoning_steps_reward": 0.008680555736646056, - "rewards/repetition_penalty_reward": -0.009796898761123884, - "rewards/smiles_len_reward": -0.09724315593484789, - "rewards/tag_count_reward": 0.0325520834303461, + "rewards/format_reward": 0.9270833320915699, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.0016438955790363252, + "rewards/smiles_len_reward": -0.031250000931322575, + "rewards/tag_count_reward": 0.3463541669771075, "step": 350 }, { "clip_ratio": 0.0, - "completion_length": 50.132814049720764, - "epoch": 0.2816, - "grad_norm": 18.02010726928711, - "kl": 46.1328125, - "learning_rate": 6.882335432620779e-08, - "loss": 0.1619, - "num_tokens": 25765999.0, - "reward": 0.7713147848844528, - "reward_std": 0.3866591900587082, - "rewards/SMILES_validity_reward": 0.8041666708886623, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8828125, - "rewards/reasoning_steps_reward": 0.0164930559694767, - "rewards/repetition_penalty_reward": -0.008989743004349293, - "rewards/smiles_len_reward": -0.09670030255801976, - "rewards/tag_count_reward": 0.024739583663176745, + "completion_length": 54.70833432674408, + "epoch": 0.1408, + "grad_norm": 2251.369873046875, + "kl": 260.109375, + "learning_rate": 5.978156821084987e-08, + "loss": 0.3363, + "num_tokens": 10588290.0, + "reward": 0.906851053237915, + "reward_std": 0.20935551589354873, + "rewards/SMILES_validity_reward": 0.9166666716337204, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9374999925494194, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.0018032030784524977, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.3619791641831398, "step": 352 }, { "clip_ratio": 0.0, - "completion_length": 52.015626311302185, - "epoch": 0.2832, - "grad_norm": 34.14971923828125, - "kl": 58.6484375, - "learning_rate": 6.617274798504286e-08, - "loss": 0.3631, - "num_tokens": 25884277.0, - "reward": 0.7878972478210926, - "reward_std": 0.3565414815675467, - "rewards/SMILES_validity_reward": 0.804166667163372, + "completion_length": 41.07291758060455, + "epoch": 0.1416, + "grad_norm": 86.15303039550781, + "kl": 25.509521484375, + "learning_rate": 5.7463580319254853e-08, + "loss": 0.0125, + "num_tokens": 10641385.0, + "reward": 0.9007619582116604, + "reward_std": 0.2126168095273897, + "rewards/SMILES_validity_reward": 0.9166666716337204, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9192708395421505, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.0039607606104254955, - "rewards/smiles_len_reward": -0.030088828410953283, - "rewards/tag_count_reward": 0.018229166918899864, + "rewards/format_reward": 0.9270833246409893, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.00540257606189698, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.3359375009313226, "step": 354 }, { "clip_ratio": 0.0, - "completion_length": 54.94010627269745, - "epoch": 0.2848, - "grad_norm": 20.642011642456055, - "kl": 198.462890625, - "learning_rate": 6.356638528639954e-08, - "loss": 0.4167, - "num_tokens": 26003678.0, - "reward": 0.7857323661446571, - "reward_std": 0.36366927227936685, - "rewards/SMILES_validity_reward": 0.8166666626930237, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9062499962747097, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.004005594724731054, - "rewards/smiles_len_reward": -0.10598958295304328, - "rewards/tag_count_reward": 0.02408854162786156, + "completion_length": 53.343751668930054, + "epoch": 0.1424, + "grad_norm": 22.986886978149414, + "kl": 9.517333984375, + "learning_rate": 5.518557697039081e-08, + "loss": -0.0054, + "num_tokens": 10695658.0, + "reward": 0.9346610344946384, + "reward_std": 0.14231333124917, + "rewards/SMILES_validity_reward": 0.9500000029802322, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9479166604578495, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.009120229282416403, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.3671875037252903, "step": 356 }, { "clip_ratio": 0.0, - "completion_length": 44.007814049720764, - "epoch": 0.2864, - "grad_norm": 66.33602142333984, - "kl": 51.1875, - "learning_rate": 6.100489353106303e-08, - "loss": 0.2763, - "num_tokens": 26118881.0, - "reward": 0.8222081623971462, - "reward_std": 0.27234845350903925, - "rewards/SMILES_validity_reward": 0.8583333306014538, + "completion_length": 36.20833420753479, + "epoch": 0.1432, + "grad_norm": 20.674697875976562, + "kl": 6.1153564453125, + "learning_rate": 5.294803124318145e-08, + "loss": -0.0081, + "num_tokens": 10748286.0, + "reward": 0.9437504410743713, + "reward_std": 0.11255369227728806, + "rewards/SMILES_validity_reward": 0.9500000029802322, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9192708395421505, - "rewards/reasoning_steps_reward": 0.010416666977107525, - "rewards/repetition_penalty_reward": -0.006773012763005681, - "rewards/smiles_len_reward": -0.06463625153992325, - "rewards/tag_count_reward": 0.016927083488553762, + "rewards/format_reward": 0.9791666641831398, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.0015593872813042253, + "rewards/smiles_len_reward": -0.0052083334885537624, + "rewards/tag_count_reward": 0.3567708330228925, "step": 358 }, { "clip_ratio": 0.0, - "completion_length": 67.45312595367432, - "epoch": 0.288, - "grad_norm": 26.583322525024414, - "kl": 47.1904296875, - "learning_rate": 5.848888922025552e-08, - "loss": 0.3307, - "num_tokens": 26243087.0, - "reward": 0.7812948487699032, - "reward_std": 0.35617859475314617, - "rewards/SMILES_validity_reward": 0.8124999888241291, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8958333358168602, - "rewards/reasoning_steps_reward": 0.015625000465661287, - "rewards/repetition_penalty_reward": -0.014582588351913728, - "rewards/smiles_len_reward": -0.08327753038611263, - "rewards/tag_count_reward": 0.020182291860692203, + "completion_length": 51.54166805744171, + "epoch": 0.144, + "grad_norm": 98.24833679199219, + "kl": 26.5751953125, + "learning_rate": 5.07514078146106e-08, + "loss": 0.072, + "num_tokens": 10802386.0, + "reward": 0.9014865681529045, + "reward_std": 0.18204960267757997, + "rewards/SMILES_validity_reward": 0.9166666716337204, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9374999962747097, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.0007607677835039794, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.3072916674427688, "step": 360 }, { "clip_ratio": 0.0, - "completion_length": 61.65625178813934, - "epoch": 0.2896, - "grad_norm": 303.911865234375, - "kl": 85.3046875, - "learning_rate": 5.601897790725643e-08, - "loss": 0.2272, - "num_tokens": 26365067.0, - "reward": 0.7704563029110432, - "reward_std": 0.4014833262190223, - "rewards/SMILES_validity_reward": 0.7875000052154064, + "completion_length": 51.531251430511475, + "epoch": 0.1448, + "grad_norm": 16.738651275634766, + "kl": 11.8388671875, + "learning_rate": 4.859616286322094e-08, + "loss": -0.0065, + "num_tokens": 10856485.0, + "reward": 0.8817995861172676, + "reward_std": 0.22947271226439625, + "rewards/SMILES_validity_reward": 0.8833333402872086, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9114583320915699, - "rewards/reasoning_steps_reward": 0.013020833721384406, - "rewards/repetition_penalty_reward": -0.008073668244833243, - "rewards/smiles_len_reward": -0.06679152825381607, - "rewards/tag_count_reward": 0.019531250407453626, + "rewards/format_reward": 0.9479166604578495, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.0033597408328205347, + "rewards/smiles_len_reward": -0.026041667442768812, + "rewards/tag_count_reward": 0.32031249813735485, "step": 362 }, { "clip_ratio": 0.0, - "completion_length": 43.49479305744171, - "epoch": 0.2912, - "grad_norm": 263.7744140625, - "kl": 203.021484375, - "learning_rate": 5.3595754051657476e-08, - "loss": 0.3918, - "num_tokens": 26480073.0, - "reward": 0.806282889097929, - "reward_std": 0.3312041540630162, - "rewards/SMILES_validity_reward": 0.8291666619479656, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9270833320915699, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.006226064411748666, - "rewards/smiles_len_reward": -0.04696196690201759, - "rewards/tag_count_reward": 0.02278645895421505, + "completion_length": 34.98958432674408, + "epoch": 0.1456, + "grad_norm": 24.977575302124023, + "kl": 15.1845703125, + "learning_rate": 4.648274397437829e-08, + "loss": 0.0125, + "num_tokens": 10908996.0, + "reward": 0.8553803451359272, + "reward_std": 0.26896312623284757, + "rewards/SMILES_validity_reward": 0.8500000070780516, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9374999925494194, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.0003645833348855376, + "rewards/smiles_len_reward": -0.031250000931322575, + "rewards/tag_count_reward": 0.3229166667442769, "step": 364 }, { "clip_ratio": 0.0, - "completion_length": 62.968751549720764, - "epoch": 0.2928, - "grad_norm": 22.697811126708984, - "kl": 32.64453125, - "learning_rate": 5.121980087628802e-08, - "loss": 0.2565, - "num_tokens": 26602557.0, - "reward": 0.7698783986270428, - "reward_std": 0.4065110133960843, - "rewards/SMILES_validity_reward": 0.7916666641831398, + "completion_length": 64.43750143051147, + "epoch": 0.1464, + "grad_norm": 23.089859008789062, + "kl": 14.002685546875, + "learning_rate": 4.4411590047320617e-08, + "loss": 0.0537, + "num_tokens": 10964334.0, + "reward": 0.9408868439495564, + "reward_std": 0.11353623877221253, + "rewards/SMILES_validity_reward": 0.9666666686534882, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8984375, - "rewards/reasoning_steps_reward": 0.0052083334885537624, - "rewards/repetition_penalty_reward": -0.009550296479574172, - "rewards/smiles_len_reward": -0.05859398643951863, - "rewards/tag_count_reward": 0.024739583779592067, + "rewards/format_reward": 0.9374999962747097, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.008841203583870083, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.34895833395421505, "step": 366 }, { "clip_ratio": 0.0, - "completion_length": 52.43489730358124, - "epoch": 0.2944, - "grad_norm": 16.701269149780273, - "kl": 39.60546875, - "learning_rate": 4.88916902268445e-08, - "loss": 0.3071, - "num_tokens": 26720996.0, - "reward": 0.8138786628842354, - "reward_std": 0.3093279884196818, - "rewards/SMILES_validity_reward": 0.845833320170641, + "completion_length": 87.88541948795319, + "epoch": 0.1472, + "grad_norm": 49.889408111572266, + "kl": 17.040771484375, + "learning_rate": 4.2383131204010494e-08, + "loss": 0.089, + "num_tokens": 11021923.0, + "reward": 0.8763796538114548, + "reward_std": 0.24184174370020628, + "rewards/SMILES_validity_reward": 0.8833333402872086, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9192708320915699, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.00628939275702578, - "rewards/smiles_len_reward": -0.05961279338225722, - "rewards/tag_count_reward": 0.018229166977107525, + "rewards/format_reward": 0.9270833283662796, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.01589235052233562, + "rewards/smiles_len_reward": -0.031250000931322575, + "rewards/tag_count_reward": 0.33593749767169356, "step": 368 }, { "clip_ratio": 0.0, - "completion_length": 53.37760519981384, - "epoch": 0.296, - "grad_norm": 41.683509826660156, - "kl": 99.9921875, - "learning_rate": 4.6611982434258124e-08, - "loss": 0.4217, - "num_tokens": 26839797.0, - "reward": 0.7909123674035072, - "reward_std": 0.3771476158872247, - "rewards/SMILES_validity_reward": 0.8333333171904087, + "completion_length": 56.270835399627686, + "epoch": 0.148, + "grad_norm": 12.946943283081055, + "kl": 6.9974365234375, + "learning_rate": 4.039778869981064e-08, + "loss": 0.0186, + "num_tokens": 11076477.0, + "reward": 0.7805087044835091, + "reward_std": 0.24972321733366698, + "rewards/SMILES_validity_reward": 0.7520833369344473, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9010416716337204, - "rewards/reasoning_steps_reward": 0.008680555794853717, - "rewards/repetition_penalty_reward": -0.008577862317906693, - "rewards/smiles_len_reward": -0.14827177836559713, - "rewards/tag_count_reward": 0.020833333488553762, + "rewards/format_reward": 0.9062499925494194, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.006372742980602197, + "rewards/smiles_len_reward": -0.026041667442768812, + "rewards/tag_count_reward": 0.3437500016298145, "step": 370 }, { "clip_ratio": 0.0, - "completion_length": 65.79687774181366, - "epoch": 0.2976, - "grad_norm": 9209.0556640625, - "kl": 1455.1953125, - "learning_rate": 4.438122617983442e-08, - "loss": 1.6162, - "num_tokens": 26963367.0, - "reward": 0.7700982913374901, - "reward_std": 0.3737562280148268, - "rewards/SMILES_validity_reward": 0.8041666597127914, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9062499962747097, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.009300025478296448, - "rewards/smiles_len_reward": -0.17930139059899375, - "rewards/tag_count_reward": 0.03385416720993817, + "completion_length": 55.385418176651, + "epoch": 0.1488, + "grad_norm": 14.288317680358887, + "kl": 13.7529296875, + "learning_rate": 3.845597483600049e-08, + "loss": 0.0478, + "num_tokens": 11130946.0, + "reward": 0.9122310355305672, + "reward_std": 0.16964219394139946, + "rewards/SMILES_validity_reward": 0.9333333373069763, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9374999962747097, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.007378652589977719, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.3046875009313226, "step": 372 }, { "clip_ratio": 0.0, - "completion_length": 55.71614730358124, - "epoch": 0.2992, - "grad_norm": 4210.1923828125, - "kl": 429.9921875, - "learning_rate": 4.219995836319631e-08, - "loss": 0.7209, - "num_tokens": 27083066.0, - "reward": 0.7837943881750107, - "reward_std": 0.3702095244079828, - "rewards/SMILES_validity_reward": 0.8083333298563957, + "completion_length": 68.54166889190674, + "epoch": 0.1496, + "grad_norm": 27.640518188476562, + "kl": 46.751708984375, + "learning_rate": 3.655809287415284e-08, + "loss": 0.0663, + "num_tokens": 11186678.0, + "reward": 0.8932533636689186, + "reward_std": 0.216169131686911, + "rewards/SMILES_validity_reward": 0.9166666716337204, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9192708320915699, - "rewards/reasoning_steps_reward": 0.004340277868323028, - "rewards/repetition_penalty_reward": -0.005971535720163956, - "rewards/smiles_len_reward": -0.0993585159885697, - "rewards/tag_count_reward": 0.022786458488553762, + "rewards/format_reward": 0.9166666604578495, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.015384390106191859, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.30208333022892475, "step": 374 }, { "clip_ratio": 0.0, - "completion_length": 60.216147780418396, - "epoch": 0.3008, - "grad_norm": 19.845266342163086, - "kl": 30.6083984375, - "learning_rate": 4.006870397306256e-08, - "loss": 0.3621, - "num_tokens": 27204493.0, - "reward": 0.8325573541224003, - "reward_std": 0.2710378540214151, - "rewards/SMILES_validity_reward": 0.8708333186805248, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9166666679084301, - "rewards/reasoning_steps_reward": 0.0052083334885537624, - "rewards/repetition_penalty_reward": -0.008276921271317406, - "rewards/smiles_len_reward": -0.039327892707660794, - "rewards/tag_count_reward": 0.022135416860692203, + "completion_length": 70.22916865348816, + "epoch": 0.1504, + "grad_norm": 20.095703125, + "kl": 14.381103515625, + "learning_rate": 3.4704536952387285e-08, + "loss": 0.0798, + "num_tokens": 11242572.0, + "reward": 0.89638776704669, + "reward_std": 0.19948702392866835, + "rewards/SMILES_validity_reward": 0.9000000040978193, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9374999962747097, + "rewards/reasoning_steps_reward": 0.02083333395421505, + "rewards/repetition_penalty_reward": -0.015811095014214516, + "rewards/smiles_len_reward": -0.026041667442768812, + "rewards/tag_count_reward": 0.37239583395421505, "step": 376 }, { "clip_ratio": 0.0, - "completion_length": 78.28906345367432, - "epoch": 0.3024, - "grad_norm": 19.69918441772461, - "kl": 43.9736328125, - "learning_rate": 3.798797596089351e-08, - "loss": 0.5056, - "num_tokens": 27332860.0, - "reward": 0.754853866994381, - "reward_std": 0.37358047830639407, - "rewards/SMILES_validity_reward": 0.7874999921768904, + "completion_length": 52.60416829586029, + "epoch": 0.1512, + "grad_norm": 100.34427642822266, + "kl": 31.5345458984375, + "learning_rate": 3.2895692003518575e-08, + "loss": 0.0002, + "num_tokens": 11296774.0, + "reward": 0.8779818527400494, + "reward_std": 0.2134934167843312, + "rewards/SMILES_validity_reward": 0.8833333365619183, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8984375, - "rewards/reasoning_steps_reward": 0.015625000465661287, - "rewards/repetition_penalty_reward": -0.009878307530016173, - "rewards/smiles_len_reward": -0.18325082340743393, - "rewards/tag_count_reward": 0.018229167151730508, + "rewards/format_reward": 0.9166666641831398, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.002474533444910776, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.3593749990686774, "step": 378 }, { "clip_ratio": 0.0, - "completion_length": 65.35677254199982, - "epoch": 0.304, - "grad_norm": 60.32386779785156, - "kl": 81.28125, - "learning_rate": 3.5958275117433404e-08, - "loss": 0.4199, - "num_tokens": 27456261.0, - "reward": 0.7353403887245804, - "reward_std": 0.3281521408353001, - "rewards/SMILES_validity_reward": 0.7466145819053054, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9036458358168602, - "rewards/reasoning_steps_reward": 0.010416666977107525, - "rewards/repetition_penalty_reward": -0.008604248070696485, - "rewards/smiles_len_reward": -0.09997228858992457, - "rewards/tag_count_reward": 0.014322916977107525, + "completion_length": 41.437501192092896, + "epoch": 0.152, + "grad_norm": 22.127193450927734, + "kl": 25.8388671875, + "learning_rate": 3.113193367511635e-08, + "loss": 0.053, + "num_tokens": 11349904.0, + "reward": 0.9402668289840221, + "reward_std": 0.1094548690598458, + "rewards/SMILES_validity_reward": 0.9666666686534882, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9479166604578495, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.0020205874752718955, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.3046875027939677, "step": 380 }, { "clip_ratio": 0.0, - "completion_length": 56.51562714576721, - "epoch": 0.3056, - "grad_norm": 231.04348754882812, - "kl": 87.3828125, - "learning_rate": 3.398008995217988e-08, - "loss": 0.3055, - "num_tokens": 27576267.0, - "reward": 0.7327682701870799, - "reward_std": 0.3798722317442298, - "rewards/SMILES_validity_reward": 0.7398437494412065, + "completion_length": 57.28125178813934, + "epoch": 0.1528, + "grad_norm": 33.59663391113281, + "kl": 21.923828125, + "learning_rate": 2.9413628251493934e-08, + "loss": 0.054, + "num_tokens": 11404555.0, + "reward": 0.8744519427418709, + "reward_std": 0.24664068571291864, + "rewards/SMILES_validity_reward": 0.8833333402872086, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8984374962747097, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.004630661387636792, - "rewards/smiles_len_reward": -0.07771396718453616, - "rewards/tag_count_reward": 0.027994791395030916, + "rewards/format_reward": 0.9166666604578495, + "rewards/reasoning_steps_reward": 0.013888889225199819, + "rewards/repetition_penalty_reward": -0.004787582263816148, + "rewards/smiles_len_reward": -0.03645833441987634, + "rewards/tag_count_reward": 0.33854166604578495, "step": 382 }, { "clip_ratio": 0.0, - "completion_length": 58.658855676651, - "epoch": 0.3072, - "grad_norm": 1089.19970703125, - "kl": 154.09765625, - "learning_rate": 3.205389657580943e-08, - "loss": 0.4028, - "num_tokens": 27697096.0, - "reward": 0.8069799989461899, - "reward_std": 0.35150486323982477, - "rewards/SMILES_validity_reward": 0.845833320170641, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9114583358168602, - "rewards/reasoning_steps_reward": 0.017361111589707434, - "rewards/repetition_penalty_reward": -0.00833509930089349, - "rewards/smiles_len_reward": -0.11136261757928878, - "rewards/tag_count_reward": 0.0169270834303461, + "completion_length": 48.187501668930054, + "epoch": 0.1536, + "grad_norm": 13.808212280273438, + "kl": 10.502685546875, + "learning_rate": 2.774113257764066e-08, + "loss": -0.0248, + "num_tokens": 11458333.0, + "reward": 0.9046551585197449, + "reward_std": 0.19689062098041177, + "rewards/SMILES_validity_reward": 0.9166666716337204, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9479166604578495, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.010741465914179571, + "rewards/smiles_len_reward": -0.026041667442768812, + "rewards/tag_count_reward": 0.32291666651144624, "step": 384 }, { "clip_ratio": 0.0, - "completion_length": 59.04166889190674, - "epoch": 0.3088, - "grad_norm": 62.683502197265625, - "kl": 65.48046875, - "learning_rate": 3.0180158585586395e-08, - "loss": 0.365, - "num_tokens": 27818072.0, - "reward": 0.7640376538038254, - "reward_std": 0.3756988551467657, - "rewards/SMILES_validity_reward": 0.7916666567325592, + "completion_length": 45.23958504199982, + "epoch": 0.1544, + "grad_norm": 16.571672439575195, + "kl": 9.0302734375, + "learning_rate": 2.611479398511518e-08, + "loss": 0.0151, + "num_tokens": 11511828.0, + "reward": 0.9085580185055733, + "reward_std": 0.16660702659282833, + "rewards/SMILES_validity_reward": 0.9166666697710752, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8828125037252903, - "rewards/reasoning_steps_reward": 0.006944444612599909, - "rewards/repetition_penalty_reward": -0.009555418229865609, - "rewards/smiles_len_reward": -0.07120654941536486, - "rewards/tag_count_reward": 0.024088541918899864, + "rewards/format_reward": 0.9374999925494194, + "rewards/reasoning_steps_reward": 0.02083333395421505, + "rewards/repetition_penalty_reward": -0.002962835831567645, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.3489583330228925, "step": 386 }, { "clip_ratio": 0.0, - "completion_length": 49.63021004199982, - "epoch": 0.3104, - "grad_norm": 13.964454650878906, - "kl": 261.11328125, - "learning_rate": 2.8359326953784735e-08, - "loss": 0.4462, - "num_tokens": 27935434.0, - "reward": 0.7693060860037804, - "reward_std": 0.38393950555473566, - "rewards/SMILES_validity_reward": 0.8125000037252903, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8619791641831398, - "rewards/reasoning_steps_reward": 0.010416666977107525, - "rewards/repetition_penalty_reward": -0.007570940229925327, - "rewards/smiles_len_reward": -0.11252047342713922, - "rewards/tag_count_reward": 0.029296875232830644, + "completion_length": 40.08333420753479, + "epoch": 0.1552, + "grad_norm": 17.277727127075195, + "kl": 19.247802734375, + "learning_rate": 2.4534950219914057e-08, + "loss": 0.0401, + "num_tokens": 11564828.0, + "reward": 0.934512734413147, + "reward_std": 0.12718994682654738, + "rewards/SMILES_validity_reward": 0.9500000029802322, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9687499962747097, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.005394943415012676, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.3046874990686774, "step": 388 }, { "clip_ratio": 0.0, - "completion_length": 47.25520956516266, - "epoch": 0.312, - "grad_norm": 1176.294677734375, - "kl": 183.84375, - "learning_rate": 2.659183991914696e-08, - "loss": 0.3188, - "num_tokens": 28051884.0, - "reward": 0.7658360535278916, - "reward_std": 0.34651170764118433, - "rewards/SMILES_validity_reward": 0.7898437320254743, + "completion_length": 53.80208480358124, + "epoch": 0.156, + "grad_norm": 16.590299606323242, + "kl": 11.242431640625, + "learning_rate": 2.300192937233128e-08, + "loss": -0.0123, + "num_tokens": 11619145.0, + "reward": 0.8780521750450134, + "reward_std": 0.1917848305311054, + "rewards/SMILES_validity_reward": 0.8833333365619183, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9036458358168602, - "rewards/reasoning_steps_reward": 0.010416666977107525, - "rewards/repetition_penalty_reward": -0.0037111646015546285, - "rewards/smiles_len_reward": -0.10511704150121659, - "rewards/tag_count_reward": 0.016927083604969084, + "rewards/format_reward": 0.9374999925494194, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.004375363059807569, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.3098958320915699, "step": 390 }, { "clip_ratio": 0.0, - "completion_length": 70.17187738418579, - "epoch": 0.3136, - "grad_norm": 21.213207244873047, - "kl": 44.240234375, - "learning_rate": 2.4878122881409447e-08, - "loss": 0.3338, - "num_tokens": 28177134.0, - "reward": 0.8168588802218437, - "reward_std": 0.33587119466392323, - "rewards/SMILES_validity_reward": 0.8375000096857548, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9296874925494194, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.008981447015685262, - "rewards/smiles_len_reward": 0.0019179416121914983, - "rewards/tag_count_reward": 0.016276042093522847, + "completion_length": 69.8645852804184, + "epoch": 0.1568, + "grad_norm": 20.95870018005371, + "kl": 25.18212890625, + "learning_rate": 2.1516049808822935e-08, + "loss": 0.0524, + "num_tokens": 11675004.0, + "reward": 0.9173811189830303, + "reward_std": 0.17535327681980561, + "rewards/SMILES_validity_reward": 0.9333333373069763, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9374999925494194, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.010565137352386955, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.3593750009313226, "step": 392 }, { "clip_ratio": 0.0, - "completion_length": 58.79948043823242, - "epoch": 0.3152, - "grad_norm": 2544.607666015625, - "kl": 461.828125, - "learning_rate": 2.3218588298916543e-08, - "loss": 0.7863, - "num_tokens": 28298017.0, - "reward": 0.7550071179866791, - "reward_std": 0.4074345175176859, - "rewards/SMILES_validity_reward": 0.7833333276212215, + "completion_length": 45.48958468437195, + "epoch": 0.1576, + "grad_norm": 13.889383316040039, + "kl": 14.10595703125, + "learning_rate": 2.007762010589098e-08, + "loss": 0.0016, + "num_tokens": 11728523.0, + "reward": 0.9576418250799179, + "reward_std": 0.0714196574408561, + "rewards/SMILES_validity_reward": 0.9833333343267441, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8802083358168602, - "rewards/reasoning_steps_reward": 0.017361111589707434, - "rewards/repetition_penalty_reward": -0.006937599915545434, - "rewards/smiles_len_reward": -0.10579633654560894, - "rewards/tag_count_reward": 0.021484375116415322, + "rewards/format_reward": 0.9583333320915699, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.004833227023482323, + "rewards/smiles_len_reward": -0.0052083334885537624, + "rewards/tag_count_reward": 0.3177083320915699, "step": 394 }, { "clip_ratio": 0.0, - "completion_length": 64.85156464576721, - "epoch": 0.3168, - "grad_norm": 19.701536178588867, - "kl": 35.046875, - "learning_rate": 2.1613635589349756e-08, - "loss": 0.2979, - "num_tokens": 28421224.0, - "reward": 0.7340408079326153, - "reward_std": 0.4440473485738039, - "rewards/SMILES_validity_reward": 0.7473958246409893, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8984374962747097, - "rewards/reasoning_steps_reward": 0.014756944961845875, - "rewards/repetition_penalty_reward": -0.007214406403363682, - "rewards/smiles_len_reward": -0.1137501149205491, - "rewards/tag_count_reward": 0.019531250116415322, + "completion_length": 46.35416781902313, + "epoch": 0.1584, + "grad_norm": 16.284929275512695, + "kl": 12.9434814453125, + "learning_rate": 1.8686938986000627e-08, + "loss": -0.0031, + "num_tokens": 11782125.0, + "reward": 0.9024706967175007, + "reward_std": 0.19274842808954418, + "rewards/SMILES_validity_reward": 0.9166666716337204, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9374999962747097, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.00133611261844635, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.3177083358168602, "step": 396 }, { "clip_ratio": 0.0, - "completion_length": 60.52343833446503, - "epoch": 0.3184, - "grad_norm": 25.242353439331055, - "kl": 39.62890625, - "learning_rate": 2.006365103359614e-08, - "loss": 0.3568, - "num_tokens": 28542769.0, - "reward": 0.7509439922869205, - "reward_std": 0.42264856584370136, - "rewards/SMILES_validity_reward": 0.7708333395421505, + "completion_length": 56.989585518836975, + "epoch": 0.1592, + "grad_norm": 16.189149856567383, + "kl": 9.690673828125, + "learning_rate": 1.734429525554365e-08, + "loss": -0.0609, + "num_tokens": 11836748.0, + "reward": 0.9177911207079887, + "reward_std": 0.17121762363240123, + "rewards/SMILES_validity_reward": 0.9333333373069763, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8880208358168602, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.011348883606842719, - "rewards/smiles_len_reward": -0.06319690844975412, - "rewards/tag_count_reward": 0.016276042093522847, + "rewards/format_reward": 0.9479166604578495, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.003861064527882263, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.3255208311602473, "step": 398 }, { "clip_ratio": 0.0, - "completion_length": 67.77343893051147, - "epoch": 0.32, - "grad_norm": 25.40789222717285, - "kl": 549.107421875, - "learning_rate": 1.8569007682777415e-08, - "loss": 0.8423, - "num_tokens": 28667098.0, - "reward": 0.7792046889662743, - "reward_std": 0.3950358787551522, - "rewards/SMILES_validity_reward": 0.8041666597127914, + "completion_length": 66.229168176651, + "epoch": 0.16, + "grad_norm": 93.22771453857422, + "kl": 26.119140625, + "learning_rate": 1.604996774486145e-08, + "loss": 0.0444, + "num_tokens": 11892258.0, + "reward": 0.8573517389595509, + "reward_std": 0.29699486307799816, + "rewards/SMILES_validity_reward": 0.8666666746139526, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9036458283662796, + "rewards/format_reward": 0.9166666604578495, "rewards/reasoning_steps_reward": 0.010416666977107525, - "rewards/repetition_penalty_reward": -0.012652922217966989, - "rewards/smiles_len_reward": -0.060145003226352856, - "rewards/tag_count_reward": 0.014322916977107525, + "rewards/repetition_penalty_reward": -0.011379826988559216, + "rewards/smiles_len_reward": -0.031250000931322575, + "rewards/tag_count_reward": 0.2890625, "step": 400 }, { "clip_ratio": 0.0, - "completion_length": 41.66145944595337, - "epoch": 0.3216, - "grad_norm": 20.436986923217773, - "kl": 34.26171875, - "learning_rate": 1.713006526846439e-08, - "loss": 0.1911, - "num_tokens": 28781400.0, - "reward": 0.8079907111823559, - "reward_std": 0.3260812449734658, - "rewards/SMILES_validity_reward": 0.8499999716877937, + "completion_length": 39.26041758060455, + "epoch": 0.1608, + "grad_norm": 19.394777297973633, + "kl": 15.634765625, + "learning_rate": 1.4804225250339281e-08, + "loss": 0.0331, + "num_tokens": 11945179.0, + "reward": 0.8919323273003101, + "reward_std": 0.1985253794118762, + "rewards/SMILES_validity_reward": 0.8833333365619183, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9088541641831398, + "rewards/format_reward": 0.9687499962747097, "rewards/reasoning_steps_reward": 0.0, - "rewards/repetition_penalty_reward": -0.004972287300915923, - "rewards/smiles_len_reward": -0.12488739204127342, - "rewards/tag_count_reward": 0.03320312505820766, + "rewards/repetition_penalty_reward": -0.000990475993603468, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.3463541679084301, "step": 402 }, { "clip_ratio": 0.0, - "completion_length": 63.95052218437195, - "epoch": 0.3232, - "grad_norm": 19.446300506591797, - "kl": 46.587890625, - "learning_rate": 1.574717011609633e-08, - "loss": 0.3809, - "num_tokens": 28904261.0, - "reward": 0.7811654321849346, - "reward_std": 0.3506924198009074, - "rewards/SMILES_validity_reward": 0.8124999962747097, + "completion_length": 80.62500202655792, + "epoch": 0.1616, + "grad_norm": 52.34968185424805, + "kl": 23.559814453125, + "learning_rate": 1.360732647858498e-08, + "loss": 0.1011, + "num_tokens": 12002071.0, + "reward": 0.8946188390254974, + "reward_std": 0.20518372696824372, + "rewards/SMILES_validity_reward": 0.9166666716337204, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8958333358168602, - "rewards/reasoning_steps_reward": 0.0052083334885537624, - "rewards/repetition_penalty_reward": -0.010433290342916735, - "rewards/smiles_len_reward": -0.08025730540975928, - "rewards/tag_count_reward": 0.022135416860692203, + "rewards/format_reward": 0.9062499962747097, + "rewards/reasoning_steps_reward": 0.0034722222480922937, + "rewards/repetition_penalty_reward": -0.015618514036759734, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.3437500009313226, "step": 404 }, { "clip_ratio": 0.0, - "completion_length": 76.74479413032532, - "epoch": 0.3248, - "grad_norm": 21.497493743896484, - "kl": 61.5185546875, - "learning_rate": 1.4420655061626929e-08, - "loss": 0.4488, - "num_tokens": 29032035.0, - "reward": 0.7920585982501507, - "reward_std": 0.34399935975670815, - "rewards/SMILES_validity_reward": 0.8125000037252903, + "completion_length": 61.145835757255554, + "epoch": 0.1624, + "grad_norm": 20.986385345458984, + "kl": 24.005859375, + "learning_rate": 1.2459519992702311e-08, + "loss": 0.0417, + "num_tokens": 12057093.0, + "reward": 0.8893291372805834, + "reward_std": 0.21797450783196837, + "rewards/SMILES_validity_reward": 0.9000000040978193, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9192708395421505, - "rewards/reasoning_steps_reward": 0.0017361111240461469, - "rewards/repetition_penalty_reward": -0.012168998313427437, - "rewards/smiles_len_reward": -0.03317499946570024, - "rewards/tag_count_reward": 0.0188802084303461, + "rewards/format_reward": 0.9270833283662796, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.013480799068929628, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.3463541669771075, "step": 406 }, { "clip_ratio": 0.0, - "completion_length": 58.755210280418396, - "epoch": 0.3264, - "grad_norm": 40.13300323486328, - "kl": 1958.30078125, - "learning_rate": 1.3150839371417699e-08, - "loss": 2.1492, - "num_tokens": 29152901.0, - "reward": 0.7800212763249874, - "reward_std": 0.3497500689700246, - "rewards/SMILES_validity_reward": 0.7945312596857548, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9296874962747097, - "rewards/reasoning_steps_reward": 0.0052083334885537624, - "rewards/repetition_penalty_reward": -0.007477993072825484, - "rewards/smiles_len_reward": -0.0684822405455634, - "rewards/tag_count_reward": 0.020182292151730508, + "completion_length": 43.91666829586029, + "epoch": 0.1632, + "grad_norm": 278.88031005859375, + "kl": 38.345947265625, + "learning_rate": 1.1361044160671629e-08, + "loss": -0.0215, + "num_tokens": 12110461.0, + "reward": 0.9176699332892895, + "reward_std": 0.1729820077889599, + "rewards/SMILES_validity_reward": 0.9333333373069763, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9374999925494194, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.002468873586622067, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.3541666669771075, "step": 408 }, { "clip_ratio": 0.0, - "completion_length": 51.61718952655792, - "epoch": 0.328, - "grad_norm": 40.98759460449219, - "kl": 50.23828125, - "learning_rate": 1.1938028665396171e-08, - "loss": 0.3593, - "num_tokens": 29271026.0, - "reward": 0.7865180224180222, - "reward_std": 0.3510497361421585, - "rewards/SMILES_validity_reward": 0.8124999925494194, + "completion_length": 75.67708659172058, + "epoch": 0.164, + "grad_norm": 18.142961502075195, + "kl": 17.00732421875, + "learning_rate": 1.0312127105846947e-08, + "loss": 0.097, + "num_tokens": 12166878.0, + "reward": 0.8851692602038383, + "reward_std": 0.2130603661062196, + "rewards/SMILES_validity_reward": 0.9000000040978193, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9088541641831398, - "rewards/reasoning_steps_reward": 0.0052083334885537624, - "rewards/repetition_penalty_reward": -0.007369535931502469, - "rewards/smiles_len_reward": -0.062347470491658896, - "rewards/tag_count_reward": 0.015625000291038305, + "rewards/format_reward": 0.9270833283662796, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.01341284648515284, + "rewards/smiles_len_reward": -0.031250000931322575, + "rewards/tag_count_reward": 0.3151041669771075, "step": 410 }, { "clip_ratio": 0.0, - "completion_length": 51.83333456516266, - "epoch": 0.3296, - "grad_norm": 16.528141021728516, - "kl": 21.453125, - "learning_rate": 1.0782514843499652e-08, - "loss": 0.2404, - "num_tokens": 29389234.0, - "reward": 0.7603043857961893, - "reward_std": 0.23452748105773935, - "rewards/SMILES_validity_reward": 0.7671874971129, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9427083283662796, - "rewards/reasoning_steps_reward": 0.0, - "rewards/repetition_penalty_reward": -0.0036121224620728754, - "rewards/smiles_len_reward": -0.10610569885466248, - "rewards/tag_count_reward": 0.014322916918899864, + "completion_length": 53.270835399627686, + "epoch": 0.1648, + "grad_norm": 1134.2469482421875, + "kl": 161.255859375, + "learning_rate": 9.312986659581301e-09, + "loss": 0.2592, + "num_tokens": 12221144.0, + "reward": 0.8985108807682991, + "reward_std": 0.21337384218350053, + "rewards/SMILES_validity_reward": 0.9000000059604645, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9479166641831398, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.007600876037031412, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.3541666651144624, "step": 412 }, { "clip_ratio": 0.0, - "completion_length": 58.78125202655792, - "epoch": 0.3312, - "grad_norm": 59.89474868774414, - "kl": 48.08984375, - "learning_rate": 9.684576015420275e-09, - "loss": 0.3184, - "num_tokens": 29510110.0, - "reward": 0.7541577331721783, - "reward_std": 0.4041676054475829, - "rewards/SMILES_validity_reward": 0.7833333387970924, + "completion_length": 44.82291769981384, + "epoch": 0.1656, + "grad_norm": 14.800688743591309, + "kl": 10.4658203125, + "learning_rate": 8.363830315988945e-09, + "loss": 0.0247, + "num_tokens": 12274599.0, + "reward": 0.9208331219851971, + "reward_std": 0.15400783298537135, + "rewards/SMILES_validity_reward": 0.9333333373069763, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8776041753590107, - "rewards/reasoning_steps_reward": 0.010416666977107525, - "rewards/repetition_penalty_reward": -0.011318007542286068, - "rewards/smiles_len_reward": -0.09189758577849716, - "rewards/tag_count_reward": 0.018229166802484542, + "rewards/format_reward": 0.9479166679084301, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.0020868037827312946, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.3489583358168602, "step": 414 }, { "clip_ratio": 0.0, - "completion_length": 43.156251668930054, - "epoch": 0.3328, - "grad_norm": 157.40074157714844, - "kl": 75.53125, - "learning_rate": 8.644476433669529e-09, - "loss": 0.3375, - "num_tokens": 29624986.0, - "reward": 0.7917684130370617, - "reward_std": 0.36183687672019005, - "rewards/SMILES_validity_reward": 0.829166654497385, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8958333320915699, - "rewards/reasoning_steps_reward": 0.0052083334885537624, - "rewards/repetition_penalty_reward": -0.0043341358250472695, - "rewards/smiles_len_reward": -0.09569119266234338, - "rewards/tag_count_reward": 0.020833333546761423, + "completion_length": 42.52083456516266, + "epoch": 0.1664, + "grad_norm": 40.16646194458008, + "kl": 24.19287109375, + "learning_rate": 7.46485518885462e-09, + "loss": -0.024, + "num_tokens": 12327833.0, + "reward": 0.9045818336308002, + "reward_std": 0.1850796421058476, + "rewards/SMILES_validity_reward": 0.9166666716337204, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9375, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.0010579937370494008, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.3281250009313226, "step": 416 }, { "clip_ratio": 0.0, - "completion_length": 64.85416805744171, - "epoch": 0.3344, - "grad_norm": 617.4863891601562, - "kl": 177.0859375, - "learning_rate": 7.662466429977698e-09, - "loss": 0.4221, - "num_tokens": 29748194.0, - "reward": 0.7439644634723663, - "reward_std": 0.3548463308252394, - "rewards/SMILES_validity_reward": 0.7591145820915699, + "completion_length": 46.656251311302185, + "epoch": 0.1672, + "grad_norm": 16.928592681884766, + "kl": 14.68310546875, + "learning_rate": 6.616247970698319e-09, + "loss": 0.0131, + "num_tokens": 12381464.0, + "reward": 0.933611448854208, + "reward_std": 0.1313752842215763, + "rewards/SMILES_validity_reward": 0.9500000029802322, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9062499888241291, - "rewards/reasoning_steps_reward": 0.0052083334885537624, - "rewards/repetition_penalty_reward": -0.00889256418668083, - "rewards/smiles_len_reward": -0.11266191606409848, - "rewards/tag_count_reward": 0.02343749994179234, + "rewards/format_reward": 0.9479166604578495, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.001386914482282009, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.3541666632518172, "step": 418 }, { "clip_ratio": 0.0, - "completion_length": 71.27344048023224, - "epoch": 0.336, - "grad_norm": 14.72851276397705, - "kl": 57.1376953125, - "learning_rate": 6.738782355044048e-09, - "loss": 0.3434, - "num_tokens": 29873867.0, - "reward": 0.7687530927360058, - "reward_std": 0.3611881284014089, - "rewards/SMILES_validity_reward": 0.7999999970197678, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8984375, - "rewards/reasoning_steps_reward": 0.006076389050576836, - "rewards/repetition_penalty_reward": -0.008875246570823947, - "rewards/smiles_len_reward": -0.12386404629796743, - "rewards/tag_count_reward": 0.018880208488553762, + "completion_length": 63.625001668930054, + "epoch": 0.168, + "grad_norm": 12.708135604858398, + "kl": 8.35107421875, + "learning_rate": 5.8181848940044855e-09, + "loss": 0.0416, + "num_tokens": 12436724.0, + "reward": 0.8699160404503345, + "reward_std": 0.2740453766891733, + "rewards/SMILES_validity_reward": 0.8833333402872086, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9166666567325592, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.007611631946929265, + "rewards/smiles_len_reward": -0.03645833441987634, + "rewards/tag_count_reward": 0.29947916604578495, "step": 420 }, { "clip_ratio": 0.0, - "completion_length": 69.51302433013916, - "epoch": 0.3376, - "grad_norm": 463.06427001953125, - "kl": 152.01171875, - "learning_rate": 5.8736465216517594e-09, - "loss": 0.4927, - "num_tokens": 29998864.0, - "reward": 0.7758666761219501, - "reward_std": 0.35622790618799627, - "rewards/SMILES_validity_reward": 0.8083333298563957, + "completion_length": 54.01041805744171, + "epoch": 0.1688, + "grad_norm": 13.965043067932129, + "kl": 47.185302734375, + "learning_rate": 5.070831694623135e-09, + "loss": 0.0942, + "num_tokens": 12491061.0, + "reward": 0.859567554667592, + "reward_std": 0.20586686272872612, + "rewards/SMILES_validity_reward": 0.8666666708886623, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8906250037252903, - "rewards/reasoning_steps_reward": 0.013020833721384406, - "rewards/repetition_penalty_reward": -0.013086474158626515, - "rewards/smiles_len_reward": -0.08645110134966671, - "rewards/tag_count_reward": 0.014973958546761423, + "rewards/format_reward": 0.9062499990686774, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.010054960977868177, + "rewards/smiles_len_reward": -0.026041667442768812, + "rewards/tag_count_reward": 0.3463541669771075, "step": 422 }, { "clip_ratio": 0.0, - "completion_length": 81.4921897649765, - "epoch": 0.3392, - "grad_norm": 89.55314636230469, - "kl": 86.578125, - "learning_rate": 5.067267151161514e-09, - "loss": 0.4928, - "num_tokens": 30128461.0, - "reward": 0.7535761334002018, - "reward_std": 0.40810330770909786, - "rewards/SMILES_validity_reward": 0.7750000096857548, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9010416604578495, - "rewards/reasoning_steps_reward": 0.018229166977107525, - "rewards/repetition_penalty_reward": -0.011182753019966185, - "rewards/smiles_len_reward": -0.11959353811107576, - "rewards/tag_count_reward": 0.020182291802484542, + "completion_length": 36.406251072883606, + "epoch": 0.1696, + "grad_norm": 25.02899932861328, + "kl": 15.2127685546875, + "learning_rate": 4.374343577351336e-09, + "loss": 0.0298, + "num_tokens": 12543708.0, + "reward": 0.9081405811011791, + "reward_std": 0.18163511087186635, + "rewards/SMILES_validity_reward": 0.9166666716337204, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9583333283662796, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.007137345732189715, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.3072916651144624, "step": 424 }, { "clip_ratio": 0.0, - "completion_length": 50.244792461395264, - "epoch": 0.3408, - "grad_norm": 14.1329345703125, - "kl": 39.599609375, - "learning_rate": 4.319838323396691e-09, - "loss": 0.2317, - "num_tokens": 30246059.0, - "reward": 0.7920440249145031, - "reward_std": 0.31801472790539265, - "rewards/SMILES_validity_reward": 0.8208333402872086, + "completion_length": 85.25000143051147, + "epoch": 0.1704, + "grad_norm": 129.71121215820312, + "kl": 40.9404296875, + "learning_rate": 3.7288651837012745e-09, + "loss": 0.1807, + "num_tokens": 12601044.0, + "reward": 0.8859259001910686, + "reward_std": 0.22173287137411535, + "rewards/SMILES_validity_reward": 0.9000000040978193, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9088541753590107, - "rewards/reasoning_steps_reward": 0.0052083334885537624, - "rewards/repetition_penalty_reward": -0.007406066724797711, - "rewards/smiles_len_reward": -0.06798831513151526, - "rewards/tag_count_reward": 0.018229166511446238, + "rewards/format_reward": 0.9062499962747097, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.003242230974137783, + "rewards/smiles_len_reward": -0.026041667442768812, + "rewards/tag_count_reward": 0.3697916632518172, "step": 426 }, { "clip_ratio": 0.0, - "completion_length": 58.59375178813934, - "epoch": 0.3424, - "grad_norm": 29.327720642089844, - "kl": 41.130859375, - "learning_rate": 3.631539929932148e-09, - "loss": 0.1712, - "num_tokens": 30366863.0, - "reward": 0.7371162544004619, - "reward_std": 0.37827688455581665, - "rewards/SMILES_validity_reward": 0.7453124960884452, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9088541604578495, - "rewards/reasoning_steps_reward": 0.006076389050576836, - "rewards/repetition_penalty_reward": -0.0067941894121759105, - "rewards/smiles_len_reward": -0.08619371574604884, - "rewards/tag_count_reward": 0.014322916977107525, + "completion_length": 45.937501668930054, + "epoch": 0.1712, + "grad_norm": 17.502323150634766, + "kl": 11.782958984375, + "learning_rate": 3.134530561862081e-09, + "loss": -0.0112, + "num_tokens": 12654606.0, + "reward": 0.9541554935276508, + "reward_std": 0.09704665496246889, + "rewards/SMILES_validity_reward": 0.9666666686534882, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9791666641831398, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.0037587993138004094, + "rewards/smiles_len_reward": -0.0052083334885537624, + "rewards/tag_count_reward": 0.3359375046566129, "step": 428 }, { "clip_ratio": 0.0, - "completion_length": 50.015626192092896, - "epoch": 0.344, - "grad_norm": 26.84274673461914, - "kl": 2524.26171875, - "learning_rate": 3.002537630797747e-09, - "loss": 2.7493, - "num_tokens": 30484373.0, - "reward": 0.7964664585888386, - "reward_std": 0.3583722086623311, - "rewards/SMILES_validity_reward": 0.8333333171904087, + "completion_length": 92.07291972637177, + "epoch": 0.172, + "grad_norm": 35.44276809692383, + "kl": 17.183837890625, + "learning_rate": 2.5914631388619103e-09, + "loss": 0.1045, + "num_tokens": 12712597.0, + "reward": 0.847147922962904, + "reward_std": 0.29424209939315915, + "rewards/SMILES_validity_reward": 0.8666666746139526, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9062500037252903, - "rewards/reasoning_steps_reward": 0.004340277868323028, - "rewards/repetition_penalty_reward": -0.006158172767754877, - "rewards/smiles_len_reward": -0.10187815490644425, - "rewards/tag_count_reward": 0.0162760415696539, + "rewards/format_reward": 0.8749999962747097, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.01966797193745151, + "rewards/smiles_len_reward": -0.031250000931322575, + "rewards/tag_count_reward": 0.3203125, "step": 430 }, { "clip_ratio": 0.0, - "completion_length": 43.679688572883606, - "epoch": 0.3456, - "grad_norm": 57.18663787841797, - "kl": 89.0, - "learning_rate": 2.4329828146074096e-09, - "loss": 0.2609, - "num_tokens": 30599450.0, - "reward": 0.7566449083387852, - "reward_std": 0.41301782708615065, - "rewards/SMILES_validity_reward": 0.7749999966472387, + "completion_length": 66.14583551883698, + "epoch": 0.1728, + "grad_norm": 27.857742309570312, + "kl": 25.1279296875, + "learning_rate": 2.0997756949353297e-09, + "loss": 0.0567, + "num_tokens": 12768099.0, + "reward": 0.7844962626695633, + "reward_std": 0.3947558330837637, + "rewards/SMILES_validity_reward": 0.7833333443850279, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9036458320915699, + "rewards/format_reward": 0.8749999925494194, "rewards/reasoning_steps_reward": 0.0, - "rewards/repetition_penalty_reward": -0.005343449403881095, - "rewards/smiles_len_reward": -0.08693266217596829, - "rewards/tag_count_reward": 0.0227864584303461, + "rewards/repetition_penalty_reward": -0.013371882087085396, + "rewards/smiles_len_reward": -0.0416666679084301, + "rewards/tag_count_reward": 0.29166666604578495, "step": 432 }, { "clip_ratio": 0.0, - "completion_length": 49.11458432674408, - "epoch": 0.3472, - "grad_norm": 154.4663848876953, - "kl": 46.12890625, - "learning_rate": 1.9230125621225725e-09, - "loss": 0.2244, - "num_tokens": 30716614.0, - "reward": 0.7455132007598877, - "reward_std": 0.32910655019804835, - "rewards/SMILES_validity_reward": 0.7690104194916785, + "completion_length": 41.08333420753479, + "epoch": 0.1736, + "grad_norm": 18.345930099487305, + "kl": 8.83349609375, + "learning_rate": 1.6595703401020844e-09, + "loss": -0.0442, + "num_tokens": 12821195.0, + "reward": 0.9463203363120556, + "reward_std": 0.10566021921113133, + "rewards/SMILES_validity_reward": 0.9666666686534882, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8854166679084301, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.005490455321705667, - "rewards/smiles_len_reward": -0.09758184582460672, - "rewards/tag_count_reward": 0.011067708488553762, + "rewards/format_reward": 0.9583333283662796, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.0013811642420478165, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.3333333311602473, "step": 434 }, { "clip_ratio": 0.0, - "completion_length": 50.18229281902313, - "epoch": 0.3488, - "grad_norm": 22.847536087036133, - "kl": 50.4775390625, - "learning_rate": 1.4727496132596605e-09, - "loss": 0.2611, - "num_tokens": 30834188.0, - "reward": 0.7400539442896843, - "reward_std": 0.3555171948391944, - "rewards/SMILES_validity_reward": 0.7494791620410979, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9140624962747097, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.00538063397834776, - "rewards/smiles_len_reward": -0.10606034100055695, - "rewards/tag_count_reward": 0.015625000116415322, + "completion_length": 56.88541841506958, + "epoch": 0.1744, + "grad_norm": 13.988067626953125, + "kl": 7.73828125, + "learning_rate": 1.2709384929615596e-09, + "loss": 0.0832, + "num_tokens": 12875808.0, + "reward": 0.9088719300925732, + "reward_std": 0.1888645300641656, + "rewards/SMILES_validity_reward": 0.9166666716337204, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9374999962747097, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.005031941807828844, + "rewards/smiles_len_reward": -0.02083333395421505, + "rewards/tag_count_reward": 0.38541666604578495, "step": 436 }, { "clip_ratio": 0.0, - "completion_length": 49.55468928813934, - "epoch": 0.3504, - "grad_norm": 204.87474060058594, - "kl": 68.55859375, - "learning_rate": 1.0823023375489126e-09, - "loss": 0.2751, - "num_tokens": 30951521.0, - "reward": 0.7930208593606949, - "reward_std": 0.3214792348444462, - "rewards/SMILES_validity_reward": 0.8333333283662796, + "completion_length": 54.47916769981384, + "epoch": 0.1752, + "grad_norm": 16.43869972229004, + "kl": 43.60986328125, + "learning_rate": 9.339608617077165e-10, + "loss": 0.0403, + "num_tokens": 12930190.0, + "reward": 0.8349853716790676, + "reward_std": 0.13201338605722412, + "rewards/SMILES_validity_reward": 0.8020833358168602, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9062500074505806, - "rewards/reasoning_steps_reward": 0.004340277810115367, - "rewards/repetition_penalty_reward": -0.005328564649971668, - "rewards/smiles_len_reward": -0.14823145783157088, - "rewards/tag_count_reward": 0.027343750291038305, + "rewards/format_reward": 0.9687499962747097, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.0017100500699598342, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.34635416558012366, "step": 438 }, { "clip_ratio": 0.0, - "completion_length": 60.58333492279053, - "epoch": 0.352, - "grad_norm": 53.10478591918945, - "kl": 57.669921875, - "learning_rate": 7.51764708051994e-10, - "loss": 0.3782, - "num_tokens": 31073089.0, - "reward": 0.7825033217668533, - "reward_std": 0.3621302582323551, - "rewards/SMILES_validity_reward": 0.8208333365619183, + "completion_length": 36.86458420753479, + "epoch": 0.176, + "grad_norm": 17.14826011657715, + "kl": 28.635498046875, + "learning_rate": 6.487074273681114e-10, + "loss": 0.0276, + "num_tokens": 12982881.0, + "reward": 0.9228818565607071, + "reward_std": 0.1395076098269783, + "rewards/SMILES_validity_reward": 0.9500000029802322, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8906250037252903, - "rewards/reasoning_steps_reward": 0.0052083334885537624, - "rewards/repetition_penalty_reward": -0.008823041369396378, - "rewards/smiles_len_reward": -0.10533744562417269, - "rewards/tag_count_reward": 0.016276041977107525, + "rewards/format_reward": 0.9270833320915699, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.001911880768602714, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.3151041688397527, "step": 440 }, { "clip_ratio": 0.0, - "completion_length": 42.38020920753479, - "epoch": 0.3536, - "grad_norm": 537.83251953125, - "kl": 135.3125, - "learning_rate": 4.812162787445062e-10, - "loss": 0.3406, - "num_tokens": 31187667.0, - "reward": 0.7788260616362095, - "reward_std": 0.3945662109181285, - "rewards/SMILES_validity_reward": 0.8125, + "completion_length": 57.86458492279053, + "epoch": 0.1768, + "grad_norm": 17.069561004638672, + "kl": 9.9276123046875, + "learning_rate": 4.152374292708538e-10, + "loss": 0.034, + "num_tokens": 13037588.0, + "reward": 0.8790950290858746, + "reward_std": 0.25983966258354485, + "rewards/SMILES_validity_reward": 0.8833333402872086, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.8958333320915699, - "rewards/reasoning_steps_reward": 0.008680555794853717, - "rewards/repetition_penalty_reward": -0.004561349313007668, - "rewards/smiles_len_reward": -0.10648488637525588, - "rewards/tag_count_reward": 0.015625, + "rewards/format_reward": 0.9374999925494194, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.01217609562445432, + "rewards/smiles_len_reward": -0.026041667442768812, + "rewards/tag_count_reward": 0.3333333320915699, "step": 442 }, { "clip_ratio": 0.0, - "completion_length": 83.73437654972076, - "epoch": 0.3552, - "grad_norm": 31.371912002563477, - "kl": 57.451171875, - "learning_rate": 2.707221653688585e-10, - "loss": 0.5198, - "num_tokens": 31318125.0, - "reward": 0.8033189512789249, - "reward_std": 0.3403410839382559, - "rewards/SMILES_validity_reward": 0.8458333276212215, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9010416641831398, - "rewards/reasoning_steps_reward": 0.010416666977107525, - "rewards/repetition_penalty_reward": -0.009130334568908438, - "rewards/smiles_len_reward": -0.12135331379249692, - "rewards/tag_count_reward": 0.02929687494179234, + "completion_length": 52.05208492279053, + "epoch": 0.1776, + "grad_norm": 22.546499252319336, + "kl": 13.678955078125, + "learning_rate": 2.3359935274214204e-10, + "loss": 0.0059, + "num_tokens": 13091737.0, + "reward": 0.9071716498583555, + "reward_std": 0.14055689855013043, + "rewards/SMILES_validity_reward": 0.9166666679084301, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9479166641831398, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.006409892288502306, + "rewards/smiles_len_reward": -0.020833333488553762, + "rewards/tag_count_reward": 0.3385416632518172, "step": 444 }, { "clip_ratio": 0.0, - "completion_length": 52.46354329586029, - "epoch": 0.3568, - "grad_norm": 19.290857315063477, - "kl": 40.796875, - "learning_rate": 1.203330297622207e-10, - "loss": 0.2786, - "num_tokens": 31436575.0, - "reward": 0.7805712670087814, - "reward_std": 0.4029743252322078, - "rewards/SMILES_validity_reward": 0.8000000007450581, + "completion_length": 47.010418176651, + "epoch": 0.1784, + "grad_norm": 15.367889404296875, + "kl": 19.07421875, + "learning_rate": 1.0383091903720665e-10, + "loss": 0.0431, + "num_tokens": 13145402.0, + "reward": 0.9057391881942749, + "reward_std": 0.1908833612978924, + "rewards/SMILES_validity_reward": 0.9333333373069763, "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9114583358168602, - "rewards/reasoning_steps_reward": 0.006944444612599909, - "rewards/repetition_penalty_reward": -0.004496393317822367, - "rewards/smiles_len_reward": -0.04478339894558303, - "rewards/tag_count_reward": 0.013671875174622983, + "rewards/format_reward": 0.9062499925494194, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.004588633368257433, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.3098958358168602, "step": 446 }, { "clip_ratio": 0.0, - "completion_length": 60.22916829586029, - "epoch": 0.3584, - "grad_norm": 32.02495193481445, - "kl": 48.533203125, - "learning_rate": 3.008506766313812e-11, - "loss": 0.3244, - "num_tokens": 31558007.0, - "reward": 0.80455506965518, - "reward_std": 0.32425580685958266, - "rewards/SMILES_validity_reward": 0.8416666425764561, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9010416679084301, - "rewards/reasoning_steps_reward": 0.018229166977107525, - "rewards/repetition_penalty_reward": -0.007674921703255677, - "rewards/smiles_len_reward": -0.08323407603893429, - "rewards/tag_count_reward": 0.02343750005820766, + "completion_length": 61.66666793823242, + "epoch": 0.1792, + "grad_norm": 11.217643737792969, + "kl": 11.876708984375, + "learning_rate": 2.595907750671533e-11, + "loss": 0.0867, + "num_tokens": 13200474.0, + "reward": 0.9117394872009754, + "reward_std": 0.16433565050829202, + "rewards/SMILES_validity_reward": 0.9333333373069763, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9270833358168602, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.009689704194897786, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.3177083330228925, "step": 448 }, { "clip_ratio": 0.0, - "completion_length": 55.799480676651, - "epoch": 0.36, - "grad_norm": 16.788972854614258, - "kl": 34.279296875, + "completion_length": 49.55208456516266, + "epoch": 0.18, + "grad_norm": 20.237873077392578, + "kl": 16.04541015625, "learning_rate": 0.0, - "loss": 0.353, - "num_tokens": 31677738.0, - "reward": 0.7685777321457863, - "reward_std": 0.3936071125790477, - "rewards/SMILES_validity_reward": 0.7916666679084301, - "rewards/cosine_scaled_reward": -0.4999987781047821, - "rewards/format_reward": 0.9088541679084301, - "rewards/reasoning_steps_reward": 0.007812500232830644, - "rewards/repetition_penalty_reward": -0.0068950422428315505, - "rewards/smiles_len_reward": -0.09313616005238146, - "rewards/tag_count_reward": 0.009765625058207661, + "loss": -0.0015, + "num_tokens": 13254383.0, + "reward": 0.8980720601975918, + "reward_std": 0.19482463505119085, + "rewards/SMILES_validity_reward": 0.9000000059604645, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9479166641831398, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.006780669005820528, + "rewards/smiles_len_reward": -0.015625000465661287, + "rewards/tag_count_reward": 0.3593749962747097, "step": 450 }, { - "epoch": 0.36, + "epoch": 0.18, "step": 450, "total_flos": 0.0, - "train_loss": 0.6825665244791242, - "train_runtime": 21029.9945, - "train_samples_per_second": 4.108, - "train_steps_per_second": 0.021 + "train_loss": 0.053622997744823805, + "train_runtime": 6963.8787, + "train_samples_per_second": 3.102, + "train_steps_per_second": 0.065 } ], "logging_steps": 2, @@ -4537,7 +4537,7 @@ } }, "total_flos": 0.0, - "train_batch_size": 8, + "train_batch_size": 2, "trial_name": null, "trial_params": null }