diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4543 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.36, + "eval_steps": 500, + "global_step": 450, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 328.68490409851074, + "epoch": 0.0016, + "grad_norm": 3.912095069885254, + "kl": 0.0, + "learning_rate": 2.222222222222222e-08, + "loss": 0.5058, + "num_tokens": 224519.0, + "reward": -0.46514276787638664, + "reward_std": 0.04444521979894489, + "rewards/SMILES_validity_reward": -0.59166669100523, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.007812500232830644, + "rewards/reasoning_steps_reward": 0.0434027785086073, + "rewards/repetition_penalty_reward": -0.07269583910237998, + "rewards/smiles_len_reward": -0.003906250116415322, + "rewards/tag_count_reward": 0.0, + "step": 2 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.9791736602783, + "epoch": 0.0032, + "grad_norm": 4.011274814605713, + "kl": 0.0010592937469482422, + "learning_rate": 4.444444444444444e-08, + "loss": 0.5611, + "num_tokens": 441087.0, + "reward": -0.46445096656680107, + "reward_std": 0.046506868267897516, + "rewards/SMILES_validity_reward": -0.587500024586916, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0052083334885537624, + "rewards/reasoning_steps_reward": 0.029513889166992158, + "rewards/repetition_penalty_reward": -0.06998793449020013, + "rewards/smiles_len_reward": -0.007812500232830644, + "rewards/tag_count_reward": 0.0006510416860692203, + "step": 4 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.9869842529297, + "epoch": 0.0048, + "grad_norm": 2.4506795406341553, + "kl": 0.0011034011840820312, + "learning_rate": 6.666666666666667e-08, + "loss": 0.3471, + "num_tokens": 652666.0, + "reward": -0.4590358715504408, + "reward_std": 0.058758297411259264, + "rewards/SMILES_validity_reward": -0.583333358168602, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.007812500232830644, + "rewards/reasoning_steps_reward": 0.0442708347691223, + "rewards/repetition_penalty_reward": -0.0656199580989778, + "rewards/smiles_len_reward": -0.009114583604969084, + "rewards/tag_count_reward": 0.0, + "step": 6 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.95313262939453, + "epoch": 0.0064, + "grad_norm": 4.508991241455078, + "kl": 0.001209259033203125, + "learning_rate": 8.888888888888888e-08, + "loss": 0.2718, + "num_tokens": 861160.0, + "reward": -0.4643410835415125, + "reward_std": 0.044357261242112145, + "rewards/SMILES_validity_reward": -0.59166669100523, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.010416666977107525, + "rewards/reasoning_steps_reward": 0.034722222946584225, + "rewards/repetition_penalty_reward": -0.06381095026154071, + "rewards/smiles_len_reward": -0.006510416860692203, + "rewards/tag_count_reward": 0.0026041666860692203, + "step": 8 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.7838635444641, + "epoch": 0.008, + "grad_norm": 3.3212454319000244, + "kl": 0.0011568069458007812, + "learning_rate": 1.111111111111111e-07, + "loss": 0.3496, + "num_tokens": 1066517.0, + "reward": -0.4583498015999794, + "reward_std": 0.06702683249022812, + "rewards/SMILES_validity_reward": -0.583333358168602, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.015625000465661287, + "rewards/reasoning_steps_reward": 0.026909722946584225, + "rewards/repetition_penalty_reward": -0.0630204735789448, + "rewards/smiles_len_reward": -0.01092973274353426, + "rewards/tag_count_reward": 0.0, + "step": 10 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.56511402130127, + "epoch": 0.0096, + "grad_norm": 2.4336678981781006, + "kl": 0.0010323524475097656, + "learning_rate": 1.3333333333333334e-07, + "loss": 0.2837, + "num_tokens": 1310574.0, + "reward": -0.461483683437109, + "reward_std": 0.04768646776210517, + "rewards/SMILES_validity_reward": -0.587500024586916, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0052083334885537624, + "rewards/reasoning_steps_reward": 0.07031250116415322, + "rewards/repetition_penalty_reward": -0.08111370843835175, + "rewards/smiles_len_reward": -0.007812500232830644, + "rewards/tag_count_reward": 0.0006510416860692203, + "step": 12 + }, + { + "clip_ratio": 0.0, + "completion_length": 372.9218864440918, + "epoch": 0.0112, + "grad_norm": 2.4363439083099365, + "kl": 0.0010030269622802734, + "learning_rate": 1.5555555555555556e-07, + "loss": 0.5532, + "num_tokens": 1552080.0, + "reward": -0.4688709732145071, + "reward_std": 0.03987264301395044, + "rewards/SMILES_validity_reward": -0.59166669100523, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.04427083511836827, + "rewards/repetition_penalty_reward": -0.08675744908396155, + "rewards/smiles_len_reward": -0.0052083334885537624, + "rewards/tag_count_reward": 0.0006510416860692203, + "step": 14 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.2369899749756, + "epoch": 0.0128, + "grad_norm": 3.011733293533325, + "kl": 0.0010938644409179688, + "learning_rate": 1.7777777777777776e-07, + "loss": 0.4054, + "num_tokens": 1764139.0, + "reward": -0.4688110891729593, + "reward_std": 0.03221098065841943, + "rewards/SMILES_validity_reward": -0.5958333574235439, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0052083334885537624, + "rewards/reasoning_steps_reward": 0.03906250122236088, + "rewards/repetition_penalty_reward": -0.06805969902779907, + "rewards/smiles_len_reward": -0.003906250116415322, + "rewards/tag_count_reward": 0.0, + "step": 16 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.020845413208, + "epoch": 0.0144, + "grad_norm": 2.7837798595428467, + "kl": 0.0011706352233886719, + "learning_rate": 2e-07, + "loss": 0.4862, + "num_tokens": 1988019.0, + "reward": -0.47093865275382996, + "reward_std": 0.026894541137153283, + "rewards/SMILES_validity_reward": -0.5958333574235439, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.03732639015652239, + "rewards/repetition_penalty_reward": -0.07132309174630791, + "rewards/smiles_len_reward": -0.0052083334885537624, + "rewards/tag_count_reward": 0.0006510416860692203, + "step": 18 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.48177909851074, + "epoch": 0.016, + "grad_norm": 2.6016883850097656, + "kl": 0.0010716915130615234, + "learning_rate": 2.222222222222222e-07, + "loss": 0.5166, + "num_tokens": 2223980.0, + "reward": -0.4630376435816288, + "reward_std": 0.05124675569823012, + "rewards/SMILES_validity_reward": -0.587500024586916, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.007812500232830644, + "rewards/reasoning_steps_reward": 0.03819444519467652, + "rewards/repetition_penalty_reward": -0.07320236065424979, + "rewards/smiles_len_reward": -0.006957859538488265, + "rewards/tag_count_reward": 0.0006510416860692203, + "step": 20 + }, + { + "clip_ratio": 0.0, + "completion_length": 375.92709732055664, + "epoch": 0.0176, + "grad_norm": 2.962827682495117, + "kl": 0.0009744167327880859, + "learning_rate": 2.4444444444444445e-07, + "loss": 0.6561, + "num_tokens": 2466640.0, + "reward": -0.4662686139345169, + "reward_std": 0.046040316578000784, + "rewards/SMILES_validity_reward": -0.59166669100523, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.007812500232830644, + "rewards/reasoning_steps_reward": 0.04079861199716106, + "rewards/repetition_penalty_reward": -0.07874601823277771, + "rewards/smiles_len_reward": -0.007812500232830644, + "rewards/tag_count_reward": 0.0013020833721384406, + "step": 22 + }, + { + "clip_ratio": 0.0, + "completion_length": 355.31772327423096, + "epoch": 0.0192, + "grad_norm": 3.939411163330078, + "kl": 0.0012691020965576172, + "learning_rate": 2.6666666666666667e-07, + "loss": 0.3223, + "num_tokens": 2701386.0, + "reward": -0.46141638420522213, + "reward_std": 0.06114580819848925, + "rewards/SMILES_validity_reward": -0.583333358168602, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0052083334885537624, + "rewards/reasoning_steps_reward": 0.047743057599291205, + "rewards/repetition_penalty_reward": -0.08709702454507351, + "rewards/smiles_len_reward": -0.007102272938936949, + "rewards/tag_count_reward": 0.0, + "step": 24 + }, + { + "clip_ratio": 0.0, + "completion_length": 409.07032585144043, + "epoch": 0.0208, + "grad_norm": 1.7252240180969238, + "kl": 0.0011508464813232422, + "learning_rate": 2.8888888888888885e-07, + "loss": 0.2639, + "num_tokens": 2956773.0, + "reward": -0.4627871084958315, + "reward_std": 0.05538264673668891, + "rewards/SMILES_validity_reward": -0.587500024586916, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0026041667442768812, + "rewards/reasoning_steps_reward": 0.07204861339414492, + "rewards/repetition_penalty_reward": -0.08872260828502476, + "rewards/smiles_len_reward": -0.007812500232830644, + "rewards/tag_count_reward": 0.0013020833721384406, + "step": 26 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.5130281448364, + "epoch": 0.0224, + "grad_norm": 3.2929608821868896, + "kl": 0.0012526512145996094, + "learning_rate": 3.111111111111111e-07, + "loss": 0.2808, + "num_tokens": 3186602.0, + "reward": -0.4632618837058544, + "reward_std": 0.04645203723339364, + "rewards/SMILES_validity_reward": -0.59166669100523, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.010416666977107525, + "rewards/reasoning_steps_reward": 0.06597222399432212, + "rewards/repetition_penalty_reward": -0.07775854406645522, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.0, + "step": 28 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.0312604904175, + "epoch": 0.024, + "grad_norm": 3.8771326541900635, + "kl": 0.0015463829040527344, + "learning_rate": 3.333333333333333e-07, + "loss": 0.457, + "num_tokens": 3406646.0, + "reward": -0.4599665645509958, + "reward_std": 0.05891757505014539, + "rewards/SMILES_validity_reward": -0.583333358168602, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0052083334885537624, + "rewards/reasoning_steps_reward": 0.03906250064028427, + "rewards/repetition_penalty_reward": -0.06320823275018483, + "rewards/smiles_len_reward": -0.007812500232830644, + "rewards/tag_count_reward": 0.0, + "step": 30 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.46615171432495, + "epoch": 0.0256, + "grad_norm": 4.99418306350708, + "kl": 0.0019631385803222656, + "learning_rate": 3.5555555555555553e-07, + "loss": 0.4474, + "num_tokens": 3605737.0, + "reward": -0.45110186748206615, + "reward_std": 0.08477685746038333, + "rewards/SMILES_validity_reward": -0.5763021092861891, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.020833333721384406, + "rewards/reasoning_steps_reward": 0.03906250145519152, + "rewards/repetition_penalty_reward": -0.06543084210716188, + "rewards/smiles_len_reward": -0.014338664012029767, + "rewards/tag_count_reward": 0.0013020833721384406, + "step": 32 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.69532203674316, + "epoch": 0.0272, + "grad_norm": 3.533830165863037, + "kl": 0.0023183822631835938, + "learning_rate": 3.7777777777777775e-07, + "loss": 0.594, + "num_tokens": 3831412.0, + "reward": -0.4669134635478258, + "reward_std": 0.0388556448451709, + "rewards/SMILES_validity_reward": -0.59166669100523, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0026041667442768812, + "rewards/reasoning_steps_reward": 0.052083334478084, + "rewards/repetition_penalty_reward": -0.07955215487163514, + "rewards/smiles_len_reward": -0.009114583604969084, + "rewards/tag_count_reward": 0.0013020833721384406, + "step": 34 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.330735206604, + "epoch": 0.0288, + "grad_norm": 5.293888092041016, + "kl": 0.0037059783935546875, + "learning_rate": 4e-07, + "loss": 0.4527, + "num_tokens": 4027379.0, + "reward": -0.4606757778674364, + "reward_std": 0.055119884957093745, + "rewards/SMILES_validity_reward": -0.583333358168602, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0026041667442768812, + "rewards/reasoning_steps_reward": 0.03385416738456115, + "rewards/repetition_penalty_reward": -0.05780024908017367, + "rewards/smiles_len_reward": -0.008593750302679837, + "rewards/tag_count_reward": 0.0013020833721384406, + "step": 36 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.2604250907898, + "epoch": 0.0304, + "grad_norm": 2.2371666431427, + "kl": 0.003749847412109375, + "learning_rate": 4.222222222222222e-07, + "loss": 0.1871, + "num_tokens": 4257879.0, + "reward": -0.4692084323614836, + "reward_std": 0.0285613224550616, + "rewards/SMILES_validity_reward": -0.6000000238418579, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.007812500232830644, + "rewards/reasoning_steps_reward": 0.06597222341224551, + "rewards/repetition_penalty_reward": -0.07172926003113389, + "rewards/smiles_len_reward": -0.011718750349245965, + "rewards/tag_count_reward": 0.001953125058207661, + "step": 38 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.0260524749756, + "epoch": 0.032, + "grad_norm": 3.276158094406128, + "kl": 0.0050830841064453125, + "learning_rate": 4.444444444444444e-07, + "loss": 0.3235, + "num_tokens": 4486369.0, + "reward": -0.4574108552187681, + "reward_std": 0.0688521406846121, + "rewards/SMILES_validity_reward": -0.583333358168602, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.010416666977107525, + "rewards/reasoning_steps_reward": 0.05989583517657593, + "rewards/repetition_penalty_reward": -0.07410931127378717, + "rewards/smiles_len_reward": -0.011718750349245965, + "rewards/tag_count_reward": 0.003906250116415322, + "step": 40 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.33333778381348, + "epoch": 0.0336, + "grad_norm": 3.6556997299194336, + "kl": 0.006580352783203125, + "learning_rate": 4.6666666666666666e-07, + "loss": 0.5006, + "num_tokens": 4681953.0, + "reward": -0.46202344447374344, + "reward_std": 0.052374251157743856, + "rewards/SMILES_validity_reward": -0.587500024586916, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.010416666977107525, + "rewards/reasoning_steps_reward": 0.0269097225391306, + "rewards/repetition_penalty_reward": -0.0574314376572147, + "rewards/smiles_len_reward": -0.009114583604969084, + "rewards/tag_count_reward": 0.0006510416860692203, + "step": 42 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.24480056762695, + "epoch": 0.0352, + "grad_norm": 4.723124027252197, + "kl": 0.009726524353027344, + "learning_rate": 4.888888888888889e-07, + "loss": 0.619, + "num_tokens": 4896703.0, + "reward": -0.460586316883564, + "reward_std": 0.05454236414516345, + "rewards/SMILES_validity_reward": -0.583333358168602, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.007812500232830644, + "rewards/reasoning_steps_reward": 0.034722223004791886, + "rewards/repetition_penalty_reward": -0.07157582964282483, + "rewards/smiles_len_reward": -0.009114583604969084, + "rewards/tag_count_reward": 0.0, + "step": 44 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.3046941757202, + "epoch": 0.0368, + "grad_norm": 3.3449621200561523, + "kl": 0.016521453857421875, + "learning_rate": 4.999924786199418e-07, + "loss": 0.4126, + "num_tokens": 5121844.0, + "reward": -0.4563949555158615, + "reward_std": 0.07530709472484887, + "rewards/SMILES_validity_reward": -0.579166691750288, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.010416666977107525, + "rewards/reasoning_steps_reward": 0.03993055655155331, + "rewards/repetition_penalty_reward": -0.07119860564125702, + "rewards/smiles_len_reward": -0.010416666977107525, + "rewards/tag_count_reward": 0.0006510416860692203, + "step": 46 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.0156307220459, + "epoch": 0.0384, + "grad_norm": 5.478763103485107, + "kl": 0.02780914306640625, + "learning_rate": 4.999323102948654e-07, + "loss": 0.345, + "num_tokens": 5320378.0, + "reward": -0.4597903210669756, + "reward_std": 0.04729547622264363, + "rewards/SMILES_validity_reward": -0.587500024586916, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.010416666977107525, + "rewards/reasoning_steps_reward": 0.04166666732635349, + "rewards/repetition_penalty_reward": -0.05049094167770818, + "rewards/smiles_len_reward": -0.007829760666936636, + "rewards/tag_count_reward": 0.0, + "step": 48 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.11719608306885, + "epoch": 0.04, + "grad_norm": 4.790470600128174, + "kl": 0.0424652099609375, + "learning_rate": 4.998119881260575e-07, + "loss": 0.3691, + "num_tokens": 5517415.0, + "reward": -0.4537742752581835, + "reward_std": 0.06936266523553059, + "rewards/SMILES_validity_reward": -0.579166691750288, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.013020833721384406, + "rewards/reasoning_steps_reward": 0.04861111240461469, + "rewards/repetition_penalty_reward": -0.05909780884394422, + "rewards/smiles_len_reward": -0.01345486135687679, + "rewards/tag_count_reward": 0.0013020833721384406, + "step": 50 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.916672706604, + "epoch": 0.0416, + "grad_norm": 8.038966178894043, + "kl": 0.0573883056640625, + "learning_rate": 4.996315410727229e-07, + "loss": 0.4202, + "num_tokens": 5713223.0, + "reward": -0.4641662258654833, + "reward_std": 0.038869212061399594, + "rewards/SMILES_validity_reward": -0.59166669100523, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.0052083334885537624, + "rewards/reasoning_steps_reward": 0.03993055736646056, + "rewards/repetition_penalty_reward": -0.052296683279564604, + "rewards/smiles_len_reward": -0.0052083334885537624, + "rewards/tag_count_reward": 0.001953125058207661, + "step": 52 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.43489933013916, + "epoch": 0.0432, + "grad_norm": 6.251274108886719, + "kl": 0.0938720703125, + "learning_rate": 4.99391012564956e-07, + "loss": 0.4254, + "num_tokens": 5890414.0, + "reward": -0.448165163397789, + "reward_std": 0.09143485396634787, + "rewards/SMILES_validity_reward": -0.57083335891366, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.013020833721384406, + "rewards/reasoning_steps_reward": 0.032118056842591614, + "rewards/repetition_penalty_reward": -0.04787064273841679, + "rewards/smiles_len_reward": -0.011733278282918036, + "rewards/tag_count_reward": 0.0026041667442768812, + "step": 54 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.63281869888306, + "epoch": 0.0448, + "grad_norm": 4.371728897094727, + "kl": 0.1113739013671875, + "learning_rate": 4.990904604932884e-07, + "loss": 0.1543, + "num_tokens": 6076897.0, + "reward": -0.4571216255426407, + "reward_std": 0.060153877711854875, + "rewards/SMILES_validity_reward": -0.583333358168602, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.007812500232830644, + "rewards/reasoning_steps_reward": 0.046875001688022166, + "rewards/repetition_penalty_reward": -0.049048895947635174, + "rewards/smiles_len_reward": -0.011751507059670985, + "rewards/tag_count_reward": 0.0026041666860692203, + "step": 56 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.5156297683716, + "epoch": 0.0464, + "grad_norm": 6.046833038330078, + "kl": 0.1477203369140625, + "learning_rate": 4.987299571947553e-07, + "loss": 0.3914, + "num_tokens": 6274471.0, + "reward": -0.4582570604979992, + "reward_std": 0.05744875143864192, + "rewards/SMILES_validity_reward": -0.583333358168602, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.007812500232830644, + "rewards/reasoning_steps_reward": 0.03645833377959207, + "rewards/repetition_penalty_reward": -0.052860176598187536, + "rewards/smiles_len_reward": -0.0075757576851174235, + "rewards/tag_count_reward": 0.0013020833721384406, + "step": 58 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.3307328224182, + "epoch": 0.048, + "grad_norm": 3.264477014541626, + "kl": 0.1656341552734375, + "learning_rate": 4.983095894354857e-07, + "loss": 0.3925, + "num_tokens": 6457382.0, + "reward": -0.4511374644935131, + "reward_std": 0.08122232253663242, + "rewards/SMILES_validity_reward": -0.575000025331974, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.010416666977107525, + "rewards/reasoning_steps_reward": 0.04079861118225381, + "rewards/repetition_penalty_reward": -0.0453608765383251, + "rewards/smiles_len_reward": -0.014365261304192245, + "rewards/tag_count_reward": 0.0013020833721384406, + "step": 60 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.14323806762695, + "epoch": 0.0496, + "grad_norm": 4.010850429534912, + "kl": 0.195281982421875, + "learning_rate": 4.978294583898195e-07, + "loss": 0.6237, + "num_tokens": 6653661.0, + "reward": -0.45640311203897, + "reward_std": 0.07789475272875279, + "rewards/SMILES_validity_reward": -0.5804687775671482, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.013020833721384406, + "rewards/reasoning_steps_reward": 0.03211805649334565, + "rewards/repetition_penalty_reward": -0.0569572810200043, + "rewards/smiles_len_reward": -0.016927083604969084, + "rewards/tag_count_reward": 0.001953125058207661, + "step": 62 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.16927552223206, + "epoch": 0.0512, + "grad_norm": 5.870493412017822, + "kl": 0.6861572265625, + "learning_rate": 4.972896796159568e-07, + "loss": 0.1471, + "num_tokens": 6828830.0, + "reward": -0.4545394852757454, + "reward_std": 0.06280390484607778, + "rewards/SMILES_validity_reward": -0.583333358168602, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.013020833721384406, + "rewards/reasoning_steps_reward": 0.04687500145519152, + "rewards/repetition_penalty_reward": -0.04258153022965416, + "rewards/smiles_len_reward": -0.009324597000158974, + "rewards/tag_count_reward": 0.003906249941792339, + "step": 64 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.05990076065063, + "epoch": 0.0528, + "grad_norm": 9.24417781829834, + "kl": 1.529541015625, + "learning_rate": 4.966903830281448e-07, + "loss": 0.5681, + "num_tokens": 6991669.0, + "reward": -0.46538511849939823, + "reward_std": 0.03016815922455862, + "rewards/SMILES_validity_reward": -0.5958333574235439, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.010416666977107525, + "rewards/reasoning_steps_reward": 0.026909722830168903, + "rewards/repetition_penalty_reward": -0.03530300591955893, + "rewards/smiles_len_reward": -0.007828538189642131, + "rewards/tag_count_reward": 0.001953125058207661, + "step": 66 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.63021278381348, + "epoch": 0.0544, + "grad_norm": 7.270186901092529, + "kl": 5.887939453125, + "learning_rate": 4.960317128654107e-07, + "loss": 0.4877, + "num_tokens": 7152807.0, + "reward": -0.44715421833097935, + "reward_std": 0.07873522458248772, + "rewards/SMILES_validity_reward": -0.575000025331974, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.02343750069849193, + "rewards/reasoning_steps_reward": 0.029513889516238123, + "rewards/repetition_penalty_reward": -0.03269748640013859, + "rewards/smiles_len_reward": -0.015625000349245965, + "rewards/tag_count_reward": 0.001953125058207661, + "step": 68 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.88281559944153, + "epoch": 0.056, + "grad_norm": 21.904245376586914, + "kl": 13.5673828125, + "learning_rate": 4.953138276568461e-07, + "loss": 0.3171, + "num_tokens": 7296378.0, + "reward": -0.45215606689453125, + "reward_std": 0.06071481961407699, + "rewards/SMILES_validity_reward": -0.579166691750288, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.010416666977107525, + "rewards/reasoning_steps_reward": 0.03559027874143794, + "rewards/repetition_penalty_reward": -0.023167401901446283, + "rewards/smiles_len_reward": -0.011718750349245965, + "rewards/tag_count_reward": 0.0006510416860692203, + "step": 70 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.77083659172058, + "epoch": 0.0576, + "grad_norm": 9.583633422851562, + "kl": 14.696044921875, + "learning_rate": 4.945369001834514e-07, + "loss": 0.2378, + "num_tokens": 7454498.0, + "reward": -0.44742128998041153, + "reward_std": 0.0873336758086225, + "rewards/SMILES_validity_reward": -0.57083335891366, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.013020833721384406, + "rewards/reasoning_steps_reward": 0.026909722946584225, + "rewards/repetition_penalty_reward": -0.030680728232255206, + "rewards/smiles_len_reward": -0.014322917093522847, + "rewards/tag_count_reward": 0.0006510416860692203, + "step": 72 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.04167103767395, + "epoch": 0.0592, + "grad_norm": 23.011547088623047, + "kl": 23.6376953125, + "learning_rate": 4.937011174365514e-07, + "loss": 0.2045, + "num_tokens": 7620786.0, + "reward": -0.4481741450726986, + "reward_std": 0.07758373257820494, + "rewards/SMILES_validity_reward": -0.575000025331974, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.01822916720993817, + "rewards/reasoning_steps_reward": 0.040798612229991704, + "rewards/repetition_penalty_reward": -0.03660338817280717, + "rewards/smiles_len_reward": -0.015625000349245965, + "rewards/tag_count_reward": 0.0, + "step": 74 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.41146230697632, + "epoch": 0.0608, + "grad_norm": 6.287862777709961, + "kl": 8.23095703125, + "learning_rate": 4.928066805727901e-07, + "loss": 0.2391, + "num_tokens": 7774544.0, + "reward": -0.45081394724547863, + "reward_std": 0.07222701624414185, + "rewards/SMILES_validity_reward": -0.579166691750288, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.02690972329583019, + "rewards/repetition_penalty_reward": -0.03107271766930353, + "rewards/smiles_len_reward": -0.01231060631107539, + "rewards/tag_count_reward": 0.0, + "step": 76 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.8411512374878, + "epoch": 0.0624, + "grad_norm": 6.666407585144043, + "kl": 7.9921875, + "learning_rate": 4.918538048657159e-07, + "loss": 0.45, + "num_tokens": 7936915.0, + "reward": -0.4527244567871094, + "reward_std": 0.06426695434493013, + "rewards/SMILES_validity_reward": -0.579166691750288, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.013020833721384406, + "rewards/reasoning_steps_reward": 0.031250000873114914, + "rewards/repetition_penalty_reward": -0.03297455201391131, + "rewards/smiles_len_reward": -0.013020833721384406, + "rewards/tag_count_reward": 0.0026041666860692203, + "step": 78 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.31771183013916, + "epoch": 0.064, + "grad_norm": 8.43528938293457, + "kl": 5.4541015625, + "learning_rate": 4.908427196539701e-07, + "loss": 0.146, + "num_tokens": 8086797.0, + "reward": -0.44427087157964706, + "reward_std": 0.08865486389186117, + "rewards/SMILES_validity_reward": -0.575000025331974, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.033854167675599456, + "rewards/reasoning_steps_reward": 0.026041667093522847, + "rewards/repetition_penalty_reward": -0.026433447477757, + "rewards/smiles_len_reward": -0.01953125058207661, + "rewards/tag_count_reward": 0.0006510416860692203, + "step": 80 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.90104603767395, + "epoch": 0.0656, + "grad_norm": 8.4097900390625, + "kl": 4.1484375, + "learning_rate": 4.897736682860885e-07, + "loss": 0.192, + "num_tokens": 8247655.0, + "reward": -0.45605068281292915, + "reward_std": 0.059727372688939795, + "rewards/SMILES_validity_reward": -0.587500024586916, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.03906250145519152, + "rewards/repetition_penalty_reward": -0.034038055848213844, + "rewards/smiles_len_reward": -0.01618303614668548, + "rewards/tag_count_reward": 0.0006510416860692203, + "step": 82 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.7239625453949, + "epoch": 0.0672, + "grad_norm": 10.938655853271484, + "kl": 3.68115234375, + "learning_rate": 4.88646908061933e-07, + "loss": 0.0666, + "num_tokens": 8388477.0, + "reward": -0.44183752313256264, + "reward_std": 0.08375674461422022, + "rewards/SMILES_validity_reward": -0.5750000234693289, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.039062500931322575, + "rewards/reasoning_steps_reward": 0.03038194531109184, + "rewards/repetition_penalty_reward": -0.02076093477808172, + "rewards/smiles_len_reward": -0.023439725977368653, + "rewards/tag_count_reward": 0.0032552084303461015, + "step": 84 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.8776068687439, + "epoch": 0.0688, + "grad_norm": 34.703914642333984, + "kl": 6.35546875, + "learning_rate": 4.874627101707643e-07, + "loss": 0.424, + "num_tokens": 8536270.0, + "reward": -0.4417836368083954, + "reward_std": 0.10183166417846223, + "rewards/SMILES_validity_reward": -0.57083335891366, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.03385416744276881, + "rewards/reasoning_steps_reward": 0.019965278392191976, + "rewards/repetition_penalty_reward": -0.028516643127659336, + "rewards/smiles_len_reward": -0.019572260905988514, + "rewards/tag_count_reward": 0.00455729168606922, + "step": 86 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.65365076065063, + "epoch": 0.0704, + "grad_norm": 9.4151029586792, + "kl": 3.0634765625, + "learning_rate": 4.86221359625972e-07, + "loss": 0.1732, + "num_tokens": 8693577.0, + "reward": -0.43013707362115383, + "reward_std": 0.12213981148670428, + "rewards/SMILES_validity_reward": -0.5651041902601719, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.05989583348855376, + "rewards/reasoning_steps_reward": 0.03038194525288418, + "rewards/repetition_penalty_reward": -0.0299287144880509, + "rewards/smiles_len_reward": -0.026434181840159, + "rewards/tag_count_reward": 0.0006510416860692203, + "step": 88 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.5572965145111, + "epoch": 0.072, + "grad_norm": 7.3330078125, + "kl": 2.5065155029296875, + "learning_rate": 4.849231551964771e-07, + "loss": 0.1509, + "num_tokens": 8845855.0, + "reward": -0.432578993961215, + "reward_std": 0.13273732305970043, + "rewards/SMILES_validity_reward": -0.5583333596587181, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.033854167675599456, + "rewards/reasoning_steps_reward": 0.03559027856681496, + "rewards/repetition_penalty_reward": -0.03383599827066064, + "rewards/smiles_len_reward": -0.021425190032459795, + "rewards/tag_count_reward": 0.0006510416860692203, + "step": 90 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.8619828224182, + "epoch": 0.0736, + "grad_norm": 7.545994281768799, + "kl": 2.7392578125, + "learning_rate": 4.835684093348244e-07, + "loss": 0.2127, + "num_tokens": 8997098.0, + "reward": -0.4239292126148939, + "reward_std": 0.1364244522410445, + "rewards/SMILES_validity_reward": -0.5541666932404041, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.05468750069849193, + "rewards/reasoning_steps_reward": 0.031250000989530236, + "rewards/repetition_penalty_reward": -0.028238558385055512, + "rewards/smiles_len_reward": -0.031106452457606792, + "rewards/tag_count_reward": 0.003906250116415322, + "step": 92 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.50000405311584, + "epoch": 0.0752, + "grad_norm": 12.123269081115723, + "kl": 3.155029296875, + "learning_rate": 4.821574481019811e-07, + "loss": 0.2579, + "num_tokens": 9157802.0, + "reward": -0.4132191240787506, + "reward_std": 0.1652318238047883, + "rewards/SMILES_validity_reward": -0.541666692122817, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.06510416837409139, + "rewards/reasoning_steps_reward": 0.028645834245253354, + "rewards/repetition_penalty_reward": -0.029634669452207163, + "rewards/smiles_len_reward": -0.03615117573644966, + "rewards/tag_count_reward": 0.0013020833721384406, + "step": 94 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.7630271911621, + "epoch": 0.0768, + "grad_norm": 10.459805488586426, + "kl": 3.26171875, + "learning_rate": 4.806906110888606e-07, + "loss": 0.2427, + "num_tokens": 9314383.0, + "reward": -0.39828602597117424, + "reward_std": 0.20478773265494965, + "rewards/SMILES_validity_reward": -0.5208333618938923, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.06770833465270698, + "rewards/reasoning_steps_reward": 0.02777777868323028, + "rewards/repetition_penalty_reward": -0.03425826533930376, + "rewards/smiles_len_reward": -0.03888054273556918, + "rewards/tag_count_reward": 0.0052083334303461015, + "step": 96 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.87500381469727, + "epoch": 0.0784, + "grad_norm": 11.642253875732422, + "kl": 3.9642333984375, + "learning_rate": 4.791682513345892e-07, + "loss": 0.2138, + "num_tokens": 9461791.0, + "reward": -0.36608864995650947, + "reward_std": 0.26178658893331885, + "rewards/SMILES_validity_reward": -0.4875000258907676, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.1015625016298145, + "rewards/reasoning_steps_reward": 0.01388888928340748, + "rewards/repetition_penalty_reward": -0.02420425981108565, + "rewards/smiles_len_reward": -0.04796776862349361, + "rewards/tag_count_reward": 0.0052083334885537624, + "step": 98 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.2734419107437, + "epoch": 0.08, + "grad_norm": 11.359210014343262, + "kl": 3.97216796875, + "learning_rate": 4.775907352415367e-07, + "loss": -0.1606, + "num_tokens": 9613576.0, + "reward": -0.3407728634774685, + "reward_std": 0.3080316074192524, + "rewards/SMILES_validity_reward": -0.4625000227242708, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.1223958341870457, + "rewards/reasoning_steps_reward": 0.04253472259733826, + "rewards/repetition_penalty_reward": -0.02961380738997832, + "rewards/smiles_len_reward": -0.058150356577243656, + "rewards/tag_count_reward": 0.007812499941792339, + "step": 100 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.23177433013916, + "epoch": 0.0816, + "grad_norm": 12.774822235107422, + "kl": 4.29443359375, + "learning_rate": 4.759584424871301e-07, + "loss": 0.0665, + "num_tokens": 9758433.0, + "reward": -0.30280456133186817, + "reward_std": 0.37466976934229024, + "rewards/SMILES_validity_reward": -0.4083333518356085, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.1354166658129543, + "rewards/reasoning_steps_reward": 0.01822916720993817, + "rewards/repetition_penalty_reward": -0.02551854120247299, + "rewards/smiles_len_reward": -0.07518413302022964, + "rewards/tag_count_reward": 0.006510416802484542, + "step": 102 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.58854365348816, + "epoch": 0.0832, + "grad_norm": 21.751882553100586, + "kl": 8.2001953125, + "learning_rate": 4.742717659324733e-07, + "loss": 0.0722, + "num_tokens": 9903043.0, + "reward": -0.21644436661154032, + "reward_std": 0.4530010260641575, + "rewards/SMILES_validity_reward": -0.31666667945683, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.21874999906867743, + "rewards/reasoning_steps_reward": 0.014756944845430553, + "rewards/repetition_penalty_reward": -0.026267023400578182, + "rewards/smiles_len_reward": -0.09707507817074656, + "rewards/tag_count_reward": 0.004557291744276881, + "step": 104 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.33333432674408, + "epoch": 0.0848, + "grad_norm": 9.819295883178711, + "kl": 9.8955078125, + "learning_rate": 4.7253111152779233e-07, + "loss": -0.1048, + "num_tokens": 10037571.0, + "reward": -0.25656710658222437, + "reward_std": 0.38735912647098303, + "rewards/SMILES_validity_reward": -0.37916668597608805, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.22135416697710752, + "rewards/reasoning_steps_reward": 0.015625000291038305, + "rewards/repetition_penalty_reward": -0.01776839853846468, + "rewards/smiles_len_reward": -0.0903514064848423, + "rewards/tag_count_reward": 0.0169270834303461, + "step": 106 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.5937523841858, + "epoch": 0.0864, + "grad_norm": 9.594542503356934, + "kl": 8.150390625, + "learning_rate": 4.707368982147317e-07, + "loss": 0.1126, + "num_tokens": 10188327.0, + "reward": -0.19759700493887067, + "reward_std": 0.49018072336912155, + "rewards/SMILES_validity_reward": -0.3083333484828472, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.25781250069849193, + "rewards/reasoning_steps_reward": 0.02170138928340748, + "rewards/repetition_penalty_reward": -0.027601693116594106, + "rewards/smiles_len_reward": -0.09363839635625482, + "rewards/tag_count_reward": 0.008463541802484542, + "step": 108 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.23177528381348, + "epoch": 0.088, + "grad_norm": 100.94241333007812, + "kl": 19.6396484375, + "learning_rate": 4.688895578255227e-07, + "loss": 0.0951, + "num_tokens": 10335488.0, + "reward": -0.058026916813105345, + "reward_std": 0.596280463039875, + "rewards/SMILES_validity_reward": -0.14583334388832253, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.35416666977107525, + "rewards/reasoning_steps_reward": 0.01822916720993817, + "rewards/repetition_penalty_reward": -0.022731273456884082, + "rewards/smiles_len_reward": -0.1311065279878676, + "rewards/tag_count_reward": 0.013671875058207661, + "step": 110 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.6119830608368, + "epoch": 0.0896, + "grad_norm": 12.545600891113281, + "kl": 8.9892578125, + "learning_rate": 4.6698953497905016e-07, + "loss": 0.132, + "num_tokens": 10484715.0, + "reward": 0.05423457216238603, + "reward_std": 0.6229321677237749, + "rewards/SMILES_validity_reward": -0.013802092677603184, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.427083327434957, + "rewards/reasoning_steps_reward": 0.019965278333984315, + "rewards/repetition_penalty_reward": -0.028624614773434587, + "rewards/smiles_len_reward": -0.15316252200864255, + "rewards/tag_count_reward": 0.01953125005820766, + "step": 112 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.6510443687439, + "epoch": 0.0912, + "grad_norm": 11.581257820129395, + "kl": 7.5224609375, + "learning_rate": 4.650372869738414e-07, + "loss": 0.1721, + "num_tokens": 10627045.0, + "reward": -0.007128866913262755, + "reward_std": 0.5819757748395205, + "rewards/SMILES_validity_reward": -0.1005208427086473, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.4166666669771075, + "rewards/reasoning_steps_reward": 0.02343750069849193, + "rewards/repetition_penalty_reward": -0.01863024538033642, + "rewards/smiles_len_reward": -0.14393543626647443, + "rewards/tag_count_reward": 0.021484375349245965, + "step": 114 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.92187976837158, + "epoch": 0.0928, + "grad_norm": 7.184658050537109, + "kl": 6.5302734375, + "learning_rate": 4.630332836780028e-07, + "loss": 0.1909, + "num_tokens": 10798663.0, + "reward": 0.17218821711139753, + "reward_std": 0.6334065981209278, + "rewards/SMILES_validity_reward": 0.11249999608844519, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.5260416707023978, + "rewards/reasoning_steps_reward": 0.028645834128838032, + "rewards/repetition_penalty_reward": -0.039949697558768094, + "rewards/smiles_len_reward": -0.15913273417390883, + "rewards/tag_count_reward": 0.026692708721384406, + "step": 116 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.47656559944153, + "epoch": 0.0944, + "grad_norm": 14.1889009475708, + "kl": 6.49609375, + "learning_rate": 4.609780074161327e-07, + "loss": 0.3119, + "num_tokens": 10962814.0, + "reward": 0.28294606506824493, + "reward_std": 0.6309285741299391, + "rewards/SMILES_validity_reward": 0.2489583333954215, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.6015625055879354, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.032547464230447076, + "rewards/smiles_len_reward": -0.21143039967864752, + "rewards/tag_count_reward": 0.015624999825377017, + "step": 118 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.70573377609253, + "epoch": 0.096, + "grad_norm": 42.98564147949219, + "kl": 24.6396484375, + "learning_rate": 4.588719528532341e-07, + "loss": 0.4959, + "num_tokens": 11115917.0, + "reward": 0.437305249273777, + "reward_std": 0.6359960846602917, + "rewards/SMILES_validity_reward": 0.41953125642612576, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.6875000037252903, + "rewards/reasoning_steps_reward": 0.026909722946584225, + "rewards/repetition_penalty_reward": -0.02915750685497187, + "rewards/smiles_len_reward": -0.15191438651527278, + "rewards/tag_count_reward": 0.02799479162786156, + "step": 120 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.51042151451111, + "epoch": 0.0976, + "grad_norm": 13.0149564743042, + "kl": 10.384765625, + "learning_rate": 4.567156268756593e-07, + "loss": 0.3887, + "num_tokens": 11262417.0, + "reward": 0.42353246640414, + "reward_std": 0.595534335821867, + "rewards/SMILES_validity_reward": 0.39244792703539133, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.7057291679084301, + "rewards/reasoning_steps_reward": 0.015625000465661287, + "rewards/repetition_penalty_reward": -0.025808534323005006, + "rewards/smiles_len_reward": -0.13964920805301517, + "rewards/tag_count_reward": 0.020833333837799728, + "step": 122 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.255211353302, + "epoch": 0.0992, + "grad_norm": 9.103986740112305, + "kl": 36.1201171875, + "learning_rate": 4.5450954846911195e-07, + "loss": 0.4231, + "num_tokens": 11410355.0, + "reward": 0.545548053458333, + "reward_std": 0.5675202906131744, + "rewards/SMILES_validity_reward": 0.5458333436399698, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.7734375037252903, + "rewards/reasoning_steps_reward": 0.015625000465661287, + "rewards/repetition_penalty_reward": -0.025838642206508666, + "rewards/smiles_len_reward": -0.19758821406867355, + "rewards/tag_count_reward": 0.022135417093522847, + "step": 124 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.94531488418579, + "epoch": 0.1008, + "grad_norm": 10.777301788330078, + "kl": 12.24609375, + "learning_rate": 4.5225424859373684e-07, + "loss": 0.3693, + "num_tokens": 11555486.0, + "reward": 0.5835141399875283, + "reward_std": 0.5180866969749331, + "rewards/SMILES_validity_reward": 0.5916666835546494, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.7656250018626451, + "rewards/reasoning_steps_reward": 0.00954861135687679, + "rewards/repetition_penalty_reward": -0.02219348722428549, + "rewards/smiles_len_reward": -0.1044283655937761, + "rewards/tag_count_reward": 0.013671875349245965, + "step": 126 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.60416841506958, + "epoch": 0.1024, + "grad_norm": 30.928424835205078, + "kl": 55.1669921875, + "learning_rate": 4.4995027005632896e-07, + "loss": 0.3305, + "num_tokens": 11694342.0, + "reward": 0.5743140410631895, + "reward_std": 0.5184614229947329, + "rewards/SMILES_validity_reward": 0.5606770901940763, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8125000037252903, + "rewards/reasoning_steps_reward": 0.006076389050576836, + "rewards/repetition_penalty_reward": -0.015417461221659323, + "rewards/smiles_len_reward": -0.12278014622279443, + "rewards/tag_count_reward": 0.013020833546761423, + "step": 128 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.28646278381348, + "epoch": 0.104, + "grad_norm": 13111.33984375, + "kl": 1170.328125, + "learning_rate": 4.475981673796898e-07, + "loss": 1.5062, + "num_tokens": 11844212.0, + "reward": 0.6496439315378666, + "reward_std": 0.5050632283091545, + "rewards/SMILES_validity_reward": 0.6666666734963655, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8177083320915699, + "rewards/reasoning_steps_reward": 0.01822916720993817, + "rewards/repetition_penalty_reward": -0.023118784243706614, + "rewards/smiles_len_reward": -0.1405994631932117, + "rewards/tag_count_reward": 0.02213541668606922, + "step": 130 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.26823282241821, + "epoch": 0.1056, + "grad_norm": 12.344482421875, + "kl": 18.35546875, + "learning_rate": 4.451985066691648e-07, + "loss": 0.415, + "num_tokens": 11981787.0, + "reward": 0.5884968402533559, + "reward_std": 0.5006589200347662, + "rewards/SMILES_validity_reward": 0.5854166727513075, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.017361111589707434, + "rewards/repetition_penalty_reward": -0.01754250284648151, + "rewards/smiles_len_reward": -0.16914823453407735, + "rewards/tag_count_reward": 0.018880208721384406, + "step": 132 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.96875262260437, + "epoch": 0.1072, + "grad_norm": 21.06283950805664, + "kl": 43.158203125, + "learning_rate": 4.4275186547639267e-07, + "loss": 0.2883, + "num_tokens": 12112719.0, + "reward": 0.624729085713625, + "reward_std": 0.5171774514019489, + "rewards/SMILES_validity_reward": 0.6250000149011612, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8177083432674408, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.01814899359305855, + "rewards/smiles_len_reward": -0.09458748006727546, + "rewards/tag_count_reward": 0.021484375174622983, + "step": 134 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.87500214576721, + "epoch": 0.1088, + "grad_norm": 15.506207466125488, + "kl": 16.0703125, + "learning_rate": 4.4025883266030014e-07, + "loss": 0.3161, + "num_tokens": 12247071.0, + "reward": 0.6261179409921169, + "reward_std": 0.5333305615931749, + "rewards/SMILES_validity_reward": 0.6458333414047956, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.7968749925494194, + "rewards/reasoning_steps_reward": 0.01388888928340748, + "rewards/repetition_penalty_reward": -0.019531703477696283, + "rewards/smiles_len_reward": -0.17263225640635937, + "rewards/tag_count_reward": 0.027994792035315186, + "step": 136 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.94010603427887, + "epoch": 0.1104, + "grad_norm": 43.307518005371094, + "kl": 30.0234375, + "learning_rate": 4.377200082453748e-07, + "loss": 0.2749, + "num_tokens": 12382216.0, + "reward": 0.6953225135803223, + "reward_std": 0.4491841895505786, + "rewards/SMILES_validity_reward": 0.7208333406597376, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8411458320915699, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.013212446501711383, + "rewards/smiles_len_reward": -0.1262718337820843, + "rewards/tag_count_reward": 0.015625000407453626, + "step": 138 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.95573198795319, + "epoch": 0.112, + "grad_norm": 11.107709884643555, + "kl": 17.333984375, + "learning_rate": 4.3513600327725117e-07, + "loss": 0.2443, + "num_tokens": 12520055.0, + "reward": 0.6400888189673424, + "reward_std": 0.5148144848644733, + "rewards/SMILES_validity_reward": 0.6583333387970924, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.006944444612599909, + "rewards/repetition_penalty_reward": -0.01866041096400295, + "rewards/smiles_len_reward": -0.15211048838682473, + "rewards/tag_count_reward": 0.018880208837799728, + "step": 140 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.8984397649765, + "epoch": 0.1136, + "grad_norm": 15.497629165649414, + "kl": 12.576171875, + "learning_rate": 4.3250743967564364e-07, + "loss": 0.3195, + "num_tokens": 12658640.0, + "reward": 0.6857658997178078, + "reward_std": 0.4671833934262395, + "rewards/SMILES_validity_reward": 0.7041666712611914, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8359375, + "rewards/reasoning_steps_reward": 0.006944444612599909, + "rewards/repetition_penalty_reward": -0.012855285956902662, + "rewards/smiles_len_reward": -0.0961968683404848, + "rewards/tag_count_reward": 0.022786458488553762, + "step": 142 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.03906536102295, + "epoch": 0.1152, + "grad_norm": 752693.875, + "kl": 64275.8251953125, + "learning_rate": 4.2983495008466273e-07, + "loss": 64.4247, + "num_tokens": 12793439.0, + "reward": 0.6533464230597019, + "reward_std": 0.49899647012352943, + "rewards/SMILES_validity_reward": 0.658333346247673, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8489583358168602, + "rewards/reasoning_steps_reward": 0.0026041667442768812, + "rewards/repetition_penalty_reward": -0.014190958245308138, + "rewards/smiles_len_reward": -0.13880437827901915, + "rewards/tag_count_reward": 0.028645833546761423, + "step": 144 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.64583575725555, + "epoch": 0.1168, + "grad_norm": 16.18448829650879, + "kl": 19.9951171875, + "learning_rate": 4.2711917772054997e-07, + "loss": 0.2634, + "num_tokens": 12924631.0, + "reward": 0.7568464614450932, + "reward_std": 0.4012842336669564, + "rewards/SMILES_validity_reward": 0.7791666649281979, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8723958395421505, + "rewards/reasoning_steps_reward": 0.01388888928340748, + "rewards/repetition_penalty_reward": -0.01767248242686037, + "rewards/smiles_len_reward": -0.018638497567735612, + "rewards/tag_count_reward": 0.01953125005820766, + "step": 146 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.21614944934845, + "epoch": 0.1184, + "grad_norm": 31.727739334106445, + "kl": 22.85546875, + "learning_rate": 4.2436077621686784e-07, + "loss": 0.2287, + "num_tokens": 13054890.0, + "reward": 0.6930093914270401, + "reward_std": 0.41775880940258503, + "rewards/SMILES_validity_reward": 0.6924479249864817, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8697916679084301, + "rewards/reasoning_steps_reward": 0.004340277868323028, + "rewards/repetition_penalty_reward": -0.014475565858447226, + "rewards/smiles_len_reward": -0.041021980927325785, + "rewards/tag_count_reward": 0.02473958331393078, + "step": 148 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.81771111488342, + "epoch": 0.12, + "grad_norm": 19.164730072021484, + "kl": 52.619140625, + "learning_rate": 4.2156040946718343e-07, + "loss": 0.5484, + "num_tokens": 13195364.0, + "reward": 0.6934168599545956, + "reward_std": 0.45168319437652826, + "rewards/SMILES_validity_reward": 0.7140625007450581, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8463541679084301, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.02252123405196471, + "rewards/smiles_len_reward": -0.12117591802962124, + "rewards/tag_count_reward": 0.029947916918899864, + "step": 150 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.8828147649765, + "epoch": 0.1216, + "grad_norm": 14.251289367675781, + "kl": 23.25, + "learning_rate": 4.187187514652819e-07, + "loss": 0.282, + "num_tokens": 13323575.0, + "reward": 0.7740835659205914, + "reward_std": 0.40268346946686506, + "rewards/SMILES_validity_reward": 0.7986979261040688, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8984374962747097, + "rewards/reasoning_steps_reward": 0.011284722539130598, + "rewards/repetition_penalty_reward": -0.010957858150504762, + "rewards/smiles_len_reward": -0.06391943350899965, + "rewards/tag_count_reward": 0.018229166860692203, + "step": 152 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.5494817495346, + "epoch": 0.1232, + "grad_norm": 103.39234161376953, + "kl": 59.921875, + "learning_rate": 4.158364861429493e-07, + "loss": 0.3783, + "num_tokens": 13457034.0, + "reward": 0.658618837594986, + "reward_std": 0.5097680818289518, + "rewards/SMILES_validity_reward": 0.6708333306014538, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8463541679084301, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.015188703837338835, + "rewards/smiles_len_reward": -0.1719313338799111, + "rewards/tag_count_reward": 0.02799479168606922, + "step": 154 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.07552409172058, + "epoch": 0.1248, + "grad_norm": 460.58990478515625, + "kl": 540.94921875, + "learning_rate": 4.129143072053638e-07, + "loss": 0.8874, + "num_tokens": 13596839.0, + "reward": 0.6298879142850637, + "reward_std": 0.5001714690588415, + "rewards/SMILES_validity_reward": 0.6023437635740265, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.856770820915699, + "rewards/reasoning_steps_reward": 0.0190972225391306, + "rewards/repetition_penalty_reward": -0.01785970525816083, + "rewards/smiles_len_reward": -0.013817999046295881, + "rewards/tag_count_reward": 0.024739583488553762, + "step": 156 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.27604413032532, + "epoch": 0.1264, + "grad_norm": 72.43608093261719, + "kl": 25.7138671875, + "learning_rate": 4.0995291796413365e-07, + "loss": 0.2613, + "num_tokens": 13720209.0, + "reward": 0.7709857225418091, + "reward_std": 0.3671606592833996, + "rewards/SMILES_validity_reward": 0.804166667163372, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8854166716337204, + "rewards/reasoning_steps_reward": 0.0026041667442768812, + "rewards/repetition_penalty_reward": -0.009442822036362486, + "rewards/smiles_len_reward": -0.0869512411300093, + "rewards/tag_count_reward": 0.018229167035315186, + "step": 158 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.42969000339508, + "epoch": 0.128, + "grad_norm": 15.941231727600098, + "kl": 17.091796875, + "learning_rate": 4.0695303116802467e-07, + "loss": 0.2727, + "num_tokens": 13845174.0, + "reward": 0.7843712531030178, + "reward_std": 0.3681061351671815, + "rewards/SMILES_validity_reward": 0.8041666708886623, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9140624962747097, + "rewards/reasoning_steps_reward": 0.004340277868323028, + "rewards/repetition_penalty_reward": -0.011173382534252596, + "rewards/smiles_len_reward": -0.03773681813618168, + "rewards/tag_count_reward": 0.016927083546761423, + "step": 160 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.86198151111603, + "epoch": 0.1296, + "grad_norm": 21.729921340942383, + "kl": 19.8984375, + "learning_rate": 4.039153688314145e-07, + "loss": 0.2752, + "num_tokens": 13974529.0, + "reward": 0.7434104010462761, + "reward_std": 0.4333901312202215, + "rewards/SMILES_validity_reward": 0.7666666693985462, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8958333283662796, + "rewards/reasoning_steps_reward": 0.013020833721384406, + "rewards/repetition_penalty_reward": -0.011836088739073602, + "rewards/smiles_len_reward": -0.14598823036067188, + "rewards/tag_count_reward": 0.024739583663176745, + "step": 162 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.11719036102295, + "epoch": 0.1312, + "grad_norm": 42.10722351074219, + "kl": 109.4140625, + "learning_rate": 4.008406620605189e-07, + "loss": 0.4418, + "num_tokens": 14107822.0, + "reward": 0.7064198963344097, + "reward_std": 0.45633639767766, + "rewards/SMILES_validity_reward": 0.7291666679084301, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8515625, + "rewards/reasoning_steps_reward": 0.0164930559694767, + "rewards/repetition_penalty_reward": -0.014415540204936406, + "rewards/smiles_len_reward": -0.11886932025663555, + "rewards/tag_count_reward": 0.02213541668606922, + "step": 164 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.77864742279053, + "epoch": 0.1328, + "grad_norm": 220.8507537841797, + "kl": 65.47265625, + "learning_rate": 3.977296508774278e-07, + "loss": 0.3117, + "num_tokens": 14229465.0, + "reward": 0.7945839650928974, + "reward_std": 0.3342463602311909, + "rewards/SMILES_validity_reward": 0.8333333358168602, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8932291679084301, + "rewards/reasoning_steps_reward": 0.008680555794853717, + "rewards/repetition_penalty_reward": -0.006365777313476428, + "rewards/smiles_len_reward": -0.09033061633817852, + "rewards/tag_count_reward": 0.020833333488553762, + "step": 166 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.78646194934845, + "epoch": 0.1344, + "grad_norm": 13.82856559753418, + "kl": 27.58203125, + "learning_rate": 3.945830840419966e-07, + "loss": 0.3424, + "num_tokens": 14365319.0, + "reward": 0.7539720423519611, + "reward_std": 0.42417754977941513, + "rewards/SMILES_validity_reward": 0.7723958343267441, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.890625, + "rewards/reasoning_steps_reward": 0.013020833721384406, + "rewards/repetition_penalty_reward": -0.009425983691471629, + "rewards/smiles_len_reward": -0.0685632707318291, + "rewards/tag_count_reward": 0.026041667093522847, + "step": 168 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.54427301883698, + "epoch": 0.136, + "grad_norm": 31.95520782470703, + "kl": 27.662109375, + "learning_rate": 3.9140171887163466e-07, + "loss": 0.2672, + "num_tokens": 14494168.0, + "reward": 0.7264284733682871, + "reward_std": 0.4606617968529463, + "rewards/SMILES_validity_reward": 0.7416666690260172, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8749999925494194, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.011190719229489332, + "rewards/smiles_len_reward": -0.07765073655173182, + "rewards/tag_count_reward": 0.026041666453238577, + "step": 170 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.70573210716248, + "epoch": 0.1376, + "grad_norm": 111.74293518066406, + "kl": 73.783203125, + "learning_rate": 3.8818632105903315e-07, + "loss": 0.351, + "num_tokens": 14625767.0, + "reward": 0.7140463925898075, + "reward_std": 0.3952889391221106, + "rewards/SMILES_validity_reward": 0.7299479208886623, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8723958283662796, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.011792463701567613, + "rewards/smiles_len_reward": -0.10516674036625773, + "rewards/tag_count_reward": 0.020182292093522847, + "step": 172 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.22135591506958, + "epoch": 0.1392, + "grad_norm": 31.038663864135742, + "kl": 34.14453125, + "learning_rate": 3.849376644878782e-07, + "loss": 0.3066, + "num_tokens": 14747964.0, + "reward": 0.7357414551079273, + "reward_std": 0.4182268213480711, + "rewards/SMILES_validity_reward": 0.7749999836087227, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8541666679084301, + "rewards/reasoning_steps_reward": 0.013020833721384406, + "rewards/repetition_penalty_reward": -0.008280414986074902, + "rewards/smiles_len_reward": -0.16086881840601563, + "rewards/tag_count_reward": 0.026041666395030916, + "step": 174 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.81250166893005, + "epoch": 0.1408, + "grad_norm": 17.010215759277344, + "kl": 70.134765625, + "learning_rate": 3.8165653104659185e-07, + "loss": 0.3149, + "num_tokens": 14881908.0, + "reward": 0.7312784865498543, + "reward_std": 0.42725326027721167, + "rewards/SMILES_validity_reward": 0.7666666693985462, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8489583358168602, + "rewards/reasoning_steps_reward": 0.015625000465661287, + "rewards/repetition_penalty_reward": -0.0110986630897969, + "rewards/smiles_len_reward": -0.13132599194068462, + "rewards/tag_count_reward": 0.026041667151730508, + "step": 176 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.02083623409271, + "epoch": 0.1424, + "grad_norm": 14.404475212097168, + "kl": 24.978515625, + "learning_rate": 3.783437104401469e-07, + "loss": 0.4138, + "num_tokens": 15005564.0, + "reward": 0.749178359284997, + "reward_std": 0.3878423860296607, + "rewards/SMILES_validity_reward": 0.7875000014901161, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8697916679084301, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.0074081632774323225, + "rewards/smiles_len_reward": -0.14612202369607985, + "rewards/tag_count_reward": 0.02343749994179234, + "step": 178 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.119793176651, + "epoch": 0.144, + "grad_norm": 11.405951499938965, + "kl": 19.92578125, + "learning_rate": 3.75e-07, + "loss": 0.3072, + "num_tokens": 15133482.0, + "reward": 0.7756290249526501, + "reward_std": 0.37124256137758493, + "rewards/SMILES_validity_reward": 0.8140624910593033, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.890625, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.010999515707226237, + "rewards/smiles_len_reward": -0.13752924266736954, + "rewards/tag_count_reward": 0.02669270831393078, + "step": 180 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.77864706516266, + "epoch": 0.1456, + "grad_norm": 48.71555709838867, + "kl": 52.099609375, + "learning_rate": 3.7162620449218993e-07, + "loss": 0.2348, + "num_tokens": 15262421.0, + "reward": 0.7475567385554314, + "reward_std": 0.4051980022341013, + "rewards/SMILES_validity_reward": 0.7708333358168602, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8880208395421505, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.015756858985696454, + "rewards/smiles_len_reward": -0.0959165629465133, + "rewards/tag_count_reward": 0.019531250349245965, + "step": 182 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.70573079586029, + "epoch": 0.1472, + "grad_norm": 29.34798240661621, + "kl": 37.0546875, + "learning_rate": 3.682231359236459e-07, + "loss": 0.206, + "num_tokens": 15384036.0, + "reward": 0.7746610157191753, + "reward_std": 0.37258596147876233, + "rewards/SMILES_validity_reward": 0.8041666634380817, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8854166604578495, + "rewards/reasoning_steps_reward": 0.006944444612599909, + "rewards/repetition_penalty_reward": -0.008209791714762105, + "rewards/smiles_len_reward": -0.06293306592851877, + "rewards/tag_count_reward": 0.025390625349245965, + "step": 184 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.895836353302, + "epoch": 0.1488, + "grad_norm": 11.991008758544922, + "kl": 19.599609375, + "learning_rate": 3.647916133467529e-07, + "loss": 0.4281, + "num_tokens": 15518396.0, + "reward": 0.7007387336343527, + "reward_std": 0.4086402766406536, + "rewards/SMILES_validity_reward": 0.7023437605239451, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8880208320915699, + "rewards/reasoning_steps_reward": 0.016493055794853717, + "rewards/repetition_penalty_reward": -0.022340198534948286, + "rewards/smiles_len_reward": -0.09132412448525429, + "rewards/tag_count_reward": 0.024088542093522847, + "step": 186 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.02083575725555, + "epoch": 0.1504, + "grad_norm": 13.47666072845459, + "kl": 14.310546875, + "learning_rate": 3.6133246266222233e-07, + "loss": 0.1876, + "num_tokens": 15643204.0, + "reward": 0.7891622483730316, + "reward_std": 0.3716621574712917, + "rewards/SMILES_validity_reward": 0.8083333224058151, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9036458283662796, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.01272648310259683, + "rewards/smiles_len_reward": 0.004477886424865574, + "rewards/tag_count_reward": 0.022786458604969084, + "step": 188 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.91406571865082, + "epoch": 0.152, + "grad_norm": 32.838348388671875, + "kl": 41.697265625, + "learning_rate": 3.5784651642031337e-07, + "loss": 0.4292, + "num_tokens": 15781795.0, + "reward": 0.7256705239415169, + "reward_std": 0.43252733163535595, + "rewards/SMILES_validity_reward": 0.7500000074505806, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8697916679084301, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.014169132729875855, + "rewards/smiles_len_reward": -0.12496000179089606, + "rewards/tag_count_reward": 0.028645833488553762, + "step": 190 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.00000166893005, + "epoch": 0.1536, + "grad_norm": 104.36994171142578, + "kl": 43.689453125, + "learning_rate": 3.5433461362045447e-07, + "loss": 0.2651, + "num_tokens": 15909283.0, + "reward": 0.6896458622068167, + "reward_std": 0.4807543084025383, + "rewards/SMILES_validity_reward": 0.6973958415910602, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8906250037252903, + "rewards/reasoning_steps_reward": 0.0052083334885537624, + "rewards/repetition_penalty_reward": -0.01001048629404977, + "rewards/smiles_len_reward": -0.1771257909713313, + "rewards/tag_count_reward": 0.02473958331393078, + "step": 192 + }, + { + "clip_ratio": 0.0, + "completion_length": 45.68489694595337, + "epoch": 0.1552, + "grad_norm": 10673.4697265625, + "kl": 2005.744140625, + "learning_rate": 3.507975995093125e-07, + "loss": 2.1771, + "num_tokens": 16025130.0, + "reward": 0.757711049169302, + "reward_std": 0.42240715958178043, + "rewards/SMILES_validity_reward": 0.7874999903142452, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8932291679084301, + "rewards/reasoning_steps_reward": 0.006944444612599909, + "rewards/repetition_penalty_reward": -0.005861769714101683, + "rewards/smiles_len_reward": -0.1369942625751719, + "rewards/tag_count_reward": 0.020833333663176745, + "step": 194 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.72916793823242, + "epoch": 0.1568, + "grad_norm": 173.10513305664062, + "kl": 54.318359375, + "learning_rate": 3.472363253773584e-07, + "loss": 0.2984, + "num_tokens": 16148674.0, + "reward": 0.752436138689518, + "reward_std": 0.4110519029200077, + "rewards/SMILES_validity_reward": 0.7916666641831398, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8697916641831398, + "rewards/reasoning_steps_reward": 0.015625000465661287, + "rewards/repetition_penalty_reward": -0.010170710236707237, + "rewards/smiles_len_reward": -0.16338579345028847, + "rewards/tag_count_reward": 0.031250000407453626, + "step": 196 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.609376072883606, + "epoch": 0.1584, + "grad_norm": 76.87126159667969, + "kl": 45.49609375, + "learning_rate": 3.43651648353978e-07, + "loss": 0.2331, + "num_tokens": 16270252.0, + "reward": 0.7394071221351624, + "reward_std": 0.40953583153896034, + "rewards/SMILES_validity_reward": 0.7583333291113377, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9010416641831398, + "rewards/reasoning_steps_reward": 0.0026041667442768812, + "rewards/repetition_penalty_reward": -0.00891629023681162, + "rewards/smiles_len_reward": -0.13125849929929245, + "rewards/tag_count_reward": 0.020182292035315186, + "step": 198 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.92968940734863, + "epoch": 0.16, + "grad_norm": 247.7176513671875, + "kl": 203.548828125, + "learning_rate": 3.400444312011776e-07, + "loss": 0.5934, + "num_tokens": 16397329.0, + "reward": 0.7175582200288773, + "reward_std": 0.4524127524346113, + "rewards/SMILES_validity_reward": 0.7416666746139526, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8567708283662796, + "rewards/reasoning_steps_reward": 0.014756944845430553, + "rewards/repetition_penalty_reward": -0.0132746260278509, + "rewards/smiles_len_reward": -0.1087138393195346, + "rewards/tag_count_reward": 0.02083333337213844, + "step": 200 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.00260639190674, + "epoch": 0.1616, + "grad_norm": 12.647216796875, + "kl": 16.201171875, + "learning_rate": 3.3641554210593414e-07, + "loss": 0.3755, + "num_tokens": 16529810.0, + "reward": 0.7455911412835121, + "reward_std": 0.41086752247065306, + "rewards/SMILES_validity_reward": 0.7708333283662796, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8828125, + "rewards/reasoning_steps_reward": 0.01649305602768436, + "rewards/repetition_penalty_reward": -0.014058286946237786, + "rewards/smiles_len_reward": -0.1200922247953713, + "rewards/tag_count_reward": 0.02929687494179234, + "step": 202 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.34896171092987, + "epoch": 0.1632, + "grad_norm": 36.01606750488281, + "kl": 30.4951171875, + "learning_rate": 3.327658544712395e-07, + "loss": 0.3301, + "num_tokens": 16660504.0, + "reward": 0.7541028931736946, + "reward_std": 0.3762020096182823, + "rewards/SMILES_validity_reward": 0.7666666731238365, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8984375074505806, + "rewards/reasoning_steps_reward": 0.0052083334885537624, + "rewards/repetition_penalty_reward": -0.015571234718663618, + "rewards/smiles_len_reward": -0.03337510232813656, + "rewards/tag_count_reward": 0.022786458663176745, + "step": 204 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.80729401111603, + "epoch": 0.1648, + "grad_norm": 11.157661437988281, + "kl": 41.3486328125, + "learning_rate": 3.290962467058891e-07, + "loss": 0.2184, + "num_tokens": 16785998.0, + "reward": 0.7728025019168854, + "reward_std": 0.35571749578230083, + "rewards/SMILES_validity_reward": 0.7958333231508732, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9166666679084301, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.008943471177190077, + "rewards/smiles_len_reward": -0.10535037610679865, + "rewards/tag_count_reward": 0.013671875291038305, + "step": 206 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.96875286102295, + "epoch": 0.1664, + "grad_norm": 13.703250885009766, + "kl": 242.390625, + "learning_rate": 3.2540760201306637e-07, + "loss": 0.6453, + "num_tokens": 16914242.0, + "reward": 0.6993302181363106, + "reward_std": 0.4618812408298254, + "rewards/SMILES_validity_reward": 0.7166666649281979, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8776041641831398, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.008216387410357129, + "rewards/smiles_len_reward": -0.17790959577541798, + "rewards/tag_count_reward": 0.022135416860692203, + "step": 208 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.28645968437195, + "epoch": 0.168, + "grad_norm": 15.79922866821289, + "kl": 40.01171875, + "learning_rate": 3.2170080817777257e-07, + "loss": 0.3946, + "num_tokens": 17045680.0, + "reward": 0.7615203447639942, + "reward_std": 0.39564547780901194, + "rewards/SMILES_validity_reward": 0.802864570170641, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.859375, + "rewards/reasoning_steps_reward": 0.01388888928340748, + "rewards/repetition_penalty_reward": -0.016861008152773138, + "rewards/smiles_len_reward": -0.09562780696433038, + "rewards/tag_count_reward": 0.015625000174622983, + "step": 210 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.33333587646484, + "epoch": 0.1696, + "grad_norm": 20.75408172607422, + "kl": 50.240234375, + "learning_rate": 3.1797675735315454e-07, + "loss": 0.3075, + "num_tokens": 17170224.0, + "reward": 0.7155029028654099, + "reward_std": 0.456816378980875, + "rewards/SMILES_validity_reward": 0.7375000007450581, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8645833358168602, + "rewards/reasoning_steps_reward": 0.01822916720993817, + "rewards/repetition_penalty_reward": -0.009517593272903468, + "rewards/smiles_len_reward": -0.132069039857015, + "rewards/tag_count_reward": 0.022135417035315186, + "step": 212 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.45833611488342, + "epoch": 0.1712, + "grad_norm": 40.99763488769531, + "kl": 36.8671875, + "learning_rate": 3.142363458457805e-07, + "loss": 0.2656, + "num_tokens": 17292512.0, + "reward": 0.734221912920475, + "reward_std": 0.4476381931453943, + "rewards/SMILES_validity_reward": 0.7541666738688946, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8776041679084301, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.009483299305429682, + "rewards/smiles_len_reward": -0.10389782977290452, + "rewards/tag_count_reward": 0.035807291977107525, + "step": 214 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.15885698795319, + "epoch": 0.1728, + "grad_norm": 34.548885345458984, + "kl": 26.62890625, + "learning_rate": 3.104804738999169e-07, + "loss": 0.4481, + "num_tokens": 17426589.0, + "reward": 0.6837027445435524, + "reward_std": 0.47833400405943394, + "rewards/SMILES_validity_reward": 0.7000000104308128, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8515624962747097, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.013724267948418856, + "rewards/smiles_len_reward": -0.12997856584843248, + "rewards/tag_count_reward": 0.015625000291038305, + "step": 216 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.51823091506958, + "epoch": 0.1744, + "grad_norm": 26.471229553222656, + "kl": 275.8046875, + "learning_rate": 3.067100454808567e-07, + "loss": 0.5704, + "num_tokens": 17551972.0, + "reward": 0.7661529034376144, + "reward_std": 0.41659063287079334, + "rewards/SMILES_validity_reward": 0.7916666604578495, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8854166679084301, + "rewards/reasoning_steps_reward": 0.013020833721384406, + "rewards/repetition_penalty_reward": -0.011230084801354678, + "rewards/smiles_len_reward": -0.059663921245373785, + "rewards/tag_count_reward": 0.021484375349245965, + "step": 218 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.42968988418579, + "epoch": 0.176, + "grad_norm": 13.7344970703125, + "kl": 37.447265625, + "learning_rate": 3.029259680573527e-07, + "loss": 0.4229, + "num_tokens": 17681545.0, + "reward": 0.7183681428432465, + "reward_std": 0.3968039182946086, + "rewards/SMILES_validity_reward": 0.7177083268761635, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9088541641831398, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.007644086541404249, + "rewards/smiles_len_reward": -0.08979546726914123, + "rewards/tag_count_reward": 0.0227864584303461, + "step": 220 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.84896063804626, + "epoch": 0.1776, + "grad_norm": 19.092857360839844, + "kl": 29.484375, + "learning_rate": 2.991291523832075e-07, + "loss": 0.3437, + "num_tokens": 17808975.0, + "reward": 0.7603292763233185, + "reward_std": 0.4128862749785185, + "rewards/SMILES_validity_reward": 0.7833333425223827, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8932291716337204, + "rewards/reasoning_steps_reward": 0.0052083334885537624, + "rewards/repetition_penalty_reward": -0.009071222404600121, + "rewards/smiles_len_reward": -0.07800182851497084, + "rewards/tag_count_reward": 0.022135417093522847, + "step": 222 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.0208352804184, + "epoch": 0.1792, + "grad_norm": 26.781389236450195, + "kl": 29.21875, + "learning_rate": 2.953205122780729e-07, + "loss": 0.3597, + "num_tokens": 17933783.0, + "reward": 0.7867212891578674, + "reward_std": 0.37980251759290695, + "rewards/SMILES_validity_reward": 0.7999999932944775, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9140624925494194, + "rewards/reasoning_steps_reward": 0.013020833721384406, + "rewards/repetition_penalty_reward": -0.011719140768036596, + "rewards/smiles_len_reward": 0.007446364965289831, + "rewards/tag_count_reward": 0.016276041860692203, + "step": 224 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.48698103427887, + "epoch": 0.1808, + "grad_norm": 24.122013092041016, + "kl": 94.544921875, + "learning_rate": 2.9150096440751103e-07, + "loss": 0.4924, + "num_tokens": 18062994.0, + "reward": 0.6973020005971193, + "reward_std": 0.45596596598625183, + "rewards/SMILES_validity_reward": 0.7166666742414236, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8541666679084301, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.013226932846009731, + "rewards/smiles_len_reward": -0.12482209294103086, + "rewards/tag_count_reward": 0.02408854174427688, + "step": 226 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.888023257255554, + "epoch": 0.1824, + "grad_norm": 3758.826171875, + "kl": 1040.8896484375, + "learning_rate": 2.8767142806237077e-07, + "loss": 1.3245, + "num_tokens": 18185447.0, + "reward": 0.7444559372961521, + "reward_std": 0.410754032433033, + "rewards/SMILES_validity_reward": 0.7791666686534882, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8723958283662796, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.009680914790806128, + "rewards/smiles_len_reward": -0.14576094248332083, + "rewards/tag_count_reward": 0.020833333604969084, + "step": 228 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.07031440734863, + "epoch": 0.184, + "grad_norm": 1915.1563720703125, + "kl": 376.4609375, + "learning_rate": 2.838328249375328e-07, + "loss": 0.6467, + "num_tokens": 18308738.0, + "reward": 0.7443150542676449, + "reward_std": 0.42556965351104736, + "rewards/SMILES_validity_reward": 0.7778645865619183, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8723958320915699, + "rewards/reasoning_steps_reward": 0.014756944787222892, + "rewards/repetition_penalty_reward": -0.010134508582268609, + "rewards/smiles_len_reward": -0.1464992203982547, + "rewards/tag_count_reward": 0.022786458546761423, + "step": 230 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.880210161209106, + "epoch": 0.1856, + "grad_norm": 18.341007232666016, + "kl": 31.94921875, + "learning_rate": 2.7998607891007493e-07, + "loss": 0.3038, + "num_tokens": 18429652.0, + "reward": 0.7311629764735699, + "reward_std": 0.43252694979310036, + "rewards/SMILES_validity_reward": 0.7375000026077032, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8984375, + "rewards/reasoning_steps_reward": 0.004340277868323028, + "rewards/repetition_penalty_reward": -0.006652665741057717, + "rewards/smiles_len_reward": -0.06340290373191237, + "rewards/tag_count_reward": 0.019531250349245965, + "step": 232 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.97135615348816, + "epoch": 0.1872, + "grad_norm": 50.981346130371094, + "kl": 101.208984375, + "learning_rate": 2.761321158169134e-07, + "loss": 0.3981, + "num_tokens": 18556361.0, + "reward": 0.7673307359218597, + "reward_std": 0.3982690507546067, + "rewards/SMILES_validity_reward": 0.7791666723787785, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9036458395421505, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.009968915626814123, + "rewards/smiles_len_reward": -0.01503224135376513, + "rewards/tag_count_reward": 0.025390624825377017, + "step": 234 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.88281464576721, + "epoch": 0.1888, + "grad_norm": 282.7349548339844, + "kl": 101.9609375, + "learning_rate": 2.722718632319716e-07, + "loss": 0.4886, + "num_tokens": 18685724.0, + "reward": 0.7614529021084309, + "reward_std": 0.3986309599131346, + "rewards/SMILES_validity_reward": 0.7874999977648258, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8854166679084301, + "rewards/reasoning_steps_reward": 0.013020833488553762, + "rewards/repetition_penalty_reward": -0.013385478279815288, + "rewards/smiles_len_reward": -0.07403978041838855, + "rewards/tag_count_reward": 0.020182291977107525, + "step": 236 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.22656440734863, + "epoch": 0.1904, + "grad_norm": 1830.841552734375, + "kl": 231.3828125, + "learning_rate": 2.684062502429312e-07, + "loss": 0.415, + "num_tokens": 18812531.0, + "reward": 0.7645077109336853, + "reward_std": 0.37579927399929147, + "rewards/SMILES_validity_reward": 0.779166666790843, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9036458358168602, + "rewards/reasoning_steps_reward": 0.015625000465661287, + "rewards/repetition_penalty_reward": -0.009541891871776897, + "rewards/smiles_len_reward": -0.04434057860635221, + "rewards/tag_count_reward": 0.018229166977107525, + "step": 238 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.03646087646484, + "epoch": 0.192, + "grad_norm": 82.80435943603516, + "kl": 793.9921875, + "learning_rate": 2.6453620722761895e-07, + "loss": 1.1515, + "num_tokens": 18943873.0, + "reward": 0.7398635447025299, + "reward_std": 0.4153846015688032, + "rewards/SMILES_validity_reward": 0.7500000111758709, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8776041641831398, + "rewards/reasoning_steps_reward": 0.010416666744276881, + "rewards/repetition_penalty_reward": -0.014263476856285706, + "rewards/smiles_len_reward": -0.009628391126170754, + "rewards/tag_count_reward": 0.029296875232830644, + "step": 240 + }, + { + "clip_ratio": 0.0, + "completion_length": 61.32031428813934, + "epoch": 0.1936, + "grad_norm": 17.316062927246094, + "kl": 18.3447265625, + "learning_rate": 2.6066266563008265e-07, + "loss": 0.3039, + "num_tokens": 19065724.0, + "reward": 0.8148761950433254, + "reward_std": 0.3233891185373068, + "rewards/SMILES_validity_reward": 0.8541666530072689, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9036458358168602, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.008308633463457227, + "rewards/smiles_len_reward": -0.05777434818446636, + "rewards/tag_count_reward": 0.024739583139307797, + "step": 242 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.41146039962769, + "epoch": 0.1952, + "grad_norm": 939.9111938476562, + "kl": 117.7265625, + "learning_rate": 2.567865577364107e-07, + "loss": 0.2587, + "num_tokens": 19194522.0, + "reward": 0.7968677990138531, + "reward_std": 0.331410052604042, + "rewards/SMILES_validity_reward": 0.824999988079071, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9192708320915699, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.010457739825142198, + "rewards/smiles_len_reward": -0.08623012714087963, + "rewards/tag_count_reward": 0.024739583546761423, + "step": 244 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.66927254199982, + "epoch": 0.1968, + "grad_norm": 14.630553245544434, + "kl": 34.8125, + "learning_rate": 2.5290881645034926e-07, + "loss": 0.3309, + "num_tokens": 19321115.0, + "reward": 0.7642820924520493, + "reward_std": 0.3856325391680002, + "rewards/SMILES_validity_reward": 0.7958333306014538, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8802083283662796, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.008856045191350859, + "rewards/smiles_len_reward": -0.09168360711191781, + "rewards/tag_count_reward": 0.024088542093522847, + "step": 246 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.70312690734863, + "epoch": 0.1984, + "grad_norm": 21.64442253112793, + "kl": 28.0859375, + "learning_rate": 2.4903037506876995e-07, + "loss": 0.3431, + "num_tokens": 19450025.0, + "reward": 0.7640567347407341, + "reward_std": 0.41116272285580635, + "rewards/SMILES_validity_reward": 0.7874999940395355, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8776041716337204, + "rewards/reasoning_steps_reward": 0.018229167151730508, + "rewards/repetition_penalty_reward": -0.013350201916182414, + "rewards/smiles_len_reward": -0.03371383191552013, + "rewards/tag_count_reward": 0.02408854162786156, + "step": 248 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.33593893051147, + "epoch": 0.2, + "grad_norm": 14.254769325256348, + "kl": 357.6015625, + "learning_rate": 2.4515216705704393e-07, + "loss": 0.4718, + "num_tokens": 19573418.0, + "reward": 0.7403161786496639, + "reward_std": 0.38923288183286786, + "rewards/SMILES_validity_reward": 0.7708333358168602, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8619791716337204, + "rewards/reasoning_steps_reward": 0.012152778101153672, + "rewards/repetition_penalty_reward": -0.009566615204676054, + "rewards/smiles_len_reward": -0.11114423366962001, + "rewards/tag_count_reward": 0.02994791674427688, + "step": 250 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.15625238418579, + "epoch": 0.2016, + "grad_norm": 20.493144989013672, + "kl": 32.86328125, + "learning_rate": 2.412751258243748e-07, + "loss": 0.382, + "num_tokens": 19698662.0, + "reward": 0.7679505236446857, + "reward_std": 0.40248001366853714, + "rewards/SMILES_validity_reward": 0.8041666522622108, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.890625, + "rewards/reasoning_steps_reward": 0.011284722539130598, + "rewards/repetition_penalty_reward": -0.010105125089467037, + "rewards/smiles_len_reward": -0.14680567476898432, + "rewards/tag_count_reward": 0.024088542035315186, + "step": 252 + }, + { + "clip_ratio": 0.0, + "completion_length": 63.04687678813934, + "epoch": 0.2032, + "grad_norm": 29.680883407592773, + "kl": 32.193359375, + "learning_rate": 2.37400184499145e-07, + "loss": 0.2616, + "num_tokens": 19821176.0, + "reward": 0.7311373427510262, + "reward_std": 0.43501078244298697, + "rewards/SMILES_validity_reward": 0.7458333410322666, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8802083320915699, + "rewards/reasoning_steps_reward": 0.006944444612599909, + "rewards/repetition_penalty_reward": -0.008689612010130077, + "rewards/smiles_len_reward": -0.07047638797666878, + "rewards/tag_count_reward": 0.022135416918899864, + "step": 254 + }, + { + "clip_ratio": 0.0, + "completion_length": 61.67187702655792, + "epoch": 0.2048, + "grad_norm": 31.821918487548828, + "kl": 74.078125, + "learning_rate": 2.3352827570433033e-07, + "loss": 0.4004, + "num_tokens": 19943162.0, + "reward": 0.8020155876874924, + "reward_std": 0.3569780103280209, + "rewards/SMILES_validity_reward": 0.8291666656732559, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9166666679084301, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.006827858622273197, + "rewards/smiles_len_reward": -0.06168944027740508, + "rewards/tag_count_reward": 0.026692708197515458, + "step": 256 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.89583539962769, + "epoch": 0.2064, + "grad_norm": 24.872432708740234, + "kl": 34.111328125, + "learning_rate": 2.2966033133303545e-07, + "loss": 0.4437, + "num_tokens": 20070610.0, + "reward": 0.7651942074298859, + "reward_std": 0.3582833812106401, + "rewards/SMILES_validity_reward": 0.7874999977648258, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.890625, + "rewards/reasoning_steps_reward": 0.013020833721384406, + "rewards/repetition_penalty_reward": -0.010585774871287867, + "rewards/smiles_len_reward": -0.06807235861197114, + "rewards/tag_count_reward": 0.03320312488358468, + "step": 258 + }, + { + "clip_ratio": 0.0, + "completion_length": 61.50781464576721, + "epoch": 0.208, + "grad_norm": 41.98968505859375, + "kl": 44.3046875, + "learning_rate": 2.2579728232420523e-07, + "loss": 0.2895, + "num_tokens": 20192533.0, + "reward": 0.7789809294044971, + "reward_std": 0.3466755224435474, + "rewards/SMILES_validity_reward": 0.8028645887970924, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9088541604578495, + "rewards/reasoning_steps_reward": 0.006076389050576836, + "rewards/repetition_penalty_reward": -0.010937447841570247, + "rewards/smiles_len_reward": -0.06887253125751158, + "rewards/tag_count_reward": 0.016927083604969084, + "step": 260 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.78906500339508, + "epoch": 0.2096, + "grad_norm": 18.29418182373047, + "kl": 34.1484375, + "learning_rate": 2.2194005843856633e-07, + "loss": 0.4393, + "num_tokens": 20316484.0, + "reward": 0.8505359329283237, + "reward_std": 0.2659579182509333, + "rewards/SMILES_validity_reward": 0.8916666619479656, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9375000037252903, + "rewards/reasoning_steps_reward": 0.0017361111240461469, + "rewards/repetition_penalty_reward": -0.009237999664037488, + "rewards/smiles_len_reward": -0.05497855134308338, + "rewards/tag_count_reward": 0.013671875291038305, + "step": 262 + }, + { + "clip_ratio": 0.0, + "completion_length": 50.89583480358124, + "epoch": 0.2112, + "grad_norm": 26.120685577392578, + "kl": 45.78125, + "learning_rate": 2.1808958803485133e-07, + "loss": 0.2691, + "num_tokens": 20434332.0, + "reward": 0.7770359516143799, + "reward_std": 0.37100684829056263, + "rewards/SMILES_validity_reward": 0.7999999988824129, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9010416679084301, + "rewards/reasoning_steps_reward": 0.013888889225199819, + "rewards/repetition_penalty_reward": -0.0058435348473722115, + "rewards/smiles_len_reward": -0.0629474117886275, + "rewards/tag_count_reward": 0.022135416802484542, + "step": 264 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.60416793823242, + "epoch": 0.2128, + "grad_norm": 50.12458419799805, + "kl": 44.2275390625, + "learning_rate": 2.1424679784636144e-07, + "loss": 0.3092, + "num_tokens": 20558980.0, + "reward": 0.7655416280031204, + "reward_std": 0.3876127991534304, + "rewards/SMILES_validity_reward": 0.7833333350718021, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9010416716337204, + "rewards/reasoning_steps_reward": 0.014756944845430553, + "rewards/repetition_penalty_reward": -0.013187804717745166, + "rewards/smiles_len_reward": -0.061909247655421495, + "rewards/tag_count_reward": 0.029296875407453626, + "step": 266 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.49218928813934, + "epoch": 0.2144, + "grad_norm": 2464.454833984375, + "kl": 425.138671875, + "learning_rate": 2.104126127579193e-07, + "loss": 0.8155, + "num_tokens": 20685121.0, + "reward": 0.7608797401189804, + "reward_std": 0.3963581267744303, + "rewards/SMILES_validity_reward": 0.783333346247673, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9010416641831398, + "rewards/reasoning_steps_reward": 0.006076389050576836, + "rewards/repetition_penalty_reward": -0.011656314345600549, + "rewards/smiles_len_reward": -0.09486878104507923, + "rewards/tag_count_reward": 0.022786458488553762, + "step": 268 + }, + { + "clip_ratio": 0.0, + "completion_length": 45.796876311302185, + "epoch": 0.216, + "grad_norm": 90.27828216552734, + "kl": 113.61328125, + "learning_rate": 2.065879555832674e-07, + "loss": 0.3303, + "num_tokens": 20801011.0, + "reward": 0.7906807139515877, + "reward_std": 0.347005927702412, + "rewards/SMILES_validity_reward": 0.8291666619479656, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8880208358168602, + "rewards/reasoning_steps_reward": 0.0052083334885537624, + "rewards/repetition_penalty_reward": -0.004367547942820238, + "rewards/smiles_len_reward": -0.08830558031331748, + "rewards/tag_count_reward": 0.02604166674427688, + "step": 270 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.78125309944153, + "epoch": 0.2176, + "grad_norm": 390.4526672363281, + "kl": 193.759765625, + "learning_rate": 2.0277374684296498e-07, + "loss": 0.512, + "num_tokens": 20932255.0, + "reward": 0.7888578772544861, + "reward_std": 0.3733808258548379, + "rewards/SMILES_validity_reward": 0.8223958350718021, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9114583395421505, + "rewards/reasoning_steps_reward": 0.012152778101153672, + "rewards/repetition_penalty_reward": -0.013028325862251222, + "rewards/smiles_len_reward": -0.1238281219266355, + "rewards/tag_count_reward": 0.02213541662786156, + "step": 272 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.94010603427887, + "epoch": 0.2192, + "grad_norm": 23.130889892578125, + "kl": 126.322265625, + "learning_rate": 1.989709045428361e-07, + "loss": 0.3458, + "num_tokens": 21057416.0, + "reward": 0.7640821002423763, + "reward_std": 0.36372836004011333, + "rewards/SMILES_validity_reward": 0.7916666716337204, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8958333358168602, + "rewards/reasoning_steps_reward": 0.013020833488553762, + "rewards/repetition_penalty_reward": -0.009005672072817106, + "rewards/smiles_len_reward": -0.11449750722385943, + "rewards/tag_count_reward": 0.022135417035315186, + "step": 274 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.97916913032532, + "epoch": 0.2208, + "grad_norm": 133.2987823486328, + "kl": 152.8232421875, + "learning_rate": 1.9518034395302412e-07, + "loss": 0.5016, + "num_tokens": 21189120.0, + "reward": 0.7536205910146236, + "reward_std": 0.39099146984517574, + "rewards/SMILES_validity_reward": 0.7999999970197678, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8645833283662796, + "rewards/reasoning_steps_reward": 0.013020833721384406, + "rewards/repetition_penalty_reward": -0.01477565044478979, + "rewards/smiles_len_reward": -0.1902956496924162, + "rewards/tag_count_reward": 0.03450520854676142, + "step": 276 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.25260508060455, + "epoch": 0.2224, + "grad_norm": 67.20164489746094, + "kl": 61.5078125, + "learning_rate": 1.9140297738770385e-07, + "loss": 0.2463, + "num_tokens": 21307105.0, + "reward": 0.7786005400121212, + "reward_std": 0.3619860680773854, + "rewards/SMILES_validity_reward": 0.8124999925494194, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9010416679084301, + "rewards/reasoning_steps_reward": 0.0052083334885537624, + "rewards/repetition_penalty_reward": -0.0060174524296598975, + "rewards/smiles_len_reward": -0.12334280030336231, + "rewards/tag_count_reward": 0.019531250116415322, + "step": 278 + }, + { + "clip_ratio": 0.0, + "completion_length": 59.000001668930054, + "epoch": 0.224, + "grad_norm": 18.651592254638672, + "kl": 22.283203125, + "learning_rate": 1.8763971398550467e-07, + "loss": 0.2241, + "num_tokens": 21428065.0, + "reward": 0.8106764741241932, + "reward_std": 0.32357010687701404, + "rewards/SMILES_validity_reward": 0.8458333313465118, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9218749962747097, + "rewards/reasoning_steps_reward": 0.0034722223062999547, + "rewards/repetition_penalty_reward": -0.006691267819405766, + "rewards/smiles_len_reward": -0.09079861175268888, + "rewards/tag_count_reward": 0.014322916802484542, + "step": 280 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.3333351612091, + "epoch": 0.2256, + "grad_norm": 14.318089485168457, + "kl": 27.7333984375, + "learning_rate": 1.8389145949069951e-07, + "loss": 0.3572, + "num_tokens": 21554529.0, + "reward": 0.7339997342787683, + "reward_std": 0.3634449951350689, + "rewards/SMILES_validity_reward": 0.7398437573574483, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8932291604578495, + "rewards/reasoning_steps_reward": 0.017361111706122756, + "rewards/repetition_penalty_reward": -0.011701545292453375, + "rewards/smiles_len_reward": -0.04183520987862721, + "rewards/tag_count_reward": 0.017578125232830644, + "step": 282 + }, + { + "clip_ratio": 0.0, + "completion_length": 47.28385519981384, + "epoch": 0.2272, + "grad_norm": 29.62630844116211, + "kl": 34.2265625, + "learning_rate": 1.8015911603520893e-07, + "loss": 0.2096, + "num_tokens": 21670990.0, + "reward": 0.7384370751678944, + "reward_std": 0.40297506004571915, + "rewards/SMILES_validity_reward": 0.7666666675359011, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8697916623204947, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.004413491224113386, + "rewards/smiles_len_reward": -0.12501907243859023, + "rewards/tag_count_reward": 0.029947916860692203, + "step": 284 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.74218964576721, + "epoch": 0.2288, + "grad_norm": 26.962491989135742, + "kl": 73.96875, + "learning_rate": 1.764435819214762e-07, + "loss": 0.3738, + "num_tokens": 21794923.0, + "reward": 0.7910585440695286, + "reward_std": 0.34452163241803646, + "rewards/SMILES_validity_reward": 0.8291666731238365, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8880208283662796, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.006722177873598412, + "rewards/smiles_len_reward": -0.08803215471561998, + "rewards/tag_count_reward": 0.026692708721384406, + "step": 286 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.90625166893005, + "epoch": 0.2304, + "grad_norm": 3684.165283203125, + "kl": 391.78515625, + "learning_rate": 1.7274575140626315e-07, + "loss": 0.7691, + "num_tokens": 21920839.0, + "reward": 0.7605861239135265, + "reward_std": 0.38395687006413937, + "rewards/SMILES_validity_reward": 0.7916666604578495, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8854166716337204, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.010926677110546734, + "rewards/smiles_len_reward": -0.11563519097398967, + "rewards/tag_count_reward": 0.026692708139307797, + "step": 288 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.61718928813934, + "epoch": 0.232, + "grad_norm": 30.921920776367188, + "kl": 51.79296875, + "learning_rate": 1.6906651448541976e-07, + "loss": 0.4191, + "num_tokens": 22044724.0, + "reward": 0.7936124540865421, + "reward_std": 0.3641815240844153, + "rewards/SMILES_validity_reward": 0.8195312321186066, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9192708283662796, + "rewards/reasoning_steps_reward": 0.004340277868323028, + "rewards/repetition_penalty_reward": -0.009806223220948596, + "rewards/smiles_len_reward": -0.07182217901572585, + "rewards/tag_count_reward": 0.018880208779592067, + "step": 290 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.83854305744171, + "epoch": 0.2336, + "grad_norm": 68.02233123779297, + "kl": 50.3828125, + "learning_rate": 1.6540675667967973e-07, + "loss": 0.2395, + "num_tokens": 22162934.0, + "reward": 0.8059075474739075, + "reward_std": 0.34976634243503213, + "rewards/SMILES_validity_reward": 0.8416666649281979, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9088541641831398, + "rewards/reasoning_steps_reward": 0.0026041667442768812, + "rewards/repetition_penalty_reward": -0.0050892977487819735, + "rewards/smiles_len_reward": -0.0794562753289938, + "rewards/tag_count_reward": 0.02278645889600739, + "step": 292 + }, + { + "clip_ratio": 0.0, + "completion_length": 47.04166758060455, + "epoch": 0.2352, + "grad_norm": 17.918827056884766, + "kl": 37.42578125, + "learning_rate": 1.617673588215328e-07, + "loss": 0.2869, + "num_tokens": 22279302.0, + "reward": 0.7713445201516151, + "reward_std": 0.3772635292261839, + "rewards/SMILES_validity_reward": 0.8083333261311054, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8932291641831398, + "rewards/reasoning_steps_reward": 0.00954861135687679, + "rewards/repetition_penalty_reward": -0.006463400637585437, + "rewards/smiles_len_reward": -0.15175057773012668, + "rewards/tag_count_reward": 0.02408854174427688, + "step": 294 + }, + { + "clip_ratio": 0.0, + "completion_length": 52.083335638046265, + "epoch": 0.2368, + "grad_norm": 64.83023834228516, + "kl": 49.0859375, + "learning_rate": 1.5814919684322542e-07, + "loss": 0.2833, + "num_tokens": 22397606.0, + "reward": 0.7741575352847576, + "reward_std": 0.4162686008712626, + "rewards/SMILES_validity_reward": 0.7833333313465118, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.90625, + "rewards/reasoning_steps_reward": 0.006076388992369175, + "rewards/repetition_penalty_reward": -0.0056419622706016526, + "rewards/smiles_len_reward": 0.018222944694571197, + "rewards/tag_count_reward": 0.020833333546761423, + "step": 296 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.859376311302185, + "epoch": 0.2384, + "grad_norm": 173.7949981689453, + "kl": 106.66796875, + "learning_rate": 1.5455314156594123e-07, + "loss": 0.346, + "num_tokens": 22518512.0, + "reward": 0.8143112808465958, + "reward_std": 0.3242040954064578, + "rewards/SMILES_validity_reward": 0.8374999910593033, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9244791716337204, + "rewards/reasoning_steps_reward": 0.0026041667442768812, + "rewards/repetition_penalty_reward": -0.006993362576395157, + "rewards/smiles_len_reward": -0.009921115823090076, + "rewards/tag_count_reward": 0.021484375116415322, + "step": 298 + }, + { + "clip_ratio": 0.0, + "completion_length": 61.15364754199982, + "epoch": 0.24, + "grad_norm": 400.5851135253906, + "kl": 67.044921875, + "learning_rate": 1.5098005849021078e-07, + "loss": 0.3079, + "num_tokens": 22640299.0, + "reward": 0.7607803493738174, + "reward_std": 0.36718062963336706, + "rewards/SMILES_validity_reward": 0.7874999903142452, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8828124962747097, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.005829951467603678, + "rewards/smiles_len_reward": -0.06878954637795687, + "rewards/tag_count_reward": 0.013671875174622983, + "step": 300 + }, + { + "clip_ratio": 0.0, + "completion_length": 59.927085638046265, + "epoch": 0.2416, + "grad_norm": 44.80502700805664, + "kl": 45.6953125, + "learning_rate": 1.47430807587603e-07, + "loss": 0.3873, + "num_tokens": 22761615.0, + "reward": 0.8036354631185532, + "reward_std": 0.3190372730605304, + "rewards/SMILES_validity_reward": 0.8333333283662796, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9114583358168602, + "rewards/reasoning_steps_reward": 0.006076389050576836, + "rewards/repetition_penalty_reward": -0.007518649843405001, + "rewards/smiles_len_reward": -0.04488650645362213, + "rewards/tag_count_reward": 0.014973958430346102, + "step": 302 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.257814168930054, + "epoch": 0.2432, + "grad_norm": 61.11827087402344, + "kl": 51.85546875, + "learning_rate": 1.4390624309374617e-07, + "loss": 0.2636, + "num_tokens": 22878834.0, + "reward": 0.7895913496613503, + "reward_std": 0.3571253365371376, + "rewards/SMILES_validity_reward": 0.816666666418314, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8958333358168602, + "rewards/reasoning_steps_reward": 0.012152778101153672, + "rewards/repetition_penalty_reward": -0.006857625812699553, + "rewards/smiles_len_reward": -0.038289141783025116, + "rewards/tag_count_reward": 0.024739583081100136, + "step": 304 + }, + { + "clip_ratio": 0.0, + "completion_length": 44.632813930511475, + "epoch": 0.2448, + "grad_norm": 24.276023864746094, + "kl": 101.8251953125, + "learning_rate": 1.404072133027306e-07, + "loss": 0.3328, + "num_tokens": 22994277.0, + "reward": 0.7833369635045528, + "reward_std": 0.3747501680627465, + "rewards/SMILES_validity_reward": 0.7999999858438969, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9192708358168602, + "rewards/reasoning_steps_reward": 0.00954861135687679, + "rewards/repetition_penalty_reward": -0.005959605041425675, + "rewards/smiles_len_reward": -0.05147076665889472, + "rewards/tag_count_reward": 0.0234375, + "step": 306 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.4192727804184, + "epoch": 0.2464, + "grad_norm": 19.5562686920166, + "kl": 92.298828125, + "learning_rate": 1.369345603629406e-07, + "loss": 0.4215, + "num_tokens": 23118086.0, + "reward": 0.807420376688242, + "reward_std": 0.3090350958518684, + "rewards/SMILES_validity_reward": 0.8374999985098839, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9218749962747097, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.01095377303136047, + "rewards/smiles_len_reward": -0.07096354046370834, + "rewards/tag_count_reward": 0.017578125349245965, + "step": 308 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.330730676651, + "epoch": 0.248, + "grad_norm": 23.46638298034668, + "kl": 61.33984375, + "learning_rate": 1.3348912007436536e-07, + "loss": 0.3241, + "num_tokens": 23238405.0, + "reward": 0.7636997401714325, + "reward_std": 0.40307603124529123, + "rewards/SMILES_validity_reward": 0.7791666612029076, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8958333320915699, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.010418418491099146, + "rewards/smiles_len_reward": -0.020944936492014676, + "rewards/tag_count_reward": 0.018880208197515458, + "step": 310 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.80208468437195, + "epoch": 0.2496, + "grad_norm": 811.1200561523438, + "kl": 195.423828125, + "learning_rate": 1.3007172168743852e-07, + "loss": 0.4265, + "num_tokens": 23359289.0, + "reward": 0.7152650374919176, + "reward_std": 0.4054424315690994, + "rewards/SMILES_validity_reward": 0.6997395902872086, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9270833283662796, + "rewards/reasoning_steps_reward": 0.009548611473292112, + "rewards/repetition_penalty_reward": -0.012816645510611124, + "rewards/smiles_len_reward": -0.04499440788640641, + "rewards/tag_count_reward": 0.021484375116415322, + "step": 312 + }, + { + "clip_ratio": 0.0, + "completion_length": 59.476564049720764, + "epoch": 0.2512, + "grad_norm": 25.647239685058594, + "kl": 42.7890625, + "learning_rate": 1.2668318770345368e-07, + "loss": 0.3482, + "num_tokens": 23480432.0, + "reward": 0.7882367707788944, + "reward_std": 0.3668895438313484, + "rewards/SMILES_validity_reward": 0.820833332836628, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8984374925494194, + "rewards/reasoning_steps_reward": 0.0052083334885537624, + "rewards/repetition_penalty_reward": -0.0075505238128243946, + "rewards/smiles_len_reward": -0.08182770665735006, + "rewards/tag_count_reward": 0.025390625116415322, + "step": 314 + }, + { + "clip_ratio": 0.0, + "completion_length": 53.632813930511475, + "epoch": 0.2528, + "grad_norm": 57.68588638305664, + "kl": 97.1669921875, + "learning_rate": 1.233243336766044e-07, + "loss": 0.4193, + "num_tokens": 23599331.0, + "reward": 0.7653166949748993, + "reward_std": 0.3775314458180219, + "rewards/SMILES_validity_reward": 0.7874999977648258, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9036458395421505, + "rewards/reasoning_steps_reward": 0.008680555736646056, + "rewards/repetition_penalty_reward": -0.008220537267334294, + "rewards/smiles_len_reward": -0.08570567087735981, + "rewards/tag_count_reward": 0.014973958488553762, + "step": 316 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.55208587646484, + "epoch": 0.2544, + "grad_norm": 63.47262191772461, + "kl": 39.439453125, + "learning_rate": 1.1999596801769616e-07, + "loss": 0.2992, + "num_tokens": 23723575.0, + "reward": 0.7604156136512756, + "reward_std": 0.3610593224875629, + "rewards/SMILES_validity_reward": 0.7791666686534882, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9244791641831398, + "rewards/reasoning_steps_reward": 0.0026041666860692203, + "rewards/repetition_penalty_reward": -0.00864661141531542, + "rewards/smiles_len_reward": -0.13107849314110354, + "rewards/tag_count_reward": 0.013671875174622983, + "step": 318 + }, + { + "clip_ratio": 0.0, + "completion_length": 57.58594036102295, + "epoch": 0.256, + "grad_norm": 26.219541549682617, + "kl": 45.1484375, + "learning_rate": 1.1669889179957723e-07, + "loss": 0.1946, + "num_tokens": 23843992.0, + "reward": 0.7795398309826851, + "reward_std": 0.37162910774350166, + "rewards/SMILES_validity_reward": 0.7999999895691872, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9036458320915699, + "rewards/reasoning_steps_reward": 0.0034722222480922937, + "rewards/repetition_penalty_reward": -0.008394229185796576, + "rewards/smiles_len_reward": -0.037310975953005254, + "rewards/tag_count_reward": 0.026692708139307797, + "step": 320 + }, + { + "clip_ratio": 0.0, + "completion_length": 61.520835518836975, + "epoch": 0.2576, + "grad_norm": 419.2340393066406, + "kl": 158.7578125, + "learning_rate": 1.1343389856433658e-07, + "loss": 0.4102, + "num_tokens": 23965920.0, + "reward": 0.7558124400675297, + "reward_std": 0.42608391866087914, + "rewards/SMILES_validity_reward": 0.7791666612029076, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8984375037252903, + "rewards/reasoning_steps_reward": 0.0034722223062999547, + "rewards/repetition_penalty_reward": -0.007730142860964406, + "rewards/smiles_len_reward": -0.1092335598077625, + "rewards/tag_count_reward": 0.022135416511446238, + "step": 322 + }, + { + "clip_ratio": 0.0, + "completion_length": 50.750001668930054, + "epoch": 0.2592, + "grad_norm": 55.14189910888672, + "kl": 44.15234375, + "learning_rate": 1.1020177413231332e-07, + "loss": 0.2614, + "num_tokens": 24083712.0, + "reward": 0.7523438110947609, + "reward_std": 0.40479825623333454, + "rewards/SMILES_validity_reward": 0.7874999903142452, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8723958358168602, + "rewards/reasoning_steps_reward": 0.006076389050576836, + "rewards/repetition_penalty_reward": -0.00841052423857036, + "rewards/smiles_len_reward": -0.1234477300895378, + "rewards/tag_count_reward": 0.019531250232830644, + "step": 324 + }, + { + "clip_ratio": 0.0, + "completion_length": 54.93489706516266, + "epoch": 0.2608, + "grad_norm": 71.32405090332031, + "kl": 60.220703125, + "learning_rate": 1.070032964129654e-07, + "loss": 0.291, + "num_tokens": 24203111.0, + "reward": 0.7721843849867582, + "reward_std": 0.33104856190038845, + "rewards/SMILES_validity_reward": 0.8041666690260172, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8958333246409893, + "rewards/reasoning_steps_reward": 0.008680555794853717, + "rewards/repetition_penalty_reward": -0.011790495191235095, + "rewards/smiles_len_reward": -0.10733911380521022, + "rewards/tag_count_reward": 0.015625000291038305, + "step": 326 + }, + { + "clip_ratio": 0.0, + "completion_length": 44.559897661209106, + "epoch": 0.2624, + "grad_norm": 17.008113861083984, + "kl": 19.951171875, + "learning_rate": 1.0383923521764174e-07, + "loss": 0.2228, + "num_tokens": 24318526.0, + "reward": 0.7702805139124393, + "reward_std": 0.38822865672409534, + "rewards/SMILES_validity_reward": 0.7999999895691872, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9036458320915699, + "rewards/reasoning_steps_reward": 0.0052083334885537624, + "rewards/repetition_penalty_reward": -0.004773934269906022, + "rewards/smiles_len_reward": -0.12223973160143942, + "rewards/tag_count_reward": 0.013671875291038305, + "step": 328 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.84635615348816, + "epoch": 0.264, + "grad_norm": 36.427547454833984, + "kl": 43.140625, + "learning_rate": 1.007103520743035e-07, + "loss": 0.354, + "num_tokens": 24446723.0, + "reward": 0.7763010747730732, + "reward_std": 0.3733202526345849, + "rewards/SMILES_validity_reward": 0.7875000052154064, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9296874925494194, + "rewards/reasoning_steps_reward": 0.0026041667442768812, + "rewards/repetition_penalty_reward": -0.006564084775163792, + "rewards/smiles_len_reward": -0.053473433246836066, + "rewards/tag_count_reward": 0.018880208779592067, + "step": 330 + }, + { + "clip_ratio": 0.0, + "completion_length": 63.32552206516266, + "epoch": 0.2656, + "grad_norm": 13.066360473632812, + "kl": 40.861328125, + "learning_rate": 9.761740004423926e-08, + "loss": 0.2626, + "num_tokens": 24569344.0, + "reward": 0.7705531045794487, + "reward_std": 0.3602237828890793, + "rewards/SMILES_validity_reward": 0.808333333581686, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.890625, + "rewards/reasoning_steps_reward": 0.013020833721384406, + "rewards/repetition_penalty_reward": -0.006140822355519049, + "rewards/smiles_len_reward": -0.15825135062914342, + "rewards/tag_count_reward": 0.026692708604969084, + "step": 332 + }, + { + "clip_ratio": 0.0, + "completion_length": 47.304688930511475, + "epoch": 0.2672, + "grad_norm": 7936.54248046875, + "kl": 849.38671875, + "learning_rate": 9.45611235408178e-08, + "loss": 1.071, + "num_tokens": 24685813.0, + "reward": 0.8130536079406738, + "reward_std": 0.34073650278151035, + "rewards/SMILES_validity_reward": 0.8374999985098839, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9166666604578495, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.006224363439287117, + "rewards/smiles_len_reward": -0.01024607045110315, + "rewards/tag_count_reward": 0.024088542035315186, + "step": 334 + }, + { + "clip_ratio": 0.0, + "completion_length": 56.476564049720764, + "epoch": 0.2688, + "grad_norm": 18.579252243041992, + "kl": 74.7998046875, + "learning_rate": 9.15422581503224e-08, + "loss": 0.4535, + "num_tokens": 24805804.0, + "reward": 0.7905747666954994, + "reward_std": 0.34068747609853745, + "rewards/SMILES_validity_reward": 0.8333333171904087, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8880208320915699, + "rewards/reasoning_steps_reward": 0.013020833721384406, + "rewards/repetition_penalty_reward": -0.00672910911089275, + "rewards/smiles_len_reward": -0.11942530050873756, + "rewards/tag_count_reward": 0.02148437505820766, + "step": 336 + }, + { + "clip_ratio": 0.0, + "completion_length": 44.60677218437195, + "epoch": 0.2704, + "grad_norm": 20.473526000976562, + "kl": 47.2734375, + "learning_rate": 8.856153045490947e-08, + "loss": 0.3034, + "num_tokens": 24921237.0, + "reward": 0.7792329639196396, + "reward_std": 0.3795014023198746, + "rewards/SMILES_validity_reward": 0.7958333231508732, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.90625, + "rewards/reasoning_steps_reward": 0.0052083334885537624, + "rewards/repetition_penalty_reward": -0.005042356933699921, + "rewards/smiles_len_reward": -0.018905383301898837, + "rewards/tag_count_reward": 0.021484375407453626, + "step": 338 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.48437809944153, + "epoch": 0.272, + "grad_norm": 147.6204071044922, + "kl": 92.5546875, + "learning_rate": 8.561965785773412e-08, + "loss": 0.3352, + "num_tokens": 25048911.0, + "reward": 0.7174020754173398, + "reward_std": 0.3969442341476679, + "rewards/SMILES_validity_reward": 0.7190104154869914, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8828125037252903, + "rewards/reasoning_steps_reward": 0.01388888928340748, + "rewards/repetition_penalty_reward": -0.007572865069960244, + "rewards/smiles_len_reward": -0.03659327526111156, + "rewards/tag_count_reward": 0.022786458488553762, + "step": 340 + }, + { + "clip_ratio": 0.0, + "completion_length": 45.588543176651, + "epoch": 0.2736, + "grad_norm": 469.4862060546875, + "kl": 140.984375, + "learning_rate": 8.271734841028552e-08, + "loss": 0.3789, + "num_tokens": 25164721.0, + "reward": 0.7690527178347111, + "reward_std": 0.39881330635398626, + "rewards/SMILES_validity_reward": 0.8041666597127914, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.890625, + "rewards/reasoning_steps_reward": 0.0026041667442768812, + "rewards/repetition_penalty_reward": -0.005571819157921709, + "rewards/smiles_len_reward": -0.12382401619106531, + "rewards/tag_count_reward": 0.016276041918899864, + "step": 342 + }, + { + "clip_ratio": 0.0, + "completion_length": 48.276043176651, + "epoch": 0.2752, + "grad_norm": 190.9840545654297, + "kl": 74.7578125, + "learning_rate": 7.985530064197241e-08, + "loss": 0.3261, + "num_tokens": 25281563.0, + "reward": 0.7844961099326611, + "reward_std": 0.3285376951098442, + "rewards/SMILES_validity_reward": 0.8249999992549419, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8906249925494194, + "rewards/reasoning_steps_reward": 0.014756944845430553, + "rewards/repetition_penalty_reward": -0.005333821703970898, + "rewards/smiles_len_reward": -0.13086939207278192, + "rewards/tag_count_reward": 0.019531250291038305, + "step": 344 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.25000202655792, + "epoch": 0.2768, + "grad_norm": 23.501083374023438, + "kl": 51.0078125, + "learning_rate": 7.703420339200101e-08, + "loss": 0.3129, + "num_tokens": 25406843.0, + "reward": 0.7355504985898733, + "reward_std": 0.369172902777791, + "rewards/SMILES_validity_reward": 0.748177076689899, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8984375037252903, + "rewards/reasoning_steps_reward": 0.0026041667442768812, + "rewards/repetition_penalty_reward": -0.01213008257218462, + "rewards/smiles_len_reward": -0.08640275153447874, + "rewards/tag_count_reward": 0.018880208546761423, + "step": 346 + }, + { + "clip_ratio": 0.0, + "completion_length": 56.47135639190674, + "epoch": 0.2784, + "grad_norm": 26.730709075927734, + "kl": 42.05859375, + "learning_rate": 7.425473564358456e-08, + "loss": 0.2728, + "num_tokens": 25526832.0, + "reward": 0.8217011019587517, + "reward_std": 0.27569763401697855, + "rewards/SMILES_validity_reward": 0.8624999970197678, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9140624962747097, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.006716858119034441, + "rewards/smiles_len_reward": -0.0911170897888951, + "rewards/tag_count_reward": 0.024739583663176745, + "step": 348 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.69791901111603, + "epoch": 0.28, + "grad_norm": 98.43875885009766, + "kl": 61.244140625, + "learning_rate": 7.151756636052527e-08, + "loss": 0.3109, + "num_tokens": 25648444.0, + "reward": 0.7451902097091079, + "reward_std": 0.35876176378224045, + "rewards/SMILES_validity_reward": 0.7536458186805248, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9140624925494194, + "rewards/reasoning_steps_reward": 0.008680555736646056, + "rewards/repetition_penalty_reward": -0.009796898761123884, + "rewards/smiles_len_reward": -0.09724315593484789, + "rewards/tag_count_reward": 0.0325520834303461, + "step": 350 + }, + { + "clip_ratio": 0.0, + "completion_length": 50.132814049720764, + "epoch": 0.2816, + "grad_norm": 18.02010726928711, + "kl": 46.1328125, + "learning_rate": 6.882335432620779e-08, + "loss": 0.1619, + "num_tokens": 25765999.0, + "reward": 0.7713147848844528, + "reward_std": 0.3866591900587082, + "rewards/SMILES_validity_reward": 0.8041666708886623, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8828125, + "rewards/reasoning_steps_reward": 0.0164930559694767, + "rewards/repetition_penalty_reward": -0.008989743004349293, + "rewards/smiles_len_reward": -0.09670030255801976, + "rewards/tag_count_reward": 0.024739583663176745, + "step": 352 + }, + { + "clip_ratio": 0.0, + "completion_length": 52.015626311302185, + "epoch": 0.2832, + "grad_norm": 34.14971923828125, + "kl": 58.6484375, + "learning_rate": 6.617274798504286e-08, + "loss": 0.3631, + "num_tokens": 25884277.0, + "reward": 0.7878972478210926, + "reward_std": 0.3565414815675467, + "rewards/SMILES_validity_reward": 0.804166667163372, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9192708395421505, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.0039607606104254955, + "rewards/smiles_len_reward": -0.030088828410953283, + "rewards/tag_count_reward": 0.018229166918899864, + "step": 354 + }, + { + "clip_ratio": 0.0, + "completion_length": 54.94010627269745, + "epoch": 0.2848, + "grad_norm": 20.642011642456055, + "kl": 198.462890625, + "learning_rate": 6.356638528639954e-08, + "loss": 0.4167, + "num_tokens": 26003678.0, + "reward": 0.7857323661446571, + "reward_std": 0.36366927227936685, + "rewards/SMILES_validity_reward": 0.8166666626930237, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9062499962747097, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.004005594724731054, + "rewards/smiles_len_reward": -0.10598958295304328, + "rewards/tag_count_reward": 0.02408854162786156, + "step": 356 + }, + { + "clip_ratio": 0.0, + "completion_length": 44.007814049720764, + "epoch": 0.2864, + "grad_norm": 66.33602142333984, + "kl": 51.1875, + "learning_rate": 6.100489353106303e-08, + "loss": 0.2763, + "num_tokens": 26118881.0, + "reward": 0.8222081623971462, + "reward_std": 0.27234845350903925, + "rewards/SMILES_validity_reward": 0.8583333306014538, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9192708395421505, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.006773012763005681, + "rewards/smiles_len_reward": -0.06463625153992325, + "rewards/tag_count_reward": 0.016927083488553762, + "step": 358 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.45312595367432, + "epoch": 0.288, + "grad_norm": 26.583322525024414, + "kl": 47.1904296875, + "learning_rate": 5.848888922025552e-08, + "loss": 0.3307, + "num_tokens": 26243087.0, + "reward": 0.7812948487699032, + "reward_std": 0.35617859475314617, + "rewards/SMILES_validity_reward": 0.8124999888241291, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8958333358168602, + "rewards/reasoning_steps_reward": 0.015625000465661287, + "rewards/repetition_penalty_reward": -0.014582588351913728, + "rewards/smiles_len_reward": -0.08327753038611263, + "rewards/tag_count_reward": 0.020182291860692203, + "step": 360 + }, + { + "clip_ratio": 0.0, + "completion_length": 61.65625178813934, + "epoch": 0.2896, + "grad_norm": 303.911865234375, + "kl": 85.3046875, + "learning_rate": 5.601897790725643e-08, + "loss": 0.2272, + "num_tokens": 26365067.0, + "reward": 0.7704563029110432, + "reward_std": 0.4014833262190223, + "rewards/SMILES_validity_reward": 0.7875000052154064, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9114583320915699, + "rewards/reasoning_steps_reward": 0.013020833721384406, + "rewards/repetition_penalty_reward": -0.008073668244833243, + "rewards/smiles_len_reward": -0.06679152825381607, + "rewards/tag_count_reward": 0.019531250407453626, + "step": 362 + }, + { + "clip_ratio": 0.0, + "completion_length": 43.49479305744171, + "epoch": 0.2912, + "grad_norm": 263.7744140625, + "kl": 203.021484375, + "learning_rate": 5.3595754051657476e-08, + "loss": 0.3918, + "num_tokens": 26480073.0, + "reward": 0.806282889097929, + "reward_std": 0.3312041540630162, + "rewards/SMILES_validity_reward": 0.8291666619479656, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9270833320915699, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.006226064411748666, + "rewards/smiles_len_reward": -0.04696196690201759, + "rewards/tag_count_reward": 0.02278645895421505, + "step": 364 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.968751549720764, + "epoch": 0.2928, + "grad_norm": 22.697811126708984, + "kl": 32.64453125, + "learning_rate": 5.121980087628802e-08, + "loss": 0.2565, + "num_tokens": 26602557.0, + "reward": 0.7698783986270428, + "reward_std": 0.4065110133960843, + "rewards/SMILES_validity_reward": 0.7916666641831398, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8984375, + "rewards/reasoning_steps_reward": 0.0052083334885537624, + "rewards/repetition_penalty_reward": -0.009550296479574172, + "rewards/smiles_len_reward": -0.05859398643951863, + "rewards/tag_count_reward": 0.024739583779592067, + "step": 366 + }, + { + "clip_ratio": 0.0, + "completion_length": 52.43489730358124, + "epoch": 0.2944, + "grad_norm": 16.701269149780273, + "kl": 39.60546875, + "learning_rate": 4.88916902268445e-08, + "loss": 0.3071, + "num_tokens": 26720996.0, + "reward": 0.8138786628842354, + "reward_std": 0.3093279884196818, + "rewards/SMILES_validity_reward": 0.845833320170641, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9192708320915699, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.00628939275702578, + "rewards/smiles_len_reward": -0.05961279338225722, + "rewards/tag_count_reward": 0.018229166977107525, + "step": 368 + }, + { + "clip_ratio": 0.0, + "completion_length": 53.37760519981384, + "epoch": 0.296, + "grad_norm": 41.683509826660156, + "kl": 99.9921875, + "learning_rate": 4.6611982434258124e-08, + "loss": 0.4217, + "num_tokens": 26839797.0, + "reward": 0.7909123674035072, + "reward_std": 0.3771476158872247, + "rewards/SMILES_validity_reward": 0.8333333171904087, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9010416716337204, + "rewards/reasoning_steps_reward": 0.008680555794853717, + "rewards/repetition_penalty_reward": -0.008577862317906693, + "rewards/smiles_len_reward": -0.14827177836559713, + "rewards/tag_count_reward": 0.020833333488553762, + "step": 370 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.79687774181366, + "epoch": 0.2976, + "grad_norm": 9209.0556640625, + "kl": 1455.1953125, + "learning_rate": 4.438122617983442e-08, + "loss": 1.6162, + "num_tokens": 26963367.0, + "reward": 0.7700982913374901, + "reward_std": 0.3737562280148268, + "rewards/SMILES_validity_reward": 0.8041666597127914, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9062499962747097, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.009300025478296448, + "rewards/smiles_len_reward": -0.17930139059899375, + "rewards/tag_count_reward": 0.03385416720993817, + "step": 372 + }, + { + "clip_ratio": 0.0, + "completion_length": 55.71614730358124, + "epoch": 0.2992, + "grad_norm": 4210.1923828125, + "kl": 429.9921875, + "learning_rate": 4.219995836319631e-08, + "loss": 0.7209, + "num_tokens": 27083066.0, + "reward": 0.7837943881750107, + "reward_std": 0.3702095244079828, + "rewards/SMILES_validity_reward": 0.8083333298563957, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9192708320915699, + "rewards/reasoning_steps_reward": 0.004340277868323028, + "rewards/repetition_penalty_reward": -0.005971535720163956, + "rewards/smiles_len_reward": -0.0993585159885697, + "rewards/tag_count_reward": 0.022786458488553762, + "step": 374 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.216147780418396, + "epoch": 0.3008, + "grad_norm": 19.845266342163086, + "kl": 30.6083984375, + "learning_rate": 4.006870397306256e-08, + "loss": 0.3621, + "num_tokens": 27204493.0, + "reward": 0.8325573541224003, + "reward_std": 0.2710378540214151, + "rewards/SMILES_validity_reward": 0.8708333186805248, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9166666679084301, + "rewards/reasoning_steps_reward": 0.0052083334885537624, + "rewards/repetition_penalty_reward": -0.008276921271317406, + "rewards/smiles_len_reward": -0.039327892707660794, + "rewards/tag_count_reward": 0.022135416860692203, + "step": 376 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.28906345367432, + "epoch": 0.3024, + "grad_norm": 19.69918441772461, + "kl": 43.9736328125, + "learning_rate": 3.798797596089351e-08, + "loss": 0.5056, + "num_tokens": 27332860.0, + "reward": 0.754853866994381, + "reward_std": 0.37358047830639407, + "rewards/SMILES_validity_reward": 0.7874999921768904, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8984375, + "rewards/reasoning_steps_reward": 0.015625000465661287, + "rewards/repetition_penalty_reward": -0.009878307530016173, + "rewards/smiles_len_reward": -0.18325082340743393, + "rewards/tag_count_reward": 0.018229167151730508, + "step": 378 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.35677254199982, + "epoch": 0.304, + "grad_norm": 60.32386779785156, + "kl": 81.28125, + "learning_rate": 3.5958275117433404e-08, + "loss": 0.4199, + "num_tokens": 27456261.0, + "reward": 0.7353403887245804, + "reward_std": 0.3281521408353001, + "rewards/SMILES_validity_reward": 0.7466145819053054, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9036458358168602, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.008604248070696485, + "rewards/smiles_len_reward": -0.09997228858992457, + "rewards/tag_count_reward": 0.014322916977107525, + "step": 380 + }, + { + "clip_ratio": 0.0, + "completion_length": 56.51562714576721, + "epoch": 0.3056, + "grad_norm": 231.04348754882812, + "kl": 87.3828125, + "learning_rate": 3.398008995217988e-08, + "loss": 0.3055, + "num_tokens": 27576267.0, + "reward": 0.7327682701870799, + "reward_std": 0.3798722317442298, + "rewards/SMILES_validity_reward": 0.7398437494412065, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8984374962747097, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.004630661387636792, + "rewards/smiles_len_reward": -0.07771396718453616, + "rewards/tag_count_reward": 0.027994791395030916, + "step": 382 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.658855676651, + "epoch": 0.3072, + "grad_norm": 1089.19970703125, + "kl": 154.09765625, + "learning_rate": 3.205389657580943e-08, + "loss": 0.4028, + "num_tokens": 27697096.0, + "reward": 0.8069799989461899, + "reward_std": 0.35150486323982477, + "rewards/SMILES_validity_reward": 0.845833320170641, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9114583358168602, + "rewards/reasoning_steps_reward": 0.017361111589707434, + "rewards/repetition_penalty_reward": -0.00833509930089349, + "rewards/smiles_len_reward": -0.11136261757928878, + "rewards/tag_count_reward": 0.0169270834303461, + "step": 384 + }, + { + "clip_ratio": 0.0, + "completion_length": 59.04166889190674, + "epoch": 0.3088, + "grad_norm": 62.683502197265625, + "kl": 65.48046875, + "learning_rate": 3.0180158585586395e-08, + "loss": 0.365, + "num_tokens": 27818072.0, + "reward": 0.7640376538038254, + "reward_std": 0.3756988551467657, + "rewards/SMILES_validity_reward": 0.7916666567325592, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8828125037252903, + "rewards/reasoning_steps_reward": 0.006944444612599909, + "rewards/repetition_penalty_reward": -0.009555418229865609, + "rewards/smiles_len_reward": -0.07120654941536486, + "rewards/tag_count_reward": 0.024088541918899864, + "step": 386 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.63021004199982, + "epoch": 0.3104, + "grad_norm": 13.964454650878906, + "kl": 261.11328125, + "learning_rate": 2.8359326953784735e-08, + "loss": 0.4462, + "num_tokens": 27935434.0, + "reward": 0.7693060860037804, + "reward_std": 0.38393950555473566, + "rewards/SMILES_validity_reward": 0.8125000037252903, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8619791641831398, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.007570940229925327, + "rewards/smiles_len_reward": -0.11252047342713922, + "rewards/tag_count_reward": 0.029296875232830644, + "step": 388 + }, + { + "clip_ratio": 0.0, + "completion_length": 47.25520956516266, + "epoch": 0.312, + "grad_norm": 1176.294677734375, + "kl": 183.84375, + "learning_rate": 2.659183991914696e-08, + "loss": 0.3188, + "num_tokens": 28051884.0, + "reward": 0.7658360535278916, + "reward_std": 0.34651170764118433, + "rewards/SMILES_validity_reward": 0.7898437320254743, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9036458358168602, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.0037111646015546285, + "rewards/smiles_len_reward": -0.10511704150121659, + "rewards/tag_count_reward": 0.016927083604969084, + "step": 390 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.17187738418579, + "epoch": 0.3136, + "grad_norm": 21.213207244873047, + "kl": 44.240234375, + "learning_rate": 2.4878122881409447e-08, + "loss": 0.3338, + "num_tokens": 28177134.0, + "reward": 0.8168588802218437, + "reward_std": 0.33587119466392323, + "rewards/SMILES_validity_reward": 0.8375000096857548, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9296874925494194, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.008981447015685262, + "rewards/smiles_len_reward": 0.0019179416121914983, + "rewards/tag_count_reward": 0.016276042093522847, + "step": 392 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.79948043823242, + "epoch": 0.3152, + "grad_norm": 2544.607666015625, + "kl": 461.828125, + "learning_rate": 2.3218588298916543e-08, + "loss": 0.7863, + "num_tokens": 28298017.0, + "reward": 0.7550071179866791, + "reward_std": 0.4074345175176859, + "rewards/SMILES_validity_reward": 0.7833333276212215, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8802083358168602, + "rewards/reasoning_steps_reward": 0.017361111589707434, + "rewards/repetition_penalty_reward": -0.006937599915545434, + "rewards/smiles_len_reward": -0.10579633654560894, + "rewards/tag_count_reward": 0.021484375116415322, + "step": 394 + }, + { + "clip_ratio": 0.0, + "completion_length": 64.85156464576721, + "epoch": 0.3168, + "grad_norm": 19.701536178588867, + "kl": 35.046875, + "learning_rate": 2.1613635589349756e-08, + "loss": 0.2979, + "num_tokens": 28421224.0, + "reward": 0.7340408079326153, + "reward_std": 0.4440473485738039, + "rewards/SMILES_validity_reward": 0.7473958246409893, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8984374962747097, + "rewards/reasoning_steps_reward": 0.014756944961845875, + "rewards/repetition_penalty_reward": -0.007214406403363682, + "rewards/smiles_len_reward": -0.1137501149205491, + "rewards/tag_count_reward": 0.019531250116415322, + "step": 396 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.52343833446503, + "epoch": 0.3184, + "grad_norm": 25.242353439331055, + "kl": 39.62890625, + "learning_rate": 2.006365103359614e-08, + "loss": 0.3568, + "num_tokens": 28542769.0, + "reward": 0.7509439922869205, + "reward_std": 0.42264856584370136, + "rewards/SMILES_validity_reward": 0.7708333395421505, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8880208358168602, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.011348883606842719, + "rewards/smiles_len_reward": -0.06319690844975412, + "rewards/tag_count_reward": 0.016276042093522847, + "step": 398 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.77343893051147, + "epoch": 0.32, + "grad_norm": 25.40789222717285, + "kl": 549.107421875, + "learning_rate": 1.8569007682777415e-08, + "loss": 0.8423, + "num_tokens": 28667098.0, + "reward": 0.7792046889662743, + "reward_std": 0.3950358787551522, + "rewards/SMILES_validity_reward": 0.8041666597127914, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9036458283662796, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.012652922217966989, + "rewards/smiles_len_reward": -0.060145003226352856, + "rewards/tag_count_reward": 0.014322916977107525, + "step": 400 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.66145944595337, + "epoch": 0.3216, + "grad_norm": 20.436986923217773, + "kl": 34.26171875, + "learning_rate": 1.713006526846439e-08, + "loss": 0.1911, + "num_tokens": 28781400.0, + "reward": 0.8079907111823559, + "reward_std": 0.3260812449734658, + "rewards/SMILES_validity_reward": 0.8499999716877937, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9088541641831398, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.004972287300915923, + "rewards/smiles_len_reward": -0.12488739204127342, + "rewards/tag_count_reward": 0.03320312505820766, + "step": 402 + }, + { + "clip_ratio": 0.0, + "completion_length": 63.95052218437195, + "epoch": 0.3232, + "grad_norm": 19.446300506591797, + "kl": 46.587890625, + "learning_rate": 1.574717011609633e-08, + "loss": 0.3809, + "num_tokens": 28904261.0, + "reward": 0.7811654321849346, + "reward_std": 0.3506924198009074, + "rewards/SMILES_validity_reward": 0.8124999962747097, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8958333358168602, + "rewards/reasoning_steps_reward": 0.0052083334885537624, + "rewards/repetition_penalty_reward": -0.010433290342916735, + "rewards/smiles_len_reward": -0.08025730540975928, + "rewards/tag_count_reward": 0.022135416860692203, + "step": 404 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.74479413032532, + "epoch": 0.3248, + "grad_norm": 21.497493743896484, + "kl": 61.5185546875, + "learning_rate": 1.4420655061626929e-08, + "loss": 0.4488, + "num_tokens": 29032035.0, + "reward": 0.7920585982501507, + "reward_std": 0.34399935975670815, + "rewards/SMILES_validity_reward": 0.8125000037252903, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9192708395421505, + "rewards/reasoning_steps_reward": 0.0017361111240461469, + "rewards/repetition_penalty_reward": -0.012168998313427437, + "rewards/smiles_len_reward": -0.03317499946570024, + "rewards/tag_count_reward": 0.0188802084303461, + "step": 406 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.755210280418396, + "epoch": 0.3264, + "grad_norm": 40.13300323486328, + "kl": 1958.30078125, + "learning_rate": 1.3150839371417699e-08, + "loss": 2.1492, + "num_tokens": 29152901.0, + "reward": 0.7800212763249874, + "reward_std": 0.3497500689700246, + "rewards/SMILES_validity_reward": 0.7945312596857548, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9296874962747097, + "rewards/reasoning_steps_reward": 0.0052083334885537624, + "rewards/repetition_penalty_reward": -0.007477993072825484, + "rewards/smiles_len_reward": -0.0684822405455634, + "rewards/tag_count_reward": 0.020182292151730508, + "step": 408 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.61718952655792, + "epoch": 0.328, + "grad_norm": 40.98759460449219, + "kl": 50.23828125, + "learning_rate": 1.1938028665396171e-08, + "loss": 0.3593, + "num_tokens": 29271026.0, + "reward": 0.7865180224180222, + "reward_std": 0.3510497361421585, + "rewards/SMILES_validity_reward": 0.8124999925494194, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9088541641831398, + "rewards/reasoning_steps_reward": 0.0052083334885537624, + "rewards/repetition_penalty_reward": -0.007369535931502469, + "rewards/smiles_len_reward": -0.062347470491658896, + "rewards/tag_count_reward": 0.015625000291038305, + "step": 410 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.83333456516266, + "epoch": 0.3296, + "grad_norm": 16.528141021728516, + "kl": 21.453125, + "learning_rate": 1.0782514843499652e-08, + "loss": 0.2404, + "num_tokens": 29389234.0, + "reward": 0.7603043857961893, + "reward_std": 0.23452748105773935, + "rewards/SMILES_validity_reward": 0.7671874971129, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9427083283662796, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.0036121224620728754, + "rewards/smiles_len_reward": -0.10610569885466248, + "rewards/tag_count_reward": 0.014322916918899864, + "step": 412 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.78125202655792, + "epoch": 0.3312, + "grad_norm": 59.89474868774414, + "kl": 48.08984375, + "learning_rate": 9.684576015420275e-09, + "loss": 0.3184, + "num_tokens": 29510110.0, + "reward": 0.7541577331721783, + "reward_std": 0.4041676054475829, + "rewards/SMILES_validity_reward": 0.7833333387970924, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8776041753590107, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.011318007542286068, + "rewards/smiles_len_reward": -0.09189758577849716, + "rewards/tag_count_reward": 0.018229166802484542, + "step": 414 + }, + { + "clip_ratio": 0.0, + "completion_length": 43.156251668930054, + "epoch": 0.3328, + "grad_norm": 157.40074157714844, + "kl": 75.53125, + "learning_rate": 8.644476433669529e-09, + "loss": 0.3375, + "num_tokens": 29624986.0, + "reward": 0.7917684130370617, + "reward_std": 0.36183687672019005, + "rewards/SMILES_validity_reward": 0.829166654497385, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8958333320915699, + "rewards/reasoning_steps_reward": 0.0052083334885537624, + "rewards/repetition_penalty_reward": -0.0043341358250472695, + "rewards/smiles_len_reward": -0.09569119266234338, + "rewards/tag_count_reward": 0.020833333546761423, + "step": 416 + }, + { + "clip_ratio": 0.0, + "completion_length": 64.85416805744171, + "epoch": 0.3344, + "grad_norm": 617.4863891601562, + "kl": 177.0859375, + "learning_rate": 7.662466429977698e-09, + "loss": 0.4221, + "num_tokens": 29748194.0, + "reward": 0.7439644634723663, + "reward_std": 0.3548463308252394, + "rewards/SMILES_validity_reward": 0.7591145820915699, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9062499888241291, + "rewards/reasoning_steps_reward": 0.0052083334885537624, + "rewards/repetition_penalty_reward": -0.00889256418668083, + "rewards/smiles_len_reward": -0.11266191606409848, + "rewards/tag_count_reward": 0.02343749994179234, + "step": 418 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.27344048023224, + "epoch": 0.336, + "grad_norm": 14.72851276397705, + "kl": 57.1376953125, + "learning_rate": 6.738782355044048e-09, + "loss": 0.3434, + "num_tokens": 29873867.0, + "reward": 0.7687530927360058, + "reward_std": 0.3611881284014089, + "rewards/SMILES_validity_reward": 0.7999999970197678, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8984375, + "rewards/reasoning_steps_reward": 0.006076389050576836, + "rewards/repetition_penalty_reward": -0.008875246570823947, + "rewards/smiles_len_reward": -0.12386404629796743, + "rewards/tag_count_reward": 0.018880208488553762, + "step": 420 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.51302433013916, + "epoch": 0.3376, + "grad_norm": 463.06427001953125, + "kl": 152.01171875, + "learning_rate": 5.8736465216517594e-09, + "loss": 0.4927, + "num_tokens": 29998864.0, + "reward": 0.7758666761219501, + "reward_std": 0.35622790618799627, + "rewards/SMILES_validity_reward": 0.8083333298563957, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8906250037252903, + "rewards/reasoning_steps_reward": 0.013020833721384406, + "rewards/repetition_penalty_reward": -0.013086474158626515, + "rewards/smiles_len_reward": -0.08645110134966671, + "rewards/tag_count_reward": 0.014973958546761423, + "step": 422 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.4921897649765, + "epoch": 0.3392, + "grad_norm": 89.55314636230469, + "kl": 86.578125, + "learning_rate": 5.067267151161514e-09, + "loss": 0.4928, + "num_tokens": 30128461.0, + "reward": 0.7535761334002018, + "reward_std": 0.40810330770909786, + "rewards/SMILES_validity_reward": 0.7750000096857548, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9010416604578495, + "rewards/reasoning_steps_reward": 0.018229166977107525, + "rewards/repetition_penalty_reward": -0.011182753019966185, + "rewards/smiles_len_reward": -0.11959353811107576, + "rewards/tag_count_reward": 0.020182291802484542, + "step": 424 + }, + { + "clip_ratio": 0.0, + "completion_length": 50.244792461395264, + "epoch": 0.3408, + "grad_norm": 14.1329345703125, + "kl": 39.599609375, + "learning_rate": 4.319838323396691e-09, + "loss": 0.2317, + "num_tokens": 30246059.0, + "reward": 0.7920440249145031, + "reward_std": 0.31801472790539265, + "rewards/SMILES_validity_reward": 0.8208333402872086, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9088541753590107, + "rewards/reasoning_steps_reward": 0.0052083334885537624, + "rewards/repetition_penalty_reward": -0.007406066724797711, + "rewards/smiles_len_reward": -0.06798831513151526, + "rewards/tag_count_reward": 0.018229166511446238, + "step": 426 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.59375178813934, + "epoch": 0.3424, + "grad_norm": 29.327720642089844, + "kl": 41.130859375, + "learning_rate": 3.631539929932148e-09, + "loss": 0.1712, + "num_tokens": 30366863.0, + "reward": 0.7371162544004619, + "reward_std": 0.37827688455581665, + "rewards/SMILES_validity_reward": 0.7453124960884452, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9088541604578495, + "rewards/reasoning_steps_reward": 0.006076389050576836, + "rewards/repetition_penalty_reward": -0.0067941894121759105, + "rewards/smiles_len_reward": -0.08619371574604884, + "rewards/tag_count_reward": 0.014322916977107525, + "step": 428 + }, + { + "clip_ratio": 0.0, + "completion_length": 50.015626192092896, + "epoch": 0.344, + "grad_norm": 26.84274673461914, + "kl": 2524.26171875, + "learning_rate": 3.002537630797747e-09, + "loss": 2.7493, + "num_tokens": 30484373.0, + "reward": 0.7964664585888386, + "reward_std": 0.3583722086623311, + "rewards/SMILES_validity_reward": 0.8333333171904087, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9062500037252903, + "rewards/reasoning_steps_reward": 0.004340277868323028, + "rewards/repetition_penalty_reward": -0.006158172767754877, + "rewards/smiles_len_reward": -0.10187815490644425, + "rewards/tag_count_reward": 0.0162760415696539, + "step": 430 + }, + { + "clip_ratio": 0.0, + "completion_length": 43.679688572883606, + "epoch": 0.3456, + "grad_norm": 57.18663787841797, + "kl": 89.0, + "learning_rate": 2.4329828146074096e-09, + "loss": 0.2609, + "num_tokens": 30599450.0, + "reward": 0.7566449083387852, + "reward_std": 0.41301782708615065, + "rewards/SMILES_validity_reward": 0.7749999966472387, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9036458320915699, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": -0.005343449403881095, + "rewards/smiles_len_reward": -0.08693266217596829, + "rewards/tag_count_reward": 0.0227864584303461, + "step": 432 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.11458432674408, + "epoch": 0.3472, + "grad_norm": 154.4663848876953, + "kl": 46.12890625, + "learning_rate": 1.9230125621225725e-09, + "loss": 0.2244, + "num_tokens": 30716614.0, + "reward": 0.7455132007598877, + "reward_std": 0.32910655019804835, + "rewards/SMILES_validity_reward": 0.7690104194916785, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8854166679084301, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.005490455321705667, + "rewards/smiles_len_reward": -0.09758184582460672, + "rewards/tag_count_reward": 0.011067708488553762, + "step": 434 + }, + { + "clip_ratio": 0.0, + "completion_length": 50.18229281902313, + "epoch": 0.3488, + "grad_norm": 22.847536087036133, + "kl": 50.4775390625, + "learning_rate": 1.4727496132596605e-09, + "loss": 0.2611, + "num_tokens": 30834188.0, + "reward": 0.7400539442896843, + "reward_std": 0.3555171948391944, + "rewards/SMILES_validity_reward": 0.7494791620410979, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9140624962747097, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.00538063397834776, + "rewards/smiles_len_reward": -0.10606034100055695, + "rewards/tag_count_reward": 0.015625000116415322, + "step": 436 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.55468928813934, + "epoch": 0.3504, + "grad_norm": 204.87474060058594, + "kl": 68.55859375, + "learning_rate": 1.0823023375489126e-09, + "loss": 0.2751, + "num_tokens": 30951521.0, + "reward": 0.7930208593606949, + "reward_std": 0.3214792348444462, + "rewards/SMILES_validity_reward": 0.8333333283662796, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9062500074505806, + "rewards/reasoning_steps_reward": 0.004340277810115367, + "rewards/repetition_penalty_reward": -0.005328564649971668, + "rewards/smiles_len_reward": -0.14823145783157088, + "rewards/tag_count_reward": 0.027343750291038305, + "step": 438 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.58333492279053, + "epoch": 0.352, + "grad_norm": 53.10478591918945, + "kl": 57.669921875, + "learning_rate": 7.51764708051994e-10, + "loss": 0.3782, + "num_tokens": 31073089.0, + "reward": 0.7825033217668533, + "reward_std": 0.3621302582323551, + "rewards/SMILES_validity_reward": 0.8208333365619183, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8906250037252903, + "rewards/reasoning_steps_reward": 0.0052083334885537624, + "rewards/repetition_penalty_reward": -0.008823041369396378, + "rewards/smiles_len_reward": -0.10533744562417269, + "rewards/tag_count_reward": 0.016276041977107525, + "step": 440 + }, + { + "clip_ratio": 0.0, + "completion_length": 42.38020920753479, + "epoch": 0.3536, + "grad_norm": 537.83251953125, + "kl": 135.3125, + "learning_rate": 4.812162787445062e-10, + "loss": 0.3406, + "num_tokens": 31187667.0, + "reward": 0.7788260616362095, + "reward_std": 0.3945662109181285, + "rewards/SMILES_validity_reward": 0.8125, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.8958333320915699, + "rewards/reasoning_steps_reward": 0.008680555794853717, + "rewards/repetition_penalty_reward": -0.004561349313007668, + "rewards/smiles_len_reward": -0.10648488637525588, + "rewards/tag_count_reward": 0.015625, + "step": 442 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.73437654972076, + "epoch": 0.3552, + "grad_norm": 31.371912002563477, + "kl": 57.451171875, + "learning_rate": 2.707221653688585e-10, + "loss": 0.5198, + "num_tokens": 31318125.0, + "reward": 0.8033189512789249, + "reward_std": 0.3403410839382559, + "rewards/SMILES_validity_reward": 0.8458333276212215, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9010416641831398, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.009130334568908438, + "rewards/smiles_len_reward": -0.12135331379249692, + "rewards/tag_count_reward": 0.02929687494179234, + "step": 444 + }, + { + "clip_ratio": 0.0, + "completion_length": 52.46354329586029, + "epoch": 0.3568, + "grad_norm": 19.290857315063477, + "kl": 40.796875, + "learning_rate": 1.203330297622207e-10, + "loss": 0.2786, + "num_tokens": 31436575.0, + "reward": 0.7805712670087814, + "reward_std": 0.4029743252322078, + "rewards/SMILES_validity_reward": 0.8000000007450581, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9114583358168602, + "rewards/reasoning_steps_reward": 0.006944444612599909, + "rewards/repetition_penalty_reward": -0.004496393317822367, + "rewards/smiles_len_reward": -0.04478339894558303, + "rewards/tag_count_reward": 0.013671875174622983, + "step": 446 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.22916829586029, + "epoch": 0.3584, + "grad_norm": 32.02495193481445, + "kl": 48.533203125, + "learning_rate": 3.008506766313812e-11, + "loss": 0.3244, + "num_tokens": 31558007.0, + "reward": 0.80455506965518, + "reward_std": 0.32425580685958266, + "rewards/SMILES_validity_reward": 0.8416666425764561, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9010416679084301, + "rewards/reasoning_steps_reward": 0.018229166977107525, + "rewards/repetition_penalty_reward": -0.007674921703255677, + "rewards/smiles_len_reward": -0.08323407603893429, + "rewards/tag_count_reward": 0.02343750005820766, + "step": 448 + }, + { + "clip_ratio": 0.0, + "completion_length": 55.799480676651, + "epoch": 0.36, + "grad_norm": 16.788972854614258, + "kl": 34.279296875, + "learning_rate": 0.0, + "loss": 0.353, + "num_tokens": 31677738.0, + "reward": 0.7685777321457863, + "reward_std": 0.3936071125790477, + "rewards/SMILES_validity_reward": 0.7916666679084301, + "rewards/cosine_scaled_reward": -0.4999987781047821, + "rewards/format_reward": 0.9088541679084301, + "rewards/reasoning_steps_reward": 0.007812500232830644, + "rewards/repetition_penalty_reward": -0.0068950422428315505, + "rewards/smiles_len_reward": -0.09313616005238146, + "rewards/tag_count_reward": 0.009765625058207661, + "step": 450 + }, + { + "epoch": 0.36, + "step": 450, + "total_flos": 0.0, + "train_loss": 0.6825665244791242, + "train_runtime": 21029.9945, + "train_samples_per_second": 4.108, + "train_steps_per_second": 0.021 + } + ], + "logging_steps": 2, + "max_steps": 450, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 25, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}