| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.22857142857142856, | |
| "eval_steps": 500, | |
| "global_step": 200, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2253.854206085205, | |
| "epoch": 0.001142857142857143, | |
| "grad_norm": 0.029817435890436172, | |
| "kl": 0.0, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 0.0, | |
| "loss": -0.048, | |
| "reward": 0.18865508306771517, | |
| "reward_after_mean": 0.18865508306771517, | |
| "reward_after_std": 0.5825161132961512, | |
| "reward_before_mean": 0.5353203006088734, | |
| "reward_before_std": 0.5411310354247689, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3466652315109968, | |
| "reward_change_min": -0.5511383786797523, | |
| "reward_change_std": 0.21760745346546173, | |
| "reward_std": 0.5825161281973124, | |
| "rewards/accuracy_reward": 0.37500000931322575, | |
| "rewards/cosine_scaled_reward": 0.16032031644135714, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2566.395854949951, | |
| "epoch": 0.002285714285714286, | |
| "grad_norm": 0.025483140721917152, | |
| "kl": 0.0, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5e-08, | |
| "loss": 0.0336, | |
| "reward": 0.19053915701806545, | |
| "reward_after_mean": 0.19053915701806545, | |
| "reward_after_std": 0.5598375909030437, | |
| "reward_before_mean": 0.5439198296517134, | |
| "reward_before_std": 0.5335724893957376, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3533806595951319, | |
| "reward_change_min": -0.5397481862455606, | |
| "reward_change_std": 0.22024841140955687, | |
| "reward_std": 0.5598376058042049, | |
| "rewards/accuracy_reward": 0.41666667722165585, | |
| "rewards/cosine_scaled_reward": 0.12725313939154148, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2870.9166946411133, | |
| "epoch": 0.0034285714285714284, | |
| "grad_norm": 0.02348862774670124, | |
| "kl": 0.00016453862190246582, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0332, | |
| "reward": -0.11309619061648846, | |
| "reward_after_mean": -0.11309619061648846, | |
| "reward_after_std": 0.4816359058022499, | |
| "reward_before_mean": 0.13815331272780895, | |
| "reward_before_std": 0.4635454909875989, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2512495145201683, | |
| "reward_change_min": -0.4654350485652685, | |
| "reward_change_std": 0.16807015426456928, | |
| "reward_std": 0.4816359244287014, | |
| "rewards/accuracy_reward": 0.1875000074505806, | |
| "rewards/cosine_scaled_reward": -0.04934668634086847, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1454.0625305175781, | |
| "epoch": 0.004571428571428572, | |
| "grad_norm": 0.03703964129090309, | |
| "kl": 9.372830390930176e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.5e-07, | |
| "loss": -0.0271, | |
| "reward": 0.31341979652643204, | |
| "reward_after_mean": 0.31341979652643204, | |
| "reward_after_std": 0.6322482246905565, | |
| "reward_before_mean": 0.7019761502742767, | |
| "reward_before_std": 0.6114191431552172, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.38855636678636074, | |
| "reward_change_min": -0.6810240596532822, | |
| "reward_change_std": 0.26496267691254616, | |
| "reward_std": 0.632248230278492, | |
| "rewards/accuracy_reward": 0.4791666679084301, | |
| "rewards/cosine_scaled_reward": 0.2228094656020403, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3106.2708892822266, | |
| "epoch": 0.005714285714285714, | |
| "grad_norm": 0.02083689533174038, | |
| "kl": 0.00016355514526367188, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2e-07, | |
| "loss": 0.014, | |
| "reward": -0.14295638352632523, | |
| "reward_after_mean": -0.14295638352632523, | |
| "reward_after_std": 0.5368688032031059, | |
| "reward_before_mean": 0.08551615010946989, | |
| "reward_before_std": 0.4862247873097658, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.22847254946827888, | |
| "reward_change_min": -0.34290769696235657, | |
| "reward_change_std": 0.12987546809017658, | |
| "reward_std": 0.5368688274174929, | |
| "rewards/accuracy_reward": 0.16666667349636555, | |
| "rewards/cosine_scaled_reward": -0.08115051127970219, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2359.7083587646484, | |
| "epoch": 0.006857142857142857, | |
| "grad_norm": 0.026594510301947594, | |
| "kl": 0.00012198090553283691, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.5e-07, | |
| "loss": 0.0042, | |
| "reward": 0.02189142513088882, | |
| "reward_after_mean": 0.02189142513088882, | |
| "reward_after_std": 0.6753048785030842, | |
| "reward_before_mean": 0.2973055485635996, | |
| "reward_before_std": 0.6682278430089355, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2754141204059124, | |
| "reward_change_min": -0.4809851162135601, | |
| "reward_change_std": 0.19098785053938627, | |
| "reward_std": 0.6753048803657293, | |
| "rewards/accuracy_reward": 0.2708333395421505, | |
| "rewards/cosine_scaled_reward": 0.026472217752598226, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2399.3125610351562, | |
| "epoch": 0.008, | |
| "grad_norm": 0.02208767458796501, | |
| "kl": 0.00013466179370880127, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3e-07, | |
| "loss": 0.0248, | |
| "reward": 0.04978923127055168, | |
| "reward_after_mean": 0.04978923127055168, | |
| "reward_after_std": 0.6014937199652195, | |
| "reward_before_mean": 0.3478062404319644, | |
| "reward_before_std": 0.5934015912935138, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.298017006367445, | |
| "reward_change_min": -0.48044517263770103, | |
| "reward_change_std": 0.19282954651862383, | |
| "reward_std": 0.6014937292784452, | |
| "rewards/accuracy_reward": 0.3125000111758709, | |
| "rewards/cosine_scaled_reward": 0.0353062367066741, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1923.0625343322754, | |
| "epoch": 0.009142857142857144, | |
| "grad_norm": 0.025719735771417618, | |
| "kl": 7.99819827079773e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.5e-07, | |
| "loss": 0.0407, | |
| "reward": 0.20353314653038979, | |
| "reward_after_mean": 0.20353314653038979, | |
| "reward_after_std": 0.522263016551733, | |
| "reward_before_mean": 0.5592934358865023, | |
| "reward_before_std": 0.4454885171726346, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3557603023946285, | |
| "reward_change_min": -0.5532414987683296, | |
| "reward_change_std": 0.20976066123694181, | |
| "reward_std": 0.5222630221396685, | |
| "rewards/accuracy_reward": 0.41666667349636555, | |
| "rewards/cosine_scaled_reward": 0.1426267744973302, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2551.104202270508, | |
| "epoch": 0.010285714285714285, | |
| "grad_norm": 0.028185561299324036, | |
| "kl": 0.0001239469274878502, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4e-07, | |
| "loss": -0.0295, | |
| "reward": -0.09345190459862351, | |
| "reward_after_mean": -0.09345190459862351, | |
| "reward_after_std": 0.6783309075981379, | |
| "reward_before_mean": 0.1387091837823391, | |
| "reward_before_std": 0.6670187395066023, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.23216108232736588, | |
| "reward_change_min": -0.4182189740240574, | |
| "reward_change_std": 0.1609387183561921, | |
| "reward_std": 0.6783309262245893, | |
| "rewards/accuracy_reward": 0.18750000558793545, | |
| "rewards/cosine_scaled_reward": -0.04879084415733814, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2372.895881652832, | |
| "epoch": 0.011428571428571429, | |
| "grad_norm": 0.026600031182169914, | |
| "kl": 0.00011247396469116211, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.5e-07, | |
| "loss": 0.0272, | |
| "reward": 0.042636663652956486, | |
| "reward_after_mean": 0.042636663652956486, | |
| "reward_after_std": 0.564329631626606, | |
| "reward_before_mean": 0.340551670640707, | |
| "reward_before_std": 0.5367275485768914, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.29791501350700855, | |
| "reward_change_min": -0.5192943438887596, | |
| "reward_change_std": 0.2046388229355216, | |
| "reward_std": 0.5643296670168638, | |
| "rewards/accuracy_reward": 0.33333333767950535, | |
| "rewards/cosine_scaled_reward": 0.007218348793685436, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3101.125015258789, | |
| "epoch": 0.012571428571428572, | |
| "grad_norm": 0.02012811414897442, | |
| "kl": 0.0001429915428161621, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0255, | |
| "reward": -0.1189196240156889, | |
| "reward_after_mean": -0.1189196240156889, | |
| "reward_after_std": 0.5897521004080772, | |
| "reward_before_mean": 0.12155997939407825, | |
| "reward_before_std": 0.607317803427577, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.24047961458563805, | |
| "reward_change_min": -0.47533175721764565, | |
| "reward_change_std": 0.18331623543053865, | |
| "reward_std": 0.5897521134465933, | |
| "rewards/accuracy_reward": 0.20833333767950535, | |
| "rewards/cosine_scaled_reward": -0.08677334897220135, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1999.7708740234375, | |
| "epoch": 0.013714285714285714, | |
| "grad_norm": 0.030618170276284218, | |
| "kl": 0.00013138353824615479, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.0727, | |
| "reward": -0.11606020852923393, | |
| "reward_after_mean": -0.11606020852923393, | |
| "reward_after_std": 0.45726848393678665, | |
| "reward_before_mean": 0.13688807259313762, | |
| "reward_before_std": 0.4379520956426859, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.25294830463826656, | |
| "reward_change_min": -0.417834984138608, | |
| "reward_change_std": 0.16013498976826668, | |
| "reward_std": 0.4572684969753027, | |
| "rewards/accuracy_reward": 0.18750000558793545, | |
| "rewards/cosine_scaled_reward": -0.05061192624270916, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2364.7916946411133, | |
| "epoch": 0.014857142857142857, | |
| "grad_norm": 0.021956922486424446, | |
| "kl": 0.0001315474510192871, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6e-07, | |
| "loss": -0.0309, | |
| "reward": -0.07681845407932997, | |
| "reward_after_mean": -0.07681845407932997, | |
| "reward_after_std": 0.5858405251055956, | |
| "reward_before_mean": 0.17525275237858295, | |
| "reward_before_std": 0.5871480498462915, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.25207119435071945, | |
| "reward_change_min": -0.4718685280531645, | |
| "reward_change_std": 0.1800235854461789, | |
| "reward_std": 0.5858405511826277, | |
| "rewards/accuracy_reward": 0.20833333767950535, | |
| "rewards/cosine_scaled_reward": -0.03308058716356754, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2517.3750534057617, | |
| "epoch": 0.016, | |
| "grad_norm": 0.02542865462601185, | |
| "kl": 0.0001404285430908203, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.5e-07, | |
| "loss": -0.0269, | |
| "reward": -0.05427968641743064, | |
| "reward_after_mean": -0.05427968641743064, | |
| "reward_after_std": 0.5012820027768612, | |
| "reward_before_mean": 0.21205449337139726, | |
| "reward_before_std": 0.4408043739385903, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2663341574370861, | |
| "reward_change_min": -0.4154788888990879, | |
| "reward_change_std": 0.15815073158591986, | |
| "reward_std": 0.5012820195406675, | |
| "rewards/accuracy_reward": 0.2291666679084301, | |
| "rewards/cosine_scaled_reward": -0.0171121833845973, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2587.333381652832, | |
| "epoch": 0.017142857142857144, | |
| "grad_norm": 0.026293920353055, | |
| "kl": 0.00010640174150466919, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7e-07, | |
| "loss": -0.0171, | |
| "reward": 0.1291836015880108, | |
| "reward_after_mean": 0.1291836015880108, | |
| "reward_after_std": 0.34985754638910294, | |
| "reward_before_mean": 0.48446296714246273, | |
| "reward_before_std": 0.26648052502423525, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.35527935065329075, | |
| "reward_change_min": -0.5008186884224415, | |
| "reward_change_std": 0.19560196995735168, | |
| "reward_std": 0.34985755756497383, | |
| "rewards/accuracy_reward": 0.3333333358168602, | |
| "rewards/cosine_scaled_reward": 0.1511296145617962, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3479.3958740234375, | |
| "epoch": 0.018285714285714287, | |
| "grad_norm": 0.018178097903728485, | |
| "kl": 0.00018554925918579102, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.5e-07, | |
| "loss": 0.0308, | |
| "reward": -0.12316650152206421, | |
| "reward_after_mean": -0.12316650152206421, | |
| "reward_after_std": 0.48174857906997204, | |
| "reward_before_mean": 0.12909814529120922, | |
| "reward_before_std": 0.4807006008923054, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2522646654397249, | |
| "reward_change_min": -0.4256697855889797, | |
| "reward_change_std": 0.17396669182926416, | |
| "reward_std": 0.48174858279526234, | |
| "rewards/accuracy_reward": 0.16666666977107525, | |
| "rewards/cosine_scaled_reward": -0.037568524945527315, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2003.1458587646484, | |
| "epoch": 0.019428571428571427, | |
| "grad_norm": 0.037696439772844315, | |
| "kl": 0.00012348592281341553, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8e-07, | |
| "loss": -0.0628, | |
| "reward": 0.28727056505158544, | |
| "reward_after_mean": 0.28727056505158544, | |
| "reward_after_std": 0.7488968446850777, | |
| "reward_before_mean": 0.6418840168043971, | |
| "reward_before_std": 0.6678282842040062, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3546134736388922, | |
| "reward_change_min": -0.5937347710132599, | |
| "reward_change_std": 0.2221750607714057, | |
| "reward_std": 0.7488968670368195, | |
| "rewards/accuracy_reward": 0.4583333358168602, | |
| "rewards/cosine_scaled_reward": 0.18355069373501465, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2641.895881652832, | |
| "epoch": 0.02057142857142857, | |
| "grad_norm": 0.018719196319580078, | |
| "kl": 0.0001255720853805542, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.499999999999999e-07, | |
| "loss": -0.0322, | |
| "reward": 0.13862532377243042, | |
| "reward_after_mean": 0.13862532377243042, | |
| "reward_after_std": 0.4716039840131998, | |
| "reward_before_mean": 0.48331868555396795, | |
| "reward_before_std": 0.4304672125726938, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3446933813393116, | |
| "reward_change_min": -0.49582854844629765, | |
| "reward_change_std": 0.19730697199702263, | |
| "reward_std": 0.4716039877384901, | |
| "rewards/accuracy_reward": 0.35416667722165585, | |
| "rewards/cosine_scaled_reward": 0.1291519934311509, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2432.6667251586914, | |
| "epoch": 0.021714285714285714, | |
| "grad_norm": 0.024000316858291626, | |
| "kl": 0.00012255460023880005, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9e-07, | |
| "loss": 0.0587, | |
| "reward": 0.22441758587956429, | |
| "reward_after_mean": 0.22441758587956429, | |
| "reward_after_std": 0.7845676727592945, | |
| "reward_before_mean": 0.556532722664997, | |
| "reward_before_std": 0.759381739422679, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3321151062846184, | |
| "reward_change_min": -0.6048189178109169, | |
| "reward_change_std": 0.2319869976490736, | |
| "reward_std": 0.7845676802098751, | |
| "rewards/accuracy_reward": 0.4166666753590107, | |
| "rewards/cosine_scaled_reward": 0.13986602576915175, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1485.7917366027832, | |
| "epoch": 0.022857142857142857, | |
| "grad_norm": 0.033277831971645355, | |
| "kl": 6.410479545593262e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.499999999999999e-07, | |
| "loss": 0.0805, | |
| "reward": 0.27623686753213406, | |
| "reward_after_mean": 0.27623686753213406, | |
| "reward_after_std": 0.5876359883695841, | |
| "reward_before_mean": 0.649633388966322, | |
| "reward_before_std": 0.5021514918189496, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.37339654564857483, | |
| "reward_change_min": -0.5447837132960558, | |
| "reward_change_std": 0.21536927483975887, | |
| "reward_std": 0.5876360051333904, | |
| "rewards/accuracy_reward": 0.4583333395421505, | |
| "rewards/cosine_scaled_reward": 0.19130004616454244, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2453.8958587646484, | |
| "epoch": 0.024, | |
| "grad_norm": 0.03172062337398529, | |
| "kl": 0.00014081597328186035, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0158, | |
| "reward": 0.054135403130203485, | |
| "reward_after_mean": 0.054135403130203485, | |
| "reward_after_std": 0.673911839723587, | |
| "reward_before_mean": 0.3450733758509159, | |
| "reward_before_std": 0.6906127110123634, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2909379918128252, | |
| "reward_change_min": -0.5521223619580269, | |
| "reward_change_std": 0.21271574683487415, | |
| "reward_std": 0.6739118546247482, | |
| "rewards/accuracy_reward": 0.29166667349636555, | |
| "rewards/cosine_scaled_reward": 0.05340671516023576, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1199.6458587646484, | |
| "epoch": 0.025142857142857144, | |
| "grad_norm": 0.03503501042723656, | |
| "kl": 0.00010113418102264404, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.99931462820376e-07, | |
| "loss": 0.0149, | |
| "reward": -0.011358192190527916, | |
| "reward_after_mean": -0.011358192190527916, | |
| "reward_after_std": 0.42460223753005266, | |
| "reward_before_mean": 0.2760426625609398, | |
| "reward_before_std": 0.30881113989744335, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.28740084543824196, | |
| "reward_change_min": -0.3831062186509371, | |
| "reward_change_std": 0.14771532081067562, | |
| "reward_std": 0.42460225615650415, | |
| "rewards/accuracy_reward": 0.2916666679084301, | |
| "rewards/cosine_scaled_reward": -0.015624009916791692, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2435.3542251586914, | |
| "epoch": 0.026285714285714287, | |
| "grad_norm": 0.024877028539776802, | |
| "kl": 0.00011622905731201172, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.997258721585931e-07, | |
| "loss": 0.0085, | |
| "reward": 0.01893002726137638, | |
| "reward_after_mean": 0.01893002726137638, | |
| "reward_after_std": 0.7557151317596436, | |
| "reward_before_mean": 0.28602612018585205, | |
| "reward_before_std": 0.779970521107316, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2670961171388626, | |
| "reward_change_min": -0.5823409277945757, | |
| "reward_change_std": 0.21948671340942383, | |
| "reward_std": 0.7557151485234499, | |
| "rewards/accuracy_reward": 0.29166667349636555, | |
| "rewards/cosine_scaled_reward": -0.005640537710860372, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1879.1666870117188, | |
| "epoch": 0.027428571428571427, | |
| "grad_norm": 0.025321708992123604, | |
| "kl": 7.653236389160156e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.993832906395582e-07, | |
| "loss": 0.0233, | |
| "reward": 0.27044946048408747, | |
| "reward_after_mean": 0.27044946048408747, | |
| "reward_after_std": 0.6552858538925648, | |
| "reward_before_mean": 0.6367096854373813, | |
| "reward_before_std": 0.612536040134728, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3662601951509714, | |
| "reward_change_min": -0.5752917006611824, | |
| "reward_change_std": 0.22535593062639236, | |
| "reward_std": 0.6552858743816614, | |
| "rewards/accuracy_reward": 0.4166666753590107, | |
| "rewards/cosine_scaled_reward": 0.22004299331456423, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2328.708366394043, | |
| "epoch": 0.02857142857142857, | |
| "grad_norm": 0.022590631619095802, | |
| "kl": 0.00014277477748692036, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.989038226169207e-07, | |
| "loss": 0.0379, | |
| "reward": 0.15500614792108536, | |
| "reward_after_mean": 0.15500614792108536, | |
| "reward_after_std": 0.46420795284211636, | |
| "reward_before_mean": 0.5061908392235637, | |
| "reward_before_std": 0.4215713571757078, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.351184718310833, | |
| "reward_change_min": -0.5469736345112324, | |
| "reward_change_std": 0.21114493068307638, | |
| "reward_std": 0.46420796401798725, | |
| "rewards/accuracy_reward": 0.3541666753590107, | |
| "rewards/cosine_scaled_reward": 0.15202417224645615, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2400.645851135254, | |
| "epoch": 0.029714285714285714, | |
| "grad_norm": 0.023803971707820892, | |
| "kl": 0.00012956559658050537, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.982876141412855e-07, | |
| "loss": -0.0398, | |
| "reward": -0.30033721402287483, | |
| "reward_after_mean": -0.30033721402287483, | |
| "reward_after_std": 0.40887343883514404, | |
| "reward_before_mean": -0.10959341688430868, | |
| "reward_before_std": 0.38057865016162395, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.19074379839003086, | |
| "reward_change_min": -0.3431377075612545, | |
| "reward_change_std": 0.12020129058510065, | |
| "reward_std": 0.40887344256043434, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/cosine_scaled_reward": -0.17209341190755367, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2402.500068664551, | |
| "epoch": 0.030857142857142857, | |
| "grad_norm": 0.027803365141153336, | |
| "kl": 0.00014105439186096191, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.975348529157229e-07, | |
| "loss": 0.0521, | |
| "reward": 0.012697070837020874, | |
| "reward_after_mean": 0.012697070837020874, | |
| "reward_after_std": 0.4288240037858486, | |
| "reward_before_mean": 0.3213062509894371, | |
| "reward_before_std": 0.4260330041870475, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.30860918015241623, | |
| "reward_change_min": -0.4718206785619259, | |
| "reward_change_std": 0.19340202026069164, | |
| "reward_std": 0.42882401682436466, | |
| "rewards/accuracy_reward": 0.2708333432674408, | |
| "rewards/cosine_scaled_reward": 0.05047290958464146, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2371.5000343322754, | |
| "epoch": 0.032, | |
| "grad_norm": 0.03396248817443848, | |
| "kl": 0.00012034177780151367, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.96645768238595e-07, | |
| "loss": -0.0088, | |
| "reward": 0.34543062816374004, | |
| "reward_after_mean": 0.34543062816374004, | |
| "reward_after_std": 0.8179215285927057, | |
| "reward_before_mean": 0.7244215086102486, | |
| "reward_before_std": 0.8351087644696236, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.37899088114500046, | |
| "reward_change_min": -0.6348299775272608, | |
| "reward_change_std": 0.2599595533683896, | |
| "reward_std": 0.8179215602576733, | |
| "rewards/accuracy_reward": 0.5000000186264515, | |
| "rewards/cosine_scaled_reward": 0.22442149464040995, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2970.8333740234375, | |
| "epoch": 0.03314285714285714, | |
| "grad_norm": 0.020854219794273376, | |
| "kl": 0.00018525123596191406, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.956206309337066e-07, | |
| "loss": -0.0473, | |
| "reward": -0.1481495127081871, | |
| "reward_after_mean": -0.1481495127081871, | |
| "reward_after_std": 0.31386037822812796, | |
| "reward_before_mean": 0.1116566862910986, | |
| "reward_before_std": 0.26476416178047657, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2598061878234148, | |
| "reward_change_min": -0.39690806716680527, | |
| "reward_change_std": 0.15259934635832906, | |
| "reward_std": 0.31386038288474083, | |
| "rewards/accuracy_reward": 0.1875, | |
| "rewards/cosine_scaled_reward": -0.07584333047270775, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2383.6875762939453, | |
| "epoch": 0.03428571428571429, | |
| "grad_norm": 0.02130362018942833, | |
| "kl": 0.00012382864952087402, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.944597532678119e-07, | |
| "loss": -0.0524, | |
| "reward": 0.41686034575104713, | |
| "reward_after_mean": 0.41686034575104713, | |
| "reward_after_std": 0.7474758476018906, | |
| "reward_before_mean": 0.8319165632128716, | |
| "reward_before_std": 0.75706597417593, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.4150562435388565, | |
| "reward_change_min": -0.7054598368704319, | |
| "reward_change_std": 0.2837554384022951, | |
| "reward_std": 0.7474758699536324, | |
| "rewards/accuracy_reward": 0.5416666809469461, | |
| "rewards/cosine_scaled_reward": 0.29024988505989313, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2900.750045776367, | |
| "epoch": 0.03542857142857143, | |
| "grad_norm": 0.02519163116812706, | |
| "kl": 0.00015944242477416992, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.931634888554935e-07, | |
| "loss": -0.0202, | |
| "reward": -0.03954649716615677, | |
| "reward_after_mean": -0.03954649716615677, | |
| "reward_after_std": 0.4638585727661848, | |
| "reward_before_mean": 0.24700810015201569, | |
| "reward_before_std": 0.4689239803701639, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2865545880049467, | |
| "reward_change_min": -0.48880807869136333, | |
| "reward_change_std": 0.19339969009160995, | |
| "reward_std": 0.4638585839420557, | |
| "rewards/accuracy_reward": 0.2291666716337204, | |
| "rewards/cosine_scaled_reward": 0.017841406166553497, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2334.791717529297, | |
| "epoch": 0.036571428571428574, | |
| "grad_norm": 0.023573867976665497, | |
| "kl": 0.00011053681373596191, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.917322325514487e-07, | |
| "loss": -0.0054, | |
| "reward": 0.13868720829486847, | |
| "reward_after_mean": 0.13868720829486847, | |
| "reward_after_std": 0.612119173631072, | |
| "reward_before_mean": 0.4695329191163182, | |
| "reward_before_std": 0.6050059096887708, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3308457229286432, | |
| "reward_change_min": -0.5487178079783916, | |
| "reward_change_std": 0.22997961565852165, | |
| "reward_std": 0.612119173631072, | |
| "rewards/accuracy_reward": 0.37500000558793545, | |
| "rewards/cosine_scaled_reward": 0.09453292191028595, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2833.729217529297, | |
| "epoch": 0.037714285714285714, | |
| "grad_norm": 0.022108081728219986, | |
| "kl": 0.00012211501598358154, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.901664203302124e-07, | |
| "loss": 0.0161, | |
| "reward": 0.07408714387565851, | |
| "reward_after_mean": 0.07408714387565851, | |
| "reward_after_std": 0.6689751651138067, | |
| "reward_before_mean": 0.37650261726230383, | |
| "reward_before_std": 0.7002740390598774, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.30241546407341957, | |
| "reward_change_min": -0.5437397100031376, | |
| "reward_change_std": 0.22630772832781076, | |
| "reward_std": 0.6689751725643873, | |
| "rewards/accuracy_reward": 0.31250000558793545, | |
| "rewards/cosine_scaled_reward": 0.0640026107430458, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1962.7500381469727, | |
| "epoch": 0.038857142857142854, | |
| "grad_norm": 0.02836132049560547, | |
| "kl": 0.0001449286937713623, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.88466529153356e-07, | |
| "loss": 0.0147, | |
| "reward": 0.4290049262344837, | |
| "reward_after_mean": 0.4290049262344837, | |
| "reward_after_std": 0.5306954458355904, | |
| "reward_before_mean": 0.8744035568088293, | |
| "reward_before_std": 0.46768255438655615, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.44539863243699074, | |
| "reward_change_min": -0.6339141335338354, | |
| "reward_change_std": 0.2669675601646304, | |
| "reward_std": 0.5306954644620419, | |
| "rewards/accuracy_reward": 0.562500013038516, | |
| "rewards/cosine_scaled_reward": 0.31190355867147446, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2638.770896911621, | |
| "epoch": 0.04, | |
| "grad_norm": 0.03418930992484093, | |
| "kl": 0.00013949722051620483, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.866330768241983e-07, | |
| "loss": 0.0269, | |
| "reward": 0.20577108953148127, | |
| "reward_after_mean": 0.20577108953148127, | |
| "reward_after_std": 0.6861968599259853, | |
| "reward_before_mean": 0.5486643612384796, | |
| "reward_before_std": 0.6807287614792585, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3428932707756758, | |
| "reward_change_min": -0.5318975541740656, | |
| "reward_change_std": 0.22158884536474943, | |
| "reward_std": 0.6861968897283077, | |
| "rewards/accuracy_reward": 0.4166666828095913, | |
| "rewards/cosine_scaled_reward": 0.13199766166508198, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3177.8958892822266, | |
| "epoch": 0.04114285714285714, | |
| "grad_norm": 0.023622367531061172, | |
| "kl": 0.0001952648162841797, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.846666218300807e-07, | |
| "loss": 0.0101, | |
| "reward": -0.35834434535354376, | |
| "reward_after_mean": -0.35834434535354376, | |
| "reward_after_std": 0.40897000953555107, | |
| "reward_before_mean": -0.1906146677210927, | |
| "reward_before_std": 0.37281213887035847, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.16772967763245106, | |
| "reward_change_min": -0.2771889641880989, | |
| "reward_change_std": 0.09608037583529949, | |
| "reward_std": 0.40897002816200256, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/cosine_scaled_reward": -0.23228134028613567, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2871.0208892822266, | |
| "epoch": 0.04228571428571429, | |
| "grad_norm": 0.02082606963813305, | |
| "kl": 0.00013563036918640137, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.825677631722435e-07, | |
| "loss": 0.0088, | |
| "reward": -0.22048189118504524, | |
| "reward_after_mean": -0.22048189118504524, | |
| "reward_after_std": 0.41180545277893543, | |
| "reward_before_mean": 0.0031149107962846756, | |
| "reward_before_std": 0.4039106909185648, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.22359680384397507, | |
| "reward_change_min": -0.3989143669605255, | |
| "reward_change_std": 0.15272453986108303, | |
| "reward_std": 0.41180546395480633, | |
| "rewards/accuracy_reward": 0.125, | |
| "rewards/cosine_scaled_reward": -0.12188508547842503, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3201.9375228881836, | |
| "epoch": 0.04342857142857143, | |
| "grad_norm": 0.01921209692955017, | |
| "kl": 0.00017651915550231934, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.80337140183366e-07, | |
| "loss": 0.0277, | |
| "reward": -0.09033865109086037, | |
| "reward_after_mean": -0.09033865109086037, | |
| "reward_after_std": 0.3066476993262768, | |
| "reward_before_mean": 0.19127687066793442, | |
| "reward_before_std": 0.2528393566608429, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.28161551244556904, | |
| "reward_change_min": -0.4111398421227932, | |
| "reward_change_std": 0.15858831629157066, | |
| "reward_std": 0.3066477105021477, | |
| "rewards/accuracy_reward": 0.2083333358168602, | |
| "rewards/cosine_scaled_reward": -0.01705646887421608, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2304.458366394043, | |
| "epoch": 0.044571428571428574, | |
| "grad_norm": 0.02351287379860878, | |
| "kl": 0.00011439248919487, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.779754323328192e-07, | |
| "loss": -0.0053, | |
| "reward": 0.05605981033295393, | |
| "reward_after_mean": 0.05605981033295393, | |
| "reward_after_std": 0.46067033149302006, | |
| "reward_before_mean": 0.36542993783950806, | |
| "reward_before_std": 0.3694801307283342, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.30937014520168304, | |
| "reward_change_min": -0.47338311932981014, | |
| "reward_change_std": 0.17591516766697168, | |
| "reward_std": 0.4606703519821167, | |
| "rewards/accuracy_reward": 0.31250000186264515, | |
| "rewards/cosine_scaled_reward": 0.05292995506897569, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2159.895881652832, | |
| "epoch": 0.045714285714285714, | |
| "grad_norm": 0.026050910353660583, | |
| "kl": 0.0001217871904373169, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.754833590196926e-07, | |
| "loss": 0.0642, | |
| "reward": 0.002425914630293846, | |
| "reward_after_mean": 0.002425914630293846, | |
| "reward_after_std": 0.5221548210829496, | |
| "reward_before_mean": 0.289883803576231, | |
| "reward_before_std": 0.4837529417127371, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2874579019844532, | |
| "reward_change_min": -0.47877912409603596, | |
| "reward_change_std": 0.18734649941325188, | |
| "reward_std": 0.5221548229455948, | |
| "rewards/accuracy_reward": 0.27083333395421505, | |
| "rewards/cosine_scaled_reward": 0.019050464034080505, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2697.1875076293945, | |
| "epoch": 0.046857142857142854, | |
| "grad_norm": 0.02190599963068962, | |
| "kl": 0.00014284253120422363, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.728616793536587e-07, | |
| "loss": 0.0291, | |
| "reward": 0.08761137025430799, | |
| "reward_after_mean": 0.08761137025430799, | |
| "reward_after_std": 0.6340927109122276, | |
| "reward_before_mean": 0.39676812407560647, | |
| "reward_before_std": 0.6426020693033934, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.30915676802396774, | |
| "reward_change_min": -0.49815394170582294, | |
| "reward_change_std": 0.20790001936256886, | |
| "reward_std": 0.634092727676034, | |
| "rewards/accuracy_reward": 0.3125000111758709, | |
| "rewards/cosine_scaled_reward": 0.08426813036203384, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2502.229223251343, | |
| "epoch": 0.048, | |
| "grad_norm": 0.04056044667959213, | |
| "kl": 0.00016070902347564697, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.701111919237408e-07, | |
| "loss": 0.0258, | |
| "reward": -0.2749686185270548, | |
| "reward_after_mean": -0.2749686185270548, | |
| "reward_after_std": 0.3445035833865404, | |
| "reward_before_mean": -0.06608736759517342, | |
| "reward_before_std": 0.3068845123052597, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.20888124220073223, | |
| "reward_change_min": -0.3034993875771761, | |
| "reward_change_std": 0.11960937362164259, | |
| "reward_std": 0.3445035871118307, | |
| "rewards/accuracy_reward": 0.12500000558793545, | |
| "rewards/cosine_scaled_reward": -0.1910873781889677, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2652.625030517578, | |
| "epoch": 0.04914285714285714, | |
| "grad_norm": 0.020540975034236908, | |
| "kl": 0.00012725219130516052, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.672327345550543e-07, | |
| "loss": 0.0065, | |
| "reward": -0.0786220645532012, | |
| "reward_after_mean": -0.0786220645532012, | |
| "reward_after_std": 0.4567228890955448, | |
| "reward_before_mean": 0.19149728119373322, | |
| "reward_before_std": 0.4425015412271023, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.27011934854090214, | |
| "reward_change_min": -0.4120590351521969, | |
| "reward_change_std": 0.16643204633146524, | |
| "reward_std": 0.45672290213406086, | |
| "rewards/accuracy_reward": 0.20833334140479565, | |
| "rewards/cosine_scaled_reward": -0.016836050897836685, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2204.1458587646484, | |
| "epoch": 0.05028571428571429, | |
| "grad_norm": 0.02781352587044239, | |
| "kl": 0.00011363625526428223, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.64227184053598e-07, | |
| "loss": 0.0196, | |
| "reward": 0.23546710796654224, | |
| "reward_after_mean": 0.23546710796654224, | |
| "reward_after_std": 0.4349679071456194, | |
| "reward_before_mean": 0.6162683628499508, | |
| "reward_before_std": 0.3583456464111805, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3808012641966343, | |
| "reward_change_min": -0.5730564780533314, | |
| "reward_change_std": 0.21749800257384777, | |
| "reward_std": 0.4349679220467806, | |
| "rewards/accuracy_reward": 0.3958333395421505, | |
| "rewards/cosine_scaled_reward": 0.22043502517044544, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3081.6459045410156, | |
| "epoch": 0.05142857142857143, | |
| "grad_norm": 0.01855648308992386, | |
| "kl": 0.00013652443885803223, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.610954559391704e-07, | |
| "loss": -0.0101, | |
| "reward": 0.14500193297863007, | |
| "reward_after_mean": 0.14500193297863007, | |
| "reward_after_std": 0.5667851958423853, | |
| "reward_before_mean": 0.47629803395830095, | |
| "reward_before_std": 0.5099421134218574, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3312961012125015, | |
| "reward_change_min": -0.503504516556859, | |
| "reward_change_std": 0.19466788694262505, | |
| "reward_std": 0.5667852181941271, | |
| "rewards/accuracy_reward": 0.3333333395421505, | |
| "rewards/cosine_scaled_reward": 0.1429646834731102, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2907.1666870117188, | |
| "epoch": 0.052571428571428575, | |
| "grad_norm": 0.026002762839198112, | |
| "kl": 0.00016480684280395508, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.578385041664925e-07, | |
| "loss": -0.0122, | |
| "reward": -0.2713017947971821, | |
| "reward_after_mean": -0.2713017947971821, | |
| "reward_after_std": 0.41631367057561874, | |
| "reward_before_mean": -0.06967430002987385, | |
| "reward_before_std": 0.388171230442822, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.20162750035524368, | |
| "reward_change_min": -0.33353596180677414, | |
| "reward_change_std": 0.12530343793332577, | |
| "reward_std": 0.41631368920207024, | |
| "rewards/accuracy_reward": 0.08333333395421505, | |
| "rewards/cosine_scaled_reward": -0.1530076339840889, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2134.458354949951, | |
| "epoch": 0.053714285714285714, | |
| "grad_norm": 0.03345310315489769, | |
| "kl": 8.27684998512268e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.54457320834625e-07, | |
| "loss": -0.011, | |
| "reward": 0.23032055981457233, | |
| "reward_after_mean": 0.23032055981457233, | |
| "reward_after_std": 0.5171677935868502, | |
| "reward_before_mean": 0.5986741930246353, | |
| "reward_before_std": 0.4480929058045149, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3683536574244499, | |
| "reward_change_min": -0.5596556253731251, | |
| "reward_change_std": 0.21970490273088217, | |
| "reward_std": 0.5171678010374308, | |
| "rewards/accuracy_reward": 0.41666666977107525, | |
| "rewards/cosine_scaled_reward": 0.18200752511620522, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2394.6875534057617, | |
| "epoch": 0.054857142857142854, | |
| "grad_norm": 0.02606636844575405, | |
| "kl": 0.00011596083641052246, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.509529358847654e-07, | |
| "loss": 0.0201, | |
| "reward": 0.12413597200065851, | |
| "reward_after_mean": 0.12413597200065851, | |
| "reward_after_std": 0.45973930321633816, | |
| "reward_before_mean": 0.46354489028453827, | |
| "reward_before_std": 0.4050141889601946, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3394088950008154, | |
| "reward_change_min": -0.491641778498888, | |
| "reward_change_std": 0.1968485563993454, | |
| "reward_std": 0.4597393050789833, | |
| "rewards/accuracy_reward": 0.35416667722165585, | |
| "rewards/cosine_scaled_reward": 0.10937818745151162, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1708.5833435058594, | |
| "epoch": 0.056, | |
| "grad_norm": 0.03054640255868435, | |
| "kl": 0.00010991096496582031, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.473264167865171e-07, | |
| "loss": 0.0575, | |
| "reward": 0.29493133816868067, | |
| "reward_after_mean": 0.29493133816868067, | |
| "reward_after_std": 0.6500303111970425, | |
| "reward_before_mean": 0.6766238370910287, | |
| "reward_before_std": 0.6318102139048278, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3816925100982189, | |
| "reward_change_min": -0.6162350811064243, | |
| "reward_change_std": 0.250560705550015, | |
| "reward_std": 0.6500303186476231, | |
| "rewards/accuracy_reward": 0.45833334140479565, | |
| "rewards/cosine_scaled_reward": 0.2182904863730073, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2679.7083740234375, | |
| "epoch": 0.05714285714285714, | |
| "grad_norm": 0.02415233850479126, | |
| "kl": 0.00010981038212776184, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.43578868212728e-07, | |
| "loss": -0.0278, | |
| "reward": 0.14336813369300216, | |
| "reward_after_mean": 0.14336813369300216, | |
| "reward_after_std": 0.5428028926253319, | |
| "reward_before_mean": 0.47334863245487213, | |
| "reward_before_std": 0.4351601582020521, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.32998047955334187, | |
| "reward_change_min": -0.4792014453560114, | |
| "reward_change_std": 0.18526286352425814, | |
| "reward_std": 0.5428028963506222, | |
| "rewards/accuracy_reward": 0.3541666679084301, | |
| "rewards/cosine_scaled_reward": 0.11918195243924856, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2168.1875343322754, | |
| "epoch": 0.05828571428571429, | |
| "grad_norm": 0.03308018669486046, | |
| "kl": 0.0001481473445892334, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.397114317029974e-07, | |
| "loss": -0.012, | |
| "reward": -0.22561240196228027, | |
| "reward_after_mean": -0.22561240196228027, | |
| "reward_after_std": 0.4365761708468199, | |
| "reward_before_mean": -0.0061477068811655045, | |
| "reward_before_std": 0.4297064580023289, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.21946469321846962, | |
| "reward_change_min": -0.3908286802470684, | |
| "reward_change_std": 0.15255583450198174, | |
| "reward_std": 0.4365761801600456, | |
| "rewards/accuracy_reward": 0.14583333395421505, | |
| "rewards/cosine_scaled_reward": -0.15198103338479996, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2549.7083892822266, | |
| "epoch": 0.05942857142857143, | |
| "grad_norm": 0.030142705887556076, | |
| "kl": 0.00010135024785995483, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.357252853159505e-07, | |
| "loss": 0.0317, | |
| "reward": 0.26532851438969374, | |
| "reward_after_mean": 0.26532851438969374, | |
| "reward_after_std": 0.4831724725663662, | |
| "reward_before_mean": 0.6504863314330578, | |
| "reward_before_std": 0.38277638517320156, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.38515781983733177, | |
| "reward_change_min": -0.601357439532876, | |
| "reward_change_std": 0.22396727558225393, | |
| "reward_std": 0.48317247815430164, | |
| "rewards/accuracy_reward": 0.4791666679084301, | |
| "rewards/cosine_scaled_reward": 0.17131963837891817, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2409.6667251586914, | |
| "epoch": 0.060571428571428575, | |
| "grad_norm": 0.022006014361977577, | |
| "kl": 0.00011900067329406738, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.316216432703916e-07, | |
| "loss": -0.0174, | |
| "reward": 0.27613873686641455, | |
| "reward_after_mean": 0.27613873686641455, | |
| "reward_after_std": 0.6787273976951838, | |
| "reward_before_mean": 0.6442126063629985, | |
| "reward_before_std": 0.6745601836591959, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3680738639086485, | |
| "reward_change_min": -0.5628750901669264, | |
| "reward_change_std": 0.23260985035449266, | |
| "reward_std": 0.6787274144589901, | |
| "rewards/accuracy_reward": 0.4166666828095913, | |
| "rewards/cosine_scaled_reward": 0.22754593688296154, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1849.8542175292969, | |
| "epoch": 0.061714285714285715, | |
| "grad_norm": 0.03163987770676613, | |
| "kl": 7.21663236618042e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.274017555754407e-07, | |
| "loss": -0.0455, | |
| "reward": 0.5419067908078432, | |
| "reward_after_mean": 0.5419067908078432, | |
| "reward_after_std": 0.621409310027957, | |
| "reward_before_mean": 1.0097733028233051, | |
| "reward_before_std": 0.5289697218686342, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.4678665027022362, | |
| "reward_change_min": -0.687706695869565, | |
| "reward_change_std": 0.2778399270027876, | |
| "reward_std": 0.6214093323796988, | |
| "rewards/accuracy_reward": 0.6250000055879354, | |
| "rewards/cosine_scaled_reward": 0.38477328792214394, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2662.562545776367, | |
| "epoch": 0.06285714285714286, | |
| "grad_norm": 0.021438749507069588, | |
| "kl": 0.00012673437595367432, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.230669076497687e-07, | |
| "loss": 0.0578, | |
| "reward": 0.24588292092084885, | |
| "reward_after_mean": 0.24588292092084885, | |
| "reward_after_std": 0.45386996306478977, | |
| "reward_before_mean": 0.6329527571797371, | |
| "reward_before_std": 0.39988668635487556, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.38706981018185616, | |
| "reward_change_min": -0.575838714838028, | |
| "reward_change_std": 0.2312335381284356, | |
| "reward_std": 0.45386997424066067, | |
| "rewards/accuracy_reward": 0.4166666716337204, | |
| "rewards/cosine_scaled_reward": 0.2162860780954361, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2517.291748046875, | |
| "epoch": 0.064, | |
| "grad_norm": 0.024150483310222626, | |
| "kl": 0.00012356042861938477, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.186184199300463e-07, | |
| "loss": 0.0008, | |
| "reward": -0.0010210014879703522, | |
| "reward_after_mean": -0.0010210014879703522, | |
| "reward_after_std": 0.5338380401954055, | |
| "reward_before_mean": 0.2886288957670331, | |
| "reward_before_std": 0.5403741393238306, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.28964989073574543, | |
| "reward_change_min": -0.47638524509966373, | |
| "reward_change_std": 0.19530559238046408, | |
| "reward_std": 0.5338380504399538, | |
| "rewards/accuracy_reward": 0.27083334140479565, | |
| "rewards/cosine_scaled_reward": 0.01779552362859249, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2843.8958892822266, | |
| "epoch": 0.06514285714285714, | |
| "grad_norm": 0.018026748672127724, | |
| "kl": 0.00010999536607414484, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.140576474687263e-07, | |
| "loss": 0.0482, | |
| "reward": 0.07039481587707996, | |
| "reward_after_mean": 0.07039481587707996, | |
| "reward_after_std": 0.6540415622293949, | |
| "reward_before_mean": 0.3758677262812853, | |
| "reward_before_std": 0.7031431794166565, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.30547289550304413, | |
| "reward_change_min": -0.6006171070039272, | |
| "reward_change_std": 0.23927316907793283, | |
| "reward_std": 0.6540416032075882, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/cosine_scaled_reward": 0.042534375097602606, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1548.770866394043, | |
| "epoch": 0.06628571428571428, | |
| "grad_norm": 0.03142126649618149, | |
| "kl": 7.739663124084473e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.093859795212817e-07, | |
| "loss": 0.0822, | |
| "reward": 0.2759701292961836, | |
| "reward_after_mean": 0.2759701292961836, | |
| "reward_after_std": 0.4890221580862999, | |
| "reward_before_mean": 0.6628518868237734, | |
| "reward_before_std": 0.4022231069393456, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.386881772428751, | |
| "reward_change_min": -0.535983219742775, | |
| "reward_change_std": 0.21398326009511948, | |
| "reward_std": 0.48902217485010624, | |
| "rewards/accuracy_reward": 0.47916667722165585, | |
| "rewards/cosine_scaled_reward": 0.18368521006777883, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2577.645835876465, | |
| "epoch": 0.06742857142857143, | |
| "grad_norm": 0.024798288941383362, | |
| "kl": 9.592529386281967e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.046048391230247e-07, | |
| "loss": -0.0158, | |
| "reward": 0.014745804481208324, | |
| "reward_after_mean": 0.014745804481208324, | |
| "reward_after_std": 0.6127710696309805, | |
| "reward_before_mean": 0.29811959713697433, | |
| "reward_before_std": 0.6092889029532671, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2833738047629595, | |
| "reward_change_min": -0.4874247722327709, | |
| "reward_change_std": 0.18988565262407064, | |
| "reward_std": 0.6127710789442062, | |
| "rewards/accuracy_reward": 0.2708333432674408, | |
| "rewards/cosine_scaled_reward": 0.027286252938210964, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2456.7292251586914, | |
| "epoch": 0.06857142857142857, | |
| "grad_norm": 0.021669333800673485, | |
| "kl": 0.00011354684829711914, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.997156826556369e-07, | |
| "loss": 0.0436, | |
| "reward": 0.044893345795571804, | |
| "reward_after_mean": 0.044893345795571804, | |
| "reward_after_std": 0.5106327962130308, | |
| "reward_before_mean": 0.35379540640860796, | |
| "reward_before_std": 0.48913951963186264, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.30890206806361675, | |
| "reward_change_min": -0.5454690717160702, | |
| "reward_change_std": 0.20998750906437635, | |
| "reward_std": 0.5106328222900629, | |
| "rewards/accuracy_reward": 0.2916666716337204, | |
| "rewards/cosine_scaled_reward": 0.062128732446581125, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2429.104217529297, | |
| "epoch": 0.06971428571428571, | |
| "grad_norm": 0.023875517770648003, | |
| "kl": 8.734315633773804e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.9471999940354e-07, | |
| "loss": 0.0734, | |
| "reward": -0.02687007375061512, | |
| "reward_after_mean": -0.02687007375061512, | |
| "reward_after_std": 0.357270210981369, | |
| "reward_before_mean": 0.2700694063678384, | |
| "reward_before_std": 0.3037982089444995, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.29693946428596973, | |
| "reward_change_min": -0.44768262282013893, | |
| "reward_change_std": 0.16799015924334526, | |
| "reward_std": 0.3572702258825302, | |
| "rewards/accuracy_reward": 0.2708333395421505, | |
| "rewards/cosine_scaled_reward": -0.000763963907957077, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2164.687515258789, | |
| "epoch": 0.07085714285714285, | |
| "grad_norm": 0.027663985267281532, | |
| "kl": 8.076801896095276e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.896193111002475e-07, | |
| "loss": -0.0317, | |
| "reward": 0.07955900579690933, | |
| "reward_after_mean": 0.07955900579690933, | |
| "reward_after_std": 0.44076032005250454, | |
| "reward_before_mean": 0.40465743746608496, | |
| "reward_before_std": 0.37633848655968904, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3250984400510788, | |
| "reward_change_min": -0.47303689643740654, | |
| "reward_change_std": 0.18647983483970165, | |
| "reward_std": 0.4407603293657303, | |
| "rewards/accuracy_reward": 0.33333334140479565, | |
| "rewards/cosine_scaled_reward": 0.07132405880838633, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1490.9375305175781, | |
| "epoch": 0.072, | |
| "grad_norm": 0.035348497331142426, | |
| "kl": 0.0001027137041091919, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.844151714648274e-07, | |
| "loss": -0.0576, | |
| "reward": 0.32882157526910305, | |
| "reward_after_mean": 0.32882157526910305, | |
| "reward_after_std": 0.49383416399359703, | |
| "reward_before_mean": 0.73639902099967, | |
| "reward_before_std": 0.4061170890927315, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.40757744386792183, | |
| "reward_change_min": -0.6154311131685972, | |
| "reward_change_std": 0.23690672032535076, | |
| "reward_std": 0.49383416771888733, | |
| "rewards/accuracy_reward": 0.5000000037252903, | |
| "rewards/cosine_scaled_reward": 0.23639898875262588, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2587.6666946411133, | |
| "epoch": 0.07314285714285715, | |
| "grad_norm": 0.02268359065055847, | |
| "kl": 0.00013853982090950012, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.791091657286267e-07, | |
| "loss": -0.0032, | |
| "reward": 0.10220323409885168, | |
| "reward_after_mean": 0.10220323409885168, | |
| "reward_after_std": 0.5815557111054659, | |
| "reward_before_mean": 0.4171946104615927, | |
| "reward_before_std": 0.5369735099375248, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3149913866072893, | |
| "reward_change_min": -0.5246579889208078, | |
| "reward_change_std": 0.2050698734819889, | |
| "reward_std": 0.5815557222813368, | |
| "rewards/accuracy_reward": 0.33333333767950535, | |
| "rewards/cosine_scaled_reward": 0.08386127499397844, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2444.4791984558105, | |
| "epoch": 0.07428571428571429, | |
| "grad_norm": 0.026023779064416885, | |
| "kl": 0.00010403990745544434, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.737029101523929e-07, | |
| "loss": -0.0247, | |
| "reward": 0.05848180502653122, | |
| "reward_after_mean": 0.05848180502653122, | |
| "reward_after_std": 0.4780960585922003, | |
| "reward_before_mean": 0.37100529856979847, | |
| "reward_before_std": 0.4231007066555321, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.31252351962029934, | |
| "reward_change_min": -0.4703453090041876, | |
| "reward_change_std": 0.1822062935680151, | |
| "reward_std": 0.4780960753560066, | |
| "rewards/accuracy_reward": 0.33333334140479565, | |
| "rewards/cosine_scaled_reward": 0.03767195844557136, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2057.604175567627, | |
| "epoch": 0.07542857142857143, | |
| "grad_norm": 0.030141225084662437, | |
| "kl": 9.322166442871094e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.681980515339463e-07, | |
| "loss": -0.0104, | |
| "reward": -0.023890173994004726, | |
| "reward_after_mean": -0.023890173994004726, | |
| "reward_after_std": 0.35533210076391697, | |
| "reward_before_mean": 0.2734305802732706, | |
| "reward_before_std": 0.26786400750279427, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2973207589238882, | |
| "reward_change_min": -0.45873321034014225, | |
| "reward_change_std": 0.1713191168382764, | |
| "reward_std": 0.3553321100771427, | |
| "rewards/accuracy_reward": 0.2916666679084301, | |
| "rewards/cosine_scaled_reward": -0.01823609508574009, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3077.3958892822266, | |
| "epoch": 0.07657142857142857, | |
| "grad_norm": 0.020309919491410255, | |
| "kl": 0.00010600686073303223, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.625962667065487e-07, | |
| "loss": -0.0188, | |
| "reward": -0.2882272908464074, | |
| "reward_after_mean": -0.2882272908464074, | |
| "reward_after_std": 0.5068832859396935, | |
| "reward_before_mean": -0.11004448961466551, | |
| "reward_before_std": 0.4490641765296459, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.17818280309438705, | |
| "reward_change_min": -0.2667464707046747, | |
| "reward_change_std": 0.09650260768830776, | |
| "reward_std": 0.5068832859396935, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/cosine_scaled_reward": -0.17254449147731066, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1452.2291793823242, | |
| "epoch": 0.07771428571428571, | |
| "grad_norm": 0.03242521733045578, | |
| "kl": 8.340179920196533e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.568992620281243e-07, | |
| "loss": 0.0175, | |
| "reward": 0.07238315790891647, | |
| "reward_after_mean": 0.07238315790891647, | |
| "reward_after_std": 0.4353354647755623, | |
| "reward_before_mean": 0.39404843375086784, | |
| "reward_before_std": 0.3516052491031587, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3216652628034353, | |
| "reward_change_min": -0.5194528475403786, | |
| "reward_change_std": 0.19221886433660984, | |
| "reward_std": 0.43533547781407833, | |
| "rewards/accuracy_reward": 0.3333333358168602, | |
| "rewards/cosine_scaled_reward": 0.06071509560570121, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1787.2916946411133, | |
| "epoch": 0.07885714285714286, | |
| "grad_norm": 0.031117206439375877, | |
| "kl": 9.419023990631104e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.511087728614862e-07, | |
| "loss": -0.0016, | |
| "reward": -0.026340688578784466, | |
| "reward_after_mean": -0.026340688578784466, | |
| "reward_after_std": 0.5681092850863934, | |
| "reward_before_mean": 0.24961409904062748, | |
| "reward_before_std": 0.5811920054256916, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2759547829627991, | |
| "reward_change_min": -0.48821524903178215, | |
| "reward_change_std": 0.1979276780039072, | |
| "reward_std": 0.5681093074381351, | |
| "rewards/accuracy_reward": 0.22916666977107525, | |
| "rewards/cosine_scaled_reward": 0.020447423681616783, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2451.0625534057617, | |
| "epoch": 0.08, | |
| "grad_norm": 0.022252434864640236, | |
| "kl": 9.554624557495117e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.452265630457282e-07, | |
| "loss": -0.0644, | |
| "reward": 0.03181967884302139, | |
| "reward_after_mean": 0.03181967884302139, | |
| "reward_after_std": 0.41130639240145683, | |
| "reward_before_mean": 0.34573002345860004, | |
| "reward_before_std": 0.36813389230519533, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.31391034089028835, | |
| "reward_change_min": -0.4767356403172016, | |
| "reward_change_std": 0.19002152141183615, | |
| "reward_std": 0.4113064054399729, | |
| "rewards/accuracy_reward": 0.2916666716337204, | |
| "rewards/cosine_scaled_reward": 0.05406337045133114, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2502.958366394043, | |
| "epoch": 0.08114285714285714, | |
| "grad_norm": 0.0264949519187212, | |
| "kl": 0.00011813640594482422, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.392544243589427e-07, | |
| "loss": -0.0179, | |
| "reward": 0.06412788107991219, | |
| "reward_after_mean": 0.06412788107991219, | |
| "reward_after_std": 0.41010222770273685, | |
| "reward_before_mean": 0.39106011018157005, | |
| "reward_before_std": 0.3685198612511158, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.32693223282694817, | |
| "reward_change_min": -0.4851485788822174, | |
| "reward_change_std": 0.19277132395654917, | |
| "reward_std": 0.4101022370159626, | |
| "rewards/accuracy_reward": 0.3125000074505806, | |
| "rewards/cosine_scaled_reward": 0.07856010273098946, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2082.812545776367, | |
| "epoch": 0.08228571428571428, | |
| "grad_norm": 0.027392003685235977, | |
| "kl": 0.00011332333087921143, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.331941759724268e-07, | |
| "loss": 0.0923, | |
| "reward": -0.1911002192646265, | |
| "reward_after_mean": -0.1911002192646265, | |
| "reward_after_std": 0.48363407514989376, | |
| "reward_before_mean": 0.027870051562786102, | |
| "reward_before_std": 0.4447115077637136, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2189702782779932, | |
| "reward_change_min": -0.3989233523607254, | |
| "reward_change_std": 0.14245110657066107, | |
| "reward_std": 0.48363409377634525, | |
| "rewards/accuracy_reward": 0.1250000037252903, | |
| "rewards/cosine_scaled_reward": -0.09712994811707176, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3078.3334045410156, | |
| "epoch": 0.08342857142857144, | |
| "grad_norm": 0.019918017089366913, | |
| "kl": 0.00015282630920410156, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.270476638965461e-07, | |
| "loss": 0.0662, | |
| "reward": 0.020195575430989265, | |
| "reward_after_mean": 0.020195575430989265, | |
| "reward_after_std": 0.5312496908009052, | |
| "reward_before_mean": 0.31786563992500305, | |
| "reward_before_std": 0.5375419212505221, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.29767004027962685, | |
| "reward_change_min": -0.47399161756038666, | |
| "reward_change_std": 0.19663708750158548, | |
| "reward_std": 0.5312496926635504, | |
| "rewards/accuracy_reward": 0.29166667722165585, | |
| "rewards/cosine_scaled_reward": 0.026198940351605415, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2040.7917251586914, | |
| "epoch": 0.08457142857142858, | |
| "grad_norm": 0.025541391223669052, | |
| "kl": 9.54754650592804e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.208167604184217e-07, | |
| "loss": -0.0443, | |
| "reward": 0.14915499277412891, | |
| "reward_after_mean": 0.14915499277412891, | |
| "reward_after_std": 0.6291306726634502, | |
| "reward_before_mean": 0.482904102653265, | |
| "reward_before_std": 0.6536181448027492, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3337490987032652, | |
| "reward_change_min": -0.6043441817164421, | |
| "reward_change_std": 0.2390197478234768, | |
| "reward_std": 0.6291306801140308, | |
| "rewards/accuracy_reward": 0.3750000149011612, | |
| "rewards/cosine_scaled_reward": 0.10790410172194242, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2779.729202270508, | |
| "epoch": 0.08571428571428572, | |
| "grad_norm": 0.020267607644200325, | |
| "kl": 0.00012791156768798828, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.145033635316128e-07, | |
| "loss": -0.0545, | |
| "reward": 0.0270681269466877, | |
| "reward_after_mean": 0.0270681269466877, | |
| "reward_after_std": 0.4193691723048687, | |
| "reward_before_mean": 0.3381798770278692, | |
| "reward_before_std": 0.3713110312819481, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.31111176311969757, | |
| "reward_change_min": -0.47804239578545094, | |
| "reward_change_std": 0.1906196428462863, | |
| "reward_std": 0.41936918161809444, | |
| "rewards/accuracy_reward": 0.2916666716337204, | |
| "rewards/cosine_scaled_reward": 0.04651320539414883, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2396.770881652832, | |
| "epoch": 0.08685714285714285, | |
| "grad_norm": 0.028096716850996017, | |
| "kl": 0.00011730939149856567, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.081093963579707e-07, | |
| "loss": 0.1023, | |
| "reward": -0.1743035688996315, | |
| "reward_after_mean": -0.1743035688996315, | |
| "reward_after_std": 0.376751147210598, | |
| "reward_before_mean": 0.07117291446775198, | |
| "reward_before_std": 0.36577551485970616, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2454764936119318, | |
| "reward_change_min": -0.4111335948109627, | |
| "reward_change_std": 0.15914648212492466, | |
| "reward_std": 0.3767511546611786, | |
| "rewards/accuracy_reward": 0.1250000037252903, | |
| "rewards/cosine_scaled_reward": -0.053827076219022274, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2609.666702270508, | |
| "epoch": 0.088, | |
| "grad_norm": 0.024190831929445267, | |
| "kl": 0.00011560320854187012, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.01636806561836e-07, | |
| "loss": -0.0252, | |
| "reward": -0.12485083658248186, | |
| "reward_after_mean": -0.12485083658248186, | |
| "reward_after_std": 0.5494148954749107, | |
| "reward_before_mean": 0.1104511353187263, | |
| "reward_before_std": 0.5148029942065477, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.23530197329819202, | |
| "reward_change_min": -0.3927488550543785, | |
| "reward_change_std": 0.14942125510424376, | |
| "reward_std": 0.5494149122387171, | |
| "rewards/accuracy_reward": 0.1875000037252903, | |
| "rewards/cosine_scaled_reward": -0.07704886957071722, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2710.0833587646484, | |
| "epoch": 0.08914285714285715, | |
| "grad_norm": 0.02033821865916252, | |
| "kl": 0.00011965632438659668, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.950875657567621e-07, | |
| "loss": 0.0067, | |
| "reward": 0.16592460870742798, | |
| "reward_after_mean": 0.16592460870742798, | |
| "reward_after_std": 0.628672743216157, | |
| "reward_before_mean": 0.5066994614899158, | |
| "reward_before_std": 0.6296672336757183, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3407748378813267, | |
| "reward_change_min": -0.6165256667882204, | |
| "reward_change_std": 0.2455195877701044, | |
| "reward_std": 0.6286727450788021, | |
| "rewards/accuracy_reward": 0.3958333395421505, | |
| "rewards/cosine_scaled_reward": 0.11086611449718475, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1924.916706085205, | |
| "epoch": 0.09028571428571429, | |
| "grad_norm": 0.028922580182552338, | |
| "kl": 9.059533476829529e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.884636689049422e-07, | |
| "loss": -0.0012, | |
| "reward": 0.06329749338328838, | |
| "reward_after_mean": 0.06329749338328838, | |
| "reward_after_std": 0.5062609296292067, | |
| "reward_before_mean": 0.3792388401925564, | |
| "reward_before_std": 0.49266021978110075, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.31594133377075195, | |
| "reward_change_min": -0.534465042874217, | |
| "reward_change_std": 0.2129287514835596, | |
| "reward_std": 0.5062609408050776, | |
| "rewards/accuracy_reward": 0.3333333395421505, | |
| "rewards/cosine_scaled_reward": 0.04590547701809555, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2843.2708587646484, | |
| "epoch": 0.09142857142857143, | |
| "grad_norm": 0.021287092939019203, | |
| "kl": 0.0001614391803741455, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.817671337095244e-07, | |
| "loss": 0.0507, | |
| "reward": -0.00979284942150116, | |
| "reward_after_mean": -0.00979284942150116, | |
| "reward_after_std": 0.4034804105758667, | |
| "reward_before_mean": 0.2902548350393772, | |
| "reward_before_std": 0.35455449763685465, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3000476788729429, | |
| "reward_change_min": -0.4641227424144745, | |
| "reward_change_std": 0.18029196839779615, | |
| "reward_std": 0.40348043479025364, | |
| "rewards/accuracy_reward": 0.2916666716337204, | |
| "rewards/cosine_scaled_reward": -0.0014118626713752747, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2935.375030517578, | |
| "epoch": 0.09257142857142857, | |
| "grad_norm": 0.03196020796895027, | |
| "kl": 0.00017774105072021484, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.0139, | |
| "reward": -0.19430748652666807, | |
| "reward_after_mean": -0.19430748652666807, | |
| "reward_after_std": 0.5501824514940381, | |
| "reward_before_mean": 0.013918843120336533, | |
| "reward_before_std": 0.5049468795768917, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.20822633057832718, | |
| "reward_change_min": -0.325034998357296, | |
| "reward_change_std": 0.1252102516591549, | |
| "reward_std": 0.5501824719831347, | |
| "rewards/accuracy_reward": 0.14583333767950535, | |
| "rewards/cosine_scaled_reward": -0.1319145057350397, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2397.5833625793457, | |
| "epoch": 0.09371428571428571, | |
| "grad_norm": 0.028649814426898956, | |
| "kl": 9.499490261077881e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.681643291108517e-07, | |
| "loss": 0.0607, | |
| "reward": 0.05334258824586868, | |
| "reward_after_mean": 0.05334258824586868, | |
| "reward_after_std": 0.5218823049217463, | |
| "reward_before_mean": 0.35954809142276645, | |
| "reward_before_std": 0.4847041219472885, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.30620551481842995, | |
| "reward_change_min": -0.4997597597539425, | |
| "reward_change_std": 0.19008434005081654, | |
| "reward_std": 0.5218823160976171, | |
| "rewards/accuracy_reward": 0.27083333767950535, | |
| "rewards/cosine_scaled_reward": 0.08871474675834179, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2344.2084159851074, | |
| "epoch": 0.09485714285714286, | |
| "grad_norm": 0.03297533839941025, | |
| "kl": 0.0001214146614074707, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.612622032536507e-07, | |
| "loss": 0.0267, | |
| "reward": 0.014584167511202395, | |
| "reward_after_mean": 0.014584167511202395, | |
| "reward_after_std": 0.5300229340791702, | |
| "reward_before_mean": 0.30404046457260847, | |
| "reward_before_std": 0.4831734402105212, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.289456307888031, | |
| "reward_change_min": -0.4781253971159458, | |
| "reward_change_std": 0.1838990869000554, | |
| "reward_std": 0.5300229452550411, | |
| "rewards/accuracy_reward": 0.25000000186264515, | |
| "rewards/cosine_scaled_reward": 0.054040471790358424, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2405.4792098999023, | |
| "epoch": 0.096, | |
| "grad_norm": 0.024380596354603767, | |
| "kl": 0.0001236051321029663, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.54295724882796e-07, | |
| "loss": 0.0294, | |
| "reward": 0.020913776010274887, | |
| "reward_after_mean": 0.020913776010274887, | |
| "reward_after_std": 0.5260360110551119, | |
| "reward_before_mean": 0.3158747926354408, | |
| "reward_before_std": 0.48735920153558254, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2949610147625208, | |
| "reward_change_min": -0.47205518186092377, | |
| "reward_change_std": 0.18748428858816624, | |
| "reward_std": 0.5260360259562731, | |
| "rewards/accuracy_reward": 0.31250000558793545, | |
| "rewards/cosine_scaled_reward": 0.0033747986890375614, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2649.854217529297, | |
| "epoch": 0.09714285714285714, | |
| "grad_norm": 0.0174813661724329, | |
| "kl": 9.45068895816803e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.472670160550848e-07, | |
| "loss": 0.0103, | |
| "reward": -0.03742504213005304, | |
| "reward_after_mean": -0.03742504213005304, | |
| "reward_after_std": 0.5757533330470324, | |
| "reward_before_mean": 0.2353352140635252, | |
| "reward_before_std": 0.588757986202836, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2727602645754814, | |
| "reward_change_min": -0.5207938365638256, | |
| "reward_change_std": 0.20239645708352327, | |
| "reward_std": 0.5757533498108387, | |
| "rewards/accuracy_reward": 0.2500000074505806, | |
| "rewards/cosine_scaled_reward": -0.014664788264781237, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2589.5833435058594, | |
| "epoch": 0.09828571428571428, | |
| "grad_norm": 0.030955081805586815, | |
| "kl": 0.0001302659511566162, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.401782177833147e-07, | |
| "loss": 0.0739, | |
| "reward": -0.07755789160728455, | |
| "reward_after_mean": -0.07755789160728455, | |
| "reward_after_std": 0.3382277116179466, | |
| "reward_before_mean": 0.20555407088249922, | |
| "reward_before_std": 0.29440235160291195, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2831119429320097, | |
| "reward_change_min": -0.42291648127138615, | |
| "reward_change_std": 0.16565488744527102, | |
| "reward_std": 0.3382277172058821, | |
| "rewards/accuracy_reward": 0.2083333358168602, | |
| "rewards/cosine_scaled_reward": -0.0027792779728770256, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2388.1667251586914, | |
| "epoch": 0.09942857142857142, | |
| "grad_norm": 0.022453241050243378, | |
| "kl": 0.00016289204359054565, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.330314893841101e-07, | |
| "loss": 0.0271, | |
| "reward": -0.01107841357588768, | |
| "reward_after_mean": -0.01107841357588768, | |
| "reward_after_std": 0.3539695702493191, | |
| "reward_before_mean": 0.28700726421084255, | |
| "reward_before_std": 0.25306378304958344, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2980856914073229, | |
| "reward_change_min": -0.4043935965746641, | |
| "reward_change_std": 0.15176600962877274, | |
| "reward_std": 0.35396958142518997, | |
| "rewards/accuracy_reward": 0.27083333395421505, | |
| "rewards/cosine_scaled_reward": 0.016173945739865303, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1487.0833702087402, | |
| "epoch": 0.10057142857142858, | |
| "grad_norm": 0.03448113799095154, | |
| "kl": 9.407103061676025e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.258290078201731e-07, | |
| "loss": 0.1153, | |
| "reward": 0.21288079069927335, | |
| "reward_after_mean": 0.21288079069927335, | |
| "reward_after_std": 0.6643541660159826, | |
| "reward_before_mean": 0.5578272799029946, | |
| "reward_before_std": 0.6279368726536632, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.34494646824896336, | |
| "reward_change_min": -0.5568280145525932, | |
| "reward_change_std": 0.22005709912627935, | |
| "reward_std": 0.6643541809171438, | |
| "rewards/accuracy_reward": 0.3958333432674408, | |
| "rewards/cosine_scaled_reward": 0.16199392126873136, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2516.625068664551, | |
| "epoch": 0.10171428571428572, | |
| "grad_norm": 0.023190459236502647, | |
| "kl": 0.00011932849884033203, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.185729670371604e-07, | |
| "loss": -0.057, | |
| "reward": -0.04916583467274904, | |
| "reward_after_mean": -0.04916583467274904, | |
| "reward_after_std": 0.5242477711290121, | |
| "reward_before_mean": 0.22224761126562953, | |
| "reward_before_std": 0.518500761128962, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.27141344733536243, | |
| "reward_change_min": -0.47212454676628113, | |
| "reward_change_std": 0.18147558439522982, | |
| "reward_std": 0.5242477972060442, | |
| "rewards/accuracy_reward": 0.22916667349636555, | |
| "rewards/cosine_scaled_reward": -0.006919063627719879, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2447.1042251586914, | |
| "epoch": 0.10285714285714286, | |
| "grad_norm": 0.04171357303857803, | |
| "kl": 0.0001575946807861328, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.11265577295385e-07, | |
| "loss": 0.0484, | |
| "reward": -0.30102725327014923, | |
| "reward_after_mean": -0.30102725327014923, | |
| "reward_after_std": 0.40490369498729706, | |
| "reward_before_mean": -0.10983736906200647, | |
| "reward_before_std": 0.37877833284437656, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.19118987582623959, | |
| "reward_change_min": -0.33792233280837536, | |
| "reward_change_std": 0.12286930158734322, | |
| "reward_std": 0.4049036977812648, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/cosine_scaled_reward": -0.17233736719936132, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2565.500030517578, | |
| "epoch": 0.104, | |
| "grad_norm": 0.026068881154060364, | |
| "kl": 0.00012224912643432617, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.039090644965509e-07, | |
| "loss": 0.0069, | |
| "reward": 0.014372130390256643, | |
| "reward_after_mean": 0.014372130390256643, | |
| "reward_after_std": 0.5293956436216831, | |
| "reward_before_mean": 0.3061292991042137, | |
| "reward_before_std": 0.5099629778414965, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.29175717756152153, | |
| "reward_change_min": -0.47916352190077305, | |
| "reward_change_std": 0.1855539120733738, | |
| "reward_std": 0.5293956585228443, | |
| "rewards/accuracy_reward": 0.29166666977107525, | |
| "rewards/cosine_scaled_reward": 0.014462634921073914, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2142.541702270508, | |
| "epoch": 0.10514285714285715, | |
| "grad_norm": 0.026767289265990257, | |
| "kl": 8.273124694824219e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.965056695057204e-07, | |
| "loss": 0.0693, | |
| "reward": -0.031038912944495678, | |
| "reward_after_mean": -0.031038912944495678, | |
| "reward_after_std": 0.5491299722343683, | |
| "reward_before_mean": 0.24104281282052398, | |
| "reward_before_std": 0.5185518572106957, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.27208174392580986, | |
| "reward_change_min": -0.41842199116945267, | |
| "reward_change_std": 0.16613258328288794, | |
| "reward_std": 0.5491300020366907, | |
| "rewards/accuracy_reward": 0.25000000931322575, | |
| "rewards/cosine_scaled_reward": -0.008957181125879288, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3579.1041870117188, | |
| "epoch": 0.10628571428571429, | |
| "grad_norm": 0.020955311134457588, | |
| "kl": 0.0002243518829345703, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.890576474687263e-07, | |
| "loss": 0.0019, | |
| "reward": -0.39391759666614234, | |
| "reward_after_mean": -0.39391759666614234, | |
| "reward_after_std": 0.2926015192642808, | |
| "reward_before_mean": -0.2218842003494501, | |
| "reward_before_std": 0.24477995350025594, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.1720333844423294, | |
| "reward_change_min": -0.2595607787370682, | |
| "reward_change_std": 0.09285970125347376, | |
| "reward_std": 0.29260152392089367, | |
| "rewards/accuracy_reward": 0.02083333395421505, | |
| "rewards/cosine_scaled_reward": -0.2427175386401359, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2538.5625610351562, | |
| "epoch": 0.10742857142857143, | |
| "grad_norm": 0.02233041636645794, | |
| "kl": 0.00014513731002807617, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.815672671252315e-07, | |
| "loss": -0.0132, | |
| "reward": 0.07038530055433512, | |
| "reward_after_mean": 0.07038530055433512, | |
| "reward_after_std": 0.4959055408835411, | |
| "reward_before_mean": 0.38510218542069197, | |
| "reward_before_std": 0.4519264791160822, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.31471688486635685, | |
| "reward_change_min": -0.47875550389289856, | |
| "reward_change_std": 0.18934866040945053, | |
| "reward_std": 0.49590555392205715, | |
| "rewards/accuracy_reward": 0.31250000558793545, | |
| "rewards/cosine_scaled_reward": 0.0726021807640791, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3149.312545776367, | |
| "epoch": 0.10857142857142857, | |
| "grad_norm": 0.016374317929148674, | |
| "kl": 0.00013837218284606934, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.740368101176495e-07, | |
| "loss": 0.0191, | |
| "reward": -0.2019346058368683, | |
| "reward_after_mean": -0.2019346058368683, | |
| "reward_after_std": 0.5740157756954432, | |
| "reward_before_mean": 0.00486493157222867, | |
| "reward_before_std": 0.5541965216398239, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.20679953508079052, | |
| "reward_change_min": -0.3921053633093834, | |
| "reward_change_std": 0.1454045455902815, | |
| "reward_std": 0.5740158017724752, | |
| "rewards/accuracy_reward": 0.1458333358168602, | |
| "rewards/cosine_scaled_reward": -0.14096840284764767, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2384.375015258789, | |
| "epoch": 0.10971428571428571, | |
| "grad_norm": 0.02656826190650463, | |
| "kl": 9.119324386119843e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.664685702961344e-07, | |
| "loss": 0.0676, | |
| "reward": 0.1127127856016159, | |
| "reward_after_mean": 0.1127127856016159, | |
| "reward_after_std": 0.4883039314299822, | |
| "reward_before_mean": 0.44525690749287605, | |
| "reward_before_std": 0.42334912437945604, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3325441386550665, | |
| "reward_change_min": -0.5463209841400385, | |
| "reward_change_std": 0.203442326746881, | |
| "reward_std": 0.4883039500564337, | |
| "rewards/accuracy_reward": 0.3750000037252903, | |
| "rewards/cosine_scaled_reward": 0.07025692239403725, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2828.0416946411133, | |
| "epoch": 0.11085714285714286, | |
| "grad_norm": 0.02195425145328045, | |
| "kl": 0.00012743473052978516, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.588648530198504e-07, | |
| "loss": -0.0071, | |
| "reward": -0.03396608494222164, | |
| "reward_after_mean": -0.03396608494222164, | |
| "reward_after_std": 0.42939132265746593, | |
| "reward_before_mean": 0.2542315907776356, | |
| "reward_before_std": 0.3915413152426481, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2881976831704378, | |
| "reward_change_min": -0.4680513422936201, | |
| "reward_change_std": 0.17823405750095844, | |
| "reward_std": 0.42939133010804653, | |
| "rewards/accuracy_reward": 0.2500000037252903, | |
| "rewards/cosine_scaled_reward": 0.004231559112668037, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2182.6458740234375, | |
| "epoch": 0.112, | |
| "grad_norm": 0.02803581953048706, | |
| "kl": 9.182840585708618e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.512279744547392e-07, | |
| "loss": 0.036, | |
| "reward": -0.11260436498560011, | |
| "reward_after_mean": -0.11260436498560011, | |
| "reward_after_std": 0.5100179938599467, | |
| "reward_before_mean": 0.13090651109814644, | |
| "reward_before_std": 0.45763199776411057, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.24351087771356106, | |
| "reward_change_min": -0.33865879476070404, | |
| "reward_change_std": 0.1343588917516172, | |
| "reward_std": 0.5100180115550756, | |
| "rewards/accuracy_reward": 0.16666667349636555, | |
| "rewards/cosine_scaled_reward": -0.03576015151338652, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2859.729217529297, | |
| "epoch": 0.11314285714285714, | |
| "grad_norm": 0.02229207567870617, | |
| "kl": 0.00012323260307312012, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.435602608679916e-07, | |
| "loss": 0.0162, | |
| "reward": -0.042411248199641705, | |
| "reward_after_mean": -0.042411248199641705, | |
| "reward_after_std": 0.5648014172911644, | |
| "reward_before_mean": 0.22283665975555778, | |
| "reward_before_std": 0.5460882969200611, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2652479037642479, | |
| "reward_change_min": -0.4092443734407425, | |
| "reward_change_std": 0.1645316081121564, | |
| "reward_std": 0.5648014266043901, | |
| "rewards/accuracy_reward": 0.2291666753590107, | |
| "rewards/cosine_scaled_reward": -0.0063300225883722305, | |
| "step": 99 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2322.125030517578, | |
| "epoch": 0.11428571428571428, | |
| "grad_norm": 0.023206228390336037, | |
| "kl": 0.00010912120342254639, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.358640479194451e-07, | |
| "loss": -0.045, | |
| "reward": 0.12453807704150677, | |
| "reward_after_mean": 0.12453807704150677, | |
| "reward_after_std": 0.622159369289875, | |
| "reward_before_mean": 0.4508266132324934, | |
| "reward_before_std": 0.641321973875165, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.32628853246569633, | |
| "reward_change_min": -0.5445595029741526, | |
| "reward_change_std": 0.22335629165172577, | |
| "reward_std": 0.6221593860536814, | |
| "rewards/accuracy_reward": 0.35416667722165585, | |
| "rewards/cosine_scaled_reward": 0.0966599378734827, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2310.062545776367, | |
| "epoch": 0.11542857142857142, | |
| "grad_norm": 0.02827632799744606, | |
| "kl": 0.00014510750770568848, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.281416799501187e-07, | |
| "loss": 0.0523, | |
| "reward": 0.18787965178489685, | |
| "reward_after_mean": 0.18787965178489685, | |
| "reward_after_std": 0.4799032285809517, | |
| "reward_before_mean": 0.5493117831647396, | |
| "reward_before_std": 0.425581069663167, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3614321555942297, | |
| "reward_change_min": -0.5766200236976147, | |
| "reward_change_std": 0.22458772454410791, | |
| "reward_std": 0.479903232306242, | |
| "rewards/accuracy_reward": 0.375, | |
| "rewards/cosine_scaled_reward": 0.17431178130209446, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1985.1666946411133, | |
| "epoch": 0.11657142857142858, | |
| "grad_norm": 0.032261401414871216, | |
| "kl": 0.00013130903244018555, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.203955092681039e-07, | |
| "loss": -0.001, | |
| "reward": -0.05215142108500004, | |
| "reward_after_mean": -0.05215142108500004, | |
| "reward_after_std": 0.5923234205693007, | |
| "reward_before_mean": 0.20914378141242196, | |
| "reward_before_std": 0.5869351290166378, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2612952049821615, | |
| "reward_change_min": -0.48573706299066544, | |
| "reward_change_std": 0.18874530028551817, | |
| "reward_std": 0.5923234317451715, | |
| "rewards/accuracy_reward": 0.20833333767950535, | |
| "rewards/cosine_scaled_reward": 0.0008104409789666533, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2266.312545776367, | |
| "epoch": 0.11771428571428572, | |
| "grad_norm": 0.03391628339886665, | |
| "kl": 0.00010603666305541992, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.126278954320294e-07, | |
| "loss": -0.0353, | |
| "reward": -0.2809916576370597, | |
| "reward_after_mean": -0.2809916576370597, | |
| "reward_after_std": 0.38146258890628815, | |
| "reward_before_mean": -0.07696734461933374, | |
| "reward_before_std": 0.36126195592805743, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.20402430556714535, | |
| "reward_change_min": -0.33082425221800804, | |
| "reward_change_std": 0.1302658850327134, | |
| "reward_std": 0.3814626010134816, | |
| "rewards/accuracy_reward": 0.10416666977107525, | |
| "rewards/cosine_scaled_reward": -0.18113402277231216, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2302.4375534057617, | |
| "epoch": 0.11885714285714286, | |
| "grad_norm": 0.03015494905412197, | |
| "kl": 0.00013568997383117676, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.048412045323164e-07, | |
| "loss": 0.0696, | |
| "reward": -0.09460114315152168, | |
| "reward_after_mean": -0.09460114315152168, | |
| "reward_after_std": 0.4551072083413601, | |
| "reward_before_mean": 0.1622155588120222, | |
| "reward_before_std": 0.395945000462234, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2568166982382536, | |
| "reward_change_min": -0.4168005548417568, | |
| "reward_change_std": 0.15201660431921482, | |
| "reward_std": 0.45510722137987614, | |
| "rewards/accuracy_reward": 0.2083333358168602, | |
| "rewards/cosine_scaled_reward": -0.046117788180708885, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2372.9583740234375, | |
| "epoch": 0.12, | |
| "grad_norm": 0.026441100984811783, | |
| "kl": 0.00013068318367004395, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.97037808470444e-07, | |
| "loss": -0.0192, | |
| "reward": 0.299602385610342, | |
| "reward_after_mean": 0.299602385610342, | |
| "reward_after_std": 0.6463410649448633, | |
| "reward_before_mean": 0.6807096730917692, | |
| "reward_before_std": 0.6122201485559344, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3811072837561369, | |
| "reward_change_min": -0.5962531901896, | |
| "reward_change_std": 0.240672436542809, | |
| "reward_std": 0.6463411003351212, | |
| "rewards/accuracy_reward": 0.45833334140479565, | |
| "rewards/cosine_scaled_reward": 0.22237632365431637, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1800.8333892822266, | |
| "epoch": 0.12114285714285715, | |
| "grad_norm": 0.02916141226887703, | |
| "kl": 5.840137600898743e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.892200842364462e-07, | |
| "loss": 0.0226, | |
| "reward": 0.3201424069702625, | |
| "reward_after_mean": 0.3201424069702625, | |
| "reward_after_std": 0.5018086824566126, | |
| "reward_before_mean": 0.7261572647839785, | |
| "reward_before_std": 0.4295506803318858, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.40601486526429653, | |
| "reward_change_min": -0.6012663654983044, | |
| "reward_change_std": 0.24013797752559185, | |
| "reward_std": 0.501808712258935, | |
| "rewards/accuracy_reward": 0.541666679084301, | |
| "rewards/cosine_scaled_reward": 0.18449058942496777, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2687.291702270508, | |
| "epoch": 0.12228571428571429, | |
| "grad_norm": 0.02377397008240223, | |
| "kl": 0.00017151236534118652, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.813904131848564e-07, | |
| "loss": -0.0375, | |
| "reward": -0.040606689639389515, | |
| "reward_after_mean": -0.040606689639389515, | |
| "reward_after_std": 0.43058050237596035, | |
| "reward_before_mean": 0.2451710607856512, | |
| "reward_before_std": 0.38810104969888926, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2857777550816536, | |
| "reward_change_min": -0.4491597171872854, | |
| "reward_change_std": 0.17806370370090008, | |
| "reward_std": 0.4305805191397667, | |
| "rewards/accuracy_reward": 0.2708333358168602, | |
| "rewards/cosine_scaled_reward": -0.025662289932370186, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2884.1458740234375, | |
| "epoch": 0.12342857142857143, | |
| "grad_norm": 0.02014852873980999, | |
| "kl": 0.00014007091522216797, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.735511803093248e-07, | |
| "loss": 0.0236, | |
| "reward": 0.015964743681252003, | |
| "reward_after_mean": 0.015964743681252003, | |
| "reward_after_std": 0.6370243337005377, | |
| "reward_before_mean": 0.2958701690658927, | |
| "reward_before_std": 0.6383323790505528, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.27990544214844704, | |
| "reward_change_min": -0.4842909462749958, | |
| "reward_change_std": 0.19452710915356874, | |
| "reward_std": 0.6370243430137634, | |
| "rewards/accuracy_reward": 0.2708333395421505, | |
| "rewards/cosine_scaled_reward": 0.025036831386387348, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2778.187515258789, | |
| "epoch": 0.12457142857142857, | |
| "grad_norm": 0.02239762246608734, | |
| "kl": 0.0001312941312789917, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.657047735161255e-07, | |
| "loss": -0.0326, | |
| "reward": -0.197466429322958, | |
| "reward_after_mean": -0.197466429322958, | |
| "reward_after_std": 0.3072771281003952, | |
| "reward_before_mean": 0.042145409155637026, | |
| "reward_before_std": 0.23205038718879223, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.23961183801293373, | |
| "reward_change_min": -0.3293800801038742, | |
| "reward_change_std": 0.12456908635795116, | |
| "reward_std": 0.3072771355509758, | |
| "rewards/accuracy_reward": 0.14583333395421505, | |
| "rewards/cosine_scaled_reward": -0.10368793923407793, | |
| "step": 109 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2621.166717529297, | |
| "epoch": 0.12571428571428572, | |
| "grad_norm": 0.028512069955468178, | |
| "kl": 0.00011426769196987152, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.578535828967777e-07, | |
| "loss": 0.0195, | |
| "reward": 0.20604130998253822, | |
| "reward_after_mean": 0.20604130998253822, | |
| "reward_after_std": 0.6211877912282944, | |
| "reward_before_mean": 0.5551559692248702, | |
| "reward_before_std": 0.5809559132903814, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.34911466389894485, | |
| "reward_change_min": -0.5866297446191311, | |
| "reward_change_std": 0.22965920250862837, | |
| "reward_std": 0.6211878024041653, | |
| "rewards/accuracy_reward": 0.45833333767950535, | |
| "rewards/cosine_scaled_reward": 0.09682262875139713, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2751.0625762939453, | |
| "epoch": 0.12685714285714286, | |
| "grad_norm": 0.023924345150589943, | |
| "kl": 0.00016123056411743164, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.5e-07, | |
| "loss": -0.0046, | |
| "reward": 0.05653337016701698, | |
| "reward_after_mean": 0.05653337016701698, | |
| "reward_after_std": 0.35486595053225756, | |
| "reward_before_mean": 0.38315436616539955, | |
| "reward_before_std": 0.27544057788327336, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3266210127621889, | |
| "reward_change_min": -0.47289396263659, | |
| "reward_change_std": 0.1823914684355259, | |
| "reward_std": 0.3548659607768059, | |
| "rewards/accuracy_reward": 0.3125, | |
| "rewards/cosine_scaled_reward": 0.07065436616539955, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2868.0208740234375, | |
| "epoch": 0.128, | |
| "grad_norm": 0.0201033316552639, | |
| "kl": 0.00013107061386108398, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.421464171032224e-07, | |
| "loss": 0.015, | |
| "reward": 0.12696666829288006, | |
| "reward_after_mean": 0.12696666829288006, | |
| "reward_after_std": 0.4511380046606064, | |
| "reward_before_mean": 0.4671168327331543, | |
| "reward_before_std": 0.36786600202322006, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3401501439511776, | |
| "reward_change_min": -0.5330948643386364, | |
| "reward_change_std": 0.1999045666307211, | |
| "reward_std": 0.4511380158364773, | |
| "rewards/accuracy_reward": 0.3541666679084301, | |
| "rewards/cosine_scaled_reward": 0.11295014806091785, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1964.208381652832, | |
| "epoch": 0.12914285714285714, | |
| "grad_norm": 0.034607209265232086, | |
| "kl": 0.00012370198965072632, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.342952264838747e-07, | |
| "loss": 0.0372, | |
| "reward": -0.06578832399100065, | |
| "reward_after_mean": -0.06578832399100065, | |
| "reward_after_std": 0.6056272368878126, | |
| "reward_before_mean": 0.18658637441694736, | |
| "reward_before_std": 0.5886824317276478, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2523746956139803, | |
| "reward_change_min": -0.40224030055105686, | |
| "reward_change_std": 0.1620657118037343, | |
| "reward_std": 0.605627266690135, | |
| "rewards/accuracy_reward": 0.25000000931322575, | |
| "rewards/cosine_scaled_reward": -0.06341363039973658, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1979.9792251586914, | |
| "epoch": 0.13028571428571428, | |
| "grad_norm": 0.026191117241978645, | |
| "kl": 8.503347635269165e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.264488196906752e-07, | |
| "loss": -0.0251, | |
| "reward": -0.1890785889700055, | |
| "reward_after_mean": -0.1890785889700055, | |
| "reward_after_std": 0.3168610939756036, | |
| "reward_before_mean": 0.054542893543839455, | |
| "reward_before_std": 0.2577635142952204, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.24362149834632874, | |
| "reward_change_min": -0.3862832821905613, | |
| "reward_change_std": 0.14109937846660614, | |
| "reward_std": 0.31686110980808735, | |
| "rewards/accuracy_reward": 0.1666666679084301, | |
| "rewards/cosine_scaled_reward": -0.11212376441108063, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2766.020866394043, | |
| "epoch": 0.13142857142857142, | |
| "grad_norm": 0.023059694096446037, | |
| "kl": 0.00013384222984313965, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.186095868151436e-07, | |
| "loss": 0.0056, | |
| "reward": 0.10441343765705824, | |
| "reward_after_mean": 0.10441343765705824, | |
| "reward_after_std": 0.5611015018075705, | |
| "reward_before_mean": 0.42113048676401377, | |
| "reward_before_std": 0.49046179838478565, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3167170472443104, | |
| "reward_change_min": -0.5452346540987492, | |
| "reward_change_std": 0.1955110440030694, | |
| "reward_std": 0.561101520434022, | |
| "rewards/accuracy_reward": 0.35416666977107525, | |
| "rewards/cosine_scaled_reward": 0.06696383003145456, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3097.833366394043, | |
| "epoch": 0.13257142857142856, | |
| "grad_norm": 0.02340916357934475, | |
| "kl": 0.00016766786575317383, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.107799157635538e-07, | |
| "loss": -0.0232, | |
| "reward": -0.19432928040623665, | |
| "reward_after_mean": -0.19432928040623665, | |
| "reward_after_std": 0.4012261498719454, | |
| "reward_before_mean": 0.043743424117565155, | |
| "reward_before_std": 0.400667910464108, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.23807269148528576, | |
| "reward_change_min": -0.395206730812788, | |
| "reward_change_std": 0.1616207016631961, | |
| "reward_std": 0.4012261591851711, | |
| "rewards/accuracy_reward": 0.1666666716337204, | |
| "rewards/cosine_scaled_reward": -0.12292325869202614, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2717.6875610351562, | |
| "epoch": 0.1337142857142857, | |
| "grad_norm": 0.023171402513980865, | |
| "kl": 0.00016413629055023193, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.02962191529556e-07, | |
| "loss": -0.0007, | |
| "reward": -0.11855829134583473, | |
| "reward_after_mean": -0.11855829134583473, | |
| "reward_after_std": 0.4816027395427227, | |
| "reward_before_mean": 0.1314456146210432, | |
| "reward_before_std": 0.4761247858405113, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.25000389851629734, | |
| "reward_change_min": -0.4218711256980896, | |
| "reward_change_std": 0.1656272802501917, | |
| "reward_std": 0.48160274885594845, | |
| "rewards/accuracy_reward": 0.16666666977107525, | |
| "rewards/cosine_scaled_reward": -0.03522104769945145, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2885.2083587646484, | |
| "epoch": 0.13485714285714287, | |
| "grad_norm": 0.01870822347700596, | |
| "kl": 0.00013941526412963867, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.951587954676837e-07, | |
| "loss": 0.0402, | |
| "reward": 0.3842864651232958, | |
| "reward_after_mean": 0.3842864651232958, | |
| "reward_after_std": 0.7551897596567869, | |
| "reward_before_mean": 0.7818266898393631, | |
| "reward_before_std": 0.7093747109174728, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3975402247160673, | |
| "reward_change_min": -0.7075115144252777, | |
| "reward_change_std": 0.272270480170846, | |
| "reward_std": 0.7551897652447224, | |
| "rewards/accuracy_reward": 0.5000000055879354, | |
| "rewards/cosine_scaled_reward": 0.28182668425142765, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1926.687515258789, | |
| "epoch": 0.136, | |
| "grad_norm": 0.0322633758187294, | |
| "kl": 0.00010208180174231529, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.873721045679706e-07, | |
| "loss": 0.0234, | |
| "reward": 0.1267358995974064, | |
| "reward_after_mean": 0.1267358995974064, | |
| "reward_after_std": 0.3533069547265768, | |
| "reward_before_mean": 0.47834774386137724, | |
| "reward_before_std": 0.2697556195780635, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3516118451952934, | |
| "reward_change_min": -0.5000638235360384, | |
| "reward_change_std": 0.19549622386693954, | |
| "reward_std": 0.3533069621771574, | |
| "rewards/accuracy_reward": 0.3333333358168602, | |
| "rewards/cosine_scaled_reward": 0.14501440059393644, | |
| "step": 119 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2186.1875381469727, | |
| "epoch": 0.13714285714285715, | |
| "grad_norm": 0.031494706869125366, | |
| "kl": 0.0001770704984664917, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.79604490731896e-07, | |
| "loss": 0.0894, | |
| "reward": 0.07591197546571493, | |
| "reward_after_mean": 0.07591197546571493, | |
| "reward_after_std": 0.5809869198128581, | |
| "reward_before_mean": 0.3816301135811955, | |
| "reward_before_std": 0.551488799508661, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3057181444019079, | |
| "reward_change_min": -0.47399745136499405, | |
| "reward_change_std": 0.18618585728108883, | |
| "reward_std": 0.5809869384393096, | |
| "rewards/accuracy_reward": 0.3541666753590107, | |
| "rewards/cosine_scaled_reward": 0.02746343519538641, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1793.2500228881836, | |
| "epoch": 0.1382857142857143, | |
| "grad_norm": 0.03826192766427994, | |
| "kl": 0.0001408308744430542, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.7185832004988133e-07, | |
| "loss": 0.063, | |
| "reward": -0.10150941228494048, | |
| "reward_after_mean": -0.10150941228494048, | |
| "reward_after_std": 0.3905038358643651, | |
| "reward_before_mean": 0.16313835605978966, | |
| "reward_before_std": 0.3259769971482456, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2646478023380041, | |
| "reward_change_min": -0.3776704464107752, | |
| "reward_change_std": 0.15065845055505633, | |
| "reward_std": 0.3905038433149457, | |
| "rewards/accuracy_reward": 0.22916666977107525, | |
| "rewards/cosine_scaled_reward": -0.0660282839089632, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2746.604179382324, | |
| "epoch": 0.13942857142857143, | |
| "grad_norm": 0.02808026410639286, | |
| "kl": 0.00018015503883361816, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.641359520805548e-07, | |
| "loss": 0.0635, | |
| "reward": 0.0752830570563674, | |
| "reward_after_mean": 0.0752830570563674, | |
| "reward_after_std": 0.6505839275196195, | |
| "reward_before_mean": 0.3795435354113579, | |
| "reward_before_std": 0.674185479991138, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.30426048301160336, | |
| "reward_change_min": -0.6016716994345188, | |
| "reward_change_std": 0.22931489627808332, | |
| "reward_std": 0.6505839368328452, | |
| "rewards/accuracy_reward": 0.3125000074505806, | |
| "rewards/cosine_scaled_reward": 0.06704353634268045, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2631.7916870117188, | |
| "epoch": 0.14057142857142857, | |
| "grad_norm": 0.02497822605073452, | |
| "kl": 0.00014847517013549805, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.5643973913200837e-07, | |
| "loss": -0.0774, | |
| "reward": 0.07049691770225763, | |
| "reward_after_mean": 0.07049691770225763, | |
| "reward_after_std": 0.41926424019038677, | |
| "reward_before_mean": 0.39295812509953976, | |
| "reward_before_std": 0.33388588251546025, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3224612195044756, | |
| "reward_change_min": -0.4707703944295645, | |
| "reward_change_std": 0.18158453051000834, | |
| "reward_std": 0.41926424857228994, | |
| "rewards/accuracy_reward": 0.33333333395421505, | |
| "rewards/cosine_scaled_reward": 0.05962479766458273, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2051.0000381469727, | |
| "epoch": 0.1417142857142857, | |
| "grad_norm": 0.025261210277676582, | |
| "kl": 7.904693484306335e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.4877202554526084e-07, | |
| "loss": 0.0796, | |
| "reward": 0.2634911872446537, | |
| "reward_after_mean": 0.2634911872446537, | |
| "reward_after_std": 0.5967842470854521, | |
| "reward_before_mean": 0.6438823733478785, | |
| "reward_before_std": 0.6048776777461171, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.38039117865264416, | |
| "reward_change_min": -0.635780643671751, | |
| "reward_change_std": 0.26227735076099634, | |
| "reward_std": 0.5967842638492584, | |
| "rewards/accuracy_reward": 0.4375000074505806, | |
| "rewards/cosine_scaled_reward": 0.20638234540820122, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2334.8541679382324, | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 0.023498935624957085, | |
| "kl": 0.00010383129119873047, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.4113514698014953e-07, | |
| "loss": -0.0518, | |
| "reward": 0.0770783182233572, | |
| "reward_after_mean": 0.0770783182233572, | |
| "reward_after_std": 0.5987216774374247, | |
| "reward_before_mean": 0.376096501480788, | |
| "reward_before_std": 0.5320965368300676, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2990182042121887, | |
| "reward_change_min": -0.4671790637075901, | |
| "reward_change_std": 0.17802791390568018, | |
| "reward_std": 0.5987217091023922, | |
| "rewards/accuracy_reward": 0.3125000074505806, | |
| "rewards/cosine_scaled_reward": 0.06359649077057838, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2312.9583740234375, | |
| "epoch": 0.144, | |
| "grad_norm": 0.02435940131545067, | |
| "kl": 9.564310312271118e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.3353142970386557e-07, | |
| "loss": 0.0478, | |
| "reward": 0.09839674085378647, | |
| "reward_after_mean": 0.09839674085378647, | |
| "reward_after_std": 0.4974265359342098, | |
| "reward_before_mean": 0.42580396682024, | |
| "reward_before_std": 0.46056526992470026, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.32740725204348564, | |
| "reward_change_min": -0.4952290430665016, | |
| "reward_change_std": 0.1967391036450863, | |
| "reward_std": 0.4974265471100807, | |
| "rewards/accuracy_reward": 0.33333334140479565, | |
| "rewards/cosine_scaled_reward": 0.09247064776718616, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3158.5833740234375, | |
| "epoch": 0.14514285714285713, | |
| "grad_norm": 0.021022077649831772, | |
| "kl": 0.00015288591384887695, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.2596318988235037e-07, | |
| "loss": -0.0607, | |
| "reward": -0.19837769120931625, | |
| "reward_after_mean": -0.19837769120931625, | |
| "reward_after_std": 0.4446485564112663, | |
| "reward_before_mean": 0.024635582813061774, | |
| "reward_before_std": 0.4137072516605258, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2230132780969143, | |
| "reward_change_min": -0.3493082635104656, | |
| "reward_change_std": 0.13094482477754354, | |
| "reward_std": 0.4446485601365566, | |
| "rewards/accuracy_reward": 0.1458333395421505, | |
| "rewards/cosine_scaled_reward": -0.12119775079190731, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2172.7917098999023, | |
| "epoch": 0.1462857142857143, | |
| "grad_norm": 0.02802017331123352, | |
| "kl": 0.0001382678747177124, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.1843273287476854e-07, | |
| "loss": 0.0109, | |
| "reward": 0.24382356368005276, | |
| "reward_after_mean": 0.24382356368005276, | |
| "reward_after_std": 0.5414328817278147, | |
| "reward_before_mean": 0.6193723455071449, | |
| "reward_before_std": 0.51410650042817, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3755488097667694, | |
| "reward_change_min": -0.6071460507810116, | |
| "reward_change_std": 0.23622124083340168, | |
| "reward_std": 0.5414328835904598, | |
| "rewards/accuracy_reward": 0.4375000111758709, | |
| "rewards/cosine_scaled_reward": 0.18187237158417702, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3269.0625610351562, | |
| "epoch": 0.14742857142857144, | |
| "grad_norm": 0.0199818667024374, | |
| "kl": 0.00011938810348510742, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.1094235253127374e-07, | |
| "loss": 0.027, | |
| "reward": 0.07939598336815834, | |
| "reward_after_mean": 0.07939598336815834, | |
| "reward_after_std": 0.5406165793538094, | |
| "reward_before_mean": 0.39787398651242256, | |
| "reward_before_std": 0.5439900029450655, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.318478025496006, | |
| "reward_change_min": -0.5440695825964212, | |
| "reward_change_std": 0.21484098490327597, | |
| "reward_std": 0.5406166054308414, | |
| "rewards/accuracy_reward": 0.3125000111758709, | |
| "rewards/cosine_scaled_reward": 0.08537399768829346, | |
| "step": 129 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3137.0833740234375, | |
| "epoch": 0.14857142857142858, | |
| "grad_norm": 0.01886666566133499, | |
| "kl": 0.00018727779388427734, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.034943304942796e-07, | |
| "loss": 0.0105, | |
| "reward": -0.13560364861041307, | |
| "reward_after_mean": -0.13560364861041307, | |
| "reward_after_std": 0.4868227355182171, | |
| "reward_before_mean": 0.11097788251936436, | |
| "reward_before_std": 0.49222803860902786, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.24658154882490635, | |
| "reward_change_min": -0.455986674875021, | |
| "reward_change_std": 0.17662312928587198, | |
| "reward_std": 0.486822746694088, | |
| "rewards/accuracy_reward": 0.1875000037252903, | |
| "rewards/cosine_scaled_reward": -0.07652211067033932, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2480.5208892822266, | |
| "epoch": 0.14971428571428572, | |
| "grad_norm": 0.027400832623243332, | |
| "kl": 0.00014418736100196838, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.9609093550344907e-07, | |
| "loss": 0.0254, | |
| "reward": 0.07110828906297684, | |
| "reward_after_mean": 0.07110828906297684, | |
| "reward_after_std": 0.36002582497894764, | |
| "reward_before_mean": 0.4055180950090289, | |
| "reward_before_std": 0.3088535754941404, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.33440983295440674, | |
| "reward_change_min": -0.470782570540905, | |
| "reward_change_std": 0.18981244694441557, | |
| "reward_std": 0.3600258268415928, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/cosine_scaled_reward": 0.0721847927197814, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2486.645851135254, | |
| "epoch": 0.15085714285714286, | |
| "grad_norm": 0.02796083875000477, | |
| "kl": 0.00013111159205436707, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.8873442270461485e-07, | |
| "loss": 0.0842, | |
| "reward": 0.2597197503782809, | |
| "reward_after_mean": 0.2597197503782809, | |
| "reward_after_std": 0.744433356449008, | |
| "reward_before_mean": 0.6226858850568533, | |
| "reward_before_std": 0.7914422228932381, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3629660848528147, | |
| "reward_change_min": -0.6258580330759287, | |
| "reward_change_std": 0.2674333294853568, | |
| "reward_std": 0.7444333788007498, | |
| "rewards/accuracy_reward": 0.43750001303851604, | |
| "rewards/cosine_scaled_reward": 0.18518583837430924, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2971.8958587646484, | |
| "epoch": 0.152, | |
| "grad_norm": 0.019855745136737823, | |
| "kl": 0.00016885995864868164, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.8142703296283953e-07, | |
| "loss": -0.0021, | |
| "reward": -0.15273336577229202, | |
| "reward_after_mean": -0.15273336577229202, | |
| "reward_after_std": 0.46419150568544865, | |
| "reward_before_mean": 0.08369430713355541, | |
| "reward_before_std": 0.410770776681602, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2364276945590973, | |
| "reward_change_min": -0.357030825689435, | |
| "reward_change_std": 0.13842764357104897, | |
| "reward_std": 0.4641915149986744, | |
| "rewards/accuracy_reward": 0.2500000037252903, | |
| "rewards/cosine_scaled_reward": -0.1663056961260736, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2386.479202270508, | |
| "epoch": 0.15314285714285714, | |
| "grad_norm": 0.026198577135801315, | |
| "kl": 0.00013698264956474304, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.7417099217982686e-07, | |
| "loss": -0.0157, | |
| "reward": 0.19155731541104615, | |
| "reward_after_mean": 0.19155731541104615, | |
| "reward_after_std": 0.5861021094024181, | |
| "reward_before_mean": 0.5390464821830392, | |
| "reward_before_std": 0.5199935543350875, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.34748917259275913, | |
| "reward_change_min": -0.5791817046701908, | |
| "reward_change_std": 0.2208269750699401, | |
| "reward_std": 0.586102120578289, | |
| "rewards/accuracy_reward": 0.37500000186264515, | |
| "rewards/cosine_scaled_reward": 0.16404648189200088, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1384.2083587646484, | |
| "epoch": 0.15428571428571428, | |
| "grad_norm": 0.05298980697989464, | |
| "kl": 6.917491555213928e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.6696851061588994e-07, | |
| "loss": 0.0295, | |
| "reward": 0.32029616460204124, | |
| "reward_after_mean": 0.32029616460204124, | |
| "reward_after_std": 0.469398295506835, | |
| "reward_before_mean": 0.7312853448092937, | |
| "reward_before_std": 0.38515608105808496, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.4109892025589943, | |
| "reward_change_min": -0.6098762080073357, | |
| "reward_change_std": 0.24706434831023216, | |
| "reward_std": 0.4693983215838671, | |
| "rewards/accuracy_reward": 0.5208333358168602, | |
| "rewards/cosine_scaled_reward": 0.2104520034044981, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2418.229179382324, | |
| "epoch": 0.15542857142857142, | |
| "grad_norm": 0.02546970546245575, | |
| "kl": 0.00011172890663146973, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.5982178221668533e-07, | |
| "loss": 0.0076, | |
| "reward": 0.20953483134508133, | |
| "reward_after_mean": 0.20953483134508133, | |
| "reward_after_std": 0.5650828517973423, | |
| "reward_before_mean": 0.5645723771303892, | |
| "reward_before_std": 0.49963863380253315, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3550375532358885, | |
| "reward_change_min": -0.5531989093869925, | |
| "reward_change_std": 0.21487182471901178, | |
| "reward_std": 0.5650828760117292, | |
| "rewards/accuracy_reward": 0.39583333395421505, | |
| "rewards/cosine_scaled_reward": 0.16873904690146446, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2880.583396911621, | |
| "epoch": 0.15657142857142858, | |
| "grad_norm": 0.02026941254734993, | |
| "kl": 0.00010737031698226929, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.5273298394491515e-07, | |
| "loss": -0.0051, | |
| "reward": -0.23039704840630293, | |
| "reward_after_mean": -0.23039704840630293, | |
| "reward_after_std": 0.4545791279524565, | |
| "reward_before_mean": -0.0207052119076252, | |
| "reward_before_std": 0.4199818782508373, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2096918299794197, | |
| "reward_change_min": -0.3325869217514992, | |
| "reward_change_std": 0.12889837939292192, | |
| "reward_std": 0.45457913912832737, | |
| "rewards/accuracy_reward": 0.1250000037252903, | |
| "rewards/cosine_scaled_reward": -0.1457052135374397, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2572.875045776367, | |
| "epoch": 0.15771428571428572, | |
| "grad_norm": 0.024446720257401466, | |
| "kl": 9.263120591640472e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.45704275117204e-07, | |
| "loss": -0.0266, | |
| "reward": -0.0885739466175437, | |
| "reward_after_mean": -0.0885739466175437, | |
| "reward_after_std": 0.40795043855905533, | |
| "reward_before_mean": 0.17822610400617123, | |
| "reward_before_std": 0.33472174778580666, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2668000441044569, | |
| "reward_change_min": -0.3915207665413618, | |
| "reward_change_std": 0.1494421288371086, | |
| "reward_std": 0.40795045532286167, | |
| "rewards/accuracy_reward": 0.22916666977107525, | |
| "rewards/cosine_scaled_reward": -0.05094056576490402, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2697.7916946411133, | |
| "epoch": 0.15885714285714286, | |
| "grad_norm": 0.02511826530098915, | |
| "kl": 0.00013872981071472168, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.387377967463493e-07, | |
| "loss": 0.0188, | |
| "reward": -0.028292442206293344, | |
| "reward_after_mean": -0.028292442206293344, | |
| "reward_after_std": 0.40028360672295094, | |
| "reward_before_mean": 0.25999742932617664, | |
| "reward_before_std": 0.3327331282198429, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2882898673415184, | |
| "reward_change_min": -0.41081140749156475, | |
| "reward_change_std": 0.15616974979639053, | |
| "reward_std": 0.40028361417353153, | |
| "rewards/accuracy_reward": 0.25000000558793545, | |
| "rewards/cosine_scaled_reward": 0.009997433982789516, | |
| "step": 139 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2810.041732788086, | |
| "epoch": 0.16, | |
| "grad_norm": 0.038509517908096313, | |
| "kl": 0.0001595616340637207, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.3183567088914833e-07, | |
| "loss": 0.0088, | |
| "reward": 0.14832666027359664, | |
| "reward_after_mean": 0.14832666027359664, | |
| "reward_after_std": 0.6521247308701277, | |
| "reward_before_mean": 0.4682345949113369, | |
| "reward_before_std": 0.5863152034580708, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.31990794092416763, | |
| "reward_change_min": -0.5289614573121071, | |
| "reward_change_std": 0.20588868111371994, | |
| "reward_std": 0.652124747633934, | |
| "rewards/accuracy_reward": 0.3541666716337204, | |
| "rewards/cosine_scaled_reward": 0.11406791373156011, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2737.7709045410156, | |
| "epoch": 0.16114285714285714, | |
| "grad_norm": 0.02538427524268627, | |
| "kl": 0.00011593103408813477, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.250000000000001e-07, | |
| "loss": -0.0546, | |
| "reward": -0.2177041512914002, | |
| "reward_after_mean": -0.2177041512914002, | |
| "reward_after_std": 0.594437601044774, | |
| "reward_before_mean": -0.023291918449103832, | |
| "reward_before_std": 0.568119059316814, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.19441223330795765, | |
| "reward_change_min": -0.386432521045208, | |
| "reward_change_std": 0.13637393061071634, | |
| "reward_std": 0.5944376047700644, | |
| "rewards/accuracy_reward": 0.1250000037252903, | |
| "rewards/cosine_scaled_reward": -0.1482919171685353, | |
| "step": 141 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2527.4583740234375, | |
| "epoch": 0.16228571428571428, | |
| "grad_norm": 0.021552937105298042, | |
| "kl": 0.0001442432403564453, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.182328662904756e-07, | |
| "loss": -0.0467, | |
| "reward": 0.038692621514201164, | |
| "reward_after_mean": 0.038692621514201164, | |
| "reward_after_std": 0.49257983826100826, | |
| "reward_before_mean": 0.33910070918500423, | |
| "reward_before_std": 0.43346802331507206, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3004080858081579, | |
| "reward_change_min": -0.472257686778903, | |
| "reward_change_std": 0.17685699556022882, | |
| "reward_std": 0.49257984571158886, | |
| "rewards/accuracy_reward": 0.29166667349636555, | |
| "rewards/cosine_scaled_reward": 0.047434025444090366, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2238.604202270508, | |
| "epoch": 0.16342857142857142, | |
| "grad_norm": 0.025303639471530914, | |
| "kl": 0.00010967254638671875, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.115363310950578e-07, | |
| "loss": 0.0228, | |
| "reward": -0.13753529638051987, | |
| "reward_after_mean": -0.13753529638051987, | |
| "reward_after_std": 0.3711821623146534, | |
| "reward_before_mean": 0.12087950762361288, | |
| "reward_before_std": 0.3427344807423651, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2584147993475199, | |
| "reward_change_min": -0.39502568170428276, | |
| "reward_change_std": 0.15850333217531443, | |
| "reward_std": 0.37118216790258884, | |
| "rewards/accuracy_reward": 0.1666666716337204, | |
| "rewards/cosine_scaled_reward": -0.045787165872752666, | |
| "step": 143 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2460.583381652832, | |
| "epoch": 0.16457142857142856, | |
| "grad_norm": 0.03277630731463432, | |
| "kl": 0.00010095536708831787, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.0491243424323783e-07, | |
| "loss": 0.0722, | |
| "reward": 0.4543617179733701, | |
| "reward_after_mean": 0.4543617179733701, | |
| "reward_after_std": 0.7069333475083113, | |
| "reward_before_mean": 0.8829482905566692, | |
| "reward_before_std": 0.6676272489130497, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.42858656495809555, | |
| "reward_change_min": -0.6201369129121304, | |
| "reward_change_std": 0.2603969210758805, | |
| "reward_std": 0.7069333605468273, | |
| "rewards/accuracy_reward": 0.5833333469927311, | |
| "rewards/cosine_scaled_reward": 0.2996149277314544, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1835.270866394043, | |
| "epoch": 0.1657142857142857, | |
| "grad_norm": 0.03344608470797539, | |
| "kl": 0.00010331720113754272, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.9836319343816397e-07, | |
| "loss": 0.0229, | |
| "reward": 0.09240274596959352, | |
| "reward_after_mean": 0.09240274596959352, | |
| "reward_after_std": 0.45070875994861126, | |
| "reward_before_mean": 0.41964344773441553, | |
| "reward_before_std": 0.37119755055755377, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.32724072225391865, | |
| "reward_change_min": -0.4856059141457081, | |
| "reward_change_std": 0.19001474510878325, | |
| "reward_std": 0.45070877112448215, | |
| "rewards/accuracy_reward": 0.33333333395421505, | |
| "rewards/cosine_scaled_reward": 0.08631011750549078, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2060.9583892822266, | |
| "epoch": 0.16685714285714287, | |
| "grad_norm": 0.026087850332260132, | |
| "kl": 0.00010473420843482018, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.918906036420294e-07, | |
| "loss": -0.022, | |
| "reward": -0.3456582888029516, | |
| "reward_after_mean": -0.3456582888029516, | |
| "reward_after_std": 0.3900475464761257, | |
| "reward_before_mean": -0.16955635324120522, | |
| "reward_before_std": 0.35085606575012207, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.17610193602740765, | |
| "reward_change_min": -0.3039908893406391, | |
| "reward_change_std": 0.1093815853819251, | |
| "reward_std": 0.3900475464761257, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/cosine_scaled_reward": -0.2320563482644502, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3446.062530517578, | |
| "epoch": 0.168, | |
| "grad_norm": 0.017223268747329712, | |
| "kl": 0.00015714764595031738, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.854966364683872e-07, | |
| "loss": 0.035, | |
| "reward": 0.05932202748954296, | |
| "reward_after_mean": 0.05932202748954296, | |
| "reward_after_std": 0.647826049476862, | |
| "reward_before_mean": 0.3570065386593342, | |
| "reward_before_std": 0.6669072303920984, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2976845409721136, | |
| "reward_change_min": -0.5494338441640139, | |
| "reward_change_std": 0.21914179529994726, | |
| "reward_std": 0.647826075553894, | |
| "rewards/accuracy_reward": 0.29166667349636555, | |
| "rewards/cosine_scaled_reward": 0.06533987820148468, | |
| "step": 147 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2190.6875534057617, | |
| "epoch": 0.16914285714285715, | |
| "grad_norm": 0.026238586753606796, | |
| "kl": 0.00011092424392700195, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.791832395815782e-07, | |
| "loss": 0.0352, | |
| "reward": 0.02173960581421852, | |
| "reward_after_mean": 0.02173960581421852, | |
| "reward_after_std": 0.6264125965535641, | |
| "reward_before_mean": 0.3032309217378497, | |
| "reward_before_std": 0.5903957700356841, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.28149132430553436, | |
| "reward_change_min": -0.49092668667435646, | |
| "reward_change_std": 0.19152779690921307, | |
| "reward_std": 0.6264126114547253, | |
| "rewards/accuracy_reward": 0.2916666679084301, | |
| "rewards/cosine_scaled_reward": 0.011564248241484165, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2382.7083587646484, | |
| "epoch": 0.1702857142857143, | |
| "grad_norm": 0.02286612056195736, | |
| "kl": 9.250640869140625e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.729523361034538e-07, | |
| "loss": 0.0309, | |
| "reward": 0.1198802962899208, | |
| "reward_after_mean": 0.1198802962899208, | |
| "reward_after_std": 0.4947097469121218, | |
| "reward_before_mean": 0.45648779161274433, | |
| "reward_before_std": 0.46653653495013714, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.33660750463604927, | |
| "reward_change_min": -0.5417331680655479, | |
| "reward_change_std": 0.21329155191779137, | |
| "reward_std": 0.4947097636759281, | |
| "rewards/accuracy_reward": 0.3541666753590107, | |
| "rewards/cosine_scaled_reward": 0.10232111997902393, | |
| "step": 149 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2792.333351135254, | |
| "epoch": 0.17142857142857143, | |
| "grad_norm": 0.022381342947483063, | |
| "kl": 0.00012479722499847412, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.6680582402757324e-07, | |
| "loss": 0.0524, | |
| "reward": -0.01498852763324976, | |
| "reward_after_mean": -0.01498852763324976, | |
| "reward_after_std": 0.664625771343708, | |
| "reward_before_mean": 0.25175523199141026, | |
| "reward_before_std": 0.6803292762488127, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.26674376986920834, | |
| "reward_change_min": -0.53781808167696, | |
| "reward_change_std": 0.20475875865668058, | |
| "reward_std": 0.6646257899701595, | |
| "rewards/accuracy_reward": 0.25000000558793545, | |
| "rewards/cosine_scaled_reward": 0.001755234319716692, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2384.8125610351562, | |
| "epoch": 0.17257142857142857, | |
| "grad_norm": 0.028256850317120552, | |
| "kl": 0.00014731287956237793, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.6074557564105724e-07, | |
| "loss": 0.138, | |
| "reward": 0.35779024893417954, | |
| "reward_after_mean": 0.35779024893417954, | |
| "reward_after_std": 0.7775043789297342, | |
| "reward_before_mean": 0.745909059420228, | |
| "reward_before_std": 0.7730156276375055, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3881188202649355, | |
| "reward_change_min": -0.6950398050248623, | |
| "reward_change_std": 0.271209386177361, | |
| "reward_std": 0.7775043845176697, | |
| "rewards/accuracy_reward": 0.47916667722165585, | |
| "rewards/cosine_scaled_reward": 0.26674237998668104, | |
| "step": 151 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2806.062511444092, | |
| "epoch": 0.1737142857142857, | |
| "grad_norm": 0.03883467614650726, | |
| "kl": 0.00019087642431259155, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.547734369542718e-07, | |
| "loss": 0.0386, | |
| "reward": -0.1646068338304758, | |
| "reward_after_mean": -0.1646068338304758, | |
| "reward_after_std": 0.5001123249530792, | |
| "reward_before_mean": 0.06822102330625057, | |
| "reward_before_std": 0.49387937784194946, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.23282786458730698, | |
| "reward_change_min": -0.45555905625224113, | |
| "reward_change_std": 0.16819952800869942, | |
| "reward_std": 0.5001123435795307, | |
| "rewards/accuracy_reward": 0.1666666679084301, | |
| "rewards/cosine_scaled_reward": -0.09844564087688923, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2609.625045776367, | |
| "epoch": 0.17485714285714285, | |
| "grad_norm": 0.029369013383984566, | |
| "kl": 0.00015197694301605225, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.488912271385139e-07, | |
| "loss": 0.0563, | |
| "reward": -0.2579225329682231, | |
| "reward_after_mean": -0.2579225329682231, | |
| "reward_after_std": 0.41702854074537754, | |
| "reward_before_mean": -0.0514589948579669, | |
| "reward_before_std": 0.3938685590401292, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2064635269343853, | |
| "reward_change_min": -0.3605457991361618, | |
| "reward_change_std": 0.13238902669399977, | |
| "reward_std": 0.4170285416767001, | |
| "rewards/accuracy_reward": 0.14583333767950535, | |
| "rewards/cosine_scaled_reward": -0.19729233742691576, | |
| "step": 153 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2773.625045776367, | |
| "epoch": 0.176, | |
| "grad_norm": 0.019490770995616913, | |
| "kl": 0.00013262033462524414, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.4310073797187573e-07, | |
| "loss": 0.0404, | |
| "reward": 0.2587023861706257, | |
| "reward_after_mean": 0.2587023861706257, | |
| "reward_after_std": 0.5390413794666529, | |
| "reward_before_mean": 0.6395768262445927, | |
| "reward_before_std": 0.492486541159451, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3808744475245476, | |
| "reward_change_min": -0.6062983945012093, | |
| "reward_change_std": 0.24063634779304266, | |
| "reward_std": 0.5390413850545883, | |
| "rewards/accuracy_reward": 0.4583333395421505, | |
| "rewards/cosine_scaled_reward": 0.18124347925186157, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2265.0833435058594, | |
| "epoch": 0.17714285714285713, | |
| "grad_norm": 0.02680964767932892, | |
| "kl": 0.00013046711683273315, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.374037332934512e-07, | |
| "loss": -0.0159, | |
| "reward": 0.21415067370980978, | |
| "reward_after_mean": 0.21415067370980978, | |
| "reward_after_std": 0.6358739994466305, | |
| "reward_before_mean": 0.5597648918628693, | |
| "reward_before_std": 0.5540275080129504, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.34561420790851116, | |
| "reward_change_min": -0.6032971385866404, | |
| "reward_change_std": 0.2241066563874483, | |
| "reward_std": 0.6358740348368883, | |
| "rewards/accuracy_reward": 0.4583333358168602, | |
| "rewards/cosine_scaled_reward": 0.10143155371770263, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2724.979179382324, | |
| "epoch": 0.1782857142857143, | |
| "grad_norm": 0.02436206117272377, | |
| "kl": 0.00014898180961608887, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.3180194846605364e-07, | |
| "loss": -0.0307, | |
| "reward": -0.16922233253717422, | |
| "reward_after_mean": -0.16922233253717422, | |
| "reward_after_std": 0.4979768879711628, | |
| "reward_before_mean": 0.06080557717359625, | |
| "reward_before_std": 0.48400300554931164, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2300279177725315, | |
| "reward_change_min": -0.38487651385366917, | |
| "reward_change_std": 0.1520459521561861, | |
| "reward_std": 0.49797692708671093, | |
| "rewards/accuracy_reward": 0.16666666977107525, | |
| "rewards/cosine_scaled_reward": -0.10586108081042767, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2841.1666870117188, | |
| "epoch": 0.17942857142857144, | |
| "grad_norm": 0.020490428432822227, | |
| "kl": 0.00014954805374145508, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.2629708984760706e-07, | |
| "loss": 0.0408, | |
| "reward": -0.16689512692391872, | |
| "reward_after_mean": -0.16689512692391872, | |
| "reward_after_std": 0.449503768235445, | |
| "reward_before_mean": 0.06273568281903863, | |
| "reward_before_std": 0.369269410148263, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.22963083535432816, | |
| "reward_change_min": -0.339257437735796, | |
| "reward_change_std": 0.12097494397312403, | |
| "reward_std": 0.44950377382338047, | |
| "rewards/accuracy_reward": 0.1666666679084301, | |
| "rewards/cosine_scaled_reward": -0.10393097810447216, | |
| "step": 157 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2291.1875610351562, | |
| "epoch": 0.18057142857142858, | |
| "grad_norm": 0.028161099180579185, | |
| "kl": 0.00011629331856966019, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.2089083427137329e-07, | |
| "loss": -0.0064, | |
| "reward": 0.3283900278620422, | |
| "reward_after_mean": 0.3283900278620422, | |
| "reward_after_std": 0.71586455963552, | |
| "reward_before_mean": 0.7013807380571961, | |
| "reward_before_std": 0.6162153771147132, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3729907236993313, | |
| "reward_change_min": -0.5737095661461353, | |
| "reward_change_std": 0.22160479053854942, | |
| "reward_std": 0.715864596888423, | |
| "rewards/accuracy_reward": 0.41666666977107525, | |
| "rewards/cosine_scaled_reward": 0.2847140731755644, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3291.9584045410156, | |
| "epoch": 0.18171428571428572, | |
| "grad_norm": 0.01683027669787407, | |
| "kl": 0.00015926361083984375, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.1558482853517253e-07, | |
| "loss": -0.0387, | |
| "reward": -0.14818060956895351, | |
| "reward_after_mean": -0.14818060956895351, | |
| "reward_after_std": 0.4753723032772541, | |
| "reward_before_mean": 0.09327768813818693, | |
| "reward_before_std": 0.46588534861803055, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.24145829305052757, | |
| "reward_change_min": -0.4540882632136345, | |
| "reward_change_std": 0.1682984195649624, | |
| "reward_std": 0.4753723070025444, | |
| "rewards/accuracy_reward": 0.1666666716337204, | |
| "rewards/cosine_scaled_reward": -0.07338898256421089, | |
| "step": 159 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2902.479217529297, | |
| "epoch": 0.18285714285714286, | |
| "grad_norm": 0.023850562050938606, | |
| "kl": 0.00019377470016479492, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.1038068889975259e-07, | |
| "loss": 0.0365, | |
| "reward": 0.07942142337560654, | |
| "reward_after_mean": 0.07942142337560654, | |
| "reward_after_std": 0.5131905730813742, | |
| "reward_before_mean": 0.39866685029119253, | |
| "reward_before_std": 0.48175664618611336, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3192454166710377, | |
| "reward_change_min": -0.5308761186897755, | |
| "reward_change_std": 0.2128805061802268, | |
| "reward_std": 0.5131905842572451, | |
| "rewards/accuracy_reward": 0.3333333358168602, | |
| "rewards/cosine_scaled_reward": 0.06533349771052599, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2287.291732788086, | |
| "epoch": 0.184, | |
| "grad_norm": 0.025397833436727524, | |
| "kl": 0.00013327598571777344, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.0528000059645995e-07, | |
| "loss": 0.1125, | |
| "reward": 0.009378287941217422, | |
| "reward_after_mean": 0.009378287941217422, | |
| "reward_after_std": 0.5341452080756426, | |
| "reward_before_mean": 0.30027469992637634, | |
| "reward_before_std": 0.5237694401293993, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.290896400809288, | |
| "reward_change_min": -0.46712340973317623, | |
| "reward_change_std": 0.18900143820792437, | |
| "reward_std": 0.5341452155262232, | |
| "rewards/accuracy_reward": 0.29166667722165585, | |
| "rewards/cosine_scaled_reward": 0.008608013857156038, | |
| "step": 161 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3025.312545776367, | |
| "epoch": 0.18514285714285714, | |
| "grad_norm": 0.02271593175828457, | |
| "kl": 0.00016546249389648438, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.0028431734436308e-07, | |
| "loss": 0.0268, | |
| "reward": 0.14291446842253208, | |
| "reward_after_mean": 0.14291446842253208, | |
| "reward_after_std": 0.4402059204876423, | |
| "reward_before_mean": 0.4916996471583843, | |
| "reward_before_std": 0.36170108430087566, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3487852066755295, | |
| "reward_change_min": -0.5253034494817257, | |
| "reward_change_std": 0.20280368253588676, | |
| "reward_std": 0.4402059353888035, | |
| "rewards/accuracy_reward": 0.3958333395421505, | |
| "rewards/cosine_scaled_reward": 0.09586631692945957, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2381.000030517578, | |
| "epoch": 0.18628571428571428, | |
| "grad_norm": 0.02407103404402733, | |
| "kl": 0.00012842798605561256, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.9539516087697517e-07, | |
| "loss": 0.0324, | |
| "reward": 0.11635435372591019, | |
| "reward_after_mean": 0.11635435372591019, | |
| "reward_after_std": 0.34932328946888447, | |
| "reward_before_mean": 0.46577244251966476, | |
| "reward_before_std": 0.26789624989032745, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.34941811859607697, | |
| "reward_change_min": -0.4971570298075676, | |
| "reward_change_std": 0.18925141356885433, | |
| "reward_std": 0.3493233025074005, | |
| "rewards/accuracy_reward": 0.3541666716337204, | |
| "rewards/cosine_scaled_reward": 0.11160578578710556, | |
| "step": 163 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2291.8750610351562, | |
| "epoch": 0.18742857142857142, | |
| "grad_norm": 0.028548384085297585, | |
| "kl": 0.00012259185314178467, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.9061402047871833e-07, | |
| "loss": 0.0285, | |
| "reward": 0.1446489430963993, | |
| "reward_after_mean": 0.1446489430963993, | |
| "reward_after_std": 0.5495432000607252, | |
| "reward_before_mean": 0.4804554167203605, | |
| "reward_before_std": 0.5176653284579515, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.33580648340284824, | |
| "reward_change_min": -0.5389121547341347, | |
| "reward_change_std": 0.21476514916867018, | |
| "reward_std": 0.5495432112365961, | |
| "rewards/accuracy_reward": 0.37500000558793545, | |
| "rewards/cosine_scaled_reward": 0.1054554246366024, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2865.854248046875, | |
| "epoch": 0.18857142857142858, | |
| "grad_norm": 0.02938673086464405, | |
| "kl": 0.00016605854034423828, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.8594235253127372e-07, | |
| "loss": 0.1174, | |
| "reward": -0.1965638529509306, | |
| "reward_after_mean": -0.1965638529509306, | |
| "reward_after_std": 0.47414442524313927, | |
| "reward_before_mean": 0.02646741457283497, | |
| "reward_before_std": 0.45417727902531624, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.22303126752376556, | |
| "reward_change_min": -0.38297396898269653, | |
| "reward_change_std": 0.14797239750623703, | |
| "reward_std": 0.47414442524313927, | |
| "rewards/accuracy_reward": 0.14583333395421505, | |
| "rewards/cosine_scaled_reward": -0.11936591006815434, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2499.2291870117188, | |
| "epoch": 0.18971428571428572, | |
| "grad_norm": 0.01974744163453579, | |
| "kl": 0.0001074373722076416, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.8138158006995363e-07, | |
| "loss": 0.0582, | |
| "reward": 0.16037676320411265, | |
| "reward_after_mean": 0.16037676320411265, | |
| "reward_after_std": 0.5590666178613901, | |
| "reward_before_mean": 0.49760086461901665, | |
| "reward_before_std": 0.5006757825613022, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.33722409792244434, | |
| "reward_change_min": -0.4930756986141205, | |
| "reward_change_std": 0.1948847435414791, | |
| "reward_std": 0.5590666364878416, | |
| "rewards/accuracy_reward": 0.3541666753590107, | |
| "rewards/cosine_scaled_reward": 0.14343418591306545, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2107.625030517578, | |
| "epoch": 0.19085714285714286, | |
| "grad_norm": 0.025774935260415077, | |
| "kl": 0.00010276585817337036, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.7693309235023127e-07, | |
| "loss": -0.0464, | |
| "reward": -0.030031360685825348, | |
| "reward_after_mean": -0.030031360685825348, | |
| "reward_after_std": 0.4247422106564045, | |
| "reward_before_mean": 0.2586988788098097, | |
| "reward_before_std": 0.37567474879324436, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.28873022459447384, | |
| "reward_change_min": -0.4525489006191492, | |
| "reward_change_std": 0.17794163059443235, | |
| "reward_std": 0.42474222742021084, | |
| "rewards/accuracy_reward": 0.2708333358168602, | |
| "rewards/cosine_scaled_reward": -0.012134447693824768, | |
| "step": 167 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2797.791717529297, | |
| "epoch": 0.192, | |
| "grad_norm": 0.019881395623087883, | |
| "kl": 0.00014641880989074707, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.7259824442455923e-07, | |
| "loss": 0.0198, | |
| "reward": -0.06211714819073677, | |
| "reward_after_mean": -0.06211714819073677, | |
| "reward_after_std": 0.442181596532464, | |
| "reward_before_mean": 0.21284539625048637, | |
| "reward_before_std": 0.3944641398265958, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2749625276774168, | |
| "reward_change_min": -0.46516215056180954, | |
| "reward_change_std": 0.1713942475616932, | |
| "reward_std": 0.4421816077083349, | |
| "rewards/accuracy_reward": 0.2291666679084301, | |
| "rewards/cosine_scaled_reward": -0.01632128842175007, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2072.145881652832, | |
| "epoch": 0.19314285714285714, | |
| "grad_norm": 0.024925388395786285, | |
| "kl": 0.00010547041893005371, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.6837835672960831e-07, | |
| "loss": -0.039, | |
| "reward": 0.33457482111407444, | |
| "reward_after_mean": 0.33457482111407444, | |
| "reward_after_std": 0.45331256836652756, | |
| "reward_before_mean": 0.7455343250185251, | |
| "reward_before_std": 0.3269349467009306, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.41095951199531555, | |
| "reward_change_min": -0.5560596249997616, | |
| "reward_change_std": 0.2161042196676135, | |
| "reward_std": 0.45331257209181786, | |
| "rewards/accuracy_reward": 0.5000000055879354, | |
| "rewards/cosine_scaled_reward": 0.24553431570529938, | |
| "step": 169 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2338.166717529297, | |
| "epoch": 0.19428571428571428, | |
| "grad_norm": 0.027214445173740387, | |
| "kl": 0.00011979043483734131, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.6427471468404952e-07, | |
| "loss": -0.0404, | |
| "reward": 0.05816604569554329, | |
| "reward_after_mean": 0.05816604569554329, | |
| "reward_after_std": 0.33920222520828247, | |
| "reward_before_mean": 0.38867138512432575, | |
| "reward_before_std": 0.26190874679014087, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.33050532080233097, | |
| "reward_change_min": -0.47121275775134563, | |
| "reward_change_std": 0.1878534136340022, | |
| "reward_std": 0.33920223265886307, | |
| "rewards/accuracy_reward": 0.3333333358168602, | |
| "rewards/cosine_scaled_reward": 0.055338045582175255, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2271.375015258789, | |
| "epoch": 0.19542857142857142, | |
| "grad_norm": 0.027967043220996857, | |
| "kl": 0.00012468546628952026, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.6028856829700258e-07, | |
| "loss": -0.0165, | |
| "reward": 0.06615722924470901, | |
| "reward_after_mean": 0.06615722924470901, | |
| "reward_after_std": 0.4302575755864382, | |
| "reward_before_mean": 0.38474041223526, | |
| "reward_before_std": 0.33841412514448166, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3185831569135189, | |
| "reward_change_min": -0.4822757709771395, | |
| "reward_change_std": 0.17764397989958525, | |
| "reward_std": 0.4302575970068574, | |
| "rewards/accuracy_reward": 0.31250000186264515, | |
| "rewards/cosine_scaled_reward": 0.07224038429558277, | |
| "step": 171 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2777.7708702087402, | |
| "epoch": 0.19657142857142856, | |
| "grad_norm": 0.02615823782980442, | |
| "kl": 0.00017011165618896484, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.5642113178727193e-07, | |
| "loss": -0.0005, | |
| "reward": 0.03716196492314339, | |
| "reward_after_mean": 0.03716196492314339, | |
| "reward_after_std": 0.44201391376554966, | |
| "reward_before_mean": 0.3422218947671354, | |
| "reward_before_std": 0.3421974731609225, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.30505993589758873, | |
| "reward_change_min": -0.4638092163950205, | |
| "reward_change_std": 0.17159069795161486, | |
| "reward_std": 0.44201392494142056, | |
| "rewards/accuracy_reward": 0.31250000186264515, | |
| "rewards/cosine_scaled_reward": 0.029721886618062854, | |
| "step": 172 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1847.3333435058594, | |
| "epoch": 0.1977142857142857, | |
| "grad_norm": 0.03499305993318558, | |
| "kl": 7.937708869576454e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.5267358321348285e-07, | |
| "loss": 0.0217, | |
| "reward": -0.23481484316289425, | |
| "reward_after_mean": -0.23481484316289425, | |
| "reward_after_std": 0.24997185822576284, | |
| "reward_before_mean": -0.0019354680553078651, | |
| "reward_before_std": 0.1694914740510285, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.23287938348948956, | |
| "reward_change_min": -0.31925959698855877, | |
| "reward_change_std": 0.12057728180661798, | |
| "reward_std": 0.24997186101973057, | |
| "rewards/accuracy_reward": 0.14583333395421505, | |
| "rewards/cosine_scaled_reward": -0.14776881225407124, | |
| "step": 173 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1855.6666870117188, | |
| "epoch": 0.19885714285714284, | |
| "grad_norm": 0.025248348712921143, | |
| "kl": 0.00010962784290313721, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.4904706411523448e-07, | |
| "loss": -0.0114, | |
| "reward": -0.07872031070291996, | |
| "reward_after_mean": -0.07872031070291996, | |
| "reward_after_std": 0.5705854296684265, | |
| "reward_before_mean": 0.17297677602618933, | |
| "reward_before_std": 0.5529223121702671, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.25169707648456097, | |
| "reward_change_min": -0.41667111963033676, | |
| "reward_change_std": 0.16653849836438894, | |
| "reward_std": 0.5705854464322329, | |
| "rewards/accuracy_reward": 0.1875000037252903, | |
| "rewards/cosine_scaled_reward": -0.01452323398552835, | |
| "step": 174 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2315.5000190734863, | |
| "epoch": 0.2, | |
| "grad_norm": 0.025843625888228416, | |
| "kl": 0.0001019798219203949, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.4554267916537495e-07, | |
| "loss": -0.0411, | |
| "reward": 0.044883210211992264, | |
| "reward_after_mean": 0.044883210211992264, | |
| "reward_after_std": 0.3484018575400114, | |
| "reward_before_mean": 0.3673110632225871, | |
| "reward_before_std": 0.2589006684720516, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.322427861392498, | |
| "reward_change_min": -0.4825479593127966, | |
| "reward_change_std": 0.1801274660974741, | |
| "reward_std": 0.34840187057852745, | |
| "rewards/accuracy_reward": 0.2916666679084301, | |
| "rewards/cosine_scaled_reward": 0.07564437948167324, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2629.7708740234375, | |
| "epoch": 0.20114285714285715, | |
| "grad_norm": 0.026233607903122902, | |
| "kl": 0.00010827556252479553, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.4216149583350755e-07, | |
| "loss": 0.0766, | |
| "reward": 0.07939248671755195, | |
| "reward_after_mean": 0.07939248671755195, | |
| "reward_after_std": 0.6790229994803667, | |
| "reward_before_mean": 0.3831321783363819, | |
| "reward_before_std": 0.7209639446809888, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3037397023290396, | |
| "reward_change_min": -0.5897121950984001, | |
| "reward_change_std": 0.23858889937400818, | |
| "reward_std": 0.6790230087935925, | |
| "rewards/accuracy_reward": 0.3541666753590107, | |
| "rewards/cosine_scaled_reward": 0.028965501580387354, | |
| "step": 176 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2575.916702270508, | |
| "epoch": 0.2022857142857143, | |
| "grad_norm": 0.025286749005317688, | |
| "kl": 0.00016096234321594238, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.3890454406082956e-07, | |
| "loss": -0.0311, | |
| "reward": -0.19152541272342205, | |
| "reward_after_mean": -0.19152541272342205, | |
| "reward_after_std": 0.48578111454844475, | |
| "reward_before_mean": 0.033257571049034595, | |
| "reward_before_std": 0.47964665945619345, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.22478299029171467, | |
| "reward_change_min": -0.4445956815034151, | |
| "reward_change_std": 0.16561621148139238, | |
| "reward_std": 0.4857811164110899, | |
| "rewards/accuracy_reward": 0.1458333358168602, | |
| "rewards/cosine_scaled_reward": -0.11257576791103929, | |
| "step": 177 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2311.7291870117188, | |
| "epoch": 0.20342857142857143, | |
| "grad_norm": 0.027437448501586914, | |
| "kl": 0.00011655688285827637, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.3577281594640182e-07, | |
| "loss": -0.0543, | |
| "reward": 0.1088696513324976, | |
| "reward_after_mean": 0.1088696513324976, | |
| "reward_after_std": 0.469313045963645, | |
| "reward_before_mean": 0.44181622844189405, | |
| "reward_before_std": 0.41828347370028496, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3329465799033642, | |
| "reward_change_min": -0.4839160367846489, | |
| "reward_change_std": 0.19409743417054415, | |
| "reward_std": 0.4693130645900965, | |
| "rewards/accuracy_reward": 0.33333334140479565, | |
| "rewards/cosine_scaled_reward": 0.10848287865519524, | |
| "step": 178 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2721.7500610351562, | |
| "epoch": 0.20457142857142857, | |
| "grad_norm": 0.022730229422450066, | |
| "kl": 0.00014713406562805176, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.3276726544494571e-07, | |
| "loss": 0.034, | |
| "reward": -0.2384864166378975, | |
| "reward_after_mean": -0.2384864166378975, | |
| "reward_after_std": 0.48379051871597767, | |
| "reward_before_mean": -0.037665948970243335, | |
| "reward_before_std": 0.4321022112853825, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.20082047581672668, | |
| "reward_change_min": -0.33639476634562016, | |
| "reward_change_std": 0.12049192376434803, | |
| "reward_std": 0.48379052244126797, | |
| "rewards/accuracy_reward": 0.0833333358168602, | |
| "rewards/cosine_scaled_reward": -0.12099928548559546, | |
| "step": 179 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2094.208354949951, | |
| "epoch": 0.2057142857142857, | |
| "grad_norm": 0.0351104699075222, | |
| "kl": 0.00013990700244903564, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.2988880807625927e-07, | |
| "loss": -0.0636, | |
| "reward": 0.09567644819617271, | |
| "reward_after_mean": 0.09567644819617271, | |
| "reward_after_std": 0.28684780560433865, | |
| "reward_before_mean": 0.44278225488960743, | |
| "reward_before_std": 0.15074736287351698, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3471058327704668, | |
| "reward_change_min": -0.45528485253453255, | |
| "reward_change_std": 0.17486567981541157, | |
| "reward_std": 0.2868478149175644, | |
| "rewards/accuracy_reward": 0.375, | |
| "rewards/cosine_scaled_reward": 0.06778226979076862, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3052.812530517578, | |
| "epoch": 0.20685714285714285, | |
| "grad_norm": 0.0223699901252985, | |
| "kl": 0.00018121302127838135, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.2713832064634125e-07, | |
| "loss": 0.0092, | |
| "reward": -0.17876180354505777, | |
| "reward_after_mean": -0.17876180354505777, | |
| "reward_after_std": 0.3157486580312252, | |
| "reward_before_mean": 0.0692460760474205, | |
| "reward_before_std": 0.24795555789023638, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.24800788797438145, | |
| "reward_change_min": -0.3895928617566824, | |
| "reward_change_std": 0.14537531603127718, | |
| "reward_std": 0.31574865989387035, | |
| "rewards/accuracy_reward": 0.1666666679084301, | |
| "rewards/cosine_scaled_reward": -0.09742058239498874, | |
| "step": 181 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1716.9791870117188, | |
| "epoch": 0.208, | |
| "grad_norm": 0.02780199609696865, | |
| "kl": 6.859749555587769e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.2451664098030743e-07, | |
| "loss": -0.077, | |
| "reward": 0.143024992197752, | |
| "reward_after_mean": 0.143024992197752, | |
| "reward_after_std": 0.6203626422211528, | |
| "reward_before_mean": 0.47820378467440605, | |
| "reward_before_std": 0.6523203919641674, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3351787682622671, | |
| "reward_change_min": -0.6069156751036644, | |
| "reward_change_std": 0.24839763902127743, | |
| "reward_std": 0.620362657122314, | |
| "rewards/accuracy_reward": 0.3958333395421505, | |
| "rewards/cosine_scaled_reward": 0.08237039996311069, | |
| "step": 182 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1873.270881652832, | |
| "epoch": 0.20914285714285713, | |
| "grad_norm": 0.032433025538921356, | |
| "kl": 7.05718994140625e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.220245676671809e-07, | |
| "loss": -0.0625, | |
| "reward": 0.04434862919151783, | |
| "reward_after_mean": 0.04434862919151783, | |
| "reward_after_std": 0.5306362751871347, | |
| "reward_before_mean": 0.3466955330222845, | |
| "reward_before_std": 0.5030294321477413, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3023468852043152, | |
| "reward_change_min": -0.5408249255269766, | |
| "reward_change_std": 0.2029698370024562, | |
| "reward_std": 0.5306362900882959, | |
| "rewards/accuracy_reward": 0.2916666716337204, | |
| "rewards/cosine_scaled_reward": 0.05502885114401579, | |
| "step": 183 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2780.0000038146973, | |
| "epoch": 0.2102857142857143, | |
| "grad_norm": 0.03345809876918793, | |
| "kl": 0.00015169382095336914, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.1966285981663407e-07, | |
| "loss": -0.0395, | |
| "reward": -0.2764766328036785, | |
| "reward_after_mean": -0.2764766328036785, | |
| "reward_after_std": 0.28763100504875183, | |
| "reward_before_mean": -0.05617565102875233, | |
| "reward_before_std": 0.27192449755966663, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.22030097991228104, | |
| "reward_change_min": -0.33715784922242165, | |
| "reward_change_std": 0.13025930058211088, | |
| "reward_std": 0.2876310106366873, | |
| "rewards/accuracy_reward": 0.1041666716337204, | |
| "rewards/cosine_scaled_reward": -0.16034232266247272, | |
| "step": 184 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2354.2083587646484, | |
| "epoch": 0.21142857142857144, | |
| "grad_norm": 0.02904898300766945, | |
| "kl": 8.22991132736206e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.1743223682775649e-07, | |
| "loss": -0.0195, | |
| "reward": -0.06851540505886078, | |
| "reward_after_mean": -0.06851540505886078, | |
| "reward_after_std": 0.4076218158006668, | |
| "reward_before_mean": 0.20718638133257627, | |
| "reward_before_std": 0.3651102539151907, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.27570181526243687, | |
| "reward_change_min": -0.4055694956332445, | |
| "reward_change_std": 0.15835797414183617, | |
| "reward_std": 0.40762182511389256, | |
| "rewards/accuracy_reward": 0.22916666977107525, | |
| "rewards/cosine_scaled_reward": -0.021980268880724907, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2604.5625, | |
| "epoch": 0.21257142857142858, | |
| "grad_norm": 0.020262470468878746, | |
| "kl": 0.00013430416584014893, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.1533337816991931e-07, | |
| "loss": -0.0241, | |
| "reward": -0.04451223462820053, | |
| "reward_after_mean": -0.04451223462820053, | |
| "reward_after_std": 0.5044716745615005, | |
| "reward_before_mean": 0.22654481540666893, | |
| "reward_before_std": 0.450691357254982, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.27105705067515373, | |
| "reward_change_min": -0.45378575660288334, | |
| "reward_change_std": 0.16853850428014994, | |
| "reward_std": 0.5044716857373714, | |
| "rewards/accuracy_reward": 0.25000000186264515, | |
| "rewards/cosine_scaled_reward": -0.023455200716853142, | |
| "step": 186 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2542.2917098999023, | |
| "epoch": 0.21371428571428572, | |
| "grad_norm": 0.024545790627598763, | |
| "kl": 0.0001626908779144287, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.1336692317580158e-07, | |
| "loss": 0.0473, | |
| "reward": -0.1829390935599804, | |
| "reward_after_mean": -0.1829390935599804, | |
| "reward_after_std": 0.3984376899898052, | |
| "reward_before_mean": 0.05813688226044178, | |
| "reward_before_std": 0.3919172268360853, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.24107596464455128, | |
| "reward_change_min": -0.4061664007604122, | |
| "reward_change_std": 0.15977757051587105, | |
| "reward_std": 0.3984377086162567, | |
| "rewards/accuracy_reward": 0.1458333358168602, | |
| "rewards/cosine_scaled_reward": -0.08769645728170872, | |
| "step": 187 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3444.750030517578, | |
| "epoch": 0.21485714285714286, | |
| "grad_norm": 0.017645789310336113, | |
| "kl": 0.00020623207092285156, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.1153347084664419e-07, | |
| "loss": 0.0223, | |
| "reward": -0.1741393506526947, | |
| "reward_after_mean": -0.1741393506526947, | |
| "reward_after_std": 0.3922067657113075, | |
| "reward_before_mean": 0.06998030468821526, | |
| "reward_before_std": 0.3800971172749996, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.24411965906620026, | |
| "reward_change_min": -0.40364984050393105, | |
| "reward_change_std": 0.1577291926369071, | |
| "reward_std": 0.3922067675739527, | |
| "rewards/accuracy_reward": 0.1458333358168602, | |
| "rewards/cosine_scaled_reward": -0.07585303112864494, | |
| "step": 188 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1966.9375305175781, | |
| "epoch": 0.216, | |
| "grad_norm": 0.034927211701869965, | |
| "kl": 9.391456842422485e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.0983357966978745e-07, | |
| "loss": -0.0265, | |
| "reward": -0.12510699033737183, | |
| "reward_after_mean": -0.12510699033737183, | |
| "reward_after_std": 0.5509497374296188, | |
| "reward_before_mean": 0.10953010153025389, | |
| "reward_before_std": 0.5270057059824467, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.23463710024952888, | |
| "reward_change_min": -0.40135206654667854, | |
| "reward_change_std": 0.15052294824272394, | |
| "reward_std": 0.5509497616440058, | |
| "rewards/accuracy_reward": 0.1875000074505806, | |
| "rewards/cosine_scaled_reward": -0.07796989846974611, | |
| "step": 189 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2617.0416946411133, | |
| "epoch": 0.21714285714285714, | |
| "grad_norm": 0.026627399027347565, | |
| "kl": 0.00012832880020141602, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.0826776744855121e-07, | |
| "loss": 0.0292, | |
| "reward": 0.10528106242418289, | |
| "reward_after_mean": 0.10528106242418289, | |
| "reward_after_std": 0.6541636940091848, | |
| "reward_before_mean": 0.4219757579267025, | |
| "reward_before_std": 0.6895054774358869, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3166946694254875, | |
| "reward_change_min": -0.6117767505347729, | |
| "reward_change_std": 0.24059188924729824, | |
| "reward_std": 0.6541637200862169, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/cosine_scaled_reward": 0.08864239510148764, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2205.750030517578, | |
| "epoch": 0.21828571428571428, | |
| "grad_norm": 0.029876256361603737, | |
| "kl": 0.00012889504432678223, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.068365111445064e-07, | |
| "loss": 0.0117, | |
| "reward": 0.07531885802745819, | |
| "reward_after_mean": 0.07531885802745819, | |
| "reward_after_std": 0.4699189569801092, | |
| "reward_before_mean": 0.39738621190190315, | |
| "reward_before_std": 0.4458512868732214, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.32206736505031586, | |
| "reward_change_min": -0.537959199398756, | |
| "reward_change_std": 0.20273534674197435, | |
| "reward_std": 0.46991895884275436, | |
| "rewards/accuracy_reward": 0.3333333395421505, | |
| "rewards/cosine_scaled_reward": 0.06405287701636553, | |
| "step": 191 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2956.979263305664, | |
| "epoch": 0.21942857142857142, | |
| "grad_norm": 0.02069353684782982, | |
| "kl": 0.0001544952392578125, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.0554024673218806e-07, | |
| "loss": -0.0794, | |
| "reward": -0.16378409788012505, | |
| "reward_after_mean": -0.16378409788012505, | |
| "reward_after_std": 0.33716665115207434, | |
| "reward_before_mean": 0.08590960502624512, | |
| "reward_before_std": 0.2754701506346464, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.24969367682933807, | |
| "reward_change_min": -0.4076712429523468, | |
| "reward_change_std": 0.14498529862612486, | |
| "reward_std": 0.33716665860265493, | |
| "rewards/accuracy_reward": 0.1666666679084301, | |
| "rewards/cosine_scaled_reward": -0.08075705729424953, | |
| "step": 192 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2565.6459045410156, | |
| "epoch": 0.22057142857142858, | |
| "grad_norm": 0.02433474361896515, | |
| "kl": 0.00013750791549682617, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.0437936906629334e-07, | |
| "loss": 0.0119, | |
| "reward": -0.11538952589035034, | |
| "reward_after_mean": -0.11538952589035034, | |
| "reward_after_std": 0.4653550498187542, | |
| "reward_before_mean": 0.14366911351680756, | |
| "reward_before_std": 0.47822121903300285, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.25905864126980305, | |
| "reward_change_min": -0.4472095873206854, | |
| "reward_change_std": 0.18343419581651688, | |
| "reward_std": 0.46535505168139935, | |
| "rewards/accuracy_reward": 0.2083333358168602, | |
| "rewards/cosine_scaled_reward": -0.06466422416269779, | |
| "step": 193 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2630.2708587646484, | |
| "epoch": 0.22171428571428572, | |
| "grad_norm": 0.021205004304647446, | |
| "kl": 0.00015100836753845215, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.0335423176140511e-07, | |
| "loss": -0.014, | |
| "reward": 0.5263579860329628, | |
| "reward_after_mean": 0.5263579860329628, | |
| "reward_after_std": 0.5279496256262064, | |
| "reward_before_mean": 1.0014060586690903, | |
| "reward_before_std": 0.43055359087884426, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.47504812479019165, | |
| "reward_change_min": -0.7046387940645218, | |
| "reward_change_std": 0.2720959987491369, | |
| "reward_std": 0.5279496368020773, | |
| "rewards/accuracy_reward": 0.6250000111758709, | |
| "rewards/cosine_scaled_reward": 0.37640603724867105, | |
| "step": 194 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2478.1250381469727, | |
| "epoch": 0.22285714285714286, | |
| "grad_norm": 0.02273085154592991, | |
| "kl": 0.00011633709073066711, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.0246514708427701e-07, | |
| "loss": -0.0778, | |
| "reward": -0.03121163323521614, | |
| "reward_after_mean": -0.03121163323521614, | |
| "reward_after_std": 0.542201291769743, | |
| "reward_before_mean": 0.2403367217630148, | |
| "reward_before_std": 0.49494979437440634, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.27154832519590855, | |
| "reward_change_min": -0.47133193723857403, | |
| "reward_change_std": 0.1756760822609067, | |
| "reward_std": 0.5422013197094202, | |
| "rewards/accuracy_reward": 0.25000000186264515, | |
| "rewards/cosine_scaled_reward": -0.009663309436291456, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3269.354217529297, | |
| "epoch": 0.224, | |
| "grad_norm": 0.02504117041826248, | |
| "kl": 0.00018531084060668945, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.017123858587145e-07, | |
| "loss": 0.0545, | |
| "reward": -0.16638334095478058, | |
| "reward_after_mean": -0.16638334095478058, | |
| "reward_after_std": 0.3524969248101115, | |
| "reward_before_mean": 0.08567467518150806, | |
| "reward_before_std": 0.33011660259217024, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.25205800868570805, | |
| "reward_change_min": -0.38781771063804626, | |
| "reward_change_std": 0.15303326025605202, | |
| "reward_std": 0.3524969294667244, | |
| "rewards/accuracy_reward": 0.1875000074505806, | |
| "rewards/cosine_scaled_reward": -0.10182532295584679, | |
| "step": 196 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2431.6667404174805, | |
| "epoch": 0.22514285714285714, | |
| "grad_norm": 0.03106440044939518, | |
| "kl": 0.0001646280288696289, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.0109617738307911e-07, | |
| "loss": -0.0034, | |
| "reward": 0.3956623272970319, | |
| "reward_after_mean": 0.3956623272970319, | |
| "reward_after_std": 0.9187420383095741, | |
| "reward_before_mean": 0.7849392527714372, | |
| "reward_before_std": 0.980433851480484, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3892769180238247, | |
| "reward_change_min": -0.7530074659734964, | |
| "reward_change_std": 0.3080608732998371, | |
| "reward_std": 0.9187420606613159, | |
| "rewards/accuracy_reward": 0.5416666809469461, | |
| "rewards/cosine_scaled_reward": 0.24327257159166038, | |
| "step": 197 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2481.4375534057617, | |
| "epoch": 0.22628571428571428, | |
| "grad_norm": 0.029221223667263985, | |
| "kl": 0.00012829899787902832, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.0061670936044178e-07, | |
| "loss": 0.0738, | |
| "reward": -0.051010750234127045, | |
| "reward_after_mean": -0.051010750234127045, | |
| "reward_after_std": 0.5678279399871826, | |
| "reward_before_mean": 0.21409518271684647, | |
| "reward_before_std": 0.5671654343605042, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2651059068739414, | |
| "reward_change_min": -0.44747380912303925, | |
| "reward_change_std": 0.18114776257425547, | |
| "reward_std": 0.5678279716521502, | |
| "rewards/accuracy_reward": 0.25000000558793545, | |
| "rewards/cosine_scaled_reward": -0.03590485081076622, | |
| "step": 198 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3558.1041870117188, | |
| "epoch": 0.22742857142857142, | |
| "grad_norm": 0.017193729057908058, | |
| "kl": 0.0001755356788635254, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.002741278414069e-07, | |
| "loss": 0.0034, | |
| "reward": -0.16698902659118176, | |
| "reward_after_mean": -0.16698902659118176, | |
| "reward_after_std": 0.5533907692879438, | |
| "reward_before_mean": 0.055191148683661595, | |
| "reward_before_std": 0.5313135031610727, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2221801672130823, | |
| "reward_change_min": -0.39364006742835045, | |
| "reward_change_std": 0.14498097822070122, | |
| "reward_std": 0.5533907972276211, | |
| "rewards/accuracy_reward": 0.1666666716337204, | |
| "rewards/cosine_scaled_reward": -0.11147552821785212, | |
| "step": 199 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1830.0000686645508, | |
| "epoch": 0.22857142857142856, | |
| "grad_norm": 0.028155136853456497, | |
| "kl": 9.66787338256836e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.0006853717962393e-07, | |
| "loss": -0.0584, | |
| "reward": 0.29632306285202503, | |
| "reward_after_mean": 0.29632306285202503, | |
| "reward_after_std": 0.5755883920937777, | |
| "reward_before_mean": 0.6877593696117401, | |
| "reward_before_std": 0.5522297900170088, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.39143630117177963, | |
| "reward_change_min": -0.6200563348829746, | |
| "reward_change_std": 0.24899613857269287, | |
| "reward_std": 0.5755884237587452, | |
| "rewards/accuracy_reward": 0.4583333469927311, | |
| "rewards/cosine_scaled_reward": 0.22942602587863803, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.22857142857142856, | |
| "step": 200, | |
| "total_flos": 0.0, | |
| "train_loss": 0.009834988048532977, | |
| "train_runtime": 61157.4359, | |
| "train_samples_per_second": 0.157, | |
| "train_steps_per_second": 0.003 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 200, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 10, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 6, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |