MMR-DAPO-7B / trainer_state.json
kangdawei's picture
Model save
c2d0fec verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.22857142857142856,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_fraction": 0.0,
"completion_length": 2253.854206085205,
"epoch": 0.001142857142857143,
"grad_norm": 0.029817435890436172,
"kl": 0.0,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 0.0,
"loss": -0.048,
"reward": 0.18865508306771517,
"reward_after_mean": 0.18865508306771517,
"reward_after_std": 0.5825161132961512,
"reward_before_mean": 0.5353203006088734,
"reward_before_std": 0.5411310354247689,
"reward_change_max": 0.0,
"reward_change_mean": -0.3466652315109968,
"reward_change_min": -0.5511383786797523,
"reward_change_std": 0.21760745346546173,
"reward_std": 0.5825161281973124,
"rewards/accuracy_reward": 0.37500000931322575,
"rewards/cosine_scaled_reward": 0.16032031644135714,
"step": 1
},
{
"clip_fraction": 0.0,
"completion_length": 2566.395854949951,
"epoch": 0.002285714285714286,
"grad_norm": 0.025483140721917152,
"kl": 0.0,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5e-08,
"loss": 0.0336,
"reward": 0.19053915701806545,
"reward_after_mean": 0.19053915701806545,
"reward_after_std": 0.5598375909030437,
"reward_before_mean": 0.5439198296517134,
"reward_before_std": 0.5335724893957376,
"reward_change_max": 0.0,
"reward_change_mean": -0.3533806595951319,
"reward_change_min": -0.5397481862455606,
"reward_change_std": 0.22024841140955687,
"reward_std": 0.5598376058042049,
"rewards/accuracy_reward": 0.41666667722165585,
"rewards/cosine_scaled_reward": 0.12725313939154148,
"step": 2
},
{
"clip_fraction": 0.0,
"completion_length": 2870.9166946411133,
"epoch": 0.0034285714285714284,
"grad_norm": 0.02348862774670124,
"kl": 0.00016453862190246582,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1e-07,
"loss": 0.0332,
"reward": -0.11309619061648846,
"reward_after_mean": -0.11309619061648846,
"reward_after_std": 0.4816359058022499,
"reward_before_mean": 0.13815331272780895,
"reward_before_std": 0.4635454909875989,
"reward_change_max": 0.0,
"reward_change_mean": -0.2512495145201683,
"reward_change_min": -0.4654350485652685,
"reward_change_std": 0.16807015426456928,
"reward_std": 0.4816359244287014,
"rewards/accuracy_reward": 0.1875000074505806,
"rewards/cosine_scaled_reward": -0.04934668634086847,
"step": 3
},
{
"clip_fraction": 0.0,
"completion_length": 1454.0625305175781,
"epoch": 0.004571428571428572,
"grad_norm": 0.03703964129090309,
"kl": 9.372830390930176e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.5e-07,
"loss": -0.0271,
"reward": 0.31341979652643204,
"reward_after_mean": 0.31341979652643204,
"reward_after_std": 0.6322482246905565,
"reward_before_mean": 0.7019761502742767,
"reward_before_std": 0.6114191431552172,
"reward_change_max": 0.0,
"reward_change_mean": -0.38855636678636074,
"reward_change_min": -0.6810240596532822,
"reward_change_std": 0.26496267691254616,
"reward_std": 0.632248230278492,
"rewards/accuracy_reward": 0.4791666679084301,
"rewards/cosine_scaled_reward": 0.2228094656020403,
"step": 4
},
{
"clip_fraction": 0.0,
"completion_length": 3106.2708892822266,
"epoch": 0.005714285714285714,
"grad_norm": 0.02083689533174038,
"kl": 0.00016355514526367188,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2e-07,
"loss": 0.014,
"reward": -0.14295638352632523,
"reward_after_mean": -0.14295638352632523,
"reward_after_std": 0.5368688032031059,
"reward_before_mean": 0.08551615010946989,
"reward_before_std": 0.4862247873097658,
"reward_change_max": 0.0,
"reward_change_mean": -0.22847254946827888,
"reward_change_min": -0.34290769696235657,
"reward_change_std": 0.12987546809017658,
"reward_std": 0.5368688274174929,
"rewards/accuracy_reward": 0.16666667349636555,
"rewards/cosine_scaled_reward": -0.08115051127970219,
"step": 5
},
{
"clip_fraction": 0.0,
"completion_length": 2359.7083587646484,
"epoch": 0.006857142857142857,
"grad_norm": 0.026594510301947594,
"kl": 0.00012198090553283691,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.5e-07,
"loss": 0.0042,
"reward": 0.02189142513088882,
"reward_after_mean": 0.02189142513088882,
"reward_after_std": 0.6753048785030842,
"reward_before_mean": 0.2973055485635996,
"reward_before_std": 0.6682278430089355,
"reward_change_max": 0.0,
"reward_change_mean": -0.2754141204059124,
"reward_change_min": -0.4809851162135601,
"reward_change_std": 0.19098785053938627,
"reward_std": 0.6753048803657293,
"rewards/accuracy_reward": 0.2708333395421505,
"rewards/cosine_scaled_reward": 0.026472217752598226,
"step": 6
},
{
"clip_fraction": 0.0,
"completion_length": 2399.3125610351562,
"epoch": 0.008,
"grad_norm": 0.02208767458796501,
"kl": 0.00013466179370880127,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3e-07,
"loss": 0.0248,
"reward": 0.04978923127055168,
"reward_after_mean": 0.04978923127055168,
"reward_after_std": 0.6014937199652195,
"reward_before_mean": 0.3478062404319644,
"reward_before_std": 0.5934015912935138,
"reward_change_max": 0.0,
"reward_change_mean": -0.298017006367445,
"reward_change_min": -0.48044517263770103,
"reward_change_std": 0.19282954651862383,
"reward_std": 0.6014937292784452,
"rewards/accuracy_reward": 0.3125000111758709,
"rewards/cosine_scaled_reward": 0.0353062367066741,
"step": 7
},
{
"clip_fraction": 0.0,
"completion_length": 1923.0625343322754,
"epoch": 0.009142857142857144,
"grad_norm": 0.025719735771417618,
"kl": 7.99819827079773e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.5e-07,
"loss": 0.0407,
"reward": 0.20353314653038979,
"reward_after_mean": 0.20353314653038979,
"reward_after_std": 0.522263016551733,
"reward_before_mean": 0.5592934358865023,
"reward_before_std": 0.4454885171726346,
"reward_change_max": 0.0,
"reward_change_mean": -0.3557603023946285,
"reward_change_min": -0.5532414987683296,
"reward_change_std": 0.20976066123694181,
"reward_std": 0.5222630221396685,
"rewards/accuracy_reward": 0.41666667349636555,
"rewards/cosine_scaled_reward": 0.1426267744973302,
"step": 8
},
{
"clip_fraction": 0.0,
"completion_length": 2551.104202270508,
"epoch": 0.010285714285714285,
"grad_norm": 0.028185561299324036,
"kl": 0.0001239469274878502,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4e-07,
"loss": -0.0295,
"reward": -0.09345190459862351,
"reward_after_mean": -0.09345190459862351,
"reward_after_std": 0.6783309075981379,
"reward_before_mean": 0.1387091837823391,
"reward_before_std": 0.6670187395066023,
"reward_change_max": 0.0,
"reward_change_mean": -0.23216108232736588,
"reward_change_min": -0.4182189740240574,
"reward_change_std": 0.1609387183561921,
"reward_std": 0.6783309262245893,
"rewards/accuracy_reward": 0.18750000558793545,
"rewards/cosine_scaled_reward": -0.04879084415733814,
"step": 9
},
{
"clip_fraction": 0.0,
"completion_length": 2372.895881652832,
"epoch": 0.011428571428571429,
"grad_norm": 0.026600031182169914,
"kl": 0.00011247396469116211,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4.5e-07,
"loss": 0.0272,
"reward": 0.042636663652956486,
"reward_after_mean": 0.042636663652956486,
"reward_after_std": 0.564329631626606,
"reward_before_mean": 0.340551670640707,
"reward_before_std": 0.5367275485768914,
"reward_change_max": 0.0,
"reward_change_mean": -0.29791501350700855,
"reward_change_min": -0.5192943438887596,
"reward_change_std": 0.2046388229355216,
"reward_std": 0.5643296670168638,
"rewards/accuracy_reward": 0.33333333767950535,
"rewards/cosine_scaled_reward": 0.007218348793685436,
"step": 10
},
{
"clip_fraction": 0.0,
"completion_length": 3101.125015258789,
"epoch": 0.012571428571428572,
"grad_norm": 0.02012811414897442,
"kl": 0.0001429915428161621,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5e-07,
"loss": 0.0255,
"reward": -0.1189196240156889,
"reward_after_mean": -0.1189196240156889,
"reward_after_std": 0.5897521004080772,
"reward_before_mean": 0.12155997939407825,
"reward_before_std": 0.607317803427577,
"reward_change_max": 0.0,
"reward_change_mean": -0.24047961458563805,
"reward_change_min": -0.47533175721764565,
"reward_change_std": 0.18331623543053865,
"reward_std": 0.5897521134465933,
"rewards/accuracy_reward": 0.20833333767950535,
"rewards/cosine_scaled_reward": -0.08677334897220135,
"step": 11
},
{
"clip_fraction": 0.0,
"completion_length": 1999.7708740234375,
"epoch": 0.013714285714285714,
"grad_norm": 0.030618170276284218,
"kl": 0.00013138353824615479,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5.5e-07,
"loss": 0.0727,
"reward": -0.11606020852923393,
"reward_after_mean": -0.11606020852923393,
"reward_after_std": 0.45726848393678665,
"reward_before_mean": 0.13688807259313762,
"reward_before_std": 0.4379520956426859,
"reward_change_max": 0.0,
"reward_change_mean": -0.25294830463826656,
"reward_change_min": -0.417834984138608,
"reward_change_std": 0.16013498976826668,
"reward_std": 0.4572684969753027,
"rewards/accuracy_reward": 0.18750000558793545,
"rewards/cosine_scaled_reward": -0.05061192624270916,
"step": 12
},
{
"clip_fraction": 0.0,
"completion_length": 2364.7916946411133,
"epoch": 0.014857142857142857,
"grad_norm": 0.021956922486424446,
"kl": 0.0001315474510192871,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6e-07,
"loss": -0.0309,
"reward": -0.07681845407932997,
"reward_after_mean": -0.07681845407932997,
"reward_after_std": 0.5858405251055956,
"reward_before_mean": 0.17525275237858295,
"reward_before_std": 0.5871480498462915,
"reward_change_max": 0.0,
"reward_change_mean": -0.25207119435071945,
"reward_change_min": -0.4718685280531645,
"reward_change_std": 0.1800235854461789,
"reward_std": 0.5858405511826277,
"rewards/accuracy_reward": 0.20833333767950535,
"rewards/cosine_scaled_reward": -0.03308058716356754,
"step": 13
},
{
"clip_fraction": 0.0,
"completion_length": 2517.3750534057617,
"epoch": 0.016,
"grad_norm": 0.02542865462601185,
"kl": 0.0001404285430908203,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6.5e-07,
"loss": -0.0269,
"reward": -0.05427968641743064,
"reward_after_mean": -0.05427968641743064,
"reward_after_std": 0.5012820027768612,
"reward_before_mean": 0.21205449337139726,
"reward_before_std": 0.4408043739385903,
"reward_change_max": 0.0,
"reward_change_mean": -0.2663341574370861,
"reward_change_min": -0.4154788888990879,
"reward_change_std": 0.15815073158591986,
"reward_std": 0.5012820195406675,
"rewards/accuracy_reward": 0.2291666679084301,
"rewards/cosine_scaled_reward": -0.0171121833845973,
"step": 14
},
{
"clip_fraction": 0.0,
"completion_length": 2587.333381652832,
"epoch": 0.017142857142857144,
"grad_norm": 0.026293920353055,
"kl": 0.00010640174150466919,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7e-07,
"loss": -0.0171,
"reward": 0.1291836015880108,
"reward_after_mean": 0.1291836015880108,
"reward_after_std": 0.34985754638910294,
"reward_before_mean": 0.48446296714246273,
"reward_before_std": 0.26648052502423525,
"reward_change_max": 0.0,
"reward_change_mean": -0.35527935065329075,
"reward_change_min": -0.5008186884224415,
"reward_change_std": 0.19560196995735168,
"reward_std": 0.34985755756497383,
"rewards/accuracy_reward": 0.3333333358168602,
"rewards/cosine_scaled_reward": 0.1511296145617962,
"step": 15
},
{
"clip_fraction": 0.0,
"completion_length": 3479.3958740234375,
"epoch": 0.018285714285714287,
"grad_norm": 0.018178097903728485,
"kl": 0.00018554925918579102,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.5e-07,
"loss": 0.0308,
"reward": -0.12316650152206421,
"reward_after_mean": -0.12316650152206421,
"reward_after_std": 0.48174857906997204,
"reward_before_mean": 0.12909814529120922,
"reward_before_std": 0.4807006008923054,
"reward_change_max": 0.0,
"reward_change_mean": -0.2522646654397249,
"reward_change_min": -0.4256697855889797,
"reward_change_std": 0.17396669182926416,
"reward_std": 0.48174858279526234,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/cosine_scaled_reward": -0.037568524945527315,
"step": 16
},
{
"clip_fraction": 0.0,
"completion_length": 2003.1458587646484,
"epoch": 0.019428571428571427,
"grad_norm": 0.037696439772844315,
"kl": 0.00012348592281341553,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8e-07,
"loss": -0.0628,
"reward": 0.28727056505158544,
"reward_after_mean": 0.28727056505158544,
"reward_after_std": 0.7488968446850777,
"reward_before_mean": 0.6418840168043971,
"reward_before_std": 0.6678282842040062,
"reward_change_max": 0.0,
"reward_change_mean": -0.3546134736388922,
"reward_change_min": -0.5937347710132599,
"reward_change_std": 0.2221750607714057,
"reward_std": 0.7488968670368195,
"rewards/accuracy_reward": 0.4583333358168602,
"rewards/cosine_scaled_reward": 0.18355069373501465,
"step": 17
},
{
"clip_fraction": 0.0,
"completion_length": 2641.895881652832,
"epoch": 0.02057142857142857,
"grad_norm": 0.018719196319580078,
"kl": 0.0001255720853805542,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.499999999999999e-07,
"loss": -0.0322,
"reward": 0.13862532377243042,
"reward_after_mean": 0.13862532377243042,
"reward_after_std": 0.4716039840131998,
"reward_before_mean": 0.48331868555396795,
"reward_before_std": 0.4304672125726938,
"reward_change_max": 0.0,
"reward_change_mean": -0.3446933813393116,
"reward_change_min": -0.49582854844629765,
"reward_change_std": 0.19730697199702263,
"reward_std": 0.4716039877384901,
"rewards/accuracy_reward": 0.35416667722165585,
"rewards/cosine_scaled_reward": 0.1291519934311509,
"step": 18
},
{
"clip_fraction": 0.0,
"completion_length": 2432.6667251586914,
"epoch": 0.021714285714285714,
"grad_norm": 0.024000316858291626,
"kl": 0.00012255460023880005,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9e-07,
"loss": 0.0587,
"reward": 0.22441758587956429,
"reward_after_mean": 0.22441758587956429,
"reward_after_std": 0.7845676727592945,
"reward_before_mean": 0.556532722664997,
"reward_before_std": 0.759381739422679,
"reward_change_max": 0.0,
"reward_change_mean": -0.3321151062846184,
"reward_change_min": -0.6048189178109169,
"reward_change_std": 0.2319869976490736,
"reward_std": 0.7845676802098751,
"rewards/accuracy_reward": 0.4166666753590107,
"rewards/cosine_scaled_reward": 0.13986602576915175,
"step": 19
},
{
"clip_fraction": 0.0,
"completion_length": 1485.7917366027832,
"epoch": 0.022857142857142857,
"grad_norm": 0.033277831971645355,
"kl": 6.410479545593262e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.499999999999999e-07,
"loss": 0.0805,
"reward": 0.27623686753213406,
"reward_after_mean": 0.27623686753213406,
"reward_after_std": 0.5876359883695841,
"reward_before_mean": 0.649633388966322,
"reward_before_std": 0.5021514918189496,
"reward_change_max": 0.0,
"reward_change_mean": -0.37339654564857483,
"reward_change_min": -0.5447837132960558,
"reward_change_std": 0.21536927483975887,
"reward_std": 0.5876360051333904,
"rewards/accuracy_reward": 0.4583333395421505,
"rewards/cosine_scaled_reward": 0.19130004616454244,
"step": 20
},
{
"clip_fraction": 0.0,
"completion_length": 2453.8958587646484,
"epoch": 0.024,
"grad_norm": 0.03172062337398529,
"kl": 0.00014081597328186035,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1e-06,
"loss": 0.0158,
"reward": 0.054135403130203485,
"reward_after_mean": 0.054135403130203485,
"reward_after_std": 0.673911839723587,
"reward_before_mean": 0.3450733758509159,
"reward_before_std": 0.6906127110123634,
"reward_change_max": 0.0,
"reward_change_mean": -0.2909379918128252,
"reward_change_min": -0.5521223619580269,
"reward_change_std": 0.21271574683487415,
"reward_std": 0.6739118546247482,
"rewards/accuracy_reward": 0.29166667349636555,
"rewards/cosine_scaled_reward": 0.05340671516023576,
"step": 21
},
{
"clip_fraction": 0.0,
"completion_length": 1199.6458587646484,
"epoch": 0.025142857142857144,
"grad_norm": 0.03503501042723656,
"kl": 0.00010113418102264404,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.99931462820376e-07,
"loss": 0.0149,
"reward": -0.011358192190527916,
"reward_after_mean": -0.011358192190527916,
"reward_after_std": 0.42460223753005266,
"reward_before_mean": 0.2760426625609398,
"reward_before_std": 0.30881113989744335,
"reward_change_max": 0.0,
"reward_change_mean": -0.28740084543824196,
"reward_change_min": -0.3831062186509371,
"reward_change_std": 0.14771532081067562,
"reward_std": 0.42460225615650415,
"rewards/accuracy_reward": 0.2916666679084301,
"rewards/cosine_scaled_reward": -0.015624009916791692,
"step": 22
},
{
"clip_fraction": 0.0,
"completion_length": 2435.3542251586914,
"epoch": 0.026285714285714287,
"grad_norm": 0.024877028539776802,
"kl": 0.00011622905731201172,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.997258721585931e-07,
"loss": 0.0085,
"reward": 0.01893002726137638,
"reward_after_mean": 0.01893002726137638,
"reward_after_std": 0.7557151317596436,
"reward_before_mean": 0.28602612018585205,
"reward_before_std": 0.779970521107316,
"reward_change_max": 0.0,
"reward_change_mean": -0.2670961171388626,
"reward_change_min": -0.5823409277945757,
"reward_change_std": 0.21948671340942383,
"reward_std": 0.7557151485234499,
"rewards/accuracy_reward": 0.29166667349636555,
"rewards/cosine_scaled_reward": -0.005640537710860372,
"step": 23
},
{
"clip_fraction": 0.0,
"completion_length": 1879.1666870117188,
"epoch": 0.027428571428571427,
"grad_norm": 0.025321708992123604,
"kl": 7.653236389160156e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.993832906395582e-07,
"loss": 0.0233,
"reward": 0.27044946048408747,
"reward_after_mean": 0.27044946048408747,
"reward_after_std": 0.6552858538925648,
"reward_before_mean": 0.6367096854373813,
"reward_before_std": 0.612536040134728,
"reward_change_max": 0.0,
"reward_change_mean": -0.3662601951509714,
"reward_change_min": -0.5752917006611824,
"reward_change_std": 0.22535593062639236,
"reward_std": 0.6552858743816614,
"rewards/accuracy_reward": 0.4166666753590107,
"rewards/cosine_scaled_reward": 0.22004299331456423,
"step": 24
},
{
"clip_fraction": 0.0,
"completion_length": 2328.708366394043,
"epoch": 0.02857142857142857,
"grad_norm": 0.022590631619095802,
"kl": 0.00014277477748692036,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.989038226169207e-07,
"loss": 0.0379,
"reward": 0.15500614792108536,
"reward_after_mean": 0.15500614792108536,
"reward_after_std": 0.46420795284211636,
"reward_before_mean": 0.5061908392235637,
"reward_before_std": 0.4215713571757078,
"reward_change_max": 0.0,
"reward_change_mean": -0.351184718310833,
"reward_change_min": -0.5469736345112324,
"reward_change_std": 0.21114493068307638,
"reward_std": 0.46420796401798725,
"rewards/accuracy_reward": 0.3541666753590107,
"rewards/cosine_scaled_reward": 0.15202417224645615,
"step": 25
},
{
"clip_fraction": 0.0,
"completion_length": 2400.645851135254,
"epoch": 0.029714285714285714,
"grad_norm": 0.023803971707820892,
"kl": 0.00012956559658050537,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.982876141412855e-07,
"loss": -0.0398,
"reward": -0.30033721402287483,
"reward_after_mean": -0.30033721402287483,
"reward_after_std": 0.40887343883514404,
"reward_before_mean": -0.10959341688430868,
"reward_before_std": 0.38057865016162395,
"reward_change_max": 0.0,
"reward_change_mean": -0.19074379839003086,
"reward_change_min": -0.3431377075612545,
"reward_change_std": 0.12020129058510065,
"reward_std": 0.40887344256043434,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.17209341190755367,
"step": 26
},
{
"clip_fraction": 0.0,
"completion_length": 2402.500068664551,
"epoch": 0.030857142857142857,
"grad_norm": 0.027803365141153336,
"kl": 0.00014105439186096191,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.975348529157229e-07,
"loss": 0.0521,
"reward": 0.012697070837020874,
"reward_after_mean": 0.012697070837020874,
"reward_after_std": 0.4288240037858486,
"reward_before_mean": 0.3213062509894371,
"reward_before_std": 0.4260330041870475,
"reward_change_max": 0.0,
"reward_change_mean": -0.30860918015241623,
"reward_change_min": -0.4718206785619259,
"reward_change_std": 0.19340202026069164,
"reward_std": 0.42882401682436466,
"rewards/accuracy_reward": 0.2708333432674408,
"rewards/cosine_scaled_reward": 0.05047290958464146,
"step": 27
},
{
"clip_fraction": 0.0,
"completion_length": 2371.5000343322754,
"epoch": 0.032,
"grad_norm": 0.03396248817443848,
"kl": 0.00012034177780151367,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.96645768238595e-07,
"loss": -0.0088,
"reward": 0.34543062816374004,
"reward_after_mean": 0.34543062816374004,
"reward_after_std": 0.8179215285927057,
"reward_before_mean": 0.7244215086102486,
"reward_before_std": 0.8351087644696236,
"reward_change_max": 0.0,
"reward_change_mean": -0.37899088114500046,
"reward_change_min": -0.6348299775272608,
"reward_change_std": 0.2599595533683896,
"reward_std": 0.8179215602576733,
"rewards/accuracy_reward": 0.5000000186264515,
"rewards/cosine_scaled_reward": 0.22442149464040995,
"step": 28
},
{
"clip_fraction": 0.0,
"completion_length": 2970.8333740234375,
"epoch": 0.03314285714285714,
"grad_norm": 0.020854219794273376,
"kl": 0.00018525123596191406,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.956206309337066e-07,
"loss": -0.0473,
"reward": -0.1481495127081871,
"reward_after_mean": -0.1481495127081871,
"reward_after_std": 0.31386037822812796,
"reward_before_mean": 0.1116566862910986,
"reward_before_std": 0.26476416178047657,
"reward_change_max": 0.0,
"reward_change_mean": -0.2598061878234148,
"reward_change_min": -0.39690806716680527,
"reward_change_std": 0.15259934635832906,
"reward_std": 0.31386038288474083,
"rewards/accuracy_reward": 0.1875,
"rewards/cosine_scaled_reward": -0.07584333047270775,
"step": 29
},
{
"clip_fraction": 0.0,
"completion_length": 2383.6875762939453,
"epoch": 0.03428571428571429,
"grad_norm": 0.02130362018942833,
"kl": 0.00012382864952087402,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.944597532678119e-07,
"loss": -0.0524,
"reward": 0.41686034575104713,
"reward_after_mean": 0.41686034575104713,
"reward_after_std": 0.7474758476018906,
"reward_before_mean": 0.8319165632128716,
"reward_before_std": 0.75706597417593,
"reward_change_max": 0.0,
"reward_change_mean": -0.4150562435388565,
"reward_change_min": -0.7054598368704319,
"reward_change_std": 0.2837554384022951,
"reward_std": 0.7474758699536324,
"rewards/accuracy_reward": 0.5416666809469461,
"rewards/cosine_scaled_reward": 0.29024988505989313,
"step": 30
},
{
"clip_fraction": 0.0,
"completion_length": 2900.750045776367,
"epoch": 0.03542857142857143,
"grad_norm": 0.02519163116812706,
"kl": 0.00015944242477416992,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.931634888554935e-07,
"loss": -0.0202,
"reward": -0.03954649716615677,
"reward_after_mean": -0.03954649716615677,
"reward_after_std": 0.4638585727661848,
"reward_before_mean": 0.24700810015201569,
"reward_before_std": 0.4689239803701639,
"reward_change_max": 0.0,
"reward_change_mean": -0.2865545880049467,
"reward_change_min": -0.48880807869136333,
"reward_change_std": 0.19339969009160995,
"reward_std": 0.4638585839420557,
"rewards/accuracy_reward": 0.2291666716337204,
"rewards/cosine_scaled_reward": 0.017841406166553497,
"step": 31
},
{
"clip_fraction": 0.0,
"completion_length": 2334.791717529297,
"epoch": 0.036571428571428574,
"grad_norm": 0.023573867976665497,
"kl": 0.00011053681373596191,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.917322325514487e-07,
"loss": -0.0054,
"reward": 0.13868720829486847,
"reward_after_mean": 0.13868720829486847,
"reward_after_std": 0.612119173631072,
"reward_before_mean": 0.4695329191163182,
"reward_before_std": 0.6050059096887708,
"reward_change_max": 0.0,
"reward_change_mean": -0.3308457229286432,
"reward_change_min": -0.5487178079783916,
"reward_change_std": 0.22997961565852165,
"reward_std": 0.612119173631072,
"rewards/accuracy_reward": 0.37500000558793545,
"rewards/cosine_scaled_reward": 0.09453292191028595,
"step": 32
},
{
"clip_fraction": 0.0,
"completion_length": 2833.729217529297,
"epoch": 0.037714285714285714,
"grad_norm": 0.022108081728219986,
"kl": 0.00012211501598358154,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.901664203302124e-07,
"loss": 0.0161,
"reward": 0.07408714387565851,
"reward_after_mean": 0.07408714387565851,
"reward_after_std": 0.6689751651138067,
"reward_before_mean": 0.37650261726230383,
"reward_before_std": 0.7002740390598774,
"reward_change_max": 0.0,
"reward_change_mean": -0.30241546407341957,
"reward_change_min": -0.5437397100031376,
"reward_change_std": 0.22630772832781076,
"reward_std": 0.6689751725643873,
"rewards/accuracy_reward": 0.31250000558793545,
"rewards/cosine_scaled_reward": 0.0640026107430458,
"step": 33
},
{
"clip_fraction": 0.0,
"completion_length": 1962.7500381469727,
"epoch": 0.038857142857142854,
"grad_norm": 0.02836132049560547,
"kl": 0.0001449286937713623,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.88466529153356e-07,
"loss": 0.0147,
"reward": 0.4290049262344837,
"reward_after_mean": 0.4290049262344837,
"reward_after_std": 0.5306954458355904,
"reward_before_mean": 0.8744035568088293,
"reward_before_std": 0.46768255438655615,
"reward_change_max": 0.0,
"reward_change_mean": -0.44539863243699074,
"reward_change_min": -0.6339141335338354,
"reward_change_std": 0.2669675601646304,
"reward_std": 0.5306954644620419,
"rewards/accuracy_reward": 0.562500013038516,
"rewards/cosine_scaled_reward": 0.31190355867147446,
"step": 34
},
{
"clip_fraction": 0.0,
"completion_length": 2638.770896911621,
"epoch": 0.04,
"grad_norm": 0.03418930992484093,
"kl": 0.00013949722051620483,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.866330768241983e-07,
"loss": 0.0269,
"reward": 0.20577108953148127,
"reward_after_mean": 0.20577108953148127,
"reward_after_std": 0.6861968599259853,
"reward_before_mean": 0.5486643612384796,
"reward_before_std": 0.6807287614792585,
"reward_change_max": 0.0,
"reward_change_mean": -0.3428932707756758,
"reward_change_min": -0.5318975541740656,
"reward_change_std": 0.22158884536474943,
"reward_std": 0.6861968897283077,
"rewards/accuracy_reward": 0.4166666828095913,
"rewards/cosine_scaled_reward": 0.13199766166508198,
"step": 35
},
{
"clip_fraction": 0.0,
"completion_length": 3177.8958892822266,
"epoch": 0.04114285714285714,
"grad_norm": 0.023622367531061172,
"kl": 0.0001952648162841797,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.846666218300807e-07,
"loss": 0.0101,
"reward": -0.35834434535354376,
"reward_after_mean": -0.35834434535354376,
"reward_after_std": 0.40897000953555107,
"reward_before_mean": -0.1906146677210927,
"reward_before_std": 0.37281213887035847,
"reward_change_max": 0.0,
"reward_change_mean": -0.16772967763245106,
"reward_change_min": -0.2771889641880989,
"reward_change_std": 0.09608037583529949,
"reward_std": 0.40897002816200256,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/cosine_scaled_reward": -0.23228134028613567,
"step": 36
},
{
"clip_fraction": 0.0,
"completion_length": 2871.0208892822266,
"epoch": 0.04228571428571429,
"grad_norm": 0.02082606963813305,
"kl": 0.00013563036918640137,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.825677631722435e-07,
"loss": 0.0088,
"reward": -0.22048189118504524,
"reward_after_mean": -0.22048189118504524,
"reward_after_std": 0.41180545277893543,
"reward_before_mean": 0.0031149107962846756,
"reward_before_std": 0.4039106909185648,
"reward_change_max": 0.0,
"reward_change_mean": -0.22359680384397507,
"reward_change_min": -0.3989143669605255,
"reward_change_std": 0.15272453986108303,
"reward_std": 0.41180546395480633,
"rewards/accuracy_reward": 0.125,
"rewards/cosine_scaled_reward": -0.12188508547842503,
"step": 37
},
{
"clip_fraction": 0.0,
"completion_length": 3201.9375228881836,
"epoch": 0.04342857142857143,
"grad_norm": 0.01921209692955017,
"kl": 0.00017651915550231934,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.80337140183366e-07,
"loss": 0.0277,
"reward": -0.09033865109086037,
"reward_after_mean": -0.09033865109086037,
"reward_after_std": 0.3066476993262768,
"reward_before_mean": 0.19127687066793442,
"reward_before_std": 0.2528393566608429,
"reward_change_max": 0.0,
"reward_change_mean": -0.28161551244556904,
"reward_change_min": -0.4111398421227932,
"reward_change_std": 0.15858831629157066,
"reward_std": 0.3066477105021477,
"rewards/accuracy_reward": 0.2083333358168602,
"rewards/cosine_scaled_reward": -0.01705646887421608,
"step": 38
},
{
"clip_fraction": 0.0,
"completion_length": 2304.458366394043,
"epoch": 0.044571428571428574,
"grad_norm": 0.02351287379860878,
"kl": 0.00011439248919487,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.779754323328192e-07,
"loss": -0.0053,
"reward": 0.05605981033295393,
"reward_after_mean": 0.05605981033295393,
"reward_after_std": 0.46067033149302006,
"reward_before_mean": 0.36542993783950806,
"reward_before_std": 0.3694801307283342,
"reward_change_max": 0.0,
"reward_change_mean": -0.30937014520168304,
"reward_change_min": -0.47338311932981014,
"reward_change_std": 0.17591516766697168,
"reward_std": 0.4606703519821167,
"rewards/accuracy_reward": 0.31250000186264515,
"rewards/cosine_scaled_reward": 0.05292995506897569,
"step": 39
},
{
"clip_fraction": 0.0,
"completion_length": 2159.895881652832,
"epoch": 0.045714285714285714,
"grad_norm": 0.026050910353660583,
"kl": 0.0001217871904373169,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.754833590196926e-07,
"loss": 0.0642,
"reward": 0.002425914630293846,
"reward_after_mean": 0.002425914630293846,
"reward_after_std": 0.5221548210829496,
"reward_before_mean": 0.289883803576231,
"reward_before_std": 0.4837529417127371,
"reward_change_max": 0.0,
"reward_change_mean": -0.2874579019844532,
"reward_change_min": -0.47877912409603596,
"reward_change_std": 0.18734649941325188,
"reward_std": 0.5221548229455948,
"rewards/accuracy_reward": 0.27083333395421505,
"rewards/cosine_scaled_reward": 0.019050464034080505,
"step": 40
},
{
"clip_fraction": 0.0,
"completion_length": 2697.1875076293945,
"epoch": 0.046857142857142854,
"grad_norm": 0.02190599963068962,
"kl": 0.00014284253120422363,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.728616793536587e-07,
"loss": 0.0291,
"reward": 0.08761137025430799,
"reward_after_mean": 0.08761137025430799,
"reward_after_std": 0.6340927109122276,
"reward_before_mean": 0.39676812407560647,
"reward_before_std": 0.6426020693033934,
"reward_change_max": 0.0,
"reward_change_mean": -0.30915676802396774,
"reward_change_min": -0.49815394170582294,
"reward_change_std": 0.20790001936256886,
"reward_std": 0.634092727676034,
"rewards/accuracy_reward": 0.3125000111758709,
"rewards/cosine_scaled_reward": 0.08426813036203384,
"step": 41
},
{
"clip_fraction": 0.0,
"completion_length": 2502.229223251343,
"epoch": 0.048,
"grad_norm": 0.04056044667959213,
"kl": 0.00016070902347564697,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.701111919237408e-07,
"loss": 0.0258,
"reward": -0.2749686185270548,
"reward_after_mean": -0.2749686185270548,
"reward_after_std": 0.3445035833865404,
"reward_before_mean": -0.06608736759517342,
"reward_before_std": 0.3068845123052597,
"reward_change_max": 0.0,
"reward_change_mean": -0.20888124220073223,
"reward_change_min": -0.3034993875771761,
"reward_change_std": 0.11960937362164259,
"reward_std": 0.3445035871118307,
"rewards/accuracy_reward": 0.12500000558793545,
"rewards/cosine_scaled_reward": -0.1910873781889677,
"step": 42
},
{
"clip_fraction": 0.0,
"completion_length": 2652.625030517578,
"epoch": 0.04914285714285714,
"grad_norm": 0.020540975034236908,
"kl": 0.00012725219130516052,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.672327345550543e-07,
"loss": 0.0065,
"reward": -0.0786220645532012,
"reward_after_mean": -0.0786220645532012,
"reward_after_std": 0.4567228890955448,
"reward_before_mean": 0.19149728119373322,
"reward_before_std": 0.4425015412271023,
"reward_change_max": 0.0,
"reward_change_mean": -0.27011934854090214,
"reward_change_min": -0.4120590351521969,
"reward_change_std": 0.16643204633146524,
"reward_std": 0.45672290213406086,
"rewards/accuracy_reward": 0.20833334140479565,
"rewards/cosine_scaled_reward": -0.016836050897836685,
"step": 43
},
{
"clip_fraction": 0.0,
"completion_length": 2204.1458587646484,
"epoch": 0.05028571428571429,
"grad_norm": 0.02781352587044239,
"kl": 0.00011363625526428223,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.64227184053598e-07,
"loss": 0.0196,
"reward": 0.23546710796654224,
"reward_after_mean": 0.23546710796654224,
"reward_after_std": 0.4349679071456194,
"reward_before_mean": 0.6162683628499508,
"reward_before_std": 0.3583456464111805,
"reward_change_max": 0.0,
"reward_change_mean": -0.3808012641966343,
"reward_change_min": -0.5730564780533314,
"reward_change_std": 0.21749800257384777,
"reward_std": 0.4349679220467806,
"rewards/accuracy_reward": 0.3958333395421505,
"rewards/cosine_scaled_reward": 0.22043502517044544,
"step": 44
},
{
"clip_fraction": 0.0,
"completion_length": 3081.6459045410156,
"epoch": 0.05142857142857143,
"grad_norm": 0.01855648308992386,
"kl": 0.00013652443885803223,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.610954559391704e-07,
"loss": -0.0101,
"reward": 0.14500193297863007,
"reward_after_mean": 0.14500193297863007,
"reward_after_std": 0.5667851958423853,
"reward_before_mean": 0.47629803395830095,
"reward_before_std": 0.5099421134218574,
"reward_change_max": 0.0,
"reward_change_mean": -0.3312961012125015,
"reward_change_min": -0.503504516556859,
"reward_change_std": 0.19466788694262505,
"reward_std": 0.5667852181941271,
"rewards/accuracy_reward": 0.3333333395421505,
"rewards/cosine_scaled_reward": 0.1429646834731102,
"step": 45
},
{
"clip_fraction": 0.0,
"completion_length": 2907.1666870117188,
"epoch": 0.052571428571428575,
"grad_norm": 0.026002762839198112,
"kl": 0.00016480684280395508,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.578385041664925e-07,
"loss": -0.0122,
"reward": -0.2713017947971821,
"reward_after_mean": -0.2713017947971821,
"reward_after_std": 0.41631367057561874,
"reward_before_mean": -0.06967430002987385,
"reward_before_std": 0.388171230442822,
"reward_change_max": 0.0,
"reward_change_mean": -0.20162750035524368,
"reward_change_min": -0.33353596180677414,
"reward_change_std": 0.12530343793332577,
"reward_std": 0.41631368920207024,
"rewards/accuracy_reward": 0.08333333395421505,
"rewards/cosine_scaled_reward": -0.1530076339840889,
"step": 46
},
{
"clip_fraction": 0.0,
"completion_length": 2134.458354949951,
"epoch": 0.053714285714285714,
"grad_norm": 0.03345310315489769,
"kl": 8.27684998512268e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.54457320834625e-07,
"loss": -0.011,
"reward": 0.23032055981457233,
"reward_after_mean": 0.23032055981457233,
"reward_after_std": 0.5171677935868502,
"reward_before_mean": 0.5986741930246353,
"reward_before_std": 0.4480929058045149,
"reward_change_max": 0.0,
"reward_change_mean": -0.3683536574244499,
"reward_change_min": -0.5596556253731251,
"reward_change_std": 0.21970490273088217,
"reward_std": 0.5171678010374308,
"rewards/accuracy_reward": 0.41666666977107525,
"rewards/cosine_scaled_reward": 0.18200752511620522,
"step": 47
},
{
"clip_fraction": 0.0,
"completion_length": 2394.6875534057617,
"epoch": 0.054857142857142854,
"grad_norm": 0.02606636844575405,
"kl": 0.00011596083641052246,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.509529358847654e-07,
"loss": 0.0201,
"reward": 0.12413597200065851,
"reward_after_mean": 0.12413597200065851,
"reward_after_std": 0.45973930321633816,
"reward_before_mean": 0.46354489028453827,
"reward_before_std": 0.4050141889601946,
"reward_change_max": 0.0,
"reward_change_mean": -0.3394088950008154,
"reward_change_min": -0.491641778498888,
"reward_change_std": 0.1968485563993454,
"reward_std": 0.4597393050789833,
"rewards/accuracy_reward": 0.35416667722165585,
"rewards/cosine_scaled_reward": 0.10937818745151162,
"step": 48
},
{
"clip_fraction": 0.0,
"completion_length": 1708.5833435058594,
"epoch": 0.056,
"grad_norm": 0.03054640255868435,
"kl": 0.00010991096496582031,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.473264167865171e-07,
"loss": 0.0575,
"reward": 0.29493133816868067,
"reward_after_mean": 0.29493133816868067,
"reward_after_std": 0.6500303111970425,
"reward_before_mean": 0.6766238370910287,
"reward_before_std": 0.6318102139048278,
"reward_change_max": 0.0,
"reward_change_mean": -0.3816925100982189,
"reward_change_min": -0.6162350811064243,
"reward_change_std": 0.250560705550015,
"reward_std": 0.6500303186476231,
"rewards/accuracy_reward": 0.45833334140479565,
"rewards/cosine_scaled_reward": 0.2182904863730073,
"step": 49
},
{
"clip_fraction": 0.0,
"completion_length": 2679.7083740234375,
"epoch": 0.05714285714285714,
"grad_norm": 0.02415233850479126,
"kl": 0.00010981038212776184,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.43578868212728e-07,
"loss": -0.0278,
"reward": 0.14336813369300216,
"reward_after_mean": 0.14336813369300216,
"reward_after_std": 0.5428028926253319,
"reward_before_mean": 0.47334863245487213,
"reward_before_std": 0.4351601582020521,
"reward_change_max": 0.0,
"reward_change_mean": -0.32998047955334187,
"reward_change_min": -0.4792014453560114,
"reward_change_std": 0.18526286352425814,
"reward_std": 0.5428028963506222,
"rewards/accuracy_reward": 0.3541666679084301,
"rewards/cosine_scaled_reward": 0.11918195243924856,
"step": 50
},
{
"clip_fraction": 0.0,
"completion_length": 2168.1875343322754,
"epoch": 0.05828571428571429,
"grad_norm": 0.03308018669486046,
"kl": 0.0001481473445892334,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.397114317029974e-07,
"loss": -0.012,
"reward": -0.22561240196228027,
"reward_after_mean": -0.22561240196228027,
"reward_after_std": 0.4365761708468199,
"reward_before_mean": -0.0061477068811655045,
"reward_before_std": 0.4297064580023289,
"reward_change_max": 0.0,
"reward_change_mean": -0.21946469321846962,
"reward_change_min": -0.3908286802470684,
"reward_change_std": 0.15255583450198174,
"reward_std": 0.4365761801600456,
"rewards/accuracy_reward": 0.14583333395421505,
"rewards/cosine_scaled_reward": -0.15198103338479996,
"step": 51
},
{
"clip_fraction": 0.0,
"completion_length": 2549.7083892822266,
"epoch": 0.05942857142857143,
"grad_norm": 0.030142705887556076,
"kl": 0.00010135024785995483,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.357252853159505e-07,
"loss": 0.0317,
"reward": 0.26532851438969374,
"reward_after_mean": 0.26532851438969374,
"reward_after_std": 0.4831724725663662,
"reward_before_mean": 0.6504863314330578,
"reward_before_std": 0.38277638517320156,
"reward_change_max": 0.0,
"reward_change_mean": -0.38515781983733177,
"reward_change_min": -0.601357439532876,
"reward_change_std": 0.22396727558225393,
"reward_std": 0.48317247815430164,
"rewards/accuracy_reward": 0.4791666679084301,
"rewards/cosine_scaled_reward": 0.17131963837891817,
"step": 52
},
{
"clip_fraction": 0.0,
"completion_length": 2409.6667251586914,
"epoch": 0.060571428571428575,
"grad_norm": 0.022006014361977577,
"kl": 0.00011900067329406738,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.316216432703916e-07,
"loss": -0.0174,
"reward": 0.27613873686641455,
"reward_after_mean": 0.27613873686641455,
"reward_after_std": 0.6787273976951838,
"reward_before_mean": 0.6442126063629985,
"reward_before_std": 0.6745601836591959,
"reward_change_max": 0.0,
"reward_change_mean": -0.3680738639086485,
"reward_change_min": -0.5628750901669264,
"reward_change_std": 0.23260985035449266,
"reward_std": 0.6787274144589901,
"rewards/accuracy_reward": 0.4166666828095913,
"rewards/cosine_scaled_reward": 0.22754593688296154,
"step": 53
},
{
"clip_fraction": 0.0,
"completion_length": 1849.8542175292969,
"epoch": 0.061714285714285715,
"grad_norm": 0.03163987770676613,
"kl": 7.21663236618042e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.274017555754407e-07,
"loss": -0.0455,
"reward": 0.5419067908078432,
"reward_after_mean": 0.5419067908078432,
"reward_after_std": 0.621409310027957,
"reward_before_mean": 1.0097733028233051,
"reward_before_std": 0.5289697218686342,
"reward_change_max": 0.0,
"reward_change_mean": -0.4678665027022362,
"reward_change_min": -0.687706695869565,
"reward_change_std": 0.2778399270027876,
"reward_std": 0.6214093323796988,
"rewards/accuracy_reward": 0.6250000055879354,
"rewards/cosine_scaled_reward": 0.38477328792214394,
"step": 54
},
{
"clip_fraction": 0.0,
"completion_length": 2662.562545776367,
"epoch": 0.06285714285714286,
"grad_norm": 0.021438749507069588,
"kl": 0.00012673437595367432,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.230669076497687e-07,
"loss": 0.0578,
"reward": 0.24588292092084885,
"reward_after_mean": 0.24588292092084885,
"reward_after_std": 0.45386996306478977,
"reward_before_mean": 0.6329527571797371,
"reward_before_std": 0.39988668635487556,
"reward_change_max": 0.0,
"reward_change_mean": -0.38706981018185616,
"reward_change_min": -0.575838714838028,
"reward_change_std": 0.2312335381284356,
"reward_std": 0.45386997424066067,
"rewards/accuracy_reward": 0.4166666716337204,
"rewards/cosine_scaled_reward": 0.2162860780954361,
"step": 55
},
{
"clip_fraction": 0.0,
"completion_length": 2517.291748046875,
"epoch": 0.064,
"grad_norm": 0.024150483310222626,
"kl": 0.00012356042861938477,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.186184199300463e-07,
"loss": 0.0008,
"reward": -0.0010210014879703522,
"reward_after_mean": -0.0010210014879703522,
"reward_after_std": 0.5338380401954055,
"reward_before_mean": 0.2886288957670331,
"reward_before_std": 0.5403741393238306,
"reward_change_max": 0.0,
"reward_change_mean": -0.28964989073574543,
"reward_change_min": -0.47638524509966373,
"reward_change_std": 0.19530559238046408,
"reward_std": 0.5338380504399538,
"rewards/accuracy_reward": 0.27083334140479565,
"rewards/cosine_scaled_reward": 0.01779552362859249,
"step": 56
},
{
"clip_fraction": 0.0,
"completion_length": 2843.8958892822266,
"epoch": 0.06514285714285714,
"grad_norm": 0.018026748672127724,
"kl": 0.00010999536607414484,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.140576474687263e-07,
"loss": 0.0482,
"reward": 0.07039481587707996,
"reward_after_mean": 0.07039481587707996,
"reward_after_std": 0.6540415622293949,
"reward_before_mean": 0.3758677262812853,
"reward_before_std": 0.7031431794166565,
"reward_change_max": 0.0,
"reward_change_mean": -0.30547289550304413,
"reward_change_min": -0.6006171070039272,
"reward_change_std": 0.23927316907793283,
"reward_std": 0.6540416032075882,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/cosine_scaled_reward": 0.042534375097602606,
"step": 57
},
{
"clip_fraction": 0.0,
"completion_length": 1548.770866394043,
"epoch": 0.06628571428571428,
"grad_norm": 0.03142126649618149,
"kl": 7.739663124084473e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.093859795212817e-07,
"loss": 0.0822,
"reward": 0.2759701292961836,
"reward_after_mean": 0.2759701292961836,
"reward_after_std": 0.4890221580862999,
"reward_before_mean": 0.6628518868237734,
"reward_before_std": 0.4022231069393456,
"reward_change_max": 0.0,
"reward_change_mean": -0.386881772428751,
"reward_change_min": -0.535983219742775,
"reward_change_std": 0.21398326009511948,
"reward_std": 0.48902217485010624,
"rewards/accuracy_reward": 0.47916667722165585,
"rewards/cosine_scaled_reward": 0.18368521006777883,
"step": 58
},
{
"clip_fraction": 0.0,
"completion_length": 2577.645835876465,
"epoch": 0.06742857142857143,
"grad_norm": 0.024798288941383362,
"kl": 9.592529386281967e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 9.046048391230247e-07,
"loss": -0.0158,
"reward": 0.014745804481208324,
"reward_after_mean": 0.014745804481208324,
"reward_after_std": 0.6127710696309805,
"reward_before_mean": 0.29811959713697433,
"reward_before_std": 0.6092889029532671,
"reward_change_max": 0.0,
"reward_change_mean": -0.2833738047629595,
"reward_change_min": -0.4874247722327709,
"reward_change_std": 0.18988565262407064,
"reward_std": 0.6127710789442062,
"rewards/accuracy_reward": 0.2708333432674408,
"rewards/cosine_scaled_reward": 0.027286252938210964,
"step": 59
},
{
"clip_fraction": 0.0,
"completion_length": 2456.7292251586914,
"epoch": 0.06857142857142857,
"grad_norm": 0.021669333800673485,
"kl": 0.00011354684829711914,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.997156826556369e-07,
"loss": 0.0436,
"reward": 0.044893345795571804,
"reward_after_mean": 0.044893345795571804,
"reward_after_std": 0.5106327962130308,
"reward_before_mean": 0.35379540640860796,
"reward_before_std": 0.48913951963186264,
"reward_change_max": 0.0,
"reward_change_mean": -0.30890206806361675,
"reward_change_min": -0.5454690717160702,
"reward_change_std": 0.20998750906437635,
"reward_std": 0.5106328222900629,
"rewards/accuracy_reward": 0.2916666716337204,
"rewards/cosine_scaled_reward": 0.062128732446581125,
"step": 60
},
{
"clip_fraction": 0.0,
"completion_length": 2429.104217529297,
"epoch": 0.06971428571428571,
"grad_norm": 0.023875517770648003,
"kl": 8.734315633773804e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.9471999940354e-07,
"loss": 0.0734,
"reward": -0.02687007375061512,
"reward_after_mean": -0.02687007375061512,
"reward_after_std": 0.357270210981369,
"reward_before_mean": 0.2700694063678384,
"reward_before_std": 0.3037982089444995,
"reward_change_max": 0.0,
"reward_change_mean": -0.29693946428596973,
"reward_change_min": -0.44768262282013893,
"reward_change_std": 0.16799015924334526,
"reward_std": 0.3572702258825302,
"rewards/accuracy_reward": 0.2708333395421505,
"rewards/cosine_scaled_reward": -0.000763963907957077,
"step": 61
},
{
"clip_fraction": 0.0,
"completion_length": 2164.687515258789,
"epoch": 0.07085714285714285,
"grad_norm": 0.027663985267281532,
"kl": 8.076801896095276e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.896193111002475e-07,
"loss": -0.0317,
"reward": 0.07955900579690933,
"reward_after_mean": 0.07955900579690933,
"reward_after_std": 0.44076032005250454,
"reward_before_mean": 0.40465743746608496,
"reward_before_std": 0.37633848655968904,
"reward_change_max": 0.0,
"reward_change_mean": -0.3250984400510788,
"reward_change_min": -0.47303689643740654,
"reward_change_std": 0.18647983483970165,
"reward_std": 0.4407603293657303,
"rewards/accuracy_reward": 0.33333334140479565,
"rewards/cosine_scaled_reward": 0.07132405880838633,
"step": 62
},
{
"clip_fraction": 0.0,
"completion_length": 1490.9375305175781,
"epoch": 0.072,
"grad_norm": 0.035348497331142426,
"kl": 0.0001027137041091919,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.844151714648274e-07,
"loss": -0.0576,
"reward": 0.32882157526910305,
"reward_after_mean": 0.32882157526910305,
"reward_after_std": 0.49383416399359703,
"reward_before_mean": 0.73639902099967,
"reward_before_std": 0.4061170890927315,
"reward_change_max": 0.0,
"reward_change_mean": -0.40757744386792183,
"reward_change_min": -0.6154311131685972,
"reward_change_std": 0.23690672032535076,
"reward_std": 0.49383416771888733,
"rewards/accuracy_reward": 0.5000000037252903,
"rewards/cosine_scaled_reward": 0.23639898875262588,
"step": 63
},
{
"clip_fraction": 0.0,
"completion_length": 2587.6666946411133,
"epoch": 0.07314285714285715,
"grad_norm": 0.02268359065055847,
"kl": 0.00013853982090950012,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.791091657286267e-07,
"loss": -0.0032,
"reward": 0.10220323409885168,
"reward_after_mean": 0.10220323409885168,
"reward_after_std": 0.5815557111054659,
"reward_before_mean": 0.4171946104615927,
"reward_before_std": 0.5369735099375248,
"reward_change_max": 0.0,
"reward_change_mean": -0.3149913866072893,
"reward_change_min": -0.5246579889208078,
"reward_change_std": 0.2050698734819889,
"reward_std": 0.5815557222813368,
"rewards/accuracy_reward": 0.33333333767950535,
"rewards/cosine_scaled_reward": 0.08386127499397844,
"step": 64
},
{
"clip_fraction": 0.0,
"completion_length": 2444.4791984558105,
"epoch": 0.07428571428571429,
"grad_norm": 0.026023779064416885,
"kl": 0.00010403990745544434,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.737029101523929e-07,
"loss": -0.0247,
"reward": 0.05848180502653122,
"reward_after_mean": 0.05848180502653122,
"reward_after_std": 0.4780960585922003,
"reward_before_mean": 0.37100529856979847,
"reward_before_std": 0.4231007066555321,
"reward_change_max": 0.0,
"reward_change_mean": -0.31252351962029934,
"reward_change_min": -0.4703453090041876,
"reward_change_std": 0.1822062935680151,
"reward_std": 0.4780960753560066,
"rewards/accuracy_reward": 0.33333334140479565,
"rewards/cosine_scaled_reward": 0.03767195844557136,
"step": 65
},
{
"clip_fraction": 0.0,
"completion_length": 2057.604175567627,
"epoch": 0.07542857142857143,
"grad_norm": 0.030141225084662437,
"kl": 9.322166442871094e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.681980515339463e-07,
"loss": -0.0104,
"reward": -0.023890173994004726,
"reward_after_mean": -0.023890173994004726,
"reward_after_std": 0.35533210076391697,
"reward_before_mean": 0.2734305802732706,
"reward_before_std": 0.26786400750279427,
"reward_change_max": 0.0,
"reward_change_mean": -0.2973207589238882,
"reward_change_min": -0.45873321034014225,
"reward_change_std": 0.1713191168382764,
"reward_std": 0.3553321100771427,
"rewards/accuracy_reward": 0.2916666679084301,
"rewards/cosine_scaled_reward": -0.01823609508574009,
"step": 66
},
{
"clip_fraction": 0.0,
"completion_length": 3077.3958892822266,
"epoch": 0.07657142857142857,
"grad_norm": 0.020309919491410255,
"kl": 0.00010600686073303223,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.625962667065487e-07,
"loss": -0.0188,
"reward": -0.2882272908464074,
"reward_after_mean": -0.2882272908464074,
"reward_after_std": 0.5068832859396935,
"reward_before_mean": -0.11004448961466551,
"reward_before_std": 0.4490641765296459,
"reward_change_max": 0.0,
"reward_change_mean": -0.17818280309438705,
"reward_change_min": -0.2667464707046747,
"reward_change_std": 0.09650260768830776,
"reward_std": 0.5068832859396935,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.17254449147731066,
"step": 67
},
{
"clip_fraction": 0.0,
"completion_length": 1452.2291793823242,
"epoch": 0.07771428571428571,
"grad_norm": 0.03242521733045578,
"kl": 8.340179920196533e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.568992620281243e-07,
"loss": 0.0175,
"reward": 0.07238315790891647,
"reward_after_mean": 0.07238315790891647,
"reward_after_std": 0.4353354647755623,
"reward_before_mean": 0.39404843375086784,
"reward_before_std": 0.3516052491031587,
"reward_change_max": 0.0,
"reward_change_mean": -0.3216652628034353,
"reward_change_min": -0.5194528475403786,
"reward_change_std": 0.19221886433660984,
"reward_std": 0.43533547781407833,
"rewards/accuracy_reward": 0.3333333358168602,
"rewards/cosine_scaled_reward": 0.06071509560570121,
"step": 68
},
{
"clip_fraction": 0.0,
"completion_length": 1787.2916946411133,
"epoch": 0.07885714285714286,
"grad_norm": 0.031117206439375877,
"kl": 9.419023990631104e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.511087728614862e-07,
"loss": -0.0016,
"reward": -0.026340688578784466,
"reward_after_mean": -0.026340688578784466,
"reward_after_std": 0.5681092850863934,
"reward_before_mean": 0.24961409904062748,
"reward_before_std": 0.5811920054256916,
"reward_change_max": 0.0,
"reward_change_mean": -0.2759547829627991,
"reward_change_min": -0.48821524903178215,
"reward_change_std": 0.1979276780039072,
"reward_std": 0.5681093074381351,
"rewards/accuracy_reward": 0.22916666977107525,
"rewards/cosine_scaled_reward": 0.020447423681616783,
"step": 69
},
{
"clip_fraction": 0.0,
"completion_length": 2451.0625534057617,
"epoch": 0.08,
"grad_norm": 0.022252434864640236,
"kl": 9.554624557495117e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.452265630457282e-07,
"loss": -0.0644,
"reward": 0.03181967884302139,
"reward_after_mean": 0.03181967884302139,
"reward_after_std": 0.41130639240145683,
"reward_before_mean": 0.34573002345860004,
"reward_before_std": 0.36813389230519533,
"reward_change_max": 0.0,
"reward_change_mean": -0.31391034089028835,
"reward_change_min": -0.4767356403172016,
"reward_change_std": 0.19002152141183615,
"reward_std": 0.4113064054399729,
"rewards/accuracy_reward": 0.2916666716337204,
"rewards/cosine_scaled_reward": 0.05406337045133114,
"step": 70
},
{
"clip_fraction": 0.0,
"completion_length": 2502.958366394043,
"epoch": 0.08114285714285714,
"grad_norm": 0.0264949519187212,
"kl": 0.00011813640594482422,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.392544243589427e-07,
"loss": -0.0179,
"reward": 0.06412788107991219,
"reward_after_mean": 0.06412788107991219,
"reward_after_std": 0.41010222770273685,
"reward_before_mean": 0.39106011018157005,
"reward_before_std": 0.3685198612511158,
"reward_change_max": 0.0,
"reward_change_mean": -0.32693223282694817,
"reward_change_min": -0.4851485788822174,
"reward_change_std": 0.19277132395654917,
"reward_std": 0.4101022370159626,
"rewards/accuracy_reward": 0.3125000074505806,
"rewards/cosine_scaled_reward": 0.07856010273098946,
"step": 71
},
{
"clip_fraction": 0.0,
"completion_length": 2082.812545776367,
"epoch": 0.08228571428571428,
"grad_norm": 0.027392003685235977,
"kl": 0.00011332333087921143,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.331941759724268e-07,
"loss": 0.0923,
"reward": -0.1911002192646265,
"reward_after_mean": -0.1911002192646265,
"reward_after_std": 0.48363407514989376,
"reward_before_mean": 0.027870051562786102,
"reward_before_std": 0.4447115077637136,
"reward_change_max": 0.0,
"reward_change_mean": -0.2189702782779932,
"reward_change_min": -0.3989233523607254,
"reward_change_std": 0.14245110657066107,
"reward_std": 0.48363409377634525,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/cosine_scaled_reward": -0.09712994811707176,
"step": 72
},
{
"clip_fraction": 0.0,
"completion_length": 3078.3334045410156,
"epoch": 0.08342857142857144,
"grad_norm": 0.019918017089366913,
"kl": 0.00015282630920410156,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.270476638965461e-07,
"loss": 0.0662,
"reward": 0.020195575430989265,
"reward_after_mean": 0.020195575430989265,
"reward_after_std": 0.5312496908009052,
"reward_before_mean": 0.31786563992500305,
"reward_before_std": 0.5375419212505221,
"reward_change_max": 0.0,
"reward_change_mean": -0.29767004027962685,
"reward_change_min": -0.47399161756038666,
"reward_change_std": 0.19663708750158548,
"reward_std": 0.5312496926635504,
"rewards/accuracy_reward": 0.29166667722165585,
"rewards/cosine_scaled_reward": 0.026198940351605415,
"step": 73
},
{
"clip_fraction": 0.0,
"completion_length": 2040.7917251586914,
"epoch": 0.08457142857142858,
"grad_norm": 0.025541391223669052,
"kl": 9.54754650592804e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.208167604184217e-07,
"loss": -0.0443,
"reward": 0.14915499277412891,
"reward_after_mean": 0.14915499277412891,
"reward_after_std": 0.6291306726634502,
"reward_before_mean": 0.482904102653265,
"reward_before_std": 0.6536181448027492,
"reward_change_max": 0.0,
"reward_change_mean": -0.3337490987032652,
"reward_change_min": -0.6043441817164421,
"reward_change_std": 0.2390197478234768,
"reward_std": 0.6291306801140308,
"rewards/accuracy_reward": 0.3750000149011612,
"rewards/cosine_scaled_reward": 0.10790410172194242,
"step": 74
},
{
"clip_fraction": 0.0,
"completion_length": 2779.729202270508,
"epoch": 0.08571428571428572,
"grad_norm": 0.020267607644200325,
"kl": 0.00012791156768798828,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.145033635316128e-07,
"loss": -0.0545,
"reward": 0.0270681269466877,
"reward_after_mean": 0.0270681269466877,
"reward_after_std": 0.4193691723048687,
"reward_before_mean": 0.3381798770278692,
"reward_before_std": 0.3713110312819481,
"reward_change_max": 0.0,
"reward_change_mean": -0.31111176311969757,
"reward_change_min": -0.47804239578545094,
"reward_change_std": 0.1906196428462863,
"reward_std": 0.41936918161809444,
"rewards/accuracy_reward": 0.2916666716337204,
"rewards/cosine_scaled_reward": 0.04651320539414883,
"step": 75
},
{
"clip_fraction": 0.0,
"completion_length": 2396.770881652832,
"epoch": 0.08685714285714285,
"grad_norm": 0.028096716850996017,
"kl": 0.00011730939149856567,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.081093963579707e-07,
"loss": 0.1023,
"reward": -0.1743035688996315,
"reward_after_mean": -0.1743035688996315,
"reward_after_std": 0.376751147210598,
"reward_before_mean": 0.07117291446775198,
"reward_before_std": 0.36577551485970616,
"reward_change_max": 0.0,
"reward_change_mean": -0.2454764936119318,
"reward_change_min": -0.4111335948109627,
"reward_change_std": 0.15914648212492466,
"reward_std": 0.3767511546611786,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/cosine_scaled_reward": -0.053827076219022274,
"step": 76
},
{
"clip_fraction": 0.0,
"completion_length": 2609.666702270508,
"epoch": 0.088,
"grad_norm": 0.024190831929445267,
"kl": 0.00011560320854187012,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 8.01636806561836e-07,
"loss": -0.0252,
"reward": -0.12485083658248186,
"reward_after_mean": -0.12485083658248186,
"reward_after_std": 0.5494148954749107,
"reward_before_mean": 0.1104511353187263,
"reward_before_std": 0.5148029942065477,
"reward_change_max": 0.0,
"reward_change_mean": -0.23530197329819202,
"reward_change_min": -0.3927488550543785,
"reward_change_std": 0.14942125510424376,
"reward_std": 0.5494149122387171,
"rewards/accuracy_reward": 0.1875000037252903,
"rewards/cosine_scaled_reward": -0.07704886957071722,
"step": 77
},
{
"clip_fraction": 0.0,
"completion_length": 2710.0833587646484,
"epoch": 0.08914285714285715,
"grad_norm": 0.02033821865916252,
"kl": 0.00011965632438659668,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.950875657567621e-07,
"loss": 0.0067,
"reward": 0.16592460870742798,
"reward_after_mean": 0.16592460870742798,
"reward_after_std": 0.628672743216157,
"reward_before_mean": 0.5066994614899158,
"reward_before_std": 0.6296672336757183,
"reward_change_max": 0.0,
"reward_change_mean": -0.3407748378813267,
"reward_change_min": -0.6165256667882204,
"reward_change_std": 0.2455195877701044,
"reward_std": 0.6286727450788021,
"rewards/accuracy_reward": 0.3958333395421505,
"rewards/cosine_scaled_reward": 0.11086611449718475,
"step": 78
},
{
"clip_fraction": 0.0,
"completion_length": 1924.916706085205,
"epoch": 0.09028571428571429,
"grad_norm": 0.028922580182552338,
"kl": 9.059533476829529e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.884636689049422e-07,
"loss": -0.0012,
"reward": 0.06329749338328838,
"reward_after_mean": 0.06329749338328838,
"reward_after_std": 0.5062609296292067,
"reward_before_mean": 0.3792388401925564,
"reward_before_std": 0.49266021978110075,
"reward_change_max": 0.0,
"reward_change_mean": -0.31594133377075195,
"reward_change_min": -0.534465042874217,
"reward_change_std": 0.2129287514835596,
"reward_std": 0.5062609408050776,
"rewards/accuracy_reward": 0.3333333395421505,
"rewards/cosine_scaled_reward": 0.04590547701809555,
"step": 79
},
{
"clip_fraction": 0.0,
"completion_length": 2843.2708587646484,
"epoch": 0.09142857142857143,
"grad_norm": 0.021287092939019203,
"kl": 0.0001614391803741455,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.817671337095244e-07,
"loss": 0.0507,
"reward": -0.00979284942150116,
"reward_after_mean": -0.00979284942150116,
"reward_after_std": 0.4034804105758667,
"reward_before_mean": 0.2902548350393772,
"reward_before_std": 0.35455449763685465,
"reward_change_max": 0.0,
"reward_change_mean": -0.3000476788729429,
"reward_change_min": -0.4641227424144745,
"reward_change_std": 0.18029196839779615,
"reward_std": 0.40348043479025364,
"rewards/accuracy_reward": 0.2916666716337204,
"rewards/cosine_scaled_reward": -0.0014118626713752747,
"step": 80
},
{
"clip_fraction": 0.0,
"completion_length": 2935.375030517578,
"epoch": 0.09257142857142857,
"grad_norm": 0.03196020796895027,
"kl": 0.00017774105072021484,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.75e-07,
"loss": 0.0139,
"reward": -0.19430748652666807,
"reward_after_mean": -0.19430748652666807,
"reward_after_std": 0.5501824514940381,
"reward_before_mean": 0.013918843120336533,
"reward_before_std": 0.5049468795768917,
"reward_change_max": 0.0,
"reward_change_mean": -0.20822633057832718,
"reward_change_min": -0.325034998357296,
"reward_change_std": 0.1252102516591549,
"reward_std": 0.5501824719831347,
"rewards/accuracy_reward": 0.14583333767950535,
"rewards/cosine_scaled_reward": -0.1319145057350397,
"step": 81
},
{
"clip_fraction": 0.0,
"completion_length": 2397.5833625793457,
"epoch": 0.09371428571428571,
"grad_norm": 0.028649814426898956,
"kl": 9.499490261077881e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.681643291108517e-07,
"loss": 0.0607,
"reward": 0.05334258824586868,
"reward_after_mean": 0.05334258824586868,
"reward_after_std": 0.5218823049217463,
"reward_before_mean": 0.35954809142276645,
"reward_before_std": 0.4847041219472885,
"reward_change_max": 0.0,
"reward_change_mean": -0.30620551481842995,
"reward_change_min": -0.4997597597539425,
"reward_change_std": 0.19008434005081654,
"reward_std": 0.5218823160976171,
"rewards/accuracy_reward": 0.27083333767950535,
"rewards/cosine_scaled_reward": 0.08871474675834179,
"step": 82
},
{
"clip_fraction": 0.0,
"completion_length": 2344.2084159851074,
"epoch": 0.09485714285714286,
"grad_norm": 0.03297533839941025,
"kl": 0.0001214146614074707,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.612622032536507e-07,
"loss": 0.0267,
"reward": 0.014584167511202395,
"reward_after_mean": 0.014584167511202395,
"reward_after_std": 0.5300229340791702,
"reward_before_mean": 0.30404046457260847,
"reward_before_std": 0.4831734402105212,
"reward_change_max": 0.0,
"reward_change_mean": -0.289456307888031,
"reward_change_min": -0.4781253971159458,
"reward_change_std": 0.1838990869000554,
"reward_std": 0.5300229452550411,
"rewards/accuracy_reward": 0.25000000186264515,
"rewards/cosine_scaled_reward": 0.054040471790358424,
"step": 83
},
{
"clip_fraction": 0.0,
"completion_length": 2405.4792098999023,
"epoch": 0.096,
"grad_norm": 0.024380596354603767,
"kl": 0.0001236051321029663,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.54295724882796e-07,
"loss": 0.0294,
"reward": 0.020913776010274887,
"reward_after_mean": 0.020913776010274887,
"reward_after_std": 0.5260360110551119,
"reward_before_mean": 0.3158747926354408,
"reward_before_std": 0.48735920153558254,
"reward_change_max": 0.0,
"reward_change_mean": -0.2949610147625208,
"reward_change_min": -0.47205518186092377,
"reward_change_std": 0.18748428858816624,
"reward_std": 0.5260360259562731,
"rewards/accuracy_reward": 0.31250000558793545,
"rewards/cosine_scaled_reward": 0.0033747986890375614,
"step": 84
},
{
"clip_fraction": 0.0,
"completion_length": 2649.854217529297,
"epoch": 0.09714285714285714,
"grad_norm": 0.0174813661724329,
"kl": 9.45068895816803e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.472670160550848e-07,
"loss": 0.0103,
"reward": -0.03742504213005304,
"reward_after_mean": -0.03742504213005304,
"reward_after_std": 0.5757533330470324,
"reward_before_mean": 0.2353352140635252,
"reward_before_std": 0.588757986202836,
"reward_change_max": 0.0,
"reward_change_mean": -0.2727602645754814,
"reward_change_min": -0.5207938365638256,
"reward_change_std": 0.20239645708352327,
"reward_std": 0.5757533498108387,
"rewards/accuracy_reward": 0.2500000074505806,
"rewards/cosine_scaled_reward": -0.014664788264781237,
"step": 85
},
{
"clip_fraction": 0.0,
"completion_length": 2589.5833435058594,
"epoch": 0.09828571428571428,
"grad_norm": 0.030955081805586815,
"kl": 0.0001302659511566162,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.401782177833147e-07,
"loss": 0.0739,
"reward": -0.07755789160728455,
"reward_after_mean": -0.07755789160728455,
"reward_after_std": 0.3382277116179466,
"reward_before_mean": 0.20555407088249922,
"reward_before_std": 0.29440235160291195,
"reward_change_max": 0.0,
"reward_change_mean": -0.2831119429320097,
"reward_change_min": -0.42291648127138615,
"reward_change_std": 0.16565488744527102,
"reward_std": 0.3382277172058821,
"rewards/accuracy_reward": 0.2083333358168602,
"rewards/cosine_scaled_reward": -0.0027792779728770256,
"step": 86
},
{
"clip_fraction": 0.0,
"completion_length": 2388.1667251586914,
"epoch": 0.09942857142857142,
"grad_norm": 0.022453241050243378,
"kl": 0.00016289204359054565,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.330314893841101e-07,
"loss": 0.0271,
"reward": -0.01107841357588768,
"reward_after_mean": -0.01107841357588768,
"reward_after_std": 0.3539695702493191,
"reward_before_mean": 0.28700726421084255,
"reward_before_std": 0.25306378304958344,
"reward_change_max": 0.0,
"reward_change_mean": -0.2980856914073229,
"reward_change_min": -0.4043935965746641,
"reward_change_std": 0.15176600962877274,
"reward_std": 0.35396958142518997,
"rewards/accuracy_reward": 0.27083333395421505,
"rewards/cosine_scaled_reward": 0.016173945739865303,
"step": 87
},
{
"clip_fraction": 0.0,
"completion_length": 1487.0833702087402,
"epoch": 0.10057142857142858,
"grad_norm": 0.03448113799095154,
"kl": 9.407103061676025e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.258290078201731e-07,
"loss": 0.1153,
"reward": 0.21288079069927335,
"reward_after_mean": 0.21288079069927335,
"reward_after_std": 0.6643541660159826,
"reward_before_mean": 0.5578272799029946,
"reward_before_std": 0.6279368726536632,
"reward_change_max": 0.0,
"reward_change_mean": -0.34494646824896336,
"reward_change_min": -0.5568280145525932,
"reward_change_std": 0.22005709912627935,
"reward_std": 0.6643541809171438,
"rewards/accuracy_reward": 0.3958333432674408,
"rewards/cosine_scaled_reward": 0.16199392126873136,
"step": 88
},
{
"clip_fraction": 0.0,
"completion_length": 2516.625068664551,
"epoch": 0.10171428571428572,
"grad_norm": 0.023190459236502647,
"kl": 0.00011932849884033203,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.185729670371604e-07,
"loss": -0.057,
"reward": -0.04916583467274904,
"reward_after_mean": -0.04916583467274904,
"reward_after_std": 0.5242477711290121,
"reward_before_mean": 0.22224761126562953,
"reward_before_std": 0.518500761128962,
"reward_change_max": 0.0,
"reward_change_mean": -0.27141344733536243,
"reward_change_min": -0.47212454676628113,
"reward_change_std": 0.18147558439522982,
"reward_std": 0.5242477972060442,
"rewards/accuracy_reward": 0.22916667349636555,
"rewards/cosine_scaled_reward": -0.006919063627719879,
"step": 89
},
{
"clip_fraction": 0.0,
"completion_length": 2447.1042251586914,
"epoch": 0.10285714285714286,
"grad_norm": 0.04171357303857803,
"kl": 0.0001575946807861328,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.11265577295385e-07,
"loss": 0.0484,
"reward": -0.30102725327014923,
"reward_after_mean": -0.30102725327014923,
"reward_after_std": 0.40490369498729706,
"reward_before_mean": -0.10983736906200647,
"reward_before_std": 0.37877833284437656,
"reward_change_max": 0.0,
"reward_change_mean": -0.19118987582623959,
"reward_change_min": -0.33792233280837536,
"reward_change_std": 0.12286930158734322,
"reward_std": 0.4049036977812648,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.17233736719936132,
"step": 90
},
{
"clip_fraction": 0.0,
"completion_length": 2565.500030517578,
"epoch": 0.104,
"grad_norm": 0.026068881154060364,
"kl": 0.00012224912643432617,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 7.039090644965509e-07,
"loss": 0.0069,
"reward": 0.014372130390256643,
"reward_after_mean": 0.014372130390256643,
"reward_after_std": 0.5293956436216831,
"reward_before_mean": 0.3061292991042137,
"reward_before_std": 0.5099629778414965,
"reward_change_max": 0.0,
"reward_change_mean": -0.29175717756152153,
"reward_change_min": -0.47916352190077305,
"reward_change_std": 0.1855539120733738,
"reward_std": 0.5293956585228443,
"rewards/accuracy_reward": 0.29166666977107525,
"rewards/cosine_scaled_reward": 0.014462634921073914,
"step": 91
},
{
"clip_fraction": 0.0,
"completion_length": 2142.541702270508,
"epoch": 0.10514285714285715,
"grad_norm": 0.026767289265990257,
"kl": 8.273124694824219e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6.965056695057204e-07,
"loss": 0.0693,
"reward": -0.031038912944495678,
"reward_after_mean": -0.031038912944495678,
"reward_after_std": 0.5491299722343683,
"reward_before_mean": 0.24104281282052398,
"reward_before_std": 0.5185518572106957,
"reward_change_max": 0.0,
"reward_change_mean": -0.27208174392580986,
"reward_change_min": -0.41842199116945267,
"reward_change_std": 0.16613258328288794,
"reward_std": 0.5491300020366907,
"rewards/accuracy_reward": 0.25000000931322575,
"rewards/cosine_scaled_reward": -0.008957181125879288,
"step": 92
},
{
"clip_fraction": 0.0,
"completion_length": 3579.1041870117188,
"epoch": 0.10628571428571429,
"grad_norm": 0.020955311134457588,
"kl": 0.0002243518829345703,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6.890576474687263e-07,
"loss": 0.0019,
"reward": -0.39391759666614234,
"reward_after_mean": -0.39391759666614234,
"reward_after_std": 0.2926015192642808,
"reward_before_mean": -0.2218842003494501,
"reward_before_std": 0.24477995350025594,
"reward_change_max": 0.0,
"reward_change_mean": -0.1720333844423294,
"reward_change_min": -0.2595607787370682,
"reward_change_std": 0.09285970125347376,
"reward_std": 0.29260152392089367,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/cosine_scaled_reward": -0.2427175386401359,
"step": 93
},
{
"clip_fraction": 0.0,
"completion_length": 2538.5625610351562,
"epoch": 0.10742857142857143,
"grad_norm": 0.02233041636645794,
"kl": 0.00014513731002807617,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6.815672671252315e-07,
"loss": -0.0132,
"reward": 0.07038530055433512,
"reward_after_mean": 0.07038530055433512,
"reward_after_std": 0.4959055408835411,
"reward_before_mean": 0.38510218542069197,
"reward_before_std": 0.4519264791160822,
"reward_change_max": 0.0,
"reward_change_mean": -0.31471688486635685,
"reward_change_min": -0.47875550389289856,
"reward_change_std": 0.18934866040945053,
"reward_std": 0.49590555392205715,
"rewards/accuracy_reward": 0.31250000558793545,
"rewards/cosine_scaled_reward": 0.0726021807640791,
"step": 94
},
{
"clip_fraction": 0.0,
"completion_length": 3149.312545776367,
"epoch": 0.10857142857142857,
"grad_norm": 0.016374317929148674,
"kl": 0.00013837218284606934,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6.740368101176495e-07,
"loss": 0.0191,
"reward": -0.2019346058368683,
"reward_after_mean": -0.2019346058368683,
"reward_after_std": 0.5740157756954432,
"reward_before_mean": 0.00486493157222867,
"reward_before_std": 0.5541965216398239,
"reward_change_max": 0.0,
"reward_change_mean": -0.20679953508079052,
"reward_change_min": -0.3921053633093834,
"reward_change_std": 0.1454045455902815,
"reward_std": 0.5740158017724752,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.14096840284764767,
"step": 95
},
{
"clip_fraction": 0.0,
"completion_length": 2384.375015258789,
"epoch": 0.10971428571428571,
"grad_norm": 0.02656826190650463,
"kl": 9.119324386119843e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6.664685702961344e-07,
"loss": 0.0676,
"reward": 0.1127127856016159,
"reward_after_mean": 0.1127127856016159,
"reward_after_std": 0.4883039314299822,
"reward_before_mean": 0.44525690749287605,
"reward_before_std": 0.42334912437945604,
"reward_change_max": 0.0,
"reward_change_mean": -0.3325441386550665,
"reward_change_min": -0.5463209841400385,
"reward_change_std": 0.203442326746881,
"reward_std": 0.4883039500564337,
"rewards/accuracy_reward": 0.3750000037252903,
"rewards/cosine_scaled_reward": 0.07025692239403725,
"step": 96
},
{
"clip_fraction": 0.0,
"completion_length": 2828.0416946411133,
"epoch": 0.11085714285714286,
"grad_norm": 0.02195425145328045,
"kl": 0.00012743473052978516,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6.588648530198504e-07,
"loss": -0.0071,
"reward": -0.03396608494222164,
"reward_after_mean": -0.03396608494222164,
"reward_after_std": 0.42939132265746593,
"reward_before_mean": 0.2542315907776356,
"reward_before_std": 0.3915413152426481,
"reward_change_max": 0.0,
"reward_change_mean": -0.2881976831704378,
"reward_change_min": -0.4680513422936201,
"reward_change_std": 0.17823405750095844,
"reward_std": 0.42939133010804653,
"rewards/accuracy_reward": 0.2500000037252903,
"rewards/cosine_scaled_reward": 0.004231559112668037,
"step": 97
},
{
"clip_fraction": 0.0,
"completion_length": 2182.6458740234375,
"epoch": 0.112,
"grad_norm": 0.02803581953048706,
"kl": 9.182840585708618e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6.512279744547392e-07,
"loss": 0.036,
"reward": -0.11260436498560011,
"reward_after_mean": -0.11260436498560011,
"reward_after_std": 0.5100179938599467,
"reward_before_mean": 0.13090651109814644,
"reward_before_std": 0.45763199776411057,
"reward_change_max": 0.0,
"reward_change_mean": -0.24351087771356106,
"reward_change_min": -0.33865879476070404,
"reward_change_std": 0.1343588917516172,
"reward_std": 0.5100180115550756,
"rewards/accuracy_reward": 0.16666667349636555,
"rewards/cosine_scaled_reward": -0.03576015151338652,
"step": 98
},
{
"clip_fraction": 0.0,
"completion_length": 2859.729217529297,
"epoch": 0.11314285714285714,
"grad_norm": 0.02229207567870617,
"kl": 0.00012323260307312012,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6.435602608679916e-07,
"loss": 0.0162,
"reward": -0.042411248199641705,
"reward_after_mean": -0.042411248199641705,
"reward_after_std": 0.5648014172911644,
"reward_before_mean": 0.22283665975555778,
"reward_before_std": 0.5460882969200611,
"reward_change_max": 0.0,
"reward_change_mean": -0.2652479037642479,
"reward_change_min": -0.4092443734407425,
"reward_change_std": 0.1645316081121564,
"reward_std": 0.5648014266043901,
"rewards/accuracy_reward": 0.2291666753590107,
"rewards/cosine_scaled_reward": -0.0063300225883722305,
"step": 99
},
{
"clip_fraction": 0.0,
"completion_length": 2322.125030517578,
"epoch": 0.11428571428571428,
"grad_norm": 0.023206228390336037,
"kl": 0.00010912120342254639,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6.358640479194451e-07,
"loss": -0.045,
"reward": 0.12453807704150677,
"reward_after_mean": 0.12453807704150677,
"reward_after_std": 0.622159369289875,
"reward_before_mean": 0.4508266132324934,
"reward_before_std": 0.641321973875165,
"reward_change_max": 0.0,
"reward_change_mean": -0.32628853246569633,
"reward_change_min": -0.5445595029741526,
"reward_change_std": 0.22335629165172577,
"reward_std": 0.6221593860536814,
"rewards/accuracy_reward": 0.35416667722165585,
"rewards/cosine_scaled_reward": 0.0966599378734827,
"step": 100
},
{
"clip_fraction": 0.0,
"completion_length": 2310.062545776367,
"epoch": 0.11542857142857142,
"grad_norm": 0.02827632799744606,
"kl": 0.00014510750770568848,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6.281416799501187e-07,
"loss": 0.0523,
"reward": 0.18787965178489685,
"reward_after_mean": 0.18787965178489685,
"reward_after_std": 0.4799032285809517,
"reward_before_mean": 0.5493117831647396,
"reward_before_std": 0.425581069663167,
"reward_change_max": 0.0,
"reward_change_mean": -0.3614321555942297,
"reward_change_min": -0.5766200236976147,
"reward_change_std": 0.22458772454410791,
"reward_std": 0.479903232306242,
"rewards/accuracy_reward": 0.375,
"rewards/cosine_scaled_reward": 0.17431178130209446,
"step": 101
},
{
"clip_fraction": 0.0,
"completion_length": 1985.1666946411133,
"epoch": 0.11657142857142858,
"grad_norm": 0.032261401414871216,
"kl": 0.00013130903244018555,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6.203955092681039e-07,
"loss": -0.001,
"reward": -0.05215142108500004,
"reward_after_mean": -0.05215142108500004,
"reward_after_std": 0.5923234205693007,
"reward_before_mean": 0.20914378141242196,
"reward_before_std": 0.5869351290166378,
"reward_change_max": 0.0,
"reward_change_mean": -0.2612952049821615,
"reward_change_min": -0.48573706299066544,
"reward_change_std": 0.18874530028551817,
"reward_std": 0.5923234317451715,
"rewards/accuracy_reward": 0.20833333767950535,
"rewards/cosine_scaled_reward": 0.0008104409789666533,
"step": 102
},
{
"clip_fraction": 0.0,
"completion_length": 2266.312545776367,
"epoch": 0.11771428571428572,
"grad_norm": 0.03391628339886665,
"kl": 0.00010603666305541992,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6.126278954320294e-07,
"loss": -0.0353,
"reward": -0.2809916576370597,
"reward_after_mean": -0.2809916576370597,
"reward_after_std": 0.38146258890628815,
"reward_before_mean": -0.07696734461933374,
"reward_before_std": 0.36126195592805743,
"reward_change_max": 0.0,
"reward_change_mean": -0.20402430556714535,
"reward_change_min": -0.33082425221800804,
"reward_change_std": 0.1302658850327134,
"reward_std": 0.3814626010134816,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/cosine_scaled_reward": -0.18113402277231216,
"step": 103
},
{
"clip_fraction": 0.0,
"completion_length": 2302.4375534057617,
"epoch": 0.11885714285714286,
"grad_norm": 0.03015494905412197,
"kl": 0.00013568997383117676,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 6.048412045323164e-07,
"loss": 0.0696,
"reward": -0.09460114315152168,
"reward_after_mean": -0.09460114315152168,
"reward_after_std": 0.4551072083413601,
"reward_before_mean": 0.1622155588120222,
"reward_before_std": 0.395945000462234,
"reward_change_max": 0.0,
"reward_change_mean": -0.2568166982382536,
"reward_change_min": -0.4168005548417568,
"reward_change_std": 0.15201660431921482,
"reward_std": 0.45510722137987614,
"rewards/accuracy_reward": 0.2083333358168602,
"rewards/cosine_scaled_reward": -0.046117788180708885,
"step": 104
},
{
"clip_fraction": 0.0,
"completion_length": 2372.9583740234375,
"epoch": 0.12,
"grad_norm": 0.026441100984811783,
"kl": 0.00013068318367004395,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5.97037808470444e-07,
"loss": -0.0192,
"reward": 0.299602385610342,
"reward_after_mean": 0.299602385610342,
"reward_after_std": 0.6463410649448633,
"reward_before_mean": 0.6807096730917692,
"reward_before_std": 0.6122201485559344,
"reward_change_max": 0.0,
"reward_change_mean": -0.3811072837561369,
"reward_change_min": -0.5962531901896,
"reward_change_std": 0.240672436542809,
"reward_std": 0.6463411003351212,
"rewards/accuracy_reward": 0.45833334140479565,
"rewards/cosine_scaled_reward": 0.22237632365431637,
"step": 105
},
{
"clip_fraction": 0.0,
"completion_length": 1800.8333892822266,
"epoch": 0.12114285714285715,
"grad_norm": 0.02916141226887703,
"kl": 5.840137600898743e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5.892200842364462e-07,
"loss": 0.0226,
"reward": 0.3201424069702625,
"reward_after_mean": 0.3201424069702625,
"reward_after_std": 0.5018086824566126,
"reward_before_mean": 0.7261572647839785,
"reward_before_std": 0.4295506803318858,
"reward_change_max": 0.0,
"reward_change_mean": -0.40601486526429653,
"reward_change_min": -0.6012663654983044,
"reward_change_std": 0.24013797752559185,
"reward_std": 0.501808712258935,
"rewards/accuracy_reward": 0.541666679084301,
"rewards/cosine_scaled_reward": 0.18449058942496777,
"step": 106
},
{
"clip_fraction": 0.0,
"completion_length": 2687.291702270508,
"epoch": 0.12228571428571429,
"grad_norm": 0.02377397008240223,
"kl": 0.00017151236534118652,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5.813904131848564e-07,
"loss": -0.0375,
"reward": -0.040606689639389515,
"reward_after_mean": -0.040606689639389515,
"reward_after_std": 0.43058050237596035,
"reward_before_mean": 0.2451710607856512,
"reward_before_std": 0.38810104969888926,
"reward_change_max": 0.0,
"reward_change_mean": -0.2857777550816536,
"reward_change_min": -0.4491597171872854,
"reward_change_std": 0.17806370370090008,
"reward_std": 0.4305805191397667,
"rewards/accuracy_reward": 0.2708333358168602,
"rewards/cosine_scaled_reward": -0.025662289932370186,
"step": 107
},
{
"clip_fraction": 0.0,
"completion_length": 2884.1458740234375,
"epoch": 0.12342857142857143,
"grad_norm": 0.02014852873980999,
"kl": 0.00014007091522216797,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5.735511803093248e-07,
"loss": 0.0236,
"reward": 0.015964743681252003,
"reward_after_mean": 0.015964743681252003,
"reward_after_std": 0.6370243337005377,
"reward_before_mean": 0.2958701690658927,
"reward_before_std": 0.6383323790505528,
"reward_change_max": 0.0,
"reward_change_mean": -0.27990544214844704,
"reward_change_min": -0.4842909462749958,
"reward_change_std": 0.19452710915356874,
"reward_std": 0.6370243430137634,
"rewards/accuracy_reward": 0.2708333395421505,
"rewards/cosine_scaled_reward": 0.025036831386387348,
"step": 108
},
{
"clip_fraction": 0.0,
"completion_length": 2778.187515258789,
"epoch": 0.12457142857142857,
"grad_norm": 0.02239762246608734,
"kl": 0.0001312941312789917,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5.657047735161255e-07,
"loss": -0.0326,
"reward": -0.197466429322958,
"reward_after_mean": -0.197466429322958,
"reward_after_std": 0.3072771281003952,
"reward_before_mean": 0.042145409155637026,
"reward_before_std": 0.23205038718879223,
"reward_change_max": 0.0,
"reward_change_mean": -0.23961183801293373,
"reward_change_min": -0.3293800801038742,
"reward_change_std": 0.12456908635795116,
"reward_std": 0.3072771355509758,
"rewards/accuracy_reward": 0.14583333395421505,
"rewards/cosine_scaled_reward": -0.10368793923407793,
"step": 109
},
{
"clip_fraction": 0.0,
"completion_length": 2621.166717529297,
"epoch": 0.12571428571428572,
"grad_norm": 0.028512069955468178,
"kl": 0.00011426769196987152,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5.578535828967777e-07,
"loss": 0.0195,
"reward": 0.20604130998253822,
"reward_after_mean": 0.20604130998253822,
"reward_after_std": 0.6211877912282944,
"reward_before_mean": 0.5551559692248702,
"reward_before_std": 0.5809559132903814,
"reward_change_max": 0.0,
"reward_change_mean": -0.34911466389894485,
"reward_change_min": -0.5866297446191311,
"reward_change_std": 0.22965920250862837,
"reward_std": 0.6211878024041653,
"rewards/accuracy_reward": 0.45833333767950535,
"rewards/cosine_scaled_reward": 0.09682262875139713,
"step": 110
},
{
"clip_fraction": 0.0,
"completion_length": 2751.0625762939453,
"epoch": 0.12685714285714286,
"grad_norm": 0.023924345150589943,
"kl": 0.00016123056411743164,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5.5e-07,
"loss": -0.0046,
"reward": 0.05653337016701698,
"reward_after_mean": 0.05653337016701698,
"reward_after_std": 0.35486595053225756,
"reward_before_mean": 0.38315436616539955,
"reward_before_std": 0.27544057788327336,
"reward_change_max": 0.0,
"reward_change_mean": -0.3266210127621889,
"reward_change_min": -0.47289396263659,
"reward_change_std": 0.1823914684355259,
"reward_std": 0.3548659607768059,
"rewards/accuracy_reward": 0.3125,
"rewards/cosine_scaled_reward": 0.07065436616539955,
"step": 111
},
{
"clip_fraction": 0.0,
"completion_length": 2868.0208740234375,
"epoch": 0.128,
"grad_norm": 0.0201033316552639,
"kl": 0.00013107061386108398,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5.421464171032224e-07,
"loss": 0.015,
"reward": 0.12696666829288006,
"reward_after_mean": 0.12696666829288006,
"reward_after_std": 0.4511380046606064,
"reward_before_mean": 0.4671168327331543,
"reward_before_std": 0.36786600202322006,
"reward_change_max": 0.0,
"reward_change_mean": -0.3401501439511776,
"reward_change_min": -0.5330948643386364,
"reward_change_std": 0.1999045666307211,
"reward_std": 0.4511380158364773,
"rewards/accuracy_reward": 0.3541666679084301,
"rewards/cosine_scaled_reward": 0.11295014806091785,
"step": 112
},
{
"clip_fraction": 0.0,
"completion_length": 1964.208381652832,
"epoch": 0.12914285714285714,
"grad_norm": 0.034607209265232086,
"kl": 0.00012370198965072632,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5.342952264838747e-07,
"loss": 0.0372,
"reward": -0.06578832399100065,
"reward_after_mean": -0.06578832399100065,
"reward_after_std": 0.6056272368878126,
"reward_before_mean": 0.18658637441694736,
"reward_before_std": 0.5886824317276478,
"reward_change_max": 0.0,
"reward_change_mean": -0.2523746956139803,
"reward_change_min": -0.40224030055105686,
"reward_change_std": 0.1620657118037343,
"reward_std": 0.605627266690135,
"rewards/accuracy_reward": 0.25000000931322575,
"rewards/cosine_scaled_reward": -0.06341363039973658,
"step": 113
},
{
"clip_fraction": 0.0,
"completion_length": 1979.9792251586914,
"epoch": 0.13028571428571428,
"grad_norm": 0.026191117241978645,
"kl": 8.503347635269165e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5.264488196906752e-07,
"loss": -0.0251,
"reward": -0.1890785889700055,
"reward_after_mean": -0.1890785889700055,
"reward_after_std": 0.3168610939756036,
"reward_before_mean": 0.054542893543839455,
"reward_before_std": 0.2577635142952204,
"reward_change_max": 0.0,
"reward_change_mean": -0.24362149834632874,
"reward_change_min": -0.3862832821905613,
"reward_change_std": 0.14109937846660614,
"reward_std": 0.31686110980808735,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.11212376441108063,
"step": 114
},
{
"clip_fraction": 0.0,
"completion_length": 2766.020866394043,
"epoch": 0.13142857142857142,
"grad_norm": 0.023059694096446037,
"kl": 0.00013384222984313965,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5.186095868151436e-07,
"loss": 0.0056,
"reward": 0.10441343765705824,
"reward_after_mean": 0.10441343765705824,
"reward_after_std": 0.5611015018075705,
"reward_before_mean": 0.42113048676401377,
"reward_before_std": 0.49046179838478565,
"reward_change_max": 0.0,
"reward_change_mean": -0.3167170472443104,
"reward_change_min": -0.5452346540987492,
"reward_change_std": 0.1955110440030694,
"reward_std": 0.561101520434022,
"rewards/accuracy_reward": 0.35416666977107525,
"rewards/cosine_scaled_reward": 0.06696383003145456,
"step": 115
},
{
"clip_fraction": 0.0,
"completion_length": 3097.833366394043,
"epoch": 0.13257142857142856,
"grad_norm": 0.02340916357934475,
"kl": 0.00016766786575317383,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5.107799157635538e-07,
"loss": -0.0232,
"reward": -0.19432928040623665,
"reward_after_mean": -0.19432928040623665,
"reward_after_std": 0.4012261498719454,
"reward_before_mean": 0.043743424117565155,
"reward_before_std": 0.400667910464108,
"reward_change_max": 0.0,
"reward_change_mean": -0.23807269148528576,
"reward_change_min": -0.395206730812788,
"reward_change_std": 0.1616207016631961,
"reward_std": 0.4012261591851711,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/cosine_scaled_reward": -0.12292325869202614,
"step": 116
},
{
"clip_fraction": 0.0,
"completion_length": 2717.6875610351562,
"epoch": 0.1337142857142857,
"grad_norm": 0.023171402513980865,
"kl": 0.00016413629055023193,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 5.02962191529556e-07,
"loss": -0.0007,
"reward": -0.11855829134583473,
"reward_after_mean": -0.11855829134583473,
"reward_after_std": 0.4816027395427227,
"reward_before_mean": 0.1314456146210432,
"reward_before_std": 0.4761247858405113,
"reward_change_max": 0.0,
"reward_change_mean": -0.25000389851629734,
"reward_change_min": -0.4218711256980896,
"reward_change_std": 0.1656272802501917,
"reward_std": 0.48160274885594845,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/cosine_scaled_reward": -0.03522104769945145,
"step": 117
},
{
"clip_fraction": 0.0,
"completion_length": 2885.2083587646484,
"epoch": 0.13485714285714287,
"grad_norm": 0.01870822347700596,
"kl": 0.00013941526412963867,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4.951587954676837e-07,
"loss": 0.0402,
"reward": 0.3842864651232958,
"reward_after_mean": 0.3842864651232958,
"reward_after_std": 0.7551897596567869,
"reward_before_mean": 0.7818266898393631,
"reward_before_std": 0.7093747109174728,
"reward_change_max": 0.0,
"reward_change_mean": -0.3975402247160673,
"reward_change_min": -0.7075115144252777,
"reward_change_std": 0.272270480170846,
"reward_std": 0.7551897652447224,
"rewards/accuracy_reward": 0.5000000055879354,
"rewards/cosine_scaled_reward": 0.28182668425142765,
"step": 118
},
{
"clip_fraction": 0.0,
"completion_length": 1926.687515258789,
"epoch": 0.136,
"grad_norm": 0.0322633758187294,
"kl": 0.00010208180174231529,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4.873721045679706e-07,
"loss": 0.0234,
"reward": 0.1267358995974064,
"reward_after_mean": 0.1267358995974064,
"reward_after_std": 0.3533069547265768,
"reward_before_mean": 0.47834774386137724,
"reward_before_std": 0.2697556195780635,
"reward_change_max": 0.0,
"reward_change_mean": -0.3516118451952934,
"reward_change_min": -0.5000638235360384,
"reward_change_std": 0.19549622386693954,
"reward_std": 0.3533069621771574,
"rewards/accuracy_reward": 0.3333333358168602,
"rewards/cosine_scaled_reward": 0.14501440059393644,
"step": 119
},
{
"clip_fraction": 0.0,
"completion_length": 2186.1875381469727,
"epoch": 0.13714285714285715,
"grad_norm": 0.031494706869125366,
"kl": 0.0001770704984664917,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4.79604490731896e-07,
"loss": 0.0894,
"reward": 0.07591197546571493,
"reward_after_mean": 0.07591197546571493,
"reward_after_std": 0.5809869198128581,
"reward_before_mean": 0.3816301135811955,
"reward_before_std": 0.551488799508661,
"reward_change_max": 0.0,
"reward_change_mean": -0.3057181444019079,
"reward_change_min": -0.47399745136499405,
"reward_change_std": 0.18618585728108883,
"reward_std": 0.5809869384393096,
"rewards/accuracy_reward": 0.3541666753590107,
"rewards/cosine_scaled_reward": 0.02746343519538641,
"step": 120
},
{
"clip_fraction": 0.0,
"completion_length": 1793.2500228881836,
"epoch": 0.1382857142857143,
"grad_norm": 0.03826192766427994,
"kl": 0.0001408308744430542,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4.7185832004988133e-07,
"loss": 0.063,
"reward": -0.10150941228494048,
"reward_after_mean": -0.10150941228494048,
"reward_after_std": 0.3905038358643651,
"reward_before_mean": 0.16313835605978966,
"reward_before_std": 0.3259769971482456,
"reward_change_max": 0.0,
"reward_change_mean": -0.2646478023380041,
"reward_change_min": -0.3776704464107752,
"reward_change_std": 0.15065845055505633,
"reward_std": 0.3905038433149457,
"rewards/accuracy_reward": 0.22916666977107525,
"rewards/cosine_scaled_reward": -0.0660282839089632,
"step": 121
},
{
"clip_fraction": 0.0,
"completion_length": 2746.604179382324,
"epoch": 0.13942857142857143,
"grad_norm": 0.02808026410639286,
"kl": 0.00018015503883361816,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4.641359520805548e-07,
"loss": 0.0635,
"reward": 0.0752830570563674,
"reward_after_mean": 0.0752830570563674,
"reward_after_std": 0.6505839275196195,
"reward_before_mean": 0.3795435354113579,
"reward_before_std": 0.674185479991138,
"reward_change_max": 0.0,
"reward_change_mean": -0.30426048301160336,
"reward_change_min": -0.6016716994345188,
"reward_change_std": 0.22931489627808332,
"reward_std": 0.6505839368328452,
"rewards/accuracy_reward": 0.3125000074505806,
"rewards/cosine_scaled_reward": 0.06704353634268045,
"step": 122
},
{
"clip_fraction": 0.0,
"completion_length": 2631.7916870117188,
"epoch": 0.14057142857142857,
"grad_norm": 0.02497822605073452,
"kl": 0.00014847517013549805,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4.5643973913200837e-07,
"loss": -0.0774,
"reward": 0.07049691770225763,
"reward_after_mean": 0.07049691770225763,
"reward_after_std": 0.41926424019038677,
"reward_before_mean": 0.39295812509953976,
"reward_before_std": 0.33388588251546025,
"reward_change_max": 0.0,
"reward_change_mean": -0.3224612195044756,
"reward_change_min": -0.4707703944295645,
"reward_change_std": 0.18158453051000834,
"reward_std": 0.41926424857228994,
"rewards/accuracy_reward": 0.33333333395421505,
"rewards/cosine_scaled_reward": 0.05962479766458273,
"step": 123
},
{
"clip_fraction": 0.0,
"completion_length": 2051.0000381469727,
"epoch": 0.1417142857142857,
"grad_norm": 0.025261210277676582,
"kl": 7.904693484306335e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4.4877202554526084e-07,
"loss": 0.0796,
"reward": 0.2634911872446537,
"reward_after_mean": 0.2634911872446537,
"reward_after_std": 0.5967842470854521,
"reward_before_mean": 0.6438823733478785,
"reward_before_std": 0.6048776777461171,
"reward_change_max": 0.0,
"reward_change_mean": -0.38039117865264416,
"reward_change_min": -0.635780643671751,
"reward_change_std": 0.26227735076099634,
"reward_std": 0.5967842638492584,
"rewards/accuracy_reward": 0.4375000074505806,
"rewards/cosine_scaled_reward": 0.20638234540820122,
"step": 124
},
{
"clip_fraction": 0.0,
"completion_length": 2334.8541679382324,
"epoch": 0.14285714285714285,
"grad_norm": 0.023498935624957085,
"kl": 0.00010383129119873047,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4.4113514698014953e-07,
"loss": -0.0518,
"reward": 0.0770783182233572,
"reward_after_mean": 0.0770783182233572,
"reward_after_std": 0.5987216774374247,
"reward_before_mean": 0.376096501480788,
"reward_before_std": 0.5320965368300676,
"reward_change_max": 0.0,
"reward_change_mean": -0.2990182042121887,
"reward_change_min": -0.4671790637075901,
"reward_change_std": 0.17802791390568018,
"reward_std": 0.5987217091023922,
"rewards/accuracy_reward": 0.3125000074505806,
"rewards/cosine_scaled_reward": 0.06359649077057838,
"step": 125
},
{
"clip_fraction": 0.0,
"completion_length": 2312.9583740234375,
"epoch": 0.144,
"grad_norm": 0.02435940131545067,
"kl": 9.564310312271118e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4.3353142970386557e-07,
"loss": 0.0478,
"reward": 0.09839674085378647,
"reward_after_mean": 0.09839674085378647,
"reward_after_std": 0.4974265359342098,
"reward_before_mean": 0.42580396682024,
"reward_before_std": 0.46056526992470026,
"reward_change_max": 0.0,
"reward_change_mean": -0.32740725204348564,
"reward_change_min": -0.4952290430665016,
"reward_change_std": 0.1967391036450863,
"reward_std": 0.4974265471100807,
"rewards/accuracy_reward": 0.33333334140479565,
"rewards/cosine_scaled_reward": 0.09247064776718616,
"step": 126
},
{
"clip_fraction": 0.0,
"completion_length": 3158.5833740234375,
"epoch": 0.14514285714285713,
"grad_norm": 0.021022077649831772,
"kl": 0.00015288591384887695,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4.2596318988235037e-07,
"loss": -0.0607,
"reward": -0.19837769120931625,
"reward_after_mean": -0.19837769120931625,
"reward_after_std": 0.4446485564112663,
"reward_before_mean": 0.024635582813061774,
"reward_before_std": 0.4137072516605258,
"reward_change_max": 0.0,
"reward_change_mean": -0.2230132780969143,
"reward_change_min": -0.3493082635104656,
"reward_change_std": 0.13094482477754354,
"reward_std": 0.4446485601365566,
"rewards/accuracy_reward": 0.1458333395421505,
"rewards/cosine_scaled_reward": -0.12119775079190731,
"step": 127
},
{
"clip_fraction": 0.0,
"completion_length": 2172.7917098999023,
"epoch": 0.1462857142857143,
"grad_norm": 0.02802017331123352,
"kl": 0.0001382678747177124,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4.1843273287476854e-07,
"loss": 0.0109,
"reward": 0.24382356368005276,
"reward_after_mean": 0.24382356368005276,
"reward_after_std": 0.5414328817278147,
"reward_before_mean": 0.6193723455071449,
"reward_before_std": 0.51410650042817,
"reward_change_max": 0.0,
"reward_change_mean": -0.3755488097667694,
"reward_change_min": -0.6071460507810116,
"reward_change_std": 0.23622124083340168,
"reward_std": 0.5414328835904598,
"rewards/accuracy_reward": 0.4375000111758709,
"rewards/cosine_scaled_reward": 0.18187237158417702,
"step": 128
},
{
"clip_fraction": 0.0,
"completion_length": 3269.0625610351562,
"epoch": 0.14742857142857144,
"grad_norm": 0.0199818667024374,
"kl": 0.00011938810348510742,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.027,
"reward": 0.07939598336815834,
"reward_after_mean": 0.07939598336815834,
"reward_after_std": 0.5406165793538094,
"reward_before_mean": 0.39787398651242256,
"reward_before_std": 0.5439900029450655,
"reward_change_max": 0.0,
"reward_change_mean": -0.318478025496006,
"reward_change_min": -0.5440695825964212,
"reward_change_std": 0.21484098490327597,
"reward_std": 0.5406166054308414,
"rewards/accuracy_reward": 0.3125000111758709,
"rewards/cosine_scaled_reward": 0.08537399768829346,
"step": 129
},
{
"clip_fraction": 0.0,
"completion_length": 3137.0833740234375,
"epoch": 0.14857142857142858,
"grad_norm": 0.01886666566133499,
"kl": 0.00018727779388427734,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 4.034943304942796e-07,
"loss": 0.0105,
"reward": -0.13560364861041307,
"reward_after_mean": -0.13560364861041307,
"reward_after_std": 0.4868227355182171,
"reward_before_mean": 0.11097788251936436,
"reward_before_std": 0.49222803860902786,
"reward_change_max": 0.0,
"reward_change_mean": -0.24658154882490635,
"reward_change_min": -0.455986674875021,
"reward_change_std": 0.17662312928587198,
"reward_std": 0.486822746694088,
"rewards/accuracy_reward": 0.1875000037252903,
"rewards/cosine_scaled_reward": -0.07652211067033932,
"step": 130
},
{
"clip_fraction": 0.0,
"completion_length": 2480.5208892822266,
"epoch": 0.14971428571428572,
"grad_norm": 0.027400832623243332,
"kl": 0.00014418736100196838,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.0254,
"reward": 0.07110828906297684,
"reward_after_mean": 0.07110828906297684,
"reward_after_std": 0.36002582497894764,
"reward_before_mean": 0.4055180950090289,
"reward_before_std": 0.3088535754941404,
"reward_change_max": 0.0,
"reward_change_mean": -0.33440983295440674,
"reward_change_min": -0.470782570540905,
"reward_change_std": 0.18981244694441557,
"reward_std": 0.3600258268415928,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/cosine_scaled_reward": 0.0721847927197814,
"step": 131
},
{
"clip_fraction": 0.0,
"completion_length": 2486.645851135254,
"epoch": 0.15085714285714286,
"grad_norm": 0.02796083875000477,
"kl": 0.00013111159205436707,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.8873442270461485e-07,
"loss": 0.0842,
"reward": 0.2597197503782809,
"reward_after_mean": 0.2597197503782809,
"reward_after_std": 0.744433356449008,
"reward_before_mean": 0.6226858850568533,
"reward_before_std": 0.7914422228932381,
"reward_change_max": 0.0,
"reward_change_mean": -0.3629660848528147,
"reward_change_min": -0.6258580330759287,
"reward_change_std": 0.2674333294853568,
"reward_std": 0.7444333788007498,
"rewards/accuracy_reward": 0.43750001303851604,
"rewards/cosine_scaled_reward": 0.18518583837430924,
"step": 132
},
{
"clip_fraction": 0.0,
"completion_length": 2971.8958587646484,
"epoch": 0.152,
"grad_norm": 0.019855745136737823,
"kl": 0.00016885995864868164,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.8142703296283953e-07,
"loss": -0.0021,
"reward": -0.15273336577229202,
"reward_after_mean": -0.15273336577229202,
"reward_after_std": 0.46419150568544865,
"reward_before_mean": 0.08369430713355541,
"reward_before_std": 0.410770776681602,
"reward_change_max": 0.0,
"reward_change_mean": -0.2364276945590973,
"reward_change_min": -0.357030825689435,
"reward_change_std": 0.13842764357104897,
"reward_std": 0.4641915149986744,
"rewards/accuracy_reward": 0.2500000037252903,
"rewards/cosine_scaled_reward": -0.1663056961260736,
"step": 133
},
{
"clip_fraction": 0.0,
"completion_length": 2386.479202270508,
"epoch": 0.15314285714285714,
"grad_norm": 0.026198577135801315,
"kl": 0.00013698264956474304,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.7417099217982686e-07,
"loss": -0.0157,
"reward": 0.19155731541104615,
"reward_after_mean": 0.19155731541104615,
"reward_after_std": 0.5861021094024181,
"reward_before_mean": 0.5390464821830392,
"reward_before_std": 0.5199935543350875,
"reward_change_max": 0.0,
"reward_change_mean": -0.34748917259275913,
"reward_change_min": -0.5791817046701908,
"reward_change_std": 0.2208269750699401,
"reward_std": 0.586102120578289,
"rewards/accuracy_reward": 0.37500000186264515,
"rewards/cosine_scaled_reward": 0.16404648189200088,
"step": 134
},
{
"clip_fraction": 0.0,
"completion_length": 1384.2083587646484,
"epoch": 0.15428571428571428,
"grad_norm": 0.05298980697989464,
"kl": 6.917491555213928e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.6696851061588994e-07,
"loss": 0.0295,
"reward": 0.32029616460204124,
"reward_after_mean": 0.32029616460204124,
"reward_after_std": 0.469398295506835,
"reward_before_mean": 0.7312853448092937,
"reward_before_std": 0.38515608105808496,
"reward_change_max": 0.0,
"reward_change_mean": -0.4109892025589943,
"reward_change_min": -0.6098762080073357,
"reward_change_std": 0.24706434831023216,
"reward_std": 0.4693983215838671,
"rewards/accuracy_reward": 0.5208333358168602,
"rewards/cosine_scaled_reward": 0.2104520034044981,
"step": 135
},
{
"clip_fraction": 0.0,
"completion_length": 2418.229179382324,
"epoch": 0.15542857142857142,
"grad_norm": 0.02546970546245575,
"kl": 0.00011172890663146973,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.5982178221668533e-07,
"loss": 0.0076,
"reward": 0.20953483134508133,
"reward_after_mean": 0.20953483134508133,
"reward_after_std": 0.5650828517973423,
"reward_before_mean": 0.5645723771303892,
"reward_before_std": 0.49963863380253315,
"reward_change_max": 0.0,
"reward_change_mean": -0.3550375532358885,
"reward_change_min": -0.5531989093869925,
"reward_change_std": 0.21487182471901178,
"reward_std": 0.5650828760117292,
"rewards/accuracy_reward": 0.39583333395421505,
"rewards/cosine_scaled_reward": 0.16873904690146446,
"step": 136
},
{
"clip_fraction": 0.0,
"completion_length": 2880.583396911621,
"epoch": 0.15657142857142858,
"grad_norm": 0.02026941254734993,
"kl": 0.00010737031698226929,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.5273298394491515e-07,
"loss": -0.0051,
"reward": -0.23039704840630293,
"reward_after_mean": -0.23039704840630293,
"reward_after_std": 0.4545791279524565,
"reward_before_mean": -0.0207052119076252,
"reward_before_std": 0.4199818782508373,
"reward_change_max": 0.0,
"reward_change_mean": -0.2096918299794197,
"reward_change_min": -0.3325869217514992,
"reward_change_std": 0.12889837939292192,
"reward_std": 0.45457913912832737,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/cosine_scaled_reward": -0.1457052135374397,
"step": 137
},
{
"clip_fraction": 0.0,
"completion_length": 2572.875045776367,
"epoch": 0.15771428571428572,
"grad_norm": 0.024446720257401466,
"kl": 9.263120591640472e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.45704275117204e-07,
"loss": -0.0266,
"reward": -0.0885739466175437,
"reward_after_mean": -0.0885739466175437,
"reward_after_std": 0.40795043855905533,
"reward_before_mean": 0.17822610400617123,
"reward_before_std": 0.33472174778580666,
"reward_change_max": 0.0,
"reward_change_mean": -0.2668000441044569,
"reward_change_min": -0.3915207665413618,
"reward_change_std": 0.1494421288371086,
"reward_std": 0.40795045532286167,
"rewards/accuracy_reward": 0.22916666977107525,
"rewards/cosine_scaled_reward": -0.05094056576490402,
"step": 138
},
{
"clip_fraction": 0.0,
"completion_length": 2697.7916946411133,
"epoch": 0.15885714285714286,
"grad_norm": 0.02511826530098915,
"kl": 0.00013872981071472168,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.387377967463493e-07,
"loss": 0.0188,
"reward": -0.028292442206293344,
"reward_after_mean": -0.028292442206293344,
"reward_after_std": 0.40028360672295094,
"reward_before_mean": 0.25999742932617664,
"reward_before_std": 0.3327331282198429,
"reward_change_max": 0.0,
"reward_change_mean": -0.2882898673415184,
"reward_change_min": -0.41081140749156475,
"reward_change_std": 0.15616974979639053,
"reward_std": 0.40028361417353153,
"rewards/accuracy_reward": 0.25000000558793545,
"rewards/cosine_scaled_reward": 0.009997433982789516,
"step": 139
},
{
"clip_fraction": 0.0,
"completion_length": 2810.041732788086,
"epoch": 0.16,
"grad_norm": 0.038509517908096313,
"kl": 0.0001595616340637207,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.3183567088914833e-07,
"loss": 0.0088,
"reward": 0.14832666027359664,
"reward_after_mean": 0.14832666027359664,
"reward_after_std": 0.6521247308701277,
"reward_before_mean": 0.4682345949113369,
"reward_before_std": 0.5863152034580708,
"reward_change_max": 0.0,
"reward_change_mean": -0.31990794092416763,
"reward_change_min": -0.5289614573121071,
"reward_change_std": 0.20588868111371994,
"reward_std": 0.652124747633934,
"rewards/accuracy_reward": 0.3541666716337204,
"rewards/cosine_scaled_reward": 0.11406791373156011,
"step": 140
},
{
"clip_fraction": 0.0,
"completion_length": 2737.7709045410156,
"epoch": 0.16114285714285714,
"grad_norm": 0.02538427524268627,
"kl": 0.00011593103408813477,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.250000000000001e-07,
"loss": -0.0546,
"reward": -0.2177041512914002,
"reward_after_mean": -0.2177041512914002,
"reward_after_std": 0.594437601044774,
"reward_before_mean": -0.023291918449103832,
"reward_before_std": 0.568119059316814,
"reward_change_max": 0.0,
"reward_change_mean": -0.19441223330795765,
"reward_change_min": -0.386432521045208,
"reward_change_std": 0.13637393061071634,
"reward_std": 0.5944376047700644,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/cosine_scaled_reward": -0.1482919171685353,
"step": 141
},
{
"clip_fraction": 0.0,
"completion_length": 2527.4583740234375,
"epoch": 0.16228571428571428,
"grad_norm": 0.021552937105298042,
"kl": 0.0001442432403564453,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.182328662904756e-07,
"loss": -0.0467,
"reward": 0.038692621514201164,
"reward_after_mean": 0.038692621514201164,
"reward_after_std": 0.49257983826100826,
"reward_before_mean": 0.33910070918500423,
"reward_before_std": 0.43346802331507206,
"reward_change_max": 0.0,
"reward_change_mean": -0.3004080858081579,
"reward_change_min": -0.472257686778903,
"reward_change_std": 0.17685699556022882,
"reward_std": 0.49257984571158886,
"rewards/accuracy_reward": 0.29166667349636555,
"rewards/cosine_scaled_reward": 0.047434025444090366,
"step": 142
},
{
"clip_fraction": 0.0,
"completion_length": 2238.604202270508,
"epoch": 0.16342857142857142,
"grad_norm": 0.025303639471530914,
"kl": 0.00010967254638671875,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.115363310950578e-07,
"loss": 0.0228,
"reward": -0.13753529638051987,
"reward_after_mean": -0.13753529638051987,
"reward_after_std": 0.3711821623146534,
"reward_before_mean": 0.12087950762361288,
"reward_before_std": 0.3427344807423651,
"reward_change_max": 0.0,
"reward_change_mean": -0.2584147993475199,
"reward_change_min": -0.39502568170428276,
"reward_change_std": 0.15850333217531443,
"reward_std": 0.37118216790258884,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/cosine_scaled_reward": -0.045787165872752666,
"step": 143
},
{
"clip_fraction": 0.0,
"completion_length": 2460.583381652832,
"epoch": 0.16457142857142856,
"grad_norm": 0.03277630731463432,
"kl": 0.00010095536708831787,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 3.0491243424323783e-07,
"loss": 0.0722,
"reward": 0.4543617179733701,
"reward_after_mean": 0.4543617179733701,
"reward_after_std": 0.7069333475083113,
"reward_before_mean": 0.8829482905566692,
"reward_before_std": 0.6676272489130497,
"reward_change_max": 0.0,
"reward_change_mean": -0.42858656495809555,
"reward_change_min": -0.6201369129121304,
"reward_change_std": 0.2603969210758805,
"reward_std": 0.7069333605468273,
"rewards/accuracy_reward": 0.5833333469927311,
"rewards/cosine_scaled_reward": 0.2996149277314544,
"step": 144
},
{
"clip_fraction": 0.0,
"completion_length": 1835.270866394043,
"epoch": 0.1657142857142857,
"grad_norm": 0.03344608470797539,
"kl": 0.00010331720113754272,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.9836319343816397e-07,
"loss": 0.0229,
"reward": 0.09240274596959352,
"reward_after_mean": 0.09240274596959352,
"reward_after_std": 0.45070875994861126,
"reward_before_mean": 0.41964344773441553,
"reward_before_std": 0.37119755055755377,
"reward_change_max": 0.0,
"reward_change_mean": -0.32724072225391865,
"reward_change_min": -0.4856059141457081,
"reward_change_std": 0.19001474510878325,
"reward_std": 0.45070877112448215,
"rewards/accuracy_reward": 0.33333333395421505,
"rewards/cosine_scaled_reward": 0.08631011750549078,
"step": 145
},
{
"clip_fraction": 0.0,
"completion_length": 2060.9583892822266,
"epoch": 0.16685714285714287,
"grad_norm": 0.026087850332260132,
"kl": 0.00010473420843482018,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.918906036420294e-07,
"loss": -0.022,
"reward": -0.3456582888029516,
"reward_after_mean": -0.3456582888029516,
"reward_after_std": 0.3900475464761257,
"reward_before_mean": -0.16955635324120522,
"reward_before_std": 0.35085606575012207,
"reward_change_max": 0.0,
"reward_change_mean": -0.17610193602740765,
"reward_change_min": -0.3039908893406391,
"reward_change_std": 0.1093815853819251,
"reward_std": 0.3900475464761257,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/cosine_scaled_reward": -0.2320563482644502,
"step": 146
},
{
"clip_fraction": 0.0,
"completion_length": 3446.062530517578,
"epoch": 0.168,
"grad_norm": 0.017223268747329712,
"kl": 0.00015714764595031738,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.854966364683872e-07,
"loss": 0.035,
"reward": 0.05932202748954296,
"reward_after_mean": 0.05932202748954296,
"reward_after_std": 0.647826049476862,
"reward_before_mean": 0.3570065386593342,
"reward_before_std": 0.6669072303920984,
"reward_change_max": 0.0,
"reward_change_mean": -0.2976845409721136,
"reward_change_min": -0.5494338441640139,
"reward_change_std": 0.21914179529994726,
"reward_std": 0.647826075553894,
"rewards/accuracy_reward": 0.29166667349636555,
"rewards/cosine_scaled_reward": 0.06533987820148468,
"step": 147
},
{
"clip_fraction": 0.0,
"completion_length": 2190.6875534057617,
"epoch": 0.16914285714285715,
"grad_norm": 0.026238586753606796,
"kl": 0.00011092424392700195,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.791832395815782e-07,
"loss": 0.0352,
"reward": 0.02173960581421852,
"reward_after_mean": 0.02173960581421852,
"reward_after_std": 0.6264125965535641,
"reward_before_mean": 0.3032309217378497,
"reward_before_std": 0.5903957700356841,
"reward_change_max": 0.0,
"reward_change_mean": -0.28149132430553436,
"reward_change_min": -0.49092668667435646,
"reward_change_std": 0.19152779690921307,
"reward_std": 0.6264126114547253,
"rewards/accuracy_reward": 0.2916666679084301,
"rewards/cosine_scaled_reward": 0.011564248241484165,
"step": 148
},
{
"clip_fraction": 0.0,
"completion_length": 2382.7083587646484,
"epoch": 0.1702857142857143,
"grad_norm": 0.02286612056195736,
"kl": 9.250640869140625e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.729523361034538e-07,
"loss": 0.0309,
"reward": 0.1198802962899208,
"reward_after_mean": 0.1198802962899208,
"reward_after_std": 0.4947097469121218,
"reward_before_mean": 0.45648779161274433,
"reward_before_std": 0.46653653495013714,
"reward_change_max": 0.0,
"reward_change_mean": -0.33660750463604927,
"reward_change_min": -0.5417331680655479,
"reward_change_std": 0.21329155191779137,
"reward_std": 0.4947097636759281,
"rewards/accuracy_reward": 0.3541666753590107,
"rewards/cosine_scaled_reward": 0.10232111997902393,
"step": 149
},
{
"clip_fraction": 0.0,
"completion_length": 2792.333351135254,
"epoch": 0.17142857142857143,
"grad_norm": 0.022381342947483063,
"kl": 0.00012479722499847412,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.6680582402757324e-07,
"loss": 0.0524,
"reward": -0.01498852763324976,
"reward_after_mean": -0.01498852763324976,
"reward_after_std": 0.664625771343708,
"reward_before_mean": 0.25175523199141026,
"reward_before_std": 0.6803292762488127,
"reward_change_max": 0.0,
"reward_change_mean": -0.26674376986920834,
"reward_change_min": -0.53781808167696,
"reward_change_std": 0.20475875865668058,
"reward_std": 0.6646257899701595,
"rewards/accuracy_reward": 0.25000000558793545,
"rewards/cosine_scaled_reward": 0.001755234319716692,
"step": 150
},
{
"clip_fraction": 0.0,
"completion_length": 2384.8125610351562,
"epoch": 0.17257142857142857,
"grad_norm": 0.028256850317120552,
"kl": 0.00014731287956237793,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.6074557564105724e-07,
"loss": 0.138,
"reward": 0.35779024893417954,
"reward_after_mean": 0.35779024893417954,
"reward_after_std": 0.7775043789297342,
"reward_before_mean": 0.745909059420228,
"reward_before_std": 0.7730156276375055,
"reward_change_max": 0.0,
"reward_change_mean": -0.3881188202649355,
"reward_change_min": -0.6950398050248623,
"reward_change_std": 0.271209386177361,
"reward_std": 0.7775043845176697,
"rewards/accuracy_reward": 0.47916667722165585,
"rewards/cosine_scaled_reward": 0.26674237998668104,
"step": 151
},
{
"clip_fraction": 0.0,
"completion_length": 2806.062511444092,
"epoch": 0.1737142857142857,
"grad_norm": 0.03883467614650726,
"kl": 0.00019087642431259155,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.547734369542718e-07,
"loss": 0.0386,
"reward": -0.1646068338304758,
"reward_after_mean": -0.1646068338304758,
"reward_after_std": 0.5001123249530792,
"reward_before_mean": 0.06822102330625057,
"reward_before_std": 0.49387937784194946,
"reward_change_max": 0.0,
"reward_change_mean": -0.23282786458730698,
"reward_change_min": -0.45555905625224113,
"reward_change_std": 0.16819952800869942,
"reward_std": 0.5001123435795307,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.09844564087688923,
"step": 152
},
{
"clip_fraction": 0.0,
"completion_length": 2609.625045776367,
"epoch": 0.17485714285714285,
"grad_norm": 0.029369013383984566,
"kl": 0.00015197694301605225,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.488912271385139e-07,
"loss": 0.0563,
"reward": -0.2579225329682231,
"reward_after_mean": -0.2579225329682231,
"reward_after_std": 0.41702854074537754,
"reward_before_mean": -0.0514589948579669,
"reward_before_std": 0.3938685590401292,
"reward_change_max": 0.0,
"reward_change_mean": -0.2064635269343853,
"reward_change_min": -0.3605457991361618,
"reward_change_std": 0.13238902669399977,
"reward_std": 0.4170285416767001,
"rewards/accuracy_reward": 0.14583333767950535,
"rewards/cosine_scaled_reward": -0.19729233742691576,
"step": 153
},
{
"clip_fraction": 0.0,
"completion_length": 2773.625045776367,
"epoch": 0.176,
"grad_norm": 0.019490770995616913,
"kl": 0.00013262033462524414,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.4310073797187573e-07,
"loss": 0.0404,
"reward": 0.2587023861706257,
"reward_after_mean": 0.2587023861706257,
"reward_after_std": 0.5390413794666529,
"reward_before_mean": 0.6395768262445927,
"reward_before_std": 0.492486541159451,
"reward_change_max": 0.0,
"reward_change_mean": -0.3808744475245476,
"reward_change_min": -0.6062983945012093,
"reward_change_std": 0.24063634779304266,
"reward_std": 0.5390413850545883,
"rewards/accuracy_reward": 0.4583333395421505,
"rewards/cosine_scaled_reward": 0.18124347925186157,
"step": 154
},
{
"clip_fraction": 0.0,
"completion_length": 2265.0833435058594,
"epoch": 0.17714285714285713,
"grad_norm": 0.02680964767932892,
"kl": 0.00013046711683273315,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.374037332934512e-07,
"loss": -0.0159,
"reward": 0.21415067370980978,
"reward_after_mean": 0.21415067370980978,
"reward_after_std": 0.6358739994466305,
"reward_before_mean": 0.5597648918628693,
"reward_before_std": 0.5540275080129504,
"reward_change_max": 0.0,
"reward_change_mean": -0.34561420790851116,
"reward_change_min": -0.6032971385866404,
"reward_change_std": 0.2241066563874483,
"reward_std": 0.6358740348368883,
"rewards/accuracy_reward": 0.4583333358168602,
"rewards/cosine_scaled_reward": 0.10143155371770263,
"step": 155
},
{
"clip_fraction": 0.0,
"completion_length": 2724.979179382324,
"epoch": 0.1782857142857143,
"grad_norm": 0.02436206117272377,
"kl": 0.00014898180961608887,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.3180194846605364e-07,
"loss": -0.0307,
"reward": -0.16922233253717422,
"reward_after_mean": -0.16922233253717422,
"reward_after_std": 0.4979768879711628,
"reward_before_mean": 0.06080557717359625,
"reward_before_std": 0.48400300554931164,
"reward_change_max": 0.0,
"reward_change_mean": -0.2300279177725315,
"reward_change_min": -0.38487651385366917,
"reward_change_std": 0.1520459521561861,
"reward_std": 0.49797692708671093,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/cosine_scaled_reward": -0.10586108081042767,
"step": 156
},
{
"clip_fraction": 0.0,
"completion_length": 2841.1666870117188,
"epoch": 0.17942857142857144,
"grad_norm": 0.020490428432822227,
"kl": 0.00014954805374145508,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.2629708984760706e-07,
"loss": 0.0408,
"reward": -0.16689512692391872,
"reward_after_mean": -0.16689512692391872,
"reward_after_std": 0.449503768235445,
"reward_before_mean": 0.06273568281903863,
"reward_before_std": 0.369269410148263,
"reward_change_max": 0.0,
"reward_change_mean": -0.22963083535432816,
"reward_change_min": -0.339257437735796,
"reward_change_std": 0.12097494397312403,
"reward_std": 0.44950377382338047,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.10393097810447216,
"step": 157
},
{
"clip_fraction": 0.0,
"completion_length": 2291.1875610351562,
"epoch": 0.18057142857142858,
"grad_norm": 0.028161099180579185,
"kl": 0.00011629331856966019,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.2089083427137329e-07,
"loss": -0.0064,
"reward": 0.3283900278620422,
"reward_after_mean": 0.3283900278620422,
"reward_after_std": 0.71586455963552,
"reward_before_mean": 0.7013807380571961,
"reward_before_std": 0.6162153771147132,
"reward_change_max": 0.0,
"reward_change_mean": -0.3729907236993313,
"reward_change_min": -0.5737095661461353,
"reward_change_std": 0.22160479053854942,
"reward_std": 0.715864596888423,
"rewards/accuracy_reward": 0.41666666977107525,
"rewards/cosine_scaled_reward": 0.2847140731755644,
"step": 158
},
{
"clip_fraction": 0.0,
"completion_length": 3291.9584045410156,
"epoch": 0.18171428571428572,
"grad_norm": 0.01683027669787407,
"kl": 0.00015926361083984375,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.1558482853517253e-07,
"loss": -0.0387,
"reward": -0.14818060956895351,
"reward_after_mean": -0.14818060956895351,
"reward_after_std": 0.4753723032772541,
"reward_before_mean": 0.09327768813818693,
"reward_before_std": 0.46588534861803055,
"reward_change_max": 0.0,
"reward_change_mean": -0.24145829305052757,
"reward_change_min": -0.4540882632136345,
"reward_change_std": 0.1682984195649624,
"reward_std": 0.4753723070025444,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/cosine_scaled_reward": -0.07338898256421089,
"step": 159
},
{
"clip_fraction": 0.0,
"completion_length": 2902.479217529297,
"epoch": 0.18285714285714286,
"grad_norm": 0.023850562050938606,
"kl": 0.00019377470016479492,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.1038068889975259e-07,
"loss": 0.0365,
"reward": 0.07942142337560654,
"reward_after_mean": 0.07942142337560654,
"reward_after_std": 0.5131905730813742,
"reward_before_mean": 0.39866685029119253,
"reward_before_std": 0.48175664618611336,
"reward_change_max": 0.0,
"reward_change_mean": -0.3192454166710377,
"reward_change_min": -0.5308761186897755,
"reward_change_std": 0.2128805061802268,
"reward_std": 0.5131905842572451,
"rewards/accuracy_reward": 0.3333333358168602,
"rewards/cosine_scaled_reward": 0.06533349771052599,
"step": 160
},
{
"clip_fraction": 0.0,
"completion_length": 2287.291732788086,
"epoch": 0.184,
"grad_norm": 0.025397833436727524,
"kl": 0.00013327598571777344,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.1125,
"reward": 0.009378287941217422,
"reward_after_mean": 0.009378287941217422,
"reward_after_std": 0.5341452080756426,
"reward_before_mean": 0.30027469992637634,
"reward_before_std": 0.5237694401293993,
"reward_change_max": 0.0,
"reward_change_mean": -0.290896400809288,
"reward_change_min": -0.46712340973317623,
"reward_change_std": 0.18900143820792437,
"reward_std": 0.5341452155262232,
"rewards/accuracy_reward": 0.29166667722165585,
"rewards/cosine_scaled_reward": 0.008608013857156038,
"step": 161
},
{
"clip_fraction": 0.0,
"completion_length": 3025.312545776367,
"epoch": 0.18514285714285714,
"grad_norm": 0.02271593175828457,
"kl": 0.00016546249389648438,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 2.0028431734436308e-07,
"loss": 0.0268,
"reward": 0.14291446842253208,
"reward_after_mean": 0.14291446842253208,
"reward_after_std": 0.4402059204876423,
"reward_before_mean": 0.4916996471583843,
"reward_before_std": 0.36170108430087566,
"reward_change_max": 0.0,
"reward_change_mean": -0.3487852066755295,
"reward_change_min": -0.5253034494817257,
"reward_change_std": 0.20280368253588676,
"reward_std": 0.4402059353888035,
"rewards/accuracy_reward": 0.3958333395421505,
"rewards/cosine_scaled_reward": 0.09586631692945957,
"step": 162
},
{
"clip_fraction": 0.0,
"completion_length": 2381.000030517578,
"epoch": 0.18628571428571428,
"grad_norm": 0.02407103404402733,
"kl": 0.00012842798605561256,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.9539516087697517e-07,
"loss": 0.0324,
"reward": 0.11635435372591019,
"reward_after_mean": 0.11635435372591019,
"reward_after_std": 0.34932328946888447,
"reward_before_mean": 0.46577244251966476,
"reward_before_std": 0.26789624989032745,
"reward_change_max": 0.0,
"reward_change_mean": -0.34941811859607697,
"reward_change_min": -0.4971570298075676,
"reward_change_std": 0.18925141356885433,
"reward_std": 0.3493233025074005,
"rewards/accuracy_reward": 0.3541666716337204,
"rewards/cosine_scaled_reward": 0.11160578578710556,
"step": 163
},
{
"clip_fraction": 0.0,
"completion_length": 2291.8750610351562,
"epoch": 0.18742857142857142,
"grad_norm": 0.028548384085297585,
"kl": 0.00012259185314178467,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.9061402047871833e-07,
"loss": 0.0285,
"reward": 0.1446489430963993,
"reward_after_mean": 0.1446489430963993,
"reward_after_std": 0.5495432000607252,
"reward_before_mean": 0.4804554167203605,
"reward_before_std": 0.5176653284579515,
"reward_change_max": 0.0,
"reward_change_mean": -0.33580648340284824,
"reward_change_min": -0.5389121547341347,
"reward_change_std": 0.21476514916867018,
"reward_std": 0.5495432112365961,
"rewards/accuracy_reward": 0.37500000558793545,
"rewards/cosine_scaled_reward": 0.1054554246366024,
"step": 164
},
{
"clip_fraction": 0.0,
"completion_length": 2865.854248046875,
"epoch": 0.18857142857142858,
"grad_norm": 0.02938673086464405,
"kl": 0.00016605854034423828,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.1174,
"reward": -0.1965638529509306,
"reward_after_mean": -0.1965638529509306,
"reward_after_std": 0.47414442524313927,
"reward_before_mean": 0.02646741457283497,
"reward_before_std": 0.45417727902531624,
"reward_change_max": 0.0,
"reward_change_mean": -0.22303126752376556,
"reward_change_min": -0.38297396898269653,
"reward_change_std": 0.14797239750623703,
"reward_std": 0.47414442524313927,
"rewards/accuracy_reward": 0.14583333395421505,
"rewards/cosine_scaled_reward": -0.11936591006815434,
"step": 165
},
{
"clip_fraction": 0.0,
"completion_length": 2499.2291870117188,
"epoch": 0.18971428571428572,
"grad_norm": 0.01974744163453579,
"kl": 0.0001074373722076416,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.8138158006995363e-07,
"loss": 0.0582,
"reward": 0.16037676320411265,
"reward_after_mean": 0.16037676320411265,
"reward_after_std": 0.5590666178613901,
"reward_before_mean": 0.49760086461901665,
"reward_before_std": 0.5006757825613022,
"reward_change_max": 0.0,
"reward_change_mean": -0.33722409792244434,
"reward_change_min": -0.4930756986141205,
"reward_change_std": 0.1948847435414791,
"reward_std": 0.5590666364878416,
"rewards/accuracy_reward": 0.3541666753590107,
"rewards/cosine_scaled_reward": 0.14343418591306545,
"step": 166
},
{
"clip_fraction": 0.0,
"completion_length": 2107.625030517578,
"epoch": 0.19085714285714286,
"grad_norm": 0.025774935260415077,
"kl": 0.00010276585817337036,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.7693309235023127e-07,
"loss": -0.0464,
"reward": -0.030031360685825348,
"reward_after_mean": -0.030031360685825348,
"reward_after_std": 0.4247422106564045,
"reward_before_mean": 0.2586988788098097,
"reward_before_std": 0.37567474879324436,
"reward_change_max": 0.0,
"reward_change_mean": -0.28873022459447384,
"reward_change_min": -0.4525489006191492,
"reward_change_std": 0.17794163059443235,
"reward_std": 0.42474222742021084,
"rewards/accuracy_reward": 0.2708333358168602,
"rewards/cosine_scaled_reward": -0.012134447693824768,
"step": 167
},
{
"clip_fraction": 0.0,
"completion_length": 2797.791717529297,
"epoch": 0.192,
"grad_norm": 0.019881395623087883,
"kl": 0.00014641880989074707,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.7259824442455923e-07,
"loss": 0.0198,
"reward": -0.06211714819073677,
"reward_after_mean": -0.06211714819073677,
"reward_after_std": 0.442181596532464,
"reward_before_mean": 0.21284539625048637,
"reward_before_std": 0.3944641398265958,
"reward_change_max": 0.0,
"reward_change_mean": -0.2749625276774168,
"reward_change_min": -0.46516215056180954,
"reward_change_std": 0.1713942475616932,
"reward_std": 0.4421816077083349,
"rewards/accuracy_reward": 0.2291666679084301,
"rewards/cosine_scaled_reward": -0.01632128842175007,
"step": 168
},
{
"clip_fraction": 0.0,
"completion_length": 2072.145881652832,
"epoch": 0.19314285714285714,
"grad_norm": 0.024925388395786285,
"kl": 0.00010547041893005371,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.6837835672960831e-07,
"loss": -0.039,
"reward": 0.33457482111407444,
"reward_after_mean": 0.33457482111407444,
"reward_after_std": 0.45331256836652756,
"reward_before_mean": 0.7455343250185251,
"reward_before_std": 0.3269349467009306,
"reward_change_max": 0.0,
"reward_change_mean": -0.41095951199531555,
"reward_change_min": -0.5560596249997616,
"reward_change_std": 0.2161042196676135,
"reward_std": 0.45331257209181786,
"rewards/accuracy_reward": 0.5000000055879354,
"rewards/cosine_scaled_reward": 0.24553431570529938,
"step": 169
},
{
"clip_fraction": 0.0,
"completion_length": 2338.166717529297,
"epoch": 0.19428571428571428,
"grad_norm": 0.027214445173740387,
"kl": 0.00011979043483734131,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.6427471468404952e-07,
"loss": -0.0404,
"reward": 0.05816604569554329,
"reward_after_mean": 0.05816604569554329,
"reward_after_std": 0.33920222520828247,
"reward_before_mean": 0.38867138512432575,
"reward_before_std": 0.26190874679014087,
"reward_change_max": 0.0,
"reward_change_mean": -0.33050532080233097,
"reward_change_min": -0.47121275775134563,
"reward_change_std": 0.1878534136340022,
"reward_std": 0.33920223265886307,
"rewards/accuracy_reward": 0.3333333358168602,
"rewards/cosine_scaled_reward": 0.055338045582175255,
"step": 170
},
{
"clip_fraction": 0.0,
"completion_length": 2271.375015258789,
"epoch": 0.19542857142857142,
"grad_norm": 0.027967043220996857,
"kl": 0.00012468546628952026,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.6028856829700258e-07,
"loss": -0.0165,
"reward": 0.06615722924470901,
"reward_after_mean": 0.06615722924470901,
"reward_after_std": 0.4302575755864382,
"reward_before_mean": 0.38474041223526,
"reward_before_std": 0.33841412514448166,
"reward_change_max": 0.0,
"reward_change_mean": -0.3185831569135189,
"reward_change_min": -0.4822757709771395,
"reward_change_std": 0.17764397989958525,
"reward_std": 0.4302575970068574,
"rewards/accuracy_reward": 0.31250000186264515,
"rewards/cosine_scaled_reward": 0.07224038429558277,
"step": 171
},
{
"clip_fraction": 0.0,
"completion_length": 2777.7708702087402,
"epoch": 0.19657142857142856,
"grad_norm": 0.02615823782980442,
"kl": 0.00017011165618896484,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.5642113178727193e-07,
"loss": -0.0005,
"reward": 0.03716196492314339,
"reward_after_mean": 0.03716196492314339,
"reward_after_std": 0.44201391376554966,
"reward_before_mean": 0.3422218947671354,
"reward_before_std": 0.3421974731609225,
"reward_change_max": 0.0,
"reward_change_mean": -0.30505993589758873,
"reward_change_min": -0.4638092163950205,
"reward_change_std": 0.17159069795161486,
"reward_std": 0.44201392494142056,
"rewards/accuracy_reward": 0.31250000186264515,
"rewards/cosine_scaled_reward": 0.029721886618062854,
"step": 172
},
{
"clip_fraction": 0.0,
"completion_length": 1847.3333435058594,
"epoch": 0.1977142857142857,
"grad_norm": 0.03499305993318558,
"kl": 7.937708869576454e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.0217,
"reward": -0.23481484316289425,
"reward_after_mean": -0.23481484316289425,
"reward_after_std": 0.24997185822576284,
"reward_before_mean": -0.0019354680553078651,
"reward_before_std": 0.1694914740510285,
"reward_change_max": 0.0,
"reward_change_mean": -0.23287938348948956,
"reward_change_min": -0.31925959698855877,
"reward_change_std": 0.12057728180661798,
"reward_std": 0.24997186101973057,
"rewards/accuracy_reward": 0.14583333395421505,
"rewards/cosine_scaled_reward": -0.14776881225407124,
"step": 173
},
{
"clip_fraction": 0.0,
"completion_length": 1855.6666870117188,
"epoch": 0.19885714285714284,
"grad_norm": 0.025248348712921143,
"kl": 0.00010962784290313721,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.4904706411523448e-07,
"loss": -0.0114,
"reward": -0.07872031070291996,
"reward_after_mean": -0.07872031070291996,
"reward_after_std": 0.5705854296684265,
"reward_before_mean": 0.17297677602618933,
"reward_before_std": 0.5529223121702671,
"reward_change_max": 0.0,
"reward_change_mean": -0.25169707648456097,
"reward_change_min": -0.41667111963033676,
"reward_change_std": 0.16653849836438894,
"reward_std": 0.5705854464322329,
"rewards/accuracy_reward": 0.1875000037252903,
"rewards/cosine_scaled_reward": -0.01452323398552835,
"step": 174
},
{
"clip_fraction": 0.0,
"completion_length": 2315.5000190734863,
"epoch": 0.2,
"grad_norm": 0.025843625888228416,
"kl": 0.0001019798219203949,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.4554267916537495e-07,
"loss": -0.0411,
"reward": 0.044883210211992264,
"reward_after_mean": 0.044883210211992264,
"reward_after_std": 0.3484018575400114,
"reward_before_mean": 0.3673110632225871,
"reward_before_std": 0.2589006684720516,
"reward_change_max": 0.0,
"reward_change_mean": -0.322427861392498,
"reward_change_min": -0.4825479593127966,
"reward_change_std": 0.1801274660974741,
"reward_std": 0.34840187057852745,
"rewards/accuracy_reward": 0.2916666679084301,
"rewards/cosine_scaled_reward": 0.07564437948167324,
"step": 175
},
{
"clip_fraction": 0.0,
"completion_length": 2629.7708740234375,
"epoch": 0.20114285714285715,
"grad_norm": 0.026233607903122902,
"kl": 0.00010827556252479553,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.4216149583350755e-07,
"loss": 0.0766,
"reward": 0.07939248671755195,
"reward_after_mean": 0.07939248671755195,
"reward_after_std": 0.6790229994803667,
"reward_before_mean": 0.3831321783363819,
"reward_before_std": 0.7209639446809888,
"reward_change_max": 0.0,
"reward_change_mean": -0.3037397023290396,
"reward_change_min": -0.5897121950984001,
"reward_change_std": 0.23858889937400818,
"reward_std": 0.6790230087935925,
"rewards/accuracy_reward": 0.3541666753590107,
"rewards/cosine_scaled_reward": 0.028965501580387354,
"step": 176
},
{
"clip_fraction": 0.0,
"completion_length": 2575.916702270508,
"epoch": 0.2022857142857143,
"grad_norm": 0.025286749005317688,
"kl": 0.00016096234321594238,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.3890454406082956e-07,
"loss": -0.0311,
"reward": -0.19152541272342205,
"reward_after_mean": -0.19152541272342205,
"reward_after_std": 0.48578111454844475,
"reward_before_mean": 0.033257571049034595,
"reward_before_std": 0.47964665945619345,
"reward_change_max": 0.0,
"reward_change_mean": -0.22478299029171467,
"reward_change_min": -0.4445956815034151,
"reward_change_std": 0.16561621148139238,
"reward_std": 0.4857811164110899,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.11257576791103929,
"step": 177
},
{
"clip_fraction": 0.0,
"completion_length": 2311.7291870117188,
"epoch": 0.20342857142857143,
"grad_norm": 0.027437448501586914,
"kl": 0.00011655688285827637,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.3577281594640182e-07,
"loss": -0.0543,
"reward": 0.1088696513324976,
"reward_after_mean": 0.1088696513324976,
"reward_after_std": 0.469313045963645,
"reward_before_mean": 0.44181622844189405,
"reward_before_std": 0.41828347370028496,
"reward_change_max": 0.0,
"reward_change_mean": -0.3329465799033642,
"reward_change_min": -0.4839160367846489,
"reward_change_std": 0.19409743417054415,
"reward_std": 0.4693130645900965,
"rewards/accuracy_reward": 0.33333334140479565,
"rewards/cosine_scaled_reward": 0.10848287865519524,
"step": 178
},
{
"clip_fraction": 0.0,
"completion_length": 2721.7500610351562,
"epoch": 0.20457142857142857,
"grad_norm": 0.022730229422450066,
"kl": 0.00014713406562805176,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.3276726544494571e-07,
"loss": 0.034,
"reward": -0.2384864166378975,
"reward_after_mean": -0.2384864166378975,
"reward_after_std": 0.48379051871597767,
"reward_before_mean": -0.037665948970243335,
"reward_before_std": 0.4321022112853825,
"reward_change_max": 0.0,
"reward_change_mean": -0.20082047581672668,
"reward_change_min": -0.33639476634562016,
"reward_change_std": 0.12049192376434803,
"reward_std": 0.48379052244126797,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/cosine_scaled_reward": -0.12099928548559546,
"step": 179
},
{
"clip_fraction": 0.0,
"completion_length": 2094.208354949951,
"epoch": 0.2057142857142857,
"grad_norm": 0.0351104699075222,
"kl": 0.00013990700244903564,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.2988880807625927e-07,
"loss": -0.0636,
"reward": 0.09567644819617271,
"reward_after_mean": 0.09567644819617271,
"reward_after_std": 0.28684780560433865,
"reward_before_mean": 0.44278225488960743,
"reward_before_std": 0.15074736287351698,
"reward_change_max": 0.0,
"reward_change_mean": -0.3471058327704668,
"reward_change_min": -0.45528485253453255,
"reward_change_std": 0.17486567981541157,
"reward_std": 0.2868478149175644,
"rewards/accuracy_reward": 0.375,
"rewards/cosine_scaled_reward": 0.06778226979076862,
"step": 180
},
{
"clip_fraction": 0.0,
"completion_length": 3052.812530517578,
"epoch": 0.20685714285714285,
"grad_norm": 0.0223699901252985,
"kl": 0.00018121302127838135,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.0092,
"reward": -0.17876180354505777,
"reward_after_mean": -0.17876180354505777,
"reward_after_std": 0.3157486580312252,
"reward_before_mean": 0.0692460760474205,
"reward_before_std": 0.24795555789023638,
"reward_change_max": 0.0,
"reward_change_mean": -0.24800788797438145,
"reward_change_min": -0.3895928617566824,
"reward_change_std": 0.14537531603127718,
"reward_std": 0.31574865989387035,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.09742058239498874,
"step": 181
},
{
"clip_fraction": 0.0,
"completion_length": 1716.9791870117188,
"epoch": 0.208,
"grad_norm": 0.02780199609696865,
"kl": 6.859749555587769e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.2451664098030743e-07,
"loss": -0.077,
"reward": 0.143024992197752,
"reward_after_mean": 0.143024992197752,
"reward_after_std": 0.6203626422211528,
"reward_before_mean": 0.47820378467440605,
"reward_before_std": 0.6523203919641674,
"reward_change_max": 0.0,
"reward_change_mean": -0.3351787682622671,
"reward_change_min": -0.6069156751036644,
"reward_change_std": 0.24839763902127743,
"reward_std": 0.620362657122314,
"rewards/accuracy_reward": 0.3958333395421505,
"rewards/cosine_scaled_reward": 0.08237039996311069,
"step": 182
},
{
"clip_fraction": 0.0,
"completion_length": 1873.270881652832,
"epoch": 0.20914285714285713,
"grad_norm": 0.032433025538921356,
"kl": 7.05718994140625e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.220245676671809e-07,
"loss": -0.0625,
"reward": 0.04434862919151783,
"reward_after_mean": 0.04434862919151783,
"reward_after_std": 0.5306362751871347,
"reward_before_mean": 0.3466955330222845,
"reward_before_std": 0.5030294321477413,
"reward_change_max": 0.0,
"reward_change_mean": -0.3023468852043152,
"reward_change_min": -0.5408249255269766,
"reward_change_std": 0.2029698370024562,
"reward_std": 0.5306362900882959,
"rewards/accuracy_reward": 0.2916666716337204,
"rewards/cosine_scaled_reward": 0.05502885114401579,
"step": 183
},
{
"clip_fraction": 0.0,
"completion_length": 2780.0000038146973,
"epoch": 0.2102857142857143,
"grad_norm": 0.03345809876918793,
"kl": 0.00015169382095336914,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.1966285981663407e-07,
"loss": -0.0395,
"reward": -0.2764766328036785,
"reward_after_mean": -0.2764766328036785,
"reward_after_std": 0.28763100504875183,
"reward_before_mean": -0.05617565102875233,
"reward_before_std": 0.27192449755966663,
"reward_change_max": 0.0,
"reward_change_mean": -0.22030097991228104,
"reward_change_min": -0.33715784922242165,
"reward_change_std": 0.13025930058211088,
"reward_std": 0.2876310106366873,
"rewards/accuracy_reward": 0.1041666716337204,
"rewards/cosine_scaled_reward": -0.16034232266247272,
"step": 184
},
{
"clip_fraction": 0.0,
"completion_length": 2354.2083587646484,
"epoch": 0.21142857142857144,
"grad_norm": 0.02904898300766945,
"kl": 8.22991132736206e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.1743223682775649e-07,
"loss": -0.0195,
"reward": -0.06851540505886078,
"reward_after_mean": -0.06851540505886078,
"reward_after_std": 0.4076218158006668,
"reward_before_mean": 0.20718638133257627,
"reward_before_std": 0.3651102539151907,
"reward_change_max": 0.0,
"reward_change_mean": -0.27570181526243687,
"reward_change_min": -0.4055694956332445,
"reward_change_std": 0.15835797414183617,
"reward_std": 0.40762182511389256,
"rewards/accuracy_reward": 0.22916666977107525,
"rewards/cosine_scaled_reward": -0.021980268880724907,
"step": 185
},
{
"clip_fraction": 0.0,
"completion_length": 2604.5625,
"epoch": 0.21257142857142858,
"grad_norm": 0.020262470468878746,
"kl": 0.00013430416584014893,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.1533337816991931e-07,
"loss": -0.0241,
"reward": -0.04451223462820053,
"reward_after_mean": -0.04451223462820053,
"reward_after_std": 0.5044716745615005,
"reward_before_mean": 0.22654481540666893,
"reward_before_std": 0.450691357254982,
"reward_change_max": 0.0,
"reward_change_mean": -0.27105705067515373,
"reward_change_min": -0.45378575660288334,
"reward_change_std": 0.16853850428014994,
"reward_std": 0.5044716857373714,
"rewards/accuracy_reward": 0.25000000186264515,
"rewards/cosine_scaled_reward": -0.023455200716853142,
"step": 186
},
{
"clip_fraction": 0.0,
"completion_length": 2542.2917098999023,
"epoch": 0.21371428571428572,
"grad_norm": 0.024545790627598763,
"kl": 0.0001626908779144287,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.1336692317580158e-07,
"loss": 0.0473,
"reward": -0.1829390935599804,
"reward_after_mean": -0.1829390935599804,
"reward_after_std": 0.3984376899898052,
"reward_before_mean": 0.05813688226044178,
"reward_before_std": 0.3919172268360853,
"reward_change_max": 0.0,
"reward_change_mean": -0.24107596464455128,
"reward_change_min": -0.4061664007604122,
"reward_change_std": 0.15977757051587105,
"reward_std": 0.3984377086162567,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.08769645728170872,
"step": 187
},
{
"clip_fraction": 0.0,
"completion_length": 3444.750030517578,
"epoch": 0.21485714285714286,
"grad_norm": 0.017645789310336113,
"kl": 0.00020623207092285156,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.1153347084664419e-07,
"loss": 0.0223,
"reward": -0.1741393506526947,
"reward_after_mean": -0.1741393506526947,
"reward_after_std": 0.3922067657113075,
"reward_before_mean": 0.06998030468821526,
"reward_before_std": 0.3800971172749996,
"reward_change_max": 0.0,
"reward_change_mean": -0.24411965906620026,
"reward_change_min": -0.40364984050393105,
"reward_change_std": 0.1577291926369071,
"reward_std": 0.3922067675739527,
"rewards/accuracy_reward": 0.1458333358168602,
"rewards/cosine_scaled_reward": -0.07585303112864494,
"step": 188
},
{
"clip_fraction": 0.0,
"completion_length": 1966.9375305175781,
"epoch": 0.216,
"grad_norm": 0.034927211701869965,
"kl": 9.391456842422485e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.0983357966978745e-07,
"loss": -0.0265,
"reward": -0.12510699033737183,
"reward_after_mean": -0.12510699033737183,
"reward_after_std": 0.5509497374296188,
"reward_before_mean": 0.10953010153025389,
"reward_before_std": 0.5270057059824467,
"reward_change_max": 0.0,
"reward_change_mean": -0.23463710024952888,
"reward_change_min": -0.40135206654667854,
"reward_change_std": 0.15052294824272394,
"reward_std": 0.5509497616440058,
"rewards/accuracy_reward": 0.1875000074505806,
"rewards/cosine_scaled_reward": -0.07796989846974611,
"step": 189
},
{
"clip_fraction": 0.0,
"completion_length": 2617.0416946411133,
"epoch": 0.21714285714285714,
"grad_norm": 0.026627399027347565,
"kl": 0.00012832880020141602,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.0826776744855121e-07,
"loss": 0.0292,
"reward": 0.10528106242418289,
"reward_after_mean": 0.10528106242418289,
"reward_after_std": 0.6541636940091848,
"reward_before_mean": 0.4219757579267025,
"reward_before_std": 0.6895054774358869,
"reward_change_max": 0.0,
"reward_change_mean": -0.3166946694254875,
"reward_change_min": -0.6117767505347729,
"reward_change_std": 0.24059188924729824,
"reward_std": 0.6541637200862169,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/cosine_scaled_reward": 0.08864239510148764,
"step": 190
},
{
"clip_fraction": 0.0,
"completion_length": 2205.750030517578,
"epoch": 0.21828571428571428,
"grad_norm": 0.029876256361603737,
"kl": 0.00012889504432678223,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.068365111445064e-07,
"loss": 0.0117,
"reward": 0.07531885802745819,
"reward_after_mean": 0.07531885802745819,
"reward_after_std": 0.4699189569801092,
"reward_before_mean": 0.39738621190190315,
"reward_before_std": 0.4458512868732214,
"reward_change_max": 0.0,
"reward_change_mean": -0.32206736505031586,
"reward_change_min": -0.537959199398756,
"reward_change_std": 0.20273534674197435,
"reward_std": 0.46991895884275436,
"rewards/accuracy_reward": 0.3333333395421505,
"rewards/cosine_scaled_reward": 0.06405287701636553,
"step": 191
},
{
"clip_fraction": 0.0,
"completion_length": 2956.979263305664,
"epoch": 0.21942857142857142,
"grad_norm": 0.02069353684782982,
"kl": 0.0001544952392578125,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.0554024673218806e-07,
"loss": -0.0794,
"reward": -0.16378409788012505,
"reward_after_mean": -0.16378409788012505,
"reward_after_std": 0.33716665115207434,
"reward_before_mean": 0.08590960502624512,
"reward_before_std": 0.2754701506346464,
"reward_change_max": 0.0,
"reward_change_mean": -0.24969367682933807,
"reward_change_min": -0.4076712429523468,
"reward_change_std": 0.14498529862612486,
"reward_std": 0.33716665860265493,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/cosine_scaled_reward": -0.08075705729424953,
"step": 192
},
{
"clip_fraction": 0.0,
"completion_length": 2565.6459045410156,
"epoch": 0.22057142857142858,
"grad_norm": 0.02433474361896515,
"kl": 0.00013750791549682617,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.0119,
"reward": -0.11538952589035034,
"reward_after_mean": -0.11538952589035034,
"reward_after_std": 0.4653550498187542,
"reward_before_mean": 0.14366911351680756,
"reward_before_std": 0.47822121903300285,
"reward_change_max": 0.0,
"reward_change_mean": -0.25905864126980305,
"reward_change_min": -0.4472095873206854,
"reward_change_std": 0.18343419581651688,
"reward_std": 0.46535505168139935,
"rewards/accuracy_reward": 0.2083333358168602,
"rewards/cosine_scaled_reward": -0.06466422416269779,
"step": 193
},
{
"clip_fraction": 0.0,
"completion_length": 2630.2708587646484,
"epoch": 0.22171428571428572,
"grad_norm": 0.021205004304647446,
"kl": 0.00015100836753845215,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.0335423176140511e-07,
"loss": -0.014,
"reward": 0.5263579860329628,
"reward_after_mean": 0.5263579860329628,
"reward_after_std": 0.5279496256262064,
"reward_before_mean": 1.0014060586690903,
"reward_before_std": 0.43055359087884426,
"reward_change_max": 0.0,
"reward_change_mean": -0.47504812479019165,
"reward_change_min": -0.7046387940645218,
"reward_change_std": 0.2720959987491369,
"reward_std": 0.5279496368020773,
"rewards/accuracy_reward": 0.6250000111758709,
"rewards/cosine_scaled_reward": 0.37640603724867105,
"step": 194
},
{
"clip_fraction": 0.0,
"completion_length": 2478.1250381469727,
"epoch": 0.22285714285714286,
"grad_norm": 0.02273085154592991,
"kl": 0.00011633709073066711,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.0246514708427701e-07,
"loss": -0.0778,
"reward": -0.03121163323521614,
"reward_after_mean": -0.03121163323521614,
"reward_after_std": 0.542201291769743,
"reward_before_mean": 0.2403367217630148,
"reward_before_std": 0.49494979437440634,
"reward_change_max": 0.0,
"reward_change_mean": -0.27154832519590855,
"reward_change_min": -0.47133193723857403,
"reward_change_std": 0.1756760822609067,
"reward_std": 0.5422013197094202,
"rewards/accuracy_reward": 0.25000000186264515,
"rewards/cosine_scaled_reward": -0.009663309436291456,
"step": 195
},
{
"clip_fraction": 0.0,
"completion_length": 3269.354217529297,
"epoch": 0.224,
"grad_norm": 0.02504117041826248,
"kl": 0.00018531084060668945,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.017123858587145e-07,
"loss": 0.0545,
"reward": -0.16638334095478058,
"reward_after_mean": -0.16638334095478058,
"reward_after_std": 0.3524969248101115,
"reward_before_mean": 0.08567467518150806,
"reward_before_std": 0.33011660259217024,
"reward_change_max": 0.0,
"reward_change_mean": -0.25205800868570805,
"reward_change_min": -0.38781771063804626,
"reward_change_std": 0.15303326025605202,
"reward_std": 0.3524969294667244,
"rewards/accuracy_reward": 0.1875000074505806,
"rewards/cosine_scaled_reward": -0.10182532295584679,
"step": 196
},
{
"clip_fraction": 0.0,
"completion_length": 2431.6667404174805,
"epoch": 0.22514285714285714,
"grad_norm": 0.03106440044939518,
"kl": 0.0001646280288696289,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.0109617738307911e-07,
"loss": -0.0034,
"reward": 0.3956623272970319,
"reward_after_mean": 0.3956623272970319,
"reward_after_std": 0.9187420383095741,
"reward_before_mean": 0.7849392527714372,
"reward_before_std": 0.980433851480484,
"reward_change_max": 0.0,
"reward_change_mean": -0.3892769180238247,
"reward_change_min": -0.7530074659734964,
"reward_change_std": 0.3080608732998371,
"reward_std": 0.9187420606613159,
"rewards/accuracy_reward": 0.5416666809469461,
"rewards/cosine_scaled_reward": 0.24327257159166038,
"step": 197
},
{
"clip_fraction": 0.0,
"completion_length": 2481.4375534057617,
"epoch": 0.22628571428571428,
"grad_norm": 0.029221223667263985,
"kl": 0.00012829899787902832,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.0061670936044178e-07,
"loss": 0.0738,
"reward": -0.051010750234127045,
"reward_after_mean": -0.051010750234127045,
"reward_after_std": 0.5678279399871826,
"reward_before_mean": 0.21409518271684647,
"reward_before_std": 0.5671654343605042,
"reward_change_max": 0.0,
"reward_change_mean": -0.2651059068739414,
"reward_change_min": -0.44747380912303925,
"reward_change_std": 0.18114776257425547,
"reward_std": 0.5678279716521502,
"rewards/accuracy_reward": 0.25000000558793545,
"rewards/cosine_scaled_reward": -0.03590485081076622,
"step": 198
},
{
"clip_fraction": 0.0,
"completion_length": 3558.1041870117188,
"epoch": 0.22742857142857142,
"grad_norm": 0.017193729057908058,
"kl": 0.0001755356788635254,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.002741278414069e-07,
"loss": 0.0034,
"reward": -0.16698902659118176,
"reward_after_mean": -0.16698902659118176,
"reward_after_std": 0.5533907692879438,
"reward_before_mean": 0.055191148683661595,
"reward_before_std": 0.5313135031610727,
"reward_change_max": 0.0,
"reward_change_mean": -0.2221801672130823,
"reward_change_min": -0.39364006742835045,
"reward_change_std": 0.14498097822070122,
"reward_std": 0.5533907972276211,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/cosine_scaled_reward": -0.11147552821785212,
"step": 199
},
{
"clip_fraction": 0.0,
"completion_length": 1830.0000686645508,
"epoch": 0.22857142857142856,
"grad_norm": 0.028155136853456497,
"kl": 9.66787338256836e-05,
"lambda_div_used": 0.7000000000000001,
"learning_rate": 1.0006853717962393e-07,
"loss": -0.0584,
"reward": 0.29632306285202503,
"reward_after_mean": 0.29632306285202503,
"reward_after_std": 0.5755883920937777,
"reward_before_mean": 0.6877593696117401,
"reward_before_std": 0.5522297900170088,
"reward_change_max": 0.0,
"reward_change_mean": -0.39143630117177963,
"reward_change_min": -0.6200563348829746,
"reward_change_std": 0.24899613857269287,
"reward_std": 0.5755884237587452,
"rewards/accuracy_reward": 0.4583333469927311,
"rewards/cosine_scaled_reward": 0.22942602587863803,
"step": 200
},
{
"epoch": 0.22857142857142856,
"step": 200,
"total_flos": 0.0,
"train_loss": 0.009834988048532977,
"train_runtime": 61157.4359,
"train_samples_per_second": 0.157,
"train_steps_per_second": 0.003
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}