| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.5714285714285714, |
| "eval_steps": 500, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2253.854206085205, |
| "epoch": 0.001142857142857143, |
| "grad_norm": 0.029786353930830956, |
| "kl": 0.0, |
| "lambda_div_used": 0.6170438528060913, |
| "learning_rate": 0.0, |
| "loss": -0.0476, |
| "reward": 0.09989889524877071, |
| "reward_after_mean": 0.09989889524877071, |
| "reward_after_std": 0.6247774921357632, |
| "reward_before_mean": 0.5353203006088734, |
| "reward_before_std": 0.5411310354247689, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4354214407503605, |
| "reward_change_min": -0.6615581586956978, |
| "reward_change_std": 0.2600514395162463, |
| "reward_std": 0.6247775163501501, |
| "rewards/accuracy_reward": 0.37500000931322575, |
| "rewards/cosine_scaled_reward": 0.16032031644135714, |
| "step": 1 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2566.395854949951, |
| "epoch": 0.002285714285714286, |
| "grad_norm": 0.025455903261899948, |
| "kl": 0.0, |
| "lambda_div_used": 0.6156510338187218, |
| "learning_rate": 5e-08, |
| "loss": 0.0349, |
| "reward": 0.10292071849107742, |
| "reward_after_mean": 0.10292071849107742, |
| "reward_after_std": 0.598213616758585, |
| "reward_before_mean": 0.5439198296517134, |
| "reward_before_std": 0.5335724893957376, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.440999086946249, |
| "reward_change_min": -0.6536596268415451, |
| "reward_change_std": 0.2629614323377609, |
| "reward_std": 0.5982136316597462, |
| "rewards/accuracy_reward": 0.41666667722165585, |
| "rewards/cosine_scaled_reward": 0.12725313939154148, |
| "step": 2 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2808.2083740234375, |
| "epoch": 0.0034285714285714284, |
| "grad_norm": 0.026795223355293274, |
| "kl": 0.00017423927783966064, |
| "lambda_div_used": 0.599299855530262, |
| "learning_rate": 1e-07, |
| "loss": -0.011, |
| "reward": -0.23149854317307472, |
| "reward_after_mean": -0.23149854317307472, |
| "reward_after_std": 0.4924859032034874, |
| "reward_before_mean": 0.05049763061106205, |
| "reward_before_std": 0.4575129607692361, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28199618123471737, |
| "reward_change_min": -0.46816934645175934, |
| "reward_change_std": 0.17736693751066923, |
| "reward_std": 0.49248590879142284, |
| "rewards/accuracy_reward": 0.14583333767950535, |
| "rewards/cosine_scaled_reward": -0.09533570008352399, |
| "step": 3 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1537.4583435058594, |
| "epoch": 0.004571428571428572, |
| "grad_norm": 0.027395043522119522, |
| "kl": 8.840858936309814e-05, |
| "lambda_div_used": 0.6135994121432304, |
| "learning_rate": 1.5e-07, |
| "loss": 0.0364, |
| "reward": 0.19573484233114868, |
| "reward_after_mean": 0.19573484233114868, |
| "reward_after_std": 0.5626182612031698, |
| "reward_before_mean": 0.6533232685178518, |
| "reward_before_std": 0.5161786610260606, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4575884137302637, |
| "reward_change_min": -0.662933848798275, |
| "reward_change_std": 0.26597225945442915, |
| "reward_std": 0.5626182779669762, |
| "rewards/accuracy_reward": 0.45833334885537624, |
| "rewards/cosine_scaled_reward": 0.19498991407454014, |
| "step": 4 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2930.541748046875, |
| "epoch": 0.005714285714285714, |
| "grad_norm": 0.021351713687181473, |
| "kl": 0.00014585256576538086, |
| "lambda_div_used": 0.618914432823658, |
| "learning_rate": 2e-07, |
| "loss": 0.0298, |
| "reward": -0.07733920076861978, |
| "reward_after_mean": -0.07733920076861978, |
| "reward_after_std": 0.6453567277640104, |
| "reward_before_mean": 0.276840849313885, |
| "reward_before_std": 0.5498588550835848, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3541800267994404, |
| "reward_change_min": -0.5702032893896103, |
| "reward_change_std": 0.21250940579921007, |
| "reward_std": 0.6453567445278168, |
| "rewards/accuracy_reward": 0.2708333358168602, |
| "rewards/cosine_scaled_reward": 0.0060075074434280396, |
| "step": 5 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2457.187515258789, |
| "epoch": 0.006857142857142857, |
| "grad_norm": 0.03508686274290085, |
| "kl": 0.00010536611080169678, |
| "lambda_div_used": 0.6270971074700356, |
| "learning_rate": 2.5e-07, |
| "loss": -0.0326, |
| "reward": -0.08803003467619419, |
| "reward_after_mean": -0.08803003467619419, |
| "reward_after_std": 0.6014832425862551, |
| "reward_before_mean": 0.21033997228369117, |
| "reward_before_std": 0.5929712019860744, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.29837000742554665, |
| "reward_change_min": -0.5160593837499619, |
| "reward_change_std": 0.2064626282081008, |
| "reward_std": 0.601483253762126, |
| "rewards/accuracy_reward": 0.22916666977107525, |
| "rewards/cosine_scaled_reward": -0.01882670260965824, |
| "step": 6 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2581.604263305664, |
| "epoch": 0.008, |
| "grad_norm": 0.02246660739183426, |
| "kl": 0.00012472271919250488, |
| "lambda_div_used": 0.6219364404678345, |
| "learning_rate": 3e-07, |
| "loss": -0.0131, |
| "reward": -0.06724199093878269, |
| "reward_after_mean": -0.06724199093878269, |
| "reward_after_std": 0.5799425262957811, |
| "reward_before_mean": 0.2561297030188143, |
| "reward_before_std": 0.5679045412689447, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3233717121183872, |
| "reward_change_min": -0.5483178608119488, |
| "reward_change_std": 0.213481605052948, |
| "reward_std": 0.5799425337463617, |
| "rewards/accuracy_reward": 0.27083334140479565, |
| "rewards/cosine_scaled_reward": -0.014703631401062012, |
| "step": 7 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1865.0625228881836, |
| "epoch": 0.009142857142857144, |
| "grad_norm": 0.0264554712921381, |
| "kl": 7.349252700805664e-05, |
| "lambda_div_used": 0.6559017673134804, |
| "learning_rate": 3.5e-07, |
| "loss": 0.0012, |
| "reward": 0.30751039180904627, |
| "reward_after_mean": 0.30751039180904627, |
| "reward_after_std": 0.768000740557909, |
| "reward_before_mean": 0.7620646432042122, |
| "reward_before_std": 0.7257685504155234, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4545542187988758, |
| "reward_change_min": -0.6947779208421707, |
| "reward_change_std": 0.28947674110531807, |
| "reward_std": 0.7680007480084896, |
| "rewards/accuracy_reward": 0.5208333469927311, |
| "rewards/cosine_scaled_reward": 0.24123129644431174, |
| "step": 8 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2611.187515258789, |
| "epoch": 0.010285714285714285, |
| "grad_norm": 0.02845916524529457, |
| "kl": 0.00014799833297729492, |
| "lambda_div_used": 0.6612376719713211, |
| "learning_rate": 4e-07, |
| "loss": -0.0083, |
| "reward": 0.022774726152420044, |
| "reward_after_mean": 0.022774726152420044, |
| "reward_after_std": 0.7959331478923559, |
| "reward_before_mean": 0.3162382678128779, |
| "reward_before_std": 0.7569107804447412, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.29346355609595776, |
| "reward_change_min": -0.4966486766934395, |
| "reward_change_std": 0.20045297034084797, |
| "reward_std": 0.7959331627935171, |
| "rewards/accuracy_reward": 0.25000000186264515, |
| "rewards/cosine_scaled_reward": 0.06623826455324888, |
| "step": 9 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2326.8541870117188, |
| "epoch": 0.011428571428571429, |
| "grad_norm": 0.02597963623702526, |
| "kl": 0.0001014266163110733, |
| "lambda_div_used": 0.6026739403605461, |
| "learning_rate": 4.5e-07, |
| "loss": 0.012, |
| "reward": 0.08584612235426903, |
| "reward_after_mean": 0.08584612235426903, |
| "reward_after_std": 0.5247625019401312, |
| "reward_before_mean": 0.5338415652513504, |
| "reward_before_std": 0.4742008354514837, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4479954708367586, |
| "reward_change_min": -0.6653710156679153, |
| "reward_change_std": 0.2726733274757862, |
| "reward_std": 0.5247625187039375, |
| "rewards/accuracy_reward": 0.3958333432674408, |
| "rewards/cosine_scaled_reward": 0.13800824619829655, |
| "step": 10 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3257.875045776367, |
| "epoch": 0.012571428571428572, |
| "grad_norm": 0.01903906650841236, |
| "kl": 0.00015848875045776367, |
| "lambda_div_used": 0.6304982751607895, |
| "learning_rate": 5e-07, |
| "loss": 0.037, |
| "reward": -0.19499589689075947, |
| "reward_after_mean": -0.19499589689075947, |
| "reward_after_std": 0.6491430383175611, |
| "reward_before_mean": 0.052283127792179585, |
| "reward_before_std": 0.5964901968836784, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.24727902933955193, |
| "reward_change_min": -0.40677722357213497, |
| "reward_change_std": 0.14844622276723385, |
| "reward_std": 0.649143049493432, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.11438353173434734, |
| "step": 11 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1922.6458587646484, |
| "epoch": 0.013714285714285714, |
| "grad_norm": 0.027084212750196457, |
| "kl": 0.00012683868408203125, |
| "lambda_div_used": 0.6053123474121094, |
| "learning_rate": 5.5e-07, |
| "loss": 0.0637, |
| "reward": -0.10656415252014995, |
| "reward_after_mean": -0.10656415252014995, |
| "reward_after_std": 0.5768749956041574, |
| "reward_before_mean": 0.25147354789078236, |
| "reward_before_std": 0.4855317808687687, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.35803768038749695, |
| "reward_change_min": -0.5767498537898064, |
| "reward_change_std": 0.21485394705086946, |
| "reward_std": 0.5768750291317701, |
| "rewards/accuracy_reward": 0.25000000186264515, |
| "rewards/cosine_scaled_reward": 0.0014735234435647726, |
| "step": 12 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2585.979217529297, |
| "epoch": 0.014857142857142857, |
| "grad_norm": 0.02171475626528263, |
| "kl": 0.00012777745723724365, |
| "lambda_div_used": 0.5844379514455795, |
| "learning_rate": 6e-07, |
| "loss": 0.0567, |
| "reward": -0.11025732010602951, |
| "reward_after_mean": -0.11025732010602951, |
| "reward_after_std": 0.4723772555589676, |
| "reward_before_mean": 0.2753620855510235, |
| "reward_before_std": 0.39071971736848354, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3856194168329239, |
| "reward_change_min": -0.5897834450006485, |
| "reward_change_std": 0.2299499223008752, |
| "reward_std": 0.4723772667348385, |
| "rewards/accuracy_reward": 0.2708333358168602, |
| "rewards/cosine_scaled_reward": 0.004528738558292389, |
| "step": 13 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2412.1250534057617, |
| "epoch": 0.016, |
| "grad_norm": 0.024103164672851562, |
| "kl": 0.00015282630920410156, |
| "lambda_div_used": 0.6067279502749443, |
| "learning_rate": 6.5e-07, |
| "loss": -0.0015, |
| "reward": -0.08361193258315325, |
| "reward_after_mean": -0.08361193258315325, |
| "reward_after_std": 0.5700989812612534, |
| "reward_before_mean": 0.27934680134058, |
| "reward_before_std": 0.49509103409945965, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.36295875161886215, |
| "reward_change_min": -0.6321466080844402, |
| "reward_change_std": 0.2297183210030198, |
| "reward_std": 0.570098988711834, |
| "rewards/accuracy_reward": 0.2708333358168602, |
| "rewards/cosine_scaled_reward": 0.0085134650580585, |
| "step": 14 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2612.5833778381348, |
| "epoch": 0.017142857142857144, |
| "grad_norm": 0.02622115984559059, |
| "kl": 0.00012434273958206177, |
| "lambda_div_used": 0.5387627929449081, |
| "learning_rate": 7e-07, |
| "loss": -0.0354, |
| "reward": -0.03696875274181366, |
| "reward_after_mean": -0.03696875274181366, |
| "reward_after_std": 0.3815008979290724, |
| "reward_before_mean": 0.5313108433037996, |
| "reward_before_std": 0.17025252804160118, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5682796090841293, |
| "reward_change_min": -0.7619944997131824, |
| "reward_change_std": 0.2890887148678303, |
| "reward_std": 0.3815009109675884, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/cosine_scaled_reward": 0.15631086938083172, |
| "step": 15 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3487.4583435058594, |
| "epoch": 0.018285714285714287, |
| "grad_norm": 0.018314050510525703, |
| "kl": 0.00019049644470214844, |
| "lambda_div_used": 0.5788475871086121, |
| "learning_rate": 7.5e-07, |
| "loss": 0.0111, |
| "reward": -0.1648220755159855, |
| "reward_after_mean": -0.1648220755159855, |
| "reward_after_std": 0.3886314034461975, |
| "reward_before_mean": 0.18453767150640488, |
| "reward_before_std": 0.3589506670832634, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.34935975447297096, |
| "reward_change_min": -0.5136087462306023, |
| "reward_change_std": 0.2073321659117937, |
| "reward_std": 0.38863140903413296, |
| "rewards/accuracy_reward": 0.2083333432674408, |
| "rewards/cosine_scaled_reward": -0.023795653134584427, |
| "step": 16 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1923.895851135254, |
| "epoch": 0.019428571428571427, |
| "grad_norm": 0.03498728573322296, |
| "kl": 0.00010513514280319214, |
| "lambda_div_used": 0.6301147192716599, |
| "learning_rate": 8e-07, |
| "loss": 0.0516, |
| "reward": 0.24809654615819454, |
| "reward_after_mean": 0.24809654615819454, |
| "reward_after_std": 0.7150995936244726, |
| "reward_before_mean": 0.749116275459528, |
| "reward_before_std": 0.5986730419099331, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5010197218507528, |
| "reward_change_min": -0.7233205642551184, |
| "reward_change_std": 0.2854560799896717, |
| "reward_std": 0.7150996085256338, |
| "rewards/accuracy_reward": 0.5208333432674408, |
| "rewards/cosine_scaled_reward": 0.2282829141477123, |
| "step": 17 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2648.3334197998047, |
| "epoch": 0.02057142857142857, |
| "grad_norm": 0.019363267347216606, |
| "kl": 0.00012411177158355713, |
| "lambda_div_used": 0.5733960121870041, |
| "learning_rate": 8.499999999999999e-07, |
| "loss": 0.0156, |
| "reward": -4.522036761045456e-05, |
| "reward_after_mean": -4.522036761045456e-05, |
| "reward_after_std": 0.491512268781662, |
| "reward_before_mean": 0.4940829328261316, |
| "reward_before_std": 0.33302280586212873, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.49412815272808075, |
| "reward_change_min": -0.6739438865333796, |
| "reward_change_std": 0.25992031022906303, |
| "reward_std": 0.4915122911334038, |
| "rewards/accuracy_reward": 0.37500000558793545, |
| "rewards/cosine_scaled_reward": 0.11908289603888988, |
| "step": 18 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2037.93754196167, |
| "epoch": 0.021714285714285714, |
| "grad_norm": 0.0254252627491951, |
| "kl": 0.00010475516319274902, |
| "lambda_div_used": 0.5972657427191734, |
| "learning_rate": 9e-07, |
| "loss": -0.0277, |
| "reward": 0.3277764454251155, |
| "reward_after_mean": 0.3277764454251155, |
| "reward_after_std": 0.7013481389731169, |
| "reward_before_mean": 0.9843392036855221, |
| "reward_before_std": 0.44702923856675625, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.6565627679228783, |
| "reward_change_min": -0.9309929609298706, |
| "reward_change_std": 0.3530767587944865, |
| "reward_std": 0.7013481538742781, |
| "rewards/accuracy_reward": 0.6041666697710752, |
| "rewards/cosine_scaled_reward": 0.3801724927034229, |
| "step": 19 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1404.291706085205, |
| "epoch": 0.022857142857142857, |
| "grad_norm": 0.028516631573438644, |
| "kl": 5.914270877838135e-05, |
| "lambda_div_used": 0.6129282414913177, |
| "learning_rate": 9.499999999999999e-07, |
| "loss": 0.0391, |
| "reward": 0.26722301356494427, |
| "reward_after_mean": 0.26722301356494427, |
| "reward_after_std": 0.6774719897657633, |
| "reward_before_mean": 0.8377129100263119, |
| "reward_before_std": 0.5324758047936484, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5704899430274963, |
| "reward_change_min": -0.8648187033832073, |
| "reward_change_std": 0.3376352610066533, |
| "reward_std": 0.6774720121175051, |
| "rewards/accuracy_reward": 0.5416666716337204, |
| "rewards/cosine_scaled_reward": 0.29604623932391405, |
| "step": 20 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2424.6042137145996, |
| "epoch": 0.024, |
| "grad_norm": 0.032928258180618286, |
| "kl": 0.000138014554977417, |
| "lambda_div_used": 0.6448317095637321, |
| "learning_rate": 1e-06, |
| "loss": 0.0361, |
| "reward": 0.23116276413202286, |
| "reward_after_mean": 0.23116276413202286, |
| "reward_after_std": 0.7203101813793182, |
| "reward_before_mean": 0.6635394699405879, |
| "reward_before_std": 0.6762269856408238, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4323767013847828, |
| "reward_change_min": -0.7010968029499054, |
| "reward_change_std": 0.2771869823336601, |
| "reward_std": 0.7203102335333824, |
| "rewards/accuracy_reward": 0.43750000931322575, |
| "rewards/cosine_scaled_reward": 0.22603945806622505, |
| "step": 21 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1410.8750381469727, |
| "epoch": 0.025142857142857144, |
| "grad_norm": 0.03686099499464035, |
| "kl": 0.00010958313941955566, |
| "lambda_div_used": 0.5709755718708038, |
| "learning_rate": 9.99931462820376e-07, |
| "loss": -0.0637, |
| "reward": -0.13723512832075357, |
| "reward_after_mean": -0.13723512832075357, |
| "reward_after_std": 0.4940304774791002, |
| "reward_before_mean": 0.30008178018033504, |
| "reward_before_std": 0.32625696901232004, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4373169243335724, |
| "reward_change_min": -0.6162486486136913, |
| "reward_change_std": 0.23701479192823172, |
| "reward_std": 0.4940304830670357, |
| "rewards/accuracy_reward": 0.31250000186264515, |
| "rewards/cosine_scaled_reward": -0.012418218422681093, |
| "step": 22 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2354.062568664551, |
| "epoch": 0.026285714285714287, |
| "grad_norm": 0.025602027773857117, |
| "kl": 0.00011620670557022095, |
| "lambda_div_used": 0.6488568410277367, |
| "learning_rate": 9.997258721585931e-07, |
| "loss": 0.0419, |
| "reward": -0.012333650141954422, |
| "reward_after_mean": -0.012333650141954422, |
| "reward_after_std": 0.698169419541955, |
| "reward_before_mean": 0.27921401464845985, |
| "reward_before_std": 0.6954333996400237, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.29154767468571663, |
| "reward_change_min": -0.5527428761124611, |
| "reward_change_std": 0.2108509410172701, |
| "reward_std": 0.6981694512069225, |
| "rewards/accuracy_reward": 0.27083333767950535, |
| "rewards/cosine_scaled_reward": 0.008380686864256859, |
| "step": 23 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1993.1041870117188, |
| "epoch": 0.027428571428571427, |
| "grad_norm": 0.03096413053572178, |
| "kl": 8.243322372436523e-05, |
| "lambda_div_used": 0.6519145146012306, |
| "learning_rate": 9.993832906395582e-07, |
| "loss": 0.0795, |
| "reward": 0.11854812642559409, |
| "reward_after_mean": 0.11854812642559409, |
| "reward_after_std": 0.7567200511693954, |
| "reward_before_mean": 0.49915426783263683, |
| "reward_before_std": 0.7109423782676458, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.38060615211725235, |
| "reward_change_min": -0.7067882716655731, |
| "reward_change_std": 0.26373046822845936, |
| "reward_std": 0.7567200735211372, |
| "rewards/accuracy_reward": 0.3750000037252903, |
| "rewards/cosine_scaled_reward": 0.12415427155792713, |
| "step": 24 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2234.8333435058594, |
| "epoch": 0.02857142857142857, |
| "grad_norm": 0.022234002128243446, |
| "kl": 0.00014199316501617432, |
| "lambda_div_used": 0.6245157197117805, |
| "learning_rate": 9.989038226169207e-07, |
| "loss": 0.0107, |
| "reward": 0.08800357580184937, |
| "reward_after_mean": 0.08800357580184937, |
| "reward_after_std": 0.5656109545379877, |
| "reward_before_mean": 0.46775088645517826, |
| "reward_before_std": 0.5775847099721432, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.37974734231829643, |
| "reward_change_min": -0.6116136200726032, |
| "reward_change_std": 0.2511585932224989, |
| "reward_std": 0.5656109638512135, |
| "rewards/accuracy_reward": 0.354166679084301, |
| "rewards/cosine_scaled_reward": 0.1135842353105545, |
| "step": 25 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2473.708366394043, |
| "epoch": 0.029714285714285714, |
| "grad_norm": 0.02305966429412365, |
| "kl": 0.00014957785606384277, |
| "lambda_div_used": 0.57183438539505, |
| "learning_rate": 9.982876141412855e-07, |
| "loss": -0.0358, |
| "reward": -0.41022508684545755, |
| "reward_after_mean": -0.41022508684545755, |
| "reward_after_std": 0.3927479684352875, |
| "reward_before_mean": -0.15500983409583569, |
| "reward_before_std": 0.32340476755052805, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.25521524623036385, |
| "reward_change_min": -0.4159896522760391, |
| "reward_change_std": 0.1455942215397954, |
| "reward_std": 0.3927479758858681, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/cosine_scaled_reward": -0.21750983409583569, |
| "step": 26 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2344.5625762939453, |
| "epoch": 0.030857142857142857, |
| "grad_norm": 0.03154224529862404, |
| "kl": 0.00014576315879821777, |
| "lambda_div_used": 0.5766249001026154, |
| "learning_rate": 9.975348529157229e-07, |
| "loss": 0.1289, |
| "reward": -0.04079665243625641, |
| "reward_after_mean": -0.04079665243625641, |
| "reward_after_std": 0.4307698383927345, |
| "reward_before_mean": 0.39524078369140625, |
| "reward_before_std": 0.3457355350255966, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4360374417155981, |
| "reward_change_min": -0.6437131129205227, |
| "reward_change_std": 0.24637807440012693, |
| "reward_std": 0.4307698402553797, |
| "rewards/accuracy_reward": 0.3333333432674408, |
| "rewards/cosine_scaled_reward": 0.061907440423965454, |
| "step": 27 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2371.4375381469727, |
| "epoch": 0.032, |
| "grad_norm": 0.030750228092074394, |
| "kl": 0.00011971592903137207, |
| "lambda_div_used": 0.6046403273940086, |
| "learning_rate": 9.96645768238595e-07, |
| "loss": 0.0725, |
| "reward": 0.06598741095513105, |
| "reward_after_mean": 0.06598741095513105, |
| "reward_after_std": 0.6246800310909748, |
| "reward_before_mean": 0.5469000339508057, |
| "reward_before_std": 0.47788948379456997, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4809126127511263, |
| "reward_change_min": -0.6964126750826836, |
| "reward_change_std": 0.2671422157436609, |
| "reward_std": 0.6246800404042006, |
| "rewards/accuracy_reward": 0.4375000074505806, |
| "rewards/cosine_scaled_reward": 0.10940003173891455, |
| "step": 28 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2796.6250762939453, |
| "epoch": 0.03314285714285714, |
| "grad_norm": 0.022529419511556625, |
| "kl": 0.00014966726303100586, |
| "lambda_div_used": 0.5826017782092094, |
| "learning_rate": 9.956206309337066e-07, |
| "loss": -0.0443, |
| "reward": -0.2186606228351593, |
| "reward_after_mean": -0.2186606228351593, |
| "reward_after_std": 0.4133179672062397, |
| "reward_before_mean": 0.09334492683410645, |
| "reward_before_std": 0.3791744504123926, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3120055440813303, |
| "reward_change_min": -0.4988309144973755, |
| "reward_change_std": 0.19146351423114538, |
| "reward_std": 0.41331798397004604, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.07332174107432365, |
| "step": 29 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2330.354232788086, |
| "epoch": 0.03428571428571429, |
| "grad_norm": 0.02580900862812996, |
| "kl": 0.00011408329010009766, |
| "lambda_div_used": 0.617066040635109, |
| "learning_rate": 9.944597532678119e-07, |
| "loss": 0.0106, |
| "reward": -0.01328302314504981, |
| "reward_after_mean": -0.01328302314504981, |
| "reward_after_std": 0.6155566833913326, |
| "reward_before_mean": 0.36568982464814326, |
| "reward_before_std": 0.536849819123745, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.37897284515202045, |
| "reward_change_min": -0.5798661820590496, |
| "reward_change_std": 0.22223789989948273, |
| "reward_std": 0.6155567076057196, |
| "rewards/accuracy_reward": 0.3125000074505806, |
| "rewards/cosine_scaled_reward": 0.05318982107564807, |
| "step": 30 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2794.4375381469727, |
| "epoch": 0.03542857142857143, |
| "grad_norm": 0.021817587316036224, |
| "kl": 0.0001348257064819336, |
| "lambda_div_used": 0.6117052882909775, |
| "learning_rate": 9.931634888554935e-07, |
| "loss": 0.0028, |
| "reward": -0.07263503596186638, |
| "reward_after_mean": -0.07263503596186638, |
| "reward_after_std": 0.5405435804277658, |
| "reward_before_mean": 0.26847894000820816, |
| "reward_before_std": 0.5099836494773626, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.34111399203538895, |
| "reward_change_min": -0.5489708594977856, |
| "reward_change_std": 0.21398558467626572, |
| "reward_std": 0.5405435990542173, |
| "rewards/accuracy_reward": 0.27083334140479565, |
| "rewards/cosine_scaled_reward": -0.002354402095079422, |
| "step": 31 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2270.2292098999023, |
| "epoch": 0.036571428571428574, |
| "grad_norm": 0.024800026789307594, |
| "kl": 0.00010867416858673096, |
| "lambda_div_used": 0.6270024925470352, |
| "learning_rate": 9.917322325514487e-07, |
| "loss": 0.0215, |
| "reward": 0.13659005239605904, |
| "reward_after_mean": 0.13659005239605904, |
| "reward_after_std": 0.6418719291687012, |
| "reward_before_mean": 0.5631819479167461, |
| "reward_before_std": 0.5894247069954872, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4265918843448162, |
| "reward_change_min": -0.6561249867081642, |
| "reward_change_std": 0.26332158874720335, |
| "reward_std": 0.6418719589710236, |
| "rewards/accuracy_reward": 0.39583334140479565, |
| "rewards/cosine_scaled_reward": 0.1673485841602087, |
| "step": 32 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2852.000045776367, |
| "epoch": 0.037714285714285714, |
| "grad_norm": 0.025015637278556824, |
| "kl": 0.00012034177780151367, |
| "lambda_div_used": 0.6343094930052757, |
| "learning_rate": 9.901664203302124e-07, |
| "loss": -0.0682, |
| "reward": 0.12994618620723486, |
| "reward_after_mean": 0.12994618620723486, |
| "reward_after_std": 0.641582889482379, |
| "reward_before_mean": 0.5249918717890978, |
| "reward_before_std": 0.6226585754193366, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.39504568465054035, |
| "reward_change_min": -0.6912417784333229, |
| "reward_change_std": 0.2625753004103899, |
| "reward_std": 0.6415829043835402, |
| "rewards/accuracy_reward": 0.3958333432674408, |
| "rewards/cosine_scaled_reward": 0.12915852759033442, |
| "step": 33 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1936.0000305175781, |
| "epoch": 0.038857142857142854, |
| "grad_norm": 0.027393249794840813, |
| "kl": 0.0001316368579864502, |
| "lambda_div_used": 0.6432492136955261, |
| "learning_rate": 9.88466529153356e-07, |
| "loss": 0.0524, |
| "reward": 0.22461825609207153, |
| "reward_after_mean": 0.22461825609207153, |
| "reward_after_std": 0.6486394293606281, |
| "reward_before_mean": 0.6393125429749489, |
| "reward_before_std": 0.6651058997958899, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.41469427943229675, |
| "reward_change_min": -0.6961428225040436, |
| "reward_change_std": 0.27915553003549576, |
| "reward_std": 0.6486394479870796, |
| "rewards/accuracy_reward": 0.416666679084301, |
| "rewards/cosine_scaled_reward": 0.222645852714777, |
| "step": 34 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2444.270851135254, |
| "epoch": 0.04, |
| "grad_norm": 0.036447569727897644, |
| "kl": 0.0001233518123626709, |
| "lambda_div_used": 0.641115739941597, |
| "learning_rate": 9.866330768241983e-07, |
| "loss": 0.0574, |
| "reward": 0.17524952441453934, |
| "reward_after_mean": 0.17524952441453934, |
| "reward_after_std": 0.6338076200336218, |
| "reward_before_mean": 0.5732492320239544, |
| "reward_before_std": 0.6535743195563555, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.39799970760941505, |
| "reward_change_min": -0.6731353290379047, |
| "reward_change_std": 0.2706009875983, |
| "reward_std": 0.6338076237589121, |
| "rewards/accuracy_reward": 0.416666679084301, |
| "rewards/cosine_scaled_reward": 0.15658256597816944, |
| "step": 35 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3021.5834045410156, |
| "epoch": 0.04114285714285714, |
| "grad_norm": 0.023824643343687057, |
| "kl": 0.00018006563186645508, |
| "lambda_div_used": 0.6088642552495003, |
| "learning_rate": 9.846666218300807e-07, |
| "loss": -0.0045, |
| "reward": -0.21179450303316116, |
| "reward_after_mean": -0.21179450303316116, |
| "reward_after_std": 0.521540641784668, |
| "reward_before_mean": 0.07177379354834557, |
| "reward_before_std": 0.5020219217985868, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2835683096200228, |
| "reward_change_min": -0.5416868068277836, |
| "reward_change_std": 0.1942979209125042, |
| "reward_std": 0.5215406529605389, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.07405953668057919, |
| "step": 36 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2800.3541717529297, |
| "epoch": 0.04228571428571429, |
| "grad_norm": 0.022017668932676315, |
| "kl": 0.0001254826784133911, |
| "lambda_div_used": 0.5779423043131828, |
| "learning_rate": 9.825677631722435e-07, |
| "loss": 0.0116, |
| "reward": -0.2564888745546341, |
| "reward_after_mean": -0.2564888745546341, |
| "reward_after_std": 0.4095242340117693, |
| "reward_before_mean": 0.04728756472468376, |
| "reward_before_std": 0.3590739220380783, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3037764262408018, |
| "reward_change_min": -0.48755551874637604, |
| "reward_change_std": 0.18500223569571972, |
| "reward_std": 0.40952424332499504, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.09854578226804733, |
| "step": 37 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3207.8541870117188, |
| "epoch": 0.04342857142857143, |
| "grad_norm": 0.01808248646557331, |
| "kl": 0.00016523152589797974, |
| "lambda_div_used": 0.5784263759851456, |
| "learning_rate": 9.80337140183366e-07, |
| "loss": 0.0282, |
| "reward": -0.26364604104310274, |
| "reward_after_mean": -0.26364604104310274, |
| "reward_after_std": 0.40342688001692295, |
| "reward_before_mean": 0.05271115526556969, |
| "reward_before_std": 0.35847953893244267, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.31635717302560806, |
| "reward_change_min": -0.5081874057650566, |
| "reward_change_std": 0.19082189723849297, |
| "reward_std": 0.4034268856048584, |
| "rewards/accuracy_reward": 0.1458333395421505, |
| "rewards/cosine_scaled_reward": -0.0931221836945042, |
| "step": 38 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2402.250045776367, |
| "epoch": 0.044571428571428574, |
| "grad_norm": 0.02232450246810913, |
| "kl": 0.00010313093662261963, |
| "lambda_div_used": 0.594361886382103, |
| "learning_rate": 9.779754323328192e-07, |
| "loss": 0.0018, |
| "reward": -0.1193324881605804, |
| "reward_after_mean": -0.1193324881605804, |
| "reward_after_std": 0.5286196451634169, |
| "reward_before_mean": 0.25862734392285347, |
| "reward_before_std": 0.4300632723607123, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.37795985862612724, |
| "reward_change_min": -0.5456654913723469, |
| "reward_change_std": 0.21102675329893827, |
| "reward_std": 0.5286196675151587, |
| "rewards/accuracy_reward": 0.2708333395421505, |
| "rewards/cosine_scaled_reward": -0.01220599515363574, |
| "step": 39 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2159.604217529297, |
| "epoch": 0.045714285714285714, |
| "grad_norm": 0.02471066638827324, |
| "kl": 0.00011989474296569824, |
| "lambda_div_used": 0.5991866067051888, |
| "learning_rate": 9.754833590196926e-07, |
| "loss": 0.0737, |
| "reward": 0.01806516945362091, |
| "reward_after_mean": 0.01806516945362091, |
| "reward_after_std": 0.5395061280578375, |
| "reward_before_mean": 0.45469519402831793, |
| "reward_before_std": 0.4553617415949702, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.43663003854453564, |
| "reward_change_min": -0.6850062496960163, |
| "reward_change_std": 0.2616432458162308, |
| "reward_std": 0.5395061578601599, |
| "rewards/accuracy_reward": 0.3333333395421505, |
| "rewards/cosine_scaled_reward": 0.12136187124997377, |
| "step": 40 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2768.5000610351562, |
| "epoch": 0.046857142857142854, |
| "grad_norm": 0.021616969257593155, |
| "kl": 0.00012002140283584595, |
| "lambda_div_used": 0.6239832416176796, |
| "learning_rate": 9.728616793536587e-07, |
| "loss": -0.0156, |
| "reward": 0.14593233913183212, |
| "reward_after_mean": 0.14593233913183212, |
| "reward_after_std": 0.63340456597507, |
| "reward_before_mean": 0.5886982697993517, |
| "reward_before_std": 0.5802208222448826, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.442765936255455, |
| "reward_change_min": -0.7020618245005608, |
| "reward_change_std": 0.2809063671156764, |
| "reward_std": 0.6334045827388763, |
| "rewards/accuracy_reward": 0.4375000111758709, |
| "rewards/cosine_scaled_reward": 0.1511982548981905, |
| "step": 41 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2634.8333702087402, |
| "epoch": 0.048, |
| "grad_norm": 0.04166898876428604, |
| "kl": 0.00016657263040542603, |
| "lambda_div_used": 0.5780549123883247, |
| "learning_rate": 9.701111919237408e-07, |
| "loss": 0.0133, |
| "reward": -0.34821823611855507, |
| "reward_after_mean": -0.34821823611855507, |
| "reward_after_std": 0.42264553159475327, |
| "reward_before_mean": -0.07792945206165314, |
| "reward_before_std": 0.3588131470605731, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2702887710183859, |
| "reward_change_min": -0.42083460837602615, |
| "reward_change_std": 0.15945285465568304, |
| "reward_std": 0.42264554649591446, |
| "rewards/accuracy_reward": 0.08333333395421505, |
| "rewards/cosine_scaled_reward": -0.16126279765740037, |
| "step": 42 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2635.833396911621, |
| "epoch": 0.04914285714285714, |
| "grad_norm": 0.02200184389948845, |
| "kl": 0.0001204218715429306, |
| "lambda_div_used": 0.6316910237073898, |
| "learning_rate": 9.672327345550543e-07, |
| "loss": -0.0063, |
| "reward": 0.021529126912355423, |
| "reward_after_mean": 0.021529126912355423, |
| "reward_after_std": 0.7083056271076202, |
| "reward_before_mean": 0.3985202740877867, |
| "reward_before_std": 0.6068296208977699, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3769911602139473, |
| "reward_change_min": -0.5825657024979591, |
| "reward_change_std": 0.2227043965831399, |
| "reward_std": 0.7083056569099426, |
| "rewards/accuracy_reward": 0.31250000558793545, |
| "rewards/cosine_scaled_reward": 0.08602028086897917, |
| "step": 43 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2195.229248046875, |
| "epoch": 0.05028571428571429, |
| "grad_norm": 0.036139003932476044, |
| "kl": 0.00012126564979553223, |
| "lambda_div_used": 0.5746868774294853, |
| "learning_rate": 9.64227184053598e-07, |
| "loss": 0.1008, |
| "reward": -0.02278389036655426, |
| "reward_after_mean": -0.02278389036655426, |
| "reward_after_std": 0.43771820329129696, |
| "reward_before_mean": 0.4344022050499916, |
| "reward_before_std": 0.33903054893016815, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.457186084240675, |
| "reward_change_min": -0.6404859870672226, |
| "reward_change_std": 0.257523151114583, |
| "reward_std": 0.43771822564303875, |
| "rewards/accuracy_reward": 0.3333333432674408, |
| "rewards/cosine_scaled_reward": 0.10106886364519596, |
| "step": 44 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3065.2083740234375, |
| "epoch": 0.05142857142857143, |
| "grad_norm": 0.01725860871374607, |
| "kl": 0.0001525580883026123, |
| "lambda_div_used": 0.6148836985230446, |
| "learning_rate": 9.610954559391704e-07, |
| "loss": 0.0504, |
| "reward": 0.07026529498398304, |
| "reward_after_mean": 0.07026529498398304, |
| "reward_after_std": 0.5995844416320324, |
| "reward_before_mean": 0.49401637725532055, |
| "reward_before_std": 0.5296608861535788, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.42375105805695057, |
| "reward_change_min": -0.6395902335643768, |
| "reward_change_std": 0.25529387686401606, |
| "reward_std": 0.599584462121129, |
| "rewards/accuracy_reward": 0.37500000931322575, |
| "rewards/cosine_scaled_reward": 0.11901635373942554, |
| "step": 45 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2750.166702270508, |
| "epoch": 0.052571428571428575, |
| "grad_norm": 0.024388441815972328, |
| "kl": 0.0001780986785888672, |
| "lambda_div_used": 0.5804363936185837, |
| "learning_rate": 9.578385041664925e-07, |
| "loss": 0.045, |
| "reward": -0.34557132300687954, |
| "reward_after_mean": -0.34557132300687954, |
| "reward_after_std": 0.41662513464689255, |
| "reward_before_mean": -0.07300508208572865, |
| "reward_before_std": 0.3654432473704219, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2725662402808666, |
| "reward_change_min": -0.43397457897663116, |
| "reward_change_std": 0.16234493535012007, |
| "reward_std": 0.4166251439601183, |
| "rewards/accuracy_reward": 0.08333333395421505, |
| "rewards/cosine_scaled_reward": -0.15633842581883073, |
| "step": 46 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2203.437568664551, |
| "epoch": 0.053714285714285714, |
| "grad_norm": 0.02950853668153286, |
| "kl": 0.00010627508163452148, |
| "lambda_div_used": 0.6182287782430649, |
| "learning_rate": 9.54457320834625e-07, |
| "loss": -0.0342, |
| "reward": 0.08941356465220451, |
| "reward_after_mean": 0.08941356465220451, |
| "reward_after_std": 0.5494941845536232, |
| "reward_before_mean": 0.48168421536684036, |
| "reward_before_std": 0.5449010655283928, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.39227062091231346, |
| "reward_change_min": -0.6192042604088783, |
| "reward_change_std": 0.2517077624797821, |
| "reward_std": 0.5494942031800747, |
| "rewards/accuracy_reward": 0.354166679084301, |
| "rewards/cosine_scaled_reward": 0.12751753628253937, |
| "step": 47 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2591.729232788086, |
| "epoch": 0.054857142857142854, |
| "grad_norm": 0.026953846216201782, |
| "kl": 0.00012552738189697266, |
| "lambda_div_used": 0.6436027362942696, |
| "learning_rate": 9.509529358847654e-07, |
| "loss": 0.0003, |
| "reward": 0.07579808123409748, |
| "reward_after_mean": 0.07579808123409748, |
| "reward_after_std": 0.7408107332885265, |
| "reward_before_mean": 0.4633368235081434, |
| "reward_before_std": 0.6680604638531804, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.38753873854875565, |
| "reward_change_min": -0.686689231544733, |
| "reward_change_std": 0.25357643235474825, |
| "reward_std": 0.740810751914978, |
| "rewards/accuracy_reward": 0.33333333767950535, |
| "rewards/cosine_scaled_reward": 0.130003463011235, |
| "step": 48 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1606.1875267028809, |
| "epoch": 0.056, |
| "grad_norm": 0.03145647421479225, |
| "kl": 8.93324613571167e-05, |
| "lambda_div_used": 0.6448555663228035, |
| "learning_rate": 9.473264167865171e-07, |
| "loss": -0.0559, |
| "reward": 0.11411497555673122, |
| "reward_after_mean": 0.11411497555673122, |
| "reward_after_std": 0.6693379506468773, |
| "reward_before_mean": 0.456937775015831, |
| "reward_before_std": 0.679039599490352, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3428228013217449, |
| "reward_change_min": -0.5654460191726685, |
| "reward_change_std": 0.23543909844011068, |
| "reward_std": 0.6693379702046514, |
| "rewards/accuracy_reward": 0.35416667722165585, |
| "rewards/cosine_scaled_reward": 0.10277110431343317, |
| "step": 49 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2721.18754196167, |
| "epoch": 0.05714285714285714, |
| "grad_norm": 0.020998205989599228, |
| "kl": 0.00010278820991516113, |
| "lambda_div_used": 0.5569720417261124, |
| "learning_rate": 9.43578868212728e-07, |
| "loss": 0.0198, |
| "reward": -0.014975886791944504, |
| "reward_after_mean": -0.014975886791944504, |
| "reward_after_std": 0.47250125743448734, |
| "reward_before_mean": 0.5332869850099087, |
| "reward_before_std": 0.25590797886252403, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5482628662139177, |
| "reward_change_min": -0.7288287468254566, |
| "reward_change_std": 0.2796243606135249, |
| "reward_std": 0.4725012853741646, |
| "rewards/accuracy_reward": 0.39583333395421505, |
| "rewards/cosine_scaled_reward": 0.13745362346526235, |
| "step": 50 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2238.145851135254, |
| "epoch": 0.05828571428571429, |
| "grad_norm": 0.030202677473425865, |
| "kl": 0.00016352534294128418, |
| "lambda_div_used": 0.589074470102787, |
| "learning_rate": 9.397114317029974e-07, |
| "loss": -0.0263, |
| "reward": -0.32303538359701633, |
| "reward_after_mean": -0.32303538359701633, |
| "reward_after_std": 0.4511607848107815, |
| "reward_before_mean": -0.059135761111974716, |
| "reward_before_std": 0.41417009476572275, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.26389962807297707, |
| "reward_change_min": -0.46818122640252113, |
| "reward_change_std": 0.1724827392026782, |
| "reward_std": 0.4511608015745878, |
| "rewards/accuracy_reward": 0.1041666679084301, |
| "rewards/cosine_scaled_reward": -0.16330242389813066, |
| "step": 51 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2562.2292098999023, |
| "epoch": 0.05942857142857143, |
| "grad_norm": 0.027546504512429237, |
| "kl": 0.00012978166341781616, |
| "lambda_div_used": 0.5993086248636246, |
| "learning_rate": 9.357252853159505e-07, |
| "loss": 0.0385, |
| "reward": 0.08331240899860859, |
| "reward_after_mean": 0.08331240899860859, |
| "reward_after_std": 0.5890753846615553, |
| "reward_before_mean": 0.5538598063867539, |
| "reward_before_std": 0.45862336084246635, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4705474264919758, |
| "reward_change_min": -0.6809027269482613, |
| "reward_change_std": 0.26692016143351793, |
| "reward_std": 0.5890753846615553, |
| "rewards/accuracy_reward": 0.43750000558793545, |
| "rewards/cosine_scaled_reward": 0.11635981127619743, |
| "step": 52 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2494.8750762939453, |
| "epoch": 0.060571428571428575, |
| "grad_norm": 0.024038787931203842, |
| "kl": 0.0001424252986907959, |
| "lambda_div_used": 0.6427036076784134, |
| "learning_rate": 9.316216432703916e-07, |
| "loss": 0.0239, |
| "reward": 0.016640717163681984, |
| "reward_after_mean": 0.016640717163681984, |
| "reward_after_std": 0.6831068731844425, |
| "reward_before_mean": 0.334790101274848, |
| "reward_before_std": 0.665732966735959, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.318149384111166, |
| "reward_change_min": -0.5409456379711628, |
| "reward_change_std": 0.2164039220660925, |
| "reward_std": 0.6831068824976683, |
| "rewards/accuracy_reward": 0.29166667349636555, |
| "rewards/cosine_scaled_reward": 0.043123436626046896, |
| "step": 53 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1934.7083892822266, |
| "epoch": 0.061714285714285715, |
| "grad_norm": 0.0318799689412117, |
| "kl": 9.445101022720337e-05, |
| "lambda_div_used": 0.6365627199411392, |
| "learning_rate": 9.274017555754407e-07, |
| "loss": 0.0748, |
| "reward": 0.47428043745458126, |
| "reward_after_mean": 0.47428043745458126, |
| "reward_after_std": 0.7259879875928164, |
| "reward_before_mean": 1.077655490487814, |
| "reward_before_std": 0.6342989937402308, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.6033750809729099, |
| "reward_change_min": -0.9116491675376892, |
| "reward_change_std": 0.3683675564825535, |
| "reward_std": 0.7259880118072033, |
| "rewards/accuracy_reward": 0.6458333469927311, |
| "rewards/cosine_scaled_reward": 0.43182216165587306, |
| "step": 54 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2623.7708892822266, |
| "epoch": 0.06285714285714286, |
| "grad_norm": 0.02094973810017109, |
| "kl": 0.00013570114970207214, |
| "lambda_div_used": 0.645888201892376, |
| "learning_rate": 9.230669076497687e-07, |
| "loss": 0.0808, |
| "reward": 0.17671905946917832, |
| "reward_after_mean": 0.17671905946917832, |
| "reward_after_std": 0.7367929276078939, |
| "reward_before_mean": 0.5884007401764393, |
| "reward_before_std": 0.6809105025604367, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4116816818714142, |
| "reward_change_min": -0.6507021151483059, |
| "reward_change_std": 0.258549933321774, |
| "reward_std": 0.7367929276078939, |
| "rewards/accuracy_reward": 0.3958333395421505, |
| "rewards/cosine_scaled_reward": 0.19256740622222424, |
| "step": 55 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2702.8750915527344, |
| "epoch": 0.064, |
| "grad_norm": 0.02438957802951336, |
| "kl": 0.00014606118202209473, |
| "lambda_div_used": 0.583276279270649, |
| "learning_rate": 9.186184199300463e-07, |
| "loss": 0.0126, |
| "reward": -0.2818741099908948, |
| "reward_after_mean": -0.2818741099908948, |
| "reward_after_std": 0.4212169963866472, |
| "reward_before_mean": 0.015760678332298994, |
| "reward_before_std": 0.38252383656799793, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2976347878575325, |
| "reward_change_min": -0.5124507918953896, |
| "reward_change_std": 0.1885841079056263, |
| "reward_std": 0.4212170038372278, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/cosine_scaled_reward": -0.1092393291182816, |
| "step": 56 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3056.4583740234375, |
| "epoch": 0.06514285714285714, |
| "grad_norm": 0.01594528928399086, |
| "kl": 0.00010664761066436768, |
| "lambda_div_used": 0.6316090971231461, |
| "learning_rate": 9.140576474687263e-07, |
| "loss": 0.0184, |
| "reward": -0.039651480969041586, |
| "reward_after_mean": -0.039651480969041586, |
| "reward_after_std": 0.6351467221975327, |
| "reward_before_mean": 0.2791806310415268, |
| "reward_before_std": 0.605388393625617, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3188321180641651, |
| "reward_change_min": -0.5200116373598576, |
| "reward_change_std": 0.20627015084028244, |
| "reward_std": 0.6351467464119196, |
| "rewards/accuracy_reward": 0.2708333432674408, |
| "rewards/cosine_scaled_reward": 0.008347294380655512, |
| "step": 57 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1619.8541946411133, |
| "epoch": 0.06628571428571428, |
| "grad_norm": 0.028398146852850914, |
| "kl": 7.768720388412476e-05, |
| "lambda_div_used": 0.6244253218173981, |
| "learning_rate": 9.093859795212817e-07, |
| "loss": -0.0744, |
| "reward": 0.02804100140929222, |
| "reward_after_mean": 0.02804100140929222, |
| "reward_after_std": 0.6312676724046469, |
| "reward_before_mean": 0.41605534171685576, |
| "reward_before_std": 0.5784196928143501, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.38801432587206364, |
| "reward_change_min": -0.6227842308580875, |
| "reward_change_std": 0.24862205237150192, |
| "reward_std": 0.6312677096575499, |
| "rewards/accuracy_reward": 0.35416666977107525, |
| "rewards/cosine_scaled_reward": 0.061888658441603184, |
| "step": 58 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2521.937530517578, |
| "epoch": 0.06742857142857143, |
| "grad_norm": 0.023369140923023224, |
| "kl": 9.158626198768616e-05, |
| "lambda_div_used": 0.6007750853896141, |
| "learning_rate": 9.046048391230247e-07, |
| "loss": 0.0054, |
| "reward": -0.07107383571565151, |
| "reward_after_mean": -0.07107383571565151, |
| "reward_after_std": 0.49415648356080055, |
| "reward_before_mean": 0.2814117716625333, |
| "reward_before_std": 0.466181633528322, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3524855989962816, |
| "reward_change_min": -0.5633360184729099, |
| "reward_change_std": 0.22167869098484516, |
| "reward_std": 0.49415648356080055, |
| "rewards/accuracy_reward": 0.2500000111758709, |
| "rewards/cosine_scaled_reward": 0.03141175117343664, |
| "step": 59 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2424.3125762939453, |
| "epoch": 0.06857142857142857, |
| "grad_norm": 0.020705759525299072, |
| "kl": 0.00011293590068817139, |
| "lambda_div_used": 0.6067081466317177, |
| "learning_rate": 8.997156826556369e-07, |
| "loss": 0.0407, |
| "reward": 0.07130313850939274, |
| "reward_after_mean": 0.07130313850939274, |
| "reward_after_std": 0.5420917756855488, |
| "reward_before_mean": 0.5138208344578743, |
| "reward_before_std": 0.4931760486215353, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4425176791846752, |
| "reward_change_min": -0.6790216974914074, |
| "reward_change_std": 0.2717863190919161, |
| "reward_std": 0.5420917868614197, |
| "rewards/accuracy_reward": 0.3750000074505806, |
| "rewards/cosine_scaled_reward": 0.1388207976706326, |
| "step": 60 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2410.625030517578, |
| "epoch": 0.06971428571428571, |
| "grad_norm": 0.0203632153570652, |
| "kl": 9.970366954803467e-05, |
| "lambda_div_used": 0.5674436464905739, |
| "learning_rate": 8.9471999940354e-07, |
| "loss": 0.0006, |
| "reward": -0.1596299186348915, |
| "reward_after_mean": -0.1596299186348915, |
| "reward_after_std": 0.40844789519906044, |
| "reward_before_mean": 0.2439738381654024, |
| "reward_before_std": 0.3030575467273593, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.40360378473997116, |
| "reward_change_min": -0.562778364866972, |
| "reward_change_std": 0.21779226139187813, |
| "reward_std": 0.4084479194134474, |
| "rewards/accuracy_reward": 0.25000000558793545, |
| "rewards/cosine_scaled_reward": -0.006026154384016991, |
| "step": 61 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1986.6875381469727, |
| "epoch": 0.07085714285714285, |
| "grad_norm": 0.023419808596372604, |
| "kl": 8.377432823181152e-05, |
| "lambda_div_used": 0.6294708475470543, |
| "learning_rate": 8.896193111002475e-07, |
| "loss": 0.0145, |
| "reward": 0.127364382147789, |
| "reward_after_mean": 0.127364382147789, |
| "reward_after_std": 0.6607769038528204, |
| "reward_before_mean": 0.5438444633036852, |
| "reward_before_std": 0.6062875427305698, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.41648009419441223, |
| "reward_change_min": -0.6949719563126564, |
| "reward_change_std": 0.27056892681866884, |
| "reward_std": 0.6607769187539816, |
| "rewards/accuracy_reward": 0.4166666753590107, |
| "rewards/cosine_scaled_reward": 0.12717779609374702, |
| "step": 62 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1558.8542098999023, |
| "epoch": 0.072, |
| "grad_norm": 0.029614871367812157, |
| "kl": 9.24495980143547e-05, |
| "lambda_div_used": 0.5882120281457901, |
| "learning_rate": 8.844151714648274e-07, |
| "loss": -0.0217, |
| "reward": 0.24260340631008148, |
| "reward_after_mean": 0.24260340631008148, |
| "reward_after_std": 0.5563407000154257, |
| "reward_before_mean": 0.8341647423803806, |
| "reward_before_std": 0.4078605566173792, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.59156134724617, |
| "reward_change_min": -0.8355859033763409, |
| "reward_change_std": 0.33061067573726177, |
| "reward_std": 0.5563407260924578, |
| "rewards/accuracy_reward": 0.5416666716337204, |
| "rewards/cosine_scaled_reward": 0.29249807819724083, |
| "step": 63 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2526.979217529297, |
| "epoch": 0.07314285714285715, |
| "grad_norm": 0.023758206516504288, |
| "kl": 0.00014294683933258057, |
| "lambda_div_used": 0.5706712529063225, |
| "learning_rate": 8.791091657286267e-07, |
| "loss": -0.0491, |
| "reward": 0.005135258659720421, |
| "reward_after_mean": 0.005135258659720421, |
| "reward_after_std": 0.4730408936738968, |
| "reward_before_mean": 0.509556919336319, |
| "reward_before_std": 0.318298134021461, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5044216811656952, |
| "reward_change_min": -0.6950805820524693, |
| "reward_change_std": 0.2665313957259059, |
| "reward_std": 0.4730409197509289, |
| "rewards/accuracy_reward": 0.37500000558793545, |
| "rewards/cosine_scaled_reward": 0.13455691374838352, |
| "step": 64 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2619.6875343322754, |
| "epoch": 0.07428571428571429, |
| "grad_norm": 0.02972961962223053, |
| "kl": 9.997934103012085e-05, |
| "lambda_div_used": 0.5918664485216141, |
| "learning_rate": 8.737029101523929e-07, |
| "loss": 0.0061, |
| "reward": -0.009068667888641357, |
| "reward_after_mean": -0.009068667888641357, |
| "reward_after_std": 0.523827837780118, |
| "reward_before_mean": 0.42262596264481544, |
| "reward_before_std": 0.4240496205165982, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.43169461004436016, |
| "reward_change_min": -0.6695713810622692, |
| "reward_change_std": 0.25297324638813734, |
| "reward_std": 0.5238278452306986, |
| "rewards/accuracy_reward": 0.3541666679084301, |
| "rewards/cosine_scaled_reward": 0.06845926493406296, |
| "step": 65 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2048.645835876465, |
| "epoch": 0.07542857142857143, |
| "grad_norm": 0.029114792123436928, |
| "kl": 8.37370753288269e-05, |
| "lambda_div_used": 0.5725493803620338, |
| "learning_rate": 8.681980515339463e-07, |
| "loss": -0.0155, |
| "reward": -0.16004172409884632, |
| "reward_after_mean": -0.16004172409884632, |
| "reward_after_std": 0.49360031075775623, |
| "reward_before_mean": 0.2602699510753155, |
| "reward_before_std": 0.33009787695482373, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4203116577118635, |
| "reward_change_min": -0.6120298802852631, |
| "reward_change_std": 0.2290408704429865, |
| "reward_std": 0.49360031820833683, |
| "rewards/accuracy_reward": 0.31250000186264515, |
| "rewards/cosine_scaled_reward": -0.05223005823791027, |
| "step": 66 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3108.1458740234375, |
| "epoch": 0.07657142857142857, |
| "grad_norm": 0.017497915774583817, |
| "kl": 0.00010813027620315552, |
| "lambda_div_used": 0.630346029996872, |
| "learning_rate": 8.625962667065487e-07, |
| "loss": -0.024, |
| "reward": -0.17341885343194008, |
| "reward_after_mean": -0.17341885343194008, |
| "reward_after_std": 0.6313954871147871, |
| "reward_before_mean": 0.0926114417379722, |
| "reward_before_std": 0.6103347176685929, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.26603029295802116, |
| "reward_change_min": -0.5225372426211834, |
| "reward_change_std": 0.1893094191327691, |
| "reward_std": 0.6313955169171095, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/cosine_scaled_reward": -0.07405522745102644, |
| "step": 67 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1551.3541870117188, |
| "epoch": 0.07771428571428571, |
| "grad_norm": 0.03526793047785759, |
| "kl": 9.113550186157227e-05, |
| "lambda_div_used": 0.6158603206276894, |
| "learning_rate": 8.568992620281243e-07, |
| "loss": -0.07, |
| "reward": -0.04862045869231224, |
| "reward_after_mean": -0.04862045869231224, |
| "reward_after_std": 0.5381349269300699, |
| "reward_before_mean": 0.3049982152879238, |
| "reward_before_std": 0.5310354437679052, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.35361868515610695, |
| "reward_change_min": -0.6167616136372089, |
| "reward_change_std": 0.2329900823533535, |
| "reward_std": 0.5381349604576826, |
| "rewards/accuracy_reward": 0.291666679084301, |
| "rewards/cosine_scaled_reward": 0.013331551104784012, |
| "step": 68 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1843.0000305175781, |
| "epoch": 0.07885714285714286, |
| "grad_norm": 0.034535001963377, |
| "kl": 0.00010597705841064453, |
| "lambda_div_used": 0.6196342781186104, |
| "learning_rate": 8.511087728614862e-07, |
| "loss": -0.0403, |
| "reward": -0.16936753690242767, |
| "reward_after_mean": -0.16936753690242767, |
| "reward_after_std": 0.5928534604609013, |
| "reward_before_mean": 0.10612463857978582, |
| "reward_before_std": 0.5493131745606661, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2754921726882458, |
| "reward_change_min": -0.4671623595058918, |
| "reward_change_std": 0.1736066685989499, |
| "reward_std": 0.592853469774127, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.06054202886298299, |
| "step": 69 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2385.6250610351562, |
| "epoch": 0.08, |
| "grad_norm": 0.022255579009652138, |
| "kl": 9.971857070922852e-05, |
| "lambda_div_used": 0.5713987499475479, |
| "learning_rate": 8.452265630457282e-07, |
| "loss": 0.0293, |
| "reward": -0.15412342175841331, |
| "reward_after_mean": -0.15412342175841331, |
| "reward_after_std": 0.4452939387410879, |
| "reward_before_mean": 0.2508242540061474, |
| "reward_before_std": 0.32261871080845594, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4049476757645607, |
| "reward_change_min": -0.5730207115411758, |
| "reward_change_std": 0.21870618779212236, |
| "reward_std": 0.44529395177960396, |
| "rewards/accuracy_reward": 0.25000000558793545, |
| "rewards/cosine_scaled_reward": 0.0008242330513894558, |
| "step": 70 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2444.6666717529297, |
| "epoch": 0.08114285714285714, |
| "grad_norm": 0.03434319049119949, |
| "kl": 0.0001207888126373291, |
| "lambda_div_used": 0.6165208369493484, |
| "learning_rate": 8.392544243589427e-07, |
| "loss": -0.0114, |
| "reward": -0.04324318375438452, |
| "reward_after_mean": -0.04324318375438452, |
| "reward_after_std": 0.5684425849467516, |
| "reward_before_mean": 0.3016075724735856, |
| "reward_before_std": 0.535145154222846, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3448507599532604, |
| "reward_change_min": -0.5520102642476559, |
| "reward_change_std": 0.21349613554775715, |
| "reward_std": 0.5684425886720419, |
| "rewards/accuracy_reward": 0.27083334513008595, |
| "rewards/cosine_scaled_reward": 0.030774242244660854, |
| "step": 71 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2275.229232788086, |
| "epoch": 0.08228571428571428, |
| "grad_norm": 0.026326576247811317, |
| "kl": 0.0001144111156463623, |
| "lambda_div_used": 0.5820747911930084, |
| "learning_rate": 8.331941759724268e-07, |
| "loss": 0.0486, |
| "reward": -0.22072702879086137, |
| "reward_after_mean": -0.22072702879086137, |
| "reward_after_std": 0.4294308237731457, |
| "reward_before_mean": 0.11065018083900213, |
| "reward_before_std": 0.37079737335443497, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3313772287219763, |
| "reward_change_min": -0.49349113181233406, |
| "reward_change_std": 0.18969058711081743, |
| "reward_std": 0.4294308312237263, |
| "rewards/accuracy_reward": 0.16666667349636555, |
| "rewards/cosine_scaled_reward": -0.05601647589355707, |
| "step": 72 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3070.5209350585938, |
| "epoch": 0.08342857142857144, |
| "grad_norm": 0.0183473639190197, |
| "kl": 0.00015050172805786133, |
| "lambda_div_used": 0.6504631415009499, |
| "learning_rate": 8.270476638965461e-07, |
| "loss": 0.0732, |
| "reward": -0.024378453381359577, |
| "reward_after_mean": -0.024378453381359577, |
| "reward_after_std": 0.719476904720068, |
| "reward_before_mean": 0.25941105699166656, |
| "reward_before_std": 0.7029449231922626, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28378950990736485, |
| "reward_change_min": -0.4811365678906441, |
| "reward_change_std": 0.19341129437088966, |
| "reward_std": 0.7194769158959389, |
| "rewards/accuracy_reward": 0.2708333395421505, |
| "rewards/cosine_scaled_reward": -0.01142227090895176, |
| "step": 73 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2230.604202270508, |
| "epoch": 0.08457142857142858, |
| "grad_norm": 0.02717745117843151, |
| "kl": 9.462237358093262e-05, |
| "lambda_div_used": 0.6418861970305443, |
| "learning_rate": 8.208167604184217e-07, |
| "loss": -0.0956, |
| "reward": -0.0033467919565737247, |
| "reward_after_mean": -0.0033467919565737247, |
| "reward_after_std": 0.6568808052688837, |
| "reward_before_mean": 0.3098285049200058, |
| "reward_before_std": 0.6575386971235275, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3131752759218216, |
| "reward_change_min": -0.6135218031704426, |
| "reward_change_std": 0.22661382239311934, |
| "reward_std": 0.6568808313459158, |
| "rewards/accuracy_reward": 0.2916666753590107, |
| "rewards/cosine_scaled_reward": 0.018161814659833908, |
| "step": 74 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2656.416732788086, |
| "epoch": 0.08571428571428572, |
| "grad_norm": 0.018360283225774765, |
| "kl": 0.00010110437870025635, |
| "lambda_div_used": 0.5670187771320343, |
| "learning_rate": 8.145033635316128e-07, |
| "loss": -0.0053, |
| "reward": 0.006984639912843704, |
| "reward_after_mean": 0.006984639912843704, |
| "reward_after_std": 0.42523463629186153, |
| "reward_before_mean": 0.5017829714342952, |
| "reward_before_std": 0.3057099119760096, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.49479835107922554, |
| "reward_change_min": -0.685304194688797, |
| "reward_change_std": 0.27397861890494823, |
| "reward_std": 0.4252346530556679, |
| "rewards/accuracy_reward": 0.3333333432674408, |
| "rewards/cosine_scaled_reward": 0.1684496277011931, |
| "step": 75 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2364.0000228881836, |
| "epoch": 0.08685714285714285, |
| "grad_norm": 0.026309814304113388, |
| "kl": 0.00010971724987030029, |
| "lambda_div_used": 0.5593390390276909, |
| "learning_rate": 8.081093963579707e-07, |
| "loss": 0.0787, |
| "reward": -0.24861154425889254, |
| "reward_after_mean": -0.24861154425889254, |
| "reward_after_std": 0.3816519398242235, |
| "reward_before_mean": 0.14198310300707817, |
| "reward_before_std": 0.2715805321931839, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3905946556478739, |
| "reward_change_min": -0.5774808749556541, |
| "reward_change_std": 0.21815251000225544, |
| "reward_std": 0.3816519435495138, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.02468355232849717, |
| "step": 76 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2667.750030517578, |
| "epoch": 0.088, |
| "grad_norm": 0.020327381789684296, |
| "kl": 0.00012464821338653564, |
| "lambda_div_used": 0.5539242178201675, |
| "learning_rate": 8.01636806561836e-07, |
| "loss": -0.003, |
| "reward": -0.2628857381641865, |
| "reward_after_mean": -0.2628857381641865, |
| "reward_after_std": 0.31219773180782795, |
| "reward_before_mean": 0.10476060304790735, |
| "reward_before_std": 0.24125095596536994, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.36764636635780334, |
| "reward_change_min": -0.5086396895349026, |
| "reward_change_std": 0.20270386710762978, |
| "reward_std": 0.3121977373957634, |
| "rewards/accuracy_reward": 0.1875000074505806, |
| "rewards/cosine_scaled_reward": -0.08273938857018948, |
| "step": 77 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2686.8125534057617, |
| "epoch": 0.08914285714285715, |
| "grad_norm": 0.021180735900998116, |
| "kl": 0.0001368001103401184, |
| "lambda_div_used": 0.6677093878388405, |
| "learning_rate": 7.950875657567621e-07, |
| "loss": 0.11, |
| "reward": 0.1766232904046774, |
| "reward_after_mean": 0.1766232904046774, |
| "reward_after_std": 0.8389766626060009, |
| "reward_before_mean": 0.5535875726491213, |
| "reward_before_std": 0.7847605030983686, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3769642859697342, |
| "reward_change_min": -0.6617914140224457, |
| "reward_change_std": 0.25535117369145155, |
| "reward_std": 0.8389766924083233, |
| "rewards/accuracy_reward": 0.3958333358168602, |
| "rewards/cosine_scaled_reward": 0.15775423496961594, |
| "step": 78 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1962.8750457763672, |
| "epoch": 0.09028571428571429, |
| "grad_norm": 0.030601773411035538, |
| "kl": 8.64267349243164e-05, |
| "lambda_div_used": 0.6305629685521126, |
| "learning_rate": 7.884636689049422e-07, |
| "loss": -0.0874, |
| "reward": -0.07519742846488953, |
| "reward_after_mean": -0.07519742846488953, |
| "reward_after_std": 0.6089170537889004, |
| "reward_before_mean": 0.22960891388356686, |
| "reward_before_std": 0.611657090485096, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.304806362837553, |
| "reward_change_min": -0.5647807456552982, |
| "reward_change_std": 0.21934652887284756, |
| "reward_std": 0.608917074277997, |
| "rewards/accuracy_reward": 0.2708333432674408, |
| "rewards/cosine_scaled_reward": -0.04122441209619865, |
| "step": 79 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2891.8333587646484, |
| "epoch": 0.09142857142857143, |
| "grad_norm": 0.020956117659807205, |
| "kl": 0.00014778971672058105, |
| "lambda_div_used": 0.5999866053462029, |
| "learning_rate": 7.817671337095244e-07, |
| "loss": 0.0016, |
| "reward": -0.013454930856823921, |
| "reward_after_mean": -0.013454930856823921, |
| "reward_after_std": 0.5396539904177189, |
| "reward_before_mean": 0.39186157658696175, |
| "reward_before_std": 0.4594053290784359, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4053164832293987, |
| "reward_change_min": -0.6001381352543831, |
| "reward_change_std": 0.23710554651916027, |
| "reward_std": 0.5396539978682995, |
| "rewards/accuracy_reward": 0.33333334140479565, |
| "rewards/cosine_scaled_reward": 0.058528220281004906, |
| "step": 80 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2756.125057220459, |
| "epoch": 0.09257142857142857, |
| "grad_norm": 0.0333174467086792, |
| "kl": 0.0001837015151977539, |
| "lambda_div_used": 0.6196143180131912, |
| "learning_rate": 7.75e-07, |
| "loss": -0.019, |
| "reward": -0.15753823146224022, |
| "reward_after_mean": -0.15753823146224022, |
| "reward_after_std": 0.5845062825828791, |
| "reward_before_mean": 0.12821976901614107, |
| "reward_before_std": 0.5542377643287182, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28575799986720085, |
| "reward_change_min": -0.5201962739229202, |
| "reward_change_std": 0.1913682147860527, |
| "reward_std": 0.5845062825828791, |
| "rewards/accuracy_reward": 0.20833334140479565, |
| "rewards/cosine_scaled_reward": -0.08011356927454472, |
| "step": 81 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2416.895851135254, |
| "epoch": 0.09371428571428571, |
| "grad_norm": 0.02389819547533989, |
| "kl": 0.00010943412780761719, |
| "lambda_div_used": 0.6039423421025276, |
| "learning_rate": 7.681643291108517e-07, |
| "loss": -0.0069, |
| "reward": -0.03246039338409901, |
| "reward_after_mean": -0.03246039338409901, |
| "reward_after_std": 0.5534762311726809, |
| "reward_before_mean": 0.3658472504466772, |
| "reward_before_std": 0.4814961114898324, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3983076363801956, |
| "reward_change_min": -0.655058030039072, |
| "reward_change_std": 0.24730877578258514, |
| "reward_std": 0.5534762516617775, |
| "rewards/accuracy_reward": 0.3125000074505806, |
| "rewards/cosine_scaled_reward": 0.05334722436964512, |
| "step": 82 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2424.3125534057617, |
| "epoch": 0.09485714285714286, |
| "grad_norm": 0.032633859664201736, |
| "kl": 0.00012442469596862793, |
| "lambda_div_used": 0.6332797482609749, |
| "learning_rate": 7.612622032536507e-07, |
| "loss": 0.0298, |
| "reward": 0.024711462669074535, |
| "reward_after_mean": 0.024711462669074535, |
| "reward_after_std": 0.6729160957038403, |
| "reward_before_mean": 0.3993762247264385, |
| "reward_before_std": 0.6236082511022687, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3746647536754608, |
| "reward_change_min": -0.691291693598032, |
| "reward_change_std": 0.2538198195397854, |
| "reward_std": 0.6729161199182272, |
| "rewards/accuracy_reward": 0.3125000037252903, |
| "rewards/cosine_scaled_reward": 0.08687621541321278, |
| "step": 83 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2199.125026702881, |
| "epoch": 0.096, |
| "grad_norm": 0.023145966231822968, |
| "kl": 9.820610284805298e-05, |
| "lambda_div_used": 0.6221627593040466, |
| "learning_rate": 7.54295724882796e-07, |
| "loss": -0.0603, |
| "reward": 0.019974265713244677, |
| "reward_after_mean": 0.019974265713244677, |
| "reward_after_std": 0.6427162848412991, |
| "reward_before_mean": 0.40708103217184544, |
| "reward_before_std": 0.5660292999818921, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3871067576110363, |
| "reward_change_min": -0.6353648640215397, |
| "reward_change_std": 0.23979215417057276, |
| "reward_std": 0.6427163053303957, |
| "rewards/accuracy_reward": 0.33333334140479565, |
| "rewards/cosine_scaled_reward": 0.07374768820591271, |
| "step": 84 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2697.812545776367, |
| "epoch": 0.09714285714285714, |
| "grad_norm": 0.018655812367796898, |
| "kl": 0.0001118779182434082, |
| "lambda_div_used": 0.6938974410295486, |
| "learning_rate": 7.472670160550848e-07, |
| "loss": 0.0247, |
| "reward": 0.22309327218681574, |
| "reward_after_mean": 0.22309327218681574, |
| "reward_after_std": 0.8897394463419914, |
| "reward_before_mean": 0.5425103409215808, |
| "reward_before_std": 0.909519312903285, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3194170743227005, |
| "reward_change_min": -0.6195828355848789, |
| "reward_change_std": 0.24075988680124283, |
| "reward_std": 0.889739491045475, |
| "rewards/accuracy_reward": 0.3750000111758709, |
| "rewards/cosine_scaled_reward": 0.16751032788306475, |
| "step": 85 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2550.145851135254, |
| "epoch": 0.09828571428571428, |
| "grad_norm": 0.02583778277039528, |
| "kl": 0.00013977289199829102, |
| "lambda_div_used": 0.5981776341795921, |
| "learning_rate": 7.401782177833147e-07, |
| "loss": -0.0016, |
| "reward": -0.1776493340730667, |
| "reward_after_mean": -0.1776493340730667, |
| "reward_after_std": 0.4848842676728964, |
| "reward_before_mean": 0.13707906752824783, |
| "reward_before_std": 0.44980547949671745, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.31472842022776604, |
| "reward_change_min": -0.5129407718777657, |
| "reward_change_std": 0.19486056733876467, |
| "reward_std": 0.4848842900246382, |
| "rewards/accuracy_reward": 0.16666667349636555, |
| "rewards/cosine_scaled_reward": -0.029587595723569393, |
| "step": 86 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2216.520866394043, |
| "epoch": 0.09942857142857142, |
| "grad_norm": 0.026928512379527092, |
| "kl": 0.00014719367027282715, |
| "lambda_div_used": 0.5552037805318832, |
| "learning_rate": 7.330314893841101e-07, |
| "loss": -0.0112, |
| "reward": -0.14393189689144492, |
| "reward_after_mean": -0.14393189689144492, |
| "reward_after_std": 0.41725931130349636, |
| "reward_before_mean": 0.3129472378641367, |
| "reward_before_std": 0.24625429138541222, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4568791352212429, |
| "reward_change_min": -0.6228058040142059, |
| "reward_change_std": 0.2342971321195364, |
| "reward_std": 0.41725931875407696, |
| "rewards/accuracy_reward": 0.27083333395421505, |
| "rewards/cosine_scaled_reward": 0.04211391881108284, |
| "step": 87 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1673.833366394043, |
| "epoch": 0.10057142857142858, |
| "grad_norm": 0.03403550013899803, |
| "kl": 9.492039680480957e-05, |
| "lambda_div_used": 0.6714882552623749, |
| "learning_rate": 7.258290078201731e-07, |
| "loss": 0.1357, |
| "reward": 0.23210743255913258, |
| "reward_after_mean": 0.23210743255913258, |
| "reward_after_std": 0.7776901721954346, |
| "reward_before_mean": 0.6004263032227755, |
| "reward_before_std": 0.8064676076173782, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3683188706636429, |
| "reward_change_min": -0.6842552609741688, |
| "reward_change_std": 0.2728601209819317, |
| "reward_std": 0.7776901982724667, |
| "rewards/accuracy_reward": 0.4375000149011612, |
| "rewards/cosine_scaled_reward": 0.16292626922950149, |
| "step": 88 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2455.875030517578, |
| "epoch": 0.10171428571428572, |
| "grad_norm": 0.02145099826157093, |
| "kl": 0.00010183453559875488, |
| "lambda_div_used": 0.6517865061759949, |
| "learning_rate": 7.185729670371604e-07, |
| "loss": 0.0088, |
| "reward": 0.022896312177181244, |
| "reward_after_mean": 0.022896312177181244, |
| "reward_after_std": 0.7065913639962673, |
| "reward_before_mean": 0.327204130589962, |
| "reward_before_std": 0.7111460026353598, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.30430781841278076, |
| "reward_change_min": -0.5758539959788322, |
| "reward_change_std": 0.22174649592489004, |
| "reward_std": 0.7065913733094931, |
| "rewards/accuracy_reward": 0.27083333767950535, |
| "rewards/cosine_scaled_reward": 0.056370790116488934, |
| "step": 89 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2390.3125381469727, |
| "epoch": 0.10285714285714286, |
| "grad_norm": 0.04259632155299187, |
| "kl": 0.00015431642532348633, |
| "lambda_div_used": 0.5984295755624771, |
| "learning_rate": 7.11265577295385e-07, |
| "loss": 0.034, |
| "reward": -0.32630743458867073, |
| "reward_after_mean": -0.32630743458867073, |
| "reward_after_std": 0.5043698158115149, |
| "reward_before_mean": -0.07746448495890945, |
| "reward_before_std": 0.4487060569226742, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.24884295091032982, |
| "reward_change_min": -0.42664676532149315, |
| "reward_change_std": 0.15124379005283117, |
| "reward_std": 0.5043698251247406, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.1607978215906769, |
| "step": 90 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2609.041702270508, |
| "epoch": 0.104, |
| "grad_norm": 0.022521013393998146, |
| "kl": 0.0001251697540283203, |
| "lambda_div_used": 0.6242434978485107, |
| "learning_rate": 7.039090644965509e-07, |
| "loss": 0.0254, |
| "reward": 0.06410847418010235, |
| "reward_after_mean": 0.06410847418010235, |
| "reward_after_std": 0.6396163944154978, |
| "reward_before_mean": 0.4679965991526842, |
| "reward_before_std": 0.5784324184060097, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4038881305605173, |
| "reward_change_min": -0.6435716077685356, |
| "reward_change_std": 0.2524958234280348, |
| "reward_std": 0.6396164130419493, |
| "rewards/accuracy_reward": 0.35416666977107525, |
| "rewards/cosine_scaled_reward": 0.11382993124425411, |
| "step": 91 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2140.312526702881, |
| "epoch": 0.10514285714285715, |
| "grad_norm": 0.028272144496440887, |
| "kl": 8.565187454223633e-05, |
| "lambda_div_used": 0.6041740253567696, |
| "learning_rate": 6.965056695057204e-07, |
| "loss": -0.0131, |
| "reward": -0.22151808440685272, |
| "reward_after_mean": -0.22151808440685272, |
| "reward_after_std": 0.5138680338859558, |
| "reward_before_mean": 0.06442609056830406, |
| "reward_before_std": 0.48106229305267334, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28594417311251163, |
| "reward_change_min": -0.4845799170434475, |
| "reward_change_std": 0.1817196160554886, |
| "reward_std": 0.5138680376112461, |
| "rewards/accuracy_reward": 0.16666667349636555, |
| "rewards/cosine_scaled_reward": -0.10224058385938406, |
| "step": 92 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3470.3333435058594, |
| "epoch": 0.10628571428571429, |
| "grad_norm": 0.02051234431564808, |
| "kl": 0.00021988153457641602, |
| "lambda_div_used": 0.550777792930603, |
| "learning_rate": 6.890576474687263e-07, |
| "loss": 0.0098, |
| "reward": -0.4400870492681861, |
| "reward_after_mean": -0.4400870492681861, |
| "reward_after_std": 0.30945760011672974, |
| "reward_before_mean": -0.15331347286701202, |
| "reward_before_std": 0.22546498104929924, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2867735829204321, |
| "reward_change_min": -0.40346677228808403, |
| "reward_change_std": 0.15250255912542343, |
| "reward_std": 0.3094576168805361, |
| "rewards/accuracy_reward": 0.02083333395421505, |
| "rewards/cosine_scaled_reward": -0.17414680309593678, |
| "step": 93 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2433.375045776367, |
| "epoch": 0.10742857142857143, |
| "grad_norm": 0.02587928995490074, |
| "kl": 0.0001605413854122162, |
| "lambda_div_used": 0.5997196212410927, |
| "learning_rate": 6.815672671252315e-07, |
| "loss": 0.0496, |
| "reward": -0.1711340295150876, |
| "reward_after_mean": -0.1711340295150876, |
| "reward_after_std": 0.5549158975481987, |
| "reward_before_mean": 0.1783520970493555, |
| "reward_before_std": 0.45881492272019386, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.34948613308370113, |
| "reward_change_min": -0.5870647989213467, |
| "reward_change_std": 0.21199375297874212, |
| "reward_std": 0.5549159198999405, |
| "rewards/accuracy_reward": 0.22916666977107525, |
| "rewards/cosine_scaled_reward": -0.05081457580672577, |
| "step": 94 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3021.2083740234375, |
| "epoch": 0.10857142857142857, |
| "grad_norm": 0.016700398176908493, |
| "kl": 0.00013339519500732422, |
| "lambda_div_used": 0.6040999740362167, |
| "learning_rate": 6.740368101176495e-07, |
| "loss": 0.0501, |
| "reward": -0.10988862998783588, |
| "reward_after_mean": -0.10988862998783588, |
| "reward_after_std": 0.574177211150527, |
| "reward_before_mean": 0.259714370011352, |
| "reward_before_std": 0.4856903199106455, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.36960301361978054, |
| "reward_change_min": -0.6000017635524273, |
| "reward_change_std": 0.2258477583527565, |
| "reward_std": 0.5741772279143333, |
| "rewards/accuracy_reward": 0.25000000186264515, |
| "rewards/cosine_scaled_reward": 0.009714371990412474, |
| "step": 95 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2388.979202270508, |
| "epoch": 0.10971428571428571, |
| "grad_norm": 0.02693852037191391, |
| "kl": 0.00010056048631668091, |
| "lambda_div_used": 0.6452958509325981, |
| "learning_rate": 6.664685702961344e-07, |
| "loss": 0.0131, |
| "reward": 0.1582602821290493, |
| "reward_after_mean": 0.1582602821290493, |
| "reward_after_std": 0.7448761742562056, |
| "reward_before_mean": 0.5632272865623236, |
| "reward_before_std": 0.6762064695358276, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4049670249223709, |
| "reward_change_min": -0.6407857313752174, |
| "reward_change_std": 0.2506500957533717, |
| "reward_std": 0.7448761742562056, |
| "rewards/accuracy_reward": 0.3958333395421505, |
| "rewards/cosine_scaled_reward": 0.1673939572647214, |
| "step": 96 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2700.0000534057617, |
| "epoch": 0.11085714285714286, |
| "grad_norm": 0.022453829646110535, |
| "kl": 0.00012642145156860352, |
| "lambda_div_used": 0.5770404115319252, |
| "learning_rate": 6.588648530198504e-07, |
| "loss": 0.0441, |
| "reward": -0.24632946588099003, |
| "reward_after_mean": -0.24632946588099003, |
| "reward_after_std": 0.4705936200916767, |
| "reward_before_mean": 0.10453066416084766, |
| "reward_before_std": 0.3494142349809408, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.35086013562977314, |
| "reward_change_min": -0.5019582267850637, |
| "reward_change_std": 0.18468604423105717, |
| "reward_std": 0.4705936200916767, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.06213599909096956, |
| "step": 97 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2192.8750534057617, |
| "epoch": 0.112, |
| "grad_norm": 0.021812062710523605, |
| "kl": 8.285045623779297e-05, |
| "lambda_div_used": 0.6265708059072495, |
| "learning_rate": 6.512279744547392e-07, |
| "loss": 0.0464, |
| "reward": -0.06961194425821304, |
| "reward_after_mean": -0.06961194425821304, |
| "reward_after_std": 0.705444872379303, |
| "reward_before_mean": 0.27397448010742664, |
| "reward_before_std": 0.5812770891934633, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3435864243656397, |
| "reward_change_min": -0.521237924695015, |
| "reward_change_std": 0.19378468580543995, |
| "reward_std": 0.7054448891431093, |
| "rewards/accuracy_reward": 0.2500000037252903, |
| "rewards/cosine_scaled_reward": 0.023974468291271478, |
| "step": 98 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2786.1875534057617, |
| "epoch": 0.11314285714285714, |
| "grad_norm": 0.02357642538845539, |
| "kl": 0.00013721734285354614, |
| "lambda_div_used": 0.6171735525131226, |
| "learning_rate": 6.435602608679916e-07, |
| "loss": 0.0071, |
| "reward": -0.024897070601582527, |
| "reward_after_mean": -0.024897070601582527, |
| "reward_after_std": 0.6280203014612198, |
| "reward_before_mean": 0.3600337319076061, |
| "reward_before_std": 0.5423067910596728, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3849308080971241, |
| "reward_change_min": -0.6169135756790638, |
| "reward_change_std": 0.2341146618127823, |
| "reward_std": 0.6280203089118004, |
| "rewards/accuracy_reward": 0.31250000558793545, |
| "rewards/cosine_scaled_reward": 0.04753373749554157, |
| "step": 99 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2387.687545776367, |
| "epoch": 0.11428571428571428, |
| "grad_norm": 0.023666124790906906, |
| "kl": 0.00012056529521942139, |
| "lambda_div_used": 0.6473532766103745, |
| "learning_rate": 6.358640479194451e-07, |
| "loss": -0.0079, |
| "reward": 0.10418231040239334, |
| "reward_after_mean": 0.10418231040239334, |
| "reward_after_std": 0.6740316934883595, |
| "reward_before_mean": 0.4593420661985874, |
| "reward_before_std": 0.6859642090275884, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3551597539335489, |
| "reward_change_min": -0.6693484336137772, |
| "reward_change_std": 0.2530285455286503, |
| "reward_std": 0.6740317121148109, |
| "rewards/accuracy_reward": 0.3333333432674408, |
| "rewards/cosine_scaled_reward": 0.12600870989263058, |
| "step": 100 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2082.4167098999023, |
| "epoch": 0.11542857142857142, |
| "grad_norm": 0.025257760658860207, |
| "kl": 0.00012496113777160645, |
| "lambda_div_used": 0.6337181106209755, |
| "learning_rate": 6.281416799501187e-07, |
| "loss": 0.0127, |
| "reward": 0.08042715396732092, |
| "reward_after_mean": 0.08042715396732092, |
| "reward_after_std": 0.6870364677160978, |
| "reward_before_mean": 0.4686305020004511, |
| "reward_before_std": 0.6316956970840693, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3882033359259367, |
| "reward_change_min": -0.6386316269636154, |
| "reward_change_std": 0.2518463246524334, |
| "reward_std": 0.6870364770293236, |
| "rewards/accuracy_reward": 0.33333333767950535, |
| "rewards/cosine_scaled_reward": 0.13529715640470386, |
| "step": 101 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2037.208381652832, |
| "epoch": 0.11657142857142858, |
| "grad_norm": 0.030245469883084297, |
| "kl": 0.00012689828872680664, |
| "lambda_div_used": 0.6057270467281342, |
| "learning_rate": 6.203955092681039e-07, |
| "loss": 0.0419, |
| "reward": 0.04565976280719042, |
| "reward_after_mean": 0.04565976280719042, |
| "reward_after_std": 0.58235695771873, |
| "reward_before_mean": 0.49184186570346355, |
| "reward_before_std": 0.48894889652729034, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4461820814758539, |
| "reward_change_min": -0.7175954841077328, |
| "reward_change_std": 0.27023847959935665, |
| "reward_std": 0.5823569800704718, |
| "rewards/accuracy_reward": 0.3125000074505806, |
| "rewards/cosine_scaled_reward": 0.17934184102341533, |
| "step": 102 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2204.291717529297, |
| "epoch": 0.11771428571428572, |
| "grad_norm": 0.035654906183481216, |
| "kl": 0.00011385977268218994, |
| "lambda_div_used": 0.5589818432927132, |
| "learning_rate": 6.126278954320294e-07, |
| "loss": -0.0085, |
| "reward": -0.2170967198908329, |
| "reward_after_mean": -0.2170967198908329, |
| "reward_after_std": 0.37913205102086067, |
| "reward_before_mean": 0.17684321105480194, |
| "reward_before_std": 0.26896010898053646, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3939399253576994, |
| "reward_change_min": -0.5747586265206337, |
| "reward_change_std": 0.2185236681252718, |
| "reward_std": 0.3791320640593767, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/cosine_scaled_reward": -0.010656798258423805, |
| "step": 103 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2460.1458435058594, |
| "epoch": 0.11885714285714286, |
| "grad_norm": 0.034317746758461, |
| "kl": 0.00014722347259521484, |
| "lambda_div_used": 0.5736617371439934, |
| "learning_rate": 6.048412045323164e-07, |
| "loss": -0.0349, |
| "reward": -0.1729673482477665, |
| "reward_after_mean": -0.1729673482477665, |
| "reward_after_std": 0.44365703873336315, |
| "reward_before_mean": 0.2049333555623889, |
| "reward_before_std": 0.3348153894767165, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3779007289558649, |
| "reward_change_min": -0.5351628288626671, |
| "reward_change_std": 0.20819698367267847, |
| "reward_std": 0.4436570517718792, |
| "rewards/accuracy_reward": 0.22916666977107525, |
| "rewards/cosine_scaled_reward": -0.024233306758105755, |
| "step": 104 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2288.958381652832, |
| "epoch": 0.12, |
| "grad_norm": 0.02615894190967083, |
| "kl": 0.00012151896953582764, |
| "lambda_div_used": 0.6090050563216209, |
| "learning_rate": 5.97037808470444e-07, |
| "loss": 0.0354, |
| "reward": 0.04345609247684479, |
| "reward_after_mean": 0.04345609247684479, |
| "reward_after_std": 0.5673400796949863, |
| "reward_before_mean": 0.4590000305324793, |
| "reward_before_std": 0.5081056347116828, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.41554390639066696, |
| "reward_change_min": -0.6591046005487442, |
| "reward_change_std": 0.25887916050851345, |
| "reward_std": 0.5673400945961475, |
| "rewards/accuracy_reward": 0.3333333358168602, |
| "rewards/cosine_scaled_reward": 0.1256666723638773, |
| "step": 105 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1890.4791870117188, |
| "epoch": 0.12114285714285715, |
| "grad_norm": 0.02484404481947422, |
| "kl": 5.9254467487335205e-05, |
| "lambda_div_used": 0.5908190160989761, |
| "learning_rate": 5.892200842364462e-07, |
| "loss": -0.031, |
| "reward": 0.13268680218607187, |
| "reward_after_mean": 0.13268680218607187, |
| "reward_after_std": 0.5557667016983032, |
| "reward_before_mean": 0.6536997258663177, |
| "reward_before_std": 0.4149925457313657, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5210129152983427, |
| "reward_change_min": -0.7558874487876892, |
| "reward_change_std": 0.2945323744788766, |
| "reward_std": 0.555766711011529, |
| "rewards/accuracy_reward": 0.4791666753590107, |
| "rewards/cosine_scaled_reward": 0.1745330523699522, |
| "step": 106 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2767.75004196167, |
| "epoch": 0.12228571428571429, |
| "grad_norm": 0.02508847787976265, |
| "kl": 0.00016683340072631836, |
| "lambda_div_used": 0.5527122691273689, |
| "learning_rate": 5.813904131848564e-07, |
| "loss": 0.0551, |
| "reward": -0.10423782840371132, |
| "reward_after_mean": -0.10423782840371132, |
| "reward_after_std": 0.40749385207891464, |
| "reward_before_mean": 0.3917595148086548, |
| "reward_before_std": 0.23519209399819374, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4959973506629467, |
| "reward_change_min": -0.6841400265693665, |
| "reward_change_std": 0.26082153245806694, |
| "reward_std": 0.40749386698007584, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/cosine_scaled_reward": 0.07925950735807419, |
| "step": 107 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2765.1041870117188, |
| "epoch": 0.12342857142857143, |
| "grad_norm": 0.01993192359805107, |
| "kl": 0.0001267939805984497, |
| "lambda_div_used": 0.5970002189278603, |
| "learning_rate": 5.735511803093248e-07, |
| "loss": -0.0109, |
| "reward": 0.0014873668551445007, |
| "reward_after_mean": 0.0014873668551445007, |
| "reward_after_std": 0.5301447622478008, |
| "reward_before_mean": 0.4201008062809706, |
| "reward_before_std": 0.44587238878011703, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.41861344687640667, |
| "reward_change_min": -0.6076503656804562, |
| "reward_change_std": 0.24174897000193596, |
| "reward_std": 0.5301447845995426, |
| "rewards/accuracy_reward": 0.33333334140479565, |
| "rewards/cosine_scaled_reward": 0.08676748792640865, |
| "step": 108 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2777.8541946411133, |
| "epoch": 0.12457142857142857, |
| "grad_norm": 0.023633325472474098, |
| "kl": 0.0001358315348625183, |
| "lambda_div_used": 0.5594293028116226, |
| "learning_rate": 5.657047735161255e-07, |
| "loss": -0.0301, |
| "reward": -0.27439120411872864, |
| "reward_after_mean": -0.27439120411872864, |
| "reward_after_std": 0.3761340919882059, |
| "reward_before_mean": 0.09332936629652977, |
| "reward_before_std": 0.26834795251488686, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.36772056482732296, |
| "reward_change_min": -0.5622114911675453, |
| "reward_change_std": 0.20468105003237724, |
| "reward_std": 0.37613409385085106, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.07333731185644865, |
| "step": 109 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2538.7917098999023, |
| "epoch": 0.12571428571428572, |
| "grad_norm": 0.028952863067388535, |
| "kl": 0.00011619925498962402, |
| "lambda_div_used": 0.6277726292610168, |
| "learning_rate": 5.578535828967777e-07, |
| "loss": -0.0225, |
| "reward": 0.13001340767368674, |
| "reward_after_mean": 0.13001340767368674, |
| "reward_after_std": 0.6052438467741013, |
| "reward_before_mean": 0.5293129477649927, |
| "reward_before_std": 0.5916427094489336, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3992995321750641, |
| "reward_change_min": -0.6062962152063847, |
| "reward_change_std": 0.25030655320733786, |
| "reward_std": 0.6052438709884882, |
| "rewards/accuracy_reward": 0.41666668467223644, |
| "rewards/cosine_scaled_reward": 0.1126462584361434, |
| "step": 110 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2590.833351135254, |
| "epoch": 0.12685714285714286, |
| "grad_norm": 0.023588471114635468, |
| "kl": 0.00016444921493530273, |
| "lambda_div_used": 0.6258220672607422, |
| "learning_rate": 5.5e-07, |
| "loss": 0.0326, |
| "reward": 0.05748726427555084, |
| "reward_after_mean": 0.05748726427555084, |
| "reward_after_std": 0.6602962389588356, |
| "reward_before_mean": 0.4434679429978132, |
| "reward_before_std": 0.5859999302774668, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3859807029366493, |
| "reward_change_min": -0.6280175969004631, |
| "reward_change_std": 0.24166050180792809, |
| "reward_std": 0.6602962575852871, |
| "rewards/accuracy_reward": 0.33333333767950535, |
| "rewards/cosine_scaled_reward": 0.11013460718095303, |
| "step": 111 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2659.3750915527344, |
| "epoch": 0.128, |
| "grad_norm": 0.022469794377684593, |
| "kl": 0.0001360177993774414, |
| "lambda_div_used": 0.6310129314661026, |
| "learning_rate": 5.421464171032224e-07, |
| "loss": 0.0414, |
| "reward": 0.12392518669366837, |
| "reward_after_mean": 0.12392518669366837, |
| "reward_after_std": 0.6471672505140305, |
| "reward_before_mean": 0.5420562420040369, |
| "reward_before_std": 0.6111889835447073, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4181310646235943, |
| "reward_change_min": -0.6937783919274807, |
| "reward_change_std": 0.27559296786785126, |
| "reward_std": 0.6471672654151917, |
| "rewards/accuracy_reward": 0.3958333395421505, |
| "rewards/cosine_scaled_reward": 0.14622290851548314, |
| "step": 112 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1887.0000534057617, |
| "epoch": 0.12914285714285714, |
| "grad_norm": 0.034688595682382584, |
| "kl": 0.0001182258129119873, |
| "lambda_div_used": 0.6175974532961845, |
| "learning_rate": 5.342952264838747e-07, |
| "loss": 0.0045, |
| "reward": -0.10040622856467962, |
| "reward_after_mean": -0.10040622856467962, |
| "reward_after_std": 0.5642315912991762, |
| "reward_before_mean": 0.21734675765037537, |
| "reward_before_std": 0.5426071379333735, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.31775298714637756, |
| "reward_change_min": -0.5628380477428436, |
| "reward_change_std": 0.21154501475393772, |
| "reward_std": 0.564231613650918, |
| "rewards/accuracy_reward": 0.22916667349636555, |
| "rewards/cosine_scaled_reward": -0.011819899722468108, |
| "step": 113 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2079.3541870117188, |
| "epoch": 0.13028571428571428, |
| "grad_norm": 0.023916104808449745, |
| "kl": 8.407607674598694e-05, |
| "lambda_div_used": 0.5991180911660194, |
| "learning_rate": 5.264488196906752e-07, |
| "loss": 0.0147, |
| "reward": -0.25019002705812454, |
| "reward_after_mean": -0.25019002705812454, |
| "reward_after_std": 0.48966687358915806, |
| "reward_before_mean": 0.01878603477962315, |
| "reward_before_std": 0.4579437389038503, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.26897607184946537, |
| "reward_change_min": -0.4482330121099949, |
| "reward_change_std": 0.17346476390957832, |
| "reward_std": 0.4896668866276741, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/cosine_scaled_reward": -0.1478806473314762, |
| "step": 114 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2811.6041984558105, |
| "epoch": 0.13142857142857142, |
| "grad_norm": 0.02382122538983822, |
| "kl": 0.00011992454528808594, |
| "lambda_div_used": 0.6031630709767342, |
| "learning_rate": 5.186095868151436e-07, |
| "loss": 0.0298, |
| "reward": -0.024587277323007584, |
| "reward_after_mean": -0.024587277323007584, |
| "reward_after_std": 0.5156112629920244, |
| "reward_before_mean": 0.36486465483903885, |
| "reward_before_std": 0.47580901626497507, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.38945191726088524, |
| "reward_change_min": -0.6165403127670288, |
| "reward_change_std": 0.24257665127515793, |
| "reward_std": 0.5156112778931856, |
| "rewards/accuracy_reward": 0.3541666716337204, |
| "rewards/cosine_scaled_reward": 0.01069796271622181, |
| "step": 115 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3174.166679382324, |
| "epoch": 0.13257142857142856, |
| "grad_norm": 0.0280291810631752, |
| "kl": 0.000163152813911438, |
| "lambda_div_used": 0.5893594026565552, |
| "learning_rate": 5.107799157635538e-07, |
| "loss": -0.0065, |
| "reward": -0.2329910285770893, |
| "reward_after_mean": -0.2329910285770893, |
| "reward_after_std": 0.42214493826031685, |
| "reward_before_mean": 0.07756753638386726, |
| "reward_before_std": 0.4099442269653082, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.31055857613682747, |
| "reward_change_min": -0.5211558938026428, |
| "reward_change_std": 0.2011605817824602, |
| "reward_std": 0.42214495688676834, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.06826579943299294, |
| "step": 116 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2800.854232788086, |
| "epoch": 0.1337142857142857, |
| "grad_norm": 0.021884813904762268, |
| "kl": 0.00016610324382781982, |
| "lambda_div_used": 0.5740129947662354, |
| "learning_rate": 5.02962191529556e-07, |
| "loss": -0.0155, |
| "reward": -0.4151143445633352, |
| "reward_after_mean": -0.4151143445633352, |
| "reward_after_std": 0.41484479792416096, |
| "reward_before_mean": -0.17339750938117504, |
| "reward_before_std": 0.3358896663412452, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.24171681702136993, |
| "reward_change_min": -0.35830606892704964, |
| "reward_change_std": 0.12972851190716028, |
| "reward_std": 0.41484480164945126, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.21506418148055673, |
| "step": 117 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2905.0000610351562, |
| "epoch": 0.13485714285714287, |
| "grad_norm": 0.019685380160808563, |
| "kl": 0.00012448430061340332, |
| "lambda_div_used": 0.6509419903159142, |
| "learning_rate": 4.951587954676837e-07, |
| "loss": 0.0699, |
| "reward": 0.37886764854192734, |
| "reward_after_mean": 0.37886764854192734, |
| "reward_after_std": 0.8599284738302231, |
| "reward_before_mean": 0.9192078877240419, |
| "reward_before_std": 0.7036966122686863, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.54034024477005, |
| "reward_change_min": -0.8373602591454983, |
| "reward_change_std": 0.3177654892206192, |
| "reward_std": 0.8599285036325455, |
| "rewards/accuracy_reward": 0.5833333395421505, |
| "rewards/cosine_scaled_reward": 0.33587456680834293, |
| "step": 118 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1735.0416946411133, |
| "epoch": 0.136, |
| "grad_norm": 0.03752947598695755, |
| "kl": 9.926781058311462e-05, |
| "lambda_div_used": 0.615598551928997, |
| "learning_rate": 4.873721045679706e-07, |
| "loss": 0.0198, |
| "reward": 0.11371836951002479, |
| "reward_after_mean": 0.11371836951002479, |
| "reward_after_std": 0.608397152274847, |
| "reward_before_mean": 0.5552113465964794, |
| "reward_before_std": 0.5343325138092041, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.44149295426905155, |
| "reward_change_min": -0.6740178428590298, |
| "reward_change_std": 0.26589817740023136, |
| "reward_std": 0.6083971671760082, |
| "rewards/accuracy_reward": 0.37500000931322575, |
| "rewards/cosine_scaled_reward": 0.18021131958812475, |
| "step": 119 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2124.3958702087402, |
| "epoch": 0.13714285714285715, |
| "grad_norm": 0.03277261555194855, |
| "kl": 0.00016069412231445312, |
| "lambda_div_used": 0.621355876326561, |
| "learning_rate": 4.79604490731896e-07, |
| "loss": -0.0313, |
| "reward": 0.030127520207315683, |
| "reward_after_mean": 0.030127520207315683, |
| "reward_after_std": 0.6104090996086597, |
| "reward_before_mean": 0.3965782462619245, |
| "reward_before_std": 0.5573655245825648, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.36645070649683475, |
| "reward_change_min": -0.5546461082994938, |
| "reward_change_std": 0.21761172730475664, |
| "reward_std": 0.6104091145098209, |
| "rewards/accuracy_reward": 0.3541666753590107, |
| "rewards/cosine_scaled_reward": 0.04241155949421227, |
| "step": 120 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1674.895866394043, |
| "epoch": 0.1382857142857143, |
| "grad_norm": 0.028996704146265984, |
| "kl": 0.00010481476783752441, |
| "lambda_div_used": 0.5694864094257355, |
| "learning_rate": 4.7185832004988133e-07, |
| "loss": 0.0117, |
| "reward": -0.15189427509903908, |
| "reward_after_mean": -0.15189427509903908, |
| "reward_after_std": 0.4316890323534608, |
| "reward_before_mean": 0.2471799086779356, |
| "reward_before_std": 0.3141373130492866, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3990741856396198, |
| "reward_change_min": -0.5547297224402428, |
| "reward_change_std": 0.21437653806060553, |
| "reward_std": 0.43168904818594456, |
| "rewards/accuracy_reward": 0.25000000558793545, |
| "rewards/cosine_scaled_reward": -0.002820094407070428, |
| "step": 121 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2674.1666870117188, |
| "epoch": 0.13942857142857143, |
| "grad_norm": 0.02413249760866165, |
| "kl": 0.00017333030700683594, |
| "lambda_div_used": 0.5866860523819923, |
| "learning_rate": 4.641359520805548e-07, |
| "loss": -0.0399, |
| "reward": 0.04396146908402443, |
| "reward_after_mean": 0.04396146908402443, |
| "reward_after_std": 0.5285588596016169, |
| "reward_before_mean": 0.5353102702647448, |
| "reward_before_std": 0.40454091038554907, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4913488235324621, |
| "reward_change_min": -0.7136706411838531, |
| "reward_change_std": 0.2844190578907728, |
| "reward_std": 0.5285588726401329, |
| "rewards/accuracy_reward": 0.3958333358168602, |
| "rewards/cosine_scaled_reward": 0.13947694562375546, |
| "step": 122 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2401.270881652832, |
| "epoch": 0.14057142857142857, |
| "grad_norm": 0.021569611504673958, |
| "kl": 0.00012353062629699707, |
| "lambda_div_used": 0.560750350356102, |
| "learning_rate": 4.5643973913200837e-07, |
| "loss": -0.0352, |
| "reward": -0.11011217907071114, |
| "reward_after_mean": -0.11011217907071114, |
| "reward_after_std": 0.43483646027743816, |
| "reward_before_mean": 0.3471109885722399, |
| "reward_before_std": 0.2799733504652977, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4572231862694025, |
| "reward_change_min": -0.6473490297794342, |
| "reward_change_std": 0.2491364972665906, |
| "reward_std": 0.4348364770412445, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/cosine_scaled_reward": 0.034610994160175323, |
| "step": 123 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2023.1875381469727, |
| "epoch": 0.1417142857142857, |
| "grad_norm": 0.024433298036456108, |
| "kl": 8.402764797210693e-05, |
| "lambda_div_used": 0.5741237699985504, |
| "learning_rate": 4.4877202554526084e-07, |
| "loss": 0.0177, |
| "reward": 0.11256878264248371, |
| "reward_after_mean": 0.11256878264248371, |
| "reward_after_std": 0.5534880999475718, |
| "reward_before_mean": 0.7005790947005153, |
| "reward_before_std": 0.34279677364975214, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5880102626979351, |
| "reward_change_min": -0.8437007814645767, |
| "reward_change_std": 0.32156766299158335, |
| "reward_std": 0.5534881185740232, |
| "rewards/accuracy_reward": 0.4583333358168602, |
| "rewards/cosine_scaled_reward": 0.2422457179054618, |
| "step": 124 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2390.12504196167, |
| "epoch": 0.14285714285714285, |
| "grad_norm": 0.027825474739074707, |
| "kl": 9.434670209884644e-05, |
| "lambda_div_used": 0.5816653594374657, |
| "learning_rate": 4.4113514698014953e-07, |
| "loss": -0.1441, |
| "reward": -0.022026576101779938, |
| "reward_after_mean": -0.022026576101779938, |
| "reward_after_std": 0.5128289703279734, |
| "reward_before_mean": 0.4427599459886551, |
| "reward_before_std": 0.37189827114343643, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4647865351289511, |
| "reward_change_min": -0.6719168424606323, |
| "reward_change_std": 0.2539686840027571, |
| "reward_std": 0.5128289721906185, |
| "rewards/accuracy_reward": 0.33333333395421505, |
| "rewards/cosine_scaled_reward": 0.10942662274464965, |
| "step": 125 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2348.625030517578, |
| "epoch": 0.144, |
| "grad_norm": 0.024593623355031013, |
| "kl": 0.0001001209020614624, |
| "lambda_div_used": 0.5944257900118828, |
| "learning_rate": 4.3353142970386557e-07, |
| "loss": 0.0235, |
| "reward": -0.036125872284173965, |
| "reward_after_mean": -0.036125872284173965, |
| "reward_after_std": 0.5458226818591356, |
| "reward_before_mean": 0.3889114623889327, |
| "reward_before_std": 0.43509659357368946, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4250373411923647, |
| "reward_change_min": -0.6385823003947735, |
| "reward_change_std": 0.24260270595550537, |
| "reward_std": 0.5458226818591356, |
| "rewards/accuracy_reward": 0.29166667349636555, |
| "rewards/cosine_scaled_reward": 0.09724476374685764, |
| "step": 126 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3143.229217529297, |
| "epoch": 0.14514285714285713, |
| "grad_norm": 0.021421613171696663, |
| "kl": 0.0001538693904876709, |
| "lambda_div_used": 0.5594507232308388, |
| "learning_rate": 4.2596318988235037e-07, |
| "loss": -0.0322, |
| "reward": -0.4113082066178322, |
| "reward_after_mean": -0.4113082066178322, |
| "reward_after_std": 0.34261589869856834, |
| "reward_before_mean": -0.14389780722558498, |
| "reward_before_std": 0.269620718434453, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2674104031175375, |
| "reward_change_min": -0.4196575991809368, |
| "reward_change_std": 0.15403888188302517, |
| "reward_std": 0.3426159042865038, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/cosine_scaled_reward": -0.20639780443161726, |
| "step": 127 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2329.5208740234375, |
| "epoch": 0.1462857142857143, |
| "grad_norm": 0.025876455008983612, |
| "kl": 0.00014778971672058105, |
| "lambda_div_used": 0.5925442725419998, |
| "learning_rate": 4.1843273287476854e-07, |
| "loss": -0.0218, |
| "reward": 0.04296835511922836, |
| "reward_after_mean": 0.04296835511922836, |
| "reward_after_std": 0.5716155916452408, |
| "reward_before_mean": 0.5175326284952462, |
| "reward_before_std": 0.4212250765413046, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4745642766356468, |
| "reward_change_min": -0.6585596464574337, |
| "reward_change_std": 0.25283501856029034, |
| "reward_std": 0.571615606546402, |
| "rewards/accuracy_reward": 0.3958333395421505, |
| "rewards/cosine_scaled_reward": 0.12169927265495062, |
| "step": 128 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3137.4583892822266, |
| "epoch": 0.14742857142857144, |
| "grad_norm": 0.018007410690188408, |
| "kl": 0.00014001131057739258, |
| "lambda_div_used": 0.6392767652869225, |
| "learning_rate": 4.1094235253127374e-07, |
| "loss": 0.0746, |
| "reward": -0.024870820343494415, |
| "reward_after_mean": -0.024870820343494415, |
| "reward_after_std": 0.6713957078754902, |
| "reward_before_mean": 0.2818590197712183, |
| "reward_before_std": 0.6434567552059889, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.30672984197735786, |
| "reward_change_min": -0.5197948329150677, |
| "reward_change_std": 0.20186646562069654, |
| "reward_std": 0.6713957078754902, |
| "rewards/accuracy_reward": 0.2500000074505806, |
| "rewards/cosine_scaled_reward": 0.031859016977250576, |
| "step": 129 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3087.104202270508, |
| "epoch": 0.14857142857142858, |
| "grad_norm": 0.020643344148993492, |
| "kl": 0.00017881393432617188, |
| "lambda_div_used": 0.6044124737381935, |
| "learning_rate": 4.034943304942796e-07, |
| "loss": -0.0175, |
| "reward": -0.250907301902771, |
| "reward_after_mean": -0.250907301902771, |
| "reward_after_std": 0.514793710783124, |
| "reward_before_mean": 0.01573239639401436, |
| "reward_before_std": 0.4839063249528408, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.26663970574736595, |
| "reward_change_min": -0.5019430406391621, |
| "reward_change_std": 0.1815296784043312, |
| "reward_std": 0.5147937145084143, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.1301009338349104, |
| "step": 130 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2290.5000381469727, |
| "epoch": 0.14971428571428572, |
| "grad_norm": 0.025495275855064392, |
| "kl": 0.00014261901378631592, |
| "lambda_div_used": 0.585864968597889, |
| "learning_rate": 3.9609093550344907e-07, |
| "loss": -0.0345, |
| "reward": -0.033165013417601585, |
| "reward_after_mean": -0.033165013417601585, |
| "reward_after_std": 0.5260650478303432, |
| "reward_before_mean": 0.42901195771992207, |
| "reward_before_std": 0.3989385652821511, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4621769953519106, |
| "reward_change_min": -0.7094772532582283, |
| "reward_change_std": 0.26999812573194504, |
| "reward_std": 0.5260650608688593, |
| "rewards/accuracy_reward": 0.3541666679084301, |
| "rewards/cosine_scaled_reward": 0.07484527863562107, |
| "step": 131 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2628.979202270508, |
| "epoch": 0.15085714285714286, |
| "grad_norm": 0.021410632878541946, |
| "kl": 0.00011786073446273804, |
| "lambda_div_used": 0.6330231353640556, |
| "learning_rate": 3.8873442270461485e-07, |
| "loss": -0.0163, |
| "reward": 0.21169579401612282, |
| "reward_after_mean": 0.21169579401612282, |
| "reward_after_std": 0.6052236501127481, |
| "reward_before_mean": 0.6431907135993242, |
| "reward_before_std": 0.611154742538929, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.43149489536881447, |
| "reward_change_min": -0.7013396099209785, |
| "reward_change_std": 0.279757896438241, |
| "reward_std": 0.605223661288619, |
| "rewards/accuracy_reward": 0.4375000186264515, |
| "rewards/cosine_scaled_reward": 0.20569069124758244, |
| "step": 132 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3043.916702270508, |
| "epoch": 0.152, |
| "grad_norm": 0.020860377699136734, |
| "kl": 0.00016830861568450928, |
| "lambda_div_used": 0.577904686331749, |
| "learning_rate": 3.8142703296283953e-07, |
| "loss": -0.0229, |
| "reward": -0.311251699924469, |
| "reward_after_mean": -0.311251699924469, |
| "reward_after_std": 0.40735830925405025, |
| "reward_before_mean": -0.022546445950865746, |
| "reward_before_std": 0.35403214395046234, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28870525024831295, |
| "reward_change_min": -0.4795113056898117, |
| "reward_change_std": 0.175603779964149, |
| "reward_std": 0.40735832042992115, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.16837978060357273, |
| "step": 133 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2296.2500228881836, |
| "epoch": 0.15314285714285714, |
| "grad_norm": 0.025448989123106003, |
| "kl": 0.00012351572513580322, |
| "lambda_div_used": 0.6253047212958336, |
| "learning_rate": 3.7417099217982686e-07, |
| "loss": 0.0232, |
| "reward": 0.1032534665428102, |
| "reward_after_mean": 0.1032534665428102, |
| "reward_after_std": 0.6201032679527998, |
| "reward_before_mean": 0.527151208370924, |
| "reward_before_std": 0.5803719013929367, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4238977525383234, |
| "reward_change_min": -0.7053604945540428, |
| "reward_change_std": 0.27356533519923687, |
| "reward_std": 0.6201032791286707, |
| "rewards/accuracy_reward": 0.3958333432674408, |
| "rewards/cosine_scaled_reward": 0.1313178651034832, |
| "step": 134 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1375.6667098999023, |
| "epoch": 0.15428571428571428, |
| "grad_norm": 0.042060088366270065, |
| "kl": 7.014349102973938e-05, |
| "lambda_div_used": 0.6157513931393623, |
| "learning_rate": 3.6696851061588994e-07, |
| "loss": -0.0865, |
| "reward": 0.32518790662288666, |
| "reward_after_mean": 0.32518790662288666, |
| "reward_after_std": 0.7028943486511707, |
| "reward_before_mean": 0.9168294770643115, |
| "reward_before_std": 0.5377248618751764, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5916415732353926, |
| "reward_change_min": -0.8611635379493237, |
| "reward_change_std": 0.3413122948259115, |
| "reward_std": 0.7028943561017513, |
| "rewards/accuracy_reward": 0.6041666734963655, |
| "rewards/cosine_scaled_reward": 0.31266278121620417, |
| "step": 135 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2304.375030517578, |
| "epoch": 0.15542857142857142, |
| "grad_norm": 0.0248978603631258, |
| "kl": 0.00010488927364349365, |
| "lambda_div_used": 0.6134930327534676, |
| "learning_rate": 3.5982178221668533e-07, |
| "loss": -0.0298, |
| "reward": 0.14305459149181843, |
| "reward_after_mean": 0.14305459149181843, |
| "reward_after_std": 0.6303485874086618, |
| "reward_before_mean": 0.6280363164842129, |
| "reward_before_std": 0.5314226988703012, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4849817119538784, |
| "reward_change_min": -0.7693819738924503, |
| "reward_change_std": 0.29691250063478947, |
| "reward_std": 0.6303485967218876, |
| "rewards/accuracy_reward": 0.4375000037252903, |
| "rewards/cosine_scaled_reward": 0.19053628714755177, |
| "step": 136 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2849.312530517578, |
| "epoch": 0.15657142857142858, |
| "grad_norm": 0.019165532663464546, |
| "kl": 0.00011494755744934082, |
| "lambda_div_used": 0.6259770095348358, |
| "learning_rate": 3.5273298394491515e-07, |
| "loss": 0.0001, |
| "reward": -0.2035258673131466, |
| "reward_after_mean": -0.2035258673131466, |
| "reward_after_std": 0.6123348288238049, |
| "reward_before_mean": 0.05552574759349227, |
| "reward_before_std": 0.5897263735532761, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.25905160419642925, |
| "reward_change_min": -0.49531829729676247, |
| "reward_change_std": 0.184982025064528, |
| "reward_std": 0.6123348399996758, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/cosine_scaled_reward": -0.11114091548370197, |
| "step": 137 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2511.0625381469727, |
| "epoch": 0.15771428571428572, |
| "grad_norm": 0.02206576056778431, |
| "kl": 9.766221046447754e-05, |
| "lambda_div_used": 0.5779839232563972, |
| "learning_rate": 3.45704275117204e-07, |
| "loss": -0.0123, |
| "reward": -0.23054278269410133, |
| "reward_after_mean": -0.23054278269410133, |
| "reward_after_std": 0.47196367010474205, |
| "reward_before_mean": 0.11714623775333166, |
| "reward_before_std": 0.35360280703753233, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.34768899716436863, |
| "reward_change_min": -0.5065655931830406, |
| "reward_change_std": 0.18736570980399847, |
| "reward_std": 0.4719636719673872, |
| "rewards/accuracy_reward": 0.20833333395421505, |
| "rewards/cosine_scaled_reward": -0.09118711110204458, |
| "step": 138 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2817.166702270508, |
| "epoch": 0.15885714285714286, |
| "grad_norm": 0.025601239874958992, |
| "kl": 0.00013655424118041992, |
| "lambda_div_used": 0.6219307482242584, |
| "learning_rate": 3.387377967463493e-07, |
| "loss": -0.0354, |
| "reward": -0.07538405619561672, |
| "reward_after_mean": -0.07538405619561672, |
| "reward_after_std": 0.5752126723527908, |
| "reward_before_mean": 0.24527974613010883, |
| "reward_before_std": 0.5667949663475156, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3206638339906931, |
| "reward_change_min": -0.556602880358696, |
| "reward_change_std": 0.21935877669602633, |
| "reward_std": 0.5752126909792423, |
| "rewards/accuracy_reward": 0.2708333432674408, |
| "rewards/cosine_scaled_reward": -0.02555358811514452, |
| "step": 139 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3020.0833740234375, |
| "epoch": 0.16, |
| "grad_norm": 0.020280320197343826, |
| "kl": 0.00016957521438598633, |
| "lambda_div_used": 0.623006746172905, |
| "learning_rate": 3.3183567088914833e-07, |
| "loss": 0.0263, |
| "reward": 0.048921750858426094, |
| "reward_after_mean": 0.048921750858426094, |
| "reward_after_std": 0.6539230048656464, |
| "reward_before_mean": 0.46376091009005904, |
| "reward_before_std": 0.5685575436800718, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4148391764611006, |
| "reward_change_min": -0.6308448016643524, |
| "reward_change_std": 0.2505591865628958, |
| "reward_std": 0.6539230197668076, |
| "rewards/accuracy_reward": 0.3750000074505806, |
| "rewards/cosine_scaled_reward": 0.08876088261604309, |
| "step": 140 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2796.916732788086, |
| "epoch": 0.16114285714285714, |
| "grad_norm": 0.019921308383345604, |
| "kl": 0.00011113286018371582, |
| "lambda_div_used": 0.6025099903345108, |
| "learning_rate": 3.250000000000001e-07, |
| "loss": 0.0527, |
| "reward": -0.28436860628426075, |
| "reward_after_mean": -0.28436860628426075, |
| "reward_after_std": 0.5148810762912035, |
| "reward_before_mean": -0.02774716354906559, |
| "reward_before_std": 0.46978663094341755, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2566214445978403, |
| "reward_change_min": -0.41403992287814617, |
| "reward_change_std": 0.15458690002560616, |
| "reward_std": 0.5148810893297195, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/cosine_scaled_reward": -0.1527471598237753, |
| "step": 141 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2619.1458587646484, |
| "epoch": 0.16228571428571428, |
| "grad_norm": 0.019752761349081993, |
| "kl": 0.00013134628534317017, |
| "lambda_div_used": 0.595375120639801, |
| "learning_rate": 3.182328662904756e-07, |
| "loss": -0.0208, |
| "reward": -0.15498719364404678, |
| "reward_after_mean": -0.15498719364404678, |
| "reward_after_std": 0.5081784036010504, |
| "reward_before_mean": 0.1970929354429245, |
| "reward_before_std": 0.4446716960519552, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3520801328122616, |
| "reward_change_min": -0.6091671586036682, |
| "reward_change_std": 0.22200345993041992, |
| "reward_std": 0.508178411051631, |
| "rewards/accuracy_reward": 0.2291666679084301, |
| "rewards/cosine_scaled_reward": -0.032073733396828175, |
| "step": 142 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2416.2500762939453, |
| "epoch": 0.16342857142857142, |
| "grad_norm": 0.02633557841181755, |
| "kl": 0.00012940168380737305, |
| "lambda_div_used": 0.6070521473884583, |
| "learning_rate": 3.115363310950578e-07, |
| "loss": 0.0473, |
| "reward": -0.1759424265474081, |
| "reward_after_mean": -0.1759424265474081, |
| "reward_after_std": 0.5199048612266779, |
| "reward_before_mean": 0.1274722833186388, |
| "reward_before_std": 0.4973313231021166, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3034147098660469, |
| "reward_change_min": -0.5472985841333866, |
| "reward_change_std": 0.20386416278779507, |
| "reward_std": 0.5199048724025488, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.03919439576566219, |
| "step": 143 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2497.0416946411133, |
| "epoch": 0.16457142857142856, |
| "grad_norm": 0.035163138061761856, |
| "kl": 0.00010834634304046631, |
| "lambda_div_used": 0.5917740762233734, |
| "learning_rate": 3.0491243424323783e-07, |
| "loss": 0.0908, |
| "reward": 0.24967988207936287, |
| "reward_after_mean": 0.24967988207936287, |
| "reward_after_std": 0.5445964094251394, |
| "reward_before_mean": 0.8210294228047132, |
| "reward_before_std": 0.4194592139683664, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5713495649397373, |
| "reward_change_min": -0.7989893518388271, |
| "reward_change_std": 0.3215014720335603, |
| "reward_std": 0.5445964206010103, |
| "rewards/accuracy_reward": 0.541666679084301, |
| "rewards/cosine_scaled_reward": 0.2793627381324768, |
| "step": 144 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1880.750015258789, |
| "epoch": 0.1657142857142857, |
| "grad_norm": 0.03040655143558979, |
| "kl": 9.316205978393555e-05, |
| "lambda_div_used": 0.6018117442727089, |
| "learning_rate": 2.9836319343816397e-07, |
| "loss": -0.0063, |
| "reward": 0.0018871724605560303, |
| "reward_after_mean": 0.0018871724605560303, |
| "reward_after_std": 0.6153606176376343, |
| "reward_before_mean": 0.45608025789260864, |
| "reward_before_std": 0.47093176282942295, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.45419312454760075, |
| "reward_change_min": -0.7000111788511276, |
| "reward_change_std": 0.26173546724021435, |
| "reward_std": 0.6153606250882149, |
| "rewards/accuracy_reward": 0.35416666977107525, |
| "rewards/cosine_scaled_reward": 0.10191359603777528, |
| "step": 145 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1995.1041793823242, |
| "epoch": 0.16685714285714287, |
| "grad_norm": 0.02280835248529911, |
| "kl": 9.866058826446533e-05, |
| "lambda_div_used": 0.5822854116559029, |
| "learning_rate": 2.918906036420294e-07, |
| "loss": 0.0731, |
| "reward": -0.3418470360338688, |
| "reward_after_mean": -0.3418470360338688, |
| "reward_after_std": 0.4274127297103405, |
| "reward_before_mean": -0.07948943041265011, |
| "reward_before_std": 0.38051687460392714, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2623576056212187, |
| "reward_change_min": -0.453898411244154, |
| "reward_change_std": 0.1681989086791873, |
| "reward_std": 0.4274127408862114, |
| "rewards/accuracy_reward": 0.1041666679084301, |
| "rewards/cosine_scaled_reward": -0.18365611135959625, |
| "step": 146 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3414.4166870117188, |
| "epoch": 0.168, |
| "grad_norm": 0.017901504412293434, |
| "kl": 0.00017184019088745117, |
| "lambda_div_used": 0.6236077323555946, |
| "learning_rate": 2.854966364683872e-07, |
| "loss": 0.0225, |
| "reward": -0.0833415687084198, |
| "reward_after_mean": -0.0833415687084198, |
| "reward_after_std": 0.5858507957309484, |
| "reward_before_mean": 0.216837452724576, |
| "reward_before_std": 0.5764442849904299, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.30017900839447975, |
| "reward_change_min": -0.5037183798849583, |
| "reward_change_std": 0.2055044947192073, |
| "reward_std": 0.5858508311212063, |
| "rewards/accuracy_reward": 0.22916666977107525, |
| "rewards/cosine_scaled_reward": -0.012329218676313758, |
| "step": 147 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2291.000030517578, |
| "epoch": 0.16914285714285715, |
| "grad_norm": 0.021171187981963158, |
| "kl": 0.00012093409895896912, |
| "lambda_div_used": 0.5844326093792915, |
| "learning_rate": 2.791832395815782e-07, |
| "loss": 0.0386, |
| "reward": -0.15509312599897385, |
| "reward_after_mean": -0.15509312599897385, |
| "reward_after_std": 0.48159872740507126, |
| "reward_before_mean": 0.22943633235991, |
| "reward_before_std": 0.39098774176090956, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3845294751226902, |
| "reward_change_min": -0.6122906021773815, |
| "reward_change_std": 0.23033427819609642, |
| "reward_std": 0.48159876093268394, |
| "rewards/accuracy_reward": 0.2291666679084301, |
| "rewards/cosine_scaled_reward": 0.00026967376470565796, |
| "step": 148 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2509.0416870117188, |
| "epoch": 0.1702857142857143, |
| "grad_norm": 0.020010385662317276, |
| "kl": 8.110702037811279e-05, |
| "lambda_div_used": 0.5822181403636932, |
| "learning_rate": 2.729523361034538e-07, |
| "loss": 0.0208, |
| "reward": 0.08701876550912857, |
| "reward_after_mean": 0.08701876550912857, |
| "reward_after_std": 0.5091588757932186, |
| "reward_before_mean": 0.598341865465045, |
| "reward_before_std": 0.3779993327334523, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5113231185823679, |
| "reward_change_min": -0.740135669708252, |
| "reward_change_std": 0.2875883989036083, |
| "reward_std": 0.5091589000076056, |
| "rewards/accuracy_reward": 0.4166666716337204, |
| "rewards/cosine_scaled_reward": 0.18167520873248577, |
| "step": 149 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2679.208381652832, |
| "epoch": 0.17142857142857143, |
| "grad_norm": 0.025659440085291862, |
| "kl": 0.00014823675155639648, |
| "lambda_div_used": 0.6290425136685371, |
| "learning_rate": 2.6680582402757324e-07, |
| "loss": -0.0233, |
| "reward": -0.049642632249742746, |
| "reward_after_mean": -0.049642632249742746, |
| "reward_after_std": 0.5986230112612247, |
| "reward_before_mean": 0.2686304301023483, |
| "reward_before_std": 0.6011289358139038, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.31827306374907494, |
| "reward_change_min": -0.5840066187083721, |
| "reward_change_std": 0.2246640883386135, |
| "reward_std": 0.598623014986515, |
| "rewards/accuracy_reward": 0.2708333432674408, |
| "rewards/cosine_scaled_reward": -0.002202920615673065, |
| "step": 150 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2462.5625610351562, |
| "epoch": 0.17257142857142857, |
| "grad_norm": 0.03043326921761036, |
| "kl": 0.0001488029956817627, |
| "lambda_div_used": 0.6573176011443138, |
| "learning_rate": 2.6074557564105724e-07, |
| "loss": 0.0929, |
| "reward": 0.2124087940901518, |
| "reward_after_mean": 0.2124087940901518, |
| "reward_after_std": 0.762018321081996, |
| "reward_before_mean": 0.6105309925042093, |
| "reward_before_std": 0.7428378090262413, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.39812218956649303, |
| "reward_change_min": -0.6643032841384411, |
| "reward_change_std": 0.271884405054152, |
| "reward_std": 0.7620183527469635, |
| "rewards/accuracy_reward": 0.43750000558793545, |
| "rewards/cosine_scaled_reward": 0.17303097806870937, |
| "step": 151 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2846.875072479248, |
| "epoch": 0.1737142857142857, |
| "grad_norm": 0.02812943048775196, |
| "kl": 0.00018167495727539062, |
| "lambda_div_used": 0.599166102707386, |
| "learning_rate": 2.547734369542718e-07, |
| "loss": -0.0069, |
| "reward": -0.32120730075985193, |
| "reward_after_mean": -0.32120730075985193, |
| "reward_after_std": 0.5156350377947092, |
| "reward_before_mean": -0.06628246325999498, |
| "reward_before_std": 0.4535220582038164, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.25492484122514725, |
| "reward_change_min": -0.42022984474897385, |
| "reward_change_std": 0.14956693351268768, |
| "reward_std": 0.5156350489705801, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.1496157981455326, |
| "step": 152 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2625.770896911621, |
| "epoch": 0.17485714285714285, |
| "grad_norm": 0.02562631107866764, |
| "kl": 0.0001496821641921997, |
| "lambda_div_used": 0.5755600407719612, |
| "learning_rate": 2.488912271385139e-07, |
| "loss": 0.0383, |
| "reward": -0.2726967688649893, |
| "reward_after_mean": -0.2726967688649893, |
| "reward_after_std": 0.43943885155022144, |
| "reward_before_mean": 0.054511758498847485, |
| "reward_before_std": 0.3466099677607417, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3272085413336754, |
| "reward_change_min": -0.4877549596130848, |
| "reward_change_std": 0.18566382955759764, |
| "reward_std": 0.4394388683140278, |
| "rewards/accuracy_reward": 0.20833333395421505, |
| "rewards/cosine_scaled_reward": -0.15382158383727074, |
| "step": 153 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2930.2083587646484, |
| "epoch": 0.176, |
| "grad_norm": 0.01869855262339115, |
| "kl": 0.00013381242752075195, |
| "lambda_div_used": 0.6487007588148117, |
| "learning_rate": 2.4310073797187573e-07, |
| "loss": 0.0074, |
| "reward": 0.17745468392968178, |
| "reward_after_mean": 0.17745468392968178, |
| "reward_after_std": 0.6683135256171227, |
| "reward_before_mean": 0.554833997040987, |
| "reward_before_std": 0.6959163639694452, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.37737933173775673, |
| "reward_change_min": -0.6365409940481186, |
| "reward_change_std": 0.26569664292037487, |
| "reward_std": 0.6683135367929935, |
| "rewards/accuracy_reward": 0.3958333432674408, |
| "rewards/cosine_scaled_reward": 0.15900065936148167, |
| "step": 154 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2324.7083587646484, |
| "epoch": 0.17714285714285713, |
| "grad_norm": 0.03042282536625862, |
| "kl": 0.0001443326473236084, |
| "lambda_div_used": 0.6021355092525482, |
| "learning_rate": 2.374037332934512e-07, |
| "loss": -0.0708, |
| "reward": 0.013361499644815922, |
| "reward_after_mean": 0.013361499644815922, |
| "reward_after_std": 0.608528571203351, |
| "reward_before_mean": 0.4701185021549463, |
| "reward_before_std": 0.4722642693668604, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4567570425570011, |
| "reward_change_min": -0.728898536413908, |
| "reward_change_std": 0.2705167792737484, |
| "reward_std": 0.6085285805165768, |
| "rewards/accuracy_reward": 0.3958333358168602, |
| "rewards/cosine_scaled_reward": 0.07428519520908594, |
| "step": 155 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2742.041702270508, |
| "epoch": 0.1782857142857143, |
| "grad_norm": 0.023997837677598, |
| "kl": 0.00013341009616851807, |
| "lambda_div_used": 0.6111876517534256, |
| "learning_rate": 2.3180194846605364e-07, |
| "loss": 0.0104, |
| "reward": -0.11058625392615795, |
| "reward_after_mean": -0.11058625392615795, |
| "reward_after_std": 0.5431522708386183, |
| "reward_before_mean": 0.20433677232358605, |
| "reward_before_std": 0.5068034324795008, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.31492303870618343, |
| "reward_change_min": -0.4863894209265709, |
| "reward_change_std": 0.18972175009548664, |
| "reward_std": 0.5431522782891989, |
| "rewards/accuracy_reward": 0.2291666753590107, |
| "rewards/cosine_scaled_reward": -0.024829893372952938, |
| "step": 156 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2665.875045776367, |
| "epoch": 0.17942857142857144, |
| "grad_norm": 0.02073371410369873, |
| "kl": 0.00014007091522216797, |
| "lambda_div_used": 0.5806600153446198, |
| "learning_rate": 2.2629708984760706e-07, |
| "loss": 0.0406, |
| "reward": -0.2642064723186195, |
| "reward_after_mean": -0.2642064723186195, |
| "reward_after_std": 0.48669449612498283, |
| "reward_before_mean": 0.06258813291788101, |
| "reward_before_std": 0.3662749119102955, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.32679460756480694, |
| "reward_change_min": -0.473216038197279, |
| "reward_change_std": 0.1716562630608678, |
| "reward_std": 0.48669449612498283, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.10407854370714631, |
| "step": 157 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2338.8333740234375, |
| "epoch": 0.18057142857142858, |
| "grad_norm": 0.02651275135576725, |
| "kl": 0.00011517666280269623, |
| "lambda_div_used": 0.6631434112787247, |
| "learning_rate": 2.2089083427137329e-07, |
| "loss": 0.0501, |
| "reward": 0.14301904384046793, |
| "reward_after_mean": 0.14301904384046793, |
| "reward_after_std": 0.8312356304377317, |
| "reward_before_mean": 0.5320943212136626, |
| "reward_before_std": 0.7637526150792837, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.38907529041171074, |
| "reward_change_min": -0.694700576364994, |
| "reward_change_std": 0.26256909035146236, |
| "reward_std": 0.8312356378883123, |
| "rewards/accuracy_reward": 0.3750000074505806, |
| "rewards/cosine_scaled_reward": 0.15709431585855782, |
| "step": 158 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3239.312530517578, |
| "epoch": 0.18171428571428572, |
| "grad_norm": 0.016524603590369225, |
| "kl": 0.0001583099365234375, |
| "lambda_div_used": 0.623667947947979, |
| "learning_rate": 2.1558482853517253e-07, |
| "loss": -0.0341, |
| "reward": -0.1187703013420105, |
| "reward_after_mean": -0.1187703013420105, |
| "reward_after_std": 0.5951940100640059, |
| "reward_before_mean": 0.16929386125411838, |
| "reward_before_std": 0.5728897508233786, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28806419111788273, |
| "reward_change_min": -0.4665379598736763, |
| "reward_change_std": 0.1831390606239438, |
| "reward_std": 0.5951940137892962, |
| "rewards/accuracy_reward": 0.2291666753590107, |
| "rewards/cosine_scaled_reward": -0.05987279675900936, |
| "step": 159 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2803.312530517578, |
| "epoch": 0.18285714285714286, |
| "grad_norm": 0.026071852073073387, |
| "kl": 0.00017073750495910645, |
| "lambda_div_used": 0.6305168867111206, |
| "learning_rate": 2.1038068889975259e-07, |
| "loss": -0.0459, |
| "reward": 0.010341526940464973, |
| "reward_after_mean": 0.010341526940464973, |
| "reward_after_std": 0.6011195741593838, |
| "reward_before_mean": 0.3497283663600683, |
| "reward_before_std": 0.6088744457811117, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.33938686549663544, |
| "reward_change_min": -0.5917558334767818, |
| "reward_change_std": 0.2366197258234024, |
| "reward_std": 0.6011195983737707, |
| "rewards/accuracy_reward": 0.2916666753590107, |
| "rewards/cosine_scaled_reward": 0.05806170590221882, |
| "step": 160 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2208.9167098999023, |
| "epoch": 0.184, |
| "grad_norm": 0.0240344051271677, |
| "kl": 0.000129062682390213, |
| "lambda_div_used": 0.6582028865814209, |
| "learning_rate": 2.0528000059645995e-07, |
| "loss": 0.0279, |
| "reward": 0.07936648279428482, |
| "reward_after_mean": 0.07936648279428482, |
| "reward_after_std": 0.7546874471008778, |
| "reward_before_mean": 0.40869135939283296, |
| "reward_before_std": 0.7339576873928308, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.32932490296661854, |
| "reward_change_min": -0.6056464668363333, |
| "reward_change_std": 0.22207134775817394, |
| "reward_std": 0.7546874955296516, |
| "rewards/accuracy_reward": 0.3541666716337204, |
| "rewards/cosine_scaled_reward": 0.05452469550073147, |
| "step": 161 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3135.0208892822266, |
| "epoch": 0.18514285714285714, |
| "grad_norm": 0.022219210863113403, |
| "kl": 0.0001678466796875, |
| "lambda_div_used": 0.5934342220425606, |
| "learning_rate": 2.0028431734436308e-07, |
| "loss": -0.0607, |
| "reward": -0.046884071081876755, |
| "reward_after_mean": -0.046884071081876755, |
| "reward_after_std": 0.5110116824507713, |
| "reward_before_mean": 0.3527396023273468, |
| "reward_before_std": 0.4275300269946456, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.39962366595864296, |
| "reward_change_min": -0.5722533725202084, |
| "reward_change_std": 0.22872111946344376, |
| "reward_std": 0.5110117141157389, |
| "rewards/accuracy_reward": 0.33333334140479565, |
| "rewards/cosine_scaled_reward": 0.019406253471970558, |
| "step": 162 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2319.9791870117188, |
| "epoch": 0.18628571428571428, |
| "grad_norm": 0.025694716721773148, |
| "kl": 0.00014371052384376526, |
| "lambda_div_used": 0.5704625844955444, |
| "learning_rate": 1.9539516087697517e-07, |
| "loss": 0.0916, |
| "reward": 0.07464592158794403, |
| "reward_after_mean": 0.07464592158794403, |
| "reward_after_std": 0.49546825513243675, |
| "reward_before_mean": 0.6325086355209351, |
| "reward_before_std": 0.32235115580260754, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.557862676680088, |
| "reward_change_min": -0.7940906882286072, |
| "reward_change_std": 0.3045828063040972, |
| "reward_std": 0.49546825885772705, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/cosine_scaled_reward": 0.19500861689448357, |
| "step": 163 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2167.750045776367, |
| "epoch": 0.18742857142857142, |
| "grad_norm": 0.0288226380944252, |
| "kl": 0.0001278519630432129, |
| "lambda_div_used": 0.6208588480949402, |
| "learning_rate": 1.9061402047871833e-07, |
| "loss": 0.0045, |
| "reward": 0.052167763307807036, |
| "reward_after_mean": 0.052167763307807036, |
| "reward_after_std": 0.627906009554863, |
| "reward_before_mean": 0.44521861523389816, |
| "reward_before_std": 0.5596300046890974, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.39305083081126213, |
| "reward_change_min": -0.6213876642286777, |
| "reward_change_std": 0.24184285942465067, |
| "reward_std": 0.6279060393571854, |
| "rewards/accuracy_reward": 0.35416667349636555, |
| "rewards/cosine_scaled_reward": 0.09105192590504885, |
| "step": 164 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2823.8125228881836, |
| "epoch": 0.18857142857142858, |
| "grad_norm": 0.026144707575440407, |
| "kl": 0.00016139447689056396, |
| "lambda_div_used": 0.5820390656590462, |
| "learning_rate": 1.8594235253127372e-07, |
| "loss": -0.0708, |
| "reward": -0.32857649284414947, |
| "reward_after_mean": -0.32857649284414947, |
| "reward_after_std": 0.4344688355922699, |
| "reward_before_mean": -0.0503513365983963, |
| "reward_before_std": 0.3760820124298334, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2782251574099064, |
| "reward_change_min": -0.43433111906051636, |
| "reward_change_std": 0.16556962952017784, |
| "reward_std": 0.4344688393175602, |
| "rewards/accuracy_reward": 0.08333333395421505, |
| "rewards/cosine_scaled_reward": -0.13368466403335333, |
| "step": 165 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2491.0208587646484, |
| "epoch": 0.18971428571428572, |
| "grad_norm": 0.019254038110375404, |
| "kl": 0.0001147836446762085, |
| "lambda_div_used": 0.6493343263864517, |
| "learning_rate": 1.8138158006995363e-07, |
| "loss": 0.0025, |
| "reward": 0.10031834430992603, |
| "reward_after_mean": 0.10031834430992603, |
| "reward_after_std": 0.6912827659398317, |
| "reward_before_mean": 0.4514310024678707, |
| "reward_before_std": 0.6998845022171736, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.351112674921751, |
| "reward_change_min": -0.6428324580192566, |
| "reward_change_std": 0.2517909351736307, |
| "reward_std": 0.6912827901542187, |
| "rewards/accuracy_reward": 0.3333333432674408, |
| "rewards/cosine_scaled_reward": 0.11809766665101051, |
| "step": 166 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2146.645896911621, |
| "epoch": 0.19085714285714286, |
| "grad_norm": 0.028959238901734352, |
| "kl": 0.00012630224227905273, |
| "lambda_div_used": 0.602773554623127, |
| "learning_rate": 1.7693309235023127e-07, |
| "loss": -0.0687, |
| "reward": -0.08888023532927036, |
| "reward_after_mean": -0.08888023532927036, |
| "reward_after_std": 0.555552402511239, |
| "reward_before_mean": 0.284214471001178, |
| "reward_before_std": 0.4734340328723192, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.37309471145272255, |
| "reward_change_min": -0.5994942858815193, |
| "reward_change_std": 0.2264004945755005, |
| "reward_std": 0.5555524323135614, |
| "rewards/accuracy_reward": 0.27083333767950535, |
| "rewards/cosine_scaled_reward": 0.013381102122366428, |
| "step": 167 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2591.562545776367, |
| "epoch": 0.192, |
| "grad_norm": 0.022079093381762505, |
| "kl": 0.00013020634651184082, |
| "lambda_div_used": 0.6182359680533409, |
| "learning_rate": 1.7259824442455923e-07, |
| "loss": 0.0571, |
| "reward": 0.07187426090240479, |
| "reward_after_mean": 0.07187426090240479, |
| "reward_after_std": 0.6350691560655832, |
| "reward_before_mean": 0.4906120039522648, |
| "reward_before_std": 0.5473381988704205, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4187377579510212, |
| "reward_change_min": -0.6163501553237438, |
| "reward_change_std": 0.24177053570747375, |
| "reward_std": 0.6350691728293896, |
| "rewards/accuracy_reward": 0.3541666753590107, |
| "rewards/cosine_scaled_reward": 0.1364453360438347, |
| "step": 168 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1961.333366394043, |
| "epoch": 0.19314285714285714, |
| "grad_norm": 0.024920228868722916, |
| "kl": 0.00011189281940460205, |
| "lambda_div_used": 0.5712982937693596, |
| "learning_rate": 1.6837835672960831e-07, |
| "loss": -0.037, |
| "reward": 0.20125045720487833, |
| "reward_after_mean": 0.20125045720487833, |
| "reward_after_std": 0.5916534103453159, |
| "reward_before_mean": 0.8435525028035045, |
| "reward_before_std": 0.32216374203562737, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.6423020549118519, |
| "reward_change_min": -0.836066972464323, |
| "reward_change_std": 0.32226173765957355, |
| "reward_std": 0.5916534326970577, |
| "rewards/accuracy_reward": 0.5416666679084301, |
| "rewards/cosine_scaled_reward": 0.3018858137074858, |
| "step": 169 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2154.5417404174805, |
| "epoch": 0.19428571428571428, |
| "grad_norm": 0.028055250644683838, |
| "kl": 9.50545072555542e-05, |
| "lambda_div_used": 0.5548974648118019, |
| "learning_rate": 1.6427471468404952e-07, |
| "loss": 0.0414, |
| "reward": -0.1357494406402111, |
| "reward_after_mean": -0.1357494406402111, |
| "reward_after_std": 0.41400698386132717, |
| "reward_before_mean": 0.32883553951978683, |
| "reward_before_std": 0.2495311009697616, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4645849745720625, |
| "reward_change_min": -0.6499762162566185, |
| "reward_change_std": 0.250681190751493, |
| "reward_std": 0.41400699876248837, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/cosine_scaled_reward": 0.01633552461862564, |
| "step": 170 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2344.3958587646484, |
| "epoch": 0.19542857142857142, |
| "grad_norm": 0.02545234002172947, |
| "kl": 0.00012035667896270752, |
| "lambda_div_used": 0.5686006918549538, |
| "learning_rate": 1.6028856829700258e-07, |
| "loss": -0.0076, |
| "reward": 0.025929288007318974, |
| "reward_after_mean": 0.025929288007318974, |
| "reward_after_std": 0.4821996595710516, |
| "reward_before_mean": 0.5422459719702601, |
| "reward_before_std": 0.3103277189657092, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5163167044520378, |
| "reward_change_min": -0.6874858625233173, |
| "reward_change_std": 0.26997776329517365, |
| "reward_std": 0.4821996670216322, |
| "rewards/accuracy_reward": 0.37500000558793545, |
| "rewards/cosine_scaled_reward": 0.1672459738329053, |
| "step": 171 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2580.3958740234375, |
| "epoch": 0.19657142857142856, |
| "grad_norm": 0.03461524099111557, |
| "kl": 0.00015616416931152344, |
| "lambda_div_used": 0.5815886929631233, |
| "learning_rate": 8.487667956935087e-07, |
| "loss": -0.0386, |
| "reward": -0.02652345411479473, |
| "reward_after_mean": -0.02652345411479473, |
| "reward_after_std": 0.5259725619107485, |
| "reward_before_mean": 0.4387537483125925, |
| "reward_before_std": 0.3750908151268959, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.46527721732854843, |
| "reward_change_min": -0.6695724055171013, |
| "reward_change_std": 0.2563530644401908, |
| "reward_std": 0.5259725674986839, |
| "rewards/accuracy_reward": 0.33333333395421505, |
| "rewards/cosine_scaled_reward": 0.10542040364816785, |
| "step": 172 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1932.0625305175781, |
| "epoch": 0.1977142857142857, |
| "grad_norm": 0.03550613671541214, |
| "kl": 9.726732969284058e-05, |
| "lambda_div_used": 0.5629367232322693, |
| "learning_rate": 8.464102570534061e-07, |
| "loss": -0.0198, |
| "reward": -0.33315238857176155, |
| "reward_after_mean": -0.33315238857176155, |
| "reward_after_std": 0.35940456483513117, |
| "reward_before_mean": -0.03457173053175211, |
| "reward_before_std": 0.2833498573163524, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.29858064092695713, |
| "reward_change_min": -0.4200097434222698, |
| "reward_change_std": 0.16404641512781382, |
| "reward_std": 0.35940458066761494, |
| "rewards/accuracy_reward": 0.12500000558793545, |
| "rewards/cosine_scaled_reward": -0.15957174729555845, |
| "step": 173 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1809.7708587646484, |
| "epoch": 0.19885714285714284, |
| "grad_norm": 0.03420722112059593, |
| "kl": 0.00013248249888420105, |
| "lambda_div_used": 0.6139687895774841, |
| "learning_rate": 8.440392717955475e-07, |
| "loss": -0.0816, |
| "reward": -0.07395929284393787, |
| "reward_after_mean": -0.07395929284393787, |
| "reward_after_std": 0.5732789468020201, |
| "reward_before_mean": 0.25243946351110935, |
| "reward_before_std": 0.5262010591104627, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3263987563550472, |
| "reward_change_min": -0.5132386535406113, |
| "reward_change_std": 0.20316704735159874, |
| "reward_std": 0.5732789561152458, |
| "rewards/accuracy_reward": 0.22916667349636555, |
| "rewards/cosine_scaled_reward": 0.02327278454322368, |
| "step": 174 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2270.7500495910645, |
| "epoch": 0.2, |
| "grad_norm": 0.023624489083886147, |
| "kl": 0.00011564046144485474, |
| "lambda_div_used": 0.570957601070404, |
| "learning_rate": 8.416539554784089e-07, |
| "loss": 0.0018, |
| "reward": -0.09137872606515884, |
| "reward_after_mean": -0.09137872606515884, |
| "reward_after_std": 0.4245176389813423, |
| "reward_before_mean": 0.3394407369196415, |
| "reward_before_std": 0.3214457123540342, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4308194350451231, |
| "reward_change_min": -0.6172507330775261, |
| "reward_change_std": 0.24237936083227396, |
| "reward_std": 0.4245176613330841, |
| "rewards/accuracy_reward": 0.3125000074505806, |
| "rewards/cosine_scaled_reward": 0.026940705254673958, |
| "step": 175 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2428.3750534057617, |
| "epoch": 0.20114285714285715, |
| "grad_norm": 0.03552456945180893, |
| "kl": 9.998679161071777e-05, |
| "lambda_div_used": 0.6270494386553764, |
| "learning_rate": 8.392544243589427e-07, |
| "loss": 0.0567, |
| "reward": 0.23782964330166578, |
| "reward_after_mean": 0.23782964330166578, |
| "reward_after_std": 0.6844992376863956, |
| "reward_before_mean": 0.7466199137270451, |
| "reward_before_std": 0.594361359719187, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5087902657687664, |
| "reward_change_min": -0.8128968887031078, |
| "reward_change_std": 0.31682352535426617, |
| "reward_std": 0.6844992693513632, |
| "rewards/accuracy_reward": 0.5000000074505806, |
| "rewards/cosine_scaled_reward": 0.24661988578736782, |
| "step": 176 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2709.5833740234375, |
| "epoch": 0.2022857142857143, |
| "grad_norm": 0.022738253697752953, |
| "kl": 0.0001655668020248413, |
| "lambda_div_used": 0.6092279329895973, |
| "learning_rate": 8.368407953869103e-07, |
| "loss": 0.0278, |
| "reward": -0.16157673671841621, |
| "reward_after_mean": -0.16157673671841621, |
| "reward_after_std": 0.5301279928535223, |
| "reward_before_mean": 0.14143561571836472, |
| "reward_before_std": 0.5103836972266436, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.303012328222394, |
| "reward_change_min": -0.5294999107718468, |
| "reward_change_std": 0.2066562958061695, |
| "reward_std": 0.530128002166748, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/cosine_scaled_reward": -0.04606438986957073, |
| "step": 177 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2521.562545776367, |
| "epoch": 0.20342857142857143, |
| "grad_norm": 0.023333929479122162, |
| "kl": 0.00010882318019866943, |
| "lambda_div_used": 0.5983341336250305, |
| "learning_rate": 8.344131861991828e-07, |
| "loss": 0.0305, |
| "reward": -0.0399339459836483, |
| "reward_after_mean": -0.0399339459836483, |
| "reward_after_std": 0.544251000508666, |
| "reward_before_mean": 0.369808804243803, |
| "reward_before_std": 0.4518149495124817, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4097427297383547, |
| "reward_change_min": -0.6259582564234734, |
| "reward_change_std": 0.23724547680467367, |
| "reward_std": 0.5442510098218918, |
| "rewards/accuracy_reward": 0.29166667349636555, |
| "rewards/cosine_scaled_reward": 0.07814211072400212, |
| "step": 178 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2703.7083740234375, |
| "epoch": 0.20457142857142857, |
| "grad_norm": 0.023093275725841522, |
| "kl": 0.00013177096843719482, |
| "lambda_div_used": 0.5762727931141853, |
| "learning_rate": 8.319717151140072e-07, |
| "loss": 0.0879, |
| "reward": -0.20519665256142616, |
| "reward_after_mean": -0.20519665256142616, |
| "reward_after_std": 0.39140512235462666, |
| "reward_before_mean": 0.13847777154296637, |
| "reward_before_std": 0.3475749148055911, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3436744213104248, |
| "reward_change_min": -0.5246328189969063, |
| "reward_change_std": 0.20370345003902912, |
| "reward_std": 0.39140513353049755, |
| "rewards/accuracy_reward": 0.1875000074505806, |
| "rewards/cosine_scaled_reward": -0.049022228457033634, |
| "step": 179 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2075.3750228881836, |
| "epoch": 0.2057142857142857, |
| "grad_norm": 0.03283306583762169, |
| "kl": 0.00013710558414459229, |
| "lambda_div_used": 0.5492931827902794, |
| "learning_rate": 8.295165011252396e-07, |
| "loss": -0.0592, |
| "reward": -0.09344155341386795, |
| "reward_after_mean": -0.09344155341386795, |
| "reward_after_std": 0.37022680789232254, |
| "reward_before_mean": 0.39771560952067375, |
| "reward_before_std": 0.21920094243250787, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4911571964621544, |
| "reward_change_min": -0.6641882658004761, |
| "reward_change_std": 0.25923535134643316, |
| "reward_std": 0.3702268172055483, |
| "rewards/accuracy_reward": 0.3541666716337204, |
| "rewards/cosine_scaled_reward": 0.0435489546507597, |
| "step": 180 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2976.875030517578, |
| "epoch": 0.20685714285714285, |
| "grad_norm": 0.02316705696284771, |
| "kl": 0.00017173588275909424, |
| "lambda_div_used": 0.594053827226162, |
| "learning_rate": 8.270476638965461e-07, |
| "loss": -0.0029, |
| "reward": -0.17979225050657988, |
| "reward_after_mean": -0.17979225050657988, |
| "reward_after_std": 0.5341140031814575, |
| "reward_before_mean": 0.17803902877494693, |
| "reward_before_std": 0.4284058129414916, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3578312788158655, |
| "reward_change_min": -0.5442017950117588, |
| "reward_change_std": 0.2014410514384508, |
| "reward_std": 0.5341140106320381, |
| "rewards/accuracy_reward": 0.2083333358168602, |
| "rewards/cosine_scaled_reward": -0.03029430890455842, |
| "step": 181 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1833.0833930969238, |
| "epoch": 0.208, |
| "grad_norm": 0.02471437305212021, |
| "kl": 6.746500730514526e-05, |
| "lambda_div_used": 0.6185052543878555, |
| "learning_rate": 8.245653237555705e-07, |
| "loss": -0.0541, |
| "reward": 0.10658288560807705, |
| "reward_after_mean": 0.10658288560807705, |
| "reward_after_std": 0.6189861167222261, |
| "reward_before_mean": 0.5465792864561081, |
| "reward_before_std": 0.5544852227903903, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.43999641202390194, |
| "reward_change_min": -0.7109990864992142, |
| "reward_change_std": 0.2778801778331399, |
| "reward_std": 0.618986152112484, |
| "rewards/accuracy_reward": 0.4375000037252903, |
| "rewards/cosine_scaled_reward": 0.10907927341759205, |
| "step": 182 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1834.4375686645508, |
| "epoch": 0.20914285714285713, |
| "grad_norm": 0.026828886941075325, |
| "kl": 8.349120616912842e-05, |
| "lambda_div_used": 0.6341942846775055, |
| "learning_rate": 8.220696016880687e-07, |
| "loss": 0.0145, |
| "reward": 0.10971941862953827, |
| "reward_after_mean": 0.10971941862953827, |
| "reward_after_std": 0.6589909251779318, |
| "reward_before_mean": 0.4826927953399718, |
| "reward_before_std": 0.6231282472144812, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.37297336757183075, |
| "reward_change_min": -0.5635729804635048, |
| "reward_change_std": 0.2322026826441288, |
| "reward_std": 0.6589909512549639, |
| "rewards/accuracy_reward": 0.3125000111758709, |
| "rewards/cosine_scaled_reward": 0.1701928018592298, |
| "step": 183 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2777.020854949951, |
| "epoch": 0.2102857142857143, |
| "grad_norm": 0.027208158746361732, |
| "kl": 0.00016835331916809082, |
| "lambda_div_used": 0.5735552906990051, |
| "learning_rate": 8.195606193320136e-07, |
| "loss": 0.0331, |
| "reward": -0.2550749061629176, |
| "reward_after_mean": -0.2550749061629176, |
| "reward_after_std": 0.45949564687907696, |
| "reward_before_mean": 0.09985405765473843, |
| "reward_before_std": 0.33237360091879964, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3549289759248495, |
| "reward_change_min": -0.4971868433058262, |
| "reward_change_std": 0.18658464308828115, |
| "reward_std": 0.45949566550552845, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.06681260792538524, |
| "step": 184 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2450.4583473205566, |
| "epoch": 0.21142857142857144, |
| "grad_norm": 0.031349100172519684, |
| "kl": 0.00011152029037475586, |
| "lambda_div_used": 0.5730742663145065, |
| "learning_rate": 8.170384989716657e-07, |
| "loss": 0.055, |
| "reward": -0.23662783950567245, |
| "reward_after_mean": -0.23662783950567245, |
| "reward_after_std": 0.3798432908952236, |
| "reward_before_mean": 0.08804147504270077, |
| "reward_before_std": 0.3363419594243169, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.32466931641101837, |
| "reward_change_min": -0.47964803501963615, |
| "reward_change_std": 0.19531975220888853, |
| "reward_std": 0.3798433095216751, |
| "rewards/accuracy_reward": 0.1875000074505806, |
| "rewards/cosine_scaled_reward": -0.09945851005613804, |
| "step": 185 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2827.9375228881836, |
| "epoch": 0.21257142857142858, |
| "grad_norm": 0.018743637949228287, |
| "kl": 0.00014129281044006348, |
| "lambda_div_used": 0.6022143214941025, |
| "learning_rate": 8.145033635316128e-07, |
| "loss": -0.0243, |
| "reward": -0.19847029261291027, |
| "reward_after_mean": -0.19847029261291027, |
| "reward_after_std": 0.4997438360005617, |
| "reward_before_mean": 0.09954999759793282, |
| "reward_before_std": 0.4699443206191063, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.298020301386714, |
| "reward_change_min": -0.4791484698653221, |
| "reward_change_std": 0.18491498567163944, |
| "reward_std": 0.49974384531378746, |
| "rewards/accuracy_reward": 0.18750000558793545, |
| "rewards/cosine_scaled_reward": -0.08794999308884144, |
| "step": 186 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2602.1875228881836, |
| "epoch": 0.21371428571428572, |
| "grad_norm": 0.030055196955800056, |
| "kl": 0.0001709461212158203, |
| "lambda_div_used": 0.5778695195913315, |
| "learning_rate": 8.119553365707802e-07, |
| "loss": -0.0784, |
| "reward": -0.23846351448446512, |
| "reward_after_mean": -0.23846351448446512, |
| "reward_after_std": 0.39364523626863956, |
| "reward_before_mean": 0.09842715226113796, |
| "reward_before_std": 0.35397925041615963, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3368906620889902, |
| "reward_change_min": -0.5288374535739422, |
| "reward_change_std": 0.20237108506262302, |
| "reward_std": 0.3936452493071556, |
| "rewards/accuracy_reward": 0.1458333395421505, |
| "rewards/cosine_scaled_reward": -0.047406171914190054, |
| "step": 187 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3423.875030517578, |
| "epoch": 0.21485714285714286, |
| "grad_norm": 0.01838378608226776, |
| "kl": 0.00020372867584228516, |
| "lambda_div_used": 0.5887190625071526, |
| "learning_rate": 8.093945422764069e-07, |
| "loss": 0.0025, |
| "reward": -0.24391454830765724, |
| "reward_after_mean": -0.24391454830765724, |
| "reward_after_std": 0.43992058746516705, |
| "reward_before_mean": 0.055846452713012695, |
| "reward_before_std": 0.4102069940418005, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2997609917074442, |
| "reward_change_min": -0.501131433993578, |
| "reward_change_std": 0.19180170260369778, |
| "reward_std": 0.4399206154048443, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/cosine_scaled_reward": -0.0691535547375679, |
| "step": 188 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1879.500057220459, |
| "epoch": 0.216, |
| "grad_norm": 0.034202635288238525, |
| "kl": 0.00012063980102539062, |
| "lambda_div_used": 0.6004914790391922, |
| "learning_rate": 8.068211054579943e-07, |
| "loss": -0.0391, |
| "reward": -0.1741858683526516, |
| "reward_after_mean": -0.1741858683526516, |
| "reward_after_std": 0.4978427290916443, |
| "reward_before_mean": 0.13175462279468775, |
| "reward_before_std": 0.466338312253356, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.30594046600162983, |
| "reward_change_min": -0.47848713025450706, |
| "reward_change_std": 0.1904344316571951, |
| "reward_std": 0.49784273840487003, |
| "rewards/accuracy_reward": 0.18750000558793545, |
| "rewards/cosine_scaled_reward": -0.05574538931250572, |
| "step": 189 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2554.9583740234375, |
| "epoch": 0.21714285714285714, |
| "grad_norm": 0.02105082757771015, |
| "kl": 0.0001103430986404419, |
| "lambda_div_used": 0.6010043099522591, |
| "learning_rate": 8.04235151541222e-07, |
| "loss": 0.0273, |
| "reward": -0.017362398095428944, |
| "reward_after_mean": -0.017362398095428944, |
| "reward_after_std": 0.5378628317266703, |
| "reward_before_mean": 0.3967582155019045, |
| "reward_before_std": 0.46276178024709225, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4141206480562687, |
| "reward_change_min": -0.6200221106410027, |
| "reward_change_std": 0.24346179515123367, |
| "reward_std": 0.5378628373146057, |
| "rewards/accuracy_reward": 0.31250000558793545, |
| "rewards/cosine_scaled_reward": 0.08425821363925934, |
| "step": 190 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2059.0208702087402, |
| "epoch": 0.21828571428571428, |
| "grad_norm": 0.027044324204325676, |
| "kl": 0.0001252889633178711, |
| "lambda_div_used": 0.6293297410011292, |
| "learning_rate": 8.01636806561836e-07, |
| "loss": 0.0258, |
| "reward": 0.11435023881494999, |
| "reward_after_mean": 0.11435023881494999, |
| "reward_after_std": 0.6808852814137936, |
| "reward_before_mean": 0.5306741870008409, |
| "reward_before_std": 0.5962998084723949, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4163239523768425, |
| "reward_change_min": -0.6401379927992821, |
| "reward_change_std": 0.2505673002451658, |
| "reward_std": 0.680885311216116, |
| "rewards/accuracy_reward": 0.416666679084301, |
| "rewards/cosine_scaled_reward": 0.11400750931352377, |
| "step": 191 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3052.4583740234375, |
| "epoch": 0.21942857142857142, |
| "grad_norm": 0.01929704286158085, |
| "kl": 0.0001703500747680664, |
| "lambda_div_used": 0.5668376535177231, |
| "learning_rate": 7.990261971595048e-07, |
| "loss": -0.0488, |
| "reward": -0.18853843957185745, |
| "reward_after_mean": -0.18853843957185745, |
| "reward_after_std": 0.3872429598122835, |
| "reward_before_mean": 0.20527121797204018, |
| "reward_before_std": 0.30656870268285275, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.39380968734622, |
| "reward_change_min": -0.5667809918522835, |
| "reward_change_std": 0.22394196968525648, |
| "reward_std": 0.38724296167492867, |
| "rewards/accuracy_reward": 0.2083333358168602, |
| "rewards/cosine_scaled_reward": -0.0030621085315942764, |
| "step": 192 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2665.0000610351562, |
| "epoch": 0.22057142857142858, |
| "grad_norm": 0.024683799594640732, |
| "kl": 0.0001424849033355713, |
| "lambda_div_used": 0.6040371954441071, |
| "learning_rate": 7.964034505716476e-07, |
| "loss": 0.1051, |
| "reward": -0.12321672588586807, |
| "reward_after_mean": -0.12321672588586807, |
| "reward_after_std": 0.49566064216196537, |
| "reward_before_mean": 0.19374842569231987, |
| "reward_before_std": 0.4829600788652897, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.316965164616704, |
| "reward_change_min": -0.5347904153168201, |
| "reward_change_std": 0.2084766924381256, |
| "reward_std": 0.49566065706312656, |
| "rewards/accuracy_reward": 0.2500000074505806, |
| "rewards/cosine_scaled_reward": -0.05625157803297043, |
| "step": 193 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2681.6666870117188, |
| "epoch": 0.22171428571428572, |
| "grad_norm": 0.01963093690574169, |
| "kl": 0.0001462697982788086, |
| "lambda_div_used": 0.6072653383016586, |
| "learning_rate": 7.93768694627233e-07, |
| "loss": 0.0685, |
| "reward": 0.2620888948440552, |
| "reward_after_mean": 0.2620888948440552, |
| "reward_after_std": 0.5582387447357178, |
| "reward_before_mean": 0.7925186604261398, |
| "reward_before_std": 0.4891574867069721, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5304297637194395, |
| "reward_change_min": -0.7805496528744698, |
| "reward_change_std": 0.3099254406988621, |
| "reward_std": 0.5582387670874596, |
| "rewards/accuracy_reward": 0.541666679084301, |
| "rewards/cosine_scaled_reward": 0.250851983204484, |
| "step": 194 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2509.2708892822266, |
| "epoch": 0.22285714285714286, |
| "grad_norm": 0.021495619788765907, |
| "kl": 0.00014010071754455566, |
| "lambda_div_used": 0.6250215768814087, |
| "learning_rate": 7.911220577405484e-07, |
| "loss": -0.0159, |
| "reward": -0.02809133753180504, |
| "reward_after_mean": -0.02809133753180504, |
| "reward_after_std": 0.5878969728946686, |
| "reward_before_mean": 0.3000528886914253, |
| "reward_before_std": 0.5790729988366365, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.32814422622323036, |
| "reward_change_min": -0.5872809514403343, |
| "reward_change_std": 0.22489875741302967, |
| "reward_std": 0.5878969803452492, |
| "rewards/accuracy_reward": 0.2708333395421505, |
| "rewards/cosine_scaled_reward": 0.029219531919807196, |
| "step": 195 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3391.9166870117188, |
| "epoch": 0.224, |
| "grad_norm": 0.01908985897898674, |
| "kl": 0.00020110607147216797, |
| "lambda_div_used": 0.5609800890088081, |
| "learning_rate": 7.884636689049422e-07, |
| "loss": -0.0096, |
| "reward": -0.2512575164437294, |
| "reward_after_mean": -0.2512575164437294, |
| "reward_after_std": 0.3720796424895525, |
| "reward_before_mean": 0.114608995616436, |
| "reward_before_std": 0.27679250249639153, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3658665083348751, |
| "reward_change_min": -0.5327885784208775, |
| "reward_change_std": 0.20587429776787758, |
| "reward_std": 0.37207965552806854, |
| "rewards/accuracy_reward": 0.2083333358168602, |
| "rewards/cosine_scaled_reward": -0.09372434392571449, |
| "step": 196 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2261.50008392334, |
| "epoch": 0.22514285714285714, |
| "grad_norm": 0.0338062159717083, |
| "kl": 0.00014703720808029175, |
| "lambda_div_used": 0.6676772907376289, |
| "learning_rate": 7.857936576865356e-07, |
| "loss": 0.0118, |
| "reward": 0.3714135689660907, |
| "reward_after_mean": 0.3714135689660907, |
| "reward_after_std": 0.7947616018354893, |
| "reward_before_mean": 0.8371437452733517, |
| "reward_before_std": 0.7848006598651409, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4657301902770996, |
| "reward_change_min": -0.7909989431500435, |
| "reward_change_std": 0.31981481425464153, |
| "reward_std": 0.7947616167366505, |
| "rewards/accuracy_reward": 0.5625000149011612, |
| "rewards/cosine_scaled_reward": 0.2746437588939443, |
| "step": 197 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2588.0000381469727, |
| "epoch": 0.22628571428571428, |
| "grad_norm": 0.02565637417137623, |
| "kl": 0.00015842914581298828, |
| "lambda_div_used": 0.5861488357186317, |
| "learning_rate": 7.831121542179086e-07, |
| "loss": 0.0107, |
| "reward": -0.04248452000319958, |
| "reward_after_mean": -0.04248452000319958, |
| "reward_after_std": 0.5402837041765451, |
| "reward_before_mean": 0.3971143513917923, |
| "reward_before_std": 0.3975243829190731, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4395988676697016, |
| "reward_change_min": -0.6525615304708481, |
| "reward_change_std": 0.24251667596399784, |
| "reward_std": 0.5402837190777063, |
| "rewards/accuracy_reward": 0.33333333395421505, |
| "rewards/cosine_scaled_reward": 0.06378100253641605, |
| "step": 198 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3565.500030517578, |
| "epoch": 0.22742857142857142, |
| "grad_norm": 0.017493488267064095, |
| "kl": 0.00019216537475585938, |
| "lambda_div_used": 0.6027035862207413, |
| "learning_rate": 7.804192891917571e-07, |
| "loss": 0.0044, |
| "reward": -0.25367068126797676, |
| "reward_after_mean": -0.25367068126797676, |
| "reward_after_std": 0.5018568355590105, |
| "reward_before_mean": 0.015577135607600212, |
| "reward_before_std": 0.4789720713160932, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2692478112876415, |
| "reward_change_min": -0.49732755869627, |
| "reward_change_std": 0.18694379180669785, |
| "reward_std": 0.501856841146946, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.15108953975141048, |
| "step": 199 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2030.7708587646484, |
| "epoch": 0.22857142857142856, |
| "grad_norm": 0.023603590205311775, |
| "kl": 8.529424667358398e-05, |
| "lambda_div_used": 0.6037929654121399, |
| "learning_rate": 7.777151938545235e-07, |
| "loss": 0.0494, |
| "reward": 0.20463257655501366, |
| "reward_after_mean": 0.20463257655501366, |
| "reward_after_std": 0.607479989528656, |
| "reward_before_mean": 0.7358259493485093, |
| "reward_before_std": 0.48482801718637347, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.531193383038044, |
| "reward_change_min": -0.7928582988679409, |
| "reward_change_std": 0.3125428520143032, |
| "reward_std": 0.6074800062924623, |
| "rewards/accuracy_reward": 0.4583333395421505, |
| "rewards/cosine_scaled_reward": 0.2774925837293267, |
| "step": 200 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2393.2708740234375, |
| "epoch": 0.2297142857142857, |
| "grad_norm": 0.030843405053019524, |
| "kl": 0.00012966245412826538, |
| "lambda_div_used": 0.6355544030666351, |
| "learning_rate": 7.75e-07, |
| "loss": -0.0226, |
| "reward": 0.36713023856282234, |
| "reward_after_mean": 0.36713023856282234, |
| "reward_after_std": 0.7189916651695967, |
| "reward_before_mean": 0.8961770609021187, |
| "reward_before_std": 0.6416445402428508, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5290468074381351, |
| "reward_change_min": -0.790231991559267, |
| "reward_change_std": 0.33059168234467506, |
| "reward_std": 0.7189916893839836, |
| "rewards/accuracy_reward": 0.5833333432674408, |
| "rewards/cosine_scaled_reward": 0.3128436878323555, |
| "step": 201 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2136.437526702881, |
| "epoch": 0.23085714285714284, |
| "grad_norm": 0.02494831755757332, |
| "kl": 9.03918407857418e-05, |
| "lambda_div_used": 0.5813711285591125, |
| "learning_rate": 7.72273839962904e-07, |
| "loss": -0.0212, |
| "reward": 0.20651111379265785, |
| "reward_after_mean": 0.20651111379265785, |
| "reward_after_std": 0.5735384915024042, |
| "reward_before_mean": 0.8149078581482172, |
| "reward_before_std": 0.3815823132172227, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.6083967536687851, |
| "reward_change_min": -0.8435066714882851, |
| "reward_change_std": 0.3385826703161001, |
| "reward_std": 0.5735384933650494, |
| "rewards/accuracy_reward": 0.5208333358168602, |
| "rewards/cosine_scaled_reward": 0.2940745260566473, |
| "step": 202 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3241.125030517578, |
| "epoch": 0.232, |
| "grad_norm": 0.019406091421842575, |
| "kl": 0.00020945072174072266, |
| "lambda_div_used": 0.5796335637569427, |
| "learning_rate": 7.695368466124296e-07, |
| "loss": 0.0001, |
| "reward": -0.171187374740839, |
| "reward_after_mean": -0.171187374740839, |
| "reward_after_std": 0.4753460921347141, |
| "reward_before_mean": 0.21428256388753653, |
| "reward_before_std": 0.3680141428485513, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.38546993769705296, |
| "reward_change_min": -0.565359104424715, |
| "reward_change_std": 0.21748895198106766, |
| "reward_std": 0.4753460939973593, |
| "rewards/accuracy_reward": 0.20833333395421505, |
| "rewards/cosine_scaled_reward": 0.0059492262080311775, |
| "step": 203 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1740.6458892822266, |
| "epoch": 0.23314285714285715, |
| "grad_norm": 0.0317753441631794, |
| "kl": 0.00010880827903747559, |
| "lambda_div_used": 0.6080649197101593, |
| "learning_rate": 7.667891533457718e-07, |
| "loss": 0.1061, |
| "reward": 0.01815126556903124, |
| "reward_after_mean": 0.01815126556903124, |
| "reward_after_std": 0.5492583587765694, |
| "reward_before_mean": 0.41005287505686283, |
| "reward_before_std": 0.4979597805067897, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.39190160669386387, |
| "reward_change_min": -0.5888865925371647, |
| "reward_change_std": 0.23326328117400408, |
| "reward_std": 0.5492583997547626, |
| "rewards/accuracy_reward": 0.31250000558793545, |
| "rewards/cosine_scaled_reward": 0.09755285223945975, |
| "step": 204 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2370.875068664551, |
| "epoch": 0.2342857142857143, |
| "grad_norm": 0.030042720958590508, |
| "kl": 0.00014892220497131348, |
| "lambda_div_used": 0.6162895858287811, |
| "learning_rate": 7.640308940816239e-07, |
| "loss": 0.1316, |
| "reward": 0.43424203619360924, |
| "reward_after_mean": 0.43424203619360924, |
| "reward_after_std": 0.6284053698182106, |
| "reward_before_mean": 1.0446599274873734, |
| "reward_before_std": 0.5331263300031424, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.610417865216732, |
| "reward_change_min": -0.8885884135961533, |
| "reward_change_std": 0.35387465916574, |
| "reward_std": 0.6284053847193718, |
| "rewards/accuracy_reward": 0.6250000149011612, |
| "rewards/cosine_scaled_reward": 0.41965989768505096, |
| "step": 205 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2687.0208587646484, |
| "epoch": 0.23542857142857143, |
| "grad_norm": 0.021548230201005936, |
| "kl": 0.00012874603271484375, |
| "lambda_div_used": 0.6292116791009903, |
| "learning_rate": 7.612622032536507e-07, |
| "loss": 0.0002, |
| "reward": -0.19604980573058128, |
| "reward_after_mean": -0.19604980573058128, |
| "reward_after_std": 0.6235801223665476, |
| "reward_before_mean": 0.05089604668319225, |
| "reward_before_std": 0.606025786139071, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.24694585241377354, |
| "reward_change_min": -0.5342049337923527, |
| "reward_change_std": 0.1862892871722579, |
| "reward_std": 0.623580127954483, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.1157706193625927, |
| "step": 206 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2933.145866394043, |
| "epoch": 0.23657142857142857, |
| "grad_norm": 0.02744130790233612, |
| "kl": 0.0001799464225769043, |
| "lambda_div_used": 0.5820295214653015, |
| "learning_rate": 7.584832158039378e-07, |
| "loss": -0.0561, |
| "reward": -0.27963498421013355, |
| "reward_after_mean": -0.27963498421013355, |
| "reward_after_std": 0.4231335464864969, |
| "reward_before_mean": 0.013983679935336113, |
| "reward_before_std": 0.377011489123106, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2936186585575342, |
| "reward_change_min": -0.4932614788413048, |
| "reward_change_std": 0.18361396715044975, |
| "reward_std": 0.423133572563529, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/cosine_scaled_reward": -0.11101631447672844, |
| "step": 207 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2700.4583587646484, |
| "epoch": 0.2377142857142857, |
| "grad_norm": 0.022379335016012192, |
| "kl": 0.00015173852443695068, |
| "lambda_div_used": 0.6108261719346046, |
| "learning_rate": 7.556940671764124e-07, |
| "loss": 0.0624, |
| "reward": -0.02188705001026392, |
| "reward_after_mean": -0.02188705001026392, |
| "reward_after_std": 0.5500882770866156, |
| "reward_before_mean": 0.3470336627215147, |
| "reward_before_std": 0.5068763047456741, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3689207211136818, |
| "reward_change_min": -0.5766280777752399, |
| "reward_change_std": 0.22263448685407639, |
| "reward_std": 0.550088282674551, |
| "rewards/accuracy_reward": 0.27083334513008595, |
| "rewards/cosine_scaled_reward": 0.07620031712576747, |
| "step": 208 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2061.041732788086, |
| "epoch": 0.23885714285714285, |
| "grad_norm": 0.029824599623680115, |
| "kl": 9.86829400062561e-05, |
| "lambda_div_used": 0.6090070083737373, |
| "learning_rate": 7.528948933102438e-07, |
| "loss": -0.0351, |
| "reward": -0.05778682604432106, |
| "reward_after_mean": -0.05778682604432106, |
| "reward_after_std": 0.5228247437626123, |
| "reward_before_mean": 0.2826935350894928, |
| "reward_before_std": 0.501367649412714, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3404803555458784, |
| "reward_change_min": -0.5473614186048508, |
| "reward_change_std": 0.21792252641171217, |
| "reward_std": 0.5228247474879026, |
| "rewards/accuracy_reward": 0.3125000111758709, |
| "rewards/cosine_scaled_reward": -0.029806464910507202, |
| "step": 209 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2671.9583740234375, |
| "epoch": 0.24, |
| "grad_norm": 0.019723398610949516, |
| "kl": 0.0001126602292060852, |
| "lambda_div_used": 0.6178258955478668, |
| "learning_rate": 7.500858306332172e-07, |
| "loss": -0.046, |
| "reward": -0.019846799783408642, |
| "reward_after_mean": -0.019846799783408642, |
| "reward_after_std": 0.6330938600003719, |
| "reward_before_mean": 0.3628573752939701, |
| "reward_before_std": 0.5453151864930987, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.38270418159663677, |
| "reward_change_min": -0.5824633538722992, |
| "reward_change_std": 0.22786249686032534, |
| "reward_std": 0.6330938655883074, |
| "rewards/accuracy_reward": 0.3125000037252903, |
| "rewards/cosine_scaled_reward": 0.05035736900754273, |
| "step": 210 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2191.2500076293945, |
| "epoch": 0.24114285714285713, |
| "grad_norm": 0.026132306084036827, |
| "kl": 0.00012756884098052979, |
| "lambda_div_used": 0.5773908644914627, |
| "learning_rate": 7.472670160550848e-07, |
| "loss": 0.0368, |
| "reward": -0.09260139870457351, |
| "reward_after_mean": -0.09260139870457351, |
| "reward_after_std": 0.5260053481906652, |
| "reward_before_mean": 0.3510722735663876, |
| "reward_before_std": 0.35058190673589706, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4436736721545458, |
| "reward_change_min": -0.5985335633158684, |
| "reward_change_std": 0.22750128898769617, |
| "reward_std": 0.5260053630918264, |
| "rewards/accuracy_reward": 0.2916666679084301, |
| "rewards/cosine_scaled_reward": 0.05940559017471969, |
| "step": 211 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1899.8750457763672, |
| "epoch": 0.2422857142857143, |
| "grad_norm": 0.03510681912302971, |
| "kl": 0.000111408531665802, |
| "lambda_div_used": 0.6011399254202843, |
| "learning_rate": 7.444385869608921e-07, |
| "loss": 0.0147, |
| "reward": 0.018216299824416637, |
| "reward_after_mean": 0.018216299824416637, |
| "reward_after_std": 0.5794783290475607, |
| "reward_before_mean": 0.4559548683464527, |
| "reward_before_std": 0.47238516760990024, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.43773859925568104, |
| "reward_change_min": -0.6772446036338806, |
| "reward_change_std": 0.26400116458535194, |
| "reward_std": 0.5794783346354961, |
| "rewards/accuracy_reward": 0.4375000074505806, |
| "rewards/cosine_scaled_reward": 0.018454871140420437, |
| "step": 212 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2187.2708587646484, |
| "epoch": 0.24342857142857144, |
| "grad_norm": 0.02723986841738224, |
| "kl": 0.00015923380851745605, |
| "lambda_div_used": 0.6033707112073898, |
| "learning_rate": 7.416006812042827e-07, |
| "loss": 0.0702, |
| "reward": 0.11780917271971703, |
| "reward_after_mean": 0.11780917271971703, |
| "reward_after_std": 0.6044025905430317, |
| "reward_before_mean": 0.6168809719383717, |
| "reward_before_std": 0.4788802685216069, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4990718085318804, |
| "reward_change_min": -0.774650864303112, |
| "reward_change_std": 0.2952164225280285, |
| "reward_std": 0.6044025998562574, |
| "rewards/accuracy_reward": 0.4375000037252903, |
| "rewards/cosine_scaled_reward": 0.17938095517456532, |
| "step": 213 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2442.375072479248, |
| "epoch": 0.24457142857142858, |
| "grad_norm": 0.024862557649612427, |
| "kl": 0.00014431774616241455, |
| "lambda_div_used": 0.5444722771644592, |
| "learning_rate": 7.387534371007797e-07, |
| "loss": -0.0237, |
| "reward": -0.20888542756438255, |
| "reward_after_mean": -0.20888542756438255, |
| "reward_after_std": 0.3397774752229452, |
| "reward_before_mean": 0.22809876408427954, |
| "reward_before_std": 0.19605009350925684, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.436984209343791, |
| "reward_change_min": -0.5989037677645683, |
| "reward_change_std": 0.22710295487195253, |
| "reward_std": 0.339777497574687, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/cosine_scaled_reward": -0.021901232190430164, |
| "step": 214 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1974.0000457763672, |
| "epoch": 0.24571428571428572, |
| "grad_norm": 0.03193683549761772, |
| "kl": 0.00011608004570007324, |
| "lambda_div_used": 0.6048371568322182, |
| "learning_rate": 7.358969934210438e-07, |
| "loss": -0.0834, |
| "reward": -0.23377530556172132, |
| "reward_after_mean": -0.23377530556172132, |
| "reward_after_std": 0.5205546151846647, |
| "reward_before_mean": 0.02766125090420246, |
| "reward_before_std": 0.48529171757400036, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2614365555346012, |
| "reward_change_min": -0.44492336362600327, |
| "reward_change_std": 0.16870077326893806, |
| "reward_std": 0.5205546207726002, |
| "rewards/accuracy_reward": 0.14583333767950535, |
| "rewards/cosine_scaled_reward": -0.11817209050059319, |
| "step": 215 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2073.7917137145996, |
| "epoch": 0.24685714285714286, |
| "grad_norm": 0.03090011700987816, |
| "kl": 0.00015020370483398438, |
| "lambda_div_used": 0.5987100675702095, |
| "learning_rate": 7.330314893841101e-07, |
| "loss": -0.037, |
| "reward": -0.0010385997593402863, |
| "reward_after_mean": -0.0010385997593402863, |
| "reward_after_std": 0.5259138215333223, |
| "reward_before_mean": 0.42963695898652077, |
| "reward_before_std": 0.4547121487557888, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.43067559227347374, |
| "reward_change_min": -0.6873270347714424, |
| "reward_change_std": 0.2589325439184904, |
| "reward_std": 0.5259138215333223, |
| "rewards/accuracy_reward": 0.3541666679084301, |
| "rewards/cosine_scaled_reward": 0.07547031342983246, |
| "step": 216 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2313.416679382324, |
| "epoch": 0.248, |
| "grad_norm": 0.02645043283700943, |
| "kl": 0.00013341009616851807, |
| "lambda_div_used": 0.6059171706438065, |
| "learning_rate": 7.301570646506027e-07, |
| "loss": 0.005, |
| "reward": 0.06280752643942833, |
| "reward_after_mean": 0.06280752643942833, |
| "reward_after_std": 0.5530473850667477, |
| "reward_before_mean": 0.4911847524344921, |
| "reward_before_std": 0.4922928689047694, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.42837722785770893, |
| "reward_change_min": -0.6544801071286201, |
| "reward_change_std": 0.2630965141579509, |
| "reward_std": 0.5530474036931992, |
| "rewards/accuracy_reward": 0.3750000074505806, |
| "rewards/cosine_scaled_reward": 0.11618476174771786, |
| "step": 217 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2668.9375381469727, |
| "epoch": 0.24914285714285714, |
| "grad_norm": 0.023471172899007797, |
| "kl": 0.00011789798736572266, |
| "lambda_div_used": 0.6332497969269753, |
| "learning_rate": 7.27273859315928e-07, |
| "loss": -0.0061, |
| "reward": 0.06328068673610687, |
| "reward_after_mean": 0.06328068673610687, |
| "reward_after_std": 0.631486464291811, |
| "reward_before_mean": 0.4275508373975754, |
| "reward_before_std": 0.613295029848814, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3642701506614685, |
| "reward_change_min": -0.593178354203701, |
| "reward_change_std": 0.23565197084099054, |
| "reward_std": 0.6314864810556173, |
| "rewards/accuracy_reward": 0.33333334513008595, |
| "rewards/cosine_scaled_reward": 0.09421749995090067, |
| "step": 218 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2036.7292098999023, |
| "epoch": 0.2502857142857143, |
| "grad_norm": 0.031107638031244278, |
| "kl": 0.00014078617095947266, |
| "lambda_div_used": 0.6064230278134346, |
| "learning_rate": 7.243820139034464e-07, |
| "loss": 0.0536, |
| "reward": 0.17783035337924957, |
| "reward_after_mean": 0.17783035337924957, |
| "reward_after_std": 0.6028466131538153, |
| "reward_before_mean": 0.6854837816208601, |
| "reward_before_std": 0.4974929317831993, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5076534673571587, |
| "reward_change_min": -0.7541679181158543, |
| "reward_change_std": 0.3024881314486265, |
| "reward_std": 0.602846622467041, |
| "rewards/accuracy_reward": 0.4583333358168602, |
| "rewards/cosine_scaled_reward": 0.22715043649077415, |
| "step": 219 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2585.562515258789, |
| "epoch": 0.25142857142857145, |
| "grad_norm": 0.033569689840078354, |
| "kl": 0.00014656782150268555, |
| "lambda_div_used": 0.5626775473356247, |
| "learning_rate": 7.214816693576234e-07, |
| "loss": 0.0096, |
| "reward": -0.44609223771840334, |
| "reward_after_mean": -0.44609223771840334, |
| "reward_after_std": 0.34080066718161106, |
| "reward_before_mean": -0.19077827036380768, |
| "reward_before_std": 0.2862963704392314, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.25531397201120853, |
| "reward_change_min": -0.4116707444190979, |
| "reward_change_std": 0.15291727520525455, |
| "reward_std": 0.340800691395998, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.23244493766105734, |
| "step": 220 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1835.1667137145996, |
| "epoch": 0.25257142857142856, |
| "grad_norm": 0.026043305173516273, |
| "kl": 9.073130786418915e-05, |
| "lambda_div_used": 0.589106909930706, |
| "learning_rate": 7.185729670371604e-07, |
| "loss": -0.0167, |
| "reward": 0.1618174184113741, |
| "reward_after_mean": 0.1618174184113741, |
| "reward_after_std": 0.5352318156510592, |
| "reward_before_mean": 0.7099000085145235, |
| "reward_before_std": 0.40553835732862353, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5480826254934072, |
| "reward_change_min": -0.7475878298282623, |
| "reward_change_std": 0.3015699228271842, |
| "reward_std": 0.5352318380028009, |
| "rewards/accuracy_reward": 0.47916667722165585, |
| "rewards/cosine_scaled_reward": 0.23073333408683538, |
| "step": 221 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2023.6875305175781, |
| "epoch": 0.2537142857142857, |
| "grad_norm": 0.024540327489376068, |
| "kl": 0.0001620650291442871, |
| "lambda_div_used": 0.5971302166581154, |
| "learning_rate": 7.156560487081051e-07, |
| "loss": 0.0082, |
| "reward": 0.04625087231397629, |
| "reward_after_mean": 0.04625087231397629, |
| "reward_after_std": 0.5205375533550978, |
| "reward_before_mean": 0.4992250055074692, |
| "reward_before_std": 0.4442645478993654, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.45297410897910595, |
| "reward_change_min": -0.7014825120568275, |
| "reward_change_std": 0.27082843892276287, |
| "reward_std": 0.5205375626683235, |
| "rewards/accuracy_reward": 0.3541666753590107, |
| "rewards/cosine_scaled_reward": 0.145058311522007, |
| "step": 222 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2102.104202270508, |
| "epoch": 0.25485714285714284, |
| "grad_norm": 0.02421215921640396, |
| "kl": 0.00010086596012115479, |
| "lambda_div_used": 0.6135998442769051, |
| "learning_rate": 7.127310565369415e-07, |
| "loss": 0.0077, |
| "reward": -0.07060145400464535, |
| "reward_after_mean": -0.07060145400464535, |
| "reward_after_std": 0.5450965594500303, |
| "reward_before_mean": 0.2666409400990233, |
| "reward_before_std": 0.5180736510083079, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.33724240958690643, |
| "reward_change_min": -0.5495268329977989, |
| "reward_change_std": 0.2100124368444085, |
| "reward_std": 0.5450965687632561, |
| "rewards/accuracy_reward": 0.27083334513008595, |
| "rewards/cosine_scaled_reward": -0.004192400723695755, |
| "step": 223 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2678.8958892822266, |
| "epoch": 0.256, |
| "grad_norm": 0.0209470484405756, |
| "kl": 0.00014913082122802734, |
| "lambda_div_used": 0.6316534802317619, |
| "learning_rate": 7.097981330836616e-07, |
| "loss": 0.041, |
| "reward": 0.002024895278736949, |
| "reward_after_mean": 0.002024895278736949, |
| "reward_after_std": 0.6682235784828663, |
| "reward_before_mean": 0.36611822061240673, |
| "reward_before_std": 0.6153735313564539, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.36409333534538746, |
| "reward_change_min": -0.6180168017745018, |
| "reward_change_std": 0.23850849829614162, |
| "reward_std": 0.6682236194610596, |
| "rewards/accuracy_reward": 0.33333333767950535, |
| "rewards/cosine_scaled_reward": 0.03278488974319771, |
| "step": 224 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2463.6875610351562, |
| "epoch": 0.2571428571428571, |
| "grad_norm": 0.026948727667331696, |
| "kl": 0.0001347959041595459, |
| "lambda_div_used": 0.6428607329726219, |
| "learning_rate": 7.068574212948169e-07, |
| "loss": 0.0403, |
| "reward": 0.05994867905974388, |
| "reward_after_mean": 0.05994867905974388, |
| "reward_after_std": 0.6384792737662792, |
| "reward_before_mean": 0.3915696498006582, |
| "reward_before_std": 0.6680623888969421, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3316209614276886, |
| "reward_change_min": -0.606514610350132, |
| "reward_change_std": 0.24293010961264372, |
| "reward_std": 0.638479296118021, |
| "rewards/accuracy_reward": 0.3541666753590107, |
| "rewards/cosine_scaled_reward": 0.03740297071635723, |
| "step": 225 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2275.479232788086, |
| "epoch": 0.2582857142857143, |
| "grad_norm": 0.0262776929885149, |
| "kl": 0.00012694299221038818, |
| "lambda_div_used": 0.6046253740787506, |
| "learning_rate": 7.039090644965509e-07, |
| "loss": 0.0006, |
| "reward": 0.08375599328428507, |
| "reward_after_mean": 0.08375599328428507, |
| "reward_after_std": 0.5798605680465698, |
| "reward_before_mean": 0.5400239741429687, |
| "reward_before_std": 0.47723895218223333, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.45626799017190933, |
| "reward_change_min": -0.6333699934184551, |
| "reward_change_std": 0.2536289654672146, |
| "reward_std": 0.579860582947731, |
| "rewards/accuracy_reward": 0.3750000111758709, |
| "rewards/cosine_scaled_reward": 0.16502396669238806, |
| "step": 226 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1853.1041793823242, |
| "epoch": 0.25942857142857145, |
| "grad_norm": 0.03546634316444397, |
| "kl": 0.00011576712131500244, |
| "lambda_div_used": 0.6238459944725037, |
| "learning_rate": 7.009532063876148e-07, |
| "loss": -0.0356, |
| "reward": 0.035207513719797134, |
| "reward_after_mean": 0.035207513719797134, |
| "reward_after_std": 0.5671821534633636, |
| "reward_before_mean": 0.38889277167618275, |
| "reward_before_std": 0.5772030726075172, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.35368524491786957, |
| "reward_change_min": -0.587718054652214, |
| "reward_change_std": 0.2398481909185648, |
| "reward_std": 0.5671821553260088, |
| "rewards/accuracy_reward": 0.3125000074505806, |
| "rewards/cosine_scaled_reward": 0.07639275304973125, |
| "step": 227 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2130.875026702881, |
| "epoch": 0.26057142857142856, |
| "grad_norm": 0.03324354812502861, |
| "kl": 0.00012753158807754517, |
| "lambda_div_used": 0.566599652171135, |
| "learning_rate": 6.979899910323624e-07, |
| "loss": -0.0669, |
| "reward": 0.011986830271780491, |
| "reward_after_mean": 0.011986830271780491, |
| "reward_after_std": 0.4835386872291565, |
| "reward_before_mean": 0.5381738739088178, |
| "reward_before_std": 0.30367479752749205, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5261870250105858, |
| "reward_change_min": -0.6992091946303844, |
| "reward_change_std": 0.27626297529786825, |
| "reward_std": 0.48353871516883373, |
| "rewards/accuracy_reward": 0.37500000558793545, |
| "rewards/cosine_scaled_reward": 0.1631738357245922, |
| "step": 228 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3159.562530517578, |
| "epoch": 0.26171428571428573, |
| "grad_norm": 0.026536332443356514, |
| "kl": 0.00018978118896484375, |
| "lambda_div_used": 0.5688631013035774, |
| "learning_rate": 6.950195628537299e-07, |
| "loss": -0.0295, |
| "reward": -0.11544675379991531, |
| "reward_after_mean": -0.11544675379991531, |
| "reward_after_std": 0.42418220825493336, |
| "reward_before_mean": 0.31868776679039, |
| "reward_before_std": 0.3126910990104079, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4341345224529505, |
| "reward_change_min": -0.63496870175004, |
| "reward_change_std": 0.24463962391018867, |
| "reward_std": 0.42418221198022366, |
| "rewards/accuracy_reward": 0.2708333395421505, |
| "rewards/cosine_scaled_reward": 0.047854430973529816, |
| "step": 229 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2753.3333740234375, |
| "epoch": 0.26285714285714284, |
| "grad_norm": 0.020300425589084625, |
| "kl": 0.00014778971672058105, |
| "lambda_div_used": 0.6197129040956497, |
| "learning_rate": 6.920420666261961e-07, |
| "loss": 0.0738, |
| "reward": -0.06457636877894402, |
| "reward_after_mean": -0.06457636877894402, |
| "reward_after_std": 0.5870498064905405, |
| "reward_before_mean": 0.2789040170609951, |
| "reward_before_std": 0.5545760486274958, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.34348038397729397, |
| "reward_change_min": -0.6167686618864536, |
| "reward_change_std": 0.22966008260846138, |
| "reward_std": 0.5870498213917017, |
| "rewards/accuracy_reward": 0.2708333358168602, |
| "rewards/cosine_scaled_reward": 0.008070665411651134, |
| "step": 230 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2388.3750610351562, |
| "epoch": 0.264, |
| "grad_norm": 0.025614017620682716, |
| "kl": 0.00012599676847457886, |
| "lambda_div_used": 0.6009133085608482, |
| "learning_rate": 6.890576474687263e-07, |
| "loss": -0.0117, |
| "reward": -0.07801849395036697, |
| "reward_after_mean": -0.07801849395036697, |
| "reward_after_std": 0.5619381573051214, |
| "reward_before_mean": 0.2970875895989593, |
| "reward_before_std": 0.4653975451365113, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.37510609440505505, |
| "reward_change_min": -0.5894374549388885, |
| "reward_change_std": 0.2189607135951519, |
| "reward_std": 0.5619381796568632, |
| "rewards/accuracy_reward": 0.27083333767950535, |
| "rewards/cosine_scaled_reward": 0.02625426184386015, |
| "step": 231 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2800.4792098999023, |
| "epoch": 0.2651428571428571, |
| "grad_norm": 0.020755581557750702, |
| "kl": 0.00017752498388290405, |
| "lambda_div_used": 0.6359871402382851, |
| "learning_rate": 6.860664508377001e-07, |
| "loss": 0.0213, |
| "reward": 0.09202059358358383, |
| "reward_after_mean": 0.09202059358358383, |
| "reward_after_std": 0.6515896432101727, |
| "reward_before_mean": 0.49167851358652115, |
| "reward_before_std": 0.6335036922246218, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3996579386293888, |
| "reward_change_min": -0.6971911080181599, |
| "reward_change_std": 0.2742554973810911, |
| "reward_std": 0.6515896506607533, |
| "rewards/accuracy_reward": 0.3958333358168602, |
| "rewards/cosine_scaled_reward": 0.09584518847987056, |
| "step": 232 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1828.020866394043, |
| "epoch": 0.2662857142857143, |
| "grad_norm": 0.028299635276198387, |
| "kl": 0.00010451674461364746, |
| "lambda_div_used": 0.6347432807087898, |
| "learning_rate": 6.83068622519821e-07, |
| "loss": -0.0488, |
| "reward": -0.11232293955981731, |
| "reward_after_mean": -0.11232293955981731, |
| "reward_after_std": 0.6607348509132862, |
| "reward_before_mean": 0.16373980697244406, |
| "reward_before_std": 0.6230235639959574, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2760627530515194, |
| "reward_change_min": -0.5052222050726414, |
| "reward_change_std": 0.18505325820297003, |
| "reward_std": 0.660734860226512, |
| "rewards/accuracy_reward": 0.2291666716337204, |
| "rewards/cosine_scaled_reward": -0.06542686396278441, |
| "step": 233 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2573.562526702881, |
| "epoch": 0.2674285714285714, |
| "grad_norm": 0.027019290253520012, |
| "kl": 0.0001166127622127533, |
| "lambda_div_used": 0.5475329235196114, |
| "learning_rate": 6.800643086250121e-07, |
| "loss": 0.0132, |
| "reward": -0.23357034847140312, |
| "reward_after_mean": -0.23357034847140312, |
| "reward_after_std": 0.3406812082976103, |
| "reward_before_mean": 0.17690101824700832, |
| "reward_before_std": 0.21493587270379066, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4104713797569275, |
| "reward_change_min": -0.5621155239641666, |
| "reward_change_std": 0.22167872916907072, |
| "reward_std": 0.34068121016025543, |
| "rewards/accuracy_reward": 0.2083333358168602, |
| "rewards/cosine_scaled_reward": -0.03143232688307762, |
| "step": 234 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2243.645851135254, |
| "epoch": 0.26857142857142857, |
| "grad_norm": 0.031080337241292, |
| "kl": 0.00015339255332946777, |
| "lambda_div_used": 0.6331789866089821, |
| "learning_rate": 6.770536555792944e-07, |
| "loss": -0.0167, |
| "reward": 0.05447516264393926, |
| "reward_after_mean": 0.05447516264393926, |
| "reward_after_std": 0.7066546399146318, |
| "reward_before_mean": 0.44694859720766544, |
| "reward_before_std": 0.6171103774104267, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.39247346110641956, |
| "reward_change_min": -0.6285405829548836, |
| "reward_change_std": 0.23825406469404697, |
| "reward_std": 0.706654641777277, |
| "rewards/accuracy_reward": 0.3541666753590107, |
| "rewards/cosine_scaled_reward": 0.09278193739010021, |
| "step": 235 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2630.875030517578, |
| "epoch": 0.26971428571428574, |
| "grad_norm": 0.022363808006048203, |
| "kl": 0.00015428662300109863, |
| "lambda_div_used": 0.6662941351532936, |
| "learning_rate": 6.740368101176495e-07, |
| "loss": 0.026, |
| "reward": 0.1967709083110094, |
| "reward_after_mean": 0.1967709083110094, |
| "reward_after_std": 0.7583610694855452, |
| "reward_before_mean": 0.5491750640794635, |
| "reward_before_std": 0.7768743745982647, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.35240414179861546, |
| "reward_change_min": -0.625398077070713, |
| "reward_change_std": 0.25161405000835657, |
| "reward_std": 0.758361091837287, |
| "rewards/accuracy_reward": 0.39583334513008595, |
| "rewards/cosine_scaled_reward": 0.15334171522408724, |
| "step": 236 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2439.7292289733887, |
| "epoch": 0.27085714285714285, |
| "grad_norm": 0.02510235831141472, |
| "kl": 0.00016179680824279785, |
| "lambda_div_used": 0.6136586889624596, |
| "learning_rate": 6.710139192768694e-07, |
| "loss": -0.0166, |
| "reward": 0.04647237854078412, |
| "reward_after_mean": 0.04647237854078412, |
| "reward_after_std": 0.6743428651243448, |
| "reward_before_mean": 0.47922211419790983, |
| "reward_before_std": 0.5223582116886973, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.43274970538914204, |
| "reward_change_min": -0.6191227361559868, |
| "reward_change_std": 0.23846820835024118, |
| "reward_std": 0.6743428837507963, |
| "rewards/accuracy_reward": 0.37500000186264515, |
| "rewards/cosine_scaled_reward": 0.10422207851661369, |
| "step": 237 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2402.5000762939453, |
| "epoch": 0.272, |
| "grad_norm": 0.021976694464683533, |
| "kl": 0.0001609325408935547, |
| "lambda_div_used": 0.6034338474273682, |
| "learning_rate": 6.679851303883891e-07, |
| "loss": 0.0784, |
| "reward": 0.18511426215991378, |
| "reward_after_mean": 0.18511426215991378, |
| "reward_after_std": 0.6621626690030098, |
| "reward_before_mean": 0.7374962608737405, |
| "reward_before_std": 0.4762335177510977, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5523819867521524, |
| "reward_change_min": -0.7993261553347111, |
| "reward_change_std": 0.3063361942768097, |
| "reward_std": 0.6621626764535904, |
| "rewards/accuracy_reward": 0.5000000018626451, |
| "rewards/cosine_scaled_reward": 0.23749624891206622, |
| "step": 238 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1732.2292098999023, |
| "epoch": 0.27314285714285713, |
| "grad_norm": 0.03150353208184242, |
| "kl": 7.70464539527893e-05, |
| "lambda_div_used": 0.6118984445929527, |
| "learning_rate": 6.649505910711058e-07, |
| "loss": 0.0245, |
| "reward": 0.22464729472994804, |
| "reward_after_mean": 0.22464729472994804, |
| "reward_after_std": 0.5859156623482704, |
| "reward_before_mean": 0.7294908128678799, |
| "reward_before_std": 0.5119684813544154, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5048435050994158, |
| "reward_change_min": -0.7602570950984955, |
| "reward_change_std": 0.30051624588668346, |
| "reward_std": 0.5859156772494316, |
| "rewards/accuracy_reward": 0.4583333469927311, |
| "rewards/cosine_scaled_reward": 0.27115743793547153, |
| "step": 239 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2970.708366394043, |
| "epoch": 0.2742857142857143, |
| "grad_norm": 0.02445312589406967, |
| "kl": 0.00020241737365722656, |
| "lambda_div_used": 0.5594572946429253, |
| "learning_rate": 6.619104492241847e-07, |
| "loss": 0.0141, |
| "reward": -0.36844223737716675, |
| "reward_after_mean": -0.36844223737716675, |
| "reward_after_std": 0.32861490175127983, |
| "reward_before_mean": -0.06151419784873724, |
| "reward_before_std": 0.2699447488412261, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.30692804232239723, |
| "reward_change_min": -0.46535007655620575, |
| "reward_change_std": 0.1784888058900833, |
| "reward_std": 0.3286149147897959, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/cosine_scaled_reward": -0.12401420716196299, |
| "step": 240 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2815.041702270508, |
| "epoch": 0.2754285714285714, |
| "grad_norm": 0.02061399444937706, |
| "kl": 0.0001932680606842041, |
| "lambda_div_used": 0.5598616823554039, |
| "learning_rate": 6.588648530198504e-07, |
| "loss": 0.0225, |
| "reward": -0.4072608258575201, |
| "reward_after_mean": -0.4072608258575201, |
| "reward_after_std": 0.34731264412403107, |
| "reward_before_mean": -0.12633821368217468, |
| "reward_before_std": 0.2754332982003689, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2809226084500551, |
| "reward_change_min": -0.4441990442574024, |
| "reward_change_std": 0.1635214313864708, |
| "reward_std": 0.34731266647577286, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.16800487786531448, |
| "step": 241 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2020.520851135254, |
| "epoch": 0.2765714285714286, |
| "grad_norm": 0.03268786519765854, |
| "kl": 0.0002362281084060669, |
| "lambda_div_used": 0.5827708318829536, |
| "learning_rate": 6.558139508961654e-07, |
| "loss": 0.0641, |
| "reward": -0.14442800264805555, |
| "reward_after_mean": -0.14442800264805555, |
| "reward_after_std": 0.488038569688797, |
| "reward_before_mean": 0.23860891722142696, |
| "reward_before_std": 0.3739425097592175, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.38303691893815994, |
| "reward_change_min": -0.5426856316626072, |
| "reward_change_std": 0.20754980947822332, |
| "reward_std": 0.4880385845899582, |
| "rewards/accuracy_reward": 0.2708333395421505, |
| "rewards/cosine_scaled_reward": -0.032224420458078384, |
| "step": 242 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2664.4375610351562, |
| "epoch": 0.2777142857142857, |
| "grad_norm": 0.02101411111652851, |
| "kl": 0.00011355429887771606, |
| "lambda_div_used": 0.6568357795476913, |
| "learning_rate": 6.527578915497951e-07, |
| "loss": 0.0052, |
| "reward": 0.12949350103735924, |
| "reward_after_mean": 0.12949350103735924, |
| "reward_after_std": 0.7312840819358826, |
| "reward_before_mean": 0.46976747084409, |
| "reward_before_std": 0.7229422759264708, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3402740005403757, |
| "reward_change_min": -0.5887857899069786, |
| "reward_change_std": 0.2261042231693864, |
| "reward_std": 0.7312840968370438, |
| "rewards/accuracy_reward": 0.354166679084301, |
| "rewards/cosine_scaled_reward": 0.11560080386698246, |
| "step": 243 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2822.6875381469727, |
| "epoch": 0.27885714285714286, |
| "grad_norm": 0.021215323358774185, |
| "kl": 0.00015109777450561523, |
| "lambda_div_used": 0.6380957439541817, |
| "learning_rate": 6.496968239287603e-07, |
| "loss": 0.023, |
| "reward": 0.23113884031772614, |
| "reward_after_mean": 0.23113884031772614, |
| "reward_after_std": 0.6932291053235531, |
| "reward_before_mean": 0.678146418184042, |
| "reward_before_std": 0.6383852679282427, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4470075909048319, |
| "reward_change_min": -0.6521747056394815, |
| "reward_change_std": 0.2655975092202425, |
| "reward_std": 0.6932291202247143, |
| "rewards/accuracy_reward": 0.4583333469927311, |
| "rewards/cosine_scaled_reward": 0.21981305815279484, |
| "step": 244 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2480.2500915527344, |
| "epoch": 0.28, |
| "grad_norm": 0.026185423135757446, |
| "kl": 0.00016620755195617676, |
| "lambda_div_used": 0.6584924161434174, |
| "learning_rate": 6.466308972251785e-07, |
| "loss": 0.058, |
| "reward": 0.19090854283422232, |
| "reward_after_mean": 0.19090854283422232, |
| "reward_after_std": 0.726154362782836, |
| "reward_before_mean": 0.5613497914746404, |
| "reward_before_std": 0.7352566458284855, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.37044124491512775, |
| "reward_change_min": -0.6325159706175327, |
| "reward_change_std": 0.25586483906954527, |
| "reward_std": 0.7261543925851583, |
| "rewards/accuracy_reward": 0.41666668094694614, |
| "rewards/cosine_scaled_reward": 0.14468309609219432, |
| "step": 245 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2683.291717529297, |
| "epoch": 0.28114285714285714, |
| "grad_norm": 0.020767759531736374, |
| "kl": 0.00016689300537109375, |
| "lambda_div_used": 0.613718219101429, |
| "learning_rate": 6.435602608679916e-07, |
| "loss": 0.0867, |
| "reward": -0.015127861872315407, |
| "reward_after_mean": -0.015127861872315407, |
| "reward_after_std": 0.5861052125692368, |
| "reward_before_mean": 0.36624928191304207, |
| "reward_before_std": 0.5300967525690794, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.38137709721922874, |
| "reward_change_min": -0.6563880071043968, |
| "reward_change_std": 0.2452305220067501, |
| "reward_std": 0.5861052181571722, |
| "rewards/accuracy_reward": 0.3125000037252903, |
| "rewards/cosine_scaled_reward": 0.05374925094656646, |
| "step": 246 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3034.937545776367, |
| "epoch": 0.2822857142857143, |
| "grad_norm": 0.018628831952810287, |
| "kl": 0.00017150957137346268, |
| "lambda_div_used": 0.6288246288895607, |
| "learning_rate": 6.404850645156841e-07, |
| "loss": 0.0338, |
| "reward": -0.14502286911010742, |
| "reward_after_mean": -0.14502286911010742, |
| "reward_after_std": 0.6141320299357176, |
| "reward_before_mean": 0.1263586189597845, |
| "reward_before_std": 0.6014503743499517, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2713814973831177, |
| "reward_change_min": -0.5071048811078072, |
| "reward_change_std": 0.19316286500543356, |
| "reward_std": 0.61413205973804, |
| "rewards/accuracy_reward": 0.18750000186264515, |
| "rewards/cosine_scaled_reward": -0.06114138173870742, |
| "step": 247 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2029.1250228881836, |
| "epoch": 0.2834285714285714, |
| "grad_norm": 0.034633222967386246, |
| "kl": 0.00014004111289978027, |
| "lambda_div_used": 0.6094017848372459, |
| "learning_rate": 6.374054580489873e-07, |
| "loss": -0.0144, |
| "reward": 0.2783904932439327, |
| "reward_after_mean": 0.2783904932439327, |
| "reward_after_std": 0.6359313689172268, |
| "reward_before_mean": 0.8294162545353174, |
| "reward_before_std": 0.5048373020254076, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5510257538408041, |
| "reward_change_min": -0.7822528444230556, |
| "reward_change_std": 0.3142691068351269, |
| "reward_std": 0.6359313875436783, |
| "rewards/accuracy_reward": 0.5416666772216558, |
| "rewards/cosine_scaled_reward": 0.28774956427514553, |
| "step": 248 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1789.4791870117188, |
| "epoch": 0.2845714285714286, |
| "grad_norm": 0.028334610164165497, |
| "kl": 7.846951484680176e-05, |
| "lambda_div_used": 0.5886820033192635, |
| "learning_rate": 6.343215915635761e-07, |
| "loss": 0.0095, |
| "reward": -0.009319216012954712, |
| "reward_after_mean": -0.009319216012954712, |
| "reward_after_std": 0.5895384289324284, |
| "reward_before_mean": 0.46247682347893715, |
| "reward_before_std": 0.4071828918531537, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.47179603204131126, |
| "reward_change_min": -0.6552967764437199, |
| "reward_change_std": 0.2524276301264763, |
| "reward_std": 0.589538436383009, |
| "rewards/accuracy_reward": 0.3333333358168602, |
| "rewards/cosine_scaled_reward": 0.1291434899903834, |
| "step": 249 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2342.8750610351562, |
| "epoch": 0.2857142857142857, |
| "grad_norm": 0.02926229126751423, |
| "kl": 0.00020366907119750977, |
| "lambda_div_used": 0.6367609649896622, |
| "learning_rate": 6.31233615362752e-07, |
| "loss": 0.0537, |
| "reward": 0.04997219145298004, |
| "reward_after_mean": 0.04997219145298004, |
| "reward_after_std": 0.6538249664008617, |
| "reward_before_mean": 0.4029387356713414, |
| "reward_before_std": 0.6333111096173525, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.35296651534736156, |
| "reward_change_min": -0.5975028611719608, |
| "reward_change_std": 0.23207756876945496, |
| "reward_std": 0.6538249738514423, |
| "rewards/accuracy_reward": 0.31250001303851604, |
| "rewards/cosine_scaled_reward": 0.09043872263282537, |
| "step": 250 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1925.1667289733887, |
| "epoch": 0.28685714285714287, |
| "grad_norm": 0.03169158101081848, |
| "kl": 0.0001310408115386963, |
| "lambda_div_used": 0.6620426177978516, |
| "learning_rate": 6.281416799501187e-07, |
| "loss": -0.0552, |
| "reward": 0.16181311733089387, |
| "reward_after_mean": 0.16181311733089387, |
| "reward_after_std": 0.7404435630887747, |
| "reward_before_mean": 0.516552684828639, |
| "reward_before_std": 0.7539111012592912, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.35473958775401115, |
| "reward_change_min": -0.6220344565808773, |
| "reward_change_std": 0.24936181399971247, |
| "reward_std": 0.7404435705393553, |
| "rewards/accuracy_reward": 0.39583334885537624, |
| "rewards/cosine_scaled_reward": 0.12071935646235943, |
| "step": 251 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2385.4166831970215, |
| "epoch": 0.288, |
| "grad_norm": 0.027474144473671913, |
| "kl": 0.00018829107284545898, |
| "lambda_div_used": 0.5771610513329506, |
| "learning_rate": 6.25045936022246e-07, |
| "loss": 0.0396, |
| "reward": -0.18753607827238739, |
| "reward_after_mean": -0.18753607827238739, |
| "reward_after_std": 0.4631412886083126, |
| "reward_before_mean": 0.1804720275104046, |
| "reward_before_std": 0.3519942844286561, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3680081032216549, |
| "reward_change_min": -0.5239151008427143, |
| "reward_change_std": 0.20504287257790565, |
| "reward_std": 0.46314129047095776, |
| "rewards/accuracy_reward": 0.22916666977107525, |
| "rewards/cosine_scaled_reward": -0.04869466880336404, |
| "step": 252 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2590.875045776367, |
| "epoch": 0.28914285714285715, |
| "grad_norm": 0.027266530320048332, |
| "kl": 0.0001958310604095459, |
| "lambda_div_used": 0.6256092488765717, |
| "learning_rate": 6.219465344613258e-07, |
| "loss": -0.0262, |
| "reward": 0.09082555398344994, |
| "reward_after_mean": 0.09082555398344994, |
| "reward_after_std": 0.6417571641504765, |
| "reward_before_mean": 0.5176093801856041, |
| "reward_before_std": 0.5764628401957452, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4267838429659605, |
| "reward_change_min": -0.7001185156404972, |
| "reward_change_std": 0.26451323740184307, |
| "reward_std": 0.6417571865022182, |
| "rewards/accuracy_reward": 0.39583334140479565, |
| "rewards/cosine_scaled_reward": 0.12177603470627218, |
| "step": 253 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2205.3333587646484, |
| "epoch": 0.29028571428571426, |
| "grad_norm": 0.03558209538459778, |
| "kl": 0.0001392364501953125, |
| "lambda_div_used": 0.6507852524518967, |
| "learning_rate": 6.188436263278172e-07, |
| "loss": -0.0987, |
| "reward": 0.18358214199543, |
| "reward_after_mean": 0.18358214199543, |
| "reward_after_std": 0.7428858652710915, |
| "reward_before_mean": 0.5856727678328753, |
| "reward_before_std": 0.7034552115947008, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.40209066309034824, |
| "reward_change_min": -0.6650605984032154, |
| "reward_change_std": 0.2646036548539996, |
| "reward_std": 0.7428858801722527, |
| "rewards/accuracy_reward": 0.43750000931322575, |
| "rewards/cosine_scaled_reward": 0.14817279600538313, |
| "step": 254 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3061.5625610351562, |
| "epoch": 0.2914285714285714, |
| "grad_norm": 0.026399368420243263, |
| "kl": 0.00017392635345458984, |
| "lambda_div_used": 0.6185515820980072, |
| "learning_rate": 6.157373628530852e-07, |
| "loss": 0.023, |
| "reward": -0.10978002939373255, |
| "reward_after_mean": -0.10978002939373255, |
| "reward_after_std": 0.5691167917102575, |
| "reward_before_mean": 0.19686487689614296, |
| "reward_before_std": 0.5497966632246971, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3066448848694563, |
| "reward_change_min": -0.5226548612117767, |
| "reward_change_std": 0.20487169921398163, |
| "reward_std": 0.569116810336709, |
| "rewards/accuracy_reward": 0.22916667349636555, |
| "rewards/cosine_scaled_reward": -0.032301797065883875, |
| "step": 255 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2540.5833435058594, |
| "epoch": 0.2925714285714286, |
| "grad_norm": 0.02308651991188526, |
| "kl": 0.00019800662994384766, |
| "lambda_div_used": 0.6459387838840485, |
| "learning_rate": 6.126278954320294e-07, |
| "loss": -0.0509, |
| "reward": 0.1737559838220477, |
| "reward_after_mean": 0.1737559838220477, |
| "reward_after_std": 0.6614836137741804, |
| "reward_before_mean": 0.5523870065808296, |
| "reward_before_std": 0.6789623461663723, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.37863098084926605, |
| "reward_change_min": -0.6470838598906994, |
| "reward_change_std": 0.26314268447458744, |
| "reward_std": 0.661483621224761, |
| "rewards/accuracy_reward": 0.3958333469927311, |
| "rewards/cosine_scaled_reward": 0.15655364841222763, |
| "step": 256 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2811.0209045410156, |
| "epoch": 0.2937142857142857, |
| "grad_norm": 0.021752549335360527, |
| "kl": 0.00017321109771728516, |
| "lambda_div_used": 0.6104201078414917, |
| "learning_rate": 6.095153756157051e-07, |
| "loss": 0.077, |
| "reward": 0.32746705412864685, |
| "reward_after_mean": 0.32746705412864685, |
| "reward_after_std": 0.627650348469615, |
| "reward_before_mean": 0.9031309094280005, |
| "reward_before_std": 0.5052130986005068, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5756638199090958, |
| "reward_change_min": -0.8629779443144798, |
| "reward_change_std": 0.33332069404423237, |
| "reward_std": 0.6276503596454859, |
| "rewards/accuracy_reward": 0.5625000111758709, |
| "rewards/cosine_scaled_reward": 0.3406308852136135, |
| "step": 257 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3115.166732788086, |
| "epoch": 0.2948571428571429, |
| "grad_norm": 0.019492125138640404, |
| "kl": 0.00021690130233764648, |
| "lambda_div_used": 0.6207298263907433, |
| "learning_rate": 6.06399955103937e-07, |
| "loss": 0.0613, |
| "reward": 0.0009925179183483124, |
| "reward_after_mean": 0.0009925179183483124, |
| "reward_after_std": 0.5666598528623581, |
| "reward_before_mean": 0.35771505534648895, |
| "reward_before_std": 0.5585699509829283, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3567225467413664, |
| "reward_change_min": -0.6022392623126507, |
| "reward_change_std": 0.23685699328780174, |
| "reward_std": 0.5666598528623581, |
| "rewards/accuracy_reward": 0.3125000111758709, |
| "rewards/cosine_scaled_reward": 0.04521506559103727, |
| "step": 258 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2637.2500534057617, |
| "epoch": 0.296, |
| "grad_norm": 0.029653117060661316, |
| "kl": 0.00020319223403930664, |
| "lambda_div_used": 0.6043171733617783, |
| "learning_rate": 6.032817857379256e-07, |
| "loss": -0.0019, |
| "reward": -0.037313513457775116, |
| "reward_after_mean": -0.037313513457775116, |
| "reward_after_std": 0.5161938592791557, |
| "reward_before_mean": 0.3289037337526679, |
| "reward_before_std": 0.47458031587302685, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3662172295153141, |
| "reward_change_min": -0.5700537078082561, |
| "reward_change_std": 0.2212026845663786, |
| "reward_std": 0.5161938853561878, |
| "rewards/accuracy_reward": 0.33333334140479565, |
| "rewards/cosine_scaled_reward": -0.004429628141224384, |
| "step": 259 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1935.9166679382324, |
| "epoch": 0.29714285714285715, |
| "grad_norm": 0.028990233317017555, |
| "kl": 0.00012747198343276978, |
| "lambda_div_used": 0.56998710334301, |
| "learning_rate": 6.001610194928464e-07, |
| "loss": -0.0104, |
| "reward": 0.24466626904904842, |
| "reward_after_mean": 0.24466626904904842, |
| "reward_after_std": 0.5734536852687597, |
| "reward_before_mean": 0.9249862097203732, |
| "reward_before_std": 0.32228614180348814, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.6803199425339699, |
| "reward_change_min": -0.9398231357336044, |
| "reward_change_std": 0.3666897714138031, |
| "reward_std": 0.5734537076205015, |
| "rewards/accuracy_reward": 0.6041666679084301, |
| "rewards/cosine_scaled_reward": 0.3208195334300399, |
| "step": 260 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2764.8958892822266, |
| "epoch": 0.29828571428571427, |
| "grad_norm": 0.021772203966975212, |
| "kl": 0.00015366077423095703, |
| "lambda_div_used": 0.6116980388760567, |
| "learning_rate": 5.97037808470444e-07, |
| "loss": 0.0391, |
| "reward": -0.14533425867557526, |
| "reward_after_mean": -0.14533425867557526, |
| "reward_after_std": 0.5442243628203869, |
| "reward_before_mean": 0.15450917836278677, |
| "reward_before_std": 0.5254541491158307, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.29984344728291035, |
| "reward_change_min": -0.5264418311417103, |
| "reward_change_std": 0.2055408162996173, |
| "reward_std": 0.5442243684083223, |
| "rewards/accuracy_reward": 0.1875000037252903, |
| "rewards/cosine_scaled_reward": -0.03299082722514868, |
| "step": 261 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2946.541732788086, |
| "epoch": 0.29942857142857143, |
| "grad_norm": 0.02444116398692131, |
| "kl": 0.00019305944442749023, |
| "lambda_div_used": 0.5770210847258568, |
| "learning_rate": 5.939123048916173e-07, |
| "loss": -0.0078, |
| "reward": -0.30842714570462704, |
| "reward_after_mean": -0.30842714570462704, |
| "reward_after_std": 0.4061661623418331, |
| "reward_before_mean": -0.014746684581041336, |
| "reward_before_std": 0.35232585947960615, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.29368047416210175, |
| "reward_change_min": -0.43233491107821465, |
| "reward_change_std": 0.167787273414433, |
| "reward_std": 0.40616616792976856, |
| "rewards/accuracy_reward": 0.10416666977107525, |
| "rewards/cosine_scaled_reward": -0.11891335435211658, |
| "step": 262 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2761.7083740234375, |
| "epoch": 0.30057142857142854, |
| "grad_norm": 0.026775242760777473, |
| "kl": 0.00015038251876831055, |
| "lambda_div_used": 0.5933946445584297, |
| "learning_rate": 5.907846610890011e-07, |
| "loss": -0.0212, |
| "reward": -0.28860565181821585, |
| "reward_after_mean": -0.28860565181821585, |
| "reward_after_std": 0.4898714739829302, |
| "reward_before_mean": -0.010391712188720703, |
| "reward_before_std": 0.4291188698261976, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27821394614875317, |
| "reward_change_min": -0.43029162287712097, |
| "reward_change_std": 0.16667384281754494, |
| "reward_std": 0.48987148329615593, |
| "rewards/accuracy_reward": 0.1041666679084301, |
| "rewards/cosine_scaled_reward": -0.11455838289111853, |
| "step": 263 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2586.3750534057617, |
| "epoch": 0.3017142857142857, |
| "grad_norm": 0.01994405686855316, |
| "kl": 0.0001627206802368164, |
| "lambda_div_used": 0.6219401434063911, |
| "learning_rate": 5.87655029499542e-07, |
| "loss": 0.043, |
| "reward": -0.11865252908319235, |
| "reward_after_mean": -0.11865252908319235, |
| "reward_after_std": 0.59744056686759, |
| "reward_before_mean": 0.1666876282542944, |
| "reward_before_std": 0.5631906799972057, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2853401657193899, |
| "reward_change_min": -0.4587775580585003, |
| "reward_change_std": 0.18016593530774117, |
| "reward_std": 0.597440579906106, |
| "rewards/accuracy_reward": 0.2083333395421505, |
| "rewards/cosine_scaled_reward": -0.04164570523425937, |
| "step": 264 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1858.2083587646484, |
| "epoch": 0.3028571428571429, |
| "grad_norm": 0.02953009493649006, |
| "kl": 0.0001354515552520752, |
| "lambda_div_used": 0.5754420235753059, |
| "learning_rate": 5.845235626570683e-07, |
| "loss": 0.1211, |
| "reward": 0.017749376595020294, |
| "reward_after_mean": 0.017749376595020294, |
| "reward_after_std": 0.478146318346262, |
| "reward_before_mean": 0.4997409600764513, |
| "reward_before_std": 0.3383461497724056, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.48199158161878586, |
| "reward_change_min": -0.6475610621273518, |
| "reward_change_std": 0.2544400207698345, |
| "reward_std": 0.4781463425606489, |
| "rewards/accuracy_reward": 0.37500000558793545, |
| "rewards/cosine_scaled_reward": 0.12474094179924577, |
| "step": 265 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3198.625, |
| "epoch": 0.304, |
| "grad_norm": 0.019427413120865822, |
| "kl": 0.00019751489162445068, |
| "lambda_div_used": 0.5534809529781342, |
| "learning_rate": 5.813904131848564e-07, |
| "loss": -0.0091, |
| "reward": -0.40436042100191116, |
| "reward_after_mean": -0.40436042100191116, |
| "reward_after_std": 0.2967198472470045, |
| "reward_before_mean": -0.12292576022446156, |
| "reward_before_std": 0.23972244351170957, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2814346421509981, |
| "reward_change_min": -0.43088357895612717, |
| "reward_change_std": 0.16106584202498198, |
| "reward_std": 0.29671985376626253, |
| "rewards/accuracy_reward": 0.1041666716337204, |
| "rewards/cosine_scaled_reward": -0.22709244303405285, |
| "step": 266 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2879.354202270508, |
| "epoch": 0.30514285714285716, |
| "grad_norm": 0.02146352268755436, |
| "kl": 0.00019466876983642578, |
| "lambda_div_used": 0.6196897253394127, |
| "learning_rate": 5.78255733788191e-07, |
| "loss": -0.0062, |
| "reward": -0.12080054543912411, |
| "reward_after_mean": -0.12080054543912411, |
| "reward_after_std": 0.5834086053073406, |
| "reward_before_mean": 0.1734671276062727, |
| "reward_before_std": 0.5533927101641893, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.29426765628159046, |
| "reward_change_min": -0.4760690741240978, |
| "reward_change_std": 0.18818860314786434, |
| "reward_std": 0.5834086183458567, |
| "rewards/accuracy_reward": 0.2083333395421505, |
| "rewards/cosine_scaled_reward": -0.03486621752381325, |
| "step": 267 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2556.5208740234375, |
| "epoch": 0.3062857142857143, |
| "grad_norm": 0.03222980722784996, |
| "kl": 0.00019940733909606934, |
| "lambda_div_used": 0.6217963546514511, |
| "learning_rate": 5.751196772469237e-07, |
| "loss": 0.11, |
| "reward": -0.0901529286056757, |
| "reward_after_mean": -0.0901529286056757, |
| "reward_after_std": 0.5890399143099785, |
| "reward_before_mean": 0.22261973470449448, |
| "reward_before_std": 0.5651240181177855, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3127726651728153, |
| "reward_change_min": -0.5852204114198685, |
| "reward_change_std": 0.21594477724283934, |
| "reward_std": 0.5890399310737848, |
| "rewards/accuracy_reward": 0.2708333395421505, |
| "rewards/cosine_scaled_reward": -0.04821361042559147, |
| "step": 268 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2763.0000534057617, |
| "epoch": 0.30742857142857144, |
| "grad_norm": 0.023989371955394745, |
| "kl": 0.00017099082469940186, |
| "lambda_div_used": 0.5594866573810577, |
| "learning_rate": 5.71982396408026e-07, |
| "loss": 0.04, |
| "reward": 0.10778852179646492, |
| "reward_after_mean": 0.10778852179646492, |
| "reward_after_std": 0.45204984955489635, |
| "reward_before_mean": 0.7018298227339983, |
| "reward_before_std": 0.26720918249338865, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5940412897616625, |
| "reward_change_min": -0.8074228167533875, |
| "reward_change_std": 0.31475239619612694, |
| "reward_std": 0.4520498663187027, |
| "rewards/accuracy_reward": 0.4791666716337204, |
| "rewards/cosine_scaled_reward": 0.22266313433647156, |
| "step": 269 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2451.7917404174805, |
| "epoch": 0.30857142857142855, |
| "grad_norm": 0.02231750823557377, |
| "kl": 0.00018018484115600586, |
| "lambda_div_used": 0.6626150384545326, |
| "learning_rate": 5.688440441781398e-07, |
| "loss": 0.0293, |
| "reward": 0.23073547426611185, |
| "reward_after_mean": 0.23073547426611185, |
| "reward_after_std": 0.8142610676586628, |
| "reward_before_mean": 0.6419318169355392, |
| "reward_before_std": 0.7594864275306463, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4111963789910078, |
| "reward_change_min": -0.6618837527930737, |
| "reward_change_std": 0.26606686785817146, |
| "reward_std": 0.8142610862851143, |
| "rewards/accuracy_reward": 0.4583333432674408, |
| "rewards/cosine_scaled_reward": 0.18359847948886454, |
| "step": 270 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1821.8333740234375, |
| "epoch": 0.3097142857142857, |
| "grad_norm": 0.029715267941355705, |
| "kl": 0.00010024569928646088, |
| "lambda_div_used": 0.6296036839485168, |
| "learning_rate": 5.657047735161255e-07, |
| "loss": -0.127, |
| "reward": 0.15136122331023216, |
| "reward_after_mean": 0.15136122331023216, |
| "reward_after_std": 0.6207431796938181, |
| "reward_before_mean": 0.5634245574474335, |
| "reward_before_std": 0.6018376401625574, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4120633378624916, |
| "reward_change_min": -0.6677181459963322, |
| "reward_change_std": 0.26569564640522003, |
| "reward_std": 0.6207432132214308, |
| "rewards/accuracy_reward": 0.4375000111758709, |
| "rewards/cosine_scaled_reward": 0.1259245565161109, |
| "step": 271 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2659.0834045410156, |
| "epoch": 0.31085714285714283, |
| "grad_norm": 0.025181055068969727, |
| "kl": 0.0002090930938720703, |
| "lambda_div_used": 0.6270301192998886, |
| "learning_rate": 5.625647374256061e-07, |
| "loss": 0.0218, |
| "reward": 0.11005037371069193, |
| "reward_after_mean": 0.11005037371069193, |
| "reward_after_std": 0.6581083033233881, |
| "reward_before_mean": 0.5188289349898696, |
| "reward_before_std": 0.59244554489851, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.40877855755388737, |
| "reward_change_min": -0.6175772212445736, |
| "reward_change_std": 0.2535307565703988, |
| "reward_std": 0.6581083126366138, |
| "rewards/accuracy_reward": 0.39583334140479565, |
| "rewards/cosine_scaled_reward": 0.12299557868391275, |
| "step": 272 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2516.875045776367, |
| "epoch": 0.312, |
| "grad_norm": 0.022818049415946007, |
| "kl": 0.00016960501670837402, |
| "lambda_div_used": 0.5807301178574562, |
| "learning_rate": 5.594240889475106e-07, |
| "loss": -0.0175, |
| "reward": 0.09495561942458153, |
| "reward_after_mean": 0.09495561942458153, |
| "reward_after_std": 0.5033265259116888, |
| "reward_before_mean": 0.6270047463476658, |
| "reward_before_std": 0.3717161314561963, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5320491325110197, |
| "reward_change_min": -0.7614771388471127, |
| "reward_change_std": 0.30045478232204914, |
| "reward_std": 0.5033265501260757, |
| "rewards/accuracy_reward": 0.4166666716337204, |
| "rewards/cosine_scaled_reward": 0.2103380784392357, |
| "step": 273 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1573.3750228881836, |
| "epoch": 0.31314285714285717, |
| "grad_norm": 0.03588717430830002, |
| "kl": 0.00010608136653900146, |
| "lambda_div_used": 0.6433197036385536, |
| "learning_rate": 5.562829811526154e-07, |
| "loss": -0.0021, |
| "reward": 0.24507278576493263, |
| "reward_after_mean": 0.24507278576493263, |
| "reward_after_std": 0.7453816495835781, |
| "reward_before_mean": 0.6975303117651492, |
| "reward_before_std": 0.6636776090599597, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4524575434625149, |
| "reward_change_min": -0.703257791697979, |
| "reward_change_std": 0.2789953136816621, |
| "reward_std": 0.7453816495835781, |
| "rewards/accuracy_reward": 0.479166679084301, |
| "rewards/cosine_scaled_reward": 0.21836365200579166, |
| "step": 274 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2231.375015258789, |
| "epoch": 0.3142857142857143, |
| "grad_norm": 0.02452162466943264, |
| "kl": 0.00013327598571777344, |
| "lambda_div_used": 0.5854036509990692, |
| "learning_rate": 5.531415671340826e-07, |
| "loss": 0.0351, |
| "reward": 0.1521737277507782, |
| "reward_after_mean": 0.1521737277507782, |
| "reward_after_std": 0.5359849948436022, |
| "reward_before_mean": 0.6887877276167274, |
| "reward_before_std": 0.3889514375478029, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5366139896214008, |
| "reward_change_min": -0.7292314879596233, |
| "reward_change_std": 0.28961729165166616, |
| "reward_std": 0.5359850041568279, |
| "rewards/accuracy_reward": 0.47916667722165585, |
| "rewards/cosine_scaled_reward": 0.20962106250226498, |
| "step": 275 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2479.375015258789, |
| "epoch": 0.31542857142857145, |
| "grad_norm": 0.022260351106524467, |
| "kl": 0.0001841336488723755, |
| "lambda_div_used": 0.6080946400761604, |
| "learning_rate": 5.5e-07, |
| "loss": 0.0233, |
| "reward": 0.15685568936169147, |
| "reward_after_mean": 0.15685568936169147, |
| "reward_after_std": 0.6163474209606647, |
| "reward_before_mean": 0.6554346140474081, |
| "reward_before_std": 0.5032088747248054, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.49857890233397484, |
| "reward_change_min": -0.7599121108651161, |
| "reward_change_std": 0.2981911161914468, |
| "reward_std": 0.6163474582135677, |
| "rewards/accuracy_reward": 0.4583333395421505, |
| "rewards/cosine_scaled_reward": 0.19710125587880611, |
| "step": 276 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2441.625045776367, |
| "epoch": 0.31657142857142856, |
| "grad_norm": 0.0253590177744627, |
| "kl": 0.0002009868621826172, |
| "lambda_div_used": 0.662374809384346, |
| "learning_rate": 5.468584328659172e-07, |
| "loss": 0.0379, |
| "reward": 0.13848626799881458, |
| "reward_after_mean": 0.13848626799881458, |
| "reward_after_std": 0.8288281839340925, |
| "reward_before_mean": 0.5136084349360317, |
| "reward_before_std": 0.7587530063465238, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.37512217462062836, |
| "reward_change_min": -0.6195148192346096, |
| "reward_change_std": 0.24048541858792305, |
| "reward_std": 0.8288282137364149, |
| "rewards/accuracy_reward": 0.37500000558793545, |
| "rewards/cosine_scaled_reward": 0.13860841654241085, |
| "step": 277 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2101.729202270508, |
| "epoch": 0.3177142857142857, |
| "grad_norm": 0.03273219242691994, |
| "kl": 0.0001831650733947754, |
| "lambda_div_used": 0.5535080209374428, |
| "learning_rate": 5.437170188473847e-07, |
| "loss": 0.0176, |
| "reward": 0.026296844705939293, |
| "reward_after_mean": 0.026296844705939293, |
| "reward_after_std": 0.47010152228176594, |
| "reward_before_mean": 0.6100956231821328, |
| "reward_before_std": 0.24391112057492137, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5837987624108791, |
| "reward_change_min": -0.8103098906576633, |
| "reward_change_std": 0.3077183160930872, |
| "reward_std": 0.4701015278697014, |
| "rewards/accuracy_reward": 0.4166666679084301, |
| "rewards/cosine_scaled_reward": 0.19342893542489037, |
| "step": 278 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3132.5416717529297, |
| "epoch": 0.31885714285714284, |
| "grad_norm": 0.021023932844400406, |
| "kl": 0.000225067138671875, |
| "lambda_div_used": 0.6101865917444229, |
| "learning_rate": 5.405759110524894e-07, |
| "loss": 0.0252, |
| "reward": -0.24995685555040836, |
| "reward_after_mean": -0.24995685555040836, |
| "reward_after_std": 0.5465981848537922, |
| "reward_before_mean": 0.00569869764149189, |
| "reward_before_std": 0.5138198006898165, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.25565553829073906, |
| "reward_change_min": -0.42565521970391273, |
| "reward_change_std": 0.1681571202352643, |
| "reward_std": 0.5465981848537922, |
| "rewards/accuracy_reward": 0.14583333395421505, |
| "rewards/cosine_scaled_reward": -0.14013465493917465, |
| "step": 279 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2124.208381652832, |
| "epoch": 0.32, |
| "grad_norm": 0.03544396162033081, |
| "kl": 0.000164031982421875, |
| "lambda_div_used": 0.6441325098276138, |
| "learning_rate": 5.37435262574394e-07, |
| "loss": 0.0491, |
| "reward": 0.1208603996783495, |
| "reward_after_mean": 0.1208603996783495, |
| "reward_after_std": 0.6665525771677494, |
| "reward_before_mean": 0.48556846380233765, |
| "reward_before_std": 0.6708142012357712, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.364708062261343, |
| "reward_change_min": -0.6562114134430885, |
| "reward_change_std": 0.25303495209664106, |
| "reward_std": 0.66655258461833, |
| "rewards/accuracy_reward": 0.354166679084301, |
| "rewards/cosine_scaled_reward": 0.13140178471803665, |
| "step": 280 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3433.6459045410156, |
| "epoch": 0.3211428571428571, |
| "grad_norm": 0.0178577471524477, |
| "kl": 0.00023686885833740234, |
| "lambda_div_used": 0.5564139187335968, |
| "learning_rate": 5.342952264838747e-07, |
| "loss": 0.0092, |
| "reward": -0.4361804537475109, |
| "reward_after_mean": -0.4361804537475109, |
| "reward_after_std": 0.3096983712166548, |
| "reward_before_mean": -0.1626793835312128, |
| "reward_before_std": 0.2517517115920782, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2735010664910078, |
| "reward_change_min": -0.44846677780151367, |
| "reward_change_std": 0.1593692358583212, |
| "reward_std": 0.3096983730792999, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/cosine_scaled_reward": -0.22517940029501915, |
| "step": 281 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2314.645866394043, |
| "epoch": 0.3222857142857143, |
| "grad_norm": 0.023510051891207695, |
| "kl": 0.00020015239715576172, |
| "lambda_div_used": 0.5891857892274857, |
| "learning_rate": 5.311559558218603e-07, |
| "loss": -0.0533, |
| "reward": 0.015196382999420166, |
| "reward_after_mean": 0.015196382999420166, |
| "reward_after_std": 0.506258824840188, |
| "reward_before_mean": 0.4649945506826043, |
| "reward_before_std": 0.40477199107408524, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.449798122048378, |
| "reward_change_min": -0.6414072066545486, |
| "reward_change_std": 0.2532971305772662, |
| "reward_std": 0.5062588378787041, |
| "rewards/accuracy_reward": 0.35416667722165585, |
| "rewards/cosine_scaled_reward": 0.11082786321640015, |
| "step": 282 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2393.854232788086, |
| "epoch": 0.32342857142857145, |
| "grad_norm": 0.02048996463418007, |
| "kl": 0.00015240907669067383, |
| "lambda_div_used": 0.587490864098072, |
| "learning_rate": 5.28017603591974e-07, |
| "loss": 0.0139, |
| "reward": 0.30065850354731083, |
| "reward_after_mean": 0.30065850354731083, |
| "reward_after_std": 0.5727098472416401, |
| "reward_before_mean": 0.9280985994264483, |
| "reward_before_std": 0.3965794490650296, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.6274400968104601, |
| "reward_change_min": -0.8531196974217892, |
| "reward_change_std": 0.3378349719569087, |
| "reward_std": 0.5727098621428013, |
| "rewards/accuracy_reward": 0.6041666772216558, |
| "rewards/cosine_scaled_reward": 0.3239319231361151, |
| "step": 283 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2126.020851135254, |
| "epoch": 0.32457142857142857, |
| "grad_norm": 0.028623223304748535, |
| "kl": 0.00013819336891174316, |
| "lambda_div_used": 0.6661800816655159, |
| "learning_rate": 5.248803227530763e-07, |
| "loss": 0.0092, |
| "reward": 0.09131545946002007, |
| "reward_after_mean": 0.09131545946002007, |
| "reward_after_std": 0.7503824215382338, |
| "reward_before_mean": 0.4026043973863125, |
| "reward_before_std": 0.788893286138773, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3112889491021633, |
| "reward_change_min": -0.6233196444809437, |
| "reward_change_std": 0.24689115211367607, |
| "reward_std": 0.7503824215382338, |
| "rewards/accuracy_reward": 0.3125000037252903, |
| "rewards/cosine_scaled_reward": 0.09010439366102219, |
| "step": 284 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1871.5000228881836, |
| "epoch": 0.32571428571428573, |
| "grad_norm": 0.02662411518394947, |
| "kl": 0.00016036629676818848, |
| "lambda_div_used": 0.576582707464695, |
| "learning_rate": 5.21744266211809e-07, |
| "loss": 0.0319, |
| "reward": -0.3101608529686928, |
| "reward_after_mean": -0.3101608529686928, |
| "reward_after_std": 0.41523571871221066, |
| "reward_before_mean": -0.01919533498585224, |
| "reward_before_std": 0.3510099109262228, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2909655049443245, |
| "reward_change_min": -0.4455920048058033, |
| "reward_change_std": 0.16767465602606535, |
| "reward_std": 0.4152357243001461, |
| "rewards/accuracy_reward": 0.10416666977107525, |
| "rewards/cosine_scaled_reward": -0.12336201290600002, |
| "step": 285 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2349.000030517578, |
| "epoch": 0.32685714285714285, |
| "grad_norm": 0.02408268116414547, |
| "kl": 0.00019100308418273926, |
| "lambda_div_used": 0.6201390102505684, |
| "learning_rate": 5.186095868151436e-07, |
| "loss": 0.0307, |
| "reward": 0.21592768095433712, |
| "reward_after_mean": 0.21592768095433712, |
| "reward_after_std": 0.66009739972651, |
| "reward_before_mean": 0.7218069694936275, |
| "reward_before_std": 0.554044695571065, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5058793053030968, |
| "reward_change_min": -0.7697652019560337, |
| "reward_change_std": 0.2988813826814294, |
| "reward_std": 0.6600974258035421, |
| "rewards/accuracy_reward": 0.5000000093132257, |
| "rewards/cosine_scaled_reward": 0.22180695831775665, |
| "step": 286 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1718.604175567627, |
| "epoch": 0.328, |
| "grad_norm": 0.03919753059744835, |
| "kl": 0.00013002753257751465, |
| "lambda_div_used": 0.5975622236728668, |
| "learning_rate": 5.154764373429315e-07, |
| "loss": -0.1056, |
| "reward": 0.03168256084609311, |
| "reward_after_mean": 0.03168256084609311, |
| "reward_after_std": 0.5345403701066971, |
| "reward_before_mean": 0.4581381119787693, |
| "reward_before_std": 0.44751388020813465, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.42645558528602123, |
| "reward_change_min": -0.617030244320631, |
| "reward_change_std": 0.24450463335961103, |
| "reward_std": 0.5345403775572777, |
| "rewards/accuracy_reward": 0.33333334140479565, |
| "rewards/cosine_scaled_reward": 0.12480480223894119, |
| "step": 287 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2906.3541717529297, |
| "epoch": 0.3291428571428571, |
| "grad_norm": 0.019498826935887337, |
| "kl": 0.00022274255752563477, |
| "lambda_div_used": 0.5806883201003075, |
| "learning_rate": 5.123449705004581e-07, |
| "loss": 0.0055, |
| "reward": -0.18337237276136875, |
| "reward_after_mean": -0.18337237276136875, |
| "reward_after_std": 0.4604283105581999, |
| "reward_before_mean": 0.1833638995885849, |
| "reward_before_std": 0.3699948964640498, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.36673627234995365, |
| "reward_change_min": -0.5305228792130947, |
| "reward_change_std": 0.20658569782972336, |
| "reward_std": 0.4604283291846514, |
| "rewards/accuracy_reward": 0.22916666977107525, |
| "rewards/cosine_scaled_reward": -0.04580276645720005, |
| "step": 288 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2202.666702270508, |
| "epoch": 0.3302857142857143, |
| "grad_norm": 0.033871494233608246, |
| "kl": 0.00016301870346069336, |
| "lambda_div_used": 0.5840093046426773, |
| "learning_rate": 5.09215338910999e-07, |
| "loss": -0.0478, |
| "reward": 0.007814206182956696, |
| "reward_after_mean": 0.007814206182956696, |
| "reward_after_std": 0.4950754214078188, |
| "reward_before_mean": 0.46548917703330517, |
| "reward_before_std": 0.381549178622663, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.45767495781183243, |
| "reward_change_min": -0.6326170898973942, |
| "reward_change_std": 0.24920715391635895, |
| "reward_std": 0.49507543072104454, |
| "rewards/accuracy_reward": 0.37500000558793545, |
| "rewards/cosine_scaled_reward": 0.09048915281891823, |
| "step": 289 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1533.6042251586914, |
| "epoch": 0.3314285714285714, |
| "grad_norm": 0.032007429748773575, |
| "kl": 0.00014695525169372559, |
| "lambda_div_used": 0.6211200878024101, |
| "learning_rate": 5.060876951083828e-07, |
| "loss": 0.0764, |
| "reward": 0.04068056936375797, |
| "reward_after_mean": 0.04068056936375797, |
| "reward_after_std": 0.6179232522845268, |
| "reward_before_mean": 0.4251148612238467, |
| "reward_before_std": 0.556100070476532, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3844342865049839, |
| "reward_change_min": -0.5800922103226185, |
| "reward_change_std": 0.22699587792158127, |
| "reward_std": 0.617923267185688, |
| "rewards/accuracy_reward": 0.3333333395421505, |
| "rewards/cosine_scaled_reward": 0.09178150352090597, |
| "step": 290 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1996.187515258789, |
| "epoch": 0.3325714285714286, |
| "grad_norm": 0.024394486099481583, |
| "kl": 0.00016900897026062012, |
| "lambda_div_used": 0.67839565128088, |
| "learning_rate": 5.02962191529556e-07, |
| "loss": 0.0077, |
| "reward": 0.09233464859426022, |
| "reward_after_mean": 0.09233464859426022, |
| "reward_after_std": 0.8132282309234142, |
| "reward_before_mean": 0.3814069051295519, |
| "reward_before_std": 0.8372865226119757, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2890722490847111, |
| "reward_change_min": -0.5995339304208755, |
| "reward_change_std": 0.22939695976674557, |
| "reward_std": 0.8132282607257366, |
| "rewards/accuracy_reward": 0.31250000558793545, |
| "rewards/cosine_scaled_reward": 0.06890689395368099, |
| "step": 291 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2611.541702270508, |
| "epoch": 0.33371428571428574, |
| "grad_norm": 0.022669553756713867, |
| "kl": 0.00019761919975280762, |
| "lambda_div_used": 0.6018925532698631, |
| "learning_rate": 4.998389805071536e-07, |
| "loss": 0.0303, |
| "reward": -0.11102894321084023, |
| "reward_after_mean": -0.11102894321084023, |
| "reward_after_std": 0.5624180883169174, |
| "reward_before_mean": 0.24086102936416864, |
| "reward_before_std": 0.4718956621363759, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.35188993997871876, |
| "reward_change_min": -0.5550865493714809, |
| "reward_change_std": 0.21156923100352287, |
| "reward_std": 0.5624181143939495, |
| "rewards/accuracy_reward": 0.27083333767950535, |
| "rewards/cosine_scaled_reward": -0.029972338117659092, |
| "step": 292 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2105.812545776367, |
| "epoch": 0.33485714285714285, |
| "grad_norm": 0.022291820496320724, |
| "kl": 0.00020776689052581787, |
| "lambda_div_used": 0.5899104326963425, |
| "learning_rate": 4.967182142620745e-07, |
| "loss": -0.0174, |
| "reward": -0.15591239370405674, |
| "reward_after_mean": -0.15591239370405674, |
| "reward_after_std": 0.4546964541077614, |
| "reward_before_mean": 0.18034806847572327, |
| "reward_before_std": 0.4065048359334469, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.33626046776771545, |
| "reward_change_min": -0.5091775916516781, |
| "reward_change_std": 0.19646561425179243, |
| "reward_std": 0.4546964690089226, |
| "rewards/accuracy_reward": 0.22916667722165585, |
| "rewards/cosine_scaled_reward": -0.048818591982126236, |
| "step": 293 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3101.562545776367, |
| "epoch": 0.336, |
| "grad_norm": 0.020145803689956665, |
| "kl": 0.0002519190311431885, |
| "lambda_div_used": 0.5795014202594757, |
| "learning_rate": 4.93600044896063e-07, |
| "loss": -0.0181, |
| "reward": -0.29703800566494465, |
| "reward_after_mean": -0.29703800566494465, |
| "reward_after_std": 0.4137891363352537, |
| "reward_before_mean": -0.004143683239817619, |
| "reward_before_std": 0.3665550462901592, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2928943391889334, |
| "reward_change_min": -0.4820845164358616, |
| "reward_change_std": 0.18245024606585503, |
| "reward_std": 0.4137891549617052, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/cosine_scaled_reward": -0.1291436767205596, |
| "step": 294 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2967.1250076293945, |
| "epoch": 0.33714285714285713, |
| "grad_norm": 0.03346420079469681, |
| "kl": 0.00022870302200317383, |
| "lambda_div_used": 0.5998342111706734, |
| "learning_rate": 4.904846243842949e-07, |
| "loss": 0.0317, |
| "reward": -0.21130692400038242, |
| "reward_after_mean": -0.21130692400038242, |
| "reward_after_std": 0.4944173116236925, |
| "reward_before_mean": 0.07879196340218186, |
| "reward_before_std": 0.4600360617041588, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2900988757610321, |
| "reward_change_min": -0.47371478378772736, |
| "reward_change_std": 0.18294072337448597, |
| "reward_std": 0.49441731721162796, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/cosine_scaled_reward": -0.08787472359836102, |
| "step": 295 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2934.3541870117188, |
| "epoch": 0.3382857142857143, |
| "grad_norm": 0.024312211200594902, |
| "kl": 0.00023806095123291016, |
| "lambda_div_used": 0.5608572289347649, |
| "learning_rate": 4.873721045679706e-07, |
| "loss": 0.0627, |
| "reward": -0.18713558092713356, |
| "reward_after_mean": -0.18713558092713356, |
| "reward_after_std": 0.37113036401569843, |
| "reward_before_mean": 0.2144376989454031, |
| "reward_before_std": 0.2741664042696357, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.40157328359782696, |
| "reward_change_min": -0.5742517001926899, |
| "reward_change_std": 0.22423129715025425, |
| "reward_std": 0.3711303863674402, |
| "rewards/accuracy_reward": 0.2291666716337204, |
| "rewards/cosine_scaled_reward": -0.01472897082567215, |
| "step": 296 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3350.8958435058594, |
| "epoch": 0.3394285714285714, |
| "grad_norm": 0.016911419108510017, |
| "kl": 0.0002732276916503906, |
| "lambda_div_used": 0.6085675731301308, |
| "learning_rate": 4.842626371469149e-07, |
| "loss": -0.0001, |
| "reward": -0.13465989474207163, |
| "reward_after_mean": -0.13465989474207163, |
| "reward_after_std": 0.5554658677428961, |
| "reward_before_mean": 0.1848100395873189, |
| "reward_before_std": 0.494691526517272, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.31946992687880993, |
| "reward_change_min": -0.5050319209694862, |
| "reward_change_std": 0.18696323037147522, |
| "reward_std": 0.5554658826440573, |
| "rewards/accuracy_reward": 0.2291666679084301, |
| "rewards/cosine_scaled_reward": -0.04435660713352263, |
| "step": 297 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2694.791748046875, |
| "epoch": 0.3405714285714286, |
| "grad_norm": 0.022230952978134155, |
| "kl": 0.0001736283302307129, |
| "lambda_div_used": 0.585621178150177, |
| "learning_rate": 4.811563736721829e-07, |
| "loss": -0.0641, |
| "reward": -0.09131219866685569, |
| "reward_after_mean": -0.09131219866685569, |
| "reward_after_std": 0.521870668977499, |
| "reward_before_mean": 0.32360453344881535, |
| "reward_before_std": 0.3913488043472171, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.41491674818098545, |
| "reward_change_min": -0.6092600487172604, |
| "reward_change_std": 0.22935225442051888, |
| "reward_std": 0.5218706801533699, |
| "rewards/accuracy_reward": 0.31250000186264515, |
| "rewards/cosine_scaled_reward": 0.011104530887678266, |
| "step": 298 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3055.562530517578, |
| "epoch": 0.3417142857142857, |
| "grad_norm": 0.016956690698862076, |
| "kl": 0.00024694204330444336, |
| "lambda_div_used": 0.6225644424557686, |
| "learning_rate": 4.780534655386743e-07, |
| "loss": -0.0092, |
| "reward": 0.035534653812646866, |
| "reward_after_mean": 0.035534653812646866, |
| "reward_after_std": 0.5773687828332186, |
| "reward_before_mean": 0.4040503818541765, |
| "reward_before_std": 0.5686829779297113, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3685157597064972, |
| "reward_change_min": -0.6160638965666294, |
| "reward_change_std": 0.24328476376831532, |
| "reward_std": 0.5773687846958637, |
| "rewards/accuracy_reward": 0.3125000111758709, |
| "rewards/cosine_scaled_reward": 0.09155038185417652, |
| "step": 299 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3382.041717529297, |
| "epoch": 0.34285714285714286, |
| "grad_norm": 0.018982913345098495, |
| "kl": 0.00028121471405029297, |
| "lambda_div_used": 0.5635487735271454, |
| "learning_rate": 4.749540639777539e-07, |
| "loss": 0.0416, |
| "reward": -0.35363302007317543, |
| "reward_after_mean": -0.35363302007317543, |
| "reward_after_std": 0.3348991144448519, |
| "reward_before_mean": -0.05836603417992592, |
| "reward_before_std": 0.2893520062789321, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2952669896185398, |
| "reward_change_min": -0.45902693271636963, |
| "reward_change_std": 0.17492949962615967, |
| "reward_std": 0.33489912562072277, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.14169937558472157, |
| "step": 300 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2189.666732788086, |
| "epoch": 0.344, |
| "grad_norm": 0.02743070013821125, |
| "kl": 0.00020772218704223633, |
| "lambda_div_used": 0.6162105649709702, |
| "learning_rate": 4.7185832004988133e-07, |
| "loss": 0.077, |
| "reward": -0.2270398661494255, |
| "reward_after_mean": -0.2270398661494255, |
| "reward_after_std": 0.5807012170553207, |
| "reward_before_mean": 0.03607312589883804, |
| "reward_before_std": 0.5330064725130796, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.263112997636199, |
| "reward_change_min": -0.46500347927212715, |
| "reward_change_std": 0.16810790356248617, |
| "reward_std": 0.5807012394070625, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.10976020619273186, |
| "step": 301 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2271.9167098999023, |
| "epoch": 0.34514285714285714, |
| "grad_norm": 0.03009362705051899, |
| "kl": 0.00019848346710205078, |
| "lambda_div_used": 0.6102809309959412, |
| "learning_rate": 4.68766384637248e-07, |
| "loss": -0.0061, |
| "reward": 0.1712653641588986, |
| "reward_after_mean": 0.1712653641588986, |
| "reward_after_std": 0.6374101359397173, |
| "reward_before_mean": 0.6755810640752316, |
| "reward_before_std": 0.5059886500239372, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5043157208710909, |
| "reward_change_min": -0.7193296477198601, |
| "reward_change_std": 0.2849651984870434, |
| "reward_std": 0.637410145252943, |
| "rewards/accuracy_reward": 0.4583333395421505, |
| "rewards/cosine_scaled_reward": 0.21724772220477462, |
| "step": 302 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2379.8125381469727, |
| "epoch": 0.3462857142857143, |
| "grad_norm": 0.028283070772886276, |
| "kl": 0.00022455304861068726, |
| "lambda_div_used": 0.5937864035367966, |
| "learning_rate": 4.656784084364238e-07, |
| "loss": -0.0542, |
| "reward": -0.11667206266429275, |
| "reward_after_mean": -0.11667206266429275, |
| "reward_after_std": 0.46380676329135895, |
| "reward_before_mean": 0.24170983396470547, |
| "reward_before_std": 0.4273997135460377, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3583819102495909, |
| "reward_change_min": -0.5776236318051815, |
| "reward_change_std": 0.22200697474181652, |
| "reward_std": 0.4638067800551653, |
| "rewards/accuracy_reward": 0.2500000111758709, |
| "rewards/cosine_scaled_reward": -0.008290180005133152, |
| "step": 303 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2483.270881652832, |
| "epoch": 0.3474285714285714, |
| "grad_norm": 0.02691042050719261, |
| "kl": 0.0002352595329284668, |
| "lambda_div_used": 0.6489763781428337, |
| "learning_rate": 4.6259454195101267e-07, |
| "loss": -0.0177, |
| "reward": -0.052505167201161385, |
| "reward_after_mean": -0.052505167201161385, |
| "reward_after_std": 0.7108322139829397, |
| "reward_before_mean": 0.2283354545943439, |
| "reward_before_std": 0.6949998550117016, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28084064833819866, |
| "reward_change_min": -0.5053656212985516, |
| "reward_change_std": 0.1956273503601551, |
| "reward_std": 0.7108322307467461, |
| "rewards/accuracy_reward": 0.2291666716337204, |
| "rewards/cosine_scaled_reward": -0.00083120446652174, |
| "step": 304 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2814.229232788086, |
| "epoch": 0.3485714285714286, |
| "grad_norm": 0.023387128487229347, |
| "kl": 0.0002447366714477539, |
| "lambda_div_used": 0.5986855253577232, |
| "learning_rate": 4.59514935484316e-07, |
| "loss": -0.0237, |
| "reward": -0.17212717607617378, |
| "reward_after_mean": -0.17212717607617378, |
| "reward_after_std": 0.5009447801858187, |
| "reward_before_mean": 0.13333414122462273, |
| "reward_before_std": 0.45383385568857193, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.30546131171286106, |
| "reward_change_min": -0.460513886064291, |
| "reward_change_std": 0.1838802546262741, |
| "reward_std": 0.5009447950869799, |
| "rewards/accuracy_reward": 0.20833334140479565, |
| "rewards/cosine_scaled_reward": -0.07499920274131, |
| "step": 305 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2095.0000190734863, |
| "epoch": 0.3497142857142857, |
| "grad_norm": 0.03199386969208717, |
| "kl": 0.00018633902072906494, |
| "lambda_div_used": 0.595753937959671, |
| "learning_rate": 4.5643973913200837e-07, |
| "loss": 0.0245, |
| "reward": 0.03899537643883377, |
| "reward_after_mean": 0.03899537643883377, |
| "reward_after_std": 0.528584310784936, |
| "reward_before_mean": 0.49805059214122593, |
| "reward_before_std": 0.4409319751430303, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.45905524492263794, |
| "reward_change_min": -0.6990172192454338, |
| "reward_change_std": 0.2754313191398978, |
| "reward_std": 0.528584323823452, |
| "rewards/accuracy_reward": 0.3541666753590107, |
| "rewards/cosine_scaled_reward": 0.14388393727131188, |
| "step": 306 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2299.4166984558105, |
| "epoch": 0.35085714285714287, |
| "grad_norm": 0.030135195702314377, |
| "kl": 0.00022998452186584473, |
| "lambda_div_used": 0.6275613307952881, |
| "learning_rate": 4.5336910277482155e-07, |
| "loss": 0.024, |
| "reward": -0.1547635430470109, |
| "reward_after_mean": -0.1547635430470109, |
| "reward_after_std": 0.6599444597959518, |
| "reward_before_mean": 0.11526766640599817, |
| "reward_before_std": 0.586240291595459, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.270031226798892, |
| "reward_change_min": -0.40235158428549767, |
| "reward_change_std": 0.15426483657211065, |
| "reward_std": 0.6599444709718227, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.05139899626374245, |
| "step": 307 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3160.000030517578, |
| "epoch": 0.352, |
| "grad_norm": 0.018670011311769485, |
| "kl": 0.00022402405738830566, |
| "lambda_div_used": 0.6338188126683235, |
| "learning_rate": 4.503031760712397e-07, |
| "loss": -0.0036, |
| "reward": -0.02171214483678341, |
| "reward_after_mean": -0.02171214483678341, |
| "reward_after_std": 0.6237165722995996, |
| "reward_before_mean": 0.29556242609396577, |
| "reward_before_std": 0.6284392019733787, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3172745667397976, |
| "reward_change_min": -0.5880400538444519, |
| "reward_change_std": 0.227503115311265, |
| "reward_std": 0.6237165946513414, |
| "rewards/accuracy_reward": 0.2708333395421505, |
| "rewards/cosine_scaled_reward": 0.024729080265387893, |
| "step": 308 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2601.9583740234375, |
| "epoch": 0.35314285714285715, |
| "grad_norm": 0.02388334833085537, |
| "kl": 0.00020259618759155273, |
| "lambda_div_used": 0.645737886428833, |
| "learning_rate": 4.4724210845020494e-07, |
| "loss": 0.0479, |
| "reward": 0.09015273489058018, |
| "reward_after_mean": 0.09015273489058018, |
| "reward_after_std": 0.6656354945152998, |
| "reward_before_mean": 0.4416396114975214, |
| "reward_before_std": 0.6783724967390299, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3514868915081024, |
| "reward_change_min": -0.5935809202492237, |
| "reward_change_std": 0.24138008058071136, |
| "reward_std": 0.6656355243176222, |
| "rewards/accuracy_reward": 0.33333334140479565, |
| "rewards/cosine_scaled_reward": 0.10830628499388695, |
| "step": 309 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2018.3542022705078, |
| "epoch": 0.35428571428571426, |
| "grad_norm": 0.030827393755316734, |
| "kl": 0.00020694732666015625, |
| "lambda_div_used": 0.6246318891644478, |
| "learning_rate": 4.441860491038345e-07, |
| "loss": 0.0364, |
| "reward": -0.13591936416924, |
| "reward_after_mean": -0.13591936416924, |
| "reward_after_std": 0.590507235378027, |
| "reward_before_mean": 0.14517710404470563, |
| "reward_before_std": 0.5790550196543336, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28109647892415524, |
| "reward_change_min": -0.5089371241629124, |
| "reward_change_std": 0.1950351819396019, |
| "reward_std": 0.5905072540044785, |
| "rewards/accuracy_reward": 0.20833333767950535, |
| "rewards/cosine_scaled_reward": -0.06315623363479972, |
| "step": 310 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2212.083335876465, |
| "epoch": 0.3554285714285714, |
| "grad_norm": 0.025455351918935776, |
| "kl": 0.00022774934768676758, |
| "lambda_div_used": 0.5723904073238373, |
| "learning_rate": 4.4113514698014953e-07, |
| "loss": -0.0105, |
| "reward": 0.04124009236693382, |
| "reward_after_mean": 0.04124009236693382, |
| "reward_after_std": 0.47586701065301895, |
| "reward_before_mean": 0.5763039644807577, |
| "reward_before_std": 0.3296704487875104, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5350638851523399, |
| "reward_change_min": -0.7587380260229111, |
| "reward_change_std": 0.295849135145545, |
| "reward_std": 0.47586701437830925, |
| "rewards/accuracy_reward": 0.3958333395421505, |
| "rewards/cosine_scaled_reward": 0.18047063890844584, |
| "step": 311 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2184.833351135254, |
| "epoch": 0.3565714285714286, |
| "grad_norm": 0.030652416869997978, |
| "kl": 0.0002588629722595215, |
| "lambda_div_used": 0.5729342699050903, |
| "learning_rate": 4.3808955077581546e-07, |
| "loss": -0.0417, |
| "reward": 0.07517872378230095, |
| "reward_after_mean": 0.07517872378230095, |
| "reward_after_std": 0.4695996157824993, |
| "reward_before_mean": 0.596075527369976, |
| "reward_before_std": 0.333228693343699, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5208968166261911, |
| "reward_change_min": -0.7326644062995911, |
| "reward_change_std": 0.28812805004417896, |
| "reward_std": 0.46959962509572506, |
| "rewards/accuracy_reward": 0.4375000074505806, |
| "rewards/cosine_scaled_reward": 0.15857553109526634, |
| "step": 312 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2586.1875534057617, |
| "epoch": 0.3577142857142857, |
| "grad_norm": 0.022464681416749954, |
| "kl": 0.00023896992206573486, |
| "lambda_div_used": 0.5898676738142967, |
| "learning_rate": 4.350494089288943e-07, |
| "loss": 0.0132, |
| "reward": 0.004749574698507786, |
| "reward_after_mean": 0.004749574698507786, |
| "reward_after_std": 0.5146205350756645, |
| "reward_before_mean": 0.4418492801487446, |
| "reward_before_std": 0.41057482920587063, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4370996989309788, |
| "reward_change_min": -0.6041509285569191, |
| "reward_change_std": 0.24443841353058815, |
| "reward_std": 0.5146205425262451, |
| "rewards/accuracy_reward": 0.35416667722165585, |
| "rewards/cosine_scaled_reward": 0.08768259733915329, |
| "step": 313 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2277.68754196167, |
| "epoch": 0.3588571428571429, |
| "grad_norm": 0.029802966862916946, |
| "kl": 0.00015804357826709747, |
| "lambda_div_used": 0.57961256057024, |
| "learning_rate": 4.3201486961161093e-07, |
| "loss": -0.0058, |
| "reward": 0.04087065905332565, |
| "reward_after_mean": 0.04087065905332565, |
| "reward_after_std": 0.491618013009429, |
| "reward_before_mean": 0.5418765433132648, |
| "reward_before_std": 0.3644657013937831, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5010058786720037, |
| "reward_change_min": -0.7266513183712959, |
| "reward_change_std": 0.2839649748057127, |
| "reward_std": 0.4916180297732353, |
| "rewards/accuracy_reward": 0.3958333358168602, |
| "rewards/cosine_scaled_reward": 0.14604321867227554, |
| "step": 314 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2722.479179382324, |
| "epoch": 0.36, |
| "grad_norm": 0.02163972705602646, |
| "kl": 0.00025177001953125, |
| "lambda_div_used": 0.5554845109581947, |
| "learning_rate": 4.2898608072313045e-07, |
| "loss": 0.0302, |
| "reward": -0.1392173320055008, |
| "reward_after_mean": -0.1392173320055008, |
| "reward_after_std": 0.421754639595747, |
| "reward_before_mean": 0.3322529550641775, |
| "reward_before_std": 0.2525685231667012, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4714703354984522, |
| "reward_change_min": -0.6779767945408821, |
| "reward_change_std": 0.2537938868626952, |
| "reward_std": 0.4217546433210373, |
| "rewards/accuracy_reward": 0.2916666679084301, |
| "rewards/cosine_scaled_reward": 0.04058632627129555, |
| "step": 315 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3430.4166870117188, |
| "epoch": 0.36114285714285715, |
| "grad_norm": 0.021473117172718048, |
| "kl": 0.00034546852111816406, |
| "lambda_div_used": 0.5831886008381844, |
| "learning_rate": 4.2596318988235037e-07, |
| "loss": 0.0331, |
| "reward": -0.20306292921304703, |
| "reward_after_mean": -0.20306292921304703, |
| "reward_after_std": 0.4045538082718849, |
| "reward_before_mean": 0.12524887174367905, |
| "reward_before_std": 0.37981976941227913, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3283117860555649, |
| "reward_change_min": -0.5172865837812424, |
| "reward_change_std": 0.20252829603850842, |
| "reward_std": 0.40455381385982037, |
| "rewards/accuracy_reward": 0.1875000074505806, |
| "rewards/cosine_scaled_reward": -0.062251146882772446, |
| "step": 316 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2792.6250076293945, |
| "epoch": 0.36228571428571427, |
| "grad_norm": 0.023409582674503326, |
| "kl": 0.00030331313610076904, |
| "lambda_div_used": 0.5606855005025864, |
| "learning_rate": 4.2294634442070553e-07, |
| "loss": 0.0162, |
| "reward": -0.25401476211845875, |
| "reward_after_mean": -0.25401476211845875, |
| "reward_after_std": 0.39204780384898186, |
| "reward_before_mean": 0.13540820218622684, |
| "reward_before_std": 0.2758419858291745, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.38942296989262104, |
| "reward_change_min": -0.5835930369794369, |
| "reward_change_std": 0.21769424341619015, |
| "reward_std": 0.39204781129956245, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.03125846944749355, |
| "step": 317 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1197.1875228881836, |
| "epoch": 0.36342857142857143, |
| "grad_norm": 0.044679004698991776, |
| "kl": 0.00012731552124023438, |
| "lambda_div_used": 0.6002074480056763, |
| "learning_rate": 4.1993569137498776e-07, |
| "loss": -0.0198, |
| "reward": 0.03343228530138731, |
| "reward_after_mean": 0.03343228530138731, |
| "reward_after_std": 0.5209389794617891, |
| "reward_before_mean": 0.4655236080288887, |
| "reward_before_std": 0.45914868731051683, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4320913068950176, |
| "reward_change_min": -0.6742168106138706, |
| "reward_change_std": 0.2611297369003296, |
| "reward_std": 0.5209389794617891, |
| "rewards/accuracy_reward": 0.3541666753590107, |
| "rewards/cosine_scaled_reward": 0.1113569182343781, |
| "step": 318 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2743.4167098999023, |
| "epoch": 0.36457142857142855, |
| "grad_norm": 0.03044748492538929, |
| "kl": 0.0002751946449279785, |
| "lambda_div_used": 0.5535945892333984, |
| "learning_rate": 4.1693137748017915e-07, |
| "loss": 0.0111, |
| "reward": -0.4908355651423335, |
| "reward_after_mean": -0.4908355651423335, |
| "reward_after_std": 0.3172128964215517, |
| "reward_before_mean": -0.24739529378712177, |
| "reward_before_std": 0.23890432622283697, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2434402648359537, |
| "reward_change_min": -0.35953374207019806, |
| "reward_change_std": 0.13119421433657408, |
| "reward_std": 0.31721290200948715, |
| "rewards/accuracy_reward": 0.02083333395421505, |
| "rewards/cosine_scaled_reward": -0.2682286258786917, |
| "step": 319 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1857.3333587646484, |
| "epoch": 0.3657142857142857, |
| "grad_norm": 0.03547307848930359, |
| "kl": 0.00026220083236694336, |
| "lambda_div_used": 0.6022170931100845, |
| "learning_rate": 4.1393354916230005e-07, |
| "loss": 0.0462, |
| "reward": -0.125480268150568, |
| "reward_after_mean": -0.125480268150568, |
| "reward_after_std": 0.505291972309351, |
| "reward_before_mean": 0.19450905406847596, |
| "reward_before_std": 0.4768552405294031, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.31998929381370544, |
| "reward_change_min": -0.5316651687026024, |
| "reward_change_std": 0.20880178455263376, |
| "reward_std": 0.505291985347867, |
| "rewards/accuracy_reward": 0.2291666753590107, |
| "rewards/cosine_scaled_reward": -0.034657632233574986, |
| "step": 320 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1452.770881652832, |
| "epoch": 0.3668571428571429, |
| "grad_norm": 0.03124224953353405, |
| "kl": 0.00018140673637390137, |
| "lambda_div_used": 0.6199431717395782, |
| "learning_rate": 4.1094235253127374e-07, |
| "loss": -0.0599, |
| "reward": 0.282286923378706, |
| "reward_after_mean": 0.282286923378706, |
| "reward_after_std": 0.6488686576485634, |
| "reward_before_mean": 0.8242554701864719, |
| "reward_before_std": 0.5566414860077202, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5419685430824757, |
| "reward_change_min": -0.8236861452460289, |
| "reward_change_std": 0.33083922043442726, |
| "reward_std": 0.648868665099144, |
| "rewards/accuracy_reward": 0.5625000074505806, |
| "rewards/cosine_scaled_reward": 0.26175545156002045, |
| "step": 321 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2662.50004196167, |
| "epoch": 0.368, |
| "grad_norm": 0.036272305995225906, |
| "kl": 0.0003364086151123047, |
| "lambda_div_used": 0.6552734896540642, |
| "learning_rate": 4.079579333738039e-07, |
| "loss": 0.0123, |
| "reward": -0.039356768131256104, |
| "reward_after_mean": -0.039356768131256104, |
| "reward_after_std": 0.7184183727949858, |
| "reward_before_mean": 0.23175650835037231, |
| "reward_before_std": 0.7305669207125902, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2711132802069187, |
| "reward_change_min": -0.5112780928611755, |
| "reward_change_std": 0.20828023366630077, |
| "reward_std": 0.7184183821082115, |
| "rewards/accuracy_reward": 0.27083333395421505, |
| "rewards/cosine_scaled_reward": -0.03907683305442333, |
| "step": 322 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2875.937530517578, |
| "epoch": 0.36914285714285716, |
| "grad_norm": 0.023642728105187416, |
| "kl": 0.0003116130828857422, |
| "lambda_div_used": 0.5762533918023109, |
| "learning_rate": 4.0498043714627006e-07, |
| "loss": -0.0062, |
| "reward": -0.18317513819783926, |
| "reward_after_mean": -0.18317513819783926, |
| "reward_after_std": 0.45206453651189804, |
| "reward_before_mean": 0.1913843434303999, |
| "reward_before_std": 0.34759796876460314, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3745594993233681, |
| "reward_change_min": -0.5339466538280249, |
| "reward_change_std": 0.2068865867331624, |
| "reward_std": 0.4520645458251238, |
| "rewards/accuracy_reward": 0.22916666977107525, |
| "rewards/cosine_scaled_reward": -0.037782331462949514, |
| "step": 323 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2517.541732788086, |
| "epoch": 0.3702857142857143, |
| "grad_norm": 0.02370315045118332, |
| "kl": 0.0002675652503967285, |
| "lambda_div_used": 0.582614079117775, |
| "learning_rate": 4.020100089676376e-07, |
| "loss": 0.0264, |
| "reward": -0.09797720052301884, |
| "reward_after_mean": -0.09797720052301884, |
| "reward_after_std": 0.46359810046851635, |
| "reward_before_mean": 0.30637714080512524, |
| "reward_before_std": 0.37786578573286533, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4043543320149183, |
| "reward_change_min": -0.6221220158040524, |
| "reward_change_std": 0.2347763581201434, |
| "reward_std": 0.46359810046851635, |
| "rewards/accuracy_reward": 0.2708333395421505, |
| "rewards/cosine_scaled_reward": 0.03554379381239414, |
| "step": 324 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2340.104232788086, |
| "epoch": 0.37142857142857144, |
| "grad_norm": 0.02759523130953312, |
| "kl": 0.0002740621566772461, |
| "lambda_div_used": 0.6287032291293144, |
| "learning_rate": 3.9904679361238526e-07, |
| "loss": 0.0386, |
| "reward": 0.3422952927649021, |
| "reward_after_mean": 0.3422952927649021, |
| "reward_after_std": 0.6593811456114054, |
| "reward_before_mean": 0.8776058983057737, |
| "reward_before_std": 0.5943253133445978, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5353106111288071, |
| "reward_change_min": -0.8339854516088963, |
| "reward_change_std": 0.32581204548478127, |
| "reward_std": 0.6593811772763729, |
| "rewards/accuracy_reward": 0.5416666753590107, |
| "rewards/cosine_scaled_reward": 0.335939209908247, |
| "step": 325 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2069.0208740234375, |
| "epoch": 0.37257142857142855, |
| "grad_norm": 0.026782048866152763, |
| "kl": 0.00021858513355255127, |
| "lambda_div_used": 0.5514643862843513, |
| "learning_rate": 3.9609093550344907e-07, |
| "loss": 0.0214, |
| "reward": -0.2377523072063923, |
| "reward_after_mean": -0.2377523072063923, |
| "reward_after_std": 0.3420621510595083, |
| "reward_before_mean": 0.17398178996518254, |
| "reward_before_std": 0.2309601987944916, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.41173411533236504, |
| "reward_change_min": -0.586280532181263, |
| "reward_change_std": 0.22657191008329391, |
| "reward_std": 0.3420621529221535, |
| "rewards/accuracy_reward": 0.2083333358168602, |
| "rewards/cosine_scaled_reward": -0.034351545851677656, |
| "step": 326 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2595.583339691162, |
| "epoch": 0.3737142857142857, |
| "grad_norm": 0.023073619231581688, |
| "kl": 0.0003046169877052307, |
| "lambda_div_used": 0.5742950737476349, |
| "learning_rate": 3.931425787051832e-07, |
| "loss": 0.016, |
| "reward": -0.06984845921397209, |
| "reward_after_mean": -0.06984845921397209, |
| "reward_after_std": 0.4958471246063709, |
| "reward_before_mean": 0.39507998805493116, |
| "reward_before_std": 0.33821228239685297, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.46492844074964523, |
| "reward_change_min": -0.6655924804508686, |
| "reward_change_std": 0.25432714726775885, |
| "reward_std": 0.49584713764488697, |
| "rewards/accuracy_reward": 0.33333333395421505, |
| "rewards/cosine_scaled_reward": 0.06174664665013552, |
| "step": 327 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3254.375030517578, |
| "epoch": 0.37485714285714283, |
| "grad_norm": 0.02187371626496315, |
| "kl": 0.0003033876419067383, |
| "lambda_div_used": 0.6039082854986191, |
| "learning_rate": 3.902018669163384e-07, |
| "loss": 0.0465, |
| "reward": -0.22737010568380356, |
| "reward_after_mean": -0.22737010568380356, |
| "reward_after_std": 0.5139794517308474, |
| "reward_before_mean": 0.057241520611569285, |
| "reward_before_std": 0.47974141500890255, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28461163491010666, |
| "reward_change_min": -0.4783761650323868, |
| "reward_change_std": 0.18032598588615656, |
| "reward_std": 0.5139794517308474, |
| "rewards/accuracy_reward": 0.16666667349636555, |
| "rewards/cosine_scaled_reward": -0.10942514054477215, |
| "step": 328 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1712.7500305175781, |
| "epoch": 0.376, |
| "grad_norm": 0.030909525230526924, |
| "kl": 0.00020560622215270996, |
| "lambda_div_used": 0.5851015225052834, |
| "learning_rate": 3.872689434630585e-07, |
| "loss": 0.0147, |
| "reward": -0.0780089907348156, |
| "reward_after_mean": -0.0780089907348156, |
| "reward_after_std": 0.4830572586506605, |
| "reward_before_mean": 0.3372959513217211, |
| "reward_before_std": 0.39714881777763367, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.415304945781827, |
| "reward_change_min": -0.6223411783576012, |
| "reward_change_std": 0.24614232685416937, |
| "reward_std": 0.48305728659033775, |
| "rewards/accuracy_reward": 0.2708333358168602, |
| "rewards/cosine_scaled_reward": 0.06646260246634483, |
| "step": 329 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2283.437515258789, |
| "epoch": 0.37714285714285717, |
| "grad_norm": 0.03479500487446785, |
| "kl": 0.00036913156509399414, |
| "lambda_div_used": 0.629969134926796, |
| "learning_rate": 3.843439512918949e-07, |
| "loss": -0.0771, |
| "reward": -0.08872065320611, |
| "reward_after_mean": -0.08872065320611, |
| "reward_after_std": 0.6100571732968092, |
| "reward_before_mean": 0.20380639098584652, |
| "reward_before_std": 0.606320459395647, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2925270590931177, |
| "reward_change_min": -0.5179039165377617, |
| "reward_change_std": 0.20588424988090992, |
| "reward_std": 0.6100572124123573, |
| "rewards/accuracy_reward": 0.22916666977107525, |
| "rewards/cosine_scaled_reward": -0.02536027878522873, |
| "step": 330 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2365.354200363159, |
| "epoch": 0.3782857142857143, |
| "grad_norm": 0.05943391099572182, |
| "kl": 0.0003091096878051758, |
| "lambda_div_used": 0.6346529722213745, |
| "learning_rate": 3.8142703296283953e-07, |
| "loss": -0.0337, |
| "reward": -0.24408827535808086, |
| "reward_after_mean": -0.24408827535808086, |
| "reward_after_std": 0.6825795099139214, |
| "reward_before_mean": -0.026182920671999454, |
| "reward_before_std": 0.6199203189462423, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.21790535561740398, |
| "reward_change_min": -0.3899071477353573, |
| "reward_change_std": 0.13575981836766005, |
| "reward_std": 0.6825795285403728, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/cosine_scaled_reward": -0.1511829246301204, |
| "step": 331 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2488.937530517578, |
| "epoch": 0.37942857142857145, |
| "grad_norm": 0.02497093193233013, |
| "kl": 0.00024446845054626465, |
| "lambda_div_used": 0.628095343708992, |
| "learning_rate": 3.785183306423767e-07, |
| "loss": 0.002, |
| "reward": -0.0888600671896711, |
| "reward_after_mean": -0.0888600671896711, |
| "reward_after_std": 0.634421993046999, |
| "reward_before_mean": 0.22087145410478115, |
| "reward_before_std": 0.5863482365384698, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.30973155051469803, |
| "reward_change_min": -0.5146533660590649, |
| "reward_change_std": 0.19153987523168325, |
| "reward_std": 0.6344220079481602, |
| "rewards/accuracy_reward": 0.2291666753590107, |
| "rewards/cosine_scaled_reward": -0.00829521007835865, |
| "step": 332 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1976.333381652832, |
| "epoch": 0.38057142857142856, |
| "grad_norm": 0.029570044949650764, |
| "kl": 0.00023859739303588867, |
| "lambda_div_used": 0.5713351741433144, |
| "learning_rate": 3.7561798609655373e-07, |
| "loss": 0.026, |
| "reward": 0.11462849378585815, |
| "reward_after_mean": 0.11462849378585815, |
| "reward_after_std": 0.5290882792323828, |
| "reward_before_mean": 0.6867042146623135, |
| "reward_before_std": 0.3248422802425921, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5720757059752941, |
| "reward_change_min": -0.7701217532157898, |
| "reward_change_std": 0.30192676838487387, |
| "reward_std": 0.5290882866829634, |
| "rewards/accuracy_reward": 0.47916666977107525, |
| "rewards/cosine_scaled_reward": 0.20753752067685127, |
| "step": 333 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2970.2708892822266, |
| "epoch": 0.38171428571428573, |
| "grad_norm": 0.022143961861729622, |
| "kl": 0.0002802610397338867, |
| "lambda_div_used": 0.5814503356814384, |
| "learning_rate": 3.72726140684072e-07, |
| "loss": -0.0083, |
| "reward": -0.36110448837280273, |
| "reward_after_mean": -0.36110448837280273, |
| "reward_after_std": 0.42284002527594566, |
| "reward_before_mean": -0.09825330413877964, |
| "reward_before_std": 0.3718461263924837, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.26285118237137794, |
| "reward_change_min": -0.4302907735109329, |
| "reward_change_std": 0.16061531472951174, |
| "reward_std": 0.42284005135297775, |
| "rewards/accuracy_reward": 0.08333333395421505, |
| "rewards/cosine_scaled_reward": -0.1815866343677044, |
| "step": 334 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2137.7916717529297, |
| "epoch": 0.38285714285714284, |
| "grad_norm": 0.023810893297195435, |
| "kl": 0.00019755959510803223, |
| "lambda_div_used": 0.5730342343449593, |
| "learning_rate": 3.6984293534939737e-07, |
| "loss": -0.0589, |
| "reward": -0.003980423090979457, |
| "reward_after_mean": -0.003980423090979457, |
| "reward_after_std": 0.4973279498517513, |
| "reward_before_mean": 0.48682230338454247, |
| "reward_before_std": 0.3303193561732769, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4908027183264494, |
| "reward_change_min": -0.6631990969181061, |
| "reward_change_std": 0.25930201914161444, |
| "reward_std": 0.4973279610276222, |
| "rewards/accuracy_reward": 0.35416666977107525, |
| "rewards/cosine_scaled_reward": 0.132655612193048, |
| "step": 335 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2604.5625762939453, |
| "epoch": 0.384, |
| "grad_norm": 0.019800275564193726, |
| "kl": 0.00027495622634887695, |
| "lambda_div_used": 0.6422952190041542, |
| "learning_rate": 3.6696851061588994e-07, |
| "loss": -0.0153, |
| "reward": 0.1156077766790986, |
| "reward_after_mean": 0.1156077766790986, |
| "reward_after_std": 0.7227907460182905, |
| "reward_before_mean": 0.5011630854569376, |
| "reward_before_std": 0.6614610198885202, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3855553139001131, |
| "reward_change_min": -0.6121690329164267, |
| "reward_change_std": 0.23865524679422379, |
| "reward_std": 0.7227907720953226, |
| "rewards/accuracy_reward": 0.3750000074505806, |
| "rewards/cosine_scaled_reward": 0.1261630654335022, |
| "step": 336 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2581.6667404174805, |
| "epoch": 0.3851428571428571, |
| "grad_norm": 0.024305738508701324, |
| "kl": 0.000293731689453125, |
| "lambda_div_used": 0.5970565602183342, |
| "learning_rate": 3.641030065789562e-07, |
| "loss": 0.0618, |
| "reward": -0.012716710567474365, |
| "reward_after_mean": -0.012716710567474365, |
| "reward_after_std": 0.5347360204905272, |
| "reward_before_mean": 0.4010282773524523, |
| "reward_before_std": 0.4501037606969476, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.41374498419463634, |
| "reward_change_min": -0.597484715282917, |
| "reward_change_std": 0.24177721049636602, |
| "reward_std": 0.5347360335290432, |
| "rewards/accuracy_reward": 0.31250000558793545, |
| "rewards/cosine_scaled_reward": 0.08852825942449272, |
| "step": 337 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1854.9375381469727, |
| "epoch": 0.3862857142857143, |
| "grad_norm": 0.035376228392124176, |
| "kl": 0.00024819374084472656, |
| "lambda_div_used": 0.6688028946518898, |
| "learning_rate": 3.612465628992203e-07, |
| "loss": 0.1016, |
| "reward": 0.3372328467667103, |
| "reward_after_mean": 0.3372328467667103, |
| "reward_after_std": 0.8120726235210896, |
| "reward_before_mean": 0.783452745527029, |
| "reward_before_std": 0.787550600245595, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.44621986895799637, |
| "reward_change_min": -0.745157428085804, |
| "reward_change_std": 0.29823943972587585, |
| "reward_std": 0.812072642147541, |
| "rewards/accuracy_reward": 0.500000013038516, |
| "rewards/cosine_scaled_reward": 0.2834526968654245, |
| "step": 338 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2807.541717529297, |
| "epoch": 0.38742857142857146, |
| "grad_norm": 0.024581631645560265, |
| "kl": 0.0003020763397216797, |
| "lambda_div_used": 0.559957392513752, |
| "learning_rate": 3.5839931879571725e-07, |
| "loss": -0.0589, |
| "reward": -0.2989091109484434, |
| "reward_after_mean": -0.2989091109484434, |
| "reward_after_std": 0.3937798347324133, |
| "reward_before_mean": 0.06776450201869011, |
| "reward_before_std": 0.269554709084332, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3666736055165529, |
| "reward_change_min": -0.5128730908036232, |
| "reward_change_std": 0.19430112652480602, |
| "reward_std": 0.3937798459082842, |
| "rewards/accuracy_reward": 0.14583333395421505, |
| "rewards/cosine_scaled_reward": -0.07806883845478296, |
| "step": 339 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2440.437530517578, |
| "epoch": 0.38857142857142857, |
| "grad_norm": 0.027588481083512306, |
| "kl": 0.0002573728561401367, |
| "lambda_div_used": 0.6044978573918343, |
| "learning_rate": 3.555614130391079e-07, |
| "loss": -0.021, |
| "reward": -0.09739532321691513, |
| "reward_after_mean": -0.09739532321691513, |
| "reward_after_std": 0.5066191554069519, |
| "reward_before_mean": 0.23490323033183813, |
| "reward_before_std": 0.48513105837628245, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.33229855448007584, |
| "reward_change_min": -0.5539730787277222, |
| "reward_change_std": 0.21668985951691866, |
| "reward_std": 0.5066191554069519, |
| "rewards/accuracy_reward": 0.2291666716337204, |
| "rewards/cosine_scaled_reward": 0.005736543796956539, |
| "step": 340 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2306.4583740234375, |
| "epoch": 0.38971428571428574, |
| "grad_norm": 0.025286095216870308, |
| "kl": 0.00023421645164489746, |
| "lambda_div_used": 0.5724563226103783, |
| "learning_rate": 3.5273298394491515e-07, |
| "loss": 0.0526, |
| "reward": 0.04836506303399801, |
| "reward_after_mean": 0.04836506303399801, |
| "reward_after_std": 0.4924194272607565, |
| "reward_before_mean": 0.5868019293993711, |
| "reward_before_std": 0.33085333183407784, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5384368915110826, |
| "reward_change_min": -0.7688934281468391, |
| "reward_change_std": 0.29607443511486053, |
| "reward_std": 0.492419445887208, |
| "rewards/accuracy_reward": 0.3958333395421505, |
| "rewards/cosine_scaled_reward": 0.19096860231366009, |
| "step": 341 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2629.291690826416, |
| "epoch": 0.39085714285714285, |
| "grad_norm": 0.02702566236257553, |
| "kl": 0.00029778480529785156, |
| "lambda_div_used": 0.6460641473531723, |
| "learning_rate": 3.4991416936678276e-07, |
| "loss": -0.0077, |
| "reward": 0.11682657990604639, |
| "reward_after_mean": 0.11682657990604639, |
| "reward_after_std": 0.6720432955771685, |
| "reward_before_mean": 0.4811732564121485, |
| "reward_before_std": 0.6802498865872622, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3643466793000698, |
| "reward_change_min": -0.6579400822520256, |
| "reward_change_std": 0.255507318302989, |
| "reward_std": 0.6720433253794909, |
| "rewards/accuracy_reward": 0.3750000149011612, |
| "rewards/cosine_scaled_reward": 0.1061732517555356, |
| "step": 342 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2944.5833435058594, |
| "epoch": 0.392, |
| "grad_norm": 0.01883111707866192, |
| "kl": 0.0002968311309814453, |
| "lambda_div_used": 0.6280755251646042, |
| "learning_rate": 3.471051066897562e-07, |
| "loss": 0.0174, |
| "reward": 0.13441785983741283, |
| "reward_after_mean": 0.13441785983741283, |
| "reward_after_std": 0.62815947458148, |
| "reward_before_mean": 0.5553851053118706, |
| "reward_before_std": 0.5964916851371527, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.42096727155148983, |
| "reward_change_min": -0.6972850449383259, |
| "reward_change_std": 0.27345132920891047, |
| "reward_std": 0.6281594894826412, |
| "rewards/accuracy_reward": 0.4166666753590107, |
| "rewards/cosine_scaled_reward": 0.13871843740344048, |
| "step": 343 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2095.5833854675293, |
| "epoch": 0.3931428571428571, |
| "grad_norm": 0.023183098062872887, |
| "kl": 0.00022208690643310547, |
| "lambda_div_used": 0.5810001268982887, |
| "learning_rate": 3.4430593282358777e-07, |
| "loss": -0.024, |
| "reward": 0.11522329319268465, |
| "reward_after_mean": 0.11522329319268465, |
| "reward_after_std": 0.561205493286252, |
| "reward_before_mean": 0.6688053011894226, |
| "reward_before_std": 0.3688823012635112, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5535820256918669, |
| "reward_change_min": -0.7913318648934364, |
| "reward_change_std": 0.2952663041651249, |
| "reward_std": 0.5612055025994778, |
| "rewards/accuracy_reward": 0.47916666977107525, |
| "rewards/cosine_scaled_reward": 0.18963862350210547, |
| "step": 344 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2982.229202270508, |
| "epoch": 0.3942857142857143, |
| "grad_norm": 0.022071614861488342, |
| "kl": 0.00032585859298706055, |
| "lambda_div_used": 0.577229768037796, |
| "learning_rate": 3.4151678419606233e-07, |
| "loss": -0.061, |
| "reward": -0.20981058850884438, |
| "reward_after_mean": -0.20981058850884438, |
| "reward_after_std": 0.40093352645635605, |
| "reward_before_mean": 0.13120032008737326, |
| "reward_before_std": 0.353534915484488, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3410108871757984, |
| "reward_change_min": -0.5285827927291393, |
| "reward_change_std": 0.20349892415106297, |
| "reward_std": 0.40093354508280754, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.03546636272221804, |
| "step": 345 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3075.916702270508, |
| "epoch": 0.3954285714285714, |
| "grad_norm": 0.018548911437392235, |
| "kl": 0.0003154873847961426, |
| "lambda_div_used": 0.5923640578985214, |
| "learning_rate": 3.387377967463493e-07, |
| "loss": 0.0031, |
| "reward": -0.25945382937788963, |
| "reward_after_mean": -0.25945382937788963, |
| "reward_after_std": 0.47871536388993263, |
| "reward_before_mean": 0.022648759186267853, |
| "reward_before_std": 0.41908061131834984, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28210258670151234, |
| "reward_change_min": -0.4310699477791786, |
| "reward_change_std": 0.16193275339901447, |
| "reward_std": 0.4787153732031584, |
| "rewards/accuracy_reward": 0.1458333395421505, |
| "rewards/cosine_scaled_reward": -0.12318458454683423, |
| "step": 346 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3052.8333740234375, |
| "epoch": 0.3965714285714286, |
| "grad_norm": 0.01688998006284237, |
| "kl": 0.0003132820129394531, |
| "lambda_div_used": 0.5991570502519608, |
| "learning_rate": 3.359691059183761e-07, |
| "loss": 0.0425, |
| "reward": -0.21558007411658764, |
| "reward_after_mean": -0.21558007411658764, |
| "reward_after_std": 0.504334045574069, |
| "reward_before_mean": 0.08187778666615486, |
| "reward_before_std": 0.4535634834319353, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2974578682333231, |
| "reward_change_min": -0.48911403492093086, |
| "reward_change_std": 0.1813967889174819, |
| "reward_std": 0.5043340623378754, |
| "rewards/accuracy_reward": 0.16666667349636555, |
| "rewards/cosine_scaled_reward": -0.08478887472301722, |
| "step": 347 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2511.916679382324, |
| "epoch": 0.3977142857142857, |
| "grad_norm": 0.031576935201883316, |
| "kl": 0.00030159950256347656, |
| "lambda_div_used": 0.6005090326070786, |
| "learning_rate": 3.3321084665422803e-07, |
| "loss": 0.048, |
| "reward": -0.08813801780343056, |
| "reward_after_mean": -0.08813801780343056, |
| "reward_after_std": 0.4803820662200451, |
| "reward_before_mean": 0.25806165859103203, |
| "reward_before_std": 0.4620923697948456, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3461996652185917, |
| "reward_change_min": -0.547101479023695, |
| "reward_change_std": 0.22022681962698698, |
| "reward_std": 0.48038206808269024, |
| "rewards/accuracy_reward": 0.2500000074505806, |
| "rewards/cosine_scaled_reward": 0.008061652071774006, |
| "step": 348 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3160.916717529297, |
| "epoch": 0.39885714285714285, |
| "grad_norm": 0.020290188491344452, |
| "kl": 0.00036334991455078125, |
| "lambda_div_used": 0.5632076561450958, |
| "learning_rate": 3.3046315338757026e-07, |
| "loss": 0.0316, |
| "reward": -0.26980413869023323, |
| "reward_after_mean": -0.26980413869023323, |
| "reward_after_std": 0.3867268729954958, |
| "reward_before_mean": 0.09292120113968849, |
| "reward_before_std": 0.2870235051959753, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3627253398299217, |
| "reward_change_min": -0.5702350400388241, |
| "reward_change_std": 0.20531136635690928, |
| "reward_std": 0.386726887896657, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.0737454742193222, |
| "step": 349 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2213.604202270508, |
| "epoch": 0.4, |
| "grad_norm": 0.030296506360173225, |
| "kl": 0.0002709701657295227, |
| "lambda_div_used": 0.6561494767665863, |
| "learning_rate": 3.2772616003709616e-07, |
| "loss": 0.0154, |
| "reward": 0.1817268170416355, |
| "reward_after_mean": 0.1817268170416355, |
| "reward_after_std": 0.6988476235419512, |
| "reward_before_mean": 0.5450442042201757, |
| "reward_before_std": 0.7353325374424458, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3633174039423466, |
| "reward_change_min": -0.6499125882983208, |
| "reward_change_std": 0.26448090467602015, |
| "reward_std": 0.6988476365804672, |
| "rewards/accuracy_reward": 0.3750000074505806, |
| "rewards/cosine_scaled_reward": 0.17004419304430485, |
| "step": 350 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2509.5208740234375, |
| "epoch": 0.40114285714285713, |
| "grad_norm": 0.02334379218518734, |
| "kl": 0.0002808868885040283, |
| "lambda_div_used": 0.602841705083847, |
| "learning_rate": 3.250000000000001e-07, |
| "loss": 0.0785, |
| "reward": -0.02256767451763153, |
| "reward_after_mean": -0.02256767451763153, |
| "reward_after_std": 0.545713946223259, |
| "reward_before_mean": 0.3783569000661373, |
| "reward_before_std": 0.47443881165236235, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.40092457458376884, |
| "reward_change_min": -0.5974738858640194, |
| "reward_change_std": 0.24015377275645733, |
| "reward_std": 0.5457139611244202, |
| "rewards/accuracy_reward": 0.31250000558793545, |
| "rewards/cosine_scaled_reward": 0.06585689261555672, |
| "step": 351 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2237.7917251586914, |
| "epoch": 0.4022857142857143, |
| "grad_norm": 0.02218765579164028, |
| "kl": 0.00025266408920288086, |
| "lambda_div_used": 0.6070134416222572, |
| "learning_rate": 3.222848061454764e-07, |
| "loss": 0.0034, |
| "reward": -0.07995379093335941, |
| "reward_after_mean": -0.07995379093335941, |
| "reward_after_std": 0.5729443337768316, |
| "reward_before_mean": 0.291741443797946, |
| "reward_before_std": 0.4979767380282283, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3716952446848154, |
| "reward_change_min": -0.6056977100670338, |
| "reward_change_std": 0.23109738621860743, |
| "reward_std": 0.5729443468153477, |
| "rewards/accuracy_reward": 0.2916666679084301, |
| "rewards/cosine_scaled_reward": 7.475726306438446e-05, |
| "step": 352 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2214.4583702087402, |
| "epoch": 0.4034285714285714, |
| "grad_norm": 0.02660507895052433, |
| "kl": 0.0002359449863433838, |
| "lambda_div_used": 0.670557290315628, |
| "learning_rate": 3.195807108082429e-07, |
| "loss": -0.0096, |
| "reward": 0.11138913966715336, |
| "reward_after_mean": 0.11138913966715336, |
| "reward_after_std": 0.79796995036304, |
| "reward_before_mean": 0.420612467918545, |
| "reward_before_std": 0.7911950433626771, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.30922332406044006, |
| "reward_change_min": -0.5478326119482517, |
| "reward_change_std": 0.21525408141314983, |
| "reward_std": 0.7979699578136206, |
| "rewards/accuracy_reward": 0.35416667722165585, |
| "rewards/cosine_scaled_reward": 0.06644579023122787, |
| "step": 353 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2018.0625228881836, |
| "epoch": 0.4045714285714286, |
| "grad_norm": 0.028822243213653564, |
| "kl": 0.0002269148826599121, |
| "lambda_div_used": 0.6141867712140083, |
| "learning_rate": 3.168878457820915e-07, |
| "loss": 0.0254, |
| "reward": 0.07253095135092735, |
| "reward_after_mean": 0.07253095135092735, |
| "reward_after_std": 0.6088422238826752, |
| "reward_before_mean": 0.4882249776273966, |
| "reward_before_std": 0.5237803608179092, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4156940244138241, |
| "reward_change_min": -0.5956555530428886, |
| "reward_change_std": 0.23860780615359545, |
| "reward_std": 0.6088422238826752, |
| "rewards/accuracy_reward": 0.3541666753590107, |
| "rewards/cosine_scaled_reward": 0.13405831216368824, |
| "step": 354 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2018.0208740234375, |
| "epoch": 0.4057142857142857, |
| "grad_norm": 0.029382316395640373, |
| "kl": 0.00026294589042663574, |
| "lambda_div_used": 0.5812733992934227, |
| "learning_rate": 3.142063423134644e-07, |
| "loss": 0.0145, |
| "reward": 0.23357930406928062, |
| "reward_after_mean": 0.23357930406928062, |
| "reward_after_std": 0.5394825823605061, |
| "reward_before_mean": 0.8446227628737688, |
| "reward_before_std": 0.37303208094090223, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.6110434681177139, |
| "reward_change_min": -0.8454090058803558, |
| "reward_change_std": 0.3367054909467697, |
| "reward_std": 0.5394826009869576, |
| "rewards/accuracy_reward": 0.5416666716337204, |
| "rewards/cosine_scaled_reward": 0.3029560726135969, |
| "step": 355 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2752.3333740234375, |
| "epoch": 0.40685714285714286, |
| "grad_norm": 0.01905817724764347, |
| "kl": 0.00031578540802001953, |
| "lambda_div_used": 0.6255660429596901, |
| "learning_rate": 3.115363310950578e-07, |
| "loss": 0.0216, |
| "reward": 0.12779017974389717, |
| "reward_after_mean": 0.12779017974389717, |
| "reward_after_std": 0.6442528441548347, |
| "reward_before_mean": 0.5468167327344418, |
| "reward_before_std": 0.5852497918531299, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.41902654618024826, |
| "reward_change_min": -0.6820830404758453, |
| "reward_change_std": 0.2657659938558936, |
| "reward_std": 0.6442528460174799, |
| "rewards/accuracy_reward": 0.3958333395421505, |
| "rewards/cosine_scaled_reward": 0.15098336525261402, |
| "step": 356 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2952.520854949951, |
| "epoch": 0.408, |
| "grad_norm": 0.024500912055373192, |
| "kl": 0.0002726316452026367, |
| "lambda_div_used": 0.573906421661377, |
| "learning_rate": 3.0887794225945143e-07, |
| "loss": -0.0555, |
| "reward": -0.22403784468770027, |
| "reward_after_mean": -0.22403784468770027, |
| "reward_after_std": 0.4447880759835243, |
| "reward_before_mean": 0.14636994618922472, |
| "reward_before_std": 0.3346911370754242, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3704077899456024, |
| "reward_change_min": -0.5609126053750515, |
| "reward_change_std": 0.20881207659840584, |
| "reward_std": 0.4447880797088146, |
| "rewards/accuracy_reward": 0.18750000186264515, |
| "rewards/cosine_scaled_reward": -0.04113006801344454, |
| "step": 357 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2411.8541946411133, |
| "epoch": 0.40914285714285714, |
| "grad_norm": 0.024526813998818398, |
| "kl": 0.00026684999465942383, |
| "lambda_div_used": 0.6140530630946159, |
| "learning_rate": 3.062313053727671e-07, |
| "loss": -0.0357, |
| "reward": 0.30990589410066605, |
| "reward_after_mean": 0.30990589410066605, |
| "reward_after_std": 0.5656850170344114, |
| "reward_before_mean": 0.8357270993292332, |
| "reward_before_std": 0.5201095007359982, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5258211866021156, |
| "reward_change_min": -0.7649649046361446, |
| "reward_change_std": 0.3101219357922673, |
| "reward_std": 0.5656850375235081, |
| "rewards/accuracy_reward": 0.5416666865348816, |
| "rewards/cosine_scaled_reward": 0.2940603978931904, |
| "step": 358 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2100.604217529297, |
| "epoch": 0.4102857142857143, |
| "grad_norm": 0.029383460059762, |
| "kl": 0.00023573637008666992, |
| "lambda_div_used": 0.6149207651615143, |
| "learning_rate": 3.0359654942835247e-07, |
| "loss": -0.0725, |
| "reward": -0.014719150494784117, |
| "reward_after_mean": -0.014719150494784117, |
| "reward_after_std": 0.6281173154711723, |
| "reward_before_mean": 0.3637474989518523, |
| "reward_before_std": 0.5245081130415201, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.37846665270626545, |
| "reward_change_min": -0.5382856801152229, |
| "reward_change_std": 0.20675079058855772, |
| "reward_std": 0.6281173229217529, |
| "rewards/accuracy_reward": 0.29166667349636555, |
| "rewards/cosine_scaled_reward": 0.07208081643329933, |
| "step": 359 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2650.0625610351562, |
| "epoch": 0.4114285714285714, |
| "grad_norm": 0.028681648895144463, |
| "kl": 0.00029480457305908203, |
| "lambda_div_used": 0.5845082253217697, |
| "learning_rate": 3.0097380284049523e-07, |
| "loss": 0.0359, |
| "reward": 0.034104809165000916, |
| "reward_after_mean": 0.034104809165000916, |
| "reward_after_std": 0.5086164381355047, |
| "reward_before_mean": 0.5177552103996277, |
| "reward_before_std": 0.3886600947007537, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4836503826081753, |
| "reward_change_min": -0.7048606462776661, |
| "reward_change_std": 0.27614138927310705, |
| "reward_std": 0.5086164511740208, |
| "rewards/accuracy_reward": 0.3958333358168602, |
| "rewards/cosine_scaled_reward": 0.12192187085747719, |
| "step": 360 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2715.7084197998047, |
| "epoch": 0.4125714285714286, |
| "grad_norm": 0.02216268703341484, |
| "kl": 0.00035099685192108154, |
| "lambda_div_used": 0.6144327148795128, |
| "learning_rate": 2.9836319343816397e-07, |
| "loss": 0.0341, |
| "reward": 0.11957069113850594, |
| "reward_after_mean": 0.11957069113850594, |
| "reward_after_std": 0.5912090875208378, |
| "reward_before_mean": 0.5671045240014791, |
| "reward_before_std": 0.523097550496459, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4475338254123926, |
| "reward_change_min": -0.6687514297664165, |
| "reward_change_std": 0.2680971557274461, |
| "reward_std": 0.5912090986967087, |
| "rewards/accuracy_reward": 0.41666667722165585, |
| "rewards/cosine_scaled_reward": 0.15043783793225884, |
| "step": 361 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1608.958396911621, |
| "epoch": 0.4137142857142857, |
| "grad_norm": 0.03410353511571884, |
| "kl": 0.0002919435501098633, |
| "lambda_div_used": 0.5573309659957886, |
| "learning_rate": 2.9576484845877793e-07, |
| "loss": -0.1085, |
| "reward": 0.017518717795610428, |
| "reward_after_mean": 0.017518717795610428, |
| "reward_after_std": 0.46568065509200096, |
| "reward_before_mean": 0.5754165817052126, |
| "reward_before_std": 0.2668048879131675, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.557897862046957, |
| "reward_change_min": -0.7572538442909718, |
| "reward_change_std": 0.2997015379369259, |
| "reward_std": 0.46568066254258156, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/cosine_scaled_reward": 0.13791657239198685, |
| "step": 362 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1770.7500305175781, |
| "epoch": 0.41485714285714287, |
| "grad_norm": 0.036160316318273544, |
| "kl": 0.0002955198287963867, |
| "lambda_div_used": 0.5742382705211639, |
| "learning_rate": 2.931788945420058e-07, |
| "loss": 0.0235, |
| "reward": 0.08720480650663376, |
| "reward_after_mean": 0.08720480650663376, |
| "reward_after_std": 0.4698806144297123, |
| "reward_before_mean": 0.624412227421999, |
| "reward_before_std": 0.33839546935632825, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5372074488550425, |
| "reward_change_min": -0.7525182664394379, |
| "reward_change_std": 0.2967495834454894, |
| "reward_std": 0.4698806367814541, |
| "rewards/accuracy_reward": 0.4166666716337204, |
| "rewards/cosine_scaled_reward": 0.20774555951356888, |
| "step": 363 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2871.395866394043, |
| "epoch": 0.416, |
| "grad_norm": 0.021756965667009354, |
| "kl": 0.0003199577331542969, |
| "lambda_div_used": 0.6380201950669289, |
| "learning_rate": 2.9060545772359305e-07, |
| "loss": 0.0094, |
| "reward": -0.06151282729115337, |
| "reward_after_mean": -0.06151282729115337, |
| "reward_after_std": 0.6731106694787741, |
| "reward_before_mean": 0.22438967041671276, |
| "reward_before_std": 0.6390752401202917, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2859025076031685, |
| "reward_change_min": -0.4525425359606743, |
| "reward_change_std": 0.18198375776410103, |
| "reward_std": 0.6731106787919998, |
| "rewards/accuracy_reward": 0.22916667349636555, |
| "rewards/cosine_scaled_reward": -0.004777010064572096, |
| "step": 364 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2915.7708892822266, |
| "epoch": 0.41714285714285715, |
| "grad_norm": 0.026546625420451164, |
| "kl": 0.00028133392333984375, |
| "lambda_div_used": 0.6235380545258522, |
| "learning_rate": 2.8804466342921987e-07, |
| "loss": 0.0013, |
| "reward": -0.19956049136817455, |
| "reward_after_mean": -0.19956049136817455, |
| "reward_after_std": 0.5996164344251156, |
| "reward_before_mean": 0.056321932934224606, |
| "reward_before_std": 0.5726153058931231, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2558824270963669, |
| "reward_change_min": -0.486958272755146, |
| "reward_change_std": 0.17727997712790966, |
| "reward_std": 0.5996164586395025, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/cosine_scaled_reward": -0.11034472845494747, |
| "step": 365 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1711.791690826416, |
| "epoch": 0.41828571428571426, |
| "grad_norm": 0.030281659215688705, |
| "kl": 0.00020581483840942383, |
| "lambda_div_used": 0.5937831178307533, |
| "learning_rate": 2.854966364683872e-07, |
| "loss": 0.0313, |
| "reward": 0.19904952123761177, |
| "reward_after_mean": 0.19904952123761177, |
| "reward_after_std": 0.5501149389892817, |
| "reward_before_mean": 0.7462510112673044, |
| "reward_before_std": 0.428714738227427, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5472014844417572, |
| "reward_change_min": -0.7698434814810753, |
| "reward_change_std": 0.31167063396424055, |
| "reward_std": 0.550114942714572, |
| "rewards/accuracy_reward": 0.5208333432674408, |
| "rewards/cosine_scaled_reward": 0.2254176577553153, |
| "step": 366 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2489.0209045410156, |
| "epoch": 0.41942857142857143, |
| "grad_norm": 0.02355087362229824, |
| "kl": 0.00023761391639709473, |
| "lambda_div_used": 0.6294166967272758, |
| "learning_rate": 2.829615010283344e-07, |
| "loss": -0.0082, |
| "reward": 0.1264641396701336, |
| "reward_after_mean": 0.1264641396701336, |
| "reward_after_std": 0.6499300934374332, |
| "reward_before_mean": 0.5483083166182041, |
| "reward_before_std": 0.6043886244297028, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4218441918492317, |
| "reward_change_min": -0.6975029557943344, |
| "reward_change_std": 0.2718219608068466, |
| "reward_std": 0.6499301269650459, |
| "rewards/accuracy_reward": 0.3958333432674408, |
| "rewards/cosine_scaled_reward": 0.15247498638927937, |
| "step": 367 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3003.458335876465, |
| "epoch": 0.4205714285714286, |
| "grad_norm": 0.030648062005639076, |
| "kl": 0.00033867359161376953, |
| "lambda_div_used": 0.5932426005601883, |
| "learning_rate": 2.8043938066798645e-07, |
| "loss": 0.0339, |
| "reward": -0.11047623306512833, |
| "reward_after_mean": -0.11047623306512833, |
| "reward_after_std": 0.46016608364880085, |
| "reward_before_mean": 0.24663935555145144, |
| "reward_before_std": 0.42351202201098204, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3571155872195959, |
| "reward_change_min": -0.523664090782404, |
| "reward_change_std": 0.21024074219167233, |
| "reward_std": 0.460166085511446, |
| "rewards/accuracy_reward": 0.22916667722165585, |
| "rewards/cosine_scaled_reward": 0.01747269369661808, |
| "step": 368 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2393.520854949951, |
| "epoch": 0.4217142857142857, |
| "grad_norm": 0.03554327413439751, |
| "kl": 0.00029546022415161133, |
| "lambda_div_used": 0.6290072500705719, |
| "learning_rate": 2.7793039831193133e-07, |
| "loss": -0.1119, |
| "reward": 0.045321037992835045, |
| "reward_after_mean": 0.045321037992835045, |
| "reward_after_std": 0.6632880251854658, |
| "reward_before_mean": 0.42809890396893024, |
| "reward_before_std": 0.6034117415547371, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3827778585255146, |
| "reward_change_min": -0.6426752880215645, |
| "reward_change_std": 0.24680283293128014, |
| "reward_std": 0.6632880419492722, |
| "rewards/accuracy_reward": 0.31250000186264515, |
| "rewards/cosine_scaled_reward": 0.11559889325872064, |
| "step": 369 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3084.6458587646484, |
| "epoch": 0.4228571428571429, |
| "grad_norm": 0.021294524893164635, |
| "kl": 0.00037282705307006836, |
| "lambda_div_used": 0.6392548009753227, |
| "learning_rate": 2.7543467624442956e-07, |
| "loss": 0.0248, |
| "reward": 0.1099370252341032, |
| "reward_after_mean": 0.1099370252341032, |
| "reward_after_std": 0.7434613928198814, |
| "reward_before_mean": 0.5280295421835035, |
| "reward_before_std": 0.6512415455654263, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4180925004184246, |
| "reward_change_min": -0.6554959528148174, |
| "reward_change_std": 0.2566772401332855, |
| "reward_std": 0.7434614114463329, |
| "rewards/accuracy_reward": 0.3750000074505806, |
| "rewards/cosine_scaled_reward": 0.15302953217178583, |
| "step": 370 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1583.729190826416, |
| "epoch": 0.424, |
| "grad_norm": 0.03987959772348404, |
| "kl": 0.000273287296295166, |
| "lambda_div_used": 0.5595665127038956, |
| "learning_rate": 2.729523361034538e-07, |
| "loss": 0.015, |
| "reward": -0.05838925391435623, |
| "reward_after_mean": -0.05838925391435623, |
| "reward_after_std": 0.44396297819912434, |
| "reward_before_mean": 0.4467340558767319, |
| "reward_before_std": 0.2712427484802902, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5051233097910881, |
| "reward_change_min": -0.7045671716332436, |
| "reward_change_std": 0.27023117896169424, |
| "reward_std": 0.44396298564970493, |
| "rewards/accuracy_reward": 0.4166666679084301, |
| "rewards/cosine_scaled_reward": 0.030067380517721176, |
| "step": 371 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2887.87504196167, |
| "epoch": 0.42514285714285716, |
| "grad_norm": 0.024510102346539497, |
| "kl": 0.0002981424331665039, |
| "lambda_div_used": 0.6707161664962769, |
| "learning_rate": 2.7048349887476037e-07, |
| "loss": -0.0009, |
| "reward": 0.25641736947000027, |
| "reward_after_mean": 0.25641736947000027, |
| "reward_after_std": 0.7552758120000362, |
| "reward_before_mean": 0.6394204869866371, |
| "reward_before_std": 0.800614426843822, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3830030895769596, |
| "reward_change_min": -0.689335536211729, |
| "reward_change_std": 0.28328478895127773, |
| "reward_std": 0.7552758287638426, |
| "rewards/accuracy_reward": 0.4583333469927311, |
| "rewards/cosine_scaled_reward": 0.18108712136745453, |
| "step": 372 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1918.5833930969238, |
| "epoch": 0.42628571428571427, |
| "grad_norm": 0.03153732046484947, |
| "kl": 0.00024700164794921875, |
| "lambda_div_used": 0.6141555905342102, |
| "learning_rate": 2.6802828488599294e-07, |
| "loss": 0.0114, |
| "reward": 0.040444918908178806, |
| "reward_after_mean": 0.040444918908178806, |
| "reward_after_std": 0.6271691359579563, |
| "reward_before_mean": 0.4637982491403818, |
| "reward_before_std": 0.5247453823685646, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.42335335724055767, |
| "reward_change_min": -0.6717953830957413, |
| "reward_change_std": 0.2525172745808959, |
| "reward_std": 0.6271691434085369, |
| "rewards/accuracy_reward": 0.3541666753590107, |
| "rewards/cosine_scaled_reward": 0.10963157773949206, |
| "step": 373 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1947.4792213439941, |
| "epoch": 0.42742857142857144, |
| "grad_norm": 0.036099888384342194, |
| "kl": 0.0002910494804382324, |
| "lambda_div_used": 0.6011270731687546, |
| "learning_rate": 2.655868138008171e-07, |
| "loss": -0.1104, |
| "reward": 0.05605058930814266, |
| "reward_after_mean": 0.05605058930814266, |
| "reward_after_std": 0.614537576213479, |
| "reward_before_mean": 0.5277584344148636, |
| "reward_before_std": 0.46880532428622246, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4717078376561403, |
| "reward_change_min": -0.7001210488379002, |
| "reward_change_std": 0.270130792632699, |
| "reward_std": 0.6145375911146402, |
| "rewards/accuracy_reward": 0.37500000186264515, |
| "rewards/cosine_scaled_reward": 0.1527584195137024, |
| "step": 374 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2643.3333892822266, |
| "epoch": 0.42857142857142855, |
| "grad_norm": 0.025029828771948814, |
| "kl": 0.0003476142883300781, |
| "lambda_div_used": 0.558178536593914, |
| "learning_rate": 2.631592046130896e-07, |
| "loss": -0.0046, |
| "reward": 0.06168156489729881, |
| "reward_after_mean": 0.06168156489729881, |
| "reward_after_std": 0.46577400900423527, |
| "reward_before_mean": 0.6490027587860823, |
| "reward_before_std": 0.2644077790901065, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5873212069272995, |
| "reward_change_min": -0.8054014258086681, |
| "reward_change_std": 0.3127285521477461, |
| "reward_std": 0.46577401272952557, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/cosine_scaled_reward": 0.2115027718245983, |
| "step": 375 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2176.875045776367, |
| "epoch": 0.4297142857142857, |
| "grad_norm": 0.025618579238653183, |
| "kl": 0.00024950504302978516, |
| "lambda_div_used": 0.5522258281707764, |
| "learning_rate": 2.6074557564105724e-07, |
| "loss": 0.0245, |
| "reward": -0.24329839646816254, |
| "reward_after_mean": -0.24329839646816254, |
| "reward_after_std": 0.3648342005908489, |
| "reward_before_mean": 0.15543348528444767, |
| "reward_before_std": 0.2328620203770697, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3987318556755781, |
| "reward_change_min": -0.5627252347767353, |
| "reward_change_std": 0.21180008072406054, |
| "reward_std": 0.3648342117667198, |
| "rewards/accuracy_reward": 0.2083333358168602, |
| "rewards/cosine_scaled_reward": -0.052899875794537365, |
| "step": 376 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3302.1875610351562, |
| "epoch": 0.4308571428571429, |
| "grad_norm": 0.019104059785604477, |
| "kl": 0.0004137754440307617, |
| "lambda_div_used": 0.6249697953462601, |
| "learning_rate": 2.583460445215911e-07, |
| "loss": 0.0065, |
| "reward": -0.03637286019511521, |
| "reward_after_mean": -0.03637286019511521, |
| "reward_after_std": 0.6691129393875599, |
| "reward_before_mean": 0.30246374011039734, |
| "reward_before_std": 0.578639387153089, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3388365972787142, |
| "reward_change_min": -0.5369448103010654, |
| "reward_change_std": 0.20115663390606642, |
| "reward_std": 0.6691129766404629, |
| "rewards/accuracy_reward": 0.2916666716337204, |
| "rewards/cosine_scaled_reward": 0.010797052644193172, |
| "step": 377 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1876.0208568572998, |
| "epoch": 0.432, |
| "grad_norm": 0.039962347596883774, |
| "kl": 0.0002534538507461548, |
| "lambda_div_used": 0.6230520308017731, |
| "learning_rate": 2.5596072820445254e-07, |
| "loss": 0.0138, |
| "reward": 0.20900094881653786, |
| "reward_after_mean": 0.20900094881653786, |
| "reward_after_std": 0.685304744169116, |
| "reward_before_mean": 0.7048993427306414, |
| "reward_before_std": 0.577358863549307, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4958983939141035, |
| "reward_change_min": -0.7410164549946785, |
| "reward_change_std": 0.3000446343794465, |
| "reward_std": 0.6853047590702772, |
| "rewards/accuracy_reward": 0.5000000055879354, |
| "rewards/cosine_scaled_reward": 0.204899336444214, |
| "step": 378 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2977.7917289733887, |
| "epoch": 0.43314285714285716, |
| "grad_norm": 0.023082684725522995, |
| "kl": 0.00034427642822265625, |
| "lambda_div_used": 0.564103439450264, |
| "learning_rate": 2.5358974294659373e-07, |
| "loss": 0.0472, |
| "reward": -0.2481522224843502, |
| "reward_after_mean": -0.2481522224843502, |
| "reward_after_std": 0.3903357107192278, |
| "reward_before_mean": 0.1231729257851839, |
| "reward_before_std": 0.2936667911708355, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.37132514640688896, |
| "reward_change_min": -0.5592780411243439, |
| "reward_change_std": 0.21076095290482044, |
| "reward_std": 0.39033573493361473, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/cosine_scaled_reward": -0.06432707794010639, |
| "step": 379 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2197.2708740234375, |
| "epoch": 0.4342857142857143, |
| "grad_norm": 0.0468011274933815, |
| "kl": 0.0003237128257751465, |
| "lambda_div_used": 0.6098255217075348, |
| "learning_rate": 2.512332043064913e-07, |
| "loss": -0.1385, |
| "reward": -0.01926261931657791, |
| "reward_after_mean": -0.01926261931657791, |
| "reward_after_std": 0.582806745544076, |
| "reward_before_mean": 0.3642494883388281, |
| "reward_before_std": 0.5125604961067438, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.38351211696863174, |
| "reward_change_min": -0.6093635484576225, |
| "reward_change_std": 0.24249585159122944, |
| "reward_std": 0.5828067641705275, |
| "rewards/accuracy_reward": 0.3333333358168602, |
| "rewards/cosine_scaled_reward": 0.030916159972548485, |
| "step": 380 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2804.291702270508, |
| "epoch": 0.43542857142857144, |
| "grad_norm": 0.03009038046002388, |
| "kl": 0.00038611888885498047, |
| "lambda_div_used": 0.6074136793613434, |
| "learning_rate": 2.488912271385139e-07, |
| "loss": 0.0773, |
| "reward": -0.2030959241092205, |
| "reward_after_mean": -0.2030959241092205, |
| "reward_after_std": 0.5294999033212662, |
| "reward_before_mean": 0.0848972403910011, |
| "reward_before_std": 0.4909538859501481, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.287993174046278, |
| "reward_change_min": -0.4920281432569027, |
| "reward_change_std": 0.1843261569738388, |
| "reward_std": 0.5294999219477177, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.08176943706348538, |
| "step": 381 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1741.7292022705078, |
| "epoch": 0.43657142857142855, |
| "grad_norm": 0.02836902253329754, |
| "kl": 0.000269085168838501, |
| "lambda_div_used": 0.593373216688633, |
| "learning_rate": 2.465639255873246e-07, |
| "loss": 0.0017, |
| "reward": -0.2921748459339142, |
| "reward_after_mean": -0.2921748459339142, |
| "reward_after_std": 0.4851537048816681, |
| "reward_before_mean": -0.02789500029757619, |
| "reward_before_std": 0.4276847830042243, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2642798572778702, |
| "reward_change_min": -0.4220714569091797, |
| "reward_change_std": 0.156461289152503, |
| "reward_std": 0.485153716057539, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/cosine_scaled_reward": -0.152894988656044, |
| "step": 382 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2540.208366394043, |
| "epoch": 0.4377142857142857, |
| "grad_norm": 0.027282925322651863, |
| "kl": 0.00039958953857421875, |
| "lambda_div_used": 0.6196694001555443, |
| "learning_rate": 2.4425141308231765e-07, |
| "loss": -0.0249, |
| "reward": -0.009514571633189917, |
| "reward_after_mean": -0.009514571633189917, |
| "reward_after_std": 0.6358621753752232, |
| "reward_before_mean": 0.3640937558375299, |
| "reward_before_std": 0.5521808844059706, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3736083246767521, |
| "reward_change_min": -0.5938910432159901, |
| "reward_change_std": 0.22305074147880077, |
| "reward_std": 0.6358622014522552, |
| "rewards/accuracy_reward": 0.2916666716337204, |
| "rewards/cosine_scaled_reward": 0.07242710120044649, |
| "step": 383 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2133.9166984558105, |
| "epoch": 0.43885714285714283, |
| "grad_norm": 0.031124358996748924, |
| "kl": 0.00030043721199035645, |
| "lambda_div_used": 0.6193316578865051, |
| "learning_rate": 2.4195380233209006e-07, |
| "loss": -0.0227, |
| "reward": 0.32511539570987225, |
| "reward_after_mean": 0.32511539570987225, |
| "reward_after_std": 0.7301186248660088, |
| "reward_before_mean": 0.913657930213958, |
| "reward_before_std": 0.5516102942638099, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.588542552664876, |
| "reward_change_min": -0.8305801004171371, |
| "reward_change_std": 0.32795302756130695, |
| "reward_std": 0.7301186472177505, |
| "rewards/accuracy_reward": 0.5833333395421505, |
| "rewards/cosine_scaled_reward": 0.3303246097639203, |
| "step": 384 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2633.770851135254, |
| "epoch": 0.44, |
| "grad_norm": 0.018747175112366676, |
| "kl": 0.00027829408645629883, |
| "lambda_div_used": 0.59091367572546, |
| "learning_rate": 2.3967120531894857e-07, |
| "loss": 0.0052, |
| "reward": -0.16111253947019577, |
| "reward_after_mean": -0.16111253947019577, |
| "reward_after_std": 0.4700228702276945, |
| "reward_before_mean": 0.17826138995587826, |
| "reward_before_std": 0.41598498076200485, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3393739238381386, |
| "reward_change_min": -0.5320228524506092, |
| "reward_change_std": 0.20085815154016018, |
| "reward_std": 0.4700228702276945, |
| "rewards/accuracy_reward": 0.22916666977107525, |
| "rewards/cosine_scaled_reward": -0.05090527608990669, |
| "step": 385 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2641.6250762939453, |
| "epoch": 0.44114285714285717, |
| "grad_norm": 0.022516794502735138, |
| "kl": 0.00033086538314819336, |
| "lambda_div_used": 0.5835114791989326, |
| "learning_rate": 2.374037332934512e-07, |
| "loss": -0.0898, |
| "reward": -0.16643539629876614, |
| "reward_after_mean": -0.16643539629876614, |
| "reward_after_std": 0.47341430373489857, |
| "reward_before_mean": 0.20273982174694538, |
| "reward_before_std": 0.38260515965521336, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.36917522735893726, |
| "reward_change_min": -0.5404521636664867, |
| "reward_change_std": 0.20916004106402397, |
| "reward_std": 0.4734143167734146, |
| "rewards/accuracy_reward": 0.22916666977107525, |
| "rewards/cosine_scaled_reward": -0.02642684616148472, |
| "step": 386 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2953.6875610351562, |
| "epoch": 0.4422857142857143, |
| "grad_norm": 0.023601215332746506, |
| "kl": 0.0003770887851715088, |
| "lambda_div_used": 0.6255086436867714, |
| "learning_rate": 2.3515149676898552e-07, |
| "loss": 0.013, |
| "reward": -0.055427778512239456, |
| "reward_after_mean": -0.055427778512239456, |
| "reward_after_std": 0.6565756388008595, |
| "reward_before_mean": 0.2887960313819349, |
| "reward_before_std": 0.582161720842123, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3442238178104162, |
| "reward_change_min": -0.555218169465661, |
| "reward_change_std": 0.2120614117011428, |
| "reward_std": 0.6565756406635046, |
| "rewards/accuracy_reward": 0.2916666716337204, |
| "rewards/cosine_scaled_reward": -0.0028706385055556893, |
| "step": 387 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2360.6667251586914, |
| "epoch": 0.44342857142857145, |
| "grad_norm": 0.026174332946538925, |
| "kl": 0.00029283761978149414, |
| "lambda_div_used": 0.5821729674935341, |
| "learning_rate": 2.3291460551638237e-07, |
| "loss": -0.0288, |
| "reward": -0.10938419215381145, |
| "reward_after_mean": -0.10938419215381145, |
| "reward_after_std": 0.4824158512055874, |
| "reward_before_mean": 0.2964679952710867, |
| "reward_before_std": 0.37308686412870884, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.405852185562253, |
| "reward_change_min": -0.5691216923296452, |
| "reward_change_std": 0.2231999458745122, |
| "reward_std": 0.4824158661067486, |
| "rewards/accuracy_reward": 0.2708333395421505, |
| "rewards/cosine_scaled_reward": 0.025634657591581345, |
| "step": 388 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2439.229232788086, |
| "epoch": 0.44457142857142856, |
| "grad_norm": 0.023821894079446793, |
| "kl": 0.00028192996978759766, |
| "lambda_div_used": 0.6302470341324806, |
| "learning_rate": 2.306931685585657e-07, |
| "loss": 0.0324, |
| "reward": 0.04070591554045677, |
| "reward_after_mean": 0.04070591554045677, |
| "reward_after_std": 0.6089895591139793, |
| "reward_before_mean": 0.3929547220468521, |
| "reward_before_std": 0.6121768653392792, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.35224880650639534, |
| "reward_change_min": -0.6027859784662724, |
| "reward_change_std": 0.2432803064584732, |
| "reward_std": 0.6089895665645599, |
| "rewards/accuracy_reward": 0.2916666716337204, |
| "rewards/cosine_scaled_reward": 0.10128805413842201, |
| "step": 389 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2221.3750610351562, |
| "epoch": 0.44571428571428573, |
| "grad_norm": 0.026545513421297073, |
| "kl": 0.0002053976058959961, |
| "lambda_div_used": 0.5998165532946587, |
| "learning_rate": 2.2848729416523859e-07, |
| "loss": 0.0542, |
| "reward": 0.11247721314430237, |
| "reward_after_mean": 0.11247721314430237, |
| "reward_after_std": 0.5243697017431259, |
| "reward_before_mean": 0.5878691142424941, |
| "reward_before_std": 0.4597471170127392, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4753919020295143, |
| "reward_change_min": -0.7015566639602184, |
| "reward_change_std": 0.28336913883686066, |
| "reward_std": 0.5243697185069323, |
| "rewards/accuracy_reward": 0.3750000074505806, |
| "rewards/cosine_scaled_reward": 0.2128690993413329, |
| "step": 390 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2397.791736602783, |
| "epoch": 0.44685714285714284, |
| "grad_norm": 0.030633823946118355, |
| "kl": 0.0002848505973815918, |
| "lambda_div_used": 0.6453134343028069, |
| "learning_rate": 2.2629708984760706e-07, |
| "loss": 0.0079, |
| "reward": 0.32186132250353694, |
| "reward_after_mean": 0.32186132250353694, |
| "reward_after_std": 0.7361500542610884, |
| "reward_before_mean": 0.8155574453994632, |
| "reward_before_std": 0.6687694359570742, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4936961196362972, |
| "reward_change_min": -0.7406940795481205, |
| "reward_change_std": 0.2951981630176306, |
| "reward_std": 0.7361500766128302, |
| "rewards/accuracy_reward": 0.5208333507180214, |
| "rewards/cosine_scaled_reward": 0.2947241172660142, |
| "step": 391 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1701.3541946411133, |
| "epoch": 0.448, |
| "grad_norm": 0.030896564945578575, |
| "kl": 0.00023484230041503906, |
| "lambda_div_used": 0.5580313578248024, |
| "learning_rate": 2.2412266235313973e-07, |
| "loss": 0.0276, |
| "reward": -0.1474175527691841, |
| "reward_after_mean": -0.1474175527691841, |
| "reward_after_std": 0.4200621973723173, |
| "reward_before_mean": 0.3056653430685401, |
| "reward_before_std": 0.2651460962370038, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.45308290608227253, |
| "reward_change_min": -0.6591883115470409, |
| "reward_change_std": 0.24698374886065722, |
| "reward_std": 0.42006222531199455, |
| "rewards/accuracy_reward": 0.2916666679084301, |
| "rewards/cosine_scaled_reward": 0.013998678419739008, |
| "step": 392 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2144.7708892822266, |
| "epoch": 0.4491428571428571, |
| "grad_norm": 0.025813451036810875, |
| "kl": 0.00031191110610961914, |
| "lambda_div_used": 0.6718219220638275, |
| "learning_rate": 2.2196411766036487e-07, |
| "loss": -0.0357, |
| "reward": 0.048092352226376534, |
| "reward_after_mean": 0.048092352226376534, |
| "reward_after_std": 0.7942012958228588, |
| "reward_before_mean": 0.32687312876805663, |
| "reward_before_std": 0.8060374613851309, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27878078632056713, |
| "reward_change_min": -0.5697837248444557, |
| "reward_change_std": 0.22098596021533012, |
| "reward_std": 0.7942013349384069, |
| "rewards/accuracy_reward": 0.31250000558793545, |
| "rewards/cosine_scaled_reward": 0.014373119222000241, |
| "step": 393 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3021.5833740234375, |
| "epoch": 0.4502857142857143, |
| "grad_norm": 0.02360186167061329, |
| "kl": 0.00038444995880126953, |
| "lambda_div_used": 0.6319213733077049, |
| "learning_rate": 2.1982156097370557e-07, |
| "loss": 0.052, |
| "reward": -0.20860249735414982, |
| "reward_after_mean": -0.20860249735414982, |
| "reward_after_std": 0.6297264527529478, |
| "reward_before_mean": 0.03818079084157944, |
| "reward_before_std": 0.6159613355994225, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2467832900583744, |
| "reward_change_min": -0.5391863323748112, |
| "reward_change_std": 0.1899872226640582, |
| "reward_std": 0.6297264751046896, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.12848588544875383, |
| "step": 394 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2135.8333473205566, |
| "epoch": 0.4514285714285714, |
| "grad_norm": 0.04946435987949371, |
| "kl": 0.00024586915969848633, |
| "lambda_div_used": 0.576830618083477, |
| "learning_rate": 2.1769509671835223e-07, |
| "loss": -0.0301, |
| "reward": -0.24670689832419157, |
| "reward_after_mean": -0.24670689832419157, |
| "reward_after_std": 0.4664156064391136, |
| "reward_before_mean": 0.09556343220174313, |
| "reward_before_std": 0.35311094112694263, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3422703631222248, |
| "reward_change_min": -0.5164104513823986, |
| "reward_change_std": 0.19275081250816584, |
| "reward_std": 0.466415636241436, |
| "rewards/accuracy_reward": 0.18750000186264515, |
| "rewards/cosine_scaled_reward": -0.09193656174466014, |
| "step": 395 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2614.750068664551, |
| "epoch": 0.45257142857142857, |
| "grad_norm": 0.021299051120877266, |
| "kl": 0.00028631091117858887, |
| "lambda_div_used": 0.6214602738618851, |
| "learning_rate": 2.1558482853517253e-07, |
| "loss": 0.0454, |
| "reward": 0.09535084664821625, |
| "reward_after_mean": 0.09535084664821625, |
| "reward_after_std": 0.6240330375730991, |
| "reward_before_mean": 0.5079526733607054, |
| "reward_before_std": 0.5631251083686948, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.41260186582803726, |
| "reward_change_min": -0.6420417241752148, |
| "reward_change_std": 0.2541531687602401, |
| "reward_std": 0.6240330524742603, |
| "rewards/accuracy_reward": 0.37500000558793545, |
| "rewards/cosine_scaled_reward": 0.13295269757509232, |
| "step": 396 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2262.5625610351562, |
| "epoch": 0.45371428571428574, |
| "grad_norm": 0.023174487054347992, |
| "kl": 0.00028808414936065674, |
| "lambda_div_used": 0.620752289891243, |
| "learning_rate": 2.134908592756607e-07, |
| "loss": -0.0334, |
| "reward": 0.0681952117010951, |
| "reward_after_mean": 0.0681952117010951, |
| "reward_after_std": 0.6165110263973475, |
| "reward_before_mean": 0.48235524632036686, |
| "reward_before_std": 0.5585681181401014, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.41416002810001373, |
| "reward_change_min": -0.6492009982466698, |
| "reward_change_std": 0.2585160303860903, |
| "reward_std": 0.6165110506117344, |
| "rewards/accuracy_reward": 0.35416667349636555, |
| "rewards/cosine_scaled_reward": 0.1281885566713754, |
| "step": 397 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2219.1875534057617, |
| "epoch": 0.45485714285714285, |
| "grad_norm": 0.02591089904308319, |
| "kl": 0.0003045201301574707, |
| "lambda_div_used": 0.6068568229675293, |
| "learning_rate": 2.1141329099692406e-07, |
| "loss": 0.0302, |
| "reward": -0.10764243453741074, |
| "reward_after_mean": -0.10764243453741074, |
| "reward_after_std": 0.5858584549278021, |
| "reward_before_mean": 0.2538683768361807, |
| "reward_before_std": 0.4971063416451216, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.36151083186268806, |
| "reward_change_min": -0.6307843886315823, |
| "reward_change_std": 0.2273276075720787, |
| "reward_std": 0.5858584903180599, |
| "rewards/accuracy_reward": 0.2500000037252903, |
| "rewards/cosine_scaled_reward": 0.003868376836180687, |
| "step": 398 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2028.7292175292969, |
| "epoch": 0.456, |
| "grad_norm": 0.023573419079184532, |
| "kl": 0.00022789835929870605, |
| "lambda_div_used": 0.6212376356124878, |
| "learning_rate": 2.0935222495670968e-07, |
| "loss": 0.0692, |
| "reward": 0.19389131292700768, |
| "reward_after_mean": 0.19389131292700768, |
| "reward_after_std": 0.6119374781847, |
| "reward_before_mean": 0.656364331021905, |
| "reward_before_std": 0.5594985205680132, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4624730013310909, |
| "reward_change_min": -0.6958450116217136, |
| "reward_change_std": 0.28642346803098917, |
| "reward_std": 0.6119375005364418, |
| "rewards/accuracy_reward": 0.479166679084301, |
| "rewards/cosine_scaled_reward": 0.1771976239979267, |
| "step": 399 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1497.520881652832, |
| "epoch": 0.45714285714285713, |
| "grad_norm": 0.034730274230241776, |
| "kl": 0.00023819506168365479, |
| "lambda_div_used": 0.6499348282814026, |
| "learning_rate": 2.0730776160846853e-07, |
| "loss": -0.0384, |
| "reward": 0.35817267652601004, |
| "reward_after_mean": 0.35817267652601004, |
| "reward_after_std": 0.7002598587423563, |
| "reward_before_mean": 0.8351266942918301, |
| "reward_before_std": 0.6965709868818521, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.47695402428507805, |
| "reward_change_min": -0.7681624032557011, |
| "reward_change_std": 0.3101581484079361, |
| "reward_std": 0.7002598755061626, |
| "rewards/accuracy_reward": 0.5208333469927311, |
| "rewards/cosine_scaled_reward": 0.31429335149005055, |
| "step": 400 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2817.020851135254, |
| "epoch": 0.4582857142857143, |
| "grad_norm": 0.024576053023338318, |
| "kl": 0.00034928321838378906, |
| "lambda_div_used": 0.5570264235138893, |
| "learning_rate": 2.0528000059645995e-07, |
| "loss": 0.0255, |
| "reward": -0.17014812678098679, |
| "reward_after_mean": -0.17014812678098679, |
| "reward_after_std": 0.3594451379030943, |
| "reward_before_mean": 0.25060533825308084, |
| "reward_before_std": 0.25669852178543806, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4207534771412611, |
| "reward_change_min": -0.5965141579508781, |
| "reward_change_std": 0.2299406472593546, |
| "reward_std": 0.35944515466690063, |
| "rewards/accuracy_reward": 0.2291666716337204, |
| "rewards/cosine_scaled_reward": 0.02143866289407015, |
| "step": 401 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2323.062530517578, |
| "epoch": 0.4594285714285714, |
| "grad_norm": 0.02968554012477398, |
| "kl": 0.00033861398696899414, |
| "lambda_div_used": 0.579738162457943, |
| "learning_rate": 2.032690407508949e-07, |
| "loss": -0.0104, |
| "reward": -0.20102215744554996, |
| "reward_after_mean": -0.20102215744554996, |
| "reward_after_std": 0.4696238599717617, |
| "reward_before_mean": 0.16348140873014927, |
| "reward_before_std": 0.36563692055642605, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.36450355127453804, |
| "reward_change_min": -0.5928855016827583, |
| "reward_change_std": 0.21332142874598503, |
| "reward_std": 0.46962387673556805, |
| "rewards/accuracy_reward": 0.2083333358168602, |
| "rewards/cosine_scaled_reward": -0.044851938262581825, |
| "step": 402 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1727.2708587646484, |
| "epoch": 0.4605714285714286, |
| "grad_norm": 0.03782231733202934, |
| "kl": 0.0002620220184326172, |
| "lambda_div_used": 0.5557271614670753, |
| "learning_rate": 2.0127498008311922e-07, |
| "loss": 0.0459, |
| "reward": -0.10220484808087349, |
| "reward_after_mean": -0.10220484808087349, |
| "reward_after_std": 0.4127990063279867, |
| "reward_before_mean": 0.37698337249457836, |
| "reward_before_std": 0.25216651428490877, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4791882447898388, |
| "reward_change_min": -0.6764676049351692, |
| "reward_change_std": 0.25756980665028095, |
| "reward_std": 0.4127990175038576, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/cosine_scaled_reward": 0.06448337621986866, |
| "step": 403 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2469.5000076293945, |
| "epoch": 0.4617142857142857, |
| "grad_norm": 0.0334598608314991, |
| "kl": 0.00029639899730682373, |
| "lambda_div_used": 0.5738808363676071, |
| "learning_rate": 1.9929791578083655e-07, |
| "loss": 0.0173, |
| "reward": 0.004905553534626961, |
| "reward_after_mean": 0.004905553534626961, |
| "reward_after_std": 0.48677791468799114, |
| "reward_before_mean": 0.506166247650981, |
| "reward_before_std": 0.33276718482375145, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5012606829404831, |
| "reward_change_min": -0.6952128820121288, |
| "reward_change_std": 0.26712857093662024, |
| "reward_std": 0.4867779165506363, |
| "rewards/accuracy_reward": 0.37500000558793545, |
| "rewards/cosine_scaled_reward": 0.1311662346124649, |
| "step": 404 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1970.9167022705078, |
| "epoch": 0.46285714285714286, |
| "grad_norm": 0.03668729215860367, |
| "kl": 0.00032258033752441406, |
| "lambda_div_used": 0.6146402955055237, |
| "learning_rate": 1.9733794420337213e-07, |
| "loss": 0.0422, |
| "reward": 0.1430281363427639, |
| "reward_after_mean": 0.1430281363427639, |
| "reward_after_std": 0.5927682984620333, |
| "reward_before_mean": 0.610472509637475, |
| "reward_before_std": 0.530167305842042, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4674443490803242, |
| "reward_change_min": -0.726629700511694, |
| "reward_change_std": 0.28909510001540184, |
| "reward_std": 0.592768307775259, |
| "rewards/accuracy_reward": 0.4375000111758709, |
| "rewards/cosine_scaled_reward": 0.17297248914837837, |
| "step": 405 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2001.6875228881836, |
| "epoch": 0.464, |
| "grad_norm": 0.022773489356040955, |
| "kl": 0.0002652406692504883, |
| "lambda_div_used": 0.6149442344903946, |
| "learning_rate": 1.9539516087697517e-07, |
| "loss": -0.01, |
| "reward": 0.13723283261060715, |
| "reward_after_mean": 0.13723283261060715, |
| "reward_after_std": 0.6057328097522259, |
| "reward_before_mean": 0.5846194308251143, |
| "reward_before_std": 0.5296543845906854, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.44738658517599106, |
| "reward_change_min": -0.6640791147947311, |
| "reward_change_std": 0.267610440030694, |
| "reward_std": 0.6057328246533871, |
| "rewards/accuracy_reward": 0.41666667722165585, |
| "rewards/cosine_scaled_reward": 0.16795274708420038, |
| "step": 406 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2183.3333892822266, |
| "epoch": 0.46514285714285714, |
| "grad_norm": 0.029050234705209732, |
| "kl": 0.00023674964904785156, |
| "lambda_div_used": 0.5692102611064911, |
| "learning_rate": 1.934696604901642e-07, |
| "loss": -0.002, |
| "reward": 0.08383433520793915, |
| "reward_after_mean": 0.08383433520793915, |
| "reward_after_std": 0.5238823061808944, |
| "reward_before_mean": 0.6431381715228781, |
| "reward_before_std": 0.31836726085748523, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5593038275837898, |
| "reward_change_min": -0.7351665589958429, |
| "reward_change_std": 0.29199546575546265, |
| "reward_std": 0.5238823387771845, |
| "rewards/accuracy_reward": 0.47916666977107525, |
| "rewards/cosine_scaled_reward": 0.16397148557007313, |
| "step": 407 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2340.208396911621, |
| "epoch": 0.4662857142857143, |
| "grad_norm": 0.0308608990162611, |
| "kl": 0.0002751350402832031, |
| "lambda_div_used": 0.595428429543972, |
| "learning_rate": 1.915615368891117e-07, |
| "loss": -0.0448, |
| "reward": -0.14165206719189882, |
| "reward_after_mean": -0.14165206719189882, |
| "reward_after_std": 0.5383005198091269, |
| "reward_before_mean": 0.22720495285466313, |
| "reward_before_std": 0.4390671527944505, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3688570037484169, |
| "reward_change_min": -0.560546163469553, |
| "reward_change_std": 0.21708335354924202, |
| "reward_std": 0.5383005253970623, |
| "rewards/accuracy_reward": 0.27083333767950535, |
| "rewards/cosine_scaled_reward": -0.043628389947116375, |
| "step": 408 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3338.3333740234375, |
| "epoch": 0.4674285714285714, |
| "grad_norm": 0.017489202320575714, |
| "kl": 0.0003743171691894531, |
| "lambda_div_used": 0.641681618988514, |
| "learning_rate": 1.8967088307307e-07, |
| "loss": 0.0221, |
| "reward": 0.0034925403306260705, |
| "reward_after_mean": 0.0034925403306260705, |
| "reward_after_std": 0.7239628247916698, |
| "reward_before_mean": 0.3585042329505086, |
| "reward_before_std": 0.6582456473261118, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3550117015838623, |
| "reward_change_min": -0.627179455012083, |
| "reward_change_std": 0.23101032618433237, |
| "reward_std": 0.7239628490060568, |
| "rewards/accuracy_reward": 0.3125000037252903, |
| "rewards/cosine_scaled_reward": 0.04600422829389572, |
| "step": 409 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2387.291690826416, |
| "epoch": 0.4685714285714286, |
| "grad_norm": 0.028829436749219894, |
| "kl": 0.000278472900390625, |
| "lambda_div_used": 0.6026698350906372, |
| "learning_rate": 1.8779779118983867e-07, |
| "loss": -0.0184, |
| "reward": -0.08084386587142944, |
| "reward_after_mean": -0.08084386587142944, |
| "reward_after_std": 0.5610201843082905, |
| "reward_before_mean": 0.2896402692422271, |
| "reward_before_std": 0.4759355755522847, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.37048413045704365, |
| "reward_change_min": -0.5802675113081932, |
| "reward_change_std": 0.2220335192978382, |
| "reward_std": 0.561020215973258, |
| "rewards/accuracy_reward": 0.27083333767950535, |
| "rewards/cosine_scaled_reward": 0.018806922249495983, |
| "step": 410 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2483.9791717529297, |
| "epoch": 0.4697142857142857, |
| "grad_norm": 0.02735401690006256, |
| "kl": 0.00030410289764404297, |
| "lambda_div_used": 0.5891791060566902, |
| "learning_rate": 1.8594235253127372e-07, |
| "loss": 0.0464, |
| "reward": -0.09278726205229759, |
| "reward_after_mean": -0.09278726205229759, |
| "reward_after_std": 0.48578726314008236, |
| "reward_before_mean": 0.29704072792083025, |
| "reward_before_std": 0.41402094066143036, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3898279666900635, |
| "reward_change_min": -0.6011387817561626, |
| "reward_change_std": 0.234495647251606, |
| "reward_std": 0.48578727059066296, |
| "rewards/accuracy_reward": 0.2708333358168602, |
| "rewards/cosine_scaled_reward": 0.026207380928099155, |
| "step": 411 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2443.000057220459, |
| "epoch": 0.47085714285714286, |
| "grad_norm": 0.027080198749899864, |
| "kl": 0.0003039836883544922, |
| "lambda_div_used": 0.626535639166832, |
| "learning_rate": 1.8410465752883758e-07, |
| "loss": 0.0382, |
| "reward": 0.18809181079268456, |
| "reward_after_mean": 0.18809181079268456, |
| "reward_after_std": 0.6282138898968697, |
| "reward_before_mean": 0.6274868324398994, |
| "reward_before_std": 0.5877205710858107, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4393950141966343, |
| "reward_change_min": -0.6751919612288475, |
| "reward_change_std": 0.2793376138433814, |
| "reward_std": 0.6282139029353857, |
| "rewards/accuracy_reward": 0.4583333432674408, |
| "rewards/cosine_scaled_reward": 0.16915349289774895, |
| "step": 412 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2277.0417251586914, |
| "epoch": 0.472, |
| "grad_norm": 0.02998846024274826, |
| "kl": 0.00023829936981201172, |
| "lambda_div_used": 0.6537542790174484, |
| "learning_rate": 1.822847957491922e-07, |
| "loss": 0.023, |
| "reward": 0.0677551869302988, |
| "reward_after_mean": 0.0677551869302988, |
| "reward_after_std": 0.6939267106354237, |
| "reward_before_mean": 0.3922595623880625, |
| "reward_before_std": 0.7187845781445503, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3245043680071831, |
| "reward_change_min": -0.6250169165432453, |
| "reward_change_std": 0.24318813905119896, |
| "reward_std": 0.6939267329871655, |
| "rewards/accuracy_reward": 0.3333333432674408, |
| "rewards/cosine_scaled_reward": 0.05892622594546992, |
| "step": 413 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3080.5625, |
| "epoch": 0.47314285714285714, |
| "grad_norm": 0.016316330060362816, |
| "kl": 0.00028955936431884766, |
| "lambda_div_used": 0.6257347464561462, |
| "learning_rate": 1.804828558898332e-07, |
| "loss": 0.0235, |
| "reward": -0.13909682049416006, |
| "reward_after_mean": -0.13909682049416006, |
| "reward_after_std": 0.6051931101828814, |
| "reward_before_mean": 0.1403335351496935, |
| "reward_before_std": 0.583410625346005, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2794303596019745, |
| "reward_change_min": -0.45995376631617546, |
| "reward_change_std": 0.1819247854873538, |
| "reward_std": 0.6051931362599134, |
| "rewards/accuracy_reward": 0.1875000037252903, |
| "rewards/cosine_scaled_reward": -0.04716646298766136, |
| "step": 414 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3129.0208740234375, |
| "epoch": 0.4742857142857143, |
| "grad_norm": 0.020433053374290466, |
| "kl": 0.0003305673599243164, |
| "lambda_div_used": 0.6250654757022858, |
| "learning_rate": 1.7869892577476722e-07, |
| "loss": 0.0036, |
| "reward": -0.22648247238248587, |
| "reward_after_mean": -0.22648247238248587, |
| "reward_after_std": 0.6230853609740734, |
| "reward_before_mean": 0.01668240688741207, |
| "reward_before_std": 0.5801385007798672, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.24316489323973656, |
| "reward_change_min": -0.43964531272649765, |
| "reward_change_std": 0.1610111938789487, |
| "reward_std": 0.6230853945016861, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.12915092520415783, |
| "step": 415 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1350.31254196167, |
| "epoch": 0.4754285714285714, |
| "grad_norm": 0.029406633228063583, |
| "kl": 0.00015437602996826172, |
| "lambda_div_used": 0.6057035326957703, |
| "learning_rate": 1.7693309235023127e-07, |
| "loss": -0.006, |
| "reward": -0.15187997743487358, |
| "reward_after_mean": -0.15187997743487358, |
| "reward_after_std": 0.500312227755785, |
| "reward_before_mean": 0.1518111266195774, |
| "reward_before_std": 0.4886645954102278, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3036911189556122, |
| "reward_change_min": -0.5342599004507065, |
| "reward_change_std": 0.20331810228526592, |
| "reward_std": 0.5003122296184301, |
| "rewards/accuracy_reward": 0.2083333395421505, |
| "rewards/cosine_scaled_reward": -0.05652220547199249, |
| "step": 416 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3118.062530517578, |
| "epoch": 0.4765714285714286, |
| "grad_norm": 0.022452035918831825, |
| "kl": 0.00038176774978637695, |
| "lambda_div_used": 0.5929878354072571, |
| "learning_rate": 1.7518544168045524e-07, |
| "loss": -0.0269, |
| "reward": -0.31791230058297515, |
| "reward_after_mean": -0.31791230058297515, |
| "reward_after_std": 0.5058948453515768, |
| "reward_before_mean": -0.05459975823760033, |
| "reward_before_std": 0.4272688911296427, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2633125390857458, |
| "reward_change_min": -0.4145249240100384, |
| "reward_change_std": 0.15364530310034752, |
| "reward_std": 0.5058948528021574, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.13793309262837283, |
| "step": 417 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2075.0417289733887, |
| "epoch": 0.4777142857142857, |
| "grad_norm": 0.03444957733154297, |
| "kl": 0.00030410289764404297, |
| "lambda_div_used": 0.5850719586014748, |
| "learning_rate": 1.7345605894346726e-07, |
| "loss": -0.0603, |
| "reward": 0.03764221305027604, |
| "reward_after_mean": 0.03764221305027604, |
| "reward_after_std": 0.5699926447123289, |
| "reward_before_mean": 0.5322978757321835, |
| "reward_before_std": 0.3904368221992627, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4946556333452463, |
| "reward_change_min": -0.6658468469977379, |
| "reward_change_std": 0.2617466766387224, |
| "reward_std": 0.5699926633387804, |
| "rewards/accuracy_reward": 0.3750000037252903, |
| "rewards/cosine_scaled_reward": 0.15729784907307476, |
| "step": 418 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2422.062515258789, |
| "epoch": 0.47885714285714287, |
| "grad_norm": 0.023973578587174416, |
| "kl": 0.0002415478229522705, |
| "lambda_div_used": 0.5876604542136192, |
| "learning_rate": 1.7174502842694212e-07, |
| "loss": 0.0464, |
| "reward": -0.01938623934984207, |
| "reward_after_mean": -0.01938623934984207, |
| "reward_after_std": 0.531193170696497, |
| "reward_before_mean": 0.4354735445231199, |
| "reward_before_std": 0.4033324606716633, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.45485977828502655, |
| "reward_change_min": -0.6743562705814838, |
| "reward_change_std": 0.2555869175121188, |
| "reward_std": 0.5311931855976582, |
| "rewards/accuracy_reward": 0.33333333395421505, |
| "rewards/cosine_scaled_reward": 0.10214020684361458, |
| "step": 419 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1696.083381652832, |
| "epoch": 0.48, |
| "grad_norm": 0.04010794684290886, |
| "kl": 0.0002671480178833008, |
| "lambda_div_used": 0.5893955454230309, |
| "learning_rate": 1.7005243352409333e-07, |
| "loss": -0.0499, |
| "reward": -0.18572357669472694, |
| "reward_after_mean": -0.18572357669472694, |
| "reward_after_std": 0.4607646930962801, |
| "reward_before_mean": 0.12988583371043205, |
| "reward_before_std": 0.416760787833482, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.31560939736664295, |
| "reward_change_min": -0.5215952098369598, |
| "reward_change_std": 0.1989196827635169, |
| "reward_std": 0.4607647117227316, |
| "rewards/accuracy_reward": 0.2708333358168602, |
| "rewards/cosine_scaled_reward": -0.14094750862568617, |
| "step": 420 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2925.3333892822266, |
| "epoch": 0.48114285714285715, |
| "grad_norm": 0.025533905252814293, |
| "kl": 0.0003809928894042969, |
| "lambda_div_used": 0.6258808895945549, |
| "learning_rate": 1.6837835672960831e-07, |
| "loss": 0.0252, |
| "reward": -0.1876915767788887, |
| "reward_after_mean": -0.1876915767788887, |
| "reward_after_std": 0.6121686920523643, |
| "reward_before_mean": 0.07189313881099224, |
| "reward_before_std": 0.5847431821748614, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.25958471931517124, |
| "reward_change_min": -0.5139855779707432, |
| "reward_change_std": 0.1843523010611534, |
| "reward_std": 0.6121687144041061, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/cosine_scaled_reward": -0.09477353328838944, |
| "step": 421 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2811.4166946411133, |
| "epoch": 0.48228571428571426, |
| "grad_norm": 0.020569510757923126, |
| "kl": 0.00035816431045532227, |
| "lambda_div_used": 0.5612113624811172, |
| "learning_rate": 1.6672287963562852e-07, |
| "loss": 0.0257, |
| "reward": -0.22088398411870003, |
| "reward_after_mean": -0.22088398411870003, |
| "reward_after_std": 0.37868294678628445, |
| "reward_before_mean": 0.17238148115575314, |
| "reward_before_std": 0.27797973807901144, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3932654559612274, |
| "reward_change_min": -0.5825164802372456, |
| "reward_change_std": 0.22011223249137402, |
| "reward_std": 0.3786829560995102, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/cosine_scaled_reward": -0.015118520706892014, |
| "step": 422 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2948.2916870117188, |
| "epoch": 0.48342857142857143, |
| "grad_norm": 0.021468866616487503, |
| "kl": 0.0003040432929992676, |
| "lambda_div_used": 0.6255258545279503, |
| "learning_rate": 1.6508608292777203e-07, |
| "loss": -0.0097, |
| "reward": -0.10086626000702381, |
| "reward_after_mean": -0.10086626000702381, |
| "reward_after_std": 0.5887319762259722, |
| "reward_before_mean": 0.19233786687254906, |
| "reward_before_std": 0.5806169025599957, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2932041045278311, |
| "reward_change_min": -0.5194867514073849, |
| "reward_change_std": 0.19756229128688574, |
| "reward_std": 0.5887319948524237, |
| "rewards/accuracy_reward": 0.22916667349636555, |
| "rewards/cosine_scaled_reward": -0.03682881221175194, |
| "step": 423 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2725.6041946411133, |
| "epoch": 0.4845714285714286, |
| "grad_norm": 0.025675497949123383, |
| "kl": 0.00032907724380493164, |
| "lambda_div_used": 0.6430495753884315, |
| "learning_rate": 1.6346804638120098e-07, |
| "loss": -0.0044, |
| "reward": -0.06141174025833607, |
| "reward_after_mean": -0.06141174025833607, |
| "reward_after_std": 0.6873566564172506, |
| "reward_before_mean": 0.2285282697994262, |
| "reward_before_std": 0.664992194622755, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.28994001634418964, |
| "reward_change_min": -0.5423923581838608, |
| "reward_change_std": 0.20123817585408688, |
| "reward_std": 0.6873566769063473, |
| "rewards/accuracy_reward": 0.25000000931322575, |
| "rewards/cosine_scaled_reward": -0.021471746265888214, |
| "step": 424 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1639.2292175292969, |
| "epoch": 0.4857142857142857, |
| "grad_norm": 0.025542836636304855, |
| "kl": 0.00018703937530517578, |
| "lambda_div_used": 0.6579956188797951, |
| "learning_rate": 1.6186884885673413e-07, |
| "loss": 0.0441, |
| "reward": 0.5941350422799587, |
| "reward_after_mean": 0.5941350422799587, |
| "reward_after_std": 0.8043180033564568, |
| "reward_before_mean": 1.2064684219658375, |
| "reward_before_std": 0.7356861205771565, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.6123334169387817, |
| "reward_change_min": -0.9342719316482544, |
| "reward_change_std": 0.38359352573752403, |
| "reward_std": 0.804318018257618, |
| "rewards/accuracy_reward": 0.7500000186264515, |
| "rewards/cosine_scaled_reward": 0.456468403339386, |
| "step": 425 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2131.9375381469727, |
| "epoch": 0.4868571428571429, |
| "grad_norm": 0.029882676899433136, |
| "kl": 0.00031810998916625977, |
| "lambda_div_used": 0.6198792308568954, |
| "learning_rate": 1.6028856829700258e-07, |
| "loss": -0.0153, |
| "reward": 0.03275429271161556, |
| "reward_after_mean": 0.03275429271161556, |
| "reward_after_std": 0.6299177911132574, |
| "reward_before_mean": 0.4296752456575632, |
| "reward_before_std": 0.5544933034107089, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3969209287315607, |
| "reward_change_min": -0.6235288828611374, |
| "reward_change_std": 0.24328004382550716, |
| "reward_std": 0.6299178209155798, |
| "rewards/accuracy_reward": 0.37500000931322575, |
| "rewards/cosine_scaled_reward": 0.05467522703111172, |
| "step": 426 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3065.3958587646484, |
| "epoch": 0.488, |
| "grad_norm": 0.024560794234275818, |
| "kl": 0.0003638267517089844, |
| "lambda_div_used": 0.5728301778435707, |
| "learning_rate": 1.5872728172265146e-07, |
| "loss": 0.0713, |
| "reward": -0.181079788133502, |
| "reward_after_mean": -0.181079788133502, |
| "reward_after_std": 0.4291039705276489, |
| "reward_before_mean": 0.19762573204934597, |
| "reward_before_std": 0.3286724528297782, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3787055220454931, |
| "reward_change_min": -0.5433184914290905, |
| "reward_change_std": 0.20709905866533518, |
| "reward_std": 0.4291039779782295, |
| "rewards/accuracy_reward": 0.25000000558793545, |
| "rewards/cosine_scaled_reward": -0.05237427353858948, |
| "step": 427 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2331.416702270508, |
| "epoch": 0.48914285714285716, |
| "grad_norm": 0.027837947010993958, |
| "kl": 0.00027292966842651367, |
| "lambda_div_used": 0.6576317623257637, |
| "learning_rate": 1.5718506522858572e-07, |
| "loss": 0.0627, |
| "reward": 0.037938148714601994, |
| "reward_after_mean": 0.037938148714601994, |
| "reward_after_std": 0.7411033473908901, |
| "reward_before_mean": 0.3408977910876274, |
| "reward_before_std": 0.7301055882126093, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3029596321284771, |
| "reward_change_min": -0.5550829358398914, |
| "reward_change_std": 0.21431603003293276, |
| "reward_std": 0.7411033622920513, |
| "rewards/accuracy_reward": 0.3125000074505806, |
| "rewards/cosine_scaled_reward": 0.028397773392498493, |
| "step": 428 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2035.8750381469727, |
| "epoch": 0.49028571428571427, |
| "grad_norm": 0.0320022888481617, |
| "kl": 0.00036203861236572266, |
| "lambda_div_used": 0.6039041504263878, |
| "learning_rate": 1.5566199398026147e-07, |
| "loss": -0.0493, |
| "reward": -0.09264844097197056, |
| "reward_after_mean": -0.09264844097197056, |
| "reward_after_std": 0.5593565441668034, |
| "reward_before_mean": 0.28112196549773216, |
| "reward_before_std": 0.4800149817019701, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3737703934311867, |
| "reward_change_min": -0.602749090641737, |
| "reward_change_std": 0.22795243095606565, |
| "reward_std": 0.5593565553426743, |
| "rewards/accuracy_reward": 0.27083333767950535, |
| "rewards/cosine_scaled_reward": 0.010288612451404333, |
| "step": 429 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2334.7083778381348, |
| "epoch": 0.49142857142857144, |
| "grad_norm": 0.02567731775343418, |
| "kl": 0.00031498074531555176, |
| "lambda_div_used": 0.609040379524231, |
| "learning_rate": 1.5415814221002265e-07, |
| "loss": 0.0017, |
| "reward": -0.11068469006568193, |
| "reward_after_mean": -0.11068469006568193, |
| "reward_after_std": 0.5843690279871225, |
| "reward_before_mean": 0.24647100269794464, |
| "reward_before_std": 0.5133073255419731, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.35715569369494915, |
| "reward_change_min": -0.6265733800828457, |
| "reward_change_std": 0.23320611286908388, |
| "reward_std": 0.5843690391629934, |
| "rewards/accuracy_reward": 0.2708333358168602, |
| "rewards/cosine_scaled_reward": -0.02436233009211719, |
| "step": 430 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2203.125015258789, |
| "epoch": 0.49257142857142855, |
| "grad_norm": 0.030070627108216286, |
| "kl": 0.0003618001937866211, |
| "lambda_div_used": 0.5612699165940285, |
| "learning_rate": 1.5267358321348285e-07, |
| "loss": 0.0147, |
| "reward": -0.21507295966148376, |
| "reward_after_mean": -0.21507295966148376, |
| "reward_after_std": 0.37470250017941, |
| "reward_before_mean": 0.18050049245357513, |
| "reward_before_std": 0.2745527196675539, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.39557345397770405, |
| "reward_change_min": -0.5842532999813557, |
| "reward_change_std": 0.21895906049758196, |
| "reward_std": 0.37470250204205513, |
| "rewards/accuracy_reward": 0.2083333358168602, |
| "rewards/cosine_scaled_reward": -0.027832843363285065, |
| "step": 431 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2875.2917098999023, |
| "epoch": 0.4937142857142857, |
| "grad_norm": 0.021773481741547585, |
| "kl": 0.00034308433532714844, |
| "lambda_div_used": 0.6463272646069527, |
| "learning_rate": 1.5120838934595337e-07, |
| "loss": -0.0175, |
| "reward": 0.0671270489692688, |
| "reward_after_mean": 0.0671270489692688, |
| "reward_after_std": 0.6703518275171518, |
| "reward_before_mean": 0.3971955068409443, |
| "reward_before_std": 0.6849911892786622, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3300684615969658, |
| "reward_change_min": -0.6204027272760868, |
| "reward_change_std": 0.24090207554399967, |
| "reward_std": 0.6703518535941839, |
| "rewards/accuracy_reward": 0.3333333432674408, |
| "rewards/cosine_scaled_reward": 0.06386216171085835, |
| "step": 432 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2982.0000610351562, |
| "epoch": 0.4948571428571429, |
| "grad_norm": 0.021818527951836586, |
| "kl": 0.0003089308738708496, |
| "lambda_div_used": 0.5629478171467781, |
| "learning_rate": 1.4976263201891613e-07, |
| "loss": -0.0006, |
| "reward": -0.06292321160435677, |
| "reward_after_mean": -0.06292321160435677, |
| "reward_after_std": 0.43814039044082165, |
| "reward_before_mean": 0.4110525958240032, |
| "reward_before_std": 0.28899803664535284, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4739758223295212, |
| "reward_change_min": -0.672174334526062, |
| "reward_change_std": 0.25748884305357933, |
| "reward_std": 0.4381403960287571, |
| "rewards/accuracy_reward": 0.3333333358168602, |
| "rewards/cosine_scaled_reward": 0.07771925255656242, |
| "step": 433 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2822.6458587646484, |
| "epoch": 0.496, |
| "grad_norm": 0.024085119366645813, |
| "kl": 0.0003399848937988281, |
| "lambda_div_used": 0.5637771561741829, |
| "learning_rate": 1.483363816965435e-07, |
| "loss": 0.0353, |
| "reward": -0.39865921065211296, |
| "reward_after_mean": -0.39865921065211296, |
| "reward_after_std": 0.34102493710815907, |
| "reward_before_mean": -0.12386159785091877, |
| "reward_before_std": 0.2913210419937968, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2747976202517748, |
| "reward_change_min": -0.4381771683692932, |
| "reward_change_std": 0.16477954387664795, |
| "reward_std": 0.34102495945990086, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/cosine_scaled_reward": -0.18636160157620907, |
| "step": 434 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2098.333366394043, |
| "epoch": 0.49714285714285716, |
| "grad_norm": 0.03476332873106003, |
| "kl": 0.0004093945026397705, |
| "lambda_div_used": 0.561927042901516, |
| "learning_rate": 1.469297078922642e-07, |
| "loss": 0.012, |
| "reward": -0.23742017894983292, |
| "reward_after_mean": -0.23742017894983292, |
| "reward_after_std": 0.3864587936550379, |
| "reward_before_mean": 0.1438802983611822, |
| "reward_before_std": 0.28451414965093136, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3813004810363054, |
| "reward_change_min": -0.5687282234430313, |
| "reward_change_std": 0.21544194873422384, |
| "reward_std": 0.3864588178694248, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/cosine_scaled_reward": -0.043619705364108086, |
| "step": 435 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1956.6666984558105, |
| "epoch": 0.4982857142857143, |
| "grad_norm": 0.03724474087357521, |
| "kl": 0.00024643540382385254, |
| "lambda_div_used": 0.5972427576780319, |
| "learning_rate": 1.4554267916537495e-07, |
| "loss": -0.0215, |
| "reward": 0.24212833493947983, |
| "reward_after_mean": 0.24212833493947983, |
| "reward_after_std": 0.5586434360593557, |
| "reward_before_mean": 0.7999392561614513, |
| "reward_before_std": 0.44878256041556597, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5578109100461006, |
| "reward_change_min": -0.789992418140173, |
| "reward_change_std": 0.32050481624901295, |
| "reward_std": 0.5586434435099363, |
| "rewards/accuracy_reward": 0.5208333432674408, |
| "rewards/cosine_scaled_reward": 0.27910589799284935, |
| "step": 436 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2701.1458587646484, |
| "epoch": 0.49942857142857144, |
| "grad_norm": 0.02587887831032276, |
| "kl": 0.0003287792205810547, |
| "lambda_div_used": 0.5530718564987183, |
| "learning_rate": 1.4417536311769885e-07, |
| "loss": -0.0406, |
| "reward": -0.2597166027408093, |
| "reward_after_mean": -0.2597166027408093, |
| "reward_after_std": 0.3689497411251068, |
| "reward_before_mean": 0.13261681143194437, |
| "reward_before_std": 0.23820086661726236, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.39233342185616493, |
| "reward_change_min": -0.5372026227414608, |
| "reward_change_std": 0.20350486412644386, |
| "reward_std": 0.3689497448503971, |
| "rewards/accuracy_reward": 0.14583333395421505, |
| "rewards/cosine_scaled_reward": -0.013216521823778749, |
| "step": 437 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2737.6667404174805, |
| "epoch": 0.5005714285714286, |
| "grad_norm": 0.02056184597313404, |
| "kl": 0.0002714395523071289, |
| "lambda_div_used": 0.6291297823190689, |
| "learning_rate": 1.4282782639029128e-07, |
| "loss": -0.0391, |
| "reward": 0.011486291885375977, |
| "reward_after_mean": 0.011486291885375977, |
| "reward_after_std": 0.6032967660576105, |
| "reward_before_mean": 0.34824367985129356, |
| "reward_before_std": 0.6035201866179705, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3367573842406273, |
| "reward_change_min": -0.588931929320097, |
| "reward_change_std": 0.23199212551116943, |
| "reward_std": 0.6032967790961266, |
| "rewards/accuracy_reward": 0.2916666753590107, |
| "rewards/cosine_scaled_reward": 0.05657700449228287, |
| "step": 438 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2417.875057220459, |
| "epoch": 0.5017142857142857, |
| "grad_norm": 0.02567973919212818, |
| "kl": 0.00029343366622924805, |
| "lambda_div_used": 0.6277910619974136, |
| "learning_rate": 1.4150013466019114e-07, |
| "loss": 0.013, |
| "reward": -0.039602138102054596, |
| "reward_after_mean": -0.039602138102054596, |
| "reward_after_std": 0.6028888281434774, |
| "reward_before_mean": 0.2732508610934019, |
| "reward_before_std": 0.5980509808287024, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3128530103713274, |
| "reward_change_min": -0.5787924043834209, |
| "reward_change_std": 0.21938505861908197, |
| "reward_std": 0.6028888486325741, |
| "rewards/accuracy_reward": 0.2500000074505806, |
| "rewards/cosine_scaled_reward": 0.023250849917531013, |
| "step": 439 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2942.770835876465, |
| "epoch": 0.5028571428571429, |
| "grad_norm": 0.028702648356556892, |
| "kl": 0.00038042664527893066, |
| "lambda_div_used": 0.5716730058193207, |
| "learning_rate": 1.4019235263722034e-07, |
| "loss": -0.0684, |
| "reward": -0.40989339258521795, |
| "reward_after_mean": -0.40989339258521795, |
| "reward_after_std": 0.42086669616401196, |
| "reward_before_mean": -0.15851380862295628, |
| "reward_before_std": 0.3258522395044565, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2513795755803585, |
| "reward_change_min": -0.3500328026711941, |
| "reward_change_std": 0.13096946012228727, |
| "reward_std": 0.42086671106517315, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.2001804756000638, |
| "step": 440 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2898.250015258789, |
| "epoch": 0.504, |
| "grad_norm": 0.025851793587207794, |
| "kl": 0.0004258155822753906, |
| "lambda_div_used": 0.5947419032454491, |
| "learning_rate": 1.3890454406082956e-07, |
| "loss": 0.0188, |
| "reward": -0.09268893860280514, |
| "reward_after_mean": -0.09268893860280514, |
| "reward_after_std": 0.527678394690156, |
| "reward_before_mean": 0.28031823271885514, |
| "reward_before_std": 0.43342068372294307, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.373007170855999, |
| "reward_change_min": -0.5688190311193466, |
| "reward_change_std": 0.2149599390104413, |
| "reward_std": 0.5276784114539623, |
| "rewards/accuracy_reward": 0.31250000558793545, |
| "rewards/cosine_scaled_reward": -0.03218177333474159, |
| "step": 441 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2738.312530517578, |
| "epoch": 0.5051428571428571, |
| "grad_norm": 0.024453362450003624, |
| "kl": 0.0003438591957092285, |
| "lambda_div_used": 0.6373191103339195, |
| "learning_rate": 1.3763677169699217e-07, |
| "loss": -0.0098, |
| "reward": -0.11879788711667061, |
| "reward_after_mean": -0.11879788711667061, |
| "reward_after_std": 0.6699451506137848, |
| "reward_before_mean": 0.15645072294864804, |
| "reward_before_std": 0.6331657916307449, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27524859458208084, |
| "reward_change_min": -0.4614573121070862, |
| "reward_change_std": 0.1758969947695732, |
| "reward_std": 0.669945165514946, |
| "rewards/accuracy_reward": 0.20833334140479565, |
| "rewards/cosine_scaled_reward": -0.051882621832191944, |
| "step": 442 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3111.3541870117188, |
| "epoch": 0.5062857142857143, |
| "grad_norm": 0.018424130976200104, |
| "kl": 0.0003415346145629883, |
| "lambda_div_used": 0.594095878303051, |
| "learning_rate": 1.3638909733514452e-07, |
| "loss": -0.0021, |
| "reward": -0.13722801115363836, |
| "reward_after_mean": -0.13722801115363836, |
| "reward_after_std": 0.5484136454761028, |
| "reward_before_mean": 0.23274649307131767, |
| "reward_before_std": 0.4324050806462765, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.36997453309595585, |
| "reward_change_min": -0.530858725309372, |
| "reward_change_std": 0.2040829285979271, |
| "reward_std": 0.5484136454761028, |
| "rewards/accuracy_reward": 0.2500000037252903, |
| "rewards/cosine_scaled_reward": -0.01725347526371479, |
| "step": 443 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2850.6666984558105, |
| "epoch": 0.5074285714285715, |
| "grad_norm": 0.023973438888788223, |
| "kl": 0.0003078579902648926, |
| "lambda_div_used": 0.5948461815714836, |
| "learning_rate": 1.351615817851748e-07, |
| "loss": -0.0055, |
| "reward": -0.1747817099094391, |
| "reward_after_mean": -0.1747817099094391, |
| "reward_after_std": 0.4692641645669937, |
| "reward_before_mean": 0.1552269384264946, |
| "reward_before_std": 0.4344164803624153, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3300086557865143, |
| "reward_change_min": -0.4955419562757015, |
| "reward_change_std": 0.19987357687205076, |
| "reward_std": 0.4692641757428646, |
| "rewards/accuracy_reward": 0.20833334140479565, |
| "rewards/cosine_scaled_reward": -0.05310639180243015, |
| "step": 444 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2896.1458892822266, |
| "epoch": 0.5085714285714286, |
| "grad_norm": 0.021344272419810295, |
| "kl": 0.000371396541595459, |
| "lambda_div_used": 0.5911799594759941, |
| "learning_rate": 1.3395428487445914e-07, |
| "loss": 0.0204, |
| "reward": -0.09663986414670944, |
| "reward_after_mean": -0.09663986414670944, |
| "reward_after_std": 0.5167377535253763, |
| "reward_before_mean": 0.29358627926558256, |
| "reward_before_std": 0.4120226204395294, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3902261406183243, |
| "reward_change_min": -0.5846884902566671, |
| "reward_change_std": 0.21900581941008568, |
| "reward_std": 0.5167377851903439, |
| "rewards/accuracy_reward": 0.31250000558793545, |
| "rewards/cosine_scaled_reward": -0.018913742154836655, |
| "step": 445 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2849.7083892822266, |
| "epoch": 0.5097142857142857, |
| "grad_norm": 0.020637815818190575, |
| "kl": 0.0003114938735961914, |
| "lambda_div_used": 0.5921742841601372, |
| "learning_rate": 1.3276726544494571e-07, |
| "loss": 0.0358, |
| "reward": -0.19162439927458763, |
| "reward_after_mean": -0.19162439927458763, |
| "reward_after_std": 0.4613885171711445, |
| "reward_before_mean": 0.12132475152611732, |
| "reward_before_std": 0.42135200183838606, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.31294916570186615, |
| "reward_change_min": -0.5426378659904003, |
| "reward_change_std": 0.196873115375638, |
| "reward_std": 0.4613885283470154, |
| "rewards/accuracy_reward": 0.1875000074505806, |
| "rewards/cosine_scaled_reward": -0.06617523916065693, |
| "step": 446 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1833.833351135254, |
| "epoch": 0.5108571428571429, |
| "grad_norm": 0.03296150267124176, |
| "kl": 0.00031435489654541016, |
| "lambda_div_used": 0.5761597007513046, |
| "learning_rate": 1.316005813502869e-07, |
| "loss": -0.0092, |
| "reward": -0.1308344192802906, |
| "reward_after_mean": -0.1308344192802906, |
| "reward_after_std": 0.44760639779269695, |
| "reward_before_mean": 0.27170680463314056, |
| "reward_before_std": 0.3468586690723896, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.40254124999046326, |
| "reward_change_min": -0.6140144616365433, |
| "reward_change_std": 0.23130866140127182, |
| "reward_std": 0.44760641269385815, |
| "rewards/accuracy_reward": 0.2708333395421505, |
| "rewards/cosine_scaled_reward": 0.0008734846487641335, |
| "step": 447 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2031.1875228881836, |
| "epoch": 0.512, |
| "grad_norm": 0.03561374545097351, |
| "kl": 0.0002592802047729492, |
| "lambda_div_used": 0.5793976187705994, |
| "learning_rate": 1.3045428945301953e-07, |
| "loss": 0.0625, |
| "reward": -0.15141154546290636, |
| "reward_after_mean": -0.15141154546290636, |
| "reward_after_std": 0.47303674556314945, |
| "reward_before_mean": 0.2355510238558054, |
| "reward_before_std": 0.36865185387432575, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.38696256279945374, |
| "reward_change_min": -0.6033525615930557, |
| "reward_change_std": 0.22703420650213957, |
| "reward_std": 0.47303677164018154, |
| "rewards/accuracy_reward": 0.2291666679084301, |
| "rewards/cosine_scaled_reward": 0.00638435548171401, |
| "step": 448 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2527.416679382324, |
| "epoch": 0.5131428571428571, |
| "grad_norm": 0.028000032529234886, |
| "kl": 0.0003337860107421875, |
| "lambda_div_used": 0.5869953334331512, |
| "learning_rate": 1.2932844562179352e-07, |
| "loss": -0.0384, |
| "reward": -0.2571147223934531, |
| "reward_after_mean": -0.2571147223934531, |
| "reward_after_std": 0.44822895526885986, |
| "reward_before_mean": 0.04588266555219889, |
| "reward_before_std": 0.39473184011876583, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.30299739353358746, |
| "reward_change_min": -0.4681765213608742, |
| "reward_change_std": 0.17783036269247532, |
| "reward_std": 0.4482289757579565, |
| "rewards/accuracy_reward": 0.20833334140479565, |
| "rewards/cosine_scaled_reward": -0.1624506814405322, |
| "step": 449 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2360.3125228881836, |
| "epoch": 0.5142857142857142, |
| "grad_norm": 0.028984738513827324, |
| "kl": 0.00030869245529174805, |
| "lambda_div_used": 0.5782932788133621, |
| "learning_rate": 1.2822310472864885e-07, |
| "loss": -0.0103, |
| "reward": -0.1581341177225113, |
| "reward_after_mean": -0.1581341177225113, |
| "reward_after_std": 0.45438094437122345, |
| "reward_before_mean": 0.22925141779705882, |
| "reward_before_std": 0.3587577445432544, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3873855248093605, |
| "reward_change_min": -0.5598408095538616, |
| "reward_change_std": 0.21874273754656315, |
| "reward_std": 0.4543809536844492, |
| "rewards/accuracy_reward": 0.22916666977107525, |
| "rewards/cosine_scaled_reward": 8.474662899971008e-05, |
| "step": 450 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2468.1667098999023, |
| "epoch": 0.5154285714285715, |
| "grad_norm": 0.035742077976465225, |
| "kl": 0.0004057884216308594, |
| "lambda_div_used": 0.5319794341921806, |
| "learning_rate": 1.2713832064634125e-07, |
| "loss": -0.0206, |
| "reward": -0.24930068850517273, |
| "reward_after_mean": -0.24930068850517273, |
| "reward_after_std": 0.3233966138213873, |
| "reward_before_mean": 0.2043198449537158, |
| "reward_before_std": 0.1404099608771503, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4536205381155014, |
| "reward_change_min": -0.6192456483840942, |
| "reward_change_std": 0.23171372152864933, |
| "reward_std": 0.3233966249972582, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/cosine_scaled_reward": -0.04568016994744539, |
| "step": 451 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2740.4166831970215, |
| "epoch": 0.5165714285714286, |
| "grad_norm": 0.02392762340605259, |
| "kl": 0.0003084242343902588, |
| "lambda_div_used": 0.6302760690450668, |
| "learning_rate": 1.260741462457165e-07, |
| "loss": -0.0038, |
| "reward": 0.021876126527786255, |
| "reward_after_mean": 0.021876126527786255, |
| "reward_after_std": 0.5877971854060888, |
| "reward_before_mean": 0.37123518623411655, |
| "reward_before_std": 0.6054041795432568, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.34935908019542694, |
| "reward_change_min": -0.593828123062849, |
| "reward_change_std": 0.2436074260622263, |
| "reward_std": 0.5877972133457661, |
| "rewards/accuracy_reward": 0.3333333432674408, |
| "rewards/cosine_scaled_reward": 0.03790186531841755, |
| "step": 452 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2600.1667518615723, |
| "epoch": 0.5177142857142857, |
| "grad_norm": 0.03355313092470169, |
| "kl": 0.0003637075424194336, |
| "lambda_div_used": 0.6487660184502602, |
| "learning_rate": 1.2503063339313356e-07, |
| "loss": 0.0766, |
| "reward": 0.08280018530786037, |
| "reward_after_mean": 0.08280018530786037, |
| "reward_after_std": 0.721244465559721, |
| "reward_before_mean": 0.43781263194978237, |
| "reward_before_std": 0.6883194223046303, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.35501245222985744, |
| "reward_change_min": -0.6027462910860777, |
| "reward_change_std": 0.2280629277229309, |
| "reward_std": 0.7212444879114628, |
| "rewards/accuracy_reward": 0.3541666753590107, |
| "rewards/cosine_scaled_reward": 0.08364596217870712, |
| "step": 453 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2229.520866394043, |
| "epoch": 0.5188571428571429, |
| "grad_norm": 0.026976440101861954, |
| "kl": 0.0003204345703125, |
| "lambda_div_used": 0.586439348757267, |
| "learning_rate": 1.2400783294793668e-07, |
| "loss": 0.0213, |
| "reward": -0.06522449851036072, |
| "reward_after_mean": -0.06522449851036072, |
| "reward_after_std": 0.4758566189557314, |
| "reward_before_mean": 0.34830280393362045, |
| "reward_before_std": 0.4015544820576906, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4135272856801748, |
| "reward_change_min": -0.6215804703533649, |
| "reward_change_std": 0.24525572545826435, |
| "reward_std": 0.47585663571953773, |
| "rewards/accuracy_reward": 0.2916666716337204, |
| "rewards/cosine_scaled_reward": 0.05663611739873886, |
| "step": 454 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2910.729217529297, |
| "epoch": 0.52, |
| "grad_norm": 0.022959912195801735, |
| "kl": 0.0003757178783416748, |
| "lambda_div_used": 0.575455017387867, |
| "learning_rate": 1.2300579475997657e-07, |
| "loss": 0.0464, |
| "reward": -0.396820537163876, |
| "reward_after_mean": -0.396820537163876, |
| "reward_after_std": 0.42159392312169075, |
| "reward_before_mean": -0.14198972191661596, |
| "reward_before_std": 0.3413227070122957, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2548308204859495, |
| "reward_change_min": -0.36302991211414337, |
| "reward_change_std": 0.13639382366091013, |
| "reward_std": 0.4215939249843359, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.18365638982504606, |
| "step": 455 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2996.0417098999023, |
| "epoch": 0.5211428571428571, |
| "grad_norm": 0.022153589874505997, |
| "kl": 0.0003063082695007324, |
| "lambda_div_used": 0.6117950826883316, |
| "learning_rate": 1.220245676671809e-07, |
| "loss": 0.03, |
| "reward": -0.22662064619362354, |
| "reward_after_mean": -0.22662064619362354, |
| "reward_after_std": 0.5622703209519386, |
| "reward_before_mean": 0.03414517780765891, |
| "reward_before_std": 0.5151250278577209, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.26076582819223404, |
| "reward_change_min": -0.43476971983909607, |
| "reward_change_std": 0.165956006385386, |
| "reward_std": 0.5622703321278095, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.13252148625906557, |
| "step": 456 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2978.187530517578, |
| "epoch": 0.5222857142857142, |
| "grad_norm": 0.0224411953240633, |
| "kl": 0.0004011392593383789, |
| "lambda_div_used": 0.5359829142689705, |
| "learning_rate": 1.2106419949317388e-07, |
| "loss": -0.0311, |
| "reward": -0.2614034563302994, |
| "reward_after_mean": -0.2614034563302994, |
| "reward_after_std": 0.33165648579597473, |
| "reward_before_mean": 0.16703256964683533, |
| "reward_before_std": 0.157981239259243, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.42843602411448956, |
| "reward_change_min": -0.5874424390494823, |
| "reward_change_std": 0.21858325507491827, |
| "reward_std": 0.3316564913839102, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/cosine_scaled_reward": -0.08296743780374527, |
| "step": 457 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2119.250026702881, |
| "epoch": 0.5234285714285715, |
| "grad_norm": 0.03472839295864105, |
| "kl": 0.00028389692306518555, |
| "lambda_div_used": 0.6622679010033607, |
| "learning_rate": 1.2012473704494537e-07, |
| "loss": 0.0396, |
| "reward": 0.09622732177376747, |
| "reward_after_mean": 0.09622732177376747, |
| "reward_after_std": 0.7549249790608883, |
| "reward_before_mean": 0.4134064484387636, |
| "reward_before_std": 0.7546715997159481, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3171791285276413, |
| "reward_change_min": -0.5532067231833935, |
| "reward_change_std": 0.2206783127039671, |
| "reward_std": 0.7549249865114689, |
| "rewards/accuracy_reward": 0.3541666753590107, |
| "rewards/cosine_scaled_reward": 0.05923975070982124, |
| "step": 458 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1308.0000534057617, |
| "epoch": 0.5245714285714286, |
| "grad_norm": 0.034576416015625, |
| "kl": 0.00019240379333496094, |
| "lambda_div_used": 0.6211641430854797, |
| "learning_rate": 1.1920622611056974e-07, |
| "loss": 0.0136, |
| "reward": 0.10968395322561264, |
| "reward_after_mean": 0.10968395322561264, |
| "reward_after_std": 0.642698410898447, |
| "reward_before_mean": 0.5380105744116008, |
| "reward_before_std": 0.5641936575993896, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.42832658998668194, |
| "reward_change_min": -0.647167906165123, |
| "reward_change_std": 0.26214463263750076, |
| "reward_std": 0.6426984257996082, |
| "rewards/accuracy_reward": 0.37500000558793545, |
| "rewards/cosine_scaled_reward": 0.16301053576171398, |
| "step": 459 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2947.2916870117188, |
| "epoch": 0.5257142857142857, |
| "grad_norm": 0.022817425429821014, |
| "kl": 0.00036215782165527344, |
| "lambda_div_used": 0.6313923373818398, |
| "learning_rate": 1.1830871145697412e-07, |
| "loss": 0.0374, |
| "reward": -0.143511475995183, |
| "reward_after_mean": -0.143511475995183, |
| "reward_after_std": 0.6310819126665592, |
| "reward_before_mean": 0.12019729614257812, |
| "reward_before_std": 0.6178826270624995, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.26370877772569656, |
| "reward_change_min": -0.5500058270990849, |
| "reward_change_std": 0.197435456328094, |
| "reward_std": 0.6310819499194622, |
| "rewards/accuracy_reward": 0.1875000037252903, |
| "rewards/cosine_scaled_reward": -0.06730269826948643, |
| "step": 460 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3020.604232788086, |
| "epoch": 0.5268571428571428, |
| "grad_norm": 0.019895615056157112, |
| "kl": 0.0003611445426940918, |
| "lambda_div_used": 0.6468427553772926, |
| "learning_rate": 1.1743223682775649e-07, |
| "loss": 0.0226, |
| "reward": 0.13486449420452118, |
| "reward_after_mean": 0.13486449420452118, |
| "reward_after_std": 0.6662128213793039, |
| "reward_before_mean": 0.5074618738144636, |
| "reward_before_std": 0.6824358962476254, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3725973889231682, |
| "reward_change_min": -0.659518338739872, |
| "reward_change_std": 0.26068645529448986, |
| "reward_std": 0.6662128381431103, |
| "rewards/accuracy_reward": 0.3750000111758709, |
| "rewards/cosine_scaled_reward": 0.1324618849903345, |
| "step": 461 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2896.958354949951, |
| "epoch": 0.528, |
| "grad_norm": 0.027659112587571144, |
| "kl": 0.0003952980041503906, |
| "lambda_div_used": 0.5935230925679207, |
| "learning_rate": 1.1657684494105386e-07, |
| "loss": -0.0084, |
| "reward": -0.32939455355517566, |
| "reward_after_mean": -0.32939455355517566, |
| "reward_after_std": 0.4861418064683676, |
| "reward_before_mean": -0.06984398560598493, |
| "reward_before_std": 0.4248745897784829, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.25955056957900524, |
| "reward_change_min": -0.4244098737835884, |
| "reward_change_std": 0.1560937762260437, |
| "reward_std": 0.4861418195068836, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.15317732468247414, |
| "step": 462 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2485.375030517578, |
| "epoch": 0.5291428571428571, |
| "grad_norm": 0.02032829262316227, |
| "kl": 0.00022867321968078613, |
| "lambda_div_used": 0.6013497039675713, |
| "learning_rate": 1.1574257748745986e-07, |
| "loss": 0.1063, |
| "reward": 0.004297456704080105, |
| "reward_after_mean": 0.004297456704080105, |
| "reward_after_std": 0.5376875698566437, |
| "reward_before_mean": 0.42904988676309586, |
| "reward_before_std": 0.46889279037714005, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4247523993253708, |
| "reward_change_min": -0.6676977872848511, |
| "reward_change_std": 0.26014791429042816, |
| "reward_std": 0.5376875959336758, |
| "rewards/accuracy_reward": 0.3333333395421505, |
| "rewards/cosine_scaled_reward": 0.09571653697639704, |
| "step": 463 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1664.770881652832, |
| "epoch": 0.5302857142857142, |
| "grad_norm": 0.03104904294013977, |
| "kl": 0.0002219080924987793, |
| "lambda_div_used": 0.6030265837907791, |
| "learning_rate": 1.1492947512799328e-07, |
| "loss": -0.0781, |
| "reward": 0.027926755137741566, |
| "reward_after_mean": 0.027926755137741566, |
| "reward_after_std": 0.5750233307480812, |
| "reward_before_mean": 0.4541918604518287, |
| "reward_before_std": 0.4690140914171934, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4262651167809963, |
| "reward_change_min": -0.597686804831028, |
| "reward_change_std": 0.22887016367167234, |
| "reward_std": 0.5750233307480812, |
| "rewards/accuracy_reward": 0.4166666679084301, |
| "rewards/cosine_scaled_reward": 0.03752519562840462, |
| "step": 464 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2696.979248046875, |
| "epoch": 0.5314285714285715, |
| "grad_norm": 0.02595655620098114, |
| "kl": 0.0003947019577026367, |
| "lambda_div_used": 0.6469813883304596, |
| "learning_rate": 1.1413757749211602e-07, |
| "loss": 0.0015, |
| "reward": -0.005829358473420143, |
| "reward_after_mean": -0.005829358473420143, |
| "reward_after_std": 0.7018453720957041, |
| "reward_before_mean": 0.2963540703058243, |
| "reward_before_std": 0.687056201742962, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.30218344181776047, |
| "reward_change_min": -0.5677898563444614, |
| "reward_change_std": 0.2154219476506114, |
| "reward_std": 0.7018453869968653, |
| "rewards/accuracy_reward": 0.25000000558793545, |
| "rewards/cosine_scaled_reward": 0.04635407403111458, |
| "step": 465 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3051.750030517578, |
| "epoch": 0.5325714285714286, |
| "grad_norm": 0.022232649847865105, |
| "kl": 0.00035768747329711914, |
| "lambda_div_used": 0.6076068878173828, |
| "learning_rate": 1.1336692317580158e-07, |
| "loss": 0.0096, |
| "reward": -0.09472141414880753, |
| "reward_after_mean": -0.09472141414880753, |
| "reward_after_std": 0.5900444928556681, |
| "reward_before_mean": 0.2662593559361994, |
| "reward_before_std": 0.5045403479598463, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.36098079197108746, |
| "reward_change_min": -0.5804308690130711, |
| "reward_change_std": 0.22072006110101938, |
| "reward_std": 0.5900445096194744, |
| "rewards/accuracy_reward": 0.25000000186264515, |
| "rewards/cosine_scaled_reward": 0.016259355936199427, |
| "step": 466 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2831.6458702087402, |
| "epoch": 0.5337142857142857, |
| "grad_norm": 0.027867048978805542, |
| "kl": 0.0003955364227294922, |
| "lambda_div_used": 0.5972240790724754, |
| "learning_rate": 1.1261754973965422e-07, |
| "loss": 0.0064, |
| "reward": -0.16292368434369564, |
| "reward_after_mean": -0.16292368434369564, |
| "reward_after_std": 0.5499103963375092, |
| "reward_before_mean": 0.19015199813293293, |
| "reward_before_std": 0.4444003812968731, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3530757036060095, |
| "reward_change_min": -0.5379382502287626, |
| "reward_change_std": 0.1972823329269886, |
| "reward_std": 0.5499104224145412, |
| "rewards/accuracy_reward": 0.2083333358168602, |
| "rewards/cosine_scaled_reward": -0.018181337043642998, |
| "step": 467 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2942.583354949951, |
| "epoch": 0.5348571428571428, |
| "grad_norm": 0.021755212917923927, |
| "kl": 0.0003478527069091797, |
| "lambda_div_used": 0.5782695487141609, |
| "learning_rate": 1.1188949370707787e-07, |
| "loss": -0.0004, |
| "reward": -0.2234923504292965, |
| "reward_after_mean": -0.2234923504292965, |
| "reward_after_std": 0.4573483895510435, |
| "reward_before_mean": 0.13846815121360123, |
| "reward_before_std": 0.36004857218358666, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.36196050979197025, |
| "reward_change_min": -0.5418836548924446, |
| "reward_change_std": 0.2078724503517151, |
| "reward_std": 0.45734839886426926, |
| "rewards/accuracy_reward": 0.20833333395421505, |
| "rewards/cosine_scaled_reward": -0.06986518204212189, |
| "step": 468 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2679.208396911621, |
| "epoch": 0.536, |
| "grad_norm": 0.02693931572139263, |
| "kl": 0.00039780139923095703, |
| "lambda_div_used": 0.6250492706894875, |
| "learning_rate": 1.1118279056249653e-07, |
| "loss": 0.0181, |
| "reward": 0.01080943364650011, |
| "reward_after_mean": 0.01080943364650011, |
| "reward_after_std": 0.6565821636468172, |
| "reward_before_mean": 0.38899326138198376, |
| "reward_before_std": 0.5838505062274635, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.37818380631506443, |
| "reward_change_min": -0.6336386613547802, |
| "reward_change_std": 0.23969437181949615, |
| "reward_std": 0.6565821878612041, |
| "rewards/accuracy_reward": 0.31250000558793545, |
| "rewards/cosine_scaled_reward": 0.07649324322119355, |
| "step": 469 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2651.8333740234375, |
| "epoch": 0.5371428571428571, |
| "grad_norm": 0.022814009338617325, |
| "kl": 0.0003286600112915039, |
| "lambda_div_used": 0.5820565819740295, |
| "learning_rate": 1.1049747474962444e-07, |
| "loss": 0.062, |
| "reward": -0.1548374481499195, |
| "reward_after_mean": -0.1548374481499195, |
| "reward_after_std": 0.46373489685356617, |
| "reward_before_mean": 0.21819785539992154, |
| "reward_before_std": 0.37470409646630287, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3730353116989136, |
| "reward_change_min": -0.5532267577946186, |
| "reward_change_std": 0.2110730605199933, |
| "reward_std": 0.46373490430414677, |
| "rewards/accuracy_reward": 0.22916666977107525, |
| "rewards/cosine_scaled_reward": -0.010968813672661781, |
| "step": 470 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2898.7916946411133, |
| "epoch": 0.5382857142857143, |
| "grad_norm": 0.01961471140384674, |
| "kl": 0.00035419315099716187, |
| "lambda_div_used": 0.580303005874157, |
| "learning_rate": 1.0983357966978745e-07, |
| "loss": -0.0592, |
| "reward": -0.22997316345572472, |
| "reward_after_mean": -0.22997316345572472, |
| "reward_after_std": 0.4005854483693838, |
| "reward_before_mean": 0.09628471545875072, |
| "reward_before_std": 0.36470284312963486, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3262578770518303, |
| "reward_change_min": -0.5184299051761627, |
| "reward_change_std": 0.1985421497374773, |
| "reward_std": 0.40058545023202896, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.07038197666406631, |
| "step": 471 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2656.125030517578, |
| "epoch": 0.5394285714285715, |
| "grad_norm": 0.020427875220775604, |
| "kl": 0.00032085180282592773, |
| "lambda_div_used": 0.6381691917777061, |
| "learning_rate": 1.0919113768029517e-07, |
| "loss": -0.0459, |
| "reward": -0.11792396203964017, |
| "reward_after_mean": -0.11792396203964017, |
| "reward_after_std": 0.6783455964177847, |
| "reward_before_mean": 0.15370581997558475, |
| "reward_before_std": 0.6358637362718582, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2716297823935747, |
| "reward_change_min": -0.4567250721156597, |
| "reward_change_std": 0.1737262774258852, |
| "reward_std": 0.678345600143075, |
| "rewards/accuracy_reward": 0.18750000558793545, |
| "rewards/cosine_scaled_reward": -0.033794180024415255, |
| "step": 472 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2983.750015258789, |
| "epoch": 0.5405714285714286, |
| "grad_norm": 0.02453148551285267, |
| "kl": 0.0003961324691772461, |
| "lambda_div_used": 0.6016801968216896, |
| "learning_rate": 1.0857018009286381e-07, |
| "loss": 0.0472, |
| "reward": -0.006093651056289673, |
| "reward_after_mean": -0.006093651056289673, |
| "reward_after_std": 0.5333354268223047, |
| "reward_before_mean": 0.4089024979621172, |
| "reward_before_std": 0.4693258060142398, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4149961844086647, |
| "reward_change_min": -0.6490769572556019, |
| "reward_change_std": 0.25427413638681173, |
| "reward_std": 0.5333354473114014, |
| "rewards/accuracy_reward": 0.3333333395421505, |
| "rewards/cosine_scaled_reward": 0.07556918449699879, |
| "step": 473 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2252.104202270508, |
| "epoch": 0.5417142857142857, |
| "grad_norm": 0.02836003340780735, |
| "kl": 0.00038546323776245117, |
| "lambda_div_used": 0.5747748166322708, |
| "learning_rate": 1.0797073717209013e-07, |
| "loss": 0.0473, |
| "reward": 0.11940666288137436, |
| "reward_after_mean": 0.11940666288137436, |
| "reward_after_std": 0.5397788472473621, |
| "reward_before_mean": 0.7007539421319962, |
| "reward_before_std": 0.3393053291365504, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5813473239541054, |
| "reward_change_min": -0.7977702841162682, |
| "reward_change_std": 0.3083681631833315, |
| "reward_std": 0.5397788546979427, |
| "rewards/accuracy_reward": 0.47916666977107525, |
| "rewards/cosine_scaled_reward": 0.22158729657530785, |
| "step": 474 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2133.562568664551, |
| "epoch": 0.5428571428571428, |
| "grad_norm": 0.02848704345524311, |
| "kl": 0.0002709627151489258, |
| "lambda_div_used": 0.6917188391089439, |
| "learning_rate": 1.0739283813397639e-07, |
| "loss": 0.0352, |
| "reward": 0.2483121044933796, |
| "reward_after_mean": 0.2483121044933796, |
| "reward_after_std": 0.851221090182662, |
| "reward_before_mean": 0.5981418825685978, |
| "reward_before_std": 0.9012213246896863, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.34982976876199245, |
| "reward_change_min": -0.7087801471352577, |
| "reward_change_std": 0.28033955581486225, |
| "reward_std": 0.8512210976332426, |
| "rewards/accuracy_reward": 0.4375000111758709, |
| "rewards/cosine_scaled_reward": 0.16064186580479145, |
| "step": 475 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2319.3750381469727, |
| "epoch": 0.544, |
| "grad_norm": 0.02834029123187065, |
| "kl": 0.00039076805114746094, |
| "lambda_div_used": 0.6543590575456619, |
| "learning_rate": 1.068365111445064e-07, |
| "loss": 0.0905, |
| "reward": 0.3722646813839674, |
| "reward_after_mean": 0.3722646813839674, |
| "reward_after_std": 0.76073794439435, |
| "reward_before_mean": 0.8793929517269135, |
| "reward_before_std": 0.7163430340588093, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5071282722055912, |
| "reward_change_min": -0.8575163669884205, |
| "reward_change_std": 0.32718705013394356, |
| "reward_std": 0.7607379760593176, |
| "rewards/accuracy_reward": 0.5625000074505806, |
| "rewards/cosine_scaled_reward": 0.31689293240197003, |
| "step": 476 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1388.479190826416, |
| "epoch": 0.5451428571428572, |
| "grad_norm": 0.03197532892227173, |
| "kl": 0.00019982457160949707, |
| "lambda_div_used": 0.6069512218236923, |
| "learning_rate": 1.063017833182728e-07, |
| "loss": 0.0104, |
| "reward": 0.2797765755094588, |
| "reward_after_mean": 0.2797765755094588, |
| "reward_after_std": 0.6170946378260851, |
| "reward_before_mean": 0.8468085322529078, |
| "reward_before_std": 0.48898326186463237, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5670319274067879, |
| "reward_change_min": -0.8104716204106808, |
| "reward_change_std": 0.32000066339969635, |
| "reward_std": 0.6170946676284075, |
| "rewards/accuracy_reward": 0.520833345130086, |
| "rewards/cosine_scaled_reward": 0.325975195504725, |
| "step": 477 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2758.9375228881836, |
| "epoch": 0.5462857142857143, |
| "grad_norm": 0.023568512871861458, |
| "kl": 0.0003166794776916504, |
| "lambda_div_used": 0.6045826748013496, |
| "learning_rate": 1.0578868071715544e-07, |
| "loss": 0.0424, |
| "reward": 0.06877373531460762, |
| "reward_after_mean": 0.06877373531460762, |
| "reward_after_std": 0.5457048490643501, |
| "reward_before_mean": 0.5106075219810009, |
| "reward_before_std": 0.4815037827938795, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4418338183313608, |
| "reward_change_min": -0.6827114932239056, |
| "reward_change_std": 0.2656334117054939, |
| "reward_std": 0.5457048676908016, |
| "rewards/accuracy_reward": 0.3750000111758709, |
| "rewards/cosine_scaled_reward": 0.1356075219810009, |
| "step": 478 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2932.895866394043, |
| "epoch": 0.5474285714285714, |
| "grad_norm": 0.025046760216355324, |
| "kl": 0.00037872791290283203, |
| "lambda_div_used": 0.5930827036499977, |
| "learning_rate": 1.0529722834905125e-07, |
| "loss": 0.0013, |
| "reward": -0.29585114773362875, |
| "reward_after_mean": -0.29585114773362875, |
| "reward_after_std": 0.4981949180364609, |
| "reward_before_mean": -0.02313473215326667, |
| "reward_before_std": 0.4286517295986414, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.27271641232073307, |
| "reward_change_min": -0.4339797645807266, |
| "reward_change_std": 0.1596519472077489, |
| "reward_std": 0.4981949217617512, |
| "rewards/accuracy_reward": 0.1041666679084301, |
| "rewards/cosine_scaled_reward": -0.12730140378698707, |
| "step": 479 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2238.062515258789, |
| "epoch": 0.5485714285714286, |
| "grad_norm": 0.03429793193936348, |
| "kl": 0.0003292560577392578, |
| "lambda_div_used": 0.6356522366404533, |
| "learning_rate": 1.0482745016665526e-07, |
| "loss": 0.0622, |
| "reward": -0.19213765393942595, |
| "reward_after_mean": -0.19213765393942595, |
| "reward_after_std": 0.6865857243537903, |
| "reward_before_mean": 0.048641178291291, |
| "reward_before_std": 0.6263551618903875, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.24077884666621685, |
| "reward_change_min": -0.43974681198596954, |
| "reward_change_std": 0.15421691350638866, |
| "reward_std": 0.6865857467055321, |
| "rewards/accuracy_reward": 0.14583333767950535, |
| "rewards/cosine_scaled_reward": -0.09719215868972242, |
| "step": 480 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2533.666732788086, |
| "epoch": 0.5497142857142857, |
| "grad_norm": 0.01749316044151783, |
| "kl": 0.00031572580337524414, |
| "lambda_div_used": 0.6021129563450813, |
| "learning_rate": 1.0437936906629334e-07, |
| "loss": 0.0031, |
| "reward": -0.24226298835128546, |
| "reward_after_mean": -0.24226298835128546, |
| "reward_after_std": 0.5046281572431326, |
| "reward_before_mean": 0.02957908995449543, |
| "reward_before_std": 0.47462415788322687, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2718420699238777, |
| "reward_change_min": -0.45962032303214073, |
| "reward_change_std": 0.1774886343628168, |
| "reward_std": 0.5046281665563583, |
| "rewards/accuracy_reward": 0.14583333767950535, |
| "rewards/cosine_scaled_reward": -0.11625425447709858, |
| "step": 481 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2584.3958702087402, |
| "epoch": 0.5508571428571428, |
| "grad_norm": 0.027141164988279343, |
| "kl": 0.0004488229751586914, |
| "lambda_div_used": 0.6000719964504242, |
| "learning_rate": 1.0395300688680625e-07, |
| "loss": -0.0155, |
| "reward": 0.2671008687466383, |
| "reward_after_mean": 0.2671008687466383, |
| "reward_after_std": 0.6417696066200733, |
| "reward_before_mean": 0.8570855539292097, |
| "reward_before_std": 0.46050422452390194, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5899846386164427, |
| "reward_change_min": -0.8298100866377354, |
| "reward_change_std": 0.3227744400501251, |
| "reward_std": 0.6417696103453636, |
| "rewards/accuracy_reward": 0.5625000055879354, |
| "rewards/cosine_scaled_reward": 0.29458553344011307, |
| "step": 482 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2577.833366394043, |
| "epoch": 0.552, |
| "grad_norm": 0.022445959970355034, |
| "kl": 0.00040972232818603516, |
| "lambda_div_used": 0.5583987012505531, |
| "learning_rate": 1.0354838440848501e-07, |
| "loss": -0.0221, |
| "reward": -0.2919683400541544, |
| "reward_after_mean": -0.2919683400541544, |
| "reward_after_std": 0.37157695554196835, |
| "reward_before_mean": 0.06092929560691118, |
| "reward_before_std": 0.26688239723443985, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.35289763286709785, |
| "reward_change_min": -0.5401931628584862, |
| "reward_change_std": 0.19714731443673372, |
| "reward_std": 0.37157696671783924, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.10573737230151892, |
| "step": 483 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2279.06254196167, |
| "epoch": 0.5531428571428572, |
| "grad_norm": 0.027125045657157898, |
| "kl": 0.0002903938293457031, |
| "lambda_div_used": 0.6015297621488571, |
| "learning_rate": 1.0316552135205837e-07, |
| "loss": -0.0284, |
| "reward": 0.08303672191686928, |
| "reward_after_mean": 0.08303672191686928, |
| "reward_after_std": 0.5998833030462265, |
| "reward_before_mean": 0.5530420504510403, |
| "reward_before_std": 0.4744609510526061, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.47000533528625965, |
| "reward_change_min": -0.6774297691881657, |
| "reward_change_std": 0.2718982184305787, |
| "reward_std": 0.5998833123594522, |
| "rewards/accuracy_reward": 0.41666666977107525, |
| "rewards/cosine_scaled_reward": 0.13637536205351353, |
| "step": 484 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1672.458351135254, |
| "epoch": 0.5542857142857143, |
| "grad_norm": 0.03484974429011345, |
| "kl": 0.00024706125259399414, |
| "lambda_div_used": 0.5606790855526924, |
| "learning_rate": 1.0280443637773163e-07, |
| "loss": -0.0009, |
| "reward": -0.24691498838365078, |
| "reward_after_mean": -0.24691498838365078, |
| "reward_after_std": 0.4085298776626587, |
| "reward_before_mean": 0.13868734147399664, |
| "reward_before_std": 0.2728640455752611, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.38560234755277634, |
| "reward_change_min": -0.5570618100464344, |
| "reward_change_std": 0.20243172626942396, |
| "reward_std": 0.408529881387949, |
| "rewards/accuracy_reward": 0.18750000186264515, |
| "rewards/cosine_scaled_reward": -0.04881266225129366, |
| "step": 485 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1802.2083473205566, |
| "epoch": 0.5554285714285714, |
| "grad_norm": 0.03193674981594086, |
| "kl": 0.000291287899017334, |
| "lambda_div_used": 0.6145108714699745, |
| "learning_rate": 1.0246514708427701e-07, |
| "loss": 0.0763, |
| "reward": -0.15824460261501372, |
| "reward_after_mean": -0.15824460261501372, |
| "reward_after_std": 0.5667215548455715, |
| "reward_before_mean": 0.12157449871301651, |
| "reward_before_std": 0.5284877885133028, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2798191010951996, |
| "reward_change_min": -0.43175532296299934, |
| "reward_change_std": 0.17297773249447346, |
| "reward_std": 0.566721560433507, |
| "rewards/accuracy_reward": 0.2083333395421505, |
| "rewards/cosine_scaled_reward": -0.0867588329128921, |
| "step": 486 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1764.979190826416, |
| "epoch": 0.5565714285714286, |
| "grad_norm": 0.03538261726498604, |
| "kl": 0.0003072023391723633, |
| "lambda_div_used": 0.6353137269616127, |
| "learning_rate": 1.0214767000817596e-07, |
| "loss": 0.0034, |
| "reward": 0.4021994969807565, |
| "reward_after_mean": 0.4021994969807565, |
| "reward_after_std": 0.7458214424550533, |
| "reward_before_mean": 0.9773663654923439, |
| "reward_before_std": 0.6311542720068246, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.5751669164747, |
| "reward_change_min": -0.8273528963327408, |
| "reward_change_std": 0.3425491387024522, |
| "reward_std": 0.745821475982666, |
| "rewards/accuracy_reward": 0.6041666772216558, |
| "rewards/cosine_scaled_reward": 0.37319972552359104, |
| "step": 487 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2063.020866394043, |
| "epoch": 0.5577142857142857, |
| "grad_norm": 0.027028201147913933, |
| "kl": 0.0002803802490234375, |
| "lambda_div_used": 0.5719931498169899, |
| "learning_rate": 1.0185202062281336e-07, |
| "loss": 0.0247, |
| "reward": -0.2594454251229763, |
| "reward_after_mean": -0.2594454251229763, |
| "reward_after_std": 0.458538630977273, |
| "reward_before_mean": 0.09588468819856644, |
| "reward_before_std": 0.3299330030567944, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.35533010959625244, |
| "reward_change_min": -0.5170786269009113, |
| "reward_change_std": 0.19317798037081957, |
| "reward_std": 0.45853864029049873, |
| "rewards/accuracy_reward": 0.18750000186264515, |
| "rewards/cosine_scaled_reward": -0.09161530435085297, |
| "step": 488 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3084.8958892822266, |
| "epoch": 0.5588571428571428, |
| "grad_norm": 0.02722005918622017, |
| "kl": 0.00041961669921875, |
| "lambda_div_used": 0.5843943357467651, |
| "learning_rate": 1.0157821333772304e-07, |
| "loss": -0.012, |
| "reward": -0.31660015136003494, |
| "reward_after_mean": -0.31660015136003494, |
| "reward_after_std": 0.43002712167799473, |
| "reward_before_mean": -0.0431265402585268, |
| "reward_before_std": 0.38921575900167227, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.2734736017882824, |
| "reward_change_min": -0.4792550317943096, |
| "reward_change_std": 0.17569494806230068, |
| "reward_std": 0.4300271272659302, |
| "rewards/accuracy_reward": 0.1041666679084301, |
| "rewards/cosine_scaled_reward": -0.14729320164769888, |
| "step": 489 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2155.020851135254, |
| "epoch": 0.56, |
| "grad_norm": 0.024445833638310432, |
| "kl": 0.00034758448600769043, |
| "lambda_div_used": 0.5682637020945549, |
| "learning_rate": 1.013262614978859e-07, |
| "loss": -0.0323, |
| "reward": -0.13516036188229918, |
| "reward_after_mean": -0.13516036188229918, |
| "reward_after_std": 0.41841856203973293, |
| "reward_before_mean": 0.28103313967585564, |
| "reward_before_std": 0.307465685531497, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.41619347035884857, |
| "reward_change_min": -0.5914728902280331, |
| "reward_change_std": 0.22987399622797966, |
| "reward_std": 0.4184185788035393, |
| "rewards/accuracy_reward": 0.2916666716337204, |
| "rewards/cosine_scaled_reward": -0.01063353568315506, |
| "step": 490 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2476.8958740234375, |
| "epoch": 0.5611428571428572, |
| "grad_norm": 0.026141280308365822, |
| "kl": 0.0003236532211303711, |
| "lambda_div_used": 0.61311075091362, |
| "learning_rate": 1.0109617738307911e-07, |
| "loss": -0.0399, |
| "reward": 0.14587094401940703, |
| "reward_after_mean": 0.14587094401940703, |
| "reward_after_std": 0.6566581912338734, |
| "reward_before_mean": 0.6461835531517863, |
| "reward_before_std": 0.5190226640552282, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.500312577933073, |
| "reward_change_min": -0.7899926863610744, |
| "reward_change_std": 0.29301502648741007, |
| "reward_std": 0.6566582024097443, |
| "rewards/accuracy_reward": 0.45833334140479565, |
| "rewards/cosine_scaled_reward": 0.1878501633182168, |
| "step": 491 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2644.770835876465, |
| "epoch": 0.5622857142857143, |
| "grad_norm": 0.04309506341814995, |
| "kl": 0.000333636999130249, |
| "lambda_div_used": 0.5899444594979286, |
| "learning_rate": 1.0088797220727779e-07, |
| "loss": 0.012, |
| "reward": -0.1085394024848938, |
| "reward_after_mean": -0.1085394024848938, |
| "reward_after_std": 0.4626395758241415, |
| "reward_before_mean": 0.25940042175352573, |
| "reward_before_std": 0.4149730410426855, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3679398000240326, |
| "reward_change_min": -0.5678628534078598, |
| "reward_change_std": 0.223101656883955, |
| "reward_std": 0.46263957768678665, |
| "rewards/accuracy_reward": 0.2708333358168602, |
| "rewards/cosine_scaled_reward": -0.011432923376560211, |
| "step": 492 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1723.0000457763672, |
| "epoch": 0.5634285714285714, |
| "grad_norm": 0.029525646939873695, |
| "kl": 0.00028970837593078613, |
| "lambda_div_used": 0.6680872738361359, |
| "learning_rate": 1.0070165611810855e-07, |
| "loss": 0.0137, |
| "reward": 0.13114306051284075, |
| "reward_after_mean": 0.13114306051284075, |
| "reward_after_std": 0.8511836100369692, |
| "reward_before_mean": 0.4935412285849452, |
| "reward_before_std": 0.7856986094266176, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3623981699347496, |
| "reward_change_min": -0.587810892611742, |
| "reward_change_std": 0.22921003215014935, |
| "reward_std": 0.8511836417019367, |
| "rewards/accuracy_reward": 0.37500000558793545, |
| "rewards/cosine_scaled_reward": 0.11854122020304203, |
| "step": 493 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1625.3125915527344, |
| "epoch": 0.5645714285714286, |
| "grad_norm": 0.031883496791124344, |
| "kl": 0.00029462575912475586, |
| "lambda_div_used": 0.6554152071475983, |
| "learning_rate": 1.005372381963547e-07, |
| "loss": -0.0192, |
| "reward": 0.2886330671608448, |
| "reward_after_mean": 0.2886330671608448, |
| "reward_after_std": 0.7760931197553873, |
| "reward_before_mean": 0.7313874992541969, |
| "reward_before_std": 0.7224944466724992, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4427544269710779, |
| "reward_change_min": -0.7037210427224636, |
| "reward_change_std": 0.28225341718643904, |
| "reward_std": 0.776093129068613, |
| "rewards/accuracy_reward": 0.4583333432674408, |
| "rewards/cosine_scaled_reward": 0.2730541592463851, |
| "step": 494 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2404.104217529297, |
| "epoch": 0.5657142857142857, |
| "grad_norm": 0.025816943496465683, |
| "kl": 0.00029496103525161743, |
| "lambda_div_used": 0.6370417103171349, |
| "learning_rate": 1.0039472645551372e-07, |
| "loss": 0.0444, |
| "reward": -0.002785082906484604, |
| "reward_after_mean": -0.002785082906484604, |
| "reward_after_std": 0.7017532214522362, |
| "reward_before_mean": 0.35137681500054896, |
| "reward_before_std": 0.6334654297679663, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3541618827730417, |
| "reward_change_min": -0.61022624745965, |
| "reward_change_std": 0.2296114508062601, |
| "reward_std": 0.7017532512545586, |
| "rewards/accuracy_reward": 0.3333333395421505, |
| "rewards/cosine_scaled_reward": 0.018043467309325933, |
| "step": 495 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1731.9791717529297, |
| "epoch": 0.5668571428571428, |
| "grad_norm": 0.03966463729739189, |
| "kl": 0.00029587745666503906, |
| "lambda_div_used": 0.6256460249423981, |
| "learning_rate": 1.002741278414069e-07, |
| "loss": 0.0666, |
| "reward": 0.17603341676294804, |
| "reward_after_mean": 0.17603341676294804, |
| "reward_after_std": 0.636838972568512, |
| "reward_before_mean": 0.6388222957029939, |
| "reward_before_std": 0.586422567255795, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4627888761460781, |
| "reward_change_min": -0.7264026664197445, |
| "reward_change_std": 0.29389980621635914, |
| "reward_std": 0.6368389893323183, |
| "rewards/accuracy_reward": 0.4166666753590107, |
| "rewards/cosine_scaled_reward": 0.22215561103075743, |
| "step": 496 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2216.1250534057617, |
| "epoch": 0.568, |
| "grad_norm": 0.028931519016623497, |
| "kl": 0.0002690255641937256, |
| "lambda_div_used": 0.6521744430065155, |
| "learning_rate": 1.0017544823184055e-07, |
| "loss": 0.0104, |
| "reward": 0.271162249147892, |
| "reward_after_mean": 0.271162249147892, |
| "reward_after_std": 0.7392647787928581, |
| "reward_before_mean": 0.7059557363390923, |
| "reward_before_std": 0.7143486840650439, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.43479350954294205, |
| "reward_change_min": -0.7290924154222012, |
| "reward_change_std": 0.2943303110077977, |
| "reward_std": 0.7392647992819548, |
| "rewards/accuracy_reward": 0.4791666753590107, |
| "rewards/cosine_scaled_reward": 0.2267890479415655, |
| "step": 497 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2327.6250534057617, |
| "epoch": 0.5691428571428572, |
| "grad_norm": 0.02442491240799427, |
| "kl": 0.0003362894058227539, |
| "lambda_div_used": 0.659889928996563, |
| "learning_rate": 1.0009869243631952e-07, |
| "loss": 0.0113, |
| "reward": 0.31018914096057415, |
| "reward_after_mean": 0.31018914096057415, |
| "reward_after_std": 0.7232479602098465, |
| "reward_before_mean": 0.7269191518425941, |
| "reward_before_std": 0.741204846650362, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.41672998666763306, |
| "reward_change_min": -0.7166927941143513, |
| "reward_change_std": 0.2858916409313679, |
| "reward_std": 0.7232479825615883, |
| "rewards/accuracy_reward": 0.5000000111758709, |
| "rewards/cosine_scaled_reward": 0.22691912204027176, |
| "step": 498 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2263.500026702881, |
| "epoch": 0.5702857142857143, |
| "grad_norm": 0.024736450985074043, |
| "kl": 0.0002918243408203125, |
| "lambda_div_used": 0.60347481071949, |
| "learning_rate": 1.000438641958131e-07, |
| "loss": -0.0248, |
| "reward": 0.009699596092104912, |
| "reward_after_mean": 0.009699596092104912, |
| "reward_after_std": 0.5504223238676786, |
| "reward_before_mean": 0.4187678713351488, |
| "reward_before_std": 0.483045837841928, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.4090682379901409, |
| "reward_change_min": -0.6463241390883923, |
| "reward_change_std": 0.2534347465261817, |
| "reward_std": 0.5504223313182592, |
| "rewards/accuracy_reward": 0.3333333395421505, |
| "rewards/cosine_scaled_reward": 0.08543450571596622, |
| "step": 499 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2641.000030517578, |
| "epoch": 0.5714285714285714, |
| "grad_norm": 0.02830589935183525, |
| "kl": 0.00043022632598876953, |
| "lambda_div_used": 0.6449039503931999, |
| "learning_rate": 1.0001096618257236e-07, |
| "loss": 0.0892, |
| "reward": -0.016204694285988808, |
| "reward_after_mean": -0.016204694285988808, |
| "reward_after_std": 0.6894690785557032, |
| "reward_before_mean": 0.2851157810073346, |
| "reward_before_std": 0.6750934664160013, |
| "reward_change_max": 0.0, |
| "reward_change_mean": -0.3013204652816057, |
| "reward_change_min": -0.5549999382346869, |
| "reward_change_std": 0.20787928439676762, |
| "reward_std": 0.6894691102206707, |
| "rewards/accuracy_reward": 0.29166667722165585, |
| "rewards/cosine_scaled_reward": -0.006550896912813187, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.5714285714285714, |
| "step": 500, |
| "total_flos": 0.0, |
| "train_loss": 0.0046342451156378955, |
| "train_runtime": 100887.1622, |
| "train_samples_per_second": 0.238, |
| "train_steps_per_second": 0.005 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 500, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 10, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 6, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|