| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.11428571428571428, | |
| "eval_steps": 500, | |
| "global_step": 100, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2523.270866394043, | |
| "epoch": 0.001142857142857143, | |
| "grad_norm": 0.06767117232084274, | |
| "kl": 0.0, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 0.0, | |
| "loss": -0.0168, | |
| "reward": -0.06946920603513718, | |
| "reward_after_mean": -0.06946920603513718, | |
| "reward_after_std": 0.5871539004147053, | |
| "reward_before_mean": 0.17862090840935707, | |
| "reward_before_std": 0.5394803490489721, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2480901088565588, | |
| "reward_change_min": -0.44155892729759216, | |
| "reward_change_std": 0.16163883404806256, | |
| "reward_std": 0.5871539227664471, | |
| "rewards/accuracy_reward": 0.2500000037252903, | |
| "rewards/cosine_scaled_reward": -0.0713790925219655, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2684.583366394043, | |
| "epoch": 0.002285714285714286, | |
| "grad_norm": 0.07669047266244888, | |
| "kl": 0.0, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0043, | |
| "reward": 0.034156665205955505, | |
| "reward_after_mean": 0.034156665205955505, | |
| "reward_after_std": 0.46472928673028946, | |
| "reward_before_mean": 0.33918463438749313, | |
| "reward_before_std": 0.41114553064107895, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.30502799339592457, | |
| "reward_change_min": -0.46970658004283905, | |
| "reward_change_std": 0.17754351254552603, | |
| "reward_std": 0.46472930535674095, | |
| "rewards/accuracy_reward": 0.29166667349636555, | |
| "rewards/cosine_scaled_reward": 0.047517990693449974, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2952.6458587646484, | |
| "epoch": 0.0034285714285714284, | |
| "grad_norm": 0.08670737594366074, | |
| "kl": 5.030632019042969e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2e-07, | |
| "loss": -0.0823, | |
| "reward": -0.286656855372712, | |
| "reward_after_mean": -0.286656855372712, | |
| "reward_after_std": 0.3848829958587885, | |
| "reward_before_mean": -0.09138839645311236, | |
| "reward_before_std": 0.32367083616554737, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.1952684484422207, | |
| "reward_change_min": -0.2791023626923561, | |
| "reward_change_std": 0.1039092754945159, | |
| "reward_std": 0.38488300889730453, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/cosine_scaled_reward": -0.13305507553741336, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1583.8958587646484, | |
| "epoch": 0.004571428571428572, | |
| "grad_norm": 0.1990806609392166, | |
| "kl": 3.854930400848389e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3e-07, | |
| "loss": -0.1819, | |
| "reward": -0.11896039673592895, | |
| "reward_after_mean": -0.11896039673592895, | |
| "reward_after_std": 0.5856046769768, | |
| "reward_before_mean": 0.11259861849248409, | |
| "reward_before_std": 0.5683185886591673, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.23155900090932846, | |
| "reward_change_min": -0.4133305959403515, | |
| "reward_change_std": 0.15685046929866076, | |
| "reward_std": 0.5856046807020903, | |
| "rewards/accuracy_reward": 0.1666666716337204, | |
| "rewards/cosine_scaled_reward": -0.05406807316467166, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3137.166717529297, | |
| "epoch": 0.005714285714285714, | |
| "grad_norm": 0.07480327039957047, | |
| "kl": 5.798041820526123e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4e-07, | |
| "loss": 0.0034, | |
| "reward": -0.2642638385295868, | |
| "reward_after_mean": -0.2642638385295868, | |
| "reward_after_std": 0.47306079883128405, | |
| "reward_before_mean": -0.06776071339845657, | |
| "reward_before_std": 0.4494105405174196, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.19650312513113022, | |
| "reward_change_min": -0.39078205823898315, | |
| "reward_change_std": 0.13734673336148262, | |
| "reward_std": 0.47306080628186464, | |
| "rewards/accuracy_reward": 0.10416666977107525, | |
| "rewards/cosine_scaled_reward": -0.1719273824710399, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2746.5416717529297, | |
| "epoch": 0.006857142857142857, | |
| "grad_norm": 0.08584456145763397, | |
| "kl": 5.7503581047058105e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5e-07, | |
| "loss": 0.012, | |
| "reward": -0.2835393473505974, | |
| "reward_after_mean": -0.2835393473505974, | |
| "reward_after_std": 0.2985193133354187, | |
| "reward_before_mean": -0.06978187244385481, | |
| "reward_before_std": 0.27547394298017025, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.21375747956335545, | |
| "reward_change_min": -0.34073719196021557, | |
| "reward_change_std": 0.13080975599586964, | |
| "reward_std": 0.29851931519806385, | |
| "rewards/accuracy_reward": 0.0625, | |
| "rewards/cosine_scaled_reward": -0.13228187058120966, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2684.8334045410156, | |
| "epoch": 0.008, | |
| "grad_norm": 0.06891732662916183, | |
| "kl": 3.771483898162842e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6e-07, | |
| "loss": -0.0261, | |
| "reward": -0.040946679189801216, | |
| "reward_after_mean": -0.040946679189801216, | |
| "reward_after_std": 0.4143037796020508, | |
| "reward_before_mean": 0.24522151239216328, | |
| "reward_before_std": 0.36546519538387656, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2861681915819645, | |
| "reward_change_min": -0.47350434213876724, | |
| "reward_change_std": 0.17670375108718872, | |
| "reward_std": 0.4143037870526314, | |
| "rewards/accuracy_reward": 0.2291666679084301, | |
| "rewards/cosine_scaled_reward": 0.016054846346378326, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2381.770881652832, | |
| "epoch": 0.009142857142857144, | |
| "grad_norm": 0.0757519081234932, | |
| "kl": 3.248453140258789e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7e-07, | |
| "loss": -0.0307, | |
| "reward": 0.11849129945039749, | |
| "reward_after_mean": 0.11849129945039749, | |
| "reward_after_std": 0.5733677167445421, | |
| "reward_before_mean": 0.44195539876818657, | |
| "reward_before_std": 0.5410135546699166, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3234641030430794, | |
| "reward_change_min": -0.5369394198060036, | |
| "reward_change_std": 0.20503532141447067, | |
| "reward_std": 0.5733677390962839, | |
| "rewards/accuracy_reward": 0.35416667349636555, | |
| "rewards/cosine_scaled_reward": 0.08778871223330498, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2806.8125534057617, | |
| "epoch": 0.010285714285714285, | |
| "grad_norm": 0.10130124539136887, | |
| "kl": 5.823373794555664e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8e-07, | |
| "loss": -0.073, | |
| "reward": -0.3014819361269474, | |
| "reward_after_mean": -0.3014819361269474, | |
| "reward_after_std": 0.4007901232689619, | |
| "reward_before_mean": -0.11112237721681595, | |
| "reward_before_std": 0.36138677038252354, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.1903595570474863, | |
| "reward_change_min": -0.3259355407208204, | |
| "reward_change_std": 0.11471330747008324, | |
| "reward_std": 0.4007901381701231, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/cosine_scaled_reward": -0.17362238001078367, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2706.750045776367, | |
| "epoch": 0.011428571428571429, | |
| "grad_norm": 0.07514520734548569, | |
| "kl": 4.935264587402344e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9e-07, | |
| "loss": 0.0161, | |
| "reward": -0.11493759602308273, | |
| "reward_after_mean": -0.11493759602308273, | |
| "reward_after_std": 0.5900738928467035, | |
| "reward_before_mean": 0.12419397698249668, | |
| "reward_before_std": 0.599910007789731, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.23913157917559147, | |
| "reward_change_min": -0.4677496124058962, | |
| "reward_change_std": 0.18292754143476486, | |
| "reward_std": 0.5900739189237356, | |
| "rewards/accuracy_reward": 0.20833333767950535, | |
| "rewards/cosine_scaled_reward": -0.08413936011493206, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3402.229217529297, | |
| "epoch": 0.012571428571428572, | |
| "grad_norm": 0.05007028952240944, | |
| "kl": 3.884732723236084e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0793, | |
| "reward": -0.29482845464372076, | |
| "reward_after_mean": -0.29482845464372076, | |
| "reward_after_std": 0.474648579955101, | |
| "reward_before_mean": -0.11345059424638748, | |
| "reward_before_std": 0.4228599341586232, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.18137785978615284, | |
| "reward_change_min": -0.3095668628811836, | |
| "reward_change_std": 0.109623190946877, | |
| "reward_std": 0.47464858181774616, | |
| "rewards/accuracy_reward": 0.0833333358168602, | |
| "rewards/cosine_scaled_reward": -0.19678393006324768, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2166.041702270508, | |
| "epoch": 0.013714285714285714, | |
| "grad_norm": 0.08323562890291214, | |
| "kl": 5.860626697540283e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.997258721585931e-07, | |
| "loss": 0.02, | |
| "reward": -0.11367994034662843, | |
| "reward_after_mean": -0.11367994034662843, | |
| "reward_after_std": 0.5573214925825596, | |
| "reward_before_mean": 0.1285952888429165, | |
| "reward_before_std": 0.5449656378477812, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.24227523803710938, | |
| "reward_change_min": -0.45779313147068024, | |
| "reward_change_std": 0.1707105627283454, | |
| "reward_std": 0.5573215000331402, | |
| "rewards/accuracy_reward": 0.18750000558793545, | |
| "rewards/cosine_scaled_reward": -0.058904721518047154, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2882.1041717529297, | |
| "epoch": 0.014857142857142857, | |
| "grad_norm": 0.06770065426826477, | |
| "kl": 4.939734935760498e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.989038226169207e-07, | |
| "loss": 0.0159, | |
| "reward": -0.04806898767128587, | |
| "reward_after_mean": -0.04806898767128587, | |
| "reward_after_std": 0.41037579998373985, | |
| "reward_before_mean": 0.23718727007508278, | |
| "reward_before_std": 0.3681218596175313, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2852562852203846, | |
| "reward_change_min": -0.4630883075296879, | |
| "reward_change_std": 0.17784415930509567, | |
| "reward_std": 0.4103758055716753, | |
| "rewards/accuracy_reward": 0.2500000037252903, | |
| "rewards/cosine_scaled_reward": -0.012812718749046326, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2489.9583587646484, | |
| "epoch": 0.016, | |
| "grad_norm": 0.07574188709259033, | |
| "kl": 4.4599175453186035e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.975348529157229e-07, | |
| "loss": -0.0047, | |
| "reward": -0.27982987742871046, | |
| "reward_after_mean": -0.27982987742871046, | |
| "reward_after_std": 0.39828755520284176, | |
| "reward_before_mean": -0.07930825836956501, | |
| "reward_before_std": 0.36426794342696667, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.20052163302898407, | |
| "reward_change_min": -0.3274822384119034, | |
| "reward_change_std": 0.12421673815697432, | |
| "reward_std": 0.3982875719666481, | |
| "rewards/accuracy_reward": 0.08333333395421505, | |
| "rewards/cosine_scaled_reward": -0.16264159604907036, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2756.062557220459, | |
| "epoch": 0.017142857142857144, | |
| "grad_norm": 0.07491511106491089, | |
| "kl": 4.3898820877075195e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.956206309337066e-07, | |
| "loss": -0.0053, | |
| "reward": -0.02700677514076233, | |
| "reward_after_mean": -0.02700677514076233, | |
| "reward_after_std": 0.4231729060411453, | |
| "reward_before_mean": 0.26372290030121803, | |
| "reward_before_std": 0.37921188212931156, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.290729658678174, | |
| "reward_change_min": -0.4885778911411762, | |
| "reward_change_std": 0.1829922767356038, | |
| "reward_std": 0.42317293770611286, | |
| "rewards/accuracy_reward": 0.2500000037252903, | |
| "rewards/cosine_scaled_reward": 0.013722889125347137, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3446.500030517578, | |
| "epoch": 0.018285714285714287, | |
| "grad_norm": 0.05100657790899277, | |
| "kl": 5.772709846496582e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.931634888554935e-07, | |
| "loss": 0.0162, | |
| "reward": -0.2724580317735672, | |
| "reward_after_mean": -0.2724580317735672, | |
| "reward_after_std": 0.391301766037941, | |
| "reward_before_mean": -0.0656847816426307, | |
| "reward_before_std": 0.3727356269955635, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.20677325874567032, | |
| "reward_change_min": -0.33162772841751575, | |
| "reward_change_std": 0.13028155453503132, | |
| "reward_std": 0.391301779076457, | |
| "rewards/accuracy_reward": 0.10416666977107525, | |
| "rewards/cosine_scaled_reward": -0.16985145024955273, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2186.333351135254, | |
| "epoch": 0.019428571428571427, | |
| "grad_norm": 0.1111261397600174, | |
| "kl": 4.7326087951660156e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.901664203302124e-07, | |
| "loss": -0.0433, | |
| "reward": -0.09547922573983669, | |
| "reward_after_mean": -0.09547922573983669, | |
| "reward_after_std": 0.49696200527250767, | |
| "reward_before_mean": 0.16047327406704426, | |
| "reward_before_std": 0.4803733183071017, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.25595249235630035, | |
| "reward_change_min": -0.4563688188791275, | |
| "reward_change_std": 0.17162334639579058, | |
| "reward_std": 0.4969620108604431, | |
| "rewards/accuracy_reward": 0.20833334140479565, | |
| "rewards/cosine_scaled_reward": -0.047860062681138515, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3001.8959045410156, | |
| "epoch": 0.02057142857142857, | |
| "grad_norm": 0.04722139984369278, | |
| "kl": 3.723800182342529e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.866330768241983e-07, | |
| "loss": 0.0773, | |
| "reward": -0.0474961269646883, | |
| "reward_after_mean": -0.0474961269646883, | |
| "reward_after_std": 0.6040066350251436, | |
| "reward_before_mean": 0.20818753214552999, | |
| "reward_before_std": 0.550414253026247, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.25568367168307304, | |
| "reward_change_min": -0.4698070529848337, | |
| "reward_change_std": 0.169005892239511, | |
| "reward_std": 0.6040066443383694, | |
| "rewards/accuracy_reward": 0.2500000037252903, | |
| "rewards/cosine_scaled_reward": -0.04181248042732477, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2931.2917098999023, | |
| "epoch": 0.021714285714285714, | |
| "grad_norm": 0.06986711174249649, | |
| "kl": 3.400444984436035e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.825677631722435e-07, | |
| "loss": 0.0292, | |
| "reward": 0.17090284021105617, | |
| "reward_after_mean": 0.17090284021105617, | |
| "reward_after_std": 0.5971331372857094, | |
| "reward_before_mean": 0.5100600440055132, | |
| "reward_before_std": 0.5602419087663293, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.33915719762444496, | |
| "reward_change_min": -0.5498323440551758, | |
| "reward_change_std": 0.21470873430371284, | |
| "reward_std": 0.5971331428736448, | |
| "rewards/accuracy_reward": 0.35416667349636555, | |
| "rewards/cosine_scaled_reward": 0.15589335677213967, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1947.3333854675293, | |
| "epoch": 0.022857142857142857, | |
| "grad_norm": 0.09310275316238403, | |
| "kl": 2.9101967811584473e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.779754323328192e-07, | |
| "loss": 0.047, | |
| "reward": -0.030604678206145763, | |
| "reward_after_mean": -0.030604678206145763, | |
| "reward_after_std": 0.5643759882077575, | |
| "reward_before_mean": 0.23468416556715965, | |
| "reward_before_std": 0.5034359507262707, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.265288844704628, | |
| "reward_change_min": -0.4424171634018421, | |
| "reward_change_std": 0.16660070698708296, | |
| "reward_std": 0.5643760003149509, | |
| "rewards/accuracy_reward": 0.2708333358168602, | |
| "rewards/cosine_scaled_reward": -0.036149172694422305, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2655.0625343322754, | |
| "epoch": 0.024, | |
| "grad_norm": 0.10517873615026474, | |
| "kl": 4.278123378753662e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.728616793536587e-07, | |
| "loss": -0.0291, | |
| "reward": -0.10848977044224739, | |
| "reward_after_mean": -0.10848977044224739, | |
| "reward_after_std": 0.5077532883733511, | |
| "reward_before_mean": 0.13500482868403196, | |
| "reward_before_std": 0.4411742137745023, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.24349460750818253, | |
| "reward_change_min": -0.3907940573990345, | |
| "reward_change_std": 0.14410312101244926, | |
| "reward_std": 0.5077533088624477, | |
| "rewards/accuracy_reward": 0.2083333358168602, | |
| "rewards/cosine_scaled_reward": -0.07332850294187665, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1586.75004196167, | |
| "epoch": 0.025142857142857144, | |
| "grad_norm": 0.1125827431678772, | |
| "kl": 3.287568688392639e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.672327345550543e-07, | |
| "loss": -0.0943, | |
| "reward": -0.019220076501369476, | |
| "reward_after_mean": -0.019220076501369476, | |
| "reward_after_std": 0.38232380524277687, | |
| "reward_before_mean": 0.2721855500712991, | |
| "reward_before_std": 0.2772734249010682, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2914056619629264, | |
| "reward_change_min": -0.43606279231607914, | |
| "reward_change_std": 0.16009643021970987, | |
| "reward_std": 0.3823238220065832, | |
| "rewards/accuracy_reward": 0.2916666679084301, | |
| "rewards/cosine_scaled_reward": -0.019481111317873, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2185.0208702087402, | |
| "epoch": 0.026285714285714287, | |
| "grad_norm": 0.08662360906600952, | |
| "kl": 3.97413969039917e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.610954559391704e-07, | |
| "loss": 0.0316, | |
| "reward": 0.0812848350033164, | |
| "reward_after_mean": 0.0812848350033164, | |
| "reward_after_std": 0.7008634004741907, | |
| "reward_before_mean": 0.3680048354435712, | |
| "reward_before_std": 0.6547262277454138, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.28671999275684357, | |
| "reward_change_min": -0.5204410944133997, | |
| "reward_change_std": 0.19234656170010567, | |
| "reward_std": 0.7008634451776743, | |
| "rewards/accuracy_reward": 0.3125000037252903, | |
| "rewards/cosine_scaled_reward": 0.05550480890087783, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2429.7500381469727, | |
| "epoch": 0.027428571428571427, | |
| "grad_norm": 0.09173740446567535, | |
| "kl": 3.2924115657806396e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.54457320834625e-07, | |
| "loss": 0.0745, | |
| "reward": 0.1347651109099388, | |
| "reward_after_mean": 0.1347651109099388, | |
| "reward_after_std": 0.6127859707921743, | |
| "reward_before_mean": 0.4589935354888439, | |
| "reward_before_std": 0.583356948569417, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3242284394800663, | |
| "reward_change_min": -0.5463455878198147, | |
| "reward_change_std": 0.21345821302384138, | |
| "reward_std": 0.6127859856933355, | |
| "rewards/accuracy_reward": 0.35416667349636555, | |
| "rewards/cosine_scaled_reward": 0.10482687130570412, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2295.8125228881836, | |
| "epoch": 0.02857142857142857, | |
| "grad_norm": 0.08578181266784668, | |
| "kl": 5.142390727996826e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.473264167865171e-07, | |
| "loss": -0.0378, | |
| "reward": -0.24743555346503854, | |
| "reward_after_mean": -0.24743555346503854, | |
| "reward_after_std": 0.47581059113144875, | |
| "reward_before_mean": -0.044429176254197955, | |
| "reward_before_std": 0.4596579996868968, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.20300637558102608, | |
| "reward_change_min": -0.3940556962043047, | |
| "reward_change_std": 0.13973176665604115, | |
| "reward_std": 0.47581060975790024, | |
| "rewards/accuracy_reward": 0.10416666977107525, | |
| "rewards/cosine_scaled_reward": -0.14859584486111999, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2757.541748046875, | |
| "epoch": 0.029714285714285714, | |
| "grad_norm": 0.062217455357313156, | |
| "kl": 3.5900622606277466e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.397114317029974e-07, | |
| "loss": -0.0243, | |
| "reward": -0.22815486788749695, | |
| "reward_after_mean": -0.22815486788749695, | |
| "reward_after_std": 0.23574577271938324, | |
| "reward_before_mean": 0.010172484442591667, | |
| "reward_before_std": 0.1596730425953865, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.23832733370363712, | |
| "reward_change_min": -0.3241576459258795, | |
| "reward_change_std": 0.12215171847492456, | |
| "reward_std": 0.23574578203260899, | |
| "rewards/accuracy_reward": 0.125, | |
| "rewards/cosine_scaled_reward": -0.11482751555740833, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2858.062545776367, | |
| "epoch": 0.030857142857142857, | |
| "grad_norm": 0.07970089465379715, | |
| "kl": 6.517767906188965e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.316216432703916e-07, | |
| "loss": 0.0278, | |
| "reward": -0.22820639377459884, | |
| "reward_after_mean": -0.22820639377459884, | |
| "reward_after_std": 0.3691936992108822, | |
| "reward_before_mean": -0.003585641272366047, | |
| "reward_before_std": 0.3326329058036208, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.22462073341012, | |
| "reward_change_min": -0.3414224237203598, | |
| "reward_change_std": 0.1343115232884884, | |
| "reward_std": 0.36919371597468853, | |
| "rewards/accuracy_reward": 0.10416666977107525, | |
| "rewards/cosine_scaled_reward": -0.10775232722517103, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2576.958366394043, | |
| "epoch": 0.032, | |
| "grad_norm": 0.08655045926570892, | |
| "kl": 5.263090133666992e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.230669076497687e-07, | |
| "loss": 0.0032, | |
| "reward": -0.1358381314203143, | |
| "reward_after_mean": -0.1358381314203143, | |
| "reward_after_std": 0.5688246618956327, | |
| "reward_before_mean": 0.0954693965613842, | |
| "reward_before_std": 0.5578853543847799, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.23130753822624683, | |
| "reward_change_min": -0.4282049834728241, | |
| "reward_change_std": 0.16099752951413393, | |
| "reward_std": 0.5688246786594391, | |
| "rewards/accuracy_reward": 0.20833333767950535, | |
| "rewards/cosine_scaled_reward": -0.11286392994225025, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2898.750030517578, | |
| "epoch": 0.03314285714285714, | |
| "grad_norm": 0.06516330689191818, | |
| "kl": 4.172325134277344e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.140576474687263e-07, | |
| "loss": -0.0722, | |
| "reward": -0.3742859149351716, | |
| "reward_after_mean": -0.3742859149351716, | |
| "reward_after_std": 0.32394181191921234, | |
| "reward_before_mean": -0.19833002239465714, | |
| "reward_before_std": 0.295702856965363, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.1759559065103531, | |
| "reward_change_min": -0.31070076301693916, | |
| "reward_change_std": 0.1110200947150588, | |
| "reward_std": 0.32394181936979294, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/cosine_scaled_reward": -0.23999668890610337, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2727.916763305664, | |
| "epoch": 0.03428571428571429, | |
| "grad_norm": 0.08452288061380386, | |
| "kl": 4.813075065612793e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 9.046048391230247e-07, | |
| "loss": 0.0229, | |
| "reward": 0.0020619072020053864, | |
| "reward_after_mean": 0.0020619072020053864, | |
| "reward_after_std": 0.6899437569081783, | |
| "reward_before_mean": 0.27052484080195427, | |
| "reward_before_std": 0.7042382340878248, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2684629485011101, | |
| "reward_change_min": -0.5466452892869711, | |
| "reward_change_std": 0.20367024186998606, | |
| "reward_std": 0.6899437606334686, | |
| "rewards/accuracy_reward": 0.25000000558793545, | |
| "rewards/cosine_scaled_reward": 0.02052484266459942, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3069.0208435058594, | |
| "epoch": 0.03542857142857143, | |
| "grad_norm": 0.06624361127614975, | |
| "kl": 5.201995372772217e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.9471999940354e-07, | |
| "loss": 0.0393, | |
| "reward": -0.18817763216793537, | |
| "reward_after_mean": -0.18817763216793537, | |
| "reward_after_std": 0.4799848888069391, | |
| "reward_before_mean": 0.03944748570211232, | |
| "reward_before_std": 0.4816620806232095, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.22762510925531387, | |
| "reward_change_min": -0.39297138154506683, | |
| "reward_change_std": 0.16033693589270115, | |
| "reward_std": 0.4799849148839712, | |
| "rewards/accuracy_reward": 0.16666666977107525, | |
| "rewards/cosine_scaled_reward": -0.12721917685121298, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2598.854248046875, | |
| "epoch": 0.036571428571428574, | |
| "grad_norm": 0.06277475506067276, | |
| "kl": 4.087388515472412e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.844151714648274e-07, | |
| "loss": -0.013, | |
| "reward": -0.026365877129137516, | |
| "reward_after_mean": -0.026365877129137516, | |
| "reward_after_std": 0.5388314640149474, | |
| "reward_before_mean": 0.2528522349894047, | |
| "reward_before_std": 0.5446693664416671, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.27921812795102596, | |
| "reward_change_min": -0.5132879670709372, | |
| "reward_change_std": 0.1965640289708972, | |
| "reward_std": 0.5388314817100763, | |
| "rewards/accuracy_reward": 0.2708333432674408, | |
| "rewards/cosine_scaled_reward": -0.017981095821596682, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3199.604202270508, | |
| "epoch": 0.037714285714285714, | |
| "grad_norm": 0.05635827034711838, | |
| "kl": 4.775822162628174e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.737029101523929e-07, | |
| "loss": 0.0363, | |
| "reward": -0.11553127923980355, | |
| "reward_after_mean": -0.11553127923980355, | |
| "reward_after_std": 0.6909586228430271, | |
| "reward_before_mean": 0.10691910376772285, | |
| "reward_before_std": 0.6950575169175863, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.22245037741959095, | |
| "reward_change_min": -0.4544982947409153, | |
| "reward_change_std": 0.1732303500175476, | |
| "reward_std": 0.690958658233285, | |
| "rewards/accuracy_reward": 0.2083333358168602, | |
| "rewards/cosine_scaled_reward": -0.10141423298045993, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2291.1875076293945, | |
| "epoch": 0.038857142857142854, | |
| "grad_norm": 0.07628920674324036, | |
| "kl": 5.2928924560546875e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.625962667065487e-07, | |
| "loss": -0.0126, | |
| "reward": 0.11718782410025597, | |
| "reward_after_mean": 0.11718782410025597, | |
| "reward_after_std": 0.6173261571675539, | |
| "reward_before_mean": 0.4387818221002817, | |
| "reward_before_std": 0.6273570032790303, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3215939924120903, | |
| "reward_change_min": -0.6037725955247879, | |
| "reward_change_std": 0.226388999260962, | |
| "reward_std": 0.6173261664807796, | |
| "rewards/accuracy_reward": 0.3333333469927311, | |
| "rewards/cosine_scaled_reward": 0.10544847697019577, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2999.6666870117188, | |
| "epoch": 0.04, | |
| "grad_norm": 0.09039846807718277, | |
| "kl": 6.504356861114502e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.511087728614862e-07, | |
| "loss": -0.0028, | |
| "reward": -0.16610824968665838, | |
| "reward_after_mean": -0.16610824968665838, | |
| "reward_after_std": 0.58190375007689, | |
| "reward_before_mean": 0.05111471749842167, | |
| "reward_before_std": 0.5643127737566829, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.21722296811640263, | |
| "reward_change_min": -0.40635400637984276, | |
| "reward_change_std": 0.15069269575178623, | |
| "reward_std": 0.5819037612527609, | |
| "rewards/accuracy_reward": 0.1458333358168602, | |
| "rewards/cosine_scaled_reward": -0.09471861086785793, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3241.6041870117188, | |
| "epoch": 0.04114285714285714, | |
| "grad_norm": 0.05941932648420334, | |
| "kl": 4.7147274017333984e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.392544243589427e-07, | |
| "loss": 0.0446, | |
| "reward": -0.3385976613499224, | |
| "reward_after_mean": -0.3385976613499224, | |
| "reward_after_std": 0.4202655293047428, | |
| "reward_before_mean": -0.16420376114547253, | |
| "reward_before_std": 0.3881509117782116, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.17439390905201435, | |
| "reward_change_min": -0.31771647930145264, | |
| "reward_change_std": 0.11316250916570425, | |
| "reward_std": 0.42026553489267826, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/cosine_scaled_reward": -0.22670375928282738, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3407.250030517578, | |
| "epoch": 0.04228571428571429, | |
| "grad_norm": 0.05422111600637436, | |
| "kl": 5.02467155456543e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.270476638965461e-07, | |
| "loss": 0.0107, | |
| "reward": -0.3340600021183491, | |
| "reward_after_mean": -0.3340600021183491, | |
| "reward_after_std": 0.27687052451074123, | |
| "reward_before_mean": -0.1344250589609146, | |
| "reward_before_std": 0.24589443486183882, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.19963493384420872, | |
| "reward_change_min": -0.31763908080756664, | |
| "reward_change_std": 0.11714739445596933, | |
| "reward_std": 0.2768705263733864, | |
| "rewards/accuracy_reward": 0.0833333358168602, | |
| "rewards/cosine_scaled_reward": -0.2177584059536457, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3299.625, | |
| "epoch": 0.04342857142857143, | |
| "grad_norm": 0.058690257370471954, | |
| "kl": 4.3392181396484375e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.145033635316128e-07, | |
| "loss": -0.0289, | |
| "reward": -0.1949592368910089, | |
| "reward_after_mean": -0.1949592368910089, | |
| "reward_after_std": 0.3663552775979042, | |
| "reward_before_mean": 0.040207072626799345, | |
| "reward_before_std": 0.32383421063423157, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2351663038134575, | |
| "reward_change_min": -0.3440061006695032, | |
| "reward_change_std": 0.13658129330724478, | |
| "reward_std": 0.36635528318583965, | |
| "rewards/accuracy_reward": 0.10416666977107525, | |
| "rewards/cosine_scaled_reward": -0.06395959993824363, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2834.208354949951, | |
| "epoch": 0.044571428571428574, | |
| "grad_norm": 0.06519138813018799, | |
| "kl": 5.877390503883362e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 8.01636806561836e-07, | |
| "loss": 0.0434, | |
| "reward": 0.0075050946325063705, | |
| "reward_after_mean": 0.0075050946325063705, | |
| "reward_after_std": 0.3204457741230726, | |
| "reward_before_mean": 0.31809414783492684, | |
| "reward_before_std": 0.20561318658292294, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3105890806764364, | |
| "reward_change_min": -0.4078735690563917, | |
| "reward_change_std": 0.157848265953362, | |
| "reward_std": 0.3204457778483629, | |
| "rewards/accuracy_reward": 0.27083333395421505, | |
| "rewards/cosine_scaled_reward": 0.047260792925953865, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2497.2708892822266, | |
| "epoch": 0.045714285714285714, | |
| "grad_norm": 0.06621085107326508, | |
| "kl": 3.94284725189209e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.884636689049422e-07, | |
| "loss": 0.0453, | |
| "reward": -0.09613560698926449, | |
| "reward_after_mean": -0.09613560698926449, | |
| "reward_after_std": 0.47081254981458187, | |
| "reward_before_mean": 0.15380273573100567, | |
| "reward_before_std": 0.37723068334162235, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.249938340857625, | |
| "reward_change_min": -0.34136574156582355, | |
| "reward_change_std": 0.13006254099309444, | |
| "reward_std": 0.4708125591278076, | |
| "rewards/accuracy_reward": 0.18750000186264515, | |
| "rewards/cosine_scaled_reward": -0.0336972763761878, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2959.541702270508, | |
| "epoch": 0.046857142857142854, | |
| "grad_norm": 0.05511856451630592, | |
| "kl": 4.7087669372558594e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.0083, | |
| "reward": -0.18304359819740057, | |
| "reward_after_mean": -0.18304359819740057, | |
| "reward_after_std": 0.5882194386795163, | |
| "reward_before_mean": 0.027974394150078297, | |
| "reward_before_std": 0.5736249866895378, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.21101799607276917, | |
| "reward_change_min": -0.40425144881010056, | |
| "reward_change_std": 0.1514134258031845, | |
| "reward_std": 0.588219441473484, | |
| "rewards/accuracy_reward": 0.1458333358168602, | |
| "rewards/cosine_scaled_reward": -0.11785894399508834, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2838.7916774749756, | |
| "epoch": 0.048, | |
| "grad_norm": 0.10495254397392273, | |
| "kl": 6.80088996887207e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.612622032536507e-07, | |
| "loss": 0.0644, | |
| "reward": -0.4437730088829994, | |
| "reward_after_mean": -0.4437730088829994, | |
| "reward_after_std": 0.1842181822285056, | |
| "reward_before_mean": -0.2758090700954199, | |
| "reward_before_std": 0.14354530815035105, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.16796394996345043, | |
| "reward_change_min": -0.23861237615346909, | |
| "reward_change_std": 0.0892399987205863, | |
| "reward_std": 0.1842181896790862, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/cosine_scaled_reward": -0.2758090700954199, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3059.2916870117188, | |
| "epoch": 0.04914285714285714, | |
| "grad_norm": 0.06792694330215454, | |
| "kl": 4.3764710426330566e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.472670160550848e-07, | |
| "loss": -0.0857, | |
| "reward": -0.3225740883499384, | |
| "reward_after_mean": -0.3225740883499384, | |
| "reward_after_std": 0.37870580051094294, | |
| "reward_before_mean": -0.1348972450941801, | |
| "reward_before_std": 0.3481356706470251, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.18767685443162918, | |
| "reward_change_min": -0.3287505563348532, | |
| "reward_change_std": 0.11681838473305106, | |
| "reward_std": 0.37870581168681383, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/cosine_scaled_reward": -0.19739723904058337, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2378.020851135254, | |
| "epoch": 0.05028571428571429, | |
| "grad_norm": 0.0813366174697876, | |
| "kl": 3.6869198083877563e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.330314893841101e-07, | |
| "loss": 0.0455, | |
| "reward": -0.05170046165585518, | |
| "reward_after_mean": -0.05170046165585518, | |
| "reward_after_std": 0.4305565822869539, | |
| "reward_before_mean": 0.22961101355031133, | |
| "reward_before_std": 0.41200026869773865, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.28131146915256977, | |
| "reward_change_min": -0.4182246755808592, | |
| "reward_change_std": 0.16517735086381435, | |
| "reward_std": 0.43055659532546997, | |
| "rewards/accuracy_reward": 0.22916667722165585, | |
| "rewards/cosine_scaled_reward": 0.0004443284124135971, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3216.812530517578, | |
| "epoch": 0.05142857142857143, | |
| "grad_norm": 0.05646404251456261, | |
| "kl": 4.851818084716797e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.185729670371604e-07, | |
| "loss": -0.0235, | |
| "reward": -0.2375551089644432, | |
| "reward_after_mean": -0.2375551089644432, | |
| "reward_after_std": 0.42938368022441864, | |
| "reward_before_mean": -0.027880379930138588, | |
| "reward_before_std": 0.39119122084230185, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.20967471785843372, | |
| "reward_change_min": -0.30014154128730297, | |
| "reward_change_std": 0.11865215376019478, | |
| "reward_std": 0.42938369885087013, | |
| "rewards/accuracy_reward": 0.1458333395421505, | |
| "rewards/cosine_scaled_reward": -0.1737137222662568, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3094.2500610351562, | |
| "epoch": 0.052571428571428575, | |
| "grad_norm": 0.06780198961496353, | |
| "kl": 5.8978796005249023e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 7.039090644965509e-07, | |
| "loss": 0.0149, | |
| "reward": -0.3860484268516302, | |
| "reward_after_mean": -0.3860484268516302, | |
| "reward_after_std": 0.2960197441279888, | |
| "reward_before_mean": -0.21315561048686504, | |
| "reward_before_std": 0.24969620257616043, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.17289282009005547, | |
| "reward_change_min": -0.2534465938806534, | |
| "reward_change_std": 0.09384610410779715, | |
| "reward_std": 0.29601974971592426, | |
| "rewards/accuracy_reward": 0.02083333395421505, | |
| "rewards/cosine_scaled_reward": -0.2339889481663704, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2600.062515258789, | |
| "epoch": 0.053714285714285714, | |
| "grad_norm": 0.09613389521837234, | |
| "kl": 4.597008228302002e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.890576474687263e-07, | |
| "loss": -0.0031, | |
| "reward": -0.15789308166131377, | |
| "reward_after_mean": -0.15789308166131377, | |
| "reward_after_std": 0.4657074324786663, | |
| "reward_before_mean": 0.07849103212356567, | |
| "reward_before_std": 0.44048627745360136, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2363841012120247, | |
| "reward_change_min": -0.4060584995895624, | |
| "reward_change_std": 0.15053978469222784, | |
| "reward_std": 0.46570744924247265, | |
| "rewards/accuracy_reward": 0.16666667349636555, | |
| "rewards/cosine_scaled_reward": -0.0881756441667676, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2541.0208892822266, | |
| "epoch": 0.054857142857142854, | |
| "grad_norm": 0.08367791026830673, | |
| "kl": 4.8547983169555664e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.740368101176495e-07, | |
| "loss": -0.0132, | |
| "reward": -0.04096842650324106, | |
| "reward_after_mean": -0.04096842650324106, | |
| "reward_after_std": 0.6091550681740046, | |
| "reward_before_mean": 0.22568438947200775, | |
| "reward_before_std": 0.6412058565765619, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.26665280386805534, | |
| "reward_change_min": -0.540655393153429, | |
| "reward_change_std": 0.2099922765046358, | |
| "reward_std": 0.6091550793498755, | |
| "rewards/accuracy_reward": 0.2500000037252903, | |
| "rewards/cosine_scaled_reward": -0.024315630551427603, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1905.8333625793457, | |
| "epoch": 0.056, | |
| "grad_norm": 0.10268070548772812, | |
| "kl": 3.053247928619385e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.588648530198504e-07, | |
| "loss": -0.0009, | |
| "reward": -0.15249623358249664, | |
| "reward_after_mean": -0.15249623358249664, | |
| "reward_after_std": 0.326967298053205, | |
| "reward_before_mean": 0.1072658714838326, | |
| "reward_before_std": 0.3044881981331855, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2597620990127325, | |
| "reward_change_min": -0.3807704448699951, | |
| "reward_change_std": 0.15653882548213005, | |
| "reward_std": 0.32696729991585016, | |
| "rewards/accuracy_reward": 0.1875000074505806, | |
| "rewards/cosine_scaled_reward": -0.08023415133357048, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3011.4166984558105, | |
| "epoch": 0.05714285714285714, | |
| "grad_norm": 0.07368293404579163, | |
| "kl": 3.7729740142822266e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.435602608679916e-07, | |
| "loss": 0.0181, | |
| "reward": -0.030200980603694916, | |
| "reward_after_mean": -0.030200980603694916, | |
| "reward_after_std": 0.4144356604665518, | |
| "reward_before_mean": 0.26172044314444065, | |
| "reward_before_std": 0.3721779463812709, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.291921429336071, | |
| "reward_change_min": -0.4539058431982994, | |
| "reward_change_std": 0.18305509351193905, | |
| "reward_std": 0.4144356790930033, | |
| "rewards/accuracy_reward": 0.2708333358168602, | |
| "rewards/cosine_scaled_reward": -0.00911291316151619, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2326.833351135254, | |
| "epoch": 0.05828571428571429, | |
| "grad_norm": 0.09606306254863739, | |
| "kl": 4.869699478149414e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.281416799501187e-07, | |
| "loss": -0.0003, | |
| "reward": -0.1915714619681239, | |
| "reward_after_mean": -0.1915714619681239, | |
| "reward_after_std": 0.4830855615437031, | |
| "reward_before_mean": 0.03159985225647688, | |
| "reward_before_std": 0.4657090399414301, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.22317130863666534, | |
| "reward_change_min": -0.38562823459506035, | |
| "reward_change_std": 0.1446484811604023, | |
| "reward_std": 0.4830855708569288, | |
| "rewards/accuracy_reward": 0.14583333767950535, | |
| "rewards/cosine_scaled_reward": -0.1142334844917059, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2670.354217529297, | |
| "epoch": 0.05942857142857143, | |
| "grad_norm": 0.09154725819826126, | |
| "kl": 3.1536445021629333e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 6.126278954320294e-07, | |
| "loss": 0.0598, | |
| "reward": 0.19081778265535831, | |
| "reward_after_mean": 0.19081778265535831, | |
| "reward_after_std": 0.6043459214270115, | |
| "reward_before_mean": 0.5438746921718121, | |
| "reward_before_std": 0.6257130298763514, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3530569151043892, | |
| "reward_change_min": -0.5976719129830599, | |
| "reward_change_std": 0.24177804868668318, | |
| "reward_std": 0.6043459549546242, | |
| "rewards/accuracy_reward": 0.4166666828095913, | |
| "rewards/cosine_scaled_reward": 0.12720800563693047, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2716.104202270508, | |
| "epoch": 0.060571428571428575, | |
| "grad_norm": 0.08996250480413437, | |
| "kl": 4.64320182800293e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.97037808470444e-07, | |
| "loss": 0.0028, | |
| "reward": -0.003628397360444069, | |
| "reward_after_mean": -0.003628397360444069, | |
| "reward_after_std": 0.6251984257251024, | |
| "reward_before_mean": 0.2679465925320983, | |
| "reward_before_std": 0.6042734682559967, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.271575003862381, | |
| "reward_change_min": -0.42408623918890953, | |
| "reward_change_std": 0.1748236557468772, | |
| "reward_std": 0.625198433175683, | |
| "rewards/accuracy_reward": 0.22916667349636555, | |
| "rewards/cosine_scaled_reward": 0.038779920781962574, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2338.2500534057617, | |
| "epoch": 0.061714285714285715, | |
| "grad_norm": 0.09756185859441757, | |
| "kl": 4.079937934875488e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.813904131848564e-07, | |
| "loss": 0.081, | |
| "reward": 0.20069499779492617, | |
| "reward_after_mean": 0.20069499779492617, | |
| "reward_after_std": 0.7350467648357153, | |
| "reward_before_mean": 0.5314708258956671, | |
| "reward_before_std": 0.7071247743442655, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3307758532464504, | |
| "reward_change_min": -0.5639360137283802, | |
| "reward_change_std": 0.22586506605148315, | |
| "reward_std": 0.7350467666983604, | |
| "rewards/accuracy_reward": 0.3750000037252903, | |
| "rewards/cosine_scaled_reward": 0.15647082310169935, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2827.208381652832, | |
| "epoch": 0.06285714285714286, | |
| "grad_norm": 0.06813458353281021, | |
| "kl": 3.8139522075653076e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.657047735161255e-07, | |
| "loss": 0.0192, | |
| "reward": 0.01979338377714157, | |
| "reward_after_mean": 0.01979338377714157, | |
| "reward_after_std": 0.47581884637475014, | |
| "reward_before_mean": 0.31956043280661106, | |
| "reward_before_std": 0.433451721444726, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2997670602053404, | |
| "reward_change_min": -0.4422443322837353, | |
| "reward_change_std": 0.17958854231983423, | |
| "reward_std": 0.4758188594132662, | |
| "rewards/accuracy_reward": 0.31250000558793545, | |
| "rewards/cosine_scaled_reward": 0.007060416042804718, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3146.187545776367, | |
| "epoch": 0.064, | |
| "grad_norm": 0.07230962067842484, | |
| "kl": 4.698336124420166e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.0041, | |
| "reward": -0.3684567688032985, | |
| "reward_after_mean": -0.3684567688032985, | |
| "reward_after_std": 0.3458494506776333, | |
| "reward_before_mean": -0.19647526927292347, | |
| "reward_before_std": 0.2858909470960498, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.17198150791227818, | |
| "reward_change_min": -0.25032067485153675, | |
| "reward_change_std": 0.09047863632440567, | |
| "reward_std": 0.3458494581282139, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/cosine_scaled_reward": -0.23814193485304713, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3092.4583892822266, | |
| "epoch": 0.06514285714285714, | |
| "grad_norm": 0.054168950766325, | |
| "kl": 3.5762786865234375e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.342952264838747e-07, | |
| "loss": -0.0577, | |
| "reward": -0.22174427472054958, | |
| "reward_after_mean": -0.22174427472054958, | |
| "reward_after_std": 0.4740599449723959, | |
| "reward_before_mean": -0.0071051958948373795, | |
| "reward_before_std": 0.46773200016468763, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.21463906578719616, | |
| "reward_change_min": -0.43531039729714394, | |
| "reward_change_std": 0.1588157471269369, | |
| "reward_std": 0.47405995428562164, | |
| "rewards/accuracy_reward": 0.1458333358168602, | |
| "rewards/cosine_scaled_reward": -0.1529385419562459, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2167.9792098999023, | |
| "epoch": 0.06628571428571428, | |
| "grad_norm": 0.08869833499193192, | |
| "kl": 3.4928321838378906e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.186095868151436e-07, | |
| "loss": 0.0885, | |
| "reward": -0.060710884630680084, | |
| "reward_after_mean": -0.060710884630680084, | |
| "reward_after_std": 0.35876573994755745, | |
| "reward_before_mean": 0.21950972266495228, | |
| "reward_before_std": 0.28317089146003127, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.28022063709795475, | |
| "reward_change_min": -0.3964573312550783, | |
| "reward_change_std": 0.14705425314605236, | |
| "reward_std": 0.3587657529860735, | |
| "rewards/accuracy_reward": 0.27083333395421505, | |
| "rewards/cosine_scaled_reward": -0.051323600113391876, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2832.645851135254, | |
| "epoch": 0.06742857142857143, | |
| "grad_norm": 0.06335015594959259, | |
| "kl": 3.361701965332031e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 5.02962191529556e-07, | |
| "loss": -0.0151, | |
| "reward": -0.3008502349257469, | |
| "reward_after_mean": -0.3008502349257469, | |
| "reward_after_std": 0.46144136413931847, | |
| "reward_before_mean": -0.11781154479831457, | |
| "reward_before_std": 0.4245893997140229, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.18303867429494858, | |
| "reward_change_min": -0.32882457226514816, | |
| "reward_change_std": 0.11585518904030323, | |
| "reward_std": 0.46144137158989906, | |
| "rewards/accuracy_reward": 0.0833333358168602, | |
| "rewards/cosine_scaled_reward": -0.20114488480612636, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2861.604217529297, | |
| "epoch": 0.06857142857142857, | |
| "grad_norm": 0.06764096766710281, | |
| "kl": 4.756450653076172e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.873721045679706e-07, | |
| "loss": 0.0064, | |
| "reward": -0.3513468327000737, | |
| "reward_after_mean": -0.3513468327000737, | |
| "reward_after_std": 0.34203874319791794, | |
| "reward_before_mean": -0.17035986855626106, | |
| "reward_before_std": 0.3069671168923378, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.18098697252571583, | |
| "reward_change_min": -0.3226324897259474, | |
| "reward_change_std": 0.11291467119008303, | |
| "reward_std": 0.34203874692320824, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/cosine_scaled_reward": -0.23285987111739814, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3003.4166870117188, | |
| "epoch": 0.06971428571428571, | |
| "grad_norm": 0.05480307340621948, | |
| "kl": 3.312528133392334e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.7185832004988133e-07, | |
| "loss": 0.0558, | |
| "reward": 0.07884835358709097, | |
| "reward_after_mean": 0.07884835358709097, | |
| "reward_after_std": 0.5404287055134773, | |
| "reward_before_mean": 0.39688345044851303, | |
| "reward_before_std": 0.5404559625312686, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.31803508289158344, | |
| "reward_change_min": -0.5349768288433552, | |
| "reward_change_std": 0.21121840458363295, | |
| "reward_std": 0.5404287241399288, | |
| "rewards/accuracy_reward": 0.3125000111758709, | |
| "rewards/cosine_scaled_reward": 0.0843834443949163, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2324.333381652832, | |
| "epoch": 0.07085714285714285, | |
| "grad_norm": 0.08491574972867966, | |
| "kl": 3.9268285036087036e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.5643973913200837e-07, | |
| "loss": -0.0186, | |
| "reward": 0.05790156498551369, | |
| "reward_after_mean": 0.05790156498551369, | |
| "reward_after_std": 0.5291469320654869, | |
| "reward_before_mean": 0.36868745367974043, | |
| "reward_before_std": 0.5278607029467821, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.3107858970761299, | |
| "reward_change_min": -0.4988170899450779, | |
| "reward_change_std": 0.19942885264754295, | |
| "reward_std": 0.5291469544172287, | |
| "rewards/accuracy_reward": 0.31250000558793545, | |
| "rewards/cosine_scaled_reward": 0.05618744622915983, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1919.5208892822266, | |
| "epoch": 0.072, | |
| "grad_norm": 0.08071651309728622, | |
| "kl": 4.035234451293945e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.4113514698014953e-07, | |
| "loss": 0.0627, | |
| "reward": 0.03887452092021704, | |
| "reward_after_mean": 0.03887452092021704, | |
| "reward_after_std": 0.6215568911284208, | |
| "reward_before_mean": 0.3234526915475726, | |
| "reward_before_std": 0.5673101404681802, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.28457815758883953, | |
| "reward_change_min": -0.46947887167334557, | |
| "reward_change_std": 0.18151281960308552, | |
| "reward_std": 0.6215569227933884, | |
| "rewards/accuracy_reward": 0.2916666716337204, | |
| "rewards/cosine_scaled_reward": 0.03178601246327162, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2727.958396911621, | |
| "epoch": 0.07314285714285715, | |
| "grad_norm": 0.07106328755617142, | |
| "kl": 4.9501657485961914e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.2596318988235037e-07, | |
| "loss": -0.0233, | |
| "reward": -0.08374386164359748, | |
| "reward_after_mean": -0.08374386164359748, | |
| "reward_after_std": 0.5860113948583603, | |
| "reward_before_mean": 0.16729693859815598, | |
| "reward_before_std": 0.5889831483364105, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2510407902300358, | |
| "reward_change_min": -0.4641798008233309, | |
| "reward_change_std": 0.18448879569768906, | |
| "reward_std": 0.5860113985836506, | |
| "rewards/accuracy_reward": 0.20833333395421505, | |
| "rewards/cosine_scaled_reward": -0.04103639395907521, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2691.208354949951, | |
| "epoch": 0.07428571428571429, | |
| "grad_norm": 0.0748182013630867, | |
| "kl": 4.149973392486572e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 4.1094235253127374e-07, | |
| "loss": 0.0242, | |
| "reward": -0.19793469086289406, | |
| "reward_after_mean": -0.19793469086289406, | |
| "reward_after_std": 0.34430792182683945, | |
| "reward_before_mean": 0.03801066428422928, | |
| "reward_before_std": 0.27941721118986607, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2359453458338976, | |
| "reward_change_min": -0.37367955408990383, | |
| "reward_change_std": 0.13625783286988735, | |
| "reward_std": 0.34430794045329094, | |
| "rewards/accuracy_reward": 0.1666666679084301, | |
| "rewards/cosine_scaled_reward": -0.12865600176155567, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2040.8541736602783, | |
| "epoch": 0.07542857142857143, | |
| "grad_norm": 0.12233862280845642, | |
| "kl": 3.054831176996231e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.9609093550344907e-07, | |
| "loss": -0.036, | |
| "reward": -0.05932062119245529, | |
| "reward_after_mean": -0.05932062119245529, | |
| "reward_after_std": 0.37665878515690565, | |
| "reward_before_mean": 0.22568307630717754, | |
| "reward_before_std": 0.3319067317061126, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2850037161260843, | |
| "reward_change_min": -0.4456963036209345, | |
| "reward_change_std": 0.17067184578627348, | |
| "reward_std": 0.37665879912674427, | |
| "rewards/accuracy_reward": 0.2708333395421505, | |
| "rewards/cosine_scaled_reward": -0.04515025019645691, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3306.7083435058594, | |
| "epoch": 0.07657142857142857, | |
| "grad_norm": 0.053935784846544266, | |
| "kl": 4.680454730987549e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.8142703296283953e-07, | |
| "loss": -0.0177, | |
| "reward": -0.47077513113617897, | |
| "reward_after_mean": -0.47077513113617897, | |
| "reward_after_std": 0.199712373316288, | |
| "reward_before_mean": -0.31407430768013, | |
| "reward_before_std": 0.15584436804056168, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.15670081414282322, | |
| "reward_change_min": -0.2310307528823614, | |
| "reward_change_std": 0.08393252175301313, | |
| "reward_std": 0.19971238262951374, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/cosine_scaled_reward": -0.31407430768013, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1717.6666870117188, | |
| "epoch": 0.07771428571428571, | |
| "grad_norm": 0.1108105406165123, | |
| "kl": 2.9583927243947983e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.6696851061588994e-07, | |
| "loss": -0.0421, | |
| "reward": -0.23304753471165895, | |
| "reward_after_mean": -0.23304753471165895, | |
| "reward_after_std": 0.5989381801337004, | |
| "reward_before_mean": -0.045263697393238544, | |
| "reward_before_std": 0.5719784637913108, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.1877838410437107, | |
| "reward_change_min": -0.38191304728388786, | |
| "reward_change_std": 0.13277372065931559, | |
| "reward_std": 0.5989382117986679, | |
| "rewards/accuracy_reward": 0.1250000037252903, | |
| "rewards/cosine_scaled_reward": -0.17026370204985142, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 1968.9583740234375, | |
| "epoch": 0.07885714285714286, | |
| "grad_norm": 0.0938061773777008, | |
| "kl": 4.889070987701416e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.5273298394491515e-07, | |
| "loss": 0.072, | |
| "reward": -0.29863712191581726, | |
| "reward_after_mean": -0.29863712191581726, | |
| "reward_after_std": 0.3859822079539299, | |
| "reward_before_mean": -0.10575509630143642, | |
| "reward_before_std": 0.3468486526980996, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.1928820125758648, | |
| "reward_change_min": -0.3261357322335243, | |
| "reward_change_std": 0.11793852038681507, | |
| "reward_std": 0.38598222099244595, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/cosine_scaled_reward": -0.16825510375201702, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2932.958366394043, | |
| "epoch": 0.08, | |
| "grad_norm": 0.06500542908906937, | |
| "kl": 4.222989082336426e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.387377967463493e-07, | |
| "loss": 0.009, | |
| "reward": -0.2667752802371979, | |
| "reward_after_mean": -0.2667752802371979, | |
| "reward_after_std": 0.30794927291572094, | |
| "reward_before_mean": -0.04546757601201534, | |
| "reward_before_std": 0.29563159868121147, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.22130770608782768, | |
| "reward_change_min": -0.3524601850658655, | |
| "reward_change_std": 0.13822024781256914, | |
| "reward_std": 0.30794927291572094, | |
| "rewards/accuracy_reward": 0.0833333358168602, | |
| "rewards/cosine_scaled_reward": -0.12880091182887554, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2713.7708587646484, | |
| "epoch": 0.08114285714285714, | |
| "grad_norm": 0.07305732369422913, | |
| "kl": 4.410743713378906e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.250000000000001e-07, | |
| "loss": -0.0409, | |
| "reward": -0.16716782632283866, | |
| "reward_after_mean": -0.16716782632283866, | |
| "reward_after_std": 0.4499338325113058, | |
| "reward_before_mean": 0.0670731533318758, | |
| "reward_before_std": 0.40882042655721307, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2342409696429968, | |
| "reward_change_min": -0.3438849691301584, | |
| "reward_change_std": 0.1364891054108739, | |
| "reward_std": 0.4499338483437896, | |
| "rewards/accuracy_reward": 0.1250000037252903, | |
| "rewards/cosine_scaled_reward": -0.05792684169136919, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2673.6875762939453, | |
| "epoch": 0.08228571428571428, | |
| "grad_norm": 0.11232837289571762, | |
| "kl": 5.570054054260254e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 3.115363310950578e-07, | |
| "loss": 0.0121, | |
| "reward": -0.293288990855217, | |
| "reward_after_mean": -0.293288990855217, | |
| "reward_after_std": 0.3026854023337364, | |
| "reward_before_mean": -0.08422034978866577, | |
| "reward_before_std": 0.2725341413170099, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2090686447918415, | |
| "reward_change_min": -0.3304400313645601, | |
| "reward_change_std": 0.12835939321666956, | |
| "reward_std": 0.3026854023337364, | |
| "rewards/accuracy_reward": 0.0625, | |
| "rewards/cosine_scaled_reward": -0.14672034978866577, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3406.3333740234375, | |
| "epoch": 0.08342857142857144, | |
| "grad_norm": 0.05645836889743805, | |
| "kl": 4.6640634536743164e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.9836319343816397e-07, | |
| "loss": 0.0034, | |
| "reward": -0.21504229493439198, | |
| "reward_after_mean": -0.21504229493439198, | |
| "reward_after_std": 0.2924734726548195, | |
| "reward_before_mean": 0.02397450990974903, | |
| "reward_before_std": 0.2590841380879283, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2390167973935604, | |
| "reward_change_min": -0.36735046096146107, | |
| "reward_change_std": 0.13649490475654602, | |
| "reward_std": 0.29247347451746464, | |
| "rewards/accuracy_reward": 0.1458333395421505, | |
| "rewards/cosine_scaled_reward": -0.12185884267091751, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2586.5416870117188, | |
| "epoch": 0.08457142857142858, | |
| "grad_norm": 0.08274800330400467, | |
| "kl": 5.094707012176514e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.854966364683872e-07, | |
| "loss": 0.1411, | |
| "reward": -0.00769616337493062, | |
| "reward_after_mean": -0.00769616337493062, | |
| "reward_after_std": 0.5274815298616886, | |
| "reward_before_mean": 0.27686647698283195, | |
| "reward_before_std": 0.4944295873865485, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.284562636166811, | |
| "reward_change_min": -0.5354499518871307, | |
| "reward_change_std": 0.19903385266661644, | |
| "reward_std": 0.5274815503507853, | |
| "rewards/accuracy_reward": 0.2708333358168602, | |
| "rewards/cosine_scaled_reward": 0.006033138604834676, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2864.875015258789, | |
| "epoch": 0.08571428571428572, | |
| "grad_norm": 0.06136927381157875, | |
| "kl": 4.2051076889038086e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.729523361034538e-07, | |
| "loss": -0.025, | |
| "reward": 0.0016691870987415314, | |
| "reward_after_mean": 0.0016691870987415314, | |
| "reward_after_std": 0.41493271477520466, | |
| "reward_before_mean": 0.30496746860444546, | |
| "reward_before_std": 0.36841379571706057, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.30329825915396214, | |
| "reward_change_min": -0.4862241819500923, | |
| "reward_change_std": 0.1891703074797988, | |
| "reward_std": 0.4149327166378498, | |
| "rewards/accuracy_reward": 0.2500000037252903, | |
| "rewards/cosine_scaled_reward": 0.05496744904667139, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2622.187526702881, | |
| "epoch": 0.08685714285714285, | |
| "grad_norm": 0.07091425359249115, | |
| "kl": 3.7416815757751465e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.6074557564105724e-07, | |
| "loss": -0.0643, | |
| "reward": -0.321389552205801, | |
| "reward_after_mean": -0.321389552205801, | |
| "reward_after_std": 0.3121167942881584, | |
| "reward_before_mean": -0.12414262071251869, | |
| "reward_before_std": 0.2767808083444834, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.19724693521857262, | |
| "reward_change_min": -0.3346843905746937, | |
| "reward_change_std": 0.12084136344492435, | |
| "reward_std": 0.31211681105196476, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/cosine_scaled_reward": -0.1658092886209488, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2890.7083587646484, | |
| "epoch": 0.088, | |
| "grad_norm": 0.05628199875354767, | |
| "kl": 4.3079257011413574e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.488912271385139e-07, | |
| "loss": -0.032, | |
| "reward": -0.28795215487480164, | |
| "reward_after_mean": -0.28795215487480164, | |
| "reward_after_std": 0.30752563290297985, | |
| "reward_before_mean": -0.07627008855342865, | |
| "reward_before_std": 0.29114991053938866, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2116820625960827, | |
| "reward_change_min": -0.34908806160092354, | |
| "reward_change_std": 0.13322968687862158, | |
| "reward_std": 0.30752563662827015, | |
| "rewards/accuracy_reward": 0.0625, | |
| "rewards/cosine_scaled_reward": -0.13877009227871895, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3316.0833587646484, | |
| "epoch": 0.08914285714285715, | |
| "grad_norm": 0.04708974435925484, | |
| "kl": 3.929436206817627e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.374037332934512e-07, | |
| "loss": 0.0528, | |
| "reward": -0.1775032225996256, | |
| "reward_after_mean": -0.1775032225996256, | |
| "reward_after_std": 0.37231264263391495, | |
| "reward_before_mean": 0.06680050306022167, | |
| "reward_before_std": 0.3495578747242689, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2443037237972021, | |
| "reward_change_min": -0.40812752209603786, | |
| "reward_change_std": 0.15259934403002262, | |
| "reward_std": 0.3723126482218504, | |
| "rewards/accuracy_reward": 0.1458333395421505, | |
| "rewards/cosine_scaled_reward": -0.07903283275663853, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2158.958381652832, | |
| "epoch": 0.09028571428571429, | |
| "grad_norm": 0.09586747735738754, | |
| "kl": 3.816187381744385e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.2629708984760706e-07, | |
| "loss": -0.0522, | |
| "reward": -0.09330683806911111, | |
| "reward_after_mean": -0.09330683806911111, | |
| "reward_after_std": 0.5184203516691923, | |
| "reward_before_mean": 0.1577418657252565, | |
| "reward_before_std": 0.46990944538265467, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.25104869343340397, | |
| "reward_change_min": -0.45565746538341045, | |
| "reward_change_std": 0.16463573463261127, | |
| "reward_std": 0.5184203591197729, | |
| "rewards/accuracy_reward": 0.22916666977107525, | |
| "rewards/cosine_scaled_reward": -0.0714248213917017, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3231.041702270508, | |
| "epoch": 0.09142857142857143, | |
| "grad_norm": 0.057165395468473434, | |
| "kl": 5.085766315460205e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.1558482853517253e-07, | |
| "loss": 0.0002, | |
| "reward": -0.31745147332549095, | |
| "reward_after_mean": -0.31745147332549095, | |
| "reward_after_std": 0.3213096186518669, | |
| "reward_before_mean": -0.11810518335551023, | |
| "reward_before_std": 0.2888203295879066, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.1993462722748518, | |
| "reward_change_min": -0.3132110647857189, | |
| "reward_change_std": 0.12029875814914703, | |
| "reward_std": 0.3213096335530281, | |
| "rewards/accuracy_reward": 0.10416666977107525, | |
| "rewards/cosine_scaled_reward": -0.22227186150848866, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3167.083335876465, | |
| "epoch": 0.09257142857142857, | |
| "grad_norm": 0.06822940707206726, | |
| "kl": 5.334615707397461e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 2.0528000059645995e-07, | |
| "loss": -0.0023, | |
| "reward": -0.28903417102992535, | |
| "reward_after_mean": -0.28903417102992535, | |
| "reward_after_std": 0.36186219193041325, | |
| "reward_before_mean": -0.08435890730470419, | |
| "reward_before_std": 0.34423625422641635, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.20467526651918888, | |
| "reward_change_min": -0.3274262994527817, | |
| "reward_change_std": 0.1262927083298564, | |
| "reward_std": 0.3618621937930584, | |
| "rewards/accuracy_reward": 0.10416666977107525, | |
| "rewards/cosine_scaled_reward": -0.18852557707577944, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2622.916702270508, | |
| "epoch": 0.09371428571428571, | |
| "grad_norm": 0.0722210481762886, | |
| "kl": 4.655122756958008e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.9539516087697517e-07, | |
| "loss": 0.0292, | |
| "reward": -0.1543267703964375, | |
| "reward_after_mean": -0.1543267703964375, | |
| "reward_after_std": 0.5856532733887434, | |
| "reward_before_mean": 0.06535659916698933, | |
| "reward_before_std": 0.5598178403452039, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.219683350995183, | |
| "reward_change_min": -0.40967136807739735, | |
| "reward_change_std": 0.14763531927019358, | |
| "reward_std": 0.5856533050537109, | |
| "rewards/accuracy_reward": 0.1458333358168602, | |
| "rewards/cosine_scaled_reward": -0.08047674596309662, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2538.187515258789, | |
| "epoch": 0.09485714285714286, | |
| "grad_norm": 0.09561274945735931, | |
| "kl": 5.626678466796875e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.8594235253127372e-07, | |
| "loss": -0.037, | |
| "reward": -0.12532864138484, | |
| "reward_after_mean": -0.12532864138484, | |
| "reward_after_std": 0.4662170447409153, | |
| "reward_before_mean": 0.12323063798248768, | |
| "reward_before_std": 0.44423565454781055, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.24855928868055344, | |
| "reward_change_min": -0.4013477824628353, | |
| "reward_change_std": 0.15694511495530605, | |
| "reward_std": 0.4662170559167862, | |
| "rewards/accuracy_reward": 0.18750000558793545, | |
| "rewards/cosine_scaled_reward": -0.06426936015486717, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2805.375045776367, | |
| "epoch": 0.096, | |
| "grad_norm": 0.06256293505430222, | |
| "kl": 4.0903687477111816e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.7693309235023127e-07, | |
| "loss": 0.0204, | |
| "reward": 0.026594052091240883, | |
| "reward_after_mean": 0.026594052091240883, | |
| "reward_after_std": 0.6662927530705929, | |
| "reward_before_mean": 0.30712341889739037, | |
| "reward_before_std": 0.6726711131632328, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2805293593555689, | |
| "reward_change_min": -0.5315785314887762, | |
| "reward_change_std": 0.202547793276608, | |
| "reward_std": 0.6662927772849798, | |
| "rewards/accuracy_reward": 0.29166667722165585, | |
| "rewards/cosine_scaled_reward": 0.01545674353837967, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3143.3333740234375, | |
| "epoch": 0.09714285714285714, | |
| "grad_norm": 0.04833953082561493, | |
| "kl": 3.106147050857544e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.6837835672960831e-07, | |
| "loss": -0.0189, | |
| "reward": -0.2577883277554065, | |
| "reward_after_mean": -0.2577883277554065, | |
| "reward_after_std": 0.48671364039182663, | |
| "reward_before_mean": -0.06025014817714691, | |
| "reward_before_std": 0.4735073782503605, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.19753817282617092, | |
| "reward_change_min": -0.3981729503720999, | |
| "reward_change_std": 0.1415349431335926, | |
| "reward_std": 0.48671367578208447, | |
| "rewards/accuracy_reward": 0.10416666977107525, | |
| "rewards/cosine_scaled_reward": -0.16441681701689959, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2766.2708740234375, | |
| "epoch": 0.09828571428571428, | |
| "grad_norm": 0.07490651309490204, | |
| "kl": 5.46574592590332e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.6028856829700258e-07, | |
| "loss": -0.027, | |
| "reward": -0.2117796391248703, | |
| "reward_after_mean": -0.2117796391248703, | |
| "reward_after_std": 0.4173107650130987, | |
| "reward_before_mean": 0.01547178067266941, | |
| "reward_before_std": 0.416951060295105, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.22725142538547516, | |
| "reward_change_min": -0.4079761225730181, | |
| "reward_change_std": 0.15469573251903057, | |
| "reward_std": 0.41731079295277596, | |
| "rewards/accuracy_reward": 0.1250000037252903, | |
| "rewards/cosine_scaled_reward": -0.10952822491526604, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2761.729217529297, | |
| "epoch": 0.09942857142857142, | |
| "grad_norm": 0.07899302989244461, | |
| "kl": 4.908442497253418e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.5267358321348285e-07, | |
| "loss": -0.0014, | |
| "reward": -0.11103942012414336, | |
| "reward_after_mean": -0.11103942012414336, | |
| "reward_after_std": 0.4518774300813675, | |
| "reward_before_mean": 0.14697221852838993, | |
| "reward_before_std": 0.44158919900655746, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2580116242170334, | |
| "reward_change_min": -0.4169537350535393, | |
| "reward_change_std": 0.16394579596817493, | |
| "reward_std": 0.45187743939459324, | |
| "rewards/accuracy_reward": 0.18750000558793545, | |
| "rewards/cosine_scaled_reward": -0.04052778799086809, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2124.3125228881836, | |
| "epoch": 0.10057142857142858, | |
| "grad_norm": 0.09118274599313736, | |
| "kl": 5.093216896057129e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.4554267916537495e-07, | |
| "loss": 0.063, | |
| "reward": -0.03512320853769779, | |
| "reward_after_mean": -0.03512320853769779, | |
| "reward_after_std": 0.5551198851317167, | |
| "reward_before_mean": 0.22805100050754845, | |
| "reward_before_std": 0.4874242516234517, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2631742022931576, | |
| "reward_change_min": -0.46103948913514614, | |
| "reward_change_std": 0.16047734580934048, | |
| "reward_std": 0.5551198869943619, | |
| "rewards/accuracy_reward": 0.2500000037252903, | |
| "rewards/cosine_scaled_reward": -0.021949008107185364, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2926.0625610351562, | |
| "epoch": 0.10171428571428572, | |
| "grad_norm": 0.05478106066584587, | |
| "kl": 3.62396240234375e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.3890454406082956e-07, | |
| "loss": 0.0017, | |
| "reward": -0.16949444822967052, | |
| "reward_after_mean": -0.16949444822967052, | |
| "reward_after_std": 0.5147665832191706, | |
| "reward_before_mean": 0.05882885679602623, | |
| "reward_before_std": 0.513655299320817, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2283233217895031, | |
| "reward_change_min": -0.4650176241993904, | |
| "reward_change_std": 0.16912943683564663, | |
| "reward_std": 0.5147665962576866, | |
| "rewards/accuracy_reward": 0.1458333358168602, | |
| "rewards/cosine_scaled_reward": -0.08700447157025337, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2371.3958435058594, | |
| "epoch": 0.10285714285714286, | |
| "grad_norm": 0.10766109824180603, | |
| "kl": 6.204843521118164e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.3276726544494571e-07, | |
| "loss": 0.0561, | |
| "reward": -0.3421127675101161, | |
| "reward_after_mean": -0.3421127675101161, | |
| "reward_after_std": 0.4089508708566427, | |
| "reward_before_mean": -0.17222417518496513, | |
| "reward_before_std": 0.3507535606622696, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.1698886025696993, | |
| "reward_change_min": -0.24556688778102398, | |
| "reward_change_std": 0.09127143956720829, | |
| "reward_std": 0.4089508727192879, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/cosine_scaled_reward": -0.21389084495604038, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2908.291679382324, | |
| "epoch": 0.104, | |
| "grad_norm": 0.0649840459227562, | |
| "kl": 5.263090133666992e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.2713832064634125e-07, | |
| "loss": 0.0049, | |
| "reward": -0.14402012154459953, | |
| "reward_after_mean": -0.14402012154459953, | |
| "reward_after_std": 0.3665870167315006, | |
| "reward_before_mean": 0.11315873265266418, | |
| "reward_before_std": 0.34818731527775526, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.25717885605990887, | |
| "reward_change_min": -0.40348367765545845, | |
| "reward_change_std": 0.15953873004764318, | |
| "reward_std": 0.3665870279073715, | |
| "rewards/accuracy_reward": 0.1875000074505806, | |
| "rewards/cosine_scaled_reward": -0.0743412896990776, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2475.2916946411133, | |
| "epoch": 0.10514285714285715, | |
| "grad_norm": 0.07881049811840057, | |
| "kl": 3.402680158615112e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.220245676671809e-07, | |
| "loss": -0.0348, | |
| "reward": -0.26179221691563725, | |
| "reward_after_mean": -0.26179221691563725, | |
| "reward_after_std": 0.4455073904246092, | |
| "reward_before_mean": -0.06640319153666496, | |
| "reward_before_std": 0.38197263330221176, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.19538902305066586, | |
| "reward_change_min": -0.28332172334194183, | |
| "reward_change_std": 0.10343744326382875, | |
| "reward_std": 0.4455074183642864, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/cosine_scaled_reward": -0.1289031896740198, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3584.0, | |
| "epoch": 0.10628571428571429, | |
| "grad_norm": 0.05205822363495827, | |
| "kl": 5.187094211578369e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.1743223682775649e-07, | |
| "loss": -0.0, | |
| "reward": -0.377890408039093, | |
| "reward_after_mean": -0.377890408039093, | |
| "reward_after_std": 0.305487222969532, | |
| "reward_before_mean": -0.20220234524458647, | |
| "reward_before_std": 0.26121661625802517, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.17568806931376457, | |
| "reward_change_min": -0.26276420429348946, | |
| "reward_change_std": 0.09641448874026537, | |
| "reward_std": 0.30548722483217716, | |
| "rewards/accuracy_reward": 0.02083333395421505, | |
| "rewards/cosine_scaled_reward": -0.22303568944334984, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2259.1250534057617, | |
| "epoch": 0.10742857142857143, | |
| "grad_norm": 0.07459476590156555, | |
| "kl": 5.307793617248535e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.1336692317580158e-07, | |
| "loss": 0.0457, | |
| "reward": -0.1999267702922225, | |
| "reward_after_mean": -0.1999267702922225, | |
| "reward_after_std": 0.3248994555324316, | |
| "reward_before_mean": 0.034845118410885334, | |
| "reward_before_std": 0.2530666273087263, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.23477189242839813, | |
| "reward_change_min": -0.33492102287709713, | |
| "reward_change_std": 0.12350414134562016, | |
| "reward_std": 0.32489946112036705, | |
| "rewards/accuracy_reward": 0.14583333395421505, | |
| "rewards/cosine_scaled_reward": -0.11098822485655546, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3346.750030517578, | |
| "epoch": 0.10857142857142857, | |
| "grad_norm": 0.05002405866980553, | |
| "kl": 4.0590763092041016e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.0983357966978745e-07, | |
| "loss": -0.0505, | |
| "reward": -0.27604287723079324, | |
| "reward_after_mean": -0.27604287723079324, | |
| "reward_after_std": 0.4140418618917465, | |
| "reward_before_mean": -0.07312451303005219, | |
| "reward_before_std": 0.4058398883789778, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.20291837118566036, | |
| "reward_change_min": -0.4037560001015663, | |
| "reward_change_std": 0.1442381963133812, | |
| "reward_std": 0.4140418656170368, | |
| "rewards/accuracy_reward": 0.0833333358168602, | |
| "rewards/cosine_scaled_reward": -0.15645784453954548, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2331.1041946411133, | |
| "epoch": 0.10971428571428571, | |
| "grad_norm": 0.07478164881467819, | |
| "kl": 3.7454068660736084e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.068365111445064e-07, | |
| "loss": 0.0203, | |
| "reward": -0.16277848929166794, | |
| "reward_after_mean": -0.16277848929166794, | |
| "reward_after_std": 0.4757649786770344, | |
| "reward_before_mean": 0.07331054843962193, | |
| "reward_before_std": 0.4654188547283411, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.23608904145658016, | |
| "reward_change_min": -0.38738977164030075, | |
| "reward_change_std": 0.15735210105776787, | |
| "reward_std": 0.4757649824023247, | |
| "rewards/accuracy_reward": 0.18750000558793545, | |
| "rewards/cosine_scaled_reward": -0.11418944969773293, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 3057.729217529297, | |
| "epoch": 0.11085714285714286, | |
| "grad_norm": 0.05840550735592842, | |
| "kl": 4.947185516357422e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.0437936906629334e-07, | |
| "loss": 0.0165, | |
| "reward": -0.1562044993042946, | |
| "reward_after_mean": -0.1562044993042946, | |
| "reward_after_std": 0.3517165407538414, | |
| "reward_before_mean": 0.09949065186083317, | |
| "reward_before_std": 0.33916947059333324, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.25569513998925686, | |
| "reward_change_min": -0.3923469968140125, | |
| "reward_change_std": 0.15778757631778717, | |
| "reward_std": 0.35171656124293804, | |
| "rewards/accuracy_reward": 0.1875000074505806, | |
| "rewards/cosine_scaled_reward": -0.08800935186445713, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2951.5833892822266, | |
| "epoch": 0.112, | |
| "grad_norm": 0.05241960659623146, | |
| "kl": 3.910064697265625e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.0246514708427701e-07, | |
| "loss": 0.0687, | |
| "reward": -0.1431019864976406, | |
| "reward_after_mean": -0.1431019864976406, | |
| "reward_after_std": 0.41360887698829174, | |
| "reward_before_mean": 0.09948256425559521, | |
| "reward_before_std": 0.33612028509378433, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.24258453957736492, | |
| "reward_change_min": -0.34666948951780796, | |
| "reward_change_std": 0.12914846278727055, | |
| "reward_std": 0.41360888816416264, | |
| "rewards/accuracy_reward": 0.1666666679084301, | |
| "rewards/cosine_scaled_reward": -0.06718412227928638, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2825.062526702881, | |
| "epoch": 0.11314285714285714, | |
| "grad_norm": 0.09673970937728882, | |
| "kl": 4.477798938751221e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.0109617738307911e-07, | |
| "loss": -0.0012, | |
| "reward": -0.14087136555463076, | |
| "reward_after_mean": -0.14087136555463076, | |
| "reward_after_std": 0.5487220250070095, | |
| "reward_before_mean": 0.08683701790869236, | |
| "reward_before_std": 0.5104697393253446, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.2277083769440651, | |
| "reward_change_min": -0.35002104938030243, | |
| "reward_change_std": 0.13233954645693302, | |
| "reward_std": 0.5487220343202353, | |
| "rewards/accuracy_reward": 0.16666667349636555, | |
| "rewards/cosine_scaled_reward": -0.07982964627444744, | |
| "step": 99 | |
| }, | |
| { | |
| "clip_fraction": 0.0, | |
| "completion_length": 2499.2083587646484, | |
| "epoch": 0.11428571428571428, | |
| "grad_norm": 0.07807345688343048, | |
| "kl": 5.307793617248535e-05, | |
| "lambda_div_used": 0.7000000000000001, | |
| "learning_rate": 1.002741278414069e-07, | |
| "loss": 0.0319, | |
| "reward": -0.19067936576902866, | |
| "reward_after_mean": -0.19067936576902866, | |
| "reward_after_std": 0.5915912743657827, | |
| "reward_before_mean": 0.013846036046743393, | |
| "reward_before_std": 0.5670791789889336, | |
| "reward_change_max": 0.0, | |
| "reward_change_mean": -0.20452538691461086, | |
| "reward_change_min": -0.4066152162849903, | |
| "reward_change_std": 0.14473197888582945, | |
| "reward_std": 0.5915912911295891, | |
| "rewards/accuracy_reward": 0.1250000037252903, | |
| "rewards/cosine_scaled_reward": -0.11115396713648806, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.11428571428571428, | |
| "step": 100, | |
| "total_flos": 0.0, | |
| "train_loss": 0.004007526492219044, | |
| "train_runtime": 34518.5958, | |
| "train_samples_per_second": 0.139, | |
| "train_steps_per_second": 0.003 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 100, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 10, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 6, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |