| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "episode": 800, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 100, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "episode": 8, |
| "epoch": 0.01, |
| "eps": 0, |
| "loss/policy_avg": 0.09475220739841461, |
| "loss/value_avg": 37.87782287597656, |
| "lr": 5e-05, |
| "objective/entropy": -46.64240264892578, |
| "objective/kl": 0.24876117706298828, |
| "objective/non_score_reward": -0.012438058853149414, |
| "objective/rlhf_reward": -7.19224739074707, |
| "objective/scores": -7.1798095703125, |
| "policy/approxkl_avg": 0.23542489111423492, |
| "policy/clipfrac_avg": 0.442307710647583, |
| "policy/entropy_avg": 1.8903515338897705, |
| "step": 1, |
| "val/clipfrac_avg": 0.75, |
| "val/num_eos_tokens": 9, |
| "val/ratio": 1.0843846797943115, |
| "val/ratio_var": 0.015354042872786522 |
| }, |
| { |
| "episode": 16, |
| "epoch": 0.02, |
| "eps": 0, |
| "loss/policy_avg": -0.09378913044929504, |
| "loss/value_avg": 17.912017822265625, |
| "lr": 4.9500000000000004e-05, |
| "objective/entropy": -43.38051986694336, |
| "objective/kl": -0.39787864685058594, |
| "objective/non_score_reward": 0.019893934950232506, |
| "objective/rlhf_reward": -7.087190628051758, |
| "objective/scores": -7.10708475112915, |
| "policy/approxkl_avg": 0.031495485454797745, |
| "policy/clipfrac_avg": 0.510869562625885, |
| "policy/entropy_avg": 1.8803576231002808, |
| "step": 2, |
| "val/clipfrac_avg": 0.75, |
| "val/num_eos_tokens": 9, |
| "val/ratio": 1.0658495426177979, |
| "val/ratio_var": 0.0052874707616865635 |
| }, |
| { |
| "episode": 24, |
| "epoch": 0.03, |
| "eps": 0, |
| "loss/policy_avg": -0.048949725925922394, |
| "loss/value_avg": 14.420023918151855, |
| "lr": 4.9e-05, |
| "objective/entropy": -19.564510345458984, |
| "objective/kl": 8.628725051879883, |
| "objective/non_score_reward": -0.4314362406730652, |
| "objective/rlhf_reward": -7.2510809898376465, |
| "objective/scores": -6.819644927978516, |
| "policy/approxkl_avg": 0.02875935286283493, |
| "policy/clipfrac_avg": 0.42465752363204956, |
| "policy/entropy_avg": 2.1179404258728027, |
| "step": 3, |
| "val/clipfrac_avg": 0.7191358208656311, |
| "val/num_eos_tokens": 8, |
| "val/ratio": 1.0251271724700928, |
| "val/ratio_var": 0.0014984746230766177 |
| }, |
| { |
| "episode": 32, |
| "epoch": 0.04, |
| "eps": 0, |
| "loss/policy_avg": -0.058641545474529266, |
| "loss/value_avg": 6.136292934417725, |
| "lr": 4.85e-05, |
| "objective/entropy": 2.016387462615967, |
| "objective/kl": 12.600632667541504, |
| "objective/non_score_reward": -0.6300315856933594, |
| "objective/rlhf_reward": -7.23648738861084, |
| "objective/scores": -6.6064558029174805, |
| "policy/approxkl_avg": 0.026198051869869232, |
| "policy/clipfrac_avg": 0.279629647731781, |
| "policy/entropy_avg": 2.0662665367126465, |
| "step": 4, |
| "val/clipfrac_avg": 0.6328670978546143, |
| "val/num_eos_tokens": 9, |
| "val/ratio": 1.0075256824493408, |
| "val/ratio_var": 3.9297356124734506e-05 |
| }, |
| { |
| "episode": 40, |
| "epoch": 0.05, |
| "eps": 0, |
| "loss/policy_avg": -0.06558828055858612, |
| "loss/value_avg": 1.9106276035308838, |
| "lr": 4.8e-05, |
| "objective/entropy": 50.39515686035156, |
| "objective/kl": 18.737285614013672, |
| "objective/non_score_reward": -0.9368643164634705, |
| "objective/rlhf_reward": -8.13361644744873, |
| "objective/scores": -7.196752548217773, |
| "policy/approxkl_avg": 0.017262864857912064, |
| "policy/clipfrac_avg": 0.22424891591072083, |
| "policy/entropy_avg": 2.027545928955078, |
| "step": 5, |
| "val/clipfrac_avg": 0.5177083015441895, |
| "val/num_eos_tokens": 7, |
| "val/ratio": 1.0054020881652832, |
| "val/ratio_var": 0.00012101430911570787 |
| }, |
| { |
| "episode": 48, |
| "epoch": 0.06, |
| "eps": 0, |
| "loss/policy_avg": -0.029659481719136238, |
| "loss/value_avg": 0.9584429264068604, |
| "lr": 4.75e-05, |
| "objective/entropy": 31.915897369384766, |
| "objective/kl": 17.292184829711914, |
| "objective/non_score_reward": -0.8646091818809509, |
| "objective/rlhf_reward": -7.128448009490967, |
| "objective/scores": -6.263838768005371, |
| "policy/approxkl_avg": 0.00980815663933754, |
| "policy/clipfrac_avg": 0.1230158805847168, |
| "policy/entropy_avg": 1.5745285749435425, |
| "step": 6, |
| "val/clipfrac_avg": 0.4331395626068115, |
| "val/num_eos_tokens": 7, |
| "val/ratio": 0.9889805316925049, |
| "val/ratio_var": 0.00010295213724020869 |
| }, |
| { |
| "episode": 56, |
| "epoch": 0.07, |
| "eps": 0, |
| "loss/policy_avg": -0.05306820571422577, |
| "loss/value_avg": 0.9622513055801392, |
| "lr": 4.7e-05, |
| "objective/entropy": 12.873809814453125, |
| "objective/kl": 17.415592193603516, |
| "objective/non_score_reward": -0.8707795739173889, |
| "objective/rlhf_reward": -7.566481590270996, |
| "objective/scores": -6.695702075958252, |
| "policy/approxkl_avg": 0.016156045719981194, |
| "policy/clipfrac_avg": 0.2081081122159958, |
| "policy/entropy_avg": 1.9123384952545166, |
| "step": 7, |
| "val/clipfrac_avg": 0.5481771230697632, |
| "val/num_eos_tokens": 8, |
| "val/ratio": 0.9937976598739624, |
| "val/ratio_var": 5.228411828284152e-05 |
| }, |
| { |
| "episode": 64, |
| "epoch": 0.08, |
| "eps": 0, |
| "loss/policy_avg": -0.022447112947702408, |
| "loss/value_avg": 0.4383925497531891, |
| "lr": 4.6500000000000005e-05, |
| "objective/entropy": 6.965538024902344, |
| "objective/kl": 21.360532760620117, |
| "objective/non_score_reward": -1.0680266618728638, |
| "objective/rlhf_reward": -6.4422149658203125, |
| "objective/scores": -5.374188423156738, |
| "policy/approxkl_avg": 0.0097531508654356, |
| "policy/clipfrac_avg": 0.0854545384645462, |
| "policy/entropy_avg": 1.2789623737335205, |
| "step": 8, |
| "val/clipfrac_avg": 0.1944444477558136, |
| "val/num_eos_tokens": 6, |
| "val/ratio": 1.0197606086730957, |
| "val/ratio_var": 0.00018990044191014022 |
| }, |
| { |
| "episode": 72, |
| "epoch": 0.09, |
| "eps": 0, |
| "loss/policy_avg": -0.03375006467103958, |
| "loss/value_avg": 0.20129632949829102, |
| "lr": 4.600000000000001e-05, |
| "objective/entropy": 25.035720825195312, |
| "objective/kl": 21.478357315063477, |
| "objective/non_score_reward": -1.0739178657531738, |
| "objective/rlhf_reward": -6.791517734527588, |
| "objective/scores": -5.717599868774414, |
| "policy/approxkl_avg": 0.03056078404188156, |
| "policy/clipfrac_avg": 0.12059858441352844, |
| "policy/entropy_avg": 1.3224170207977295, |
| "step": 9, |
| "val/clipfrac_avg": 0.1833910048007965, |
| "val/num_eos_tokens": 5, |
| "val/ratio": 1.0177733898162842, |
| "val/ratio_var": 0.000167887206771411 |
| }, |
| { |
| "episode": 80, |
| "epoch": 0.1, |
| "eps": 0, |
| "loss/policy_avg": -0.018979491665959358, |
| "loss/value_avg": 0.5691134333610535, |
| "lr": 4.55e-05, |
| "objective/entropy": 34.5059814453125, |
| "objective/kl": 27.583213806152344, |
| "objective/non_score_reward": -1.3791606426239014, |
| "objective/rlhf_reward": -8.178009033203125, |
| "objective/scores": -6.7988481521606445, |
| "policy/approxkl_avg": 0.01339983381330967, |
| "policy/clipfrac_avg": 0.08974358439445496, |
| "policy/entropy_avg": 1.328680157661438, |
| "step": 10, |
| "val/clipfrac_avg": 0.45820188522338867, |
| "val/num_eos_tokens": 6, |
| "val/ratio": 1.0151302814483643, |
| "val/ratio_var": 0.00030882572173140943 |
| }, |
| { |
| "episode": 88, |
| "epoch": 0.11, |
| "eps": 0, |
| "loss/policy_avg": -0.01908348686993122, |
| "loss/value_avg": 0.7630579471588135, |
| "lr": 4.5e-05, |
| "objective/entropy": 25.90598487854004, |
| "objective/kl": 21.881332397460938, |
| "objective/non_score_reward": -1.0940666198730469, |
| "objective/rlhf_reward": -7.77611780166626, |
| "objective/scores": -6.682051181793213, |
| "policy/approxkl_avg": 0.009299904108047485, |
| "policy/clipfrac_avg": 0.08960843086242676, |
| "policy/entropy_avg": 1.1157985925674438, |
| "step": 11, |
| "val/clipfrac_avg": 0.1889880895614624, |
| "val/num_eos_tokens": 5, |
| "val/ratio": 1.0160670280456543, |
| "val/ratio_var": 0.00019202969269827008 |
| }, |
| { |
| "episode": 96, |
| "epoch": 0.12, |
| "eps": 0, |
| "loss/policy_avg": -0.007455375976860523, |
| "loss/value_avg": 0.11711454391479492, |
| "lr": 4.4500000000000004e-05, |
| "objective/entropy": 4.318185806274414, |
| "objective/kl": 25.807804107666016, |
| "objective/non_score_reward": -1.2903902530670166, |
| "objective/rlhf_reward": -5.789934158325195, |
| "objective/scores": -4.4995436668396, |
| "policy/approxkl_avg": 0.0064476775005459785, |
| "policy/clipfrac_avg": 0.0356217622756958, |
| "policy/entropy_avg": 0.33628159761428833, |
| "step": 12, |
| "val/clipfrac_avg": 0.27583980560302734, |
| "val/num_eos_tokens": 1, |
| "val/ratio": 1.0104122161865234, |
| "val/ratio_var": 1.4566742720489856e-05 |
| }, |
| { |
| "episode": 104, |
| "epoch": 0.13, |
| "eps": 0, |
| "loss/policy_avg": -0.0054306937381625175, |
| "loss/value_avg": 0.6040916442871094, |
| "lr": 4.4000000000000006e-05, |
| "objective/entropy": 4.192093372344971, |
| "objective/kl": 24.524070739746094, |
| "objective/non_score_reward": -1.226203441619873, |
| "objective/rlhf_reward": -7.248112201690674, |
| "objective/scores": -6.021908760070801, |
| "policy/approxkl_avg": 0.013138508424162865, |
| "policy/clipfrac_avg": 0.03693181648850441, |
| "policy/entropy_avg": 0.5951633453369141, |
| "step": 13, |
| "val/clipfrac_avg": 0.2570621371269226, |
| "val/num_eos_tokens": 2, |
| "val/ratio": 0.9956222772598267, |
| "val/ratio_var": 1.213013911183225e-05 |
| }, |
| { |
| "episode": 112, |
| "epoch": 0.14, |
| "eps": 0, |
| "loss/policy_avg": -0.016900666058063507, |
| "loss/value_avg": 0.7295329570770264, |
| "lr": 4.35e-05, |
| "objective/entropy": 13.571832656860352, |
| "objective/kl": 25.062355041503906, |
| "objective/non_score_reward": -1.2531176805496216, |
| "objective/rlhf_reward": -5.660390377044678, |
| "objective/scores": -4.407272815704346, |
| "policy/approxkl_avg": 0.010249488987028599, |
| "policy/clipfrac_avg": 0.037146225571632385, |
| "policy/entropy_avg": 0.2530284523963928, |
| "step": 14, |
| "val/clipfrac_avg": 0.26238206028938293, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0186505317687988, |
| "val/ratio_var": 0.00026969113969244063 |
| }, |
| { |
| "episode": 120, |
| "epoch": 0.15, |
| "eps": 0, |
| "loss/policy_avg": -0.0026563892606645823, |
| "loss/value_avg": 0.13339246809482574, |
| "lr": 4.3e-05, |
| "objective/entropy": 2.422701358795166, |
| "objective/kl": 19.802749633789062, |
| "objective/non_score_reward": -0.99013751745224, |
| "objective/rlhf_reward": -6.004833698272705, |
| "objective/scores": -5.01469612121582, |
| "policy/approxkl_avg": 0.003422472393140197, |
| "policy/clipfrac_avg": 0.009043928235769272, |
| "policy/entropy_avg": 0.2299969494342804, |
| "step": 15, |
| "val/clipfrac_avg": 0.10180411487817764, |
| "val/num_eos_tokens": 1, |
| "val/ratio": 1.0032926797866821, |
| "val/ratio_var": 1.6620078895357437e-05 |
| }, |
| { |
| "episode": 128, |
| "epoch": 0.16, |
| "eps": 0, |
| "loss/policy_avg": -0.04060552269220352, |
| "loss/value_avg": 0.30010226368904114, |
| "lr": 4.25e-05, |
| "objective/entropy": 41.43886184692383, |
| "objective/kl": 33.01524353027344, |
| "objective/non_score_reward": -1.6507622003555298, |
| "objective/rlhf_reward": -6.505797386169434, |
| "objective/scores": -4.855035305023193, |
| "policy/approxkl_avg": 0.04338652640581131, |
| "policy/clipfrac_avg": 0.09436275064945221, |
| "policy/entropy_avg": 0.8455747961997986, |
| "step": 16, |
| "val/clipfrac_avg": 0.07823961228132248, |
| "val/num_eos_tokens": 1, |
| "val/ratio": 0.9889528751373291, |
| "val/ratio_var": 2.141590630344581e-05 |
| }, |
| { |
| "episode": 136, |
| "epoch": 0.17, |
| "eps": 0, |
| "loss/policy_avg": -0.000526640797033906, |
| "loss/value_avg": 0.16104120016098022, |
| "lr": 4.2e-05, |
| "objective/entropy": 0.7853485345840454, |
| "objective/kl": 22.7672176361084, |
| "objective/non_score_reward": -1.138360857963562, |
| "objective/rlhf_reward": -5.805202484130859, |
| "objective/scores": -4.666841506958008, |
| "policy/approxkl_avg": 0.00014227806241251528, |
| "policy/clipfrac_avg": 0.001179245300590992, |
| "policy/entropy_avg": 0.011943262070417404, |
| "step": 17, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0015124082565308, |
| "val/ratio_var": 6.323739398794714e-07 |
| }, |
| { |
| "episode": 144, |
| "epoch": 0.18, |
| "eps": 0, |
| "loss/policy_avg": -0.0006135053699836135, |
| "loss/value_avg": 0.06096681207418442, |
| "lr": 4.15e-05, |
| "objective/entropy": 0.09207843244075775, |
| "objective/kl": 22.14600944519043, |
| "objective/non_score_reward": -1.1073005199432373, |
| "objective/rlhf_reward": -5.784175872802734, |
| "objective/scores": -4.676875591278076, |
| "policy/approxkl_avg": 1.3936476534581743e-05, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.00949782133102417, |
| "step": 18, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.000463843345642, |
| "val/ratio_var": 2.575906987711818e-10 |
| }, |
| { |
| "episode": 152, |
| "epoch": 0.19, |
| "eps": 0, |
| "loss/policy_avg": -0.0019343766616657376, |
| "loss/value_avg": 0.1923561841249466, |
| "lr": 4.1e-05, |
| "objective/entropy": 0.16122427582740784, |
| "objective/kl": 22.323423385620117, |
| "objective/non_score_reward": -1.1161713600158691, |
| "objective/rlhf_reward": -6.342621326446533, |
| "objective/scores": -5.226449966430664, |
| "policy/approxkl_avg": 0.00015607287059538066, |
| "policy/clipfrac_avg": 0.004716981202363968, |
| "policy/entropy_avg": 0.010668131522834301, |
| "step": 19, |
| "val/clipfrac_avg": 0.2299528270959854, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.001634120941162, |
| "val/ratio_var": 1.4890058075422985e-08 |
| }, |
| { |
| "episode": 160, |
| "epoch": 0.2, |
| "eps": 0, |
| "loss/policy_avg": -0.004725292790681124, |
| "loss/value_avg": 0.16123607754707336, |
| "lr": 4.05e-05, |
| "objective/entropy": 8.322664260864258, |
| "objective/kl": 24.683185577392578, |
| "objective/non_score_reward": -1.2341593503952026, |
| "objective/rlhf_reward": -5.897225379943848, |
| "objective/scores": -4.6630659103393555, |
| "policy/approxkl_avg": 0.009079387411475182, |
| "policy/clipfrac_avg": 0.010613207705318928, |
| "policy/entropy_avg": 0.029525920748710632, |
| "step": 20, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.020014762878418, |
| "val/ratio_var": 0.0007286081090569496 |
| }, |
| { |
| "episode": 168, |
| "epoch": 0.21, |
| "eps": 0, |
| "loss/policy_avg": 1.005048216029536e-05, |
| "loss/value_avg": 0.10769544541835785, |
| "lr": 4e-05, |
| "objective/entropy": 0.6765559911727905, |
| "objective/kl": 21.308088302612305, |
| "objective/non_score_reward": -1.0654044151306152, |
| "objective/rlhf_reward": -5.437287330627441, |
| "objective/scores": -4.371882915496826, |
| "policy/approxkl_avg": 4.5833239710191265e-05, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.004317480139434338, |
| "step": 21, |
| "val/clipfrac_avg": 0.08490566164255142, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9998916387557983, |
| "val/ratio_var": 4.1950055873485326e-08 |
| }, |
| { |
| "episode": 176, |
| "epoch": 0.22, |
| "eps": 0, |
| "loss/policy_avg": -0.002242459449917078, |
| "loss/value_avg": 0.08367361128330231, |
| "lr": 3.9500000000000005e-05, |
| "objective/entropy": 11.297447204589844, |
| "objective/kl": 20.96246337890625, |
| "objective/non_score_reward": -1.0481232404708862, |
| "objective/rlhf_reward": -5.453763484954834, |
| "objective/scores": -4.405640125274658, |
| "policy/approxkl_avg": 0.008936449885368347, |
| "policy/clipfrac_avg": 0.009124087169766426, |
| "policy/entropy_avg": 0.09280973672866821, |
| "step": 22, |
| "val/clipfrac_avg": 0.0024271844886243343, |
| "val/num_eos_tokens": 1, |
| "val/ratio": 0.9998486638069153, |
| "val/ratio_var": 1.0913846608673339e-06 |
| }, |
| { |
| "episode": 184, |
| "epoch": 0.23, |
| "eps": 0, |
| "loss/policy_avg": -2.541125832067337e-05, |
| "loss/value_avg": 0.04419855773448944, |
| "lr": 3.9000000000000006e-05, |
| "objective/entropy": 0.021149635314941406, |
| "objective/kl": 21.688682556152344, |
| "objective/non_score_reward": -1.0844340324401855, |
| "objective/rlhf_reward": -5.5749287605285645, |
| "objective/scores": -4.490494728088379, |
| "policy/approxkl_avg": 3.073716925428016e-07, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.0026977781672030687, |
| "step": 23, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0000680685043335, |
| "val/ratio_var": 6.224354365258478e-12 |
| }, |
| { |
| "episode": 192, |
| "epoch": 0.24, |
| "eps": 0, |
| "loss/policy_avg": -5.5702646932331845e-05, |
| "loss/value_avg": 0.12123075127601624, |
| "lr": 3.85e-05, |
| "objective/entropy": 0.02785348892211914, |
| "objective/kl": 21.36163330078125, |
| "objective/non_score_reward": -1.0680818557739258, |
| "objective/rlhf_reward": -5.926137924194336, |
| "objective/scores": -4.85805606842041, |
| "policy/approxkl_avg": 1.9325676703374484e-07, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.003609274746850133, |
| "step": 24, |
| "val/clipfrac_avg": 0.11674527823925018, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0000431537628174, |
| "val/ratio_var": 7.615597041876754e-11 |
| }, |
| { |
| "episode": 200, |
| "epoch": 0.25, |
| "eps": 0, |
| "loss/policy_avg": 1.5368334061349742e-05, |
| "loss/value_avg": 0.10563156753778458, |
| "lr": 3.8e-05, |
| "objective/entropy": 0.03280210494995117, |
| "objective/kl": 21.531850814819336, |
| "objective/non_score_reward": -1.0765926837921143, |
| "objective/rlhf_reward": -5.382946968078613, |
| "objective/scores": -4.306354522705078, |
| "policy/approxkl_avg": 4.6442153234238504e-07, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.004041329957544804, |
| "step": 25, |
| "val/clipfrac_avg": 0.08726415038108826, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0000438690185547, |
| "val/ratio_var": 7.789253686496522e-10 |
| }, |
| { |
| "episode": 208, |
| "epoch": 0.26, |
| "eps": 0, |
| "loss/policy_avg": -0.0069130496121943, |
| "loss/value_avg": 0.049083612859249115, |
| "lr": 3.7500000000000003e-05, |
| "objective/entropy": -0.11079001426696777, |
| "objective/kl": 18.664615631103516, |
| "objective/non_score_reward": -0.9332309365272522, |
| "objective/rlhf_reward": -5.723866939544678, |
| "objective/scores": -4.79063606262207, |
| "policy/approxkl_avg": 0.008058368228375912, |
| "policy/clipfrac_avg": 0.009842519648373127, |
| "policy/entropy_avg": 0.20962515473365784, |
| "step": 26, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 1, |
| "val/ratio": 0.9940447807312012, |
| "val/ratio_var": 1.0320742148905993e-05 |
| }, |
| { |
| "episode": 216, |
| "epoch": 0.27, |
| "eps": 0, |
| "loss/policy_avg": -0.019339777529239655, |
| "loss/value_avg": 0.20152565836906433, |
| "lr": 3.7e-05, |
| "objective/entropy": 19.612171173095703, |
| "objective/kl": 19.863204956054688, |
| "objective/non_score_reward": -0.9931602478027344, |
| "objective/rlhf_reward": -5.119001388549805, |
| "objective/scores": -4.12584114074707, |
| "policy/approxkl_avg": 0.0681137889623642, |
| "policy/clipfrac_avg": 0.028301885351538658, |
| "policy/entropy_avg": 0.12574249505996704, |
| "step": 27, |
| "val/clipfrac_avg": 0.19929245114326477, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.4276061058044434, |
| "val/ratio_var": 0.5271331667900085 |
| }, |
| { |
| "episode": 224, |
| "epoch": 0.28, |
| "eps": 0, |
| "loss/policy_avg": -0.003817332675680518, |
| "loss/value_avg": 0.1855505108833313, |
| "lr": 3.65e-05, |
| "objective/entropy": 17.2297306060791, |
| "objective/kl": 24.35909652709961, |
| "objective/non_score_reward": -1.2179547548294067, |
| "objective/rlhf_reward": -6.254249572753906, |
| "objective/scores": -5.036294937133789, |
| "policy/approxkl_avg": 0.07349927723407745, |
| "policy/clipfrac_avg": 0.011848341673612595, |
| "policy/entropy_avg": 0.03583555296063423, |
| "step": 28, |
| "val/clipfrac_avg": 0.18853428959846497, |
| "val/num_eos_tokens": 1, |
| "val/ratio": 0.9938024878501892, |
| "val/ratio_var": 1.8159549654228613e-05 |
| }, |
| { |
| "episode": 232, |
| "epoch": 0.29, |
| "eps": 0, |
| "loss/policy_avg": -0.0002830463636200875, |
| "loss/value_avg": 0.29814815521240234, |
| "lr": 3.6e-05, |
| "objective/entropy": 1.003382682800293, |
| "objective/kl": 22.782087326049805, |
| "objective/non_score_reward": -1.1391044855117798, |
| "objective/rlhf_reward": -6.658185958862305, |
| "objective/scores": -5.5190815925598145, |
| "policy/approxkl_avg": 5.4168755013961345e-05, |
| "policy/clipfrac_avg": 0.000589622650295496, |
| "policy/entropy_avg": 0.00883533339947462, |
| "step": 29, |
| "val/clipfrac_avg": 0.14799527823925018, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.000295639038086, |
| "val/ratio_var": 4.9585271000296416e-08 |
| }, |
| { |
| "episode": 240, |
| "epoch": 0.3, |
| "eps": 0, |
| "loss/policy_avg": 4.81777751701884e-05, |
| "loss/value_avg": 0.04622498154640198, |
| "lr": 3.55e-05, |
| "objective/entropy": 0.02696990966796875, |
| "objective/kl": 20.711894989013672, |
| "objective/non_score_reward": -1.0355949401855469, |
| "objective/rlhf_reward": -5.1848955154418945, |
| "objective/scores": -4.149300575256348, |
| "policy/approxkl_avg": 4.969482461092412e-07, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.002766390796750784, |
| "step": 30, |
| "val/clipfrac_avg": 0.1320754736661911, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0001327991485596, |
| "val/ratio_var": 7.744499264106253e-09 |
| }, |
| { |
| "episode": 248, |
| "epoch": 0.31, |
| "eps": 0, |
| "loss/policy_avg": -9.446142030355986e-06, |
| "loss/value_avg": 0.03906877338886261, |
| "lr": 3.5e-05, |
| "objective/entropy": 0.018702030181884766, |
| "objective/kl": 20.70791244506836, |
| "objective/non_score_reward": -1.0353957414627075, |
| "objective/rlhf_reward": -5.228009223937988, |
| "objective/scores": -4.19261360168457, |
| "policy/approxkl_avg": 7.734050200269849e-08, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.0023581646382808685, |
| "step": 31, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0000567436218262, |
| "val/ratio_var": 8.75753414231184e-10 |
| }, |
| { |
| "episode": 256, |
| "epoch": 0.32, |
| "eps": 0, |
| "loss/policy_avg": -0.0007870738045312464, |
| "loss/value_avg": 0.04817962646484375, |
| "lr": 3.45e-05, |
| "objective/entropy": 7.295212745666504, |
| "objective/kl": 20.889162063598633, |
| "objective/non_score_reward": -1.0444581508636475, |
| "objective/rlhf_reward": -5.3556060791015625, |
| "objective/scores": -4.311148166656494, |
| "policy/approxkl_avg": 0.03791780769824982, |
| "policy/clipfrac_avg": 0.001768867950886488, |
| "policy/entropy_avg": 0.0018975285347551107, |
| "step": 32, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 5.218703269958496, |
| "val/ratio_var": 57.09228515625 |
| }, |
| { |
| "episode": 264, |
| "epoch": 0.33, |
| "eps": 0, |
| "loss/policy_avg": -1.0527459380682558e-05, |
| "loss/value_avg": 0.04428659379482269, |
| "lr": 3.4000000000000007e-05, |
| "objective/entropy": 0.011212348937988281, |
| "objective/kl": 21.30687141418457, |
| "objective/non_score_reward": -1.0653434991836548, |
| "objective/rlhf_reward": -5.890254497528076, |
| "objective/scores": -4.824911117553711, |
| "policy/approxkl_avg": 2.762976292203234e-09, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.0016911793500185013, |
| "step": 33, |
| "val/clipfrac_avg": 0.12028302252292633, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0000114440917969, |
| "val/ratio_var": 3.550345087366402e-11 |
| }, |
| { |
| "episode": 272, |
| "epoch": 0.34, |
| "eps": 0, |
| "loss/policy_avg": -0.0006948995869606733, |
| "loss/value_avg": 0.32256680727005005, |
| "lr": 3.35e-05, |
| "objective/entropy": 16.441970825195312, |
| "objective/kl": 19.257633209228516, |
| "objective/non_score_reward": -0.9628816843032837, |
| "objective/rlhf_reward": -5.559625148773193, |
| "objective/scores": -4.596743583679199, |
| "policy/approxkl_avg": 0.010671013966202736, |
| "policy/clipfrac_avg": 0.004716981202363968, |
| "policy/entropy_avg": 0.0016686677699908614, |
| "step": 34, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.067878246307373, |
| "val/ratio_var": 0.008366185240447521 |
| }, |
| { |
| "episode": 280, |
| "epoch": 0.35, |
| "eps": 0, |
| "loss/policy_avg": -9.87138719210634e-06, |
| "loss/value_avg": 0.09626642614603043, |
| "lr": 3.3e-05, |
| "objective/entropy": 0.011440753936767578, |
| "objective/kl": 22.810344696044922, |
| "objective/non_score_reward": -1.1405171155929565, |
| "objective/rlhf_reward": -6.108673095703125, |
| "objective/scores": -4.968155860900879, |
| "policy/approxkl_avg": 2.907630047843668e-08, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.0016481179045513272, |
| "step": 35, |
| "val/clipfrac_avg": 0.13148584961891174, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.000030517578125, |
| "val/ratio_var": 7.212008767965017e-11 |
| }, |
| { |
| "episode": 288, |
| "epoch": 0.36, |
| "eps": 0, |
| "loss/policy_avg": -0.00041965150739997625, |
| "loss/value_avg": 0.34581074118614197, |
| "lr": 3.2500000000000004e-05, |
| "objective/entropy": 5.809754848480225, |
| "objective/kl": 19.50436782836914, |
| "objective/non_score_reward": -0.9752184152603149, |
| "objective/rlhf_reward": -6.9395976066589355, |
| "objective/scores": -5.96437931060791, |
| "policy/approxkl_avg": 0.0018271624576300383, |
| "policy/clipfrac_avg": 0.001768867950886488, |
| "policy/entropy_avg": 0.00134533760137856, |
| "step": 36, |
| "val/clipfrac_avg": 0.21049527823925018, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9987656474113464, |
| "val/ratio_var": 7.995101896085544e-07 |
| }, |
| { |
| "episode": 296, |
| "epoch": 0.37, |
| "eps": 0, |
| "loss/policy_avg": -0.0005644646007567644, |
| "loss/value_avg": 0.03859845548868179, |
| "lr": 3.2000000000000005e-05, |
| "objective/entropy": 10.471796035766602, |
| "objective/kl": 24.136667251586914, |
| "objective/non_score_reward": -1.2068334817886353, |
| "objective/rlhf_reward": -6.471621990203857, |
| "objective/scores": -5.264788627624512, |
| "policy/approxkl_avg": 0.003387878183275461, |
| "policy/clipfrac_avg": 0.002358490601181984, |
| "policy/entropy_avg": 0.001884155673906207, |
| "step": 37, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.011574387550354, |
| "val/ratio_var": 0.0001694993843557313 |
| }, |
| { |
| "episode": 304, |
| "epoch": 0.38, |
| "eps": 0, |
| "loss/policy_avg": 1.0252632591800648e-06, |
| "loss/value_avg": 0.12985283136367798, |
| "lr": 3.15e-05, |
| "objective/entropy": 0.009371757507324219, |
| "objective/kl": 22.376888275146484, |
| "objective/non_score_reward": -1.1188445091247559, |
| "objective/rlhf_reward": -5.919074058532715, |
| "objective/scores": -4.800229549407959, |
| "policy/approxkl_avg": 3.2507914138335536e-09, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.0015507801435887814, |
| "step": 38, |
| "val/clipfrac_avg": 0.09669811278581619, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9999986886978149, |
| "val/ratio_var": 4.625277938430372e-11 |
| }, |
| { |
| "episode": 312, |
| "epoch": 0.39, |
| "eps": 0, |
| "loss/policy_avg": -0.0002961930294986814, |
| "loss/value_avg": 0.22089466452598572, |
| "lr": 3.1e-05, |
| "objective/entropy": 7.122612476348877, |
| "objective/kl": 16.583417892456055, |
| "objective/non_score_reward": -0.8291708827018738, |
| "objective/rlhf_reward": -5.8148112297058105, |
| "objective/scores": -4.985640525817871, |
| "policy/approxkl_avg": 0.001451514894142747, |
| "policy/clipfrac_avg": 0.001768867950886488, |
| "policy/entropy_avg": 0.001556121977046132, |
| "step": 39, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0047032833099365, |
| "val/ratio_var": 2.341072104172781e-05 |
| }, |
| { |
| "episode": 320, |
| "epoch": 0.4, |
| "eps": 0, |
| "loss/policy_avg": -0.0014894630294293165, |
| "loss/value_avg": 0.12523594498634338, |
| "lr": 3.05e-05, |
| "objective/entropy": 7.267007827758789, |
| "objective/kl": 18.883316040039062, |
| "objective/non_score_reward": -0.944165825843811, |
| "objective/rlhf_reward": -5.483607769012451, |
| "objective/scores": -4.53944206237793, |
| "policy/approxkl_avg": 0.026282615959644318, |
| "policy/clipfrac_avg": 0.00294811325147748, |
| "policy/entropy_avg": 0.0025057026650756598, |
| "step": 40, |
| "val/clipfrac_avg": 0.2045990526676178, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 2.0079188346862793, |
| "val/ratio_var": 2.947521686553955 |
| }, |
| { |
| "episode": 328, |
| "epoch": 0.41, |
| "eps": 0, |
| "loss/policy_avg": 1.742775202728808e-05, |
| "loss/value_avg": 0.08753944933414459, |
| "lr": 3e-05, |
| "objective/entropy": 0.014935731887817383, |
| "objective/kl": 21.758373260498047, |
| "objective/non_score_reward": -1.0879186391830444, |
| "objective/rlhf_reward": -5.788181781768799, |
| "objective/scores": -4.700263023376465, |
| "policy/approxkl_avg": 8.582184563010742e-08, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.002552807331085205, |
| "step": 41, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.999954879283905, |
| "val/ratio_var": 1.7294539134127263e-09 |
| }, |
| { |
| "episode": 336, |
| "epoch": 0.42, |
| "eps": 0, |
| "loss/policy_avg": 3.205050006727106e-06, |
| "loss/value_avg": 0.1433621644973755, |
| "lr": 2.95e-05, |
| "objective/entropy": 0.0236055850982666, |
| "objective/kl": 22.17743492126465, |
| "objective/non_score_reward": -1.1088719367980957, |
| "objective/rlhf_reward": -5.329220294952393, |
| "objective/scores": -4.220348358154297, |
| "policy/approxkl_avg": 6.920163997392592e-08, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.0035035195760428905, |
| "step": 42, |
| "val/clipfrac_avg": 0.26768869161605835, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9999628067016602, |
| "val/ratio_var": 1.2399977711297083e-09 |
| }, |
| { |
| "episode": 344, |
| "epoch": 0.43, |
| "eps": 0, |
| "loss/policy_avg": -0.0001469778799219057, |
| "loss/value_avg": 0.06910190731287003, |
| "lr": 2.9e-05, |
| "objective/entropy": 4.376821517944336, |
| "objective/kl": 23.600093841552734, |
| "objective/non_score_reward": -1.1800047159194946, |
| "objective/rlhf_reward": -5.795012950897217, |
| "objective/scores": -4.615008354187012, |
| "policy/approxkl_avg": 0.0004178693052381277, |
| "policy/clipfrac_avg": 0.001179245300590992, |
| "policy/entropy_avg": 0.004530872218310833, |
| "step": 43, |
| "val/clipfrac_avg": 0.05896226316690445, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9992430806159973, |
| "val/ratio_var": 4.377442337499815e-07 |
| }, |
| { |
| "episode": 352, |
| "epoch": 0.44, |
| "eps": 0, |
| "loss/policy_avg": -2.1136806026333943e-05, |
| "loss/value_avg": 0.17057329416275024, |
| "lr": 2.8499999999999998e-05, |
| "objective/entropy": 0.029170513153076172, |
| "objective/kl": 23.519248962402344, |
| "objective/non_score_reward": -1.1759625673294067, |
| "objective/rlhf_reward": -6.7914838790893555, |
| "objective/scores": -5.615521430969238, |
| "policy/approxkl_avg": 3.24759383829587e-08, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.003596365451812744, |
| "step": 44, |
| "val/clipfrac_avg": 0.18278300762176514, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0000362396240234, |
| "val/ratio_var": 6.971466182115194e-10 |
| }, |
| { |
| "episode": 360, |
| "epoch": 0.45, |
| "eps": 0, |
| "loss/policy_avg": 9.945049896487035e-06, |
| "loss/value_avg": 0.07444664090871811, |
| "lr": 2.8000000000000003e-05, |
| "objective/entropy": 0.026226326823234558, |
| "objective/kl": 20.804168701171875, |
| "objective/non_score_reward": -1.0402084589004517, |
| "objective/rlhf_reward": -5.101940631866455, |
| "objective/scores": -4.061732292175293, |
| "policy/approxkl_avg": 1.7325481493912775e-08, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.0033669061958789825, |
| "step": 45, |
| "val/clipfrac_avg": 0.09021226316690445, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0000312328338623, |
| "val/ratio_var": 1.461017973269918e-10 |
| }, |
| { |
| "episode": 368, |
| "epoch": 0.46, |
| "eps": 0, |
| "loss/policy_avg": -2.804990072036162e-05, |
| "loss/value_avg": 0.1255272775888443, |
| "lr": 2.7500000000000004e-05, |
| "objective/entropy": 0.615976870059967, |
| "objective/kl": 21.274887084960938, |
| "objective/non_score_reward": -1.063744306564331, |
| "objective/rlhf_reward": -5.944283485412598, |
| "objective/scores": -4.8805389404296875, |
| "policy/approxkl_avg": 7.317021663766354e-05, |
| "policy/clipfrac_avg": 0.001179245300590992, |
| "policy/entropy_avg": 0.005477185361087322, |
| "step": 46, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0005582571029663, |
| "val/ratio_var": 2.227838393764614e-07 |
| }, |
| { |
| "episode": 376, |
| "epoch": 0.47, |
| "eps": 0, |
| "loss/policy_avg": -0.0003541487967595458, |
| "loss/value_avg": 0.10688911378383636, |
| "lr": 2.7000000000000002e-05, |
| "objective/entropy": 5.619821548461914, |
| "objective/kl": 21.989046096801758, |
| "objective/non_score_reward": -1.0994524955749512, |
| "objective/rlhf_reward": -5.335377216339111, |
| "objective/scores": -4.23592472076416, |
| "policy/approxkl_avg": 0.0005093185463920236, |
| "policy/clipfrac_avg": 0.00294811325147748, |
| "policy/entropy_avg": 0.004899248480796814, |
| "step": 47, |
| "val/clipfrac_avg": 0.002358490601181984, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9987145662307739, |
| "val/ratio_var": 1.1634953125394532e-06 |
| }, |
| { |
| "episode": 384, |
| "epoch": 0.48, |
| "eps": 0, |
| "loss/policy_avg": -0.00010589355952106416, |
| "loss/value_avg": 0.178936168551445, |
| "lr": 2.6500000000000004e-05, |
| "objective/entropy": 6.092685699462891, |
| "objective/kl": 23.867069244384766, |
| "objective/non_score_reward": -1.1933534145355225, |
| "objective/rlhf_reward": -6.364969253540039, |
| "objective/scores": -5.1716156005859375, |
| "policy/approxkl_avg": 0.0002781845105346292, |
| "policy/clipfrac_avg": 0.001179245300590992, |
| "policy/entropy_avg": 0.004049578681588173, |
| "step": 48, |
| "val/clipfrac_avg": 0.036556605249643326, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0012478828430176, |
| "val/ratio_var": 1.7213611727129319e-06 |
| }, |
| { |
| "episode": 392, |
| "epoch": 0.49, |
| "eps": 0, |
| "loss/policy_avg": -0.0019225918222218752, |
| "loss/value_avg": 0.2773953974246979, |
| "lr": 2.6000000000000002e-05, |
| "objective/entropy": 12.097511291503906, |
| "objective/kl": 24.415565490722656, |
| "objective/non_score_reward": -1.220778465270996, |
| "objective/rlhf_reward": -7.17116641998291, |
| "objective/scores": -5.950387954711914, |
| "policy/approxkl_avg": 0.012523818761110306, |
| "policy/clipfrac_avg": 0.003537735901772976, |
| "policy/entropy_avg": 0.009605629369616508, |
| "step": 49, |
| "val/clipfrac_avg": 0.24174529314041138, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.999590277671814, |
| "val/ratio_var": 6.160067300697847e-07 |
| }, |
| { |
| "episode": 400, |
| "epoch": 0.5, |
| "eps": 0, |
| "loss/policy_avg": -8.95777702680789e-06, |
| "loss/value_avg": 0.09243077039718628, |
| "lr": 2.5500000000000003e-05, |
| "objective/entropy": 0.029079481959342957, |
| "objective/kl": 21.414337158203125, |
| "objective/non_score_reward": -1.0707169771194458, |
| "objective/rlhf_reward": -5.165630340576172, |
| "objective/scores": -4.094913482666016, |
| "policy/approxkl_avg": 3.28439213603815e-08, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.00365700526162982, |
| "step": 50, |
| "val/clipfrac_avg": 0.17216980457305908, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9999852776527405, |
| "val/ratio_var": 6.366533966684074e-10 |
| }, |
| { |
| "episode": 408, |
| "epoch": 0.51, |
| "eps": 0, |
| "loss/policy_avg": -0.0005268001696094871, |
| "loss/value_avg": 0.3501094579696655, |
| "lr": 2.5e-05, |
| "objective/entropy": 7.087305545806885, |
| "objective/kl": 19.910663604736328, |
| "objective/non_score_reward": -0.9955330491065979, |
| "objective/rlhf_reward": -6.61014461517334, |
| "objective/scores": -5.614611625671387, |
| "policy/approxkl_avg": 0.0009080818854272366, |
| "policy/clipfrac_avg": 0.005306603852659464, |
| "policy/entropy_avg": 0.0055799842812120914, |
| "step": 51, |
| "val/clipfrac_avg": 0.1350235939025879, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0038042068481445, |
| "val/ratio_var": 1.224496190843638e-05 |
| }, |
| { |
| "episode": 416, |
| "epoch": 0.52, |
| "eps": 0, |
| "loss/policy_avg": -2.6310954126529396e-05, |
| "loss/value_avg": 0.15892276167869568, |
| "lr": 2.45e-05, |
| "objective/entropy": 0.056726157665252686, |
| "objective/kl": 22.944862365722656, |
| "objective/non_score_reward": -1.1472431421279907, |
| "objective/rlhf_reward": -5.8535990715026855, |
| "objective/scores": -4.706356048583984, |
| "policy/approxkl_avg": 1.523222181276651e-06, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.006282071582973003, |
| "step": 52, |
| "val/clipfrac_avg": 0.18278300762176514, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.999813437461853, |
| "val/ratio_var": 2.821833788857475e-08 |
| }, |
| { |
| "episode": 424, |
| "epoch": 0.53, |
| "eps": 0, |
| "loss/policy_avg": 6.494516128441319e-06, |
| "loss/value_avg": 0.058799147605895996, |
| "lr": 2.4e-05, |
| "objective/entropy": 0.09326425194740295, |
| "objective/kl": 21.923828125, |
| "objective/non_score_reward": -1.0961912870407104, |
| "objective/rlhf_reward": -5.75281286239624, |
| "objective/scores": -4.65662145614624, |
| "policy/approxkl_avg": 2.059743565041572e-06, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.00874295923858881, |
| "step": 53, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9997859597206116, |
| "val/ratio_var": 3.57754323943027e-08 |
| }, |
| { |
| "episode": 432, |
| "epoch": 0.54, |
| "eps": 0, |
| "loss/policy_avg": -2.164826037187595e-05, |
| "loss/value_avg": 0.04323159158229828, |
| "lr": 2.35e-05, |
| "objective/entropy": 0.11925557255744934, |
| "objective/kl": 21.405750274658203, |
| "objective/non_score_reward": -1.0702874660491943, |
| "objective/rlhf_reward": -5.812557220458984, |
| "objective/scores": -4.742269515991211, |
| "policy/approxkl_avg": 1.437954608718428e-07, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.009416119195520878, |
| "step": 54, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0000391006469727, |
| "val/ratio_var": 5.505272238082171e-09 |
| }, |
| { |
| "episode": 440, |
| "epoch": 0.55, |
| "eps": 0, |
| "loss/policy_avg": -0.0003278666699770838, |
| "loss/value_avg": 0.0747484341263771, |
| "lr": 2.3000000000000003e-05, |
| "objective/entropy": 5.897934913635254, |
| "objective/kl": 22.183162689208984, |
| "objective/non_score_reward": -1.1091580390930176, |
| "objective/rlhf_reward": -5.195685863494873, |
| "objective/scores": -4.0865278244018555, |
| "policy/approxkl_avg": 0.0002924558939412236, |
| "policy/clipfrac_avg": 0.001768867950886488, |
| "policy/entropy_avg": 0.011092341504991055, |
| "step": 55, |
| "val/clipfrac_avg": 0.08195754885673523, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0012880563735962, |
| "val/ratio_var": 1.8590358195069712e-06 |
| }, |
| { |
| "episode": 448, |
| "epoch": 0.56, |
| "eps": 0, |
| "loss/policy_avg": -2.5927633032551967e-05, |
| "loss/value_avg": 0.1024797111749649, |
| "lr": 2.25e-05, |
| "objective/entropy": 0.43828973174095154, |
| "objective/kl": 21.43529510498047, |
| "objective/non_score_reward": -1.0717647075653076, |
| "objective/rlhf_reward": -5.657342910766602, |
| "objective/scores": -4.585577964782715, |
| "policy/approxkl_avg": 0.0002615810662973672, |
| "policy/clipfrac_avg": 0.001768867950886488, |
| "policy/entropy_avg": 0.015617892146110535, |
| "step": 56, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9997985363006592, |
| "val/ratio_var": 2.572847712656312e-08 |
| }, |
| { |
| "episode": 456, |
| "epoch": 0.57, |
| "eps": 0, |
| "loss/policy_avg": -0.0005400018999353051, |
| "loss/value_avg": 0.052224770188331604, |
| "lr": 2.2000000000000003e-05, |
| "objective/entropy": 0.4916686415672302, |
| "objective/kl": 21.881145477294922, |
| "objective/non_score_reward": -1.094057321548462, |
| "objective/rlhf_reward": -5.429174423217773, |
| "objective/scores": -4.335117340087891, |
| "policy/approxkl_avg": 0.0003721084212884307, |
| "policy/clipfrac_avg": 0.004127358552068472, |
| "policy/entropy_avg": 0.018060289323329926, |
| "step": 57, |
| "val/clipfrac_avg": 0.06191037595272064, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9988338947296143, |
| "val/ratio_var": 1.326149572378199e-06 |
| }, |
| { |
| "episode": 464, |
| "epoch": 0.58, |
| "eps": 0, |
| "loss/policy_avg": -0.0007685062009841204, |
| "loss/value_avg": 0.46044227480888367, |
| "lr": 2.15e-05, |
| "objective/entropy": 5.862210750579834, |
| "objective/kl": 24.761629104614258, |
| "objective/non_score_reward": -1.238081455230713, |
| "objective/rlhf_reward": -5.525996685028076, |
| "objective/scores": -4.287915229797363, |
| "policy/approxkl_avg": 0.0017531004268676043, |
| "policy/clipfrac_avg": 0.008254717104136944, |
| "policy/entropy_avg": 0.01944148726761341, |
| "step": 58, |
| "val/clipfrac_avg": 0.033608488738536835, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9990991353988647, |
| "val/ratio_var": 4.7575713324476965e-07 |
| }, |
| { |
| "episode": 472, |
| "epoch": 0.59, |
| "eps": 0, |
| "loss/policy_avg": -0.00046633119927719235, |
| "loss/value_avg": 0.18473884463310242, |
| "lr": 2.1e-05, |
| "objective/entropy": 1.0018765926361084, |
| "objective/kl": 23.490116119384766, |
| "objective/non_score_reward": -1.1745058298110962, |
| "objective/rlhf_reward": -4.755178451538086, |
| "objective/scores": -3.5806727409362793, |
| "policy/approxkl_avg": 0.00026144221192225814, |
| "policy/clipfrac_avg": 0.001768867950886488, |
| "policy/entropy_avg": 0.012994904071092606, |
| "step": 59, |
| "val/clipfrac_avg": 0.23231130838394165, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0006680488586426, |
| "val/ratio_var": 2.8671664153989695e-07 |
| }, |
| { |
| "episode": 480, |
| "epoch": 0.6, |
| "eps": 0, |
| "loss/policy_avg": -0.00043274153722450137, |
| "loss/value_avg": 0.08987745642662048, |
| "lr": 2.05e-05, |
| "objective/entropy": 0.7680976390838623, |
| "objective/kl": 24.150394439697266, |
| "objective/non_score_reward": -1.2075196504592896, |
| "objective/rlhf_reward": -4.8026533126831055, |
| "objective/scores": -3.5951337814331055, |
| "policy/approxkl_avg": 0.0007885855156928301, |
| "policy/clipfrac_avg": 0.001768867950886488, |
| "policy/entropy_avg": 0.007194040343165398, |
| "step": 60, |
| "val/clipfrac_avg": 0.13030660152435303, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9989591836929321, |
| "val/ratio_var": 6.779022214686847e-07 |
| }, |
| { |
| "episode": 488, |
| "epoch": 0.61, |
| "eps": 0, |
| "loss/policy_avg": -0.00015176059969235212, |
| "loss/value_avg": 0.05639723315834999, |
| "lr": 2e-05, |
| "objective/entropy": 0.042793840169906616, |
| "objective/kl": 24.719331741333008, |
| "objective/non_score_reward": -1.235966682434082, |
| "objective/rlhf_reward": -4.184081077575684, |
| "objective/scores": -2.9481143951416016, |
| "policy/approxkl_avg": 1.515730446044472e-06, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.0038767289370298386, |
| "step": 61, |
| "val/clipfrac_avg": 0.2895047068595886, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0002093315124512, |
| "val/ratio_var": 2.6440673650540703e-08 |
| }, |
| { |
| "episode": 496, |
| "epoch": 0.62, |
| "eps": 0, |
| "loss/policy_avg": -0.00039324647514149547, |
| "loss/value_avg": 0.11202812939882278, |
| "lr": 1.9500000000000003e-05, |
| "objective/entropy": 0.8435674905776978, |
| "objective/kl": 23.85810089111328, |
| "objective/non_score_reward": -1.192905068397522, |
| "objective/rlhf_reward": -4.872404098510742, |
| "objective/scores": -3.6794989109039307, |
| "policy/approxkl_avg": 9.81152625172399e-05, |
| "policy/clipfrac_avg": 0.001179245300590992, |
| "policy/entropy_avg": 0.004706778563559055, |
| "step": 62, |
| "val/clipfrac_avg": 0.08136792480945587, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0006957054138184, |
| "val/ratio_var": 5.388393446992268e-07 |
| }, |
| { |
| "episode": 504, |
| "epoch": 0.63, |
| "eps": 0, |
| "loss/policy_avg": -0.00048710883129388094, |
| "loss/value_avg": 0.3882809281349182, |
| "lr": 1.9e-05, |
| "objective/entropy": 6.33875036239624, |
| "objective/kl": 23.255828857421875, |
| "objective/non_score_reward": -1.1627914905548096, |
| "objective/rlhf_reward": -5.508538246154785, |
| "objective/scores": -4.345746994018555, |
| "policy/approxkl_avg": 0.0013963093515485525, |
| "policy/clipfrac_avg": 0.00294811325147748, |
| "policy/entropy_avg": 0.00336352176964283, |
| "step": 63, |
| "val/clipfrac_avg": 0.1362028419971466, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9983474612236023, |
| "val/ratio_var": 1.6093348449430778e-06 |
| }, |
| { |
| "episode": 512, |
| "epoch": 0.64, |
| "eps": 0, |
| "loss/policy_avg": -0.0009707484859973192, |
| "loss/value_avg": 0.07156310975551605, |
| "lr": 1.85e-05, |
| "objective/entropy": 5.413853168487549, |
| "objective/kl": 25.648712158203125, |
| "objective/non_score_reward": -1.2824357748031616, |
| "objective/rlhf_reward": -4.381594181060791, |
| "objective/scores": -3.099158525466919, |
| "policy/approxkl_avg": 0.00428994745016098, |
| "policy/clipfrac_avg": 0.001768867950886488, |
| "policy/entropy_avg": 0.0022988244891166687, |
| "step": 64, |
| "val/clipfrac_avg": 0.10908018797636032, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9985601902008057, |
| "val/ratio_var": 1.0099387282025418e-06 |
| }, |
| { |
| "episode": 520, |
| "epoch": 0.65, |
| "eps": 0, |
| "loss/policy_avg": -0.0018268069252371788, |
| "loss/value_avg": 0.037753816694021225, |
| "lr": 1.8e-05, |
| "objective/entropy": 5.359340190887451, |
| "objective/kl": 23.45077133178711, |
| "objective/non_score_reward": -1.17253839969635, |
| "objective/rlhf_reward": -4.046453475952148, |
| "objective/scores": -2.873914957046509, |
| "policy/approxkl_avg": 0.030117690563201904, |
| "policy/clipfrac_avg": 0.001768867950886488, |
| "policy/entropy_avg": 0.0016830174718052149, |
| "step": 65, |
| "val/clipfrac_avg": 0.0383254699409008, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9982761144638062, |
| "val/ratio_var": 1.3298521253091167e-06 |
| }, |
| { |
| "episode": 528, |
| "epoch": 0.66, |
| "eps": 0, |
| "loss/policy_avg": -0.0002859890228137374, |
| "loss/value_avg": 0.021327916532754898, |
| "lr": 1.75e-05, |
| "objective/entropy": 6.843641757965088, |
| "objective/kl": 24.481531143188477, |
| "objective/non_score_reward": -1.224076509475708, |
| "objective/rlhf_reward": -4.482547283172607, |
| "objective/scores": -3.2584707736968994, |
| "policy/approxkl_avg": 0.008686334826052189, |
| "policy/clipfrac_avg": 0.001768867950886488, |
| "policy/entropy_avg": 0.0017893059412017465, |
| "step": 66, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9984126091003418, |
| "val/ratio_var": 1.1801236041719676e-06 |
| }, |
| { |
| "episode": 536, |
| "epoch": 0.67, |
| "eps": 0, |
| "loss/policy_avg": -0.0005837440257892013, |
| "loss/value_avg": 0.053136035799980164, |
| "lr": 1.7000000000000003e-05, |
| "objective/entropy": 6.8793745040893555, |
| "objective/kl": 22.919111251831055, |
| "objective/non_score_reward": -1.1459555625915527, |
| "objective/rlhf_reward": -4.624814033508301, |
| "objective/scores": -3.478858232498169, |
| "policy/approxkl_avg": 0.0112453643232584, |
| "policy/clipfrac_avg": 0.001768867950886488, |
| "policy/entropy_avg": 0.001637791283428669, |
| "step": 67, |
| "val/clipfrac_avg": 0.10613207519054413, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9983804821968079, |
| "val/ratio_var": 1.2074211781509803e-06 |
| }, |
| { |
| "episode": 544, |
| "epoch": 0.68, |
| "eps": 0, |
| "loss/policy_avg": 0.00033806508872658014, |
| "loss/value_avg": 0.048039525747299194, |
| "lr": 1.65e-05, |
| "objective/entropy": 14.75048542022705, |
| "objective/kl": 23.478042602539062, |
| "objective/non_score_reward": -1.1739022731781006, |
| "objective/rlhf_reward": -4.242119789123535, |
| "objective/scores": -3.0682172775268555, |
| "policy/approxkl_avg": 0.027489835396409035, |
| "policy/clipfrac_avg": 0.001768867950886488, |
| "policy/entropy_avg": 0.001942639471963048, |
| "step": 68, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9966699481010437, |
| "val/ratio_var": 5.0052558435709216e-06 |
| }, |
| { |
| "episode": 552, |
| "epoch": 0.69, |
| "eps": 0, |
| "loss/policy_avg": -0.00016136097838170826, |
| "loss/value_avg": 0.26613786816596985, |
| "lr": 1.6000000000000003e-05, |
| "objective/entropy": 5.09047269821167, |
| "objective/kl": 23.750701904296875, |
| "objective/non_score_reward": -1.1875351667404175, |
| "objective/rlhf_reward": -5.098639011383057, |
| "objective/scores": -3.9111037254333496, |
| "policy/approxkl_avg": 0.0023809224367141724, |
| "policy/clipfrac_avg": 0.001768867950886488, |
| "policy/entropy_avg": 0.0014511016197502613, |
| "step": 69, |
| "val/clipfrac_avg": 0.08254716545343399, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9986908435821533, |
| "val/ratio_var": 8.683288683641877e-07 |
| }, |
| { |
| "episode": 560, |
| "epoch": 0.7, |
| "eps": 0, |
| "loss/policy_avg": 5.313775091053685e-06, |
| "loss/value_avg": 0.04069013148546219, |
| "lr": 1.55e-05, |
| "objective/entropy": 0.004417300224304199, |
| "objective/kl": 22.818960189819336, |
| "objective/non_score_reward": -1.140947937965393, |
| "objective/rlhf_reward": -4.2426886558532715, |
| "objective/scores": -3.101740598678589, |
| "policy/approxkl_avg": 1.6673491476382196e-09, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.000669731292873621, |
| "step": 70, |
| "val/clipfrac_avg": 0.033018868416547775, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0000087022781372, |
| "val/ratio_var": 8.521776166670048e-12 |
| }, |
| { |
| "episode": 568, |
| "epoch": 0.71, |
| "eps": 0, |
| "loss/policy_avg": 3.097684384556487e-06, |
| "loss/value_avg": 0.07564935088157654, |
| "lr": 1.5e-05, |
| "objective/entropy": 0.0037894248962402344, |
| "objective/kl": 24.71213722229004, |
| "objective/non_score_reward": -1.2356069087982178, |
| "objective/rlhf_reward": -4.127857208251953, |
| "objective/scores": -2.8922500610351562, |
| "policy/approxkl_avg": 1.0003454731233319e-09, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.0005924965953454375, |
| "step": 71, |
| "val/clipfrac_avg": 0.1179245263338089, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.000006079673767, |
| "val/ratio_var": 2.1884716261411086e-12 |
| }, |
| { |
| "episode": 576, |
| "epoch": 0.72, |
| "eps": 0, |
| "loss/policy_avg": -3.769466502490104e-06, |
| "loss/value_avg": 0.018075991421937943, |
| "lr": 1.45e-05, |
| "objective/entropy": 0.004113674163818359, |
| "objective/kl": 24.736343383789062, |
| "objective/non_score_reward": -1.2368173599243164, |
| "objective/rlhf_reward": -4.213069438934326, |
| "objective/scores": -2.9762520790100098, |
| "policy/approxkl_avg": 7.042492145004076e-10, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.0006491912645287812, |
| "step": 72, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0000057220458984, |
| "val/ratio_var": 9.189685687763238e-13 |
| }, |
| { |
| "episode": 584, |
| "epoch": 0.73, |
| "eps": 0, |
| "loss/policy_avg": -8.439961675321683e-06, |
| "loss/value_avg": 0.02476009726524353, |
| "lr": 1.4000000000000001e-05, |
| "objective/entropy": 0.004063129425048828, |
| "objective/kl": 24.381113052368164, |
| "objective/non_score_reward": -1.2190558910369873, |
| "objective/rlhf_reward": -4.345273971557617, |
| "objective/scores": -3.12621808052063, |
| "policy/approxkl_avg": 1.6590684381867504e-09, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.0006231879233382642, |
| "step": 73, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0000081062316895, |
| "val/ratio_var": 8.526512829121202e-13 |
| }, |
| { |
| "episode": 592, |
| "epoch": 0.74, |
| "eps": 0, |
| "loss/policy_avg": -0.0015291509917005897, |
| "loss/value_avg": 0.0400041788816452, |
| "lr": 1.3500000000000001e-05, |
| "objective/entropy": 5.482907295227051, |
| "objective/kl": 19.337940216064453, |
| "objective/non_score_reward": -0.9668970704078674, |
| "objective/rlhf_reward": -4.515492916107178, |
| "objective/scores": -3.548595666885376, |
| "policy/approxkl_avg": 0.04833262413740158, |
| "policy/clipfrac_avg": 0.0018703242531046271, |
| "policy/entropy_avg": 0.11862978339195251, |
| "step": 74, |
| "val/clipfrac_avg": 0.007462686393409967, |
| "val/num_eos_tokens": 1, |
| "val/ratio": 0.9982566833496094, |
| "val/ratio_var": 1.3634206652568537e-06 |
| }, |
| { |
| "episode": 600, |
| "epoch": 0.75, |
| "eps": 0, |
| "loss/policy_avg": 4.78032063710998e-07, |
| "loss/value_avg": 0.02071548067033291, |
| "lr": 1.3000000000000001e-05, |
| "objective/entropy": 0.003952503204345703, |
| "objective/kl": 24.335468292236328, |
| "objective/non_score_reward": -1.216773509979248, |
| "objective/rlhf_reward": -4.120009899139404, |
| "objective/scores": -2.9032363891601562, |
| "policy/approxkl_avg": 4.740483561249675e-10, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.0005925038713030517, |
| "step": 75, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0000042915344238, |
| "val/ratio_var": 3.884300379172062e-13 |
| }, |
| { |
| "episode": 608, |
| "epoch": 0.76, |
| "eps": 0, |
| "loss/policy_avg": -1.0813885182869853e-06, |
| "loss/value_avg": 0.016514349728822708, |
| "lr": 1.25e-05, |
| "objective/entropy": 0.0032129287719726562, |
| "objective/kl": 24.535846710205078, |
| "objective/non_score_reward": -1.2267922163009644, |
| "objective/rlhf_reward": -4.2841620445251465, |
| "objective/scores": -3.0573699474334717, |
| "policy/approxkl_avg": 6.217855119672322e-10, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.0005110952188260853, |
| "step": 76, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.000004529953003, |
| "val/ratio_var": 2.03688913067053e-13 |
| }, |
| { |
| "episode": 616, |
| "epoch": 0.77, |
| "eps": 0, |
| "loss/policy_avg": -7.110946626198711e-06, |
| "loss/value_avg": 0.047897208482027054, |
| "lr": 1.2e-05, |
| "objective/entropy": 0.003764629364013672, |
| "objective/kl": 25.578466415405273, |
| "objective/non_score_reward": -1.2789231538772583, |
| "objective/rlhf_reward": -4.72192907333374, |
| "objective/scores": -3.4430060386657715, |
| "policy/approxkl_avg": 5.927374147063347e-09, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.0005579590797424316, |
| "step": 77, |
| "val/clipfrac_avg": 0.004127358552068472, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0000112056732178, |
| "val/ratio_var": 2.03688913067053e-13 |
| }, |
| { |
| "episode": 624, |
| "epoch": 0.78, |
| "eps": 0, |
| "loss/policy_avg": -3.5153384487784933e-06, |
| "loss/value_avg": 0.014893004670739174, |
| "lr": 1.1500000000000002e-05, |
| "objective/entropy": 0.003036022186279297, |
| "objective/kl": 24.764558792114258, |
| "objective/non_score_reward": -1.2382278442382812, |
| "objective/rlhf_reward": -4.514825820922852, |
| "objective/scores": -3.2765979766845703, |
| "policy/approxkl_avg": 1.0030707375818793e-09, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.0004918260383419693, |
| "step": 78, |
| "val/clipfrac_avg": 0.010613207705318928, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0000040531158447, |
| "val/ratio_var": 1.0421293683690255e-13 |
| }, |
| { |
| "episode": 632, |
| "epoch": 0.79, |
| "eps": 0, |
| "loss/policy_avg": -0.0017197050619870424, |
| "loss/value_avg": 0.059836748987436295, |
| "lr": 1.1000000000000001e-05, |
| "objective/entropy": 10.20138168334961, |
| "objective/kl": 18.15127182006836, |
| "objective/non_score_reward": -0.9075635075569153, |
| "objective/rlhf_reward": -4.070981502532959, |
| "objective/scores": -3.1634178161621094, |
| "policy/approxkl_avg": 0.672481894493103, |
| "policy/clipfrac_avg": 0.001768867950886488, |
| "policy/entropy_avg": 0.0007890131091699004, |
| "step": 79, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 118013739008.0, |
| "val/ratio_var": 5.060441798977168e+22 |
| }, |
| { |
| "episode": 640, |
| "epoch": 0.8, |
| "eps": 0, |
| "loss/policy_avg": -0.0004759056319016963, |
| "loss/value_avg": 0.014939786866307259, |
| "lr": 1.05e-05, |
| "objective/entropy": 7.734498977661133, |
| "objective/kl": 23.559078216552734, |
| "objective/non_score_reward": -1.1779539585113525, |
| "objective/rlhf_reward": -4.140482425689697, |
| "objective/scores": -2.9625284671783447, |
| "policy/approxkl_avg": 0.10851067304611206, |
| "policy/clipfrac_avg": 0.001768867950886488, |
| "policy/entropy_avg": 0.00210343929938972, |
| "step": 80, |
| "val/clipfrac_avg": 0.01945754699409008, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 381.6966552734375, |
| "val/ratio_var": 399719.8125 |
| }, |
| { |
| "episode": 648, |
| "epoch": 0.81, |
| "eps": 0, |
| "loss/policy_avg": -0.0009191618300974369, |
| "loss/value_avg": 0.45779138803482056, |
| "lr": 1e-05, |
| "objective/entropy": 10.361689567565918, |
| "objective/kl": 25.474790573120117, |
| "objective/non_score_reward": -1.2737394571304321, |
| "objective/rlhf_reward": -5.300909042358398, |
| "objective/scores": -4.027169704437256, |
| "policy/approxkl_avg": 0.001493292860686779, |
| "policy/clipfrac_avg": 0.001768867950886488, |
| "policy/entropy_avg": 0.004847034811973572, |
| "step": 81, |
| "val/clipfrac_avg": 0.057193394750356674, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0019978284835815, |
| "val/ratio_var": 2.976292307721451e-06 |
| }, |
| { |
| "episode": 656, |
| "epoch": 0.82, |
| "eps": 0, |
| "loss/policy_avg": -0.0005446273135021329, |
| "loss/value_avg": 0.2114354372024536, |
| "lr": 9.5e-06, |
| "objective/entropy": 0.7502050399780273, |
| "objective/kl": 24.138385772705078, |
| "objective/non_score_reward": -1.2069193124771118, |
| "objective/rlhf_reward": -4.927611351013184, |
| "objective/scores": -3.7206919193267822, |
| "policy/approxkl_avg": 0.00017879472579807043, |
| "policy/clipfrac_avg": 0.001179245300590992, |
| "policy/entropy_avg": 0.013909805566072464, |
| "step": 82, |
| "val/clipfrac_avg": 0.04245283082127571, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.999313235282898, |
| "val/ratio_var": 3.8654417267025565e-07 |
| }, |
| { |
| "episode": 664, |
| "epoch": 0.83, |
| "eps": 0, |
| "loss/policy_avg": -0.0008736238232813776, |
| "loss/value_avg": 0.019180217757821083, |
| "lr": 9e-06, |
| "objective/entropy": 0.9455151557922363, |
| "objective/kl": 24.899396896362305, |
| "objective/non_score_reward": -1.2449698448181152, |
| "objective/rlhf_reward": -4.538572311401367, |
| "objective/scores": -3.293602705001831, |
| "policy/approxkl_avg": 0.00010491647117305547, |
| "policy/clipfrac_avg": 0.001179245300590992, |
| "policy/entropy_avg": 0.006164656486362219, |
| "step": 83, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.999660849571228, |
| "val/ratio_var": 1.1305993297128225e-07 |
| }, |
| { |
| "episode": 672, |
| "epoch": 0.84, |
| "eps": 0, |
| "loss/policy_avg": 1.0482122888788581e-05, |
| "loss/value_avg": 0.058847103267908096, |
| "lr": 8.500000000000002e-06, |
| "objective/entropy": 0.026853561401367188, |
| "objective/kl": 27.00433349609375, |
| "objective/non_score_reward": -1.3502166271209717, |
| "objective/rlhf_reward": -4.081714153289795, |
| "objective/scores": -2.7314975261688232, |
| "policy/approxkl_avg": 6.986047296209108e-09, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.004489392973482609, |
| "step": 84, |
| "val/clipfrac_avg": 0.03537735715508461, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0000537633895874, |
| "val/ratio_var": 1.511182290414581e-09 |
| }, |
| { |
| "episode": 680, |
| "epoch": 0.85, |
| "eps": 0, |
| "loss/policy_avg": -2.936266810138477e-06, |
| "loss/value_avg": 0.016916655004024506, |
| "lr": 8.000000000000001e-06, |
| "objective/entropy": 0.020194053649902344, |
| "objective/kl": 24.650644302368164, |
| "objective/non_score_reward": -1.2325321435928345, |
| "objective/rlhf_reward": -4.201452255249023, |
| "objective/scores": -2.9689202308654785, |
| "policy/approxkl_avg": 1.4028140871147343e-09, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.0036443518474698067, |
| "step": 85, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0000245571136475, |
| "val/ratio_var": 3.302128848137187e-10 |
| }, |
| { |
| "episode": 688, |
| "epoch": 0.86, |
| "eps": 0, |
| "loss/policy_avg": -0.00116700679063797, |
| "loss/value_avg": 0.03304029628634453, |
| "lr": 7.5e-06, |
| "objective/entropy": 10.308148384094238, |
| "objective/kl": 26.107011795043945, |
| "objective/non_score_reward": -1.3053505420684814, |
| "objective/rlhf_reward": -4.5653228759765625, |
| "objective/scores": -3.259972095489502, |
| "policy/approxkl_avg": 0.0007528926944360137, |
| "policy/clipfrac_avg": 0.003537735901772976, |
| "policy/entropy_avg": 0.003782036015763879, |
| "step": 86, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9984903931617737, |
| "val/ratio_var": 1.3541301768782432e-06 |
| }, |
| { |
| "episode": 696, |
| "epoch": 0.87, |
| "eps": 0, |
| "loss/policy_avg": -0.0012483038008213043, |
| "loss/value_avg": 0.020644810050725937, |
| "lr": 7.000000000000001e-06, |
| "objective/entropy": 4.934552192687988, |
| "objective/kl": 25.656095504760742, |
| "objective/non_score_reward": -1.2828046083450317, |
| "objective/rlhf_reward": -4.459787845611572, |
| "objective/scores": -3.17698335647583, |
| "policy/approxkl_avg": 0.0011864990228787065, |
| "policy/clipfrac_avg": 0.001768867950886488, |
| "policy/entropy_avg": 0.0023125973530113697, |
| "step": 87, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9989151358604431, |
| "val/ratio_var": 6.361988198477775e-07 |
| }, |
| { |
| "episode": 704, |
| "epoch": 0.88, |
| "eps": 0, |
| "loss/policy_avg": -0.0006414442323148251, |
| "loss/value_avg": 0.36193448305130005, |
| "lr": 6.5000000000000004e-06, |
| "objective/entropy": 4.682817459106445, |
| "objective/kl": 25.37774658203125, |
| "objective/non_score_reward": -1.2688874006271362, |
| "objective/rlhf_reward": -4.902969837188721, |
| "objective/scores": -3.634082555770874, |
| "policy/approxkl_avg": 0.00025906978407874703, |
| "policy/clipfrac_avg": 0.001179245300590992, |
| "policy/entropy_avg": 0.0020385892130434513, |
| "step": 88, |
| "val/clipfrac_avg": 0.03773584961891174, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9993649125099182, |
| "val/ratio_var": 2.928149456238316e-07 |
| }, |
| { |
| "episode": 712, |
| "epoch": 0.89, |
| "eps": 0, |
| "loss/policy_avg": -0.001403487753123045, |
| "loss/value_avg": 0.045823924243450165, |
| "lr": 6e-06, |
| "objective/entropy": 7.662001132965088, |
| "objective/kl": 26.13595962524414, |
| "objective/non_score_reward": -1.306797981262207, |
| "objective/rlhf_reward": -4.496212005615234, |
| "objective/scores": -3.1894140243530273, |
| "policy/approxkl_avg": 0.001071479171514511, |
| "policy/clipfrac_avg": 0.00294811325147748, |
| "policy/entropy_avg": 0.0017833821475505829, |
| "step": 89, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9985510110855103, |
| "val/ratio_var": 1.2452730970835546e-06 |
| }, |
| { |
| "episode": 720, |
| "epoch": 0.9, |
| "eps": 0, |
| "loss/policy_avg": 2.8316421776253264e-06, |
| "loss/value_avg": 0.02468748763203621, |
| "lr": 5.500000000000001e-06, |
| "objective/entropy": 0.006358623504638672, |
| "objective/kl": 25.177474975585938, |
| "objective/non_score_reward": -1.258873701095581, |
| "objective/rlhf_reward": -4.340834617614746, |
| "objective/scores": -3.081960678100586, |
| "policy/approxkl_avg": 3.0429461728154195e-10, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.0012125474167987704, |
| "step": 90, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.000009298324585, |
| "val/ratio_var": 3.510081114654895e-11 |
| }, |
| { |
| "episode": 728, |
| "epoch": 0.91, |
| "eps": 0, |
| "loss/policy_avg": -0.003975835628807545, |
| "loss/value_avg": 0.0309587549418211, |
| "lr": 5e-06, |
| "objective/entropy": 9.262920379638672, |
| "objective/kl": 27.77571678161621, |
| "objective/non_score_reward": -1.3887858390808105, |
| "objective/rlhf_reward": -4.151398658752441, |
| "objective/scores": -2.762612819671631, |
| "policy/approxkl_avg": 0.0016623970586806536, |
| "policy/clipfrac_avg": 0.004716981202363968, |
| "policy/entropy_avg": 0.025041375309228897, |
| "step": 91, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9995808601379395, |
| "val/ratio_var": 2.4279825083795004e-07 |
| }, |
| { |
| "episode": 736, |
| "epoch": 0.92, |
| "eps": 0, |
| "loss/policy_avg": -3.4936194879264804e-06, |
| "loss/value_avg": 0.021127665415406227, |
| "lr": 4.5e-06, |
| "objective/entropy": 0.003822803497314453, |
| "objective/kl": 24.14543342590332, |
| "objective/non_score_reward": -1.207271695137024, |
| "objective/rlhf_reward": -4.3759307861328125, |
| "objective/scores": -3.168658971786499, |
| "policy/approxkl_avg": 2.551990563315343e-10, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.000724360637832433, |
| "step": 92, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.000006914138794, |
| "val/ratio_var": 1.4746129664566787e-11 |
| }, |
| { |
| "episode": 744, |
| "epoch": 0.93, |
| "eps": 0, |
| "loss/policy_avg": -0.0001179491518996656, |
| "loss/value_avg": 0.0367477647960186, |
| "lr": 4.000000000000001e-06, |
| "objective/entropy": 5.264230251312256, |
| "objective/kl": 24.125144958496094, |
| "objective/non_score_reward": -1.2062572240829468, |
| "objective/rlhf_reward": -4.48881721496582, |
| "objective/scores": -3.282560110092163, |
| "policy/approxkl_avg": 3.359060792718083e-05, |
| "policy/clipfrac_avg": 0.000589622650295496, |
| "policy/entropy_avg": 0.0007435465813614428, |
| "step": 93, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9997267127037048, |
| "val/ratio_var": 6.120335171999614e-08 |
| }, |
| { |
| "episode": 752, |
| "epoch": 0.94, |
| "eps": 0, |
| "loss/policy_avg": -0.0003386925673112273, |
| "loss/value_avg": 0.010906368494033813, |
| "lr": 3.5000000000000004e-06, |
| "objective/entropy": 5.7897210121154785, |
| "objective/kl": 23.875186920166016, |
| "objective/non_score_reward": -1.1937594413757324, |
| "objective/rlhf_reward": -4.192901611328125, |
| "objective/scores": -2.9991419315338135, |
| "policy/approxkl_avg": 0.00011795548925874755, |
| "policy/clipfrac_avg": 0.001179245300590992, |
| "policy/entropy_avg": 0.0006920391460880637, |
| "step": 94, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9995193481445312, |
| "val/ratio_var": 1.679601666637609e-07 |
| }, |
| { |
| "episode": 760, |
| "epoch": 0.95, |
| "eps": 0, |
| "loss/policy_avg": -7.089016889949562e-06, |
| "loss/value_avg": 0.023602645844221115, |
| "lr": 3e-06, |
| "objective/entropy": 0.0036764144897460938, |
| "objective/kl": 24.60375213623047, |
| "objective/non_score_reward": -1.2301876544952393, |
| "objective/rlhf_reward": -4.6299896240234375, |
| "objective/scores": -3.399801731109619, |
| "policy/approxkl_avg": 8.320251154714242e-09, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.0006472456734627485, |
| "step": 95, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0000110864639282, |
| "val/ratio_var": 1.4495071809506044e-12 |
| }, |
| { |
| "episode": 768, |
| "epoch": 0.96, |
| "eps": 0, |
| "loss/policy_avg": 4.672603211020032e-07, |
| "loss/value_avg": 0.0325840599834919, |
| "lr": 2.5e-06, |
| "objective/entropy": 0.0027709007263183594, |
| "objective/kl": 24.18463897705078, |
| "objective/non_score_reward": -1.2092320919036865, |
| "objective/rlhf_reward": -4.501224517822266, |
| "objective/scores": -3.291992664337158, |
| "policy/approxkl_avg": 1.2074271582562801e-09, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.0005334683228284121, |
| "step": 96, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 1.0000051259994507, |
| "val/ratio_var": 3.268496584496461e-13 |
| }, |
| { |
| "episode": 776, |
| "epoch": 0.97, |
| "eps": 0, |
| "loss/policy_avg": -0.0007012633141130209, |
| "loss/value_avg": 0.025306837633252144, |
| "lr": 2.0000000000000003e-06, |
| "objective/entropy": 6.557465076446533, |
| "objective/kl": 24.751359939575195, |
| "objective/non_score_reward": -1.2375680208206177, |
| "objective/rlhf_reward": -4.222304821014404, |
| "objective/scores": -2.984736919403076, |
| "policy/approxkl_avg": 0.00011602477025007829, |
| "policy/clipfrac_avg": 0.001179245300590992, |
| "policy/entropy_avg": 0.0004479637718759477, |
| "step": 97, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9995274543762207, |
| "val/ratio_var": 1.737082584440941e-07 |
| }, |
| { |
| "episode": 784, |
| "epoch": 0.98, |
| "eps": 0, |
| "loss/policy_avg": -3.1152703741099685e-05, |
| "loss/value_avg": 0.01748664304614067, |
| "lr": 1.5e-06, |
| "objective/entropy": 6.415027141571045, |
| "objective/kl": 23.67858123779297, |
| "objective/non_score_reward": -1.1839290857315063, |
| "objective/rlhf_reward": -4.280977249145508, |
| "objective/scores": -3.097048282623291, |
| "policy/approxkl_avg": 2.8390975785441697e-05, |
| "policy/clipfrac_avg": 0.000589622650295496, |
| "policy/entropy_avg": 0.00046760181430727243, |
| "step": 98, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9997364282608032, |
| "val/ratio_var": 4.89392171232339e-08 |
| }, |
| { |
| "episode": 792, |
| "epoch": 0.99, |
| "eps": 0, |
| "loss/policy_avg": -0.00032194965751841664, |
| "loss/value_avg": 0.17389172315597534, |
| "lr": 1.0000000000000002e-06, |
| "objective/entropy": 1.3875689506530762, |
| "objective/kl": 23.84311866760254, |
| "objective/non_score_reward": -1.1921560764312744, |
| "objective/rlhf_reward": -5.213768005371094, |
| "objective/scores": -4.02161169052124, |
| "policy/approxkl_avg": 7.786412425048184e-06, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.0005042271804995835, |
| "step": 99, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9998690485954285, |
| "val/ratio_var": 1.8143422764183015e-08 |
| }, |
| { |
| "episode": 800, |
| "epoch": 1.0, |
| "eps": 0, |
| "loss/policy_avg": -0.0009410554193891585, |
| "loss/value_avg": 0.45231789350509644, |
| "lr": 5.000000000000001e-07, |
| "objective/entropy": 8.080781936645508, |
| "objective/kl": 22.65479278564453, |
| "objective/non_score_reward": -1.1327396631240845, |
| "objective/rlhf_reward": -5.200520992279053, |
| "objective/scores": -4.067781448364258, |
| "policy/approxkl_avg": 2.2665039068670012e-05, |
| "policy/clipfrac_avg": 0.0, |
| "policy/entropy_avg": 0.00041846060776151717, |
| "step": 100, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9996796250343323, |
| "val/ratio_var": 1.0175851627991506e-07 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 100, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1.0, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": true, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0, |
| "train_batch_size": null, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|