| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "episode": 1600, |
| "epoch": 2.0, |
| "eval_steps": 50, |
| "global_step": 100, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "episode": 16, |
| "epoch": 0.02, |
| "eps": 0, |
| "loss/policy_avg": 0.0004486599937081337, |
| "loss/value_avg": 1.2938655614852905, |
| "lr": 0.0, |
| "objective/entropy": 162.11053466796875, |
| "objective/kl": 4.939833164215088, |
| "objective/non_score_reward": -0.24699166417121887, |
| "objective/rlhf_reward": -0.29166939854621887, |
| "objective/scores": -0.044677734375, |
| "policy/approxkl_avg": 0.001968675758689642, |
| "policy/clipfrac_avg": 0.00244140625, |
| "policy/entropy_avg": 1.5513076782226562, |
| "step": 1, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9633879661560059, |
| "val/ratio_var": 3.8020948522898834e-07 |
| }, |
| { |
| "episode": 32, |
| "epoch": 0.04, |
| "eps": 0, |
| "loss/policy_avg": -0.0011649401858448982, |
| "loss/value_avg": 1.1032419204711914, |
| "lr": 1.0000000000000001e-07, |
| "objective/entropy": 185.68942260742188, |
| "objective/kl": 5.192228317260742, |
| "objective/non_score_reward": -0.25961142778396606, |
| "objective/rlhf_reward": -0.6131270527839661, |
| "objective/scores": -0.353515625, |
| "policy/approxkl_avg": 0.002094644121825695, |
| "policy/clipfrac_avg": 0.00146484375, |
| "policy/entropy_avg": 1.7179677486419678, |
| "step": 2, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9616182446479797, |
| "val/ratio_var": 2.1992273104842752e-06 |
| }, |
| { |
| "episode": 48, |
| "epoch": 0.06, |
| "eps": 0, |
| "loss/policy_avg": -0.00016614759806543589, |
| "loss/value_avg": 1.5101345777511597, |
| "lr": 2.0000000000000002e-07, |
| "objective/entropy": 174.795654296875, |
| "objective/kl": 5.0596089363098145, |
| "objective/non_score_reward": -0.25298047065734863, |
| "objective/rlhf_reward": -0.013722658157348633, |
| "objective/scores": 0.2392578125, |
| "policy/approxkl_avg": 0.0020569346379488707, |
| "policy/clipfrac_avg": 0.0018310546875, |
| "policy/entropy_avg": 1.5935308933258057, |
| "step": 3, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9618856906890869, |
| "val/ratio_var": 3.546992275005323e-06 |
| }, |
| { |
| "episode": 64, |
| "epoch": 0.08, |
| "eps": 0, |
| "loss/policy_avg": -0.0001529245637357235, |
| "loss/value_avg": 1.3132586479187012, |
| "lr": 3.0000000000000004e-07, |
| "objective/entropy": 167.1978759765625, |
| "objective/kl": 5.142940044403076, |
| "objective/non_score_reward": -0.25714701414108276, |
| "objective/rlhf_reward": -0.31622904539108276, |
| "objective/scores": -0.05908203125, |
| "policy/approxkl_avg": 0.002137545496225357, |
| "policy/clipfrac_avg": 0.0025634765625, |
| "policy/entropy_avg": 1.6127209663391113, |
| "step": 4, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9616750478744507, |
| "val/ratio_var": 1.2533583912954782e-06 |
| }, |
| { |
| "episode": 80, |
| "epoch": 0.1, |
| "eps": 0, |
| "loss/policy_avg": -0.00246006203815341, |
| "loss/value_avg": 1.1861101388931274, |
| "lr": 4.0000000000000003e-07, |
| "objective/entropy": 161.64456176757812, |
| "objective/kl": 4.87198543548584, |
| "objective/non_score_reward": -0.24359926581382751, |
| "objective/rlhf_reward": -0.3009723126888275, |
| "objective/scores": -0.057373046875, |
| "policy/approxkl_avg": 0.001990006770938635, |
| "policy/clipfrac_avg": 0.002197265625, |
| "policy/entropy_avg": 1.506896734237671, |
| "step": 5, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9635031819343567, |
| "val/ratio_var": 3.90803052141564e-06 |
| }, |
| { |
| "episode": 96, |
| "epoch": 0.12, |
| "eps": 0, |
| "loss/policy_avg": -0.0034578712657094, |
| "loss/value_avg": 1.237225890159607, |
| "lr": 5.000000000000001e-07, |
| "objective/entropy": 176.53646850585938, |
| "objective/kl": 5.119318962097168, |
| "objective/non_score_reward": -0.2559659481048584, |
| "objective/rlhf_reward": -0.2571256160736084, |
| "objective/scores": -0.00115966796875, |
| "policy/approxkl_avg": 0.0020507299341261387, |
| "policy/clipfrac_avg": 0.002197265625, |
| "policy/entropy_avg": 1.5803486108779907, |
| "step": 6, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9626491069793701, |
| "val/ratio_var": 4.622608685167506e-06 |
| }, |
| { |
| "episode": 112, |
| "epoch": 0.14, |
| "eps": 0, |
| "loss/policy_avg": -0.0025506531819701195, |
| "loss/value_avg": 1.258737325668335, |
| "lr": 6.000000000000001e-07, |
| "objective/entropy": 177.9249725341797, |
| "objective/kl": 5.401371002197266, |
| "objective/non_score_reward": -0.27006858587265015, |
| "objective/rlhf_reward": -0.5435060858726501, |
| "objective/scores": -0.2734375, |
| "policy/approxkl_avg": 0.0022585245314985514, |
| "policy/clipfrac_avg": 0.0029296875, |
| "policy/entropy_avg": 1.6439510583877563, |
| "step": 7, |
| "val/clipfrac_avg": 0.0003662109375, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9604523181915283, |
| "val/ratio_var": 6.208483227965189e-06 |
| }, |
| { |
| "episode": 128, |
| "epoch": 0.16, |
| "eps": 0, |
| "loss/policy_avg": -0.0017832191661000252, |
| "loss/value_avg": 1.2353310585021973, |
| "lr": 7.000000000000001e-07, |
| "objective/entropy": 175.4106903076172, |
| "objective/kl": 4.9585371017456055, |
| "objective/non_score_reward": -0.24792686104774475, |
| "objective/rlhf_reward": -0.37683311104774475, |
| "objective/scores": -0.12890625, |
| "policy/approxkl_avg": 0.002244518604129553, |
| "policy/clipfrac_avg": 0.003173828125, |
| "policy/entropy_avg": 1.6177117824554443, |
| "step": 8, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9618170261383057, |
| "val/ratio_var": 4.459182491700631e-06 |
| }, |
| { |
| "episode": 144, |
| "epoch": 0.18, |
| "eps": 0, |
| "loss/policy_avg": -0.002314523793756962, |
| "loss/value_avg": 1.3316550254821777, |
| "lr": 8.000000000000001e-07, |
| "objective/entropy": 175.06866455078125, |
| "objective/kl": 4.954601764678955, |
| "objective/non_score_reward": -0.2477300763130188, |
| "objective/rlhf_reward": -0.0358160138130188, |
| "objective/scores": 0.2119140625, |
| "policy/approxkl_avg": 0.002068200148642063, |
| "policy/clipfrac_avg": 0.0015869140625, |
| "policy/entropy_avg": 1.619426965713501, |
| "step": 9, |
| "val/clipfrac_avg": 0.0001220703125, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.962709367275238, |
| "val/ratio_var": 3.7464862998604076e-06 |
| }, |
| { |
| "episode": 160, |
| "epoch": 0.2, |
| "eps": 0, |
| "loss/policy_avg": -0.0006005736067891121, |
| "loss/value_avg": 1.2744617462158203, |
| "lr": 9.000000000000001e-07, |
| "objective/entropy": 169.3418731689453, |
| "objective/kl": 4.965663909912109, |
| "objective/non_score_reward": -0.24828319251537323, |
| "objective/rlhf_reward": 0.03491993248462677, |
| "objective/scores": 0.283203125, |
| "policy/approxkl_avg": 0.002044553868472576, |
| "policy/clipfrac_avg": 0.00244140625, |
| "policy/entropy_avg": 1.5893105268478394, |
| "step": 10, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9636451601982117, |
| "val/ratio_var": 1.0843526069947984e-05 |
| }, |
| { |
| "episode": 176, |
| "epoch": 0.22, |
| "eps": 0, |
| "loss/policy_avg": -0.0016839192248880863, |
| "loss/value_avg": 1.1354484558105469, |
| "lr": 1.0000000000000002e-06, |
| "objective/entropy": 167.39401245117188, |
| "objective/kl": 5.0550994873046875, |
| "objective/non_score_reward": -0.25275495648384094, |
| "objective/rlhf_reward": -0.08673933148384094, |
| "objective/scores": 0.166015625, |
| "policy/approxkl_avg": 0.002176377223804593, |
| "policy/clipfrac_avg": 0.001708984375, |
| "policy/entropy_avg": 1.5987792015075684, |
| "step": 11, |
| "val/clipfrac_avg": 0.0001220703125, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9604842066764832, |
| "val/ratio_var": 1.695754804131866e-06 |
| }, |
| { |
| "episode": 192, |
| "epoch": 0.24, |
| "eps": 0, |
| "loss/policy_avg": -0.0015829752665013075, |
| "loss/value_avg": 1.138115406036377, |
| "lr": 1.1e-06, |
| "objective/entropy": 169.50271606445312, |
| "objective/kl": 5.478184223175049, |
| "objective/non_score_reward": -0.27390921115875244, |
| "objective/rlhf_reward": -0.43992483615875244, |
| "objective/scores": -0.166015625, |
| "policy/approxkl_avg": 0.00216104369610548, |
| "policy/clipfrac_avg": 0.0020751953125, |
| "policy/entropy_avg": 1.6336290836334229, |
| "step": 12, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9616599678993225, |
| "val/ratio_var": 1.954093249878497e-06 |
| }, |
| { |
| "episode": 208, |
| "epoch": 0.26, |
| "eps": 0, |
| "loss/policy_avg": -0.0012321844696998596, |
| "loss/value_avg": 1.1720128059387207, |
| "lr": 1.2000000000000002e-06, |
| "objective/entropy": 181.92147827148438, |
| "objective/kl": 5.243380546569824, |
| "objective/non_score_reward": -0.2621690332889557, |
| "objective/rlhf_reward": -0.2769395411014557, |
| "objective/scores": -0.0147705078125, |
| "policy/approxkl_avg": 0.002162383636459708, |
| "policy/clipfrac_avg": 0.0008544921875, |
| "policy/entropy_avg": 1.696028470993042, |
| "step": 13, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9608445763587952, |
| "val/ratio_var": 3.191448058714741e-06 |
| }, |
| { |
| "episode": 224, |
| "epoch": 0.28, |
| "eps": 0, |
| "loss/policy_avg": -0.00338327931240201, |
| "loss/value_avg": 1.1387133598327637, |
| "lr": 1.3e-06, |
| "objective/entropy": 185.0263671875, |
| "objective/kl": 5.307041168212891, |
| "objective/non_score_reward": -0.2653520703315735, |
| "objective/rlhf_reward": -0.5329301953315735, |
| "objective/scores": -0.267578125, |
| "policy/approxkl_avg": 0.0022570325527340174, |
| "policy/clipfrac_avg": 0.001953125, |
| "policy/entropy_avg": 1.674648642539978, |
| "step": 14, |
| "val/clipfrac_avg": 0.000244140625, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9600518941879272, |
| "val/ratio_var": 1.6098074411274865e-06 |
| }, |
| { |
| "episode": 240, |
| "epoch": 0.3, |
| "eps": 0, |
| "loss/policy_avg": -0.0031057698652148247, |
| "loss/value_avg": 1.1971783638000488, |
| "lr": 1.4000000000000001e-06, |
| "objective/entropy": 168.69589233398438, |
| "objective/kl": 4.939919471740723, |
| "objective/non_score_reward": -0.2469959706068039, |
| "objective/rlhf_reward": -0.4647694230079651, |
| "objective/scores": -0.2177734375, |
| "policy/approxkl_avg": 0.0019079549238085747, |
| "policy/clipfrac_avg": 0.00146484375, |
| "policy/entropy_avg": 1.599722981452942, |
| "step": 15, |
| "val/clipfrac_avg": 0.0001220703125, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9643268585205078, |
| "val/ratio_var": 3.971512342104688e-06 |
| }, |
| { |
| "episode": 256, |
| "epoch": 0.32, |
| "eps": 0, |
| "loss/policy_avg": -0.0029412589501589537, |
| "loss/value_avg": 1.1075444221496582, |
| "lr": 1.5e-06, |
| "objective/entropy": 155.5027618408203, |
| "objective/kl": 4.750798225402832, |
| "objective/non_score_reward": -0.23753991723060608, |
| "objective/rlhf_reward": -0.2956453859806061, |
| "objective/scores": -0.05810546875, |
| "policy/approxkl_avg": 0.0019890139810740948, |
| "policy/clipfrac_avg": 0.00244140625, |
| "policy/entropy_avg": 1.4853081703186035, |
| "step": 16, |
| "val/clipfrac_avg": 0.0001220703125, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9637153148651123, |
| "val/ratio_var": 1.7876334368338576e-06 |
| }, |
| { |
| "episode": 272, |
| "epoch": 0.34, |
| "eps": 0, |
| "loss/policy_avg": -0.0014371530851349235, |
| "loss/value_avg": 1.2808384895324707, |
| "lr": 1.6000000000000001e-06, |
| "objective/entropy": 174.11500549316406, |
| "objective/kl": 5.128355503082275, |
| "objective/non_score_reward": -0.25641775131225586, |
| "objective/rlhf_reward": -0.43610525131225586, |
| "objective/scores": -0.1796875, |
| "policy/approxkl_avg": 0.0020958627574145794, |
| "policy/clipfrac_avg": 0.001220703125, |
| "policy/entropy_avg": 1.6312119960784912, |
| "step": 17, |
| "val/clipfrac_avg": 0.0001220703125, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9622437953948975, |
| "val/ratio_var": 2.562965846664156e-06 |
| }, |
| { |
| "episode": 288, |
| "epoch": 0.36, |
| "eps": 0, |
| "loss/policy_avg": -0.0013770293444395065, |
| "loss/value_avg": 1.1316921710968018, |
| "lr": 1.7000000000000002e-06, |
| "objective/entropy": 177.11965942382812, |
| "objective/kl": 4.7733917236328125, |
| "objective/non_score_reward": -0.23866958916187286, |
| "objective/rlhf_reward": -0.36562269926071167, |
| "objective/scores": -0.126953125, |
| "policy/approxkl_avg": 0.0020855318289250135, |
| "policy/clipfrac_avg": 0.00244140625, |
| "policy/entropy_avg": 1.652011513710022, |
| "step": 18, |
| "val/clipfrac_avg": 0.0001220703125, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9627939462661743, |
| "val/ratio_var": 2.3717864223726792e-06 |
| }, |
| { |
| "episode": 304, |
| "epoch": 0.38, |
| "eps": 0, |
| "loss/policy_avg": -0.002823261544108391, |
| "loss/value_avg": 1.0748326778411865, |
| "lr": 1.8000000000000001e-06, |
| "objective/entropy": 173.44908142089844, |
| "objective/kl": 4.939946174621582, |
| "objective/non_score_reward": -0.24699731171131134, |
| "objective/rlhf_reward": -0.31145042181015015, |
| "objective/scores": -0.064453125, |
| "policy/approxkl_avg": 0.002068548696115613, |
| "policy/clipfrac_avg": 0.001953125, |
| "policy/entropy_avg": 1.6042956113815308, |
| "step": 19, |
| "val/clipfrac_avg": 0.0001220703125, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9631877541542053, |
| "val/ratio_var": 7.887209903856274e-06 |
| }, |
| { |
| "episode": 320, |
| "epoch": 0.4, |
| "eps": 0, |
| "loss/policy_avg": -0.0016451161354780197, |
| "loss/value_avg": 1.1182610988616943, |
| "lr": 1.9000000000000002e-06, |
| "objective/entropy": 161.34744262695312, |
| "objective/kl": 5.098767280578613, |
| "objective/non_score_reward": -0.25493836402893066, |
| "objective/rlhf_reward": -0.49419617652893066, |
| "objective/scores": -0.2392578125, |
| "policy/approxkl_avg": 0.001963268965482712, |
| "policy/clipfrac_avg": 0.0015869140625, |
| "policy/entropy_avg": 1.510756254196167, |
| "step": 20, |
| "val/clipfrac_avg": 0.0001220703125, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9648376107215881, |
| "val/ratio_var": 5.044429599365685e-06 |
| }, |
| { |
| "episode": 336, |
| "epoch": 0.42, |
| "eps": 0, |
| "loss/policy_avg": -0.0013257116079330444, |
| "loss/value_avg": 1.215050220489502, |
| "lr": 2.0000000000000003e-06, |
| "objective/entropy": 155.19345092773438, |
| "objective/kl": 5.083975791931152, |
| "objective/non_score_reward": -0.2541987895965576, |
| "objective/rlhf_reward": -0.3499019145965576, |
| "objective/scores": -0.095703125, |
| "policy/approxkl_avg": 0.002038246486335993, |
| "policy/clipfrac_avg": 0.0023193359375, |
| "policy/entropy_avg": 1.447933316230774, |
| "step": 21, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9646758437156677, |
| "val/ratio_var": 2.349487203900935e-06 |
| }, |
| { |
| "episode": 352, |
| "epoch": 0.44, |
| "eps": 0, |
| "loss/policy_avg": -0.0019622594118118286, |
| "loss/value_avg": 0.9825942516326904, |
| "lr": 2.1000000000000002e-06, |
| "objective/entropy": 184.58116149902344, |
| "objective/kl": 5.247394561767578, |
| "objective/non_score_reward": -0.26236969232559204, |
| "objective/rlhf_reward": -0.41471344232559204, |
| "objective/scores": -0.15234375, |
| "policy/approxkl_avg": 0.002102417405694723, |
| "policy/clipfrac_avg": 0.0015869140625, |
| "policy/entropy_avg": 1.712675929069519, |
| "step": 22, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9619762897491455, |
| "val/ratio_var": 2.346512928852462e-06 |
| }, |
| { |
| "episode": 368, |
| "epoch": 0.46, |
| "eps": 0, |
| "loss/policy_avg": -0.0008872179314494133, |
| "loss/value_avg": 1.2242329120635986, |
| "lr": 2.2e-06, |
| "objective/entropy": 158.87860107421875, |
| "objective/kl": 4.657800197601318, |
| "objective/non_score_reward": -0.23289000988006592, |
| "objective/rlhf_reward": -0.3383587598800659, |
| "objective/scores": -0.10546875, |
| "policy/approxkl_avg": 0.0019458475289866328, |
| "policy/clipfrac_avg": 0.0018310546875, |
| "policy/entropy_avg": 1.4549307823181152, |
| "step": 23, |
| "val/clipfrac_avg": 0.0003662109375, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.966155469417572, |
| "val/ratio_var": 2.9357854600675637e-06 |
| }, |
| { |
| "episode": 384, |
| "epoch": 0.48, |
| "eps": 0, |
| "loss/policy_avg": -0.0034685591235756874, |
| "loss/value_avg": 1.0394468307495117, |
| "lr": 2.3000000000000004e-06, |
| "objective/entropy": 164.78189086914062, |
| "objective/kl": 4.869457244873047, |
| "objective/non_score_reward": -0.2434728741645813, |
| "objective/rlhf_reward": -0.7161291241645813, |
| "objective/scores": -0.47265625, |
| "policy/approxkl_avg": 0.002021776745095849, |
| "policy/clipfrac_avg": 0.00244140625, |
| "policy/entropy_avg": 1.4984092712402344, |
| "step": 24, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9638509750366211, |
| "val/ratio_var": 1.510236529611575e-06 |
| }, |
| { |
| "episode": 400, |
| "epoch": 0.5, |
| "eps": 0, |
| "loss/policy_avg": -0.0017958339303731918, |
| "loss/value_avg": 1.0017895698547363, |
| "lr": 2.4000000000000003e-06, |
| "objective/entropy": 155.18121337890625, |
| "objective/kl": 4.872950553894043, |
| "objective/non_score_reward": -0.2436475157737732, |
| "objective/rlhf_reward": 0.0004931092262268066, |
| "objective/scores": 0.244140625, |
| "policy/approxkl_avg": 0.002069193869829178, |
| "policy/clipfrac_avg": 0.001708984375, |
| "policy/entropy_avg": 1.506508469581604, |
| "step": 25, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9632376432418823, |
| "val/ratio_var": 2.6072054879477946e-06 |
| }, |
| { |
| "episode": 416, |
| "epoch": 0.52, |
| "eps": 0, |
| "loss/policy_avg": -0.0009807262104004622, |
| "loss/value_avg": 0.9018648862838745, |
| "lr": 2.5e-06, |
| "objective/entropy": 176.93988037109375, |
| "objective/kl": 5.13535213470459, |
| "objective/non_score_reward": -0.256767600774765, |
| "objective/rlhf_reward": -0.24211916327476501, |
| "objective/scores": 0.0146484375, |
| "policy/approxkl_avg": 0.0020367163233458996, |
| "policy/clipfrac_avg": 0.001953125, |
| "policy/entropy_avg": 1.5867078304290771, |
| "step": 26, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.962630569934845, |
| "val/ratio_var": 6.3079619394557085e-06 |
| }, |
| { |
| "episode": 432, |
| "epoch": 0.54, |
| "eps": 0, |
| "loss/policy_avg": -0.0008640075102448463, |
| "loss/value_avg": 1.1066097021102905, |
| "lr": 2.6e-06, |
| "objective/entropy": 162.58128356933594, |
| "objective/kl": 4.76553201675415, |
| "objective/non_score_reward": -0.23827658593654633, |
| "objective/rlhf_reward": -0.5585891008377075, |
| "objective/scores": -0.3203125, |
| "policy/approxkl_avg": 0.0020129948388785124, |
| "policy/clipfrac_avg": 0.0030517578125, |
| "policy/entropy_avg": 1.5091907978057861, |
| "step": 27, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9642030000686646, |
| "val/ratio_var": 3.215169272152707e-06 |
| }, |
| { |
| "episode": 448, |
| "epoch": 0.56, |
| "eps": 0, |
| "loss/policy_avg": -0.003287755884230137, |
| "loss/value_avg": 0.9977786540985107, |
| "lr": 2.7000000000000004e-06, |
| "objective/entropy": 156.29837036132812, |
| "objective/kl": 4.870822906494141, |
| "objective/non_score_reward": -0.2435411512851715, |
| "objective/rlhf_reward": -0.2541612684726715, |
| "objective/scores": -0.0106201171875, |
| "policy/approxkl_avg": 0.001957303611561656, |
| "policy/clipfrac_avg": 0.0013427734375, |
| "policy/entropy_avg": 1.4713962078094482, |
| "step": 28, |
| "val/clipfrac_avg": 0.000244140625, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.965004563331604, |
| "val/ratio_var": 2.187839299949701e-06 |
| }, |
| { |
| "episode": 464, |
| "epoch": 0.58, |
| "eps": 0, |
| "loss/policy_avg": -0.0021433092188090086, |
| "loss/value_avg": 0.9118539094924927, |
| "lr": 2.8000000000000003e-06, |
| "objective/entropy": 163.495849609375, |
| "objective/kl": 4.88909912109375, |
| "objective/non_score_reward": -0.24445496499538422, |
| "objective/rlhf_reward": -0.017892464995384216, |
| "objective/scores": 0.2265625, |
| "policy/approxkl_avg": 0.0021847893949598074, |
| "policy/clipfrac_avg": 0.003173828125, |
| "policy/entropy_avg": 1.5781747102737427, |
| "step": 29, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9634232521057129, |
| "val/ratio_var": 4.35601987192058e-06 |
| }, |
| { |
| "episode": 480, |
| "epoch": 0.6, |
| "eps": 0, |
| "loss/policy_avg": -0.0023626741021871567, |
| "loss/value_avg": 0.9815002679824829, |
| "lr": 2.9e-06, |
| "objective/entropy": 167.91629028320312, |
| "objective/kl": 4.972644805908203, |
| "objective/non_score_reward": -0.2486322671175003, |
| "objective/rlhf_reward": -0.6919916272163391, |
| "objective/scores": -0.443359375, |
| "policy/approxkl_avg": 0.002145718550309539, |
| "policy/clipfrac_avg": 0.00146484375, |
| "policy/entropy_avg": 1.5622011423110962, |
| "step": 30, |
| "val/clipfrac_avg": 0.0006103515625, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9621492028236389, |
| "val/ratio_var": 5.3428561841428746e-06 |
| }, |
| { |
| "episode": 496, |
| "epoch": 0.62, |
| "eps": 0, |
| "loss/policy_avg": -0.003964821808040142, |
| "loss/value_avg": 0.8878471851348877, |
| "lr": 3e-06, |
| "objective/entropy": 166.8218994140625, |
| "objective/kl": 5.199556827545166, |
| "objective/non_score_reward": -0.2599778473377228, |
| "objective/rlhf_reward": -0.4894700348377228, |
| "objective/scores": -0.2294921875, |
| "policy/approxkl_avg": 0.00210373941808939, |
| "policy/clipfrac_avg": 0.00244140625, |
| "policy/entropy_avg": 1.568078875541687, |
| "step": 31, |
| "val/clipfrac_avg": 0.0006103515625, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9622396230697632, |
| "val/ratio_var": 5.290597528073704e-06 |
| }, |
| { |
| "episode": 512, |
| "epoch": 0.64, |
| "eps": 0, |
| "loss/policy_avg": -0.0024933372624218464, |
| "loss/value_avg": 0.7922680377960205, |
| "lr": 3.1000000000000004e-06, |
| "objective/entropy": 176.921875, |
| "objective/kl": 5.756374359130859, |
| "objective/non_score_reward": -0.2878187298774719, |
| "objective/rlhf_reward": -0.3390882611274719, |
| "objective/scores": -0.05126953125, |
| "policy/approxkl_avg": 0.002244055736809969, |
| "policy/clipfrac_avg": 0.002197265625, |
| "policy/entropy_avg": 1.6759358644485474, |
| "step": 32, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9592071175575256, |
| "val/ratio_var": 4.2995147850888316e-06 |
| }, |
| { |
| "episode": 528, |
| "epoch": 0.66, |
| "eps": 0, |
| "loss/policy_avg": -0.0017838962376117706, |
| "loss/value_avg": 0.7303538918495178, |
| "lr": 3.2000000000000003e-06, |
| "objective/entropy": 175.94744873046875, |
| "objective/kl": 5.338512420654297, |
| "objective/non_score_reward": -0.2669256329536438, |
| "objective/rlhf_reward": -0.2965887188911438, |
| "objective/scores": -0.0296630859375, |
| "policy/approxkl_avg": 0.002238932531327009, |
| "policy/clipfrac_avg": 0.00244140625, |
| "policy/entropy_avg": 1.6415915489196777, |
| "step": 33, |
| "val/clipfrac_avg": 0.0001220703125, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9602410793304443, |
| "val/ratio_var": 1.0174014732911019e-06 |
| }, |
| { |
| "episode": 544, |
| "epoch": 0.68, |
| "eps": 0, |
| "loss/policy_avg": -0.002395186573266983, |
| "loss/value_avg": 0.9008848667144775, |
| "lr": 3.3000000000000006e-06, |
| "objective/entropy": 159.09799194335938, |
| "objective/kl": 4.972096920013428, |
| "objective/non_score_reward": -0.24860484898090363, |
| "objective/rlhf_reward": -0.012276723980903625, |
| "objective/scores": 0.236328125, |
| "policy/approxkl_avg": 0.002032281132414937, |
| "policy/clipfrac_avg": 0.0025634765625, |
| "policy/entropy_avg": 1.5187857151031494, |
| "step": 34, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.963579535484314, |
| "val/ratio_var": 1.3504248954632203e-06 |
| }, |
| { |
| "episode": 560, |
| "epoch": 0.7, |
| "eps": 0, |
| "loss/policy_avg": -0.0040529826655983925, |
| "loss/value_avg": 0.738201916217804, |
| "lr": 3.4000000000000005e-06, |
| "objective/entropy": 189.93099975585938, |
| "objective/kl": 5.9919657707214355, |
| "objective/non_score_reward": -0.2995982766151428, |
| "objective/rlhf_reward": -0.8152232766151428, |
| "objective/scores": -0.515625, |
| "policy/approxkl_avg": 0.0024048539344221354, |
| "policy/clipfrac_avg": 0.0037841796875, |
| "policy/entropy_avg": 1.7418715953826904, |
| "step": 35, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9581993222236633, |
| "val/ratio_var": 2.2299211650533834e-06 |
| }, |
| { |
| "episode": 576, |
| "epoch": 0.72, |
| "eps": 0, |
| "loss/policy_avg": -0.003544020466506481, |
| "loss/value_avg": 0.7707731127738953, |
| "lr": 3.5e-06, |
| "objective/entropy": 159.47125244140625, |
| "objective/kl": 4.8106207847595215, |
| "objective/non_score_reward": -0.2405310571193695, |
| "objective/rlhf_reward": -0.6545935869216919, |
| "objective/scores": -0.4140625, |
| "policy/approxkl_avg": 0.002112124115228653, |
| "policy/clipfrac_avg": 0.003173828125, |
| "policy/entropy_avg": 1.4905831813812256, |
| "step": 36, |
| "val/clipfrac_avg": 0.0003662109375, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9643076658248901, |
| "val/ratio_var": 7.036061106191482e-06 |
| }, |
| { |
| "episode": 592, |
| "epoch": 0.74, |
| "eps": 0, |
| "loss/policy_avg": -0.0030114920809865, |
| "loss/value_avg": 0.6595614552497864, |
| "lr": 3.6000000000000003e-06, |
| "objective/entropy": 180.1719970703125, |
| "objective/kl": 5.4066996574401855, |
| "objective/non_score_reward": -0.27033501863479614, |
| "objective/rlhf_reward": -0.31232720613479614, |
| "objective/scores": -0.0419921875, |
| "policy/approxkl_avg": 0.0021726740524172783, |
| "policy/clipfrac_avg": 0.001708984375, |
| "policy/entropy_avg": 1.7136811017990112, |
| "step": 37, |
| "val/clipfrac_avg": 0.0001220703125, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9603984355926514, |
| "val/ratio_var": 3.5773789477389073e-06 |
| }, |
| { |
| "episode": 608, |
| "epoch": 0.76, |
| "eps": 0, |
| "loss/policy_avg": -0.002592116594314575, |
| "loss/value_avg": 0.5918077230453491, |
| "lr": 3.7e-06, |
| "objective/entropy": 167.25192260742188, |
| "objective/kl": 4.988465309143066, |
| "objective/non_score_reward": -0.24942323565483093, |
| "objective/rlhf_reward": -0.19009706377983093, |
| "objective/scores": 0.059326171875, |
| "policy/approxkl_avg": 0.0021313969045877457, |
| "policy/clipfrac_avg": 0.003662109375, |
| "policy/entropy_avg": 1.5170129537582397, |
| "step": 38, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9635162353515625, |
| "val/ratio_var": 4.518392870522803e-06 |
| }, |
| { |
| "episode": 624, |
| "epoch": 0.78, |
| "eps": 0, |
| "loss/policy_avg": -0.0039001721888780594, |
| "loss/value_avg": 0.7430535554885864, |
| "lr": 3.8000000000000005e-06, |
| "objective/entropy": 180.25039672851562, |
| "objective/kl": 5.839836597442627, |
| "objective/non_score_reward": -0.29199182987213135, |
| "objective/rlhf_reward": -0.39160120487213135, |
| "objective/scores": -0.099609375, |
| "policy/approxkl_avg": 0.002172058681026101, |
| "policy/clipfrac_avg": 0.003662109375, |
| "policy/entropy_avg": 1.696380853652954, |
| "step": 39, |
| "val/clipfrac_avg": 0.0001220703125, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.962264895439148, |
| "val/ratio_var": 3.2267200822388986e-06 |
| }, |
| { |
| "episode": 640, |
| "epoch": 0.8, |
| "eps": 0, |
| "loss/policy_avg": -0.0031222254037857056, |
| "loss/value_avg": 0.6954755783081055, |
| "lr": 3.900000000000001e-06, |
| "objective/entropy": 147.38571166992188, |
| "objective/kl": 5.248948097229004, |
| "objective/non_score_reward": -0.26244741678237915, |
| "objective/rlhf_reward": -0.41576772928237915, |
| "objective/scores": -0.1533203125, |
| "policy/approxkl_avg": 0.0020004287362098694, |
| "policy/clipfrac_avg": 0.0028076171875, |
| "policy/entropy_avg": 1.4099265336990356, |
| "step": 40, |
| "val/clipfrac_avg": 0.0003662109375, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.964896559715271, |
| "val/ratio_var": 5.165110906091286e-06 |
| }, |
| { |
| "episode": 656, |
| "epoch": 0.82, |
| "eps": 0, |
| "loss/policy_avg": -0.0036731180734932423, |
| "loss/value_avg": 0.6531814336776733, |
| "lr": 4.000000000000001e-06, |
| "objective/entropy": 186.69369506835938, |
| "objective/kl": 5.787052154541016, |
| "objective/non_score_reward": -0.2893525958061218, |
| "objective/rlhf_reward": -0.15849322080612183, |
| "objective/scores": 0.130859375, |
| "policy/approxkl_avg": 0.002225282369181514, |
| "policy/clipfrac_avg": 0.0023193359375, |
| "policy/entropy_avg": 1.731877088546753, |
| "step": 41, |
| "val/clipfrac_avg": 0.0006103515625, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9604864120483398, |
| "val/ratio_var": 2.4040532480285037e-06 |
| }, |
| { |
| "episode": 672, |
| "epoch": 0.84, |
| "eps": 0, |
| "loss/policy_avg": -0.003208685666322708, |
| "loss/value_avg": 0.4511321187019348, |
| "lr": 4.1e-06, |
| "objective/entropy": 193.63333129882812, |
| "objective/kl": 5.826928615570068, |
| "objective/non_score_reward": -0.2913464307785034, |
| "objective/rlhf_reward": -0.6956433057785034, |
| "objective/scores": -0.404296875, |
| "policy/approxkl_avg": 0.0021463697776198387, |
| "policy/clipfrac_avg": 0.0028076171875, |
| "policy/entropy_avg": 1.808129906654358, |
| "step": 42, |
| "val/clipfrac_avg": 0.000244140625, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9606460928916931, |
| "val/ratio_var": 1.945327767316485e-06 |
| }, |
| { |
| "episode": 688, |
| "epoch": 0.86, |
| "eps": 0, |
| "loss/policy_avg": -0.00527562340721488, |
| "loss/value_avg": 0.5236003398895264, |
| "lr": 4.2000000000000004e-06, |
| "objective/entropy": 200.14642333984375, |
| "objective/kl": 6.3243584632873535, |
| "objective/non_score_reward": -0.31621789932250977, |
| "objective/rlhf_reward": -0.21416711807250977, |
| "objective/scores": 0.10205078125, |
| "policy/approxkl_avg": 0.002284294692799449, |
| "policy/clipfrac_avg": 0.002685546875, |
| "policy/entropy_avg": 1.8296796083450317, |
| "step": 43, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9591376781463623, |
| "val/ratio_var": 3.582605586416321e-06 |
| }, |
| { |
| "episode": 704, |
| "epoch": 0.88, |
| "eps": 0, |
| "loss/policy_avg": -0.004552279599010944, |
| "loss/value_avg": 0.44474709033966064, |
| "lr": 4.3e-06, |
| "objective/entropy": 198.2988739013672, |
| "objective/kl": 6.11703634262085, |
| "objective/non_score_reward": -0.3058518171310425, |
| "objective/rlhf_reward": -0.3976486921310425, |
| "objective/scores": -0.091796875, |
| "policy/approxkl_avg": 0.002352041658014059, |
| "policy/clipfrac_avg": 0.0030517578125, |
| "policy/entropy_avg": 1.8390659093856812, |
| "step": 44, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9594440460205078, |
| "val/ratio_var": 1.183144263450231e-06 |
| }, |
| { |
| "episode": 720, |
| "epoch": 0.9, |
| "eps": 0, |
| "loss/policy_avg": -0.0043452465906739235, |
| "loss/value_avg": 0.5411888360977173, |
| "lr": 4.4e-06, |
| "objective/entropy": 176.61441040039062, |
| "objective/kl": 6.0966949462890625, |
| "objective/non_score_reward": -0.30483478307724, |
| "objective/rlhf_reward": -0.35756915807724, |
| "objective/scores": -0.052734375, |
| "policy/approxkl_avg": 0.0021276341285556555, |
| "policy/clipfrac_avg": 0.0040283203125, |
| "policy/entropy_avg": 1.6279263496398926, |
| "step": 45, |
| "val/clipfrac_avg": 0.0003662109375, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9633927345275879, |
| "val/ratio_var": 3.1293473057303345e-06 |
| }, |
| { |
| "episode": 736, |
| "epoch": 0.92, |
| "eps": 0, |
| "loss/policy_avg": -0.004358572885394096, |
| "loss/value_avg": 0.42890092730522156, |
| "lr": 4.5e-06, |
| "objective/entropy": 184.73455810546875, |
| "objective/kl": 6.081456184387207, |
| "objective/non_score_reward": -0.3040727972984314, |
| "objective/rlhf_reward": -0.4119829535484314, |
| "objective/scores": -0.10791015625, |
| "policy/approxkl_avg": 0.0021303845569491386, |
| "policy/clipfrac_avg": 0.001953125, |
| "policy/entropy_avg": 1.743346929550171, |
| "step": 46, |
| "val/clipfrac_avg": 0.00048828125, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9609779119491577, |
| "val/ratio_var": 2.9215109407232376e-06 |
| }, |
| { |
| "episode": 752, |
| "epoch": 0.94, |
| "eps": 0, |
| "loss/policy_avg": -0.00496285455301404, |
| "loss/value_avg": 0.4583837687969208, |
| "lr": 4.600000000000001e-06, |
| "objective/entropy": 193.1529541015625, |
| "objective/kl": 6.68864631652832, |
| "objective/non_score_reward": -0.33443236351013184, |
| "objective/rlhf_reward": -0.22603392601013184, |
| "objective/scores": 0.1083984375, |
| "policy/approxkl_avg": 0.002309663686901331, |
| "policy/clipfrac_avg": 0.0028076171875, |
| "policy/entropy_avg": 1.819280743598938, |
| "step": 47, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.958611011505127, |
| "val/ratio_var": 3.0761455036554253e-06 |
| }, |
| { |
| "episode": 768, |
| "epoch": 0.96, |
| "eps": 0, |
| "loss/policy_avg": -0.0036918839905411005, |
| "loss/value_avg": 0.5343762636184692, |
| "lr": 4.7e-06, |
| "objective/entropy": 172.60386657714844, |
| "objective/kl": 5.954087734222412, |
| "objective/non_score_reward": -0.29770439863204956, |
| "objective/rlhf_reward": -0.9109856486320496, |
| "objective/scores": -0.61328125, |
| "policy/approxkl_avg": 0.0021302001550793648, |
| "policy/clipfrac_avg": 0.0029296875, |
| "policy/entropy_avg": 1.6243486404418945, |
| "step": 48, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9628680944442749, |
| "val/ratio_var": 7.764373549434822e-06 |
| }, |
| { |
| "episode": 784, |
| "epoch": 0.98, |
| "eps": 0, |
| "loss/policy_avg": -0.0063984692096710205, |
| "loss/value_avg": 0.4495910108089447, |
| "lr": 4.800000000000001e-06, |
| "objective/entropy": 177.82730102539062, |
| "objective/kl": 6.320567607879639, |
| "objective/non_score_reward": -0.3160283863544464, |
| "objective/rlhf_reward": -0.685168981552124, |
| "objective/scores": -0.369140625, |
| "policy/approxkl_avg": 0.002262189518660307, |
| "policy/clipfrac_avg": 0.00341796875, |
| "policy/entropy_avg": 1.6700631380081177, |
| "step": 49, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9607704877853394, |
| "val/ratio_var": 6.711939931847155e-06 |
| }, |
| { |
| "episode": 800, |
| "epoch": 1.0, |
| "eps": 0, |
| "loss/policy_avg": -0.007105860859155655, |
| "loss/value_avg": 0.3898148536682129, |
| "lr": 4.9000000000000005e-06, |
| "objective/entropy": 202.3233184814453, |
| "objective/kl": 7.163693428039551, |
| "objective/non_score_reward": -0.35818469524383545, |
| "objective/rlhf_reward": -0.45291125774383545, |
| "objective/scores": -0.0947265625, |
| "policy/approxkl_avg": 0.0024232997093349695, |
| "policy/clipfrac_avg": 0.0042724609375, |
| "policy/entropy_avg": 1.8700368404388428, |
| "step": 50, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9586131572723389, |
| "val/ratio_var": 2.8943763936695177e-06 |
| }, |
| { |
| "episode": 816, |
| "epoch": 1.02, |
| "eps": 0, |
| "loss/policy_avg": -0.003659446258097887, |
| "loss/value_avg": 0.36625581979751587, |
| "lr": 5e-06, |
| "objective/entropy": 202.9866180419922, |
| "objective/kl": 8.40648078918457, |
| "objective/non_score_reward": -0.42032405734062195, |
| "objective/rlhf_reward": -0.7484490871429443, |
| "objective/scores": -0.328125, |
| "policy/approxkl_avg": 0.002428454579785466, |
| "policy/clipfrac_avg": 0.0023193359375, |
| "policy/entropy_avg": 1.9231579303741455, |
| "step": 51, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.958379864692688, |
| "val/ratio_var": 6.084300139264087e-07 |
| }, |
| { |
| "episode": 832, |
| "epoch": 1.04, |
| "eps": 0, |
| "loss/policy_avg": -0.006424235180020332, |
| "loss/value_avg": 0.4322627782821655, |
| "lr": 5.1e-06, |
| "objective/entropy": 188.87704467773438, |
| "objective/kl": 6.012404441833496, |
| "objective/non_score_reward": -0.30062025785446167, |
| "objective/rlhf_reward": -0.34749525785446167, |
| "objective/scores": -0.046875, |
| "policy/approxkl_avg": 0.0023025020491331816, |
| "policy/clipfrac_avg": 0.0025634765625, |
| "policy/entropy_avg": 1.7421900033950806, |
| "step": 52, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9606554508209229, |
| "val/ratio_var": 3.889582330884878e-06 |
| }, |
| { |
| "episode": 848, |
| "epoch": 1.06, |
| "eps": 0, |
| "loss/policy_avg": -0.0050460826605558395, |
| "loss/value_avg": 0.4796571731567383, |
| "lr": 5.2e-06, |
| "objective/entropy": 164.9066162109375, |
| "objective/kl": 5.981889724731445, |
| "objective/non_score_reward": -0.2990944981575012, |
| "objective/rlhf_reward": -0.6916726231575012, |
| "objective/scores": -0.392578125, |
| "policy/approxkl_avg": 0.0021395045332610607, |
| "policy/clipfrac_avg": 0.0048828125, |
| "policy/entropy_avg": 1.5888962745666504, |
| "step": 53, |
| "val/clipfrac_avg": 0.0001220703125, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9628995656967163, |
| "val/ratio_var": 4.597809947881615e-06 |
| }, |
| { |
| "episode": 864, |
| "epoch": 1.08, |
| "eps": 0, |
| "loss/policy_avg": -0.006381146609783173, |
| "loss/value_avg": 0.3751007616519928, |
| "lr": 5.300000000000001e-06, |
| "objective/entropy": 197.0960693359375, |
| "objective/kl": 6.693915843963623, |
| "objective/non_score_reward": -0.3346957862377167, |
| "objective/rlhf_reward": -0.7624301910400391, |
| "objective/scores": -0.427734375, |
| "policy/approxkl_avg": 0.0023175738751888275, |
| "policy/clipfrac_avg": 0.004150390625, |
| "policy/entropy_avg": 1.8260283470153809, |
| "step": 54, |
| "val/clipfrac_avg": 0.000244140625, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9607384204864502, |
| "val/ratio_var": 3.528241450112546e-06 |
| }, |
| { |
| "episode": 880, |
| "epoch": 1.1, |
| "eps": 0, |
| "loss/policy_avg": -0.005390330217778683, |
| "loss/value_avg": 0.3408937454223633, |
| "lr": 5.400000000000001e-06, |
| "objective/entropy": 189.24441528320312, |
| "objective/kl": 7.497405052185059, |
| "objective/non_score_reward": -0.374870240688324, |
| "objective/rlhf_reward": -0.493034303188324, |
| "objective/scores": -0.1181640625, |
| "policy/approxkl_avg": 0.0023757517337799072, |
| "policy/clipfrac_avg": 0.0042724609375, |
| "policy/entropy_avg": 1.729443073272705, |
| "step": 55, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9610804319381714, |
| "val/ratio_var": 5.716868599847658e-06 |
| }, |
| { |
| "episode": 896, |
| "epoch": 1.12, |
| "eps": 0, |
| "loss/policy_avg": -0.009562751278281212, |
| "loss/value_avg": 0.34088462591171265, |
| "lr": 5.500000000000001e-06, |
| "objective/entropy": 186.15057373046875, |
| "objective/kl": 7.024878025054932, |
| "objective/non_score_reward": -0.35124391317367554, |
| "objective/rlhf_reward": -0.31706422567367554, |
| "objective/scores": 0.0341796875, |
| "policy/approxkl_avg": 0.002572981407865882, |
| "policy/clipfrac_avg": 0.0069580078125, |
| "policy/entropy_avg": 1.7276947498321533, |
| "step": 56, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9590590000152588, |
| "val/ratio_var": 3.823240604106104e-06 |
| }, |
| { |
| "episode": 912, |
| "epoch": 1.14, |
| "eps": 0, |
| "loss/policy_avg": -0.006179399322718382, |
| "loss/value_avg": 0.3504621088504791, |
| "lr": 5.600000000000001e-06, |
| "objective/entropy": 203.68405151367188, |
| "objective/kl": 7.05451774597168, |
| "objective/non_score_reward": -0.3527258634567261, |
| "objective/rlhf_reward": -0.2843664884567261, |
| "objective/scores": 0.068359375, |
| "policy/approxkl_avg": 0.0024855139199644327, |
| "policy/clipfrac_avg": 0.0064697265625, |
| "policy/entropy_avg": 1.8781225681304932, |
| "step": 57, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9579586982727051, |
| "val/ratio_var": 6.8513472797349095e-06 |
| }, |
| { |
| "episode": 928, |
| "epoch": 1.16, |
| "eps": 0, |
| "loss/policy_avg": -0.006329299416393042, |
| "loss/value_avg": 0.2883201241493225, |
| "lr": 5.7e-06, |
| "objective/entropy": 188.50772094726562, |
| "objective/kl": 7.495998382568359, |
| "objective/non_score_reward": -0.3747999370098114, |
| "objective/rlhf_reward": -0.3398878276348114, |
| "objective/scores": 0.034912109375, |
| "policy/approxkl_avg": 0.002424823120236397, |
| "policy/clipfrac_avg": 0.0064697265625, |
| "policy/entropy_avg": 1.7669949531555176, |
| "step": 58, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9617897272109985, |
| "val/ratio_var": 3.0927415082260268e-06 |
| }, |
| { |
| "episode": 944, |
| "epoch": 1.18, |
| "eps": 0, |
| "loss/policy_avg": -0.008085895329713821, |
| "loss/value_avg": 0.30470138788223267, |
| "lr": 5.8e-06, |
| "objective/entropy": 190.35797119140625, |
| "objective/kl": 8.07366943359375, |
| "objective/non_score_reward": -0.40368348360061646, |
| "objective/rlhf_reward": -0.6810272336006165, |
| "objective/scores": -0.27734375, |
| "policy/approxkl_avg": 0.002423493890091777, |
| "policy/clipfrac_avg": 0.005859375, |
| "policy/entropy_avg": 1.745201826095581, |
| "step": 59, |
| "val/clipfrac_avg": 0.0003662109375, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9606545567512512, |
| "val/ratio_var": 2.6997802251571557e-06 |
| }, |
| { |
| "episode": 960, |
| "epoch": 1.2, |
| "eps": 0, |
| "loss/policy_avg": -0.0073398323729634285, |
| "loss/value_avg": 0.2474157214164734, |
| "lr": 5.9e-06, |
| "objective/entropy": 186.44775390625, |
| "objective/kl": 7.319401741027832, |
| "objective/non_score_reward": -0.36597010493278503, |
| "objective/rlhf_reward": -0.32300135493278503, |
| "objective/scores": 0.04296875, |
| "policy/approxkl_avg": 0.0022131437435746193, |
| "policy/clipfrac_avg": 0.0037841796875, |
| "policy/entropy_avg": 1.7592250108718872, |
| "step": 60, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9618268609046936, |
| "val/ratio_var": 1.1245646192037384e-06 |
| }, |
| { |
| "episode": 976, |
| "epoch": 1.22, |
| "eps": 0, |
| "loss/policy_avg": -0.008636923506855965, |
| "loss/value_avg": 0.3011922240257263, |
| "lr": 6e-06, |
| "objective/entropy": 194.01901245117188, |
| "objective/kl": 8.093948364257812, |
| "objective/non_score_reward": -0.404697448015213, |
| "objective/rlhf_reward": -0.5292091369628906, |
| "objective/scores": -0.12451171875, |
| "policy/approxkl_avg": 0.002539466368034482, |
| "policy/clipfrac_avg": 0.0064697265625, |
| "policy/entropy_avg": 1.7535340785980225, |
| "step": 61, |
| "val/clipfrac_avg": 0.0001220703125, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9611643552780151, |
| "val/ratio_var": 4.485158569877967e-06 |
| }, |
| { |
| "episode": 992, |
| "epoch": 1.24, |
| "eps": 0, |
| "loss/policy_avg": -0.008942335844039917, |
| "loss/value_avg": 0.314566433429718, |
| "lr": 6.1e-06, |
| "objective/entropy": 207.50445556640625, |
| "objective/kl": 9.318485260009766, |
| "objective/non_score_reward": -0.4659242033958435, |
| "objective/rlhf_reward": -0.6280335783958435, |
| "objective/scores": -0.162109375, |
| "policy/approxkl_avg": 0.002469500759616494, |
| "policy/clipfrac_avg": 0.0057373046875, |
| "policy/entropy_avg": 1.886934757232666, |
| "step": 62, |
| "val/clipfrac_avg": 0.0001220703125, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9599511027336121, |
| "val/ratio_var": 1.6062762142610154e-06 |
| }, |
| { |
| "episode": 1008, |
| "epoch": 1.26, |
| "eps": 0, |
| "loss/policy_avg": -0.0077853333204984665, |
| "loss/value_avg": 0.2659590542316437, |
| "lr": 6.200000000000001e-06, |
| "objective/entropy": 184.21324157714844, |
| "objective/kl": 8.50517463684082, |
| "objective/non_score_reward": -0.4252587556838989, |
| "objective/rlhf_reward": -0.6410790681838989, |
| "objective/scores": -0.2158203125, |
| "policy/approxkl_avg": 0.0023279902525246143, |
| "policy/clipfrac_avg": 0.0050048828125, |
| "policy/entropy_avg": 1.704209566116333, |
| "step": 63, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9618266820907593, |
| "val/ratio_var": 2.3681529910390964e-06 |
| }, |
| { |
| "episode": 1024, |
| "epoch": 1.28, |
| "eps": 0, |
| "loss/policy_avg": -0.008203014731407166, |
| "loss/value_avg": 0.24721446633338928, |
| "lr": 6.300000000000001e-06, |
| "objective/entropy": 167.58615112304688, |
| "objective/kl": 8.397467613220215, |
| "objective/non_score_reward": -0.4198733866214752, |
| "objective/rlhf_reward": -0.5258303880691528, |
| "objective/scores": -0.10595703125, |
| "policy/approxkl_avg": 0.0023372513242065907, |
| "policy/clipfrac_avg": 0.00634765625, |
| "policy/entropy_avg": 1.590405821800232, |
| "step": 64, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9629535675048828, |
| "val/ratio_var": 1.0923018862740719e-06 |
| }, |
| { |
| "episode": 1040, |
| "epoch": 1.3, |
| "eps": 0, |
| "loss/policy_avg": -0.010154301300644875, |
| "loss/value_avg": 0.255871057510376, |
| "lr": 6.4000000000000006e-06, |
| "objective/entropy": 187.9450225830078, |
| "objective/kl": 8.685728073120117, |
| "objective/non_score_reward": -0.4342864155769348, |
| "objective/rlhf_reward": -0.4543059468269348, |
| "objective/scores": -0.02001953125, |
| "policy/approxkl_avg": 0.0025775341782718897, |
| "policy/clipfrac_avg": 0.0072021484375, |
| "policy/entropy_avg": 1.717556118965149, |
| "step": 65, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9599798917770386, |
| "val/ratio_var": 3.966343228967162e-06 |
| }, |
| { |
| "episode": 1056, |
| "epoch": 1.32, |
| "eps": 0, |
| "loss/policy_avg": -0.006520813796669245, |
| "loss/value_avg": 0.20695430040359497, |
| "lr": 6.5000000000000004e-06, |
| "objective/entropy": 171.31948852539062, |
| "objective/kl": 8.512733459472656, |
| "objective/non_score_reward": -0.4256366491317749, |
| "objective/rlhf_reward": -0.7537616491317749, |
| "objective/scores": -0.328125, |
| "policy/approxkl_avg": 0.0022518050391227007, |
| "policy/clipfrac_avg": 0.0069580078125, |
| "policy/entropy_avg": 1.5771468877792358, |
| "step": 66, |
| "val/clipfrac_avg": 0.0001220703125, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9653177857398987, |
| "val/ratio_var": 5.141545898368349e-06 |
| }, |
| { |
| "episode": 1072, |
| "epoch": 1.34, |
| "eps": 0, |
| "loss/policy_avg": -0.007219092920422554, |
| "loss/value_avg": 0.21666979789733887, |
| "lr": 6.600000000000001e-06, |
| "objective/entropy": 182.22146606445312, |
| "objective/kl": 10.377399444580078, |
| "objective/non_score_reward": -0.518869936466217, |
| "objective/rlhf_reward": -1.4524636268615723, |
| "objective/scores": -0.93359375, |
| "policy/approxkl_avg": 0.0024627677630633116, |
| "policy/clipfrac_avg": 0.0068359375, |
| "policy/entropy_avg": 1.7197209596633911, |
| "step": 67, |
| "val/clipfrac_avg": 0.0040283203125, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9610635042190552, |
| "val/ratio_var": 2.259886286992696e-06 |
| }, |
| { |
| "episode": 1088, |
| "epoch": 1.36, |
| "eps": 0, |
| "loss/policy_avg": -0.009395209141075611, |
| "loss/value_avg": 0.22811496257781982, |
| "lr": 6.700000000000001e-06, |
| "objective/entropy": 184.89671325683594, |
| "objective/kl": 9.966436386108398, |
| "objective/non_score_reward": -0.4983218014240265, |
| "objective/rlhf_reward": -0.5730288028717041, |
| "objective/scores": -0.07470703125, |
| "policy/approxkl_avg": 0.002558775944635272, |
| "policy/clipfrac_avg": 0.0076904296875, |
| "policy/entropy_avg": 1.7344484329223633, |
| "step": 68, |
| "val/clipfrac_avg": 0.0013427734375, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9611660242080688, |
| "val/ratio_var": 4.669778263632907e-06 |
| }, |
| { |
| "episode": 1104, |
| "epoch": 1.38, |
| "eps": 0, |
| "loss/policy_avg": -0.011558989062905312, |
| "loss/value_avg": 0.2556169629096985, |
| "lr": 6.800000000000001e-06, |
| "objective/entropy": 185.43399047851562, |
| "objective/kl": 9.309539794921875, |
| "objective/non_score_reward": -0.46547698974609375, |
| "objective/rlhf_reward": -0.16078948974609375, |
| "objective/scores": 0.3046875, |
| "policy/approxkl_avg": 0.0025009019300341606, |
| "policy/clipfrac_avg": 0.0067138671875, |
| "policy/entropy_avg": 1.7306885719299316, |
| "step": 69, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9632130265235901, |
| "val/ratio_var": 8.589087883592583e-06 |
| }, |
| { |
| "episode": 1120, |
| "epoch": 1.4, |
| "eps": 0, |
| "loss/policy_avg": -0.008942339569330215, |
| "loss/value_avg": 0.19310247898101807, |
| "lr": 6.9e-06, |
| "objective/entropy": 186.04307556152344, |
| "objective/kl": 8.404040336608887, |
| "objective/non_score_reward": -0.4202020466327667, |
| "objective/rlhf_reward": -0.7112176418304443, |
| "objective/scores": -0.291015625, |
| "policy/approxkl_avg": 0.0025021119508892298, |
| "policy/clipfrac_avg": 0.0072021484375, |
| "policy/entropy_avg": 1.6525442600250244, |
| "step": 70, |
| "val/clipfrac_avg": 0.0003662109375, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9628069400787354, |
| "val/ratio_var": 1.0831280405909638e-06 |
| }, |
| { |
| "episode": 1136, |
| "epoch": 1.42, |
| "eps": 0, |
| "loss/policy_avg": -0.011153987608850002, |
| "loss/value_avg": 0.1662825345993042, |
| "lr": 7e-06, |
| "objective/entropy": 180.92037963867188, |
| "objective/kl": 10.789932250976562, |
| "objective/non_score_reward": -0.539496660232544, |
| "objective/rlhf_reward": -0.670356035232544, |
| "objective/scores": -0.130859375, |
| "policy/approxkl_avg": 0.002762062707915902, |
| "policy/clipfrac_avg": 0.01025390625, |
| "policy/entropy_avg": 1.6263654232025146, |
| "step": 71, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9615206122398376, |
| "val/ratio_var": 3.7071347378514474e-06 |
| }, |
| { |
| "episode": 1152, |
| "epoch": 1.44, |
| "eps": 0, |
| "loss/policy_avg": -0.00964896660298109, |
| "loss/value_avg": 0.1657545566558838, |
| "lr": 7.100000000000001e-06, |
| "objective/entropy": 199.1170654296875, |
| "objective/kl": 9.041791915893555, |
| "objective/non_score_reward": -0.4520896077156067, |
| "objective/rlhf_reward": -0.7138083577156067, |
| "objective/scores": -0.26171875, |
| "policy/approxkl_avg": 0.002562435809522867, |
| "policy/clipfrac_avg": 0.0069580078125, |
| "policy/entropy_avg": 1.7946536540985107, |
| "step": 72, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9602845907211304, |
| "val/ratio_var": 6.241244591365103e-06 |
| }, |
| { |
| "episode": 1168, |
| "epoch": 1.46, |
| "eps": 0, |
| "loss/policy_avg": -0.011143850162625313, |
| "loss/value_avg": 0.15495336055755615, |
| "lr": 7.2000000000000005e-06, |
| "objective/entropy": 177.09274291992188, |
| "objective/kl": 10.396652221679688, |
| "objective/non_score_reward": -0.5198326110839844, |
| "objective/rlhf_reward": -0.6262779235839844, |
| "objective/scores": -0.1064453125, |
| "policy/approxkl_avg": 0.002627840731292963, |
| "policy/clipfrac_avg": 0.0086669921875, |
| "policy/entropy_avg": 1.652453899383545, |
| "step": 73, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9642380475997925, |
| "val/ratio_var": 2.231507551186951e-06 |
| }, |
| { |
| "episode": 1184, |
| "epoch": 1.48, |
| "eps": 0, |
| "loss/policy_avg": -0.011983148753643036, |
| "loss/value_avg": 0.1448766589164734, |
| "lr": 7.3e-06, |
| "objective/entropy": 184.20877075195312, |
| "objective/kl": 9.668464660644531, |
| "objective/non_score_reward": -0.48342326283454895, |
| "objective/rlhf_reward": -0.7168216705322266, |
| "objective/scores": -0.2333984375, |
| "policy/approxkl_avg": 0.0025414335541427135, |
| "policy/clipfrac_avg": 0.0081787109375, |
| "policy/entropy_avg": 1.7255767583847046, |
| "step": 74, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9632611274719238, |
| "val/ratio_var": 8.115292075672187e-06 |
| }, |
| { |
| "episode": 1200, |
| "epoch": 1.5, |
| "eps": 0, |
| "loss/policy_avg": -0.007804821245372295, |
| "loss/value_avg": 0.16503594815731049, |
| "lr": 7.4e-06, |
| "objective/entropy": 160.60726928710938, |
| "objective/kl": 10.010700225830078, |
| "objective/non_score_reward": -0.5005350112915039, |
| "objective/rlhf_reward": -0.7358865737915039, |
| "objective/scores": -0.2353515625, |
| "policy/approxkl_avg": 0.002399697434157133, |
| "policy/clipfrac_avg": 0.0081787109375, |
| "policy/entropy_avg": 1.5353442430496216, |
| "step": 75, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9654887914657593, |
| "val/ratio_var": 7.977667337399907e-06 |
| }, |
| { |
| "episode": 1216, |
| "epoch": 1.52, |
| "eps": 0, |
| "loss/policy_avg": -0.012328230775892735, |
| "loss/value_avg": 0.1142549216747284, |
| "lr": 7.500000000000001e-06, |
| "objective/entropy": 200.383544921875, |
| "objective/kl": 9.883283615112305, |
| "objective/non_score_reward": -0.49416422843933105, |
| "objective/rlhf_reward": -0.875023603439331, |
| "objective/scores": -0.380859375, |
| "policy/approxkl_avg": 0.0030652470886707306, |
| "policy/clipfrac_avg": 0.0091552734375, |
| "policy/entropy_avg": 1.8219434022903442, |
| "step": 76, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9619243144989014, |
| "val/ratio_var": 9.272188435716089e-06 |
| }, |
| { |
| "episode": 1232, |
| "epoch": 1.54, |
| "eps": 0, |
| "loss/policy_avg": -0.009849337860941887, |
| "loss/value_avg": 0.09608270227909088, |
| "lr": 7.600000000000001e-06, |
| "objective/entropy": 194.978759765625, |
| "objective/kl": 9.415257453918457, |
| "objective/non_score_reward": -0.47076287865638733, |
| "objective/rlhf_reward": -0.6123644113540649, |
| "objective/scores": -0.1416015625, |
| "policy/approxkl_avg": 0.002732700901106, |
| "policy/clipfrac_avg": 0.009033203125, |
| "policy/entropy_avg": 1.7990806102752686, |
| "step": 77, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.959631085395813, |
| "val/ratio_var": 1.135609363700496e-05 |
| }, |
| { |
| "episode": 1248, |
| "epoch": 1.56, |
| "eps": 0, |
| "loss/policy_avg": -0.010330687277019024, |
| "loss/value_avg": 0.10260234773159027, |
| "lr": 7.7e-06, |
| "objective/entropy": 195.35775756835938, |
| "objective/kl": 11.809048652648926, |
| "objective/non_score_reward": -0.5904524326324463, |
| "objective/rlhf_reward": -1.0259993076324463, |
| "objective/scores": -0.435546875, |
| "policy/approxkl_avg": 0.0028561949729919434, |
| "policy/clipfrac_avg": 0.0106201171875, |
| "policy/entropy_avg": 1.775145173072815, |
| "step": 78, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9603179097175598, |
| "val/ratio_var": 7.384338687188574e-07 |
| }, |
| { |
| "episode": 1264, |
| "epoch": 1.58, |
| "eps": 0, |
| "loss/policy_avg": -0.008724691346287727, |
| "loss/value_avg": 0.1127011626958847, |
| "lr": 7.800000000000002e-06, |
| "objective/entropy": 192.0875701904297, |
| "objective/kl": 11.521632194519043, |
| "objective/non_score_reward": -0.5760816335678101, |
| "objective/rlhf_reward": -0.7323316335678101, |
| "objective/scores": -0.15625, |
| "policy/approxkl_avg": 0.0027797226794064045, |
| "policy/clipfrac_avg": 0.009765625, |
| "policy/entropy_avg": 1.8669971227645874, |
| "step": 79, |
| "val/clipfrac_avg": 0.0001220703125, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9582688808441162, |
| "val/ratio_var": 3.075662561968784e-06 |
| }, |
| { |
| "episode": 1280, |
| "epoch": 1.6, |
| "eps": 0, |
| "loss/policy_avg": -0.010857291519641876, |
| "loss/value_avg": 0.09611570090055466, |
| "lr": 7.9e-06, |
| "objective/entropy": 205.4402313232422, |
| "objective/kl": 10.072277069091797, |
| "objective/non_score_reward": -0.5036138296127319, |
| "objective/rlhf_reward": -0.8102544546127319, |
| "objective/scores": -0.306640625, |
| "policy/approxkl_avg": 0.002693342510610819, |
| "policy/clipfrac_avg": 0.0098876953125, |
| "policy/entropy_avg": 1.8436152935028076, |
| "step": 80, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9618000984191895, |
| "val/ratio_var": 3.328791990497848e-06 |
| }, |
| { |
| "episode": 1296, |
| "epoch": 1.62, |
| "eps": 0, |
| "loss/policy_avg": -0.011511455290019512, |
| "loss/value_avg": 0.15145719051361084, |
| "lr": 8.000000000000001e-06, |
| "objective/entropy": 179.18080139160156, |
| "objective/kl": 9.748101234436035, |
| "objective/non_score_reward": -0.48740506172180176, |
| "objective/rlhf_reward": -0.7432644367218018, |
| "objective/scores": -0.255859375, |
| "policy/approxkl_avg": 0.0032036681659519672, |
| "policy/clipfrac_avg": 0.0150146484375, |
| "policy/entropy_avg": 1.6454665660858154, |
| "step": 81, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9626948237419128, |
| "val/ratio_var": 3.5850334825227037e-06 |
| }, |
| { |
| "episode": 1312, |
| "epoch": 1.64, |
| "eps": 0, |
| "loss/policy_avg": -0.013651885092258453, |
| "loss/value_avg": 0.11120277643203735, |
| "lr": 8.1e-06, |
| "objective/entropy": 210.29153442382812, |
| "objective/kl": 12.029438018798828, |
| "objective/non_score_reward": -0.6014719009399414, |
| "objective/rlhf_reward": -0.6336984634399414, |
| "objective/scores": -0.0322265625, |
| "policy/approxkl_avg": 0.00316830538213253, |
| "policy/clipfrac_avg": 0.01513671875, |
| "policy/entropy_avg": 1.9450147151947021, |
| "step": 82, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9584970474243164, |
| "val/ratio_var": 1.1431648999860045e-05 |
| }, |
| { |
| "episode": 1328, |
| "epoch": 1.66, |
| "eps": 0, |
| "loss/policy_avg": -0.012597035616636276, |
| "loss/value_avg": 0.0916157066822052, |
| "lr": 8.2e-06, |
| "objective/entropy": 191.00723266601562, |
| "objective/kl": 12.672134399414062, |
| "objective/non_score_reward": -0.6336066722869873, |
| "objective/rlhf_reward": -1.0515754222869873, |
| "objective/scores": -0.41796875, |
| "policy/approxkl_avg": 0.002965549472719431, |
| "policy/clipfrac_avg": 0.0135498046875, |
| "policy/entropy_avg": 1.7200603485107422, |
| "step": 83, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9628971815109253, |
| "val/ratio_var": 8.506719495926518e-06 |
| }, |
| { |
| "episode": 1344, |
| "epoch": 1.68, |
| "eps": 0, |
| "loss/policy_avg": -0.011588087305426598, |
| "loss/value_avg": 0.08295813202857971, |
| "lr": 8.3e-06, |
| "objective/entropy": 171.20484924316406, |
| "objective/kl": 11.827956199645996, |
| "objective/non_score_reward": -0.5913978219032288, |
| "objective/rlhf_reward": -0.6612220406532288, |
| "objective/scores": -0.06982421875, |
| "policy/approxkl_avg": 0.0028498598840087652, |
| "policy/clipfrac_avg": 0.01220703125, |
| "policy/entropy_avg": 1.6282272338867188, |
| "step": 84, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9648880958557129, |
| "val/ratio_var": 2.407702822893043e-06 |
| }, |
| { |
| "episode": 1360, |
| "epoch": 1.7, |
| "eps": 0, |
| "loss/policy_avg": -0.011325595900416374, |
| "loss/value_avg": 0.11857910454273224, |
| "lr": 8.400000000000001e-06, |
| "objective/entropy": 210.7764434814453, |
| "objective/kl": 11.930746078491211, |
| "objective/non_score_reward": -0.5965373516082764, |
| "objective/rlhf_reward": -0.7498576641082764, |
| "objective/scores": -0.1533203125, |
| "policy/approxkl_avg": 0.0030916037503629923, |
| "policy/clipfrac_avg": 0.01220703125, |
| "policy/entropy_avg": 1.8590654134750366, |
| "step": 85, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9609018564224243, |
| "val/ratio_var": 2.87379384644737e-06 |
| }, |
| { |
| "episode": 1376, |
| "epoch": 1.72, |
| "eps": 0, |
| "loss/policy_avg": -0.009613092988729477, |
| "loss/value_avg": 0.08760805428028107, |
| "lr": 8.5e-06, |
| "objective/entropy": 177.6520233154297, |
| "objective/kl": 11.344982147216797, |
| "objective/non_score_reward": -0.567249059677124, |
| "objective/rlhf_reward": -0.786975622177124, |
| "objective/scores": -0.2197265625, |
| "policy/approxkl_avg": 0.0025511980056762695, |
| "policy/clipfrac_avg": 0.010009765625, |
| "policy/entropy_avg": 1.7009525299072266, |
| "step": 86, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9633545875549316, |
| "val/ratio_var": 4.815346528630471e-06 |
| }, |
| { |
| "episode": 1392, |
| "epoch": 1.74, |
| "eps": 0, |
| "loss/policy_avg": -0.012850948609411716, |
| "loss/value_avg": 0.0762033462524414, |
| "lr": 8.6e-06, |
| "objective/entropy": 177.36361694335938, |
| "objective/kl": 11.560365676879883, |
| "objective/non_score_reward": -0.578018307685852, |
| "objective/rlhf_reward": -0.583633542060852, |
| "objective/scores": -0.005615234375, |
| "policy/approxkl_avg": 0.002776096574962139, |
| "policy/clipfrac_avg": 0.0123291015625, |
| "policy/entropy_avg": 1.6667418479919434, |
| "step": 87, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9627068042755127, |
| "val/ratio_var": 1.2500585398811381e-05 |
| }, |
| { |
| "episode": 1408, |
| "epoch": 1.76, |
| "eps": 0, |
| "loss/policy_avg": -0.009959004819393158, |
| "loss/value_avg": 0.0731486976146698, |
| "lr": 8.700000000000001e-06, |
| "objective/entropy": 173.85479736328125, |
| "objective/kl": 11.528783798217773, |
| "objective/non_score_reward": -0.5764391422271729, |
| "objective/rlhf_reward": -0.29714226722717285, |
| "objective/scores": 0.279296875, |
| "policy/approxkl_avg": 0.002866474213078618, |
| "policy/clipfrac_avg": 0.011962890625, |
| "policy/entropy_avg": 1.6390188932418823, |
| "step": 88, |
| "val/clipfrac_avg": 0.0028076171875, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9629536867141724, |
| "val/ratio_var": 1.2676916412601713e-05 |
| }, |
| { |
| "episode": 1424, |
| "epoch": 1.78, |
| "eps": 0, |
| "loss/policy_avg": -0.00972694717347622, |
| "loss/value_avg": 0.09233620762825012, |
| "lr": 8.8e-06, |
| "objective/entropy": 185.50747680664062, |
| "objective/kl": 9.55545425415039, |
| "objective/non_score_reward": -0.4777727723121643, |
| "objective/rlhf_reward": -0.6418352723121643, |
| "objective/scores": -0.1640625, |
| "policy/approxkl_avg": 0.0028601475059986115, |
| "policy/clipfrac_avg": 0.0111083984375, |
| "policy/entropy_avg": 1.7339318990707397, |
| "step": 89, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9640640616416931, |
| "val/ratio_var": 1.0825136087078135e-05 |
| }, |
| { |
| "episode": 1440, |
| "epoch": 1.8, |
| "eps": 0, |
| "loss/policy_avg": -0.014528412371873856, |
| "loss/value_avg": 0.06530416011810303, |
| "lr": 8.900000000000001e-06, |
| "objective/entropy": 179.24249267578125, |
| "objective/kl": 11.102625846862793, |
| "objective/non_score_reward": -0.5551312565803528, |
| "objective/rlhf_reward": -0.5868695378303528, |
| "objective/scores": -0.03173828125, |
| "policy/approxkl_avg": 0.0030379812233150005, |
| "policy/clipfrac_avg": 0.0152587890625, |
| "policy/entropy_avg": 1.651671051979065, |
| "step": 90, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9644126296043396, |
| "val/ratio_var": 2.1015594029449858e-05 |
| }, |
| { |
| "episode": 1456, |
| "epoch": 1.82, |
| "eps": 0, |
| "loss/policy_avg": -0.014452735893428326, |
| "loss/value_avg": 0.08553409576416016, |
| "lr": 9e-06, |
| "objective/entropy": 175.57264709472656, |
| "objective/kl": 11.622018814086914, |
| "objective/non_score_reward": -0.5811009407043457, |
| "objective/rlhf_reward": -0.3135228157043457, |
| "objective/scores": 0.267578125, |
| "policy/approxkl_avg": 0.0030906922183930874, |
| "policy/clipfrac_avg": 0.0172119140625, |
| "policy/entropy_avg": 1.63707435131073, |
| "step": 91, |
| "val/clipfrac_avg": 0.00048828125, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9620192646980286, |
| "val/ratio_var": 7.649259714526124e-06 |
| }, |
| { |
| "episode": 1472, |
| "epoch": 1.84, |
| "eps": 0, |
| "loss/policy_avg": -0.013859651982784271, |
| "loss/value_avg": 0.07400637865066528, |
| "lr": 9.100000000000001e-06, |
| "objective/entropy": 194.52926635742188, |
| "objective/kl": 9.147865295410156, |
| "objective/non_score_reward": -0.45739322900772095, |
| "objective/rlhf_reward": -0.27086979150772095, |
| "objective/scores": 0.1865234375, |
| "policy/approxkl_avg": 0.004359320737421513, |
| "policy/clipfrac_avg": 0.019775390625, |
| "policy/entropy_avg": 1.7847541570663452, |
| "step": 92, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9608691930770874, |
| "val/ratio_var": 6.1120294958527666e-06 |
| }, |
| { |
| "episode": 1488, |
| "epoch": 1.86, |
| "eps": 0, |
| "loss/policy_avg": -0.013310099020600319, |
| "loss/value_avg": 0.06840085983276367, |
| "lr": 9.200000000000002e-06, |
| "objective/entropy": 200.57981872558594, |
| "objective/kl": 13.42363452911377, |
| "objective/non_score_reward": -0.6711816787719727, |
| "objective/rlhf_reward": -0.8000879287719727, |
| "objective/scores": -0.12890625, |
| "policy/approxkl_avg": 0.0038994555361568928, |
| "policy/clipfrac_avg": 0.017333984375, |
| "policy/entropy_avg": 1.8697346448898315, |
| "step": 93, |
| "val/clipfrac_avg": 0.000244140625, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.961440920829773, |
| "val/ratio_var": 1.7970425687963143e-05 |
| }, |
| { |
| "episode": 1504, |
| "epoch": 1.88, |
| "eps": 0, |
| "loss/policy_avg": -0.010019056499004364, |
| "loss/value_avg": 0.0840313732624054, |
| "lr": 9.3e-06, |
| "objective/entropy": 157.54547119140625, |
| "objective/kl": 11.56136703491211, |
| "objective/non_score_reward": -0.5780683755874634, |
| "objective/rlhf_reward": -0.9257246255874634, |
| "objective/scores": -0.34765625, |
| "policy/approxkl_avg": 0.0030462387949228287, |
| "policy/clipfrac_avg": 0.0146484375, |
| "policy/entropy_avg": 1.48014235496521, |
| "step": 94, |
| "val/clipfrac_avg": 0.0001220703125, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9657242298126221, |
| "val/ratio_var": 2.893652663260582e-06 |
| }, |
| { |
| "episode": 1520, |
| "epoch": 1.9, |
| "eps": 0, |
| "loss/policy_avg": -0.011758793145418167, |
| "loss/value_avg": 0.05380406230688095, |
| "lr": 9.4e-06, |
| "objective/entropy": 158.7347412109375, |
| "objective/kl": 12.345122337341309, |
| "objective/non_score_reward": -0.6172561049461365, |
| "objective/rlhf_reward": -0.4190139174461365, |
| "objective/scores": 0.1982421875, |
| "policy/approxkl_avg": 0.003418966196477413, |
| "policy/clipfrac_avg": 0.021728515625, |
| "policy/entropy_avg": 1.4787856340408325, |
| "step": 95, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.964798092842102, |
| "val/ratio_var": 7.247604116855655e-06 |
| }, |
| { |
| "episode": 1536, |
| "epoch": 1.92, |
| "eps": 0, |
| "loss/policy_avg": -0.011863401159644127, |
| "loss/value_avg": 0.06334728747606277, |
| "lr": 9.5e-06, |
| "objective/entropy": 159.30589294433594, |
| "objective/kl": 14.144343376159668, |
| "objective/non_score_reward": -0.7072170972824097, |
| "objective/rlhf_reward": -0.6832913160324097, |
| "objective/scores": 0.02392578125, |
| "policy/approxkl_avg": 0.005127988290041685, |
| "policy/clipfrac_avg": 0.01611328125, |
| "policy/entropy_avg": 1.5055749416351318, |
| "step": 96, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9649109840393066, |
| "val/ratio_var": 7.781473868817557e-06 |
| }, |
| { |
| "episode": 1552, |
| "epoch": 1.94, |
| "eps": 0, |
| "loss/policy_avg": -0.017321571707725525, |
| "loss/value_avg": 0.08850554376840591, |
| "lr": 9.600000000000001e-06, |
| "objective/entropy": 159.99664306640625, |
| "objective/kl": 13.361457824707031, |
| "objective/non_score_reward": -0.6680729389190674, |
| "objective/rlhf_reward": -0.9053776264190674, |
| "objective/scores": -0.2373046875, |
| "policy/approxkl_avg": 0.0042739748023450375, |
| "policy/clipfrac_avg": 0.02294921875, |
| "policy/entropy_avg": 1.5079615116119385, |
| "step": 97, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9678010940551758, |
| "val/ratio_var": 1.4525655387842562e-05 |
| }, |
| { |
| "episode": 1568, |
| "epoch": 1.96, |
| "eps": 0, |
| "loss/policy_avg": -0.007993525825440884, |
| "loss/value_avg": 0.0631803646683693, |
| "lr": 9.7e-06, |
| "objective/entropy": 131.73757934570312, |
| "objective/kl": 13.03054428100586, |
| "objective/non_score_reward": -0.6515272259712219, |
| "objective/rlhf_reward": -0.4103162884712219, |
| "objective/scores": 0.2412109375, |
| "policy/approxkl_avg": 0.002947921399027109, |
| "policy/clipfrac_avg": 0.013427734375, |
| "policy/entropy_avg": 1.2414445877075195, |
| "step": 98, |
| "val/clipfrac_avg": 0.0001220703125, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9706454277038574, |
| "val/ratio_var": 3.9927599573275074e-05 |
| }, |
| { |
| "episode": 1584, |
| "epoch": 1.98, |
| "eps": 0, |
| "loss/policy_avg": -0.010491937398910522, |
| "loss/value_avg": 0.05786694213747978, |
| "lr": 9.800000000000001e-06, |
| "objective/entropy": 161.06646728515625, |
| "objective/kl": 12.394622802734375, |
| "objective/non_score_reward": -0.6197311282157898, |
| "objective/rlhf_reward": -0.7295944094657898, |
| "objective/scores": -0.10986328125, |
| "policy/approxkl_avg": 0.003401440568268299, |
| "policy/clipfrac_avg": 0.0164794921875, |
| "policy/entropy_avg": 1.5170197486877441, |
| "step": 99, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9651945233345032, |
| "val/ratio_var": 9.938579751178622e-06 |
| }, |
| { |
| "episode": 1600, |
| "epoch": 2.0, |
| "eps": 0, |
| "loss/policy_avg": -0.011365102604031563, |
| "loss/value_avg": 0.04688062146306038, |
| "lr": 9.9e-06, |
| "objective/entropy": 138.40078735351562, |
| "objective/kl": 13.543630599975586, |
| "objective/non_score_reward": -0.6771814823150635, |
| "objective/rlhf_reward": -0.5014002323150635, |
| "objective/scores": 0.17578125, |
| "policy/approxkl_avg": 0.0028700516559183598, |
| "policy/clipfrac_avg": 0.0155029296875, |
| "policy/entropy_avg": 1.324121117591858, |
| "step": 100, |
| "val/clipfrac_avg": 0.0, |
| "val/num_eos_tokens": 0, |
| "val/ratio": 0.9678511619567871, |
| "val/ratio_var": 7.740309229120612e-06 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 100, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2.0, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": true, |
| "should_log": true, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0, |
| "train_batch_size": null, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|