Text Generation
Safetensors
English
llama
conversational
Fino1-8B / trainer_state.json
lfqian's picture
Upload folder using huggingface_hub
2172a5e verified
raw
history blame
242 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"episode": 20032,
"epoch": 3.7803359124363087,
"eval_steps": 20,
"global_step": 313,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"episode": 64,
"epoch": 0.012077750518965842,
"eps": 0,
"loss/policy_avg": -0.0021184529177844524,
"loss/value_avg": 0.9311372637748718,
"lr": 0.0,
"objective/entropy": -600.715087890625,
"objective/kl": 0.46257561445236206,
"objective/non_score_reward": -0.013877267949283123,
"objective/rlhf_reward": 0.384560227394104,
"objective/scores": 0.3984375,
"policy/approxkl_avg": 0.00044652423821389675,
"policy/clipfrac_avg": 0.005666394717991352,
"policy/entropy_avg": 0.21374297142028809,
"step": 1,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 45,
"val/ratio": 1.000056505203247,
"val/ratio_var": 4.3476825339894276e-07
},
{
"episode": 128,
"epoch": 0.024155501037931685,
"eps": 0,
"loss/policy_avg": -0.0030812141485512257,
"loss/value_avg": 0.8693833351135254,
"lr": 3.125e-08,
"objective/entropy": -595.1883544921875,
"objective/kl": 0.6688432097434998,
"objective/non_score_reward": -0.020065294578671455,
"objective/rlhf_reward": 0.3813018798828125,
"objective/scores": 0.40234375,
"policy/approxkl_avg": 0.0004370739625301212,
"policy/clipfrac_avg": 0.006835754029452801,
"policy/entropy_avg": 0.21932220458984375,
"step": 2,
"val/clipfrac_avg": 0.00023904947738628834,
"val/num_eos_tokens": 49,
"val/ratio": 0.9999904632568359,
"val/ratio_var": 5.597846097771253e-07
},
{
"episode": 192,
"epoch": 0.03623325155689753,
"eps": 0,
"loss/policy_avg": -0.0009602411882951856,
"loss/value_avg": 0.9131457209587097,
"lr": 6.25e-08,
"objective/entropy": -561.6600341796875,
"objective/kl": 0.7238848805427551,
"objective/non_score_reward": -0.021716546267271042,
"objective/rlhf_reward": 0.39674046635627747,
"objective/scores": 0.41796875,
"policy/approxkl_avg": 0.00045788957504555583,
"policy/clipfrac_avg": 0.007044724188745022,
"policy/entropy_avg": 0.2180023193359375,
"step": 3,
"val/clipfrac_avg": 0.00015014366363175213,
"val/num_eos_tokens": 50,
"val/ratio": 1.0000163316726685,
"val/ratio_var": 5.647702892019879e-07
},
{
"episode": 256,
"epoch": 0.04831100207586337,
"eps": 0,
"loss/policy_avg": -0.0013294187374413013,
"loss/value_avg": 0.9107441902160645,
"lr": 9.375e-08,
"objective/entropy": -489.9579162597656,
"objective/kl": 0.710690438747406,
"objective/non_score_reward": -0.021320713683962822,
"objective/rlhf_reward": 0.2943531274795532,
"objective/scores": 0.31640625,
"policy/approxkl_avg": 0.0008129056077450514,
"policy/clipfrac_avg": 0.0068802861496806145,
"policy/entropy_avg": 0.20705923438072205,
"step": 4,
"val/clipfrac_avg": 0.00046966708032414317,
"val/num_eos_tokens": 36,
"val/ratio": 0.999975860118866,
"val/ratio_var": 7.655679041818075e-07
},
{
"episode": 320,
"epoch": 0.06038875259482921,
"eps": 0,
"loss/policy_avg": -0.0032467995770275593,
"loss/value_avg": 0.8995952606201172,
"lr": 1.25e-07,
"objective/entropy": -685.2054443359375,
"objective/kl": 0.3006611764431,
"objective/non_score_reward": -0.00901983492076397,
"objective/rlhf_reward": 0.4690075218677521,
"objective/scores": 0.478515625,
"policy/approxkl_avg": 0.0003806678578257561,
"policy/clipfrac_avg": 0.006184540688991547,
"policy/entropy_avg": 0.2066497802734375,
"step": 5,
"val/clipfrac_avg": 0.0001254370145034045,
"val/num_eos_tokens": 42,
"val/ratio": 1.0000379085540771,
"val/ratio_var": 4.958400268151308e-07
},
{
"episode": 384,
"epoch": 0.07246650311379506,
"eps": 0,
"loss/policy_avg": -0.00029869808349758387,
"loss/value_avg": 0.9305676221847534,
"lr": 1.5624999999999999e-07,
"objective/entropy": -588.39697265625,
"objective/kl": 0.5641751885414124,
"objective/non_score_reward": -0.016925256699323654,
"objective/rlhf_reward": 0.39762550592422485,
"objective/scores": 0.4140625,
"policy/approxkl_avg": 0.00041988492012023926,
"policy/clipfrac_avg": 0.006766438018530607,
"policy/entropy_avg": 0.20317253470420837,
"step": 6,
"val/clipfrac_avg": 0.00019526462710928172,
"val/num_eos_tokens": 45,
"val/ratio": 0.9999103546142578,
"val/ratio_var": 6.99273925874877e-07
},
{
"episode": 448,
"epoch": 0.0845442536327609,
"eps": 0,
"loss/policy_avg": -0.0019068828551098704,
"loss/value_avg": 0.8919577598571777,
"lr": 1.875e-07,
"objective/entropy": -614.7843017578125,
"objective/kl": 0.33637887239456177,
"objective/non_score_reward": -0.010091365315020084,
"objective/rlhf_reward": 0.3663734793663025,
"objective/scores": 0.376953125,
"policy/approxkl_avg": 0.0004250165948178619,
"policy/clipfrac_avg": 0.0070974379777908325,
"policy/entropy_avg": 0.2131398618221283,
"step": 7,
"val/clipfrac_avg": 0.00019152543973177671,
"val/num_eos_tokens": 43,
"val/ratio": 1.000044822692871,
"val/ratio_var": 5.280454047351668e-07
},
{
"episode": 512,
"epoch": 0.09662200415172674,
"eps": 0,
"loss/policy_avg": -0.003216695738956332,
"loss/value_avg": 0.8838874101638794,
"lr": 2.1875e-07,
"objective/entropy": -576.663330078125,
"objective/kl": 0.7862333059310913,
"objective/non_score_reward": -0.023586997762322426,
"objective/rlhf_reward": 0.3934051990509033,
"objective/scores": 0.41796875,
"policy/approxkl_avg": 0.000438332324847579,
"policy/clipfrac_avg": 0.006604321300983429,
"policy/entropy_avg": 0.20785841345787048,
"step": 8,
"val/clipfrac_avg": 0.0003107336815446615,
"val/num_eos_tokens": 36,
"val/ratio": 0.9998782873153687,
"val/ratio_var": 6.208914555827505e-07
},
{
"episode": 576,
"epoch": 0.10869975467069258,
"eps": 0,
"loss/policy_avg": -0.002265141811221838,
"loss/value_avg": 0.869255542755127,
"lr": 2.5e-07,
"objective/entropy": -627.076171875,
"objective/kl": 0.32534003257751465,
"objective/non_score_reward": -0.009760200046002865,
"objective/rlhf_reward": 0.39697808027267456,
"objective/scores": 0.40625,
"policy/approxkl_avg": 0.00039745302638038993,
"policy/clipfrac_avg": 0.006331109441816807,
"policy/entropy_avg": 0.19233450293540955,
"step": 9,
"val/clipfrac_avg": 0.00015899499703664333,
"val/num_eos_tokens": 41,
"val/ratio": 1.0001146793365479,
"val/ratio_var": 7.010530111983826e-07
},
{
"episode": 640,
"epoch": 0.12077750518965842,
"eps": 0,
"loss/policy_avg": -0.0026860858779400587,
"loss/value_avg": 0.8342069387435913,
"lr": 2.8125e-07,
"objective/entropy": -670.0674438476562,
"objective/kl": 0.7622801661491394,
"objective/non_score_reward": -0.022868404164910316,
"objective/rlhf_reward": 0.38533473014831543,
"objective/scores": 0.408203125,
"policy/approxkl_avg": 0.0003712985198944807,
"policy/clipfrac_avg": 0.006433566566556692,
"policy/entropy_avg": 0.20289739966392517,
"step": 10,
"val/clipfrac_avg": 0.027581503614783287,
"val/num_eos_tokens": 45,
"val/ratio": 0.999953031539917,
"val/ratio_var": 6.136452270766313e-07
},
{
"episode": 704,
"epoch": 0.13285525570862428,
"eps": 0,
"loss/policy_avg": -0.008414413779973984,
"loss/value_avg": 0.7493016123771667,
"lr": 3.1249999999999997e-07,
"objective/entropy": -643.5463256835938,
"objective/kl": 0.8814950585365295,
"objective/non_score_reward": -0.026444854214787483,
"objective/rlhf_reward": 0.42033249139785767,
"objective/scores": 0.447265625,
"policy/approxkl_avg": 0.0004149469896219671,
"policy/clipfrac_avg": 0.0073681240901350975,
"policy/entropy_avg": 0.21221670508384705,
"step": 11,
"val/clipfrac_avg": 0.0004212568746879697,
"val/num_eos_tokens": 36,
"val/ratio": 0.9999436736106873,
"val/ratio_var": 6.905478926455544e-07
},
{
"episode": 768,
"epoch": 0.14493300622759012,
"eps": 0,
"loss/policy_avg": -0.008181717246770859,
"loss/value_avg": 0.7314225435256958,
"lr": 3.4375e-07,
"objective/entropy": -599.110595703125,
"objective/kl": 0.7627489566802979,
"objective/non_score_reward": -0.02288246899843216,
"objective/rlhf_reward": 0.44879722595214844,
"objective/scores": 0.47265625,
"policy/approxkl_avg": 0.00046156253665685654,
"policy/clipfrac_avg": 0.0068002426996827126,
"policy/entropy_avg": 0.2235361784696579,
"step": 12,
"val/clipfrac_avg": 0.00017142172146122903,
"val/num_eos_tokens": 41,
"val/ratio": 0.9999423027038574,
"val/ratio_var": 8.461731226816482e-07
},
{
"episode": 832,
"epoch": 0.15701075674655596,
"eps": 0,
"loss/policy_avg": -0.006054941564798355,
"loss/value_avg": 0.7129493951797485,
"lr": 3.75e-07,
"objective/entropy": -625.335693359375,
"objective/kl": 0.9311845898628235,
"objective/non_score_reward": -0.027935536578297615,
"objective/rlhf_reward": 0.3563418388366699,
"objective/scores": 0.384765625,
"policy/approxkl_avg": 0.0003990530385635793,
"policy/clipfrac_avg": 0.006634948309510946,
"policy/entropy_avg": 0.20673498511314392,
"step": 13,
"val/clipfrac_avg": 0.00031250983010977507,
"val/num_eos_tokens": 46,
"val/ratio": 0.9999412298202515,
"val/ratio_var": 6.676891075585445e-07
},
{
"episode": 896,
"epoch": 0.1690885072655218,
"eps": 0,
"loss/policy_avg": -0.007185523398220539,
"loss/value_avg": 0.663692831993103,
"lr": 4.0625e-07,
"objective/entropy": -611.6224365234375,
"objective/kl": 0.598267674446106,
"objective/non_score_reward": -0.017948029562830925,
"objective/rlhf_reward": 0.3819543123245239,
"objective/scores": 0.400390625,
"policy/approxkl_avg": 0.0004264025192242116,
"policy/clipfrac_avg": 0.0068409196101129055,
"policy/entropy_avg": 0.20851516723632812,
"step": 14,
"val/clipfrac_avg": 0.17968440055847168,
"val/num_eos_tokens": 45,
"val/ratio": 0.9998716115951538,
"val/ratio_var": 8.126443162836949e-07
},
{
"episode": 960,
"epoch": 0.18116625778448764,
"eps": 0,
"loss/policy_avg": -0.01342801284044981,
"loss/value_avg": 0.5221339464187622,
"lr": 4.375e-07,
"objective/entropy": -577.73486328125,
"objective/kl": 1.043047547340393,
"objective/non_score_reward": -0.031291425228118896,
"objective/rlhf_reward": 0.3266187310218811,
"objective/scores": 0.357421875,
"policy/approxkl_avg": 0.00048208641237579286,
"policy/clipfrac_avg": 0.00789344497025013,
"policy/entropy_avg": 0.21881103515625,
"step": 15,
"val/clipfrac_avg": 0.015999414026737213,
"val/num_eos_tokens": 44,
"val/ratio": 1.0000479221343994,
"val/ratio_var": 7.270523383340333e-07
},
{
"episode": 1024,
"epoch": 0.19324400830345348,
"eps": 0,
"loss/policy_avg": -0.014559760689735413,
"loss/value_avg": 0.4795520305633545,
"lr": 4.6874999999999996e-07,
"objective/entropy": -704.927978515625,
"objective/kl": 1.3000061511993408,
"objective/non_score_reward": -0.03900018334388733,
"objective/rlhf_reward": 0.39117559790611267,
"objective/scores": 0.4296875,
"policy/approxkl_avg": 0.00035720731830224395,
"policy/clipfrac_avg": 0.006962340325117111,
"policy/entropy_avg": 0.19104096293449402,
"step": 16,
"val/clipfrac_avg": 0.0004542362876236439,
"val/num_eos_tokens": 46,
"val/ratio": 0.9999203681945801,
"val/ratio_var": 6.327572918962687e-07
},
{
"episode": 1088,
"epoch": 0.20532175882241932,
"eps": 0,
"loss/policy_avg": -0.017483970150351524,
"loss/value_avg": 0.4525485634803772,
"lr": 5e-07,
"objective/entropy": -632.856689453125,
"objective/kl": 1.6142749786376953,
"objective/non_score_reward": -0.0484282523393631,
"objective/rlhf_reward": 0.3153412640094757,
"objective/scores": 0.36328125,
"policy/approxkl_avg": 0.0004206376615911722,
"policy/clipfrac_avg": 0.007758093532174826,
"policy/entropy_avg": 0.20397186279296875,
"step": 17,
"val/clipfrac_avg": 0.001352960942313075,
"val/num_eos_tokens": 55,
"val/ratio": 1.0000765323638916,
"val/ratio_var": 6.663805720563687e-07
},
{
"episode": 1152,
"epoch": 0.21739950934138516,
"eps": 0,
"loss/policy_avg": -0.01164332777261734,
"loss/value_avg": 0.414880633354187,
"lr": 4.983164983164983e-07,
"objective/entropy": -660.91943359375,
"objective/kl": 2.686311721801758,
"objective/non_score_reward": -0.08058934658765793,
"objective/rlhf_reward": 0.3207778334617615,
"objective/scores": 0.40234375,
"policy/approxkl_avg": 0.0004323392640799284,
"policy/clipfrac_avg": 0.008179331198334694,
"policy/entropy_avg": 0.19593684375286102,
"step": 18,
"val/clipfrac_avg": 0.0011506883893162012,
"val/num_eos_tokens": 51,
"val/ratio": 0.9998383522033691,
"val/ratio_var": 7.198556772891607e-07
},
{
"episode": 1216,
"epoch": 0.229477259860351,
"eps": 0,
"loss/policy_avg": -0.011872556060552597,
"loss/value_avg": 0.38459596037864685,
"lr": 4.966329966329966e-07,
"objective/entropy": -632.638916015625,
"objective/kl": 3.21876859664917,
"objective/non_score_reward": -0.09656305611133575,
"objective/rlhf_reward": 0.23790958523750305,
"objective/scores": 0.333984375,
"policy/approxkl_avg": 0.0004764531913679093,
"policy/clipfrac_avg": 0.008287805132567883,
"policy/entropy_avg": 0.2162272185087204,
"step": 19,
"val/clipfrac_avg": 0.00905265286564827,
"val/num_eos_tokens": 37,
"val/ratio": 1.0000991821289062,
"val/ratio_var": 7.565479904769745e-07
},
{
"episode": 1280,
"epoch": 0.24155501037931684,
"eps": 0,
"loss/policy_avg": -0.014671847224235535,
"loss/value_avg": 0.3181418478488922,
"lr": 4.949494949494949e-07,
"objective/entropy": -706.05078125,
"objective/kl": 3.669142007827759,
"objective/non_score_reward": -0.11007425934076309,
"objective/rlhf_reward": 0.3664882481098175,
"objective/scores": 0.4765625,
"policy/approxkl_avg": 0.0003931926330551505,
"policy/clipfrac_avg": 0.008725658059120178,
"policy/entropy_avg": 0.17680613696575165,
"step": 20,
"val/clipfrac_avg": 0.03898124024271965,
"val/num_eos_tokens": 54,
"val/ratio": 1.0001307725906372,
"val/ratio_var": 6.557406777574215e-07
},
{
"episode": 1344,
"epoch": 0.2536327608982827,
"eps": 0,
"loss/policy_avg": -0.014831740409135818,
"loss/value_avg": 0.2612203061580658,
"lr": 4.932659932659932e-07,
"objective/entropy": -704.768310546875,
"objective/kl": 3.722026824951172,
"objective/non_score_reward": -0.11166080832481384,
"objective/rlhf_reward": 0.33413997292518616,
"objective/scores": 0.4453125,
"policy/approxkl_avg": 0.00046230730367824435,
"policy/clipfrac_avg": 0.008072879165410995,
"policy/entropy_avg": 0.18310165405273438,
"step": 21,
"val/clipfrac_avg": 0.006685478147119284,
"val/num_eos_tokens": 47,
"val/ratio": 0.9999584555625916,
"val/ratio_var": 6.261999487833236e-07
},
{
"episode": 1408,
"epoch": 0.26571051141724855,
"eps": 0,
"loss/policy_avg": -0.015546409413218498,
"loss/value_avg": 0.22234514355659485,
"lr": 4.915824915824915e-07,
"objective/entropy": -686.92041015625,
"objective/kl": 5.413008689880371,
"objective/non_score_reward": -0.16239026188850403,
"objective/rlhf_reward": 0.30050036311149597,
"objective/scores": 0.462890625,
"policy/approxkl_avg": 0.00041679860441945493,
"policy/clipfrac_avg": 0.00833301804959774,
"policy/entropy_avg": 0.1929423063993454,
"step": 22,
"val/clipfrac_avg": 0.009531511925160885,
"val/num_eos_tokens": 48,
"val/ratio": 0.999933660030365,
"val/ratio_var": 7.41119151825842e-07
},
{
"episode": 1472,
"epoch": 0.27778826193621436,
"eps": 0,
"loss/policy_avg": -0.013549113646149635,
"loss/value_avg": 0.20537236332893372,
"lr": 4.898989898989898e-07,
"objective/entropy": -710.58544921875,
"objective/kl": 5.630204200744629,
"objective/non_score_reward": -0.16890612244606018,
"objective/rlhf_reward": 0.2529688775539398,
"objective/scores": 0.421875,
"policy/approxkl_avg": 0.00040828564669936895,
"policy/clipfrac_avg": 0.009321928024291992,
"policy/entropy_avg": 0.18656031787395477,
"step": 23,
"val/clipfrac_avg": 0.0010966494446620345,
"val/num_eos_tokens": 45,
"val/ratio": 1.0000791549682617,
"val/ratio_var": 8.043146522140887e-07
},
{
"episode": 1536,
"epoch": 0.28986601245518023,
"eps": 0,
"loss/policy_avg": -0.014127358794212341,
"loss/value_avg": 0.184538334608078,
"lr": 4.882154882154882e-07,
"objective/entropy": -721.2072143554688,
"objective/kl": 7.244493007659912,
"objective/non_score_reward": -0.2173347771167755,
"objective/rlhf_reward": 0.2309074103832245,
"objective/scores": 0.44921875,
"policy/approxkl_avg": 0.00042275150190107524,
"policy/clipfrac_avg": 0.009146442636847496,
"policy/entropy_avg": 0.19444656372070312,
"step": 24,
"val/clipfrac_avg": 0.00022602800163440406,
"val/num_eos_tokens": 48,
"val/ratio": 0.9999423027038574,
"val/ratio_var": 6.538029424518754e-07
},
{
"episode": 1600,
"epoch": 0.30194376297414605,
"eps": 0,
"loss/policy_avg": -0.013546439819037914,
"loss/value_avg": 0.15871131420135498,
"lr": 4.865319865319866e-07,
"objective/entropy": -625.06640625,
"objective/kl": 6.906096935272217,
"objective/non_score_reward": -0.20718291401863098,
"objective/rlhf_reward": 0.14267060160636902,
"objective/scores": 0.349609375,
"policy/approxkl_avg": 0.0004575018538162112,
"policy/clipfrac_avg": 0.009727372787892818,
"policy/entropy_avg": 0.2004598081111908,
"step": 25,
"val/clipfrac_avg": 9.441343718208373e-05,
"val/num_eos_tokens": 33,
"val/ratio": 1.0002121925354004,
"val/ratio_var": 6.285458766797092e-07
},
{
"episode": 1664,
"epoch": 0.3140215134931119,
"eps": 0,
"loss/policy_avg": -0.011472932994365692,
"loss/value_avg": 0.15153326094150543,
"lr": 4.848484848484849e-07,
"objective/entropy": -657.1741333007812,
"objective/kl": 8.73617172241211,
"objective/non_score_reward": -0.26208510994911194,
"objective/rlhf_reward": 0.11779768764972687,
"objective/scores": 0.37890625,
"policy/approxkl_avg": 0.00045209572999738157,
"policy/clipfrac_avg": 0.0087648406624794,
"policy/entropy_avg": 0.20401255786418915,
"step": 26,
"val/clipfrac_avg": 0.00012975589197594672,
"val/num_eos_tokens": 49,
"val/ratio": 1.0000041723251343,
"val/ratio_var": 9.090064736483328e-07
},
{
"episode": 1728,
"epoch": 0.3260992640120777,
"eps": 0,
"loss/policy_avg": -0.018957365304231644,
"loss/value_avg": 0.1270497739315033,
"lr": 4.831649831649832e-07,
"objective/entropy": -725.923828125,
"objective/kl": 9.15277099609375,
"objective/non_score_reward": -0.2745831608772278,
"objective/rlhf_reward": 0.1878191977739334,
"objective/scores": 0.462890625,
"policy/approxkl_avg": 0.00039935283712111413,
"policy/clipfrac_avg": 0.009701108559966087,
"policy/entropy_avg": 0.18258032202720642,
"step": 27,
"val/clipfrac_avg": 0.0003692947211675346,
"val/num_eos_tokens": 50,
"val/ratio": 0.9999105334281921,
"val/ratio_var": 6.027379413353628e-07
},
{
"episode": 1792,
"epoch": 0.3381770145310436,
"eps": 0,
"loss/policy_avg": -0.015594224445521832,
"loss/value_avg": 0.11441653966903687,
"lr": 4.814814814814814e-07,
"objective/entropy": -735.0335693359375,
"objective/kl": 9.175653457641602,
"objective/non_score_reward": -0.27526962757110596,
"objective/rlhf_reward": 0.13391008973121643,
"objective/scores": 0.41015625,
"policy/approxkl_avg": 0.0004158214433118701,
"policy/clipfrac_avg": 0.008673434145748615,
"policy/entropy_avg": 0.17068862915039062,
"step": 28,
"val/clipfrac_avg": 0.0001518530771136284,
"val/num_eos_tokens": 52,
"val/ratio": 0.9999884366989136,
"val/ratio_var": 8.806293294583156e-07
},
{
"episode": 1856,
"epoch": 0.3502547650500094,
"eps": 0,
"loss/policy_avg": -0.011532934382557869,
"loss/value_avg": 0.09116180986166,
"lr": 4.797979797979798e-07,
"objective/entropy": -773.42919921875,
"objective/kl": 10.120838165283203,
"objective/non_score_reward": -0.3036251366138458,
"objective/rlhf_reward": 0.17586705088615417,
"objective/scores": 0.48046875,
"policy/approxkl_avg": 0.00036347960121929646,
"policy/clipfrac_avg": 0.009156275540590286,
"policy/entropy_avg": 0.16425704956054688,
"step": 29,
"val/clipfrac_avg": 3.780891711357981e-05,
"val/num_eos_tokens": 50,
"val/ratio": 1.0000102519989014,
"val/ratio_var": 5.752245328949357e-07
},
{
"episode": 1920,
"epoch": 0.3623325155689753,
"eps": 0,
"loss/policy_avg": -0.01278415322303772,
"loss/value_avg": 0.08662945032119751,
"lr": 4.781144781144781e-07,
"objective/entropy": -718.801025390625,
"objective/kl": 10.949674606323242,
"objective/non_score_reward": -0.3284902274608612,
"objective/rlhf_reward": 0.0650644600391388,
"objective/scores": 0.39453125,
"policy/approxkl_avg": 0.0004170535539742559,
"policy/clipfrac_avg": 0.009168609045445919,
"policy/entropy_avg": 0.17079035937786102,
"step": 30,
"val/clipfrac_avg": 0.00021001597633585334,
"val/num_eos_tokens": 53,
"val/ratio": 0.9999901652336121,
"val/ratio_var": 7.901667800069845e-07
},
{
"episode": 1984,
"epoch": 0.37441026608794115,
"eps": 0,
"loss/policy_avg": -0.013262813910841942,
"loss/value_avg": 0.07429289817810059,
"lr": 4.7643097643097643e-07,
"objective/entropy": -708.4739379882812,
"objective/kl": 11.724746704101562,
"objective/non_score_reward": -0.35174238681793213,
"objective/rlhf_reward": -0.014340057969093323,
"objective/scores": 0.337890625,
"policy/approxkl_avg": 0.0004177941009402275,
"policy/clipfrac_avg": 0.009081902913749218,
"policy/entropy_avg": 0.18209967017173767,
"step": 31,
"val/clipfrac_avg": 0.0003269795561209321,
"val/num_eos_tokens": 49,
"val/ratio": 0.999956488609314,
"val/ratio_var": 6.545072324115608e-07
},
{
"episode": 2048,
"epoch": 0.38648801660690696,
"eps": 0,
"loss/policy_avg": -0.018486851826310158,
"loss/value_avg": 0.06496821343898773,
"lr": 4.7474747474747474e-07,
"objective/entropy": -730.5189819335938,
"objective/kl": 13.40658187866211,
"objective/non_score_reward": -0.4021974205970764,
"objective/rlhf_reward": -0.03842787444591522,
"objective/scores": 0.36328125,
"policy/approxkl_avg": 0.00040673528565093875,
"policy/clipfrac_avg": 0.008559602312743664,
"policy/entropy_avg": 0.16196060180664062,
"step": 32,
"val/clipfrac_avg": 8.302954665850848e-05,
"val/num_eos_tokens": 51,
"val/ratio": 0.9999591708183289,
"val/ratio_var": 6.225269544302137e-07
},
{
"episode": 2112,
"epoch": 0.3985657671258728,
"eps": 0,
"loss/policy_avg": -0.022785823792219162,
"loss/value_avg": 0.054975174367427826,
"lr": 4.7306397306397305e-07,
"objective/entropy": -750.11865234375,
"objective/kl": 12.143804550170898,
"objective/non_score_reward": -0.36431413888931274,
"objective/rlhf_reward": 0.09711165726184845,
"objective/scores": 0.4609375,
"policy/approxkl_avg": 0.00039204792119562626,
"policy/clipfrac_avg": 0.009267458692193031,
"policy/entropy_avg": 0.15810012817382812,
"step": 33,
"val/clipfrac_avg": 6.448548811022192e-05,
"val/num_eos_tokens": 51,
"val/ratio": 1.0001405477523804,
"val/ratio_var": 6.74541126954864e-07
},
{
"episode": 2176,
"epoch": 0.41064351764483864,
"eps": 0,
"loss/policy_avg": -0.021774116903543472,
"loss/value_avg": 0.048116378486156464,
"lr": 4.7138047138047136e-07,
"objective/entropy": -726.4152221679688,
"objective/kl": 12.474294662475586,
"objective/non_score_reward": -0.3742288053035736,
"objective/rlhf_reward": 0.02713838219642639,
"objective/scores": 0.40234375,
"policy/approxkl_avg": 0.0006013754173181951,
"policy/clipfrac_avg": 0.009044626727700233,
"policy/entropy_avg": 0.17004776000976562,
"step": 34,
"val/clipfrac_avg": 1.9868224626407027e-05,
"val/num_eos_tokens": 57,
"val/ratio": 1.0000003576278687,
"val/ratio_var": 5.519239607565396e-07
},
{
"episode": 2240,
"epoch": 0.4227212681638045,
"eps": 0,
"loss/policy_avg": -0.011024661362171173,
"loss/value_avg": 0.04245440661907196,
"lr": 4.696969696969697e-07,
"objective/entropy": -765.597900390625,
"objective/kl": 12.814224243164062,
"objective/non_score_reward": -0.38442671298980713,
"objective/rlhf_reward": -0.020657174289226532,
"objective/scores": 0.36328125,
"policy/approxkl_avg": 0.00036188805825076997,
"policy/clipfrac_avg": 0.009150207042694092,
"policy/entropy_avg": 0.14880117774009705,
"step": 35,
"val/clipfrac_avg": 6.972333721932955e-06,
"val/num_eos_tokens": 54,
"val/ratio": 0.9999343752861023,
"val/ratio_var": 4.806460651707312e-07
},
{
"episode": 2304,
"epoch": 0.4347990186827703,
"eps": 0,
"loss/policy_avg": -0.018790725618600845,
"loss/value_avg": 0.038986437022686005,
"lr": 4.68013468013468e-07,
"objective/entropy": -748.5819091796875,
"objective/kl": 13.179786682128906,
"objective/non_score_reward": -0.39539361000061035,
"objective/rlhf_reward": 0.02013372629880905,
"objective/scores": 0.416015625,
"policy/approxkl_avg": 0.00038032219163142145,
"policy/clipfrac_avg": 0.009533729404211044,
"policy/entropy_avg": 0.15250270068645477,
"step": 36,
"val/clipfrac_avg": 2.8229449526406825e-05,
"val/num_eos_tokens": 55,
"val/ratio": 1.0000063180923462,
"val/ratio_var": 7.776847610330151e-07
},
{
"episode": 2368,
"epoch": 0.4468767692017362,
"eps": 0,
"loss/policy_avg": -0.018214058130979538,
"loss/value_avg": 0.034576669335365295,
"lr": 4.663299663299663e-07,
"objective/entropy": -702.28369140625,
"objective/kl": 15.299257278442383,
"objective/non_score_reward": -0.45897769927978516,
"objective/rlhf_reward": -0.15282535552978516,
"objective/scores": 0.306640625,
"policy/approxkl_avg": 0.00047967006685212255,
"policy/clipfrac_avg": 0.009605104103684425,
"policy/entropy_avg": 0.16119003295898438,
"step": 37,
"val/clipfrac_avg": 1.4692055628984235e-05,
"val/num_eos_tokens": 44,
"val/ratio": 1.0000842809677124,
"val/ratio_var": 6.511489800686832e-07
},
{
"episode": 2432,
"epoch": 0.458954519720702,
"eps": 0,
"loss/policy_avg": -0.022558456286787987,
"loss/value_avg": 0.032565370202064514,
"lr": 4.646464646464646e-07,
"objective/entropy": -743.125732421875,
"objective/kl": 15.134292602539062,
"objective/non_score_reward": -0.45402878522872925,
"objective/rlhf_reward": -0.12102095782756805,
"objective/scores": 0.33203125,
"policy/approxkl_avg": 0.0005570814246311784,
"policy/clipfrac_avg": 0.009378625079989433,
"policy/entropy_avg": 0.15362422168254852,
"step": 38,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 52,
"val/ratio": 1.0000710487365723,
"val/ratio_var": 7.270390369740198e-07
},
{
"episode": 2496,
"epoch": 0.47103227023966787,
"eps": 0,
"loss/policy_avg": -0.029226083308458328,
"loss/value_avg": 0.029515882954001427,
"lr": 4.6296296296296297e-07,
"objective/entropy": -698.0175170898438,
"objective/kl": 15.121957778930664,
"objective/non_score_reward": -0.4536587595939636,
"objective/rlhf_reward": -0.06596343964338303,
"objective/scores": 0.38671875,
"policy/approxkl_avg": 0.00044198459363542497,
"policy/clipfrac_avg": 0.008979331701993942,
"policy/entropy_avg": 0.172088623046875,
"step": 39,
"val/clipfrac_avg": 1.966176751011517e-05,
"val/num_eos_tokens": 48,
"val/ratio": 1.0001137256622314,
"val/ratio_var": 9.834893717197701e-07
},
{
"episode": 2560,
"epoch": 0.4831100207586337,
"eps": 0,
"loss/policy_avg": -0.025167806074023247,
"loss/value_avg": 0.027836887165904045,
"lr": 4.612794612794613e-07,
"objective/entropy": -725.21875,
"objective/kl": 14.953241348266602,
"objective/non_score_reward": -0.4485971927642822,
"objective/rlhf_reward": -0.04869486391544342,
"objective/scores": 0.400390625,
"policy/approxkl_avg": 0.0005358229391276836,
"policy/clipfrac_avg": 0.008523606695234776,
"policy/entropy_avg": 0.15728633105754852,
"step": 40,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 1.00004243850708,
"val/ratio_var": 5.402997089731798e-07
},
{
"episode": 2624,
"epoch": 0.49518777127759955,
"eps": 0,
"loss/policy_avg": -0.030176600441336632,
"loss/value_avg": 0.026436101645231247,
"lr": 4.595959595959596e-07,
"objective/entropy": -781.7913818359375,
"objective/kl": 14.04931926727295,
"objective/non_score_reward": -0.42147958278656006,
"objective/rlhf_reward": 0.09707509726285934,
"objective/scores": 0.51953125,
"policy/approxkl_avg": 0.0003690444864332676,
"policy/clipfrac_avg": 0.008865940384566784,
"policy/entropy_avg": 0.14566168189048767,
"step": 41,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 62,
"val/ratio": 1.0000038146972656,
"val/ratio_var": 8.146014920384914e-07
},
{
"episode": 2688,
"epoch": 0.5072655217965654,
"eps": 0,
"loss/policy_avg": -0.02603175863623619,
"loss/value_avg": 0.026311784982681274,
"lr": 4.579124579124579e-07,
"objective/entropy": -753.4428100585938,
"objective/kl": 15.049712181091309,
"objective/non_score_reward": -0.4514913558959961,
"objective/rlhf_reward": -0.0354757234454155,
"objective/scores": 0.416015625,
"policy/approxkl_avg": 0.0003813736548181623,
"policy/clipfrac_avg": 0.009573683142662048,
"policy/entropy_avg": 0.17486445605754852,
"step": 42,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 57,
"val/ratio": 1.0002083778381348,
"val/ratio_var": 6.971042694203788e-07
},
{
"episode": 2752,
"epoch": 0.5193432723155312,
"eps": 0,
"loss/policy_avg": -0.027423618361353874,
"loss/value_avg": 0.024181833490729332,
"lr": 4.562289562289562e-07,
"objective/entropy": -720.7274780273438,
"objective/kl": 17.52047348022461,
"objective/non_score_reward": -0.5256141424179077,
"objective/rlhf_reward": -0.1222938597202301,
"objective/scores": 0.40234375,
"policy/approxkl_avg": 0.00042545428732410073,
"policy/clipfrac_avg": 0.00958210788667202,
"policy/entropy_avg": 0.1733601987361908,
"step": 43,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 54,
"val/ratio": 0.9999567866325378,
"val/ratio_var": 7.062473628138832e-07
},
{
"episode": 2816,
"epoch": 0.5314210228344971,
"eps": 0,
"loss/policy_avg": -0.033347710967063904,
"loss/value_avg": 0.023924967274069786,
"lr": 4.545454545454545e-07,
"objective/entropy": -753.5906982421875,
"objective/kl": 16.592561721801758,
"objective/non_score_reward": -0.4977768063545227,
"objective/rlhf_reward": -0.04953461140394211,
"objective/scores": 0.44921875,
"policy/approxkl_avg": 0.000394497939851135,
"policy/clipfrac_avg": 0.009626075625419617,
"policy/entropy_avg": 0.14611563086509705,
"step": 44,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 50,
"val/ratio": 1.0000271797180176,
"val/ratio_var": 8.373976356779167e-07
},
{
"episode": 2880,
"epoch": 0.543498773353463,
"eps": 0,
"loss/policy_avg": -0.03707805275917053,
"loss/value_avg": 0.02223985455930233,
"lr": 4.5286195286195283e-07,
"objective/entropy": -725.1084594726562,
"objective/kl": 15.50640869140625,
"objective/non_score_reward": -0.4651922583580017,
"objective/rlhf_reward": -0.021344579756259918,
"objective/scores": 0.443359375,
"policy/approxkl_avg": 0.0004309536307118833,
"policy/clipfrac_avg": 0.009066203609108925,
"policy/entropy_avg": 0.16333135962486267,
"step": 45,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 48,
"val/ratio": 1.00018310546875,
"val/ratio_var": 1.699194172033458e-06
},
{
"episode": 2944,
"epoch": 0.5555765238724287,
"eps": 0,
"loss/policy_avg": -0.03546188026666641,
"loss/value_avg": 0.021326132118701935,
"lr": 4.5117845117845114e-07,
"objective/entropy": -742.35107421875,
"objective/kl": 15.387621879577637,
"objective/non_score_reward": -0.46162867546081543,
"objective/rlhf_reward": -0.03047630935907364,
"objective/scores": 0.431640625,
"policy/approxkl_avg": 0.0004147663130424917,
"policy/clipfrac_avg": 0.009872214868664742,
"policy/entropy_avg": 0.15110652148723602,
"step": 46,
"val/clipfrac_avg": 9.790101103135385e-06,
"val/num_eos_tokens": 43,
"val/ratio": 1.000044822692871,
"val/ratio_var": 8.873777233020519e-07
},
{
"episode": 3008,
"epoch": 0.5676542743913946,
"eps": 0,
"loss/policy_avg": -0.022155379876494408,
"loss/value_avg": 0.021301649510860443,
"lr": 4.494949494949495e-07,
"objective/entropy": -754.4105834960938,
"objective/kl": 14.502567291259766,
"objective/non_score_reward": -0.4350770115852356,
"objective/rlhf_reward": -0.0415223091840744,
"objective/scores": 0.39453125,
"policy/approxkl_avg": 0.0004166339640505612,
"policy/clipfrac_avg": 0.008984292857348919,
"policy/entropy_avg": 0.14285914599895477,
"step": 47,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 51,
"val/ratio": 1.0002052783966064,
"val/ratio_var": 1.7462091363995569e-06
},
{
"episode": 3072,
"epoch": 0.5797320249103605,
"eps": 0,
"loss/policy_avg": -0.027517154812812805,
"loss/value_avg": 0.020472221076488495,
"lr": 4.478114478114478e-07,
"objective/entropy": -718.2733154296875,
"objective/kl": 16.20379638671875,
"objective/non_score_reward": -0.48611387610435486,
"objective/rlhf_reward": -0.15676817297935486,
"objective/scores": 0.330078125,
"policy/approxkl_avg": 0.00043310271576046944,
"policy/clipfrac_avg": 0.009795701131224632,
"policy/entropy_avg": 0.16522979736328125,
"step": 48,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 53,
"val/ratio": 1.000115156173706,
"val/ratio_var": 8.753415272622078e-07
},
{
"episode": 3136,
"epoch": 0.5918097754293263,
"eps": 0,
"loss/policy_avg": -0.037540458142757416,
"loss/value_avg": 0.01891172304749489,
"lr": 4.461279461279461e-07,
"objective/entropy": -738.3416748046875,
"objective/kl": 15.657567977905273,
"objective/non_score_reward": -0.4697270393371582,
"objective/rlhf_reward": 0.01708938181400299,
"objective/scores": 0.486328125,
"policy/approxkl_avg": 0.00037665231502614915,
"policy/clipfrac_avg": 0.008994007483124733,
"policy/entropy_avg": 0.14120499789714813,
"step": 49,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 56,
"val/ratio": 1.000009536743164,
"val/ratio_var": 6.900321523062303e-07
},
{
"episode": 3200,
"epoch": 0.6038875259482921,
"eps": 0,
"loss/policy_avg": -0.03546880930662155,
"loss/value_avg": 0.01827932894229889,
"lr": 4.444444444444444e-07,
"objective/entropy": -700.5809936523438,
"objective/kl": 16.617774963378906,
"objective/non_score_reward": -0.4985332190990448,
"objective/rlhf_reward": -0.1123027503490448,
"objective/scores": 0.38671875,
"policy/approxkl_avg": 0.00040262818220071495,
"policy/clipfrac_avg": 0.009067821316421032,
"policy/entropy_avg": 0.13315296173095703,
"step": 50,
"val/clipfrac_avg": 6.10590086580487e-06,
"val/num_eos_tokens": 53,
"val/ratio": 1.0000566244125366,
"val/ratio_var": 8.310821044688055e-07
},
{
"episode": 3264,
"epoch": 0.615965276467258,
"eps": 0,
"loss/policy_avg": -0.022362984716892242,
"loss/value_avg": 0.01790526881814003,
"lr": 4.4276094276094275e-07,
"objective/entropy": -797.2921752929688,
"objective/kl": 14.45788860321045,
"objective/non_score_reward": -0.4337366223335266,
"objective/rlhf_reward": 0.025736041367053986,
"objective/scores": 0.458984375,
"policy/approxkl_avg": 0.00034859025618061423,
"policy/clipfrac_avg": 0.008412575349211693,
"policy/entropy_avg": 0.12615332007408142,
"step": 51,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 58,
"val/ratio": 1.000011682510376,
"val/ratio_var": 5.029750127505395e-07
},
{
"episode": 3328,
"epoch": 0.6280430269862238,
"eps": 0,
"loss/policy_avg": -0.03401505947113037,
"loss/value_avg": 0.018212419003248215,
"lr": 4.4107744107744106e-07,
"objective/entropy": -690.7019653320312,
"objective/kl": 15.584673881530762,
"objective/non_score_reward": -0.4675402045249939,
"objective/rlhf_reward": -0.03858514130115509,
"objective/scores": 0.4296875,
"policy/approxkl_avg": 0.0003729221352841705,
"policy/clipfrac_avg": 0.00865244958549738,
"policy/entropy_avg": 0.13912074267864227,
"step": 52,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 51,
"val/ratio": 1.000160574913025,
"val/ratio_var": 6.19857644323929e-07
},
{
"episode": 3392,
"epoch": 0.6401207775051897,
"eps": 0,
"loss/policy_avg": -0.012439190410077572,
"loss/value_avg": 0.016416631639003754,
"lr": 4.3939393939393937e-07,
"objective/entropy": -722.3007202148438,
"objective/kl": 15.750506401062012,
"objective/non_score_reward": -0.47251516580581665,
"objective/rlhf_reward": -0.16929252445697784,
"objective/scores": 0.302734375,
"policy/approxkl_avg": 0.00040123704820871353,
"policy/clipfrac_avg": 0.009440924972295761,
"policy/entropy_avg": 0.14617919921875,
"step": 53,
"val/clipfrac_avg": 6.367155947373249e-06,
"val/num_eos_tokens": 51,
"val/ratio": 0.9999165534973145,
"val/ratio_var": 5.078387061985268e-07
},
{
"episode": 3456,
"epoch": 0.6521985280241555,
"eps": 0,
"loss/policy_avg": -0.026336457580327988,
"loss/value_avg": 0.01662503555417061,
"lr": 4.377104377104377e-07,
"objective/entropy": -792.0408935546875,
"objective/kl": 15.416799545288086,
"objective/non_score_reward": -0.4625040292739868,
"objective/rlhf_reward": -0.012308701872825623,
"objective/scores": 0.44921875,
"policy/approxkl_avg": 0.0010212509660050273,
"policy/clipfrac_avg": 0.008898193016648293,
"policy/entropy_avg": 0.13633601367473602,
"step": 54,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 54,
"val/ratio": 0.9999532699584961,
"val/ratio_var": 4.915744966638158e-07
},
{
"episode": 3520,
"epoch": 0.6642762785431213,
"eps": 0,
"loss/policy_avg": -0.03656713292002678,
"loss/value_avg": 0.016763746738433838,
"lr": 4.3602693602693604e-07,
"objective/entropy": -728.6891479492188,
"objective/kl": 15.019660949707031,
"objective/non_score_reward": -0.4505898356437683,
"objective/rlhf_reward": -0.0018593519926071167,
"objective/scores": 0.44921875,
"policy/approxkl_avg": 0.0011785384267568588,
"policy/clipfrac_avg": 0.009183433838188648,
"policy/entropy_avg": 0.15262095630168915,
"step": 55,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 51,
"val/ratio": 1.0000860691070557,
"val/ratio_var": 8.0383756539959e-07
},
{
"episode": 3584,
"epoch": 0.6763540290620872,
"eps": 0,
"loss/policy_avg": -0.022373374551534653,
"loss/value_avg": 0.016118954867124557,
"lr": 4.3434343434343435e-07,
"objective/entropy": -780.7221069335938,
"objective/kl": 14.14107608795166,
"objective/non_score_reward": -0.4242323040962219,
"objective/rlhf_reward": -0.03507213294506073,
"objective/scores": 0.388671875,
"policy/approxkl_avg": 0.0003644491662271321,
"policy/clipfrac_avg": 0.009697480127215385,
"policy/entropy_avg": 0.1477101743221283,
"step": 56,
"val/clipfrac_avg": 4.006410563306417e-06,
"val/num_eos_tokens": 58,
"val/ratio": 1.0000145435333252,
"val/ratio_var": 4.715680574918224e-07
},
{
"episode": 3648,
"epoch": 0.6884317795810531,
"eps": 0,
"loss/policy_avg": -0.02042277157306671,
"loss/value_avg": 0.015200886875391006,
"lr": 4.326599326599326e-07,
"objective/entropy": -730.958740234375,
"objective/kl": 15.971136093139648,
"objective/non_score_reward": -0.47913408279418945,
"objective/rlhf_reward": -0.11194658279418945,
"objective/scores": 0.3671875,
"policy/approxkl_avg": 0.0004102127568330616,
"policy/clipfrac_avg": 0.009771636687219143,
"policy/entropy_avg": 0.14384841918945312,
"step": 57,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 54,
"val/ratio": 1.0000948905944824,
"val/ratio_var": 5.781259915238479e-07
},
{
"episode": 3712,
"epoch": 0.7005095301000188,
"eps": 0,
"loss/policy_avg": -0.04674074053764343,
"loss/value_avg": 0.015770789235830307,
"lr": 4.309764309764309e-07,
"objective/entropy": -689.0645141601562,
"objective/kl": 15.481245994567871,
"objective/non_score_reward": -0.4644373655319214,
"objective/rlhf_reward": -0.0034998655319213867,
"objective/scores": 0.4609375,
"policy/approxkl_avg": 0.00042656456935219467,
"policy/clipfrac_avg": 0.009159904904663563,
"policy/entropy_avg": 0.15465545654296875,
"step": 58,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 48,
"val/ratio": 1.000055193901062,
"val/ratio_var": 6.497099320768029e-07
},
{
"episode": 3776,
"epoch": 0.7125872806189847,
"eps": 0,
"loss/policy_avg": -0.018530046567320824,
"loss/value_avg": 0.014360702596604824,
"lr": 4.292929292929293e-07,
"objective/entropy": -639.9326782226562,
"objective/kl": 17.16830825805664,
"objective/non_score_reward": -0.5150492787361145,
"objective/rlhf_reward": -0.2088969349861145,
"objective/scores": 0.306640625,
"policy/approxkl_avg": 0.00047735171392560005,
"policy/clipfrac_avg": 0.010190478526055813,
"policy/entropy_avg": 0.17077922821044922,
"step": 59,
"val/clipfrac_avg": 8.251181498053484e-06,
"val/num_eos_tokens": 47,
"val/ratio": 1.0000905990600586,
"val/ratio_var": 1.1262843599979533e-06
},
{
"episode": 3840,
"epoch": 0.7246650311379506,
"eps": 0,
"loss/policy_avg": -0.03477172553539276,
"loss/value_avg": 0.014889835380017757,
"lr": 4.276094276094276e-07,
"objective/entropy": -693.819091796875,
"objective/kl": 14.728355407714844,
"objective/non_score_reward": -0.4418506622314453,
"objective/rlhf_reward": -0.03706549108028412,
"objective/scores": 0.404296875,
"policy/approxkl_avg": 0.00047620845725759864,
"policy/clipfrac_avg": 0.00950541626662016,
"policy/entropy_avg": 0.15099716186523438,
"step": 60,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 43,
"val/ratio": 1.0001107454299927,
"val/ratio_var": 7.359848268606584e-07
},
{
"episode": 3904,
"epoch": 0.7367427816569164,
"eps": 0,
"loss/policy_avg": -0.02886144444346428,
"loss/value_avg": 0.014064384624361992,
"lr": 4.259259259259259e-07,
"objective/entropy": -732.0101318359375,
"objective/kl": 16.070905685424805,
"objective/non_score_reward": -0.48212718963623047,
"objective/rlhf_reward": -0.09003733098506927,
"objective/scores": 0.392578125,
"policy/approxkl_avg": 0.0004534229519777,
"policy/clipfrac_avg": 0.008900020271539688,
"policy/entropy_avg": 0.15126292407512665,
"step": 61,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 0.9999483823776245,
"val/ratio_var": 5.890042302780785e-07
},
{
"episode": 3968,
"epoch": 0.7488205321758823,
"eps": 0,
"loss/policy_avg": -0.03744254261255264,
"loss/value_avg": 0.015733784064650536,
"lr": 4.242424242424242e-07,
"objective/entropy": -729.4462890625,
"objective/kl": 15.581929206848145,
"objective/non_score_reward": -0.4674578905105591,
"objective/rlhf_reward": 0.05256165564060211,
"objective/scores": 0.51953125,
"policy/approxkl_avg": 0.0003785984590649605,
"policy/clipfrac_avg": 0.008160373196005821,
"policy/entropy_avg": 0.14229774475097656,
"step": 62,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 58,
"val/ratio": 1.0000548362731934,
"val/ratio_var": 4.930154204885184e-07
},
{
"episode": 4032,
"epoch": 0.760898282694848,
"eps": 0,
"loss/policy_avg": -0.030014147982001305,
"loss/value_avg": 0.014109417796134949,
"lr": 4.225589225589226e-07,
"objective/entropy": -747.200439453125,
"objective/kl": 15.121429443359375,
"objective/non_score_reward": -0.453642874956131,
"objective/rlhf_reward": -0.04592801630496979,
"objective/scores": 0.408203125,
"policy/approxkl_avg": 0.0004679747798945755,
"policy/clipfrac_avg": 0.008998386561870575,
"policy/entropy_avg": 0.15067800879478455,
"step": 63,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 55,
"val/ratio": 1.0000910758972168,
"val/ratio_var": 5.872569204257161e-07
},
{
"episode": 4096,
"epoch": 0.7729760332138139,
"eps": 0,
"loss/policy_avg": -0.02917386218905449,
"loss/value_avg": 0.013095545582473278,
"lr": 4.208754208754209e-07,
"objective/entropy": -728.0292358398438,
"objective/kl": 15.903536796569824,
"objective/non_score_reward": -0.4771060347557068,
"objective/rlhf_reward": -0.11333651840686798,
"objective/scores": 0.36328125,
"policy/approxkl_avg": 0.000516406842507422,
"policy/clipfrac_avg": 0.008886368945240974,
"policy/entropy_avg": 0.1517333984375,
"step": 64,
"val/clipfrac_avg": 5.552594302571379e-06,
"val/num_eos_tokens": 51,
"val/ratio": 0.9999017119407654,
"val/ratio_var": 7.202033316389134e-07
},
{
"episode": 4160,
"epoch": 0.7850537837327798,
"eps": 0,
"loss/policy_avg": -0.026137467473745346,
"loss/value_avg": 0.012687700800597668,
"lr": 4.1919191919191915e-07,
"objective/entropy": -767.3764038085938,
"objective/kl": 15.278279304504395,
"objective/non_score_reward": -0.4583483934402466,
"objective/rlhf_reward": -0.008641347289085388,
"objective/scores": 0.44921875,
"policy/approxkl_avg": 0.0003777016536332667,
"policy/clipfrac_avg": 0.009344375692307949,
"policy/entropy_avg": 0.14270401000976562,
"step": 65,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 45,
"val/ratio": 0.9999732971191406,
"val/ratio_var": 6.31260888894758e-07
},
{
"episode": 4224,
"epoch": 0.7971315342517457,
"eps": 0,
"loss/policy_avg": -0.02155480347573757,
"loss/value_avg": 0.012883363291621208,
"lr": 4.1750841750841746e-07,
"objective/entropy": -701.712890625,
"objective/kl": 15.514598846435547,
"objective/non_score_reward": -0.4654379189014435,
"objective/rlhf_reward": -0.08604338765144348,
"objective/scores": 0.37890625,
"policy/approxkl_avg": 0.00044276120024733245,
"policy/clipfrac_avg": 0.00973192136734724,
"policy/entropy_avg": 0.15912756323814392,
"step": 66,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 51,
"val/ratio": 1.0000464916229248,
"val/ratio_var": 7.58379371745832e-07
},
{
"episode": 4288,
"epoch": 0.8092092847707114,
"eps": 0,
"loss/policy_avg": -0.03506336733698845,
"loss/value_avg": 0.014060527086257935,
"lr": 4.158249158249158e-07,
"objective/entropy": -723.3513793945312,
"objective/kl": 16.13052749633789,
"objective/non_score_reward": -0.4839158058166504,
"objective/rlhf_reward": -0.03518534451723099,
"objective/scores": 0.44921875,
"policy/approxkl_avg": 0.0006458936259150505,
"policy/clipfrac_avg": 0.009050115011632442,
"policy/entropy_avg": 0.14800135791301727,
"step": 67,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 50,
"val/ratio": 0.9999250173568726,
"val/ratio_var": 6.238656169443857e-07
},
{
"episode": 4352,
"epoch": 0.8212870352896773,
"eps": 0,
"loss/policy_avg": -0.03492492437362671,
"loss/value_avg": 0.013395547866821289,
"lr": 4.1414141414141413e-07,
"objective/entropy": -756.134765625,
"objective/kl": 14.228071212768555,
"objective/non_score_reward": -0.4268421530723572,
"objective/rlhf_reward": 0.005775056779384613,
"objective/scores": 0.43359375,
"policy/approxkl_avg": 0.00040148766129277647,
"policy/clipfrac_avg": 0.008605660870671272,
"policy/entropy_avg": 0.14455795288085938,
"step": 68,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 51,
"val/ratio": 0.9998807907104492,
"val/ratio_var": 5.104772071717889e-07
},
{
"episode": 4416,
"epoch": 0.8333647858086431,
"eps": 0,
"loss/policy_avg": -0.03622628003358841,
"loss/value_avg": 0.012129010632634163,
"lr": 4.1245791245791244e-07,
"objective/entropy": -692.700439453125,
"objective/kl": 15.215728759765625,
"objective/non_score_reward": -0.45647183060646057,
"objective/rlhf_reward": -0.03947964310646057,
"objective/scores": 0.41796875,
"policy/approxkl_avg": 0.00043082493357360363,
"policy/clipfrac_avg": 0.009177702479064465,
"policy/entropy_avg": 0.15514373779296875,
"step": 69,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 44,
"val/ratio": 1.0000324249267578,
"val/ratio_var": 8.417625281254004e-07
},
{
"episode": 4480,
"epoch": 0.845442536327609,
"eps": 0,
"loss/policy_avg": -0.0306796133518219,
"loss/value_avg": 0.011692370288074017,
"lr": 4.1077441077441075e-07,
"objective/entropy": -700.46533203125,
"objective/kl": 14.929758071899414,
"objective/non_score_reward": -0.4478927254676819,
"objective/rlhf_reward": -0.05726771056652069,
"objective/scores": 0.390625,
"policy/approxkl_avg": 0.000412381486967206,
"policy/clipfrac_avg": 0.008844866417348385,
"policy/entropy_avg": 0.15450796484947205,
"step": 70,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 52,
"val/ratio": 1.0001542568206787,
"val/ratio_var": 7.975154403538909e-07
},
{
"episode": 4544,
"epoch": 0.8575202868465748,
"eps": 0,
"loss/policy_avg": -0.03419748693704605,
"loss/value_avg": 0.012416936457157135,
"lr": 4.090909090909091e-07,
"objective/entropy": -772.2770385742188,
"objective/kl": 12.517084121704102,
"objective/non_score_reward": -0.3755125403404236,
"objective/rlhf_reward": 0.14401870965957642,
"objective/scores": 0.51953125,
"policy/approxkl_avg": 0.00036138106952421367,
"policy/clipfrac_avg": 0.008941545151174068,
"policy/entropy_avg": 0.13918177783489227,
"step": 71,
"val/clipfrac_avg": 9.753433914738707e-06,
"val/num_eos_tokens": 55,
"val/ratio": 0.9999549388885498,
"val/ratio_var": 4.947730758431135e-07
},
{
"episode": 4608,
"epoch": 0.8695980373655406,
"eps": 0,
"loss/policy_avg": -0.0443786196410656,
"loss/value_avg": 0.011243673972785473,
"lr": 4.0740740740740737e-07,
"objective/entropy": -702.0181884765625,
"objective/kl": 14.902286529541016,
"objective/non_score_reward": -0.44706863164901733,
"objective/rlhf_reward": -0.01152174174785614,
"objective/scores": 0.435546875,
"policy/approxkl_avg": 0.00046824943274259567,
"policy/clipfrac_avg": 0.009215106256306171,
"policy/entropy_avg": 0.15385818481445312,
"step": 72,
"val/clipfrac_avg": 4.2761357690324076e-06,
"val/num_eos_tokens": 51,
"val/ratio": 0.9999047517776489,
"val/ratio_var": 8.828073418953863e-07
},
{
"episode": 4672,
"epoch": 0.8816757878845065,
"eps": 0,
"loss/policy_avg": -0.044579483568668365,
"loss/value_avg": 0.013477655127644539,
"lr": 4.057239057239057e-07,
"objective/entropy": -750.3492431640625,
"objective/kl": 12.742916107177734,
"objective/non_score_reward": -0.38228750228881836,
"objective/rlhf_reward": 0.16458749771118164,
"objective/scores": 0.546875,
"policy/approxkl_avg": 0.00037200923543423414,
"policy/clipfrac_avg": 0.008558372035622597,
"policy/entropy_avg": 0.14228439331054688,
"step": 73,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 48,
"val/ratio": 0.999910831451416,
"val/ratio_var": 6.942154300304537e-07
},
{
"episode": 4736,
"epoch": 0.8937535384034724,
"eps": 0,
"loss/policy_avg": -0.020492155104875565,
"loss/value_avg": 0.01326964795589447,
"lr": 4.04040404040404e-07,
"objective/entropy": -776.6446533203125,
"objective/kl": 14.296285629272461,
"objective/non_score_reward": -0.42888855934143066,
"objective/rlhf_reward": 0.007146604359149933,
"objective/scores": 0.435546875,
"policy/approxkl_avg": 0.00041618672548793256,
"policy/clipfrac_avg": 0.009593900293111801,
"policy/entropy_avg": 0.15248744189739227,
"step": 74,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 0.9999758005142212,
"val/ratio_var": 6.57985367524816e-07
},
{
"episode": 4800,
"epoch": 0.9058312889224382,
"eps": 0,
"loss/policy_avg": -0.03961968421936035,
"loss/value_avg": 0.013151820749044418,
"lr": 4.0235690235690236e-07,
"objective/entropy": -710.4595336914062,
"objective/kl": 14.758489608764648,
"objective/non_score_reward": -0.44275468587875366,
"objective/rlhf_reward": -0.023809373378753662,
"objective/scores": 0.41796875,
"policy/approxkl_avg": 0.0004320571315474808,
"policy/clipfrac_avg": 0.009666088968515396,
"policy/entropy_avg": 0.16128668189048767,
"step": 75,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 50,
"val/ratio": 1.0000278949737549,
"val/ratio_var": 7.166216846599127e-07
},
{
"episode": 4864,
"epoch": 0.917909039441404,
"eps": 0,
"loss/policy_avg": -0.04150720685720444,
"loss/value_avg": 0.013188062235713005,
"lr": 4.0067340067340067e-07,
"objective/entropy": -744.156005859375,
"objective/kl": 15.219215393066406,
"objective/non_score_reward": -0.4565764367580414,
"objective/rlhf_reward": 0.07613840699195862,
"objective/scores": 0.53125,
"policy/approxkl_avg": 0.0003768115711864084,
"policy/clipfrac_avg": 0.00844576582312584,
"policy/entropy_avg": 0.15825144946575165,
"step": 76,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 45,
"val/ratio": 0.9999443292617798,
"val/ratio_var": 8.649810183669615e-07
},
{
"episode": 4928,
"epoch": 0.9299867899603699,
"eps": 0,
"loss/policy_avg": -0.025156065821647644,
"loss/value_avg": 0.011967229656875134,
"lr": 3.98989898989899e-07,
"objective/entropy": -674.7507934570312,
"objective/kl": 14.667293548583984,
"objective/non_score_reward": -0.44001880288124084,
"objective/rlhf_reward": -0.10603442043066025,
"objective/scores": 0.333984375,
"policy/approxkl_avg": 0.00046505866339430213,
"policy/clipfrac_avg": 0.009665473364293575,
"policy/entropy_avg": 0.15473303198814392,
"step": 77,
"val/clipfrac_avg": 7.867573003750294e-06,
"val/num_eos_tokens": 47,
"val/ratio": 0.9999678134918213,
"val/ratio_var": 6.291583645179344e-07
},
{
"episode": 4992,
"epoch": 0.9420645404793357,
"eps": 0,
"loss/policy_avg": -0.04044274613261223,
"loss/value_avg": 0.012631962075829506,
"lr": 3.973063973063973e-07,
"objective/entropy": -680.3353271484375,
"objective/kl": 14.307917594909668,
"objective/non_score_reward": -0.4292375147342682,
"objective/rlhf_reward": -0.0073625147342681885,
"objective/scores": 0.421875,
"policy/approxkl_avg": 0.0005386772681958973,
"policy/clipfrac_avg": 0.008763562887907028,
"policy/entropy_avg": 0.16646194458007812,
"step": 78,
"val/clipfrac_avg": 1.0013714927481487e-05,
"val/num_eos_tokens": 53,
"val/ratio": 0.9999792575836182,
"val/ratio_var": 7.390693212983024e-07
},
{
"episode": 5056,
"epoch": 0.9541422909983016,
"eps": 0,
"loss/policy_avg": -0.048411011695861816,
"loss/value_avg": 0.011281365528702736,
"lr": 3.956228956228956e-07,
"objective/entropy": -662.266845703125,
"objective/kl": 15.492654800415039,
"objective/non_score_reward": -0.46477964520454407,
"objective/rlhf_reward": -0.049252308905124664,
"objective/scores": 0.416015625,
"policy/approxkl_avg": 0.0005287184612825513,
"policy/clipfrac_avg": 0.009335631504654884,
"policy/entropy_avg": 0.18247604370117188,
"step": 79,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 40,
"val/ratio": 1.0000967979431152,
"val/ratio_var": 5.371284714783542e-07
},
{
"episode": 5120,
"epoch": 0.9662200415172674,
"eps": 0,
"loss/policy_avg": -0.05055360123515129,
"loss/value_avg": 0.01074596494436264,
"lr": 3.939393939393939e-07,
"objective/entropy": -696.76025390625,
"objective/kl": 13.393022537231445,
"objective/non_score_reward": -0.40179064869880676,
"objective/rlhf_reward": 0.028385117650032043,
"objective/scores": 0.4296875,
"policy/approxkl_avg": 0.00045566324843093753,
"policy/clipfrac_avg": 0.01011097151786089,
"policy/entropy_avg": 0.16834895312786102,
"step": 80,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 58,
"val/ratio": 1.0001335144042969,
"val/ratio_var": 8.958918442658614e-07
},
{
"episode": 5184,
"epoch": 0.9782977920362332,
"eps": 0,
"loss/policy_avg": -0.03783099725842476,
"loss/value_avg": 0.010885774157941341,
"lr": 3.922558922558922e-07,
"objective/entropy": -701.8968505859375,
"objective/kl": 14.081245422363281,
"objective/non_score_reward": -0.4224373698234558,
"objective/rlhf_reward": -0.01716393232345581,
"objective/scores": 0.40625,
"policy/approxkl_avg": 0.00042904424481093884,
"policy/clipfrac_avg": 0.008891528472304344,
"policy/entropy_avg": 0.16935603320598602,
"step": 81,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 55,
"val/ratio": 0.9999715089797974,
"val/ratio_var": 8.756766760598111e-07
},
{
"episode": 5248,
"epoch": 0.9903755425551991,
"eps": 0,
"loss/policy_avg": -0.03562987968325615,
"loss/value_avg": 0.010367941111326218,
"lr": 3.9057239057239053e-07,
"objective/entropy": -733.0692749023438,
"objective/kl": 12.260353088378906,
"objective/non_score_reward": -0.36781054735183716,
"objective/rlhf_reward": 0.10777536779642105,
"objective/scores": 0.4765625,
"policy/approxkl_avg": 0.00047890731366351247,
"policy/clipfrac_avg": 0.008688896894454956,
"policy/entropy_avg": 0.1473541259765625,
"step": 82,
"val/clipfrac_avg": 4.130320121475961e-06,
"val/num_eos_tokens": 54,
"val/ratio": 1.000084400177002,
"val/ratio_var": 5.904771569475997e-07
},
{
"episode": 5312,
"epoch": 1.002453293074165,
"eps": 0,
"loss/policy_avg": -0.03139907121658325,
"loss/value_avg": 0.010464398190379143,
"lr": 3.888888888888889e-07,
"objective/entropy": -696.6053466796875,
"objective/kl": 12.150976181030273,
"objective/non_score_reward": -0.36452925205230713,
"objective/rlhf_reward": -0.008083932101726532,
"objective/scores": 0.35546875,
"policy/approxkl_avg": 0.00038306796341203153,
"policy/clipfrac_avg": 0.009303595870733261,
"policy/entropy_avg": 0.17142996191978455,
"step": 83,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 52,
"val/ratio": 1.0002813339233398,
"val/ratio_var": 6.230750955182884e-07
},
{
"episode": 5376,
"epoch": 1.0145310435931307,
"eps": 0,
"loss/policy_avg": -0.03326902911067009,
"loss/value_avg": 0.009499987587332726,
"lr": 3.872053872053872e-07,
"objective/entropy": -681.0773315429688,
"objective/kl": 13.791988372802734,
"objective/non_score_reward": -0.41375964879989624,
"objective/rlhf_reward": 0.0037208348512649536,
"objective/scores": 0.41796875,
"policy/approxkl_avg": 0.00044912411249242723,
"policy/clipfrac_avg": 0.009229006245732307,
"policy/entropy_avg": 0.16622290015220642,
"step": 84,
"val/clipfrac_avg": 1.3086264516459778e-05,
"val/num_eos_tokens": 50,
"val/ratio": 1.0001062154769897,
"val/ratio_var": 6.348414558488003e-07
},
{
"episode": 5440,
"epoch": 1.0266087941120967,
"eps": 0,
"loss/policy_avg": -0.024213604629039764,
"loss/value_avg": 0.009874923154711723,
"lr": 3.855218855218855e-07,
"objective/entropy": -694.9255981445312,
"objective/kl": 12.345937728881836,
"objective/non_score_reward": -0.37037813663482666,
"objective/rlhf_reward": 0.06370390206575394,
"objective/scores": 0.43359375,
"policy/approxkl_avg": 0.0010469364933669567,
"policy/clipfrac_avg": 0.007577784359455109,
"policy/entropy_avg": 0.17229843139648438,
"step": 85,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 35,
"val/ratio": 1.000024676322937,
"val/ratio_var": 6.738481488355319e-07
},
{
"episode": 5504,
"epoch": 1.0386865446310625,
"eps": 0,
"loss/policy_avg": -0.024558693170547485,
"loss/value_avg": 0.00916454941034317,
"lr": 3.8383838383838377e-07,
"objective/entropy": -758.008544921875,
"objective/kl": 11.96615219116211,
"objective/non_score_reward": -0.3589845895767212,
"objective/rlhf_reward": 0.05996072292327881,
"objective/scores": 0.41796875,
"policy/approxkl_avg": 0.0004611395706888288,
"policy/clipfrac_avg": 0.009094095788896084,
"policy/entropy_avg": 0.1533660888671875,
"step": 86,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 44,
"val/ratio": 1.0000550746917725,
"val/ratio_var": 5.709296146960696e-07
},
{
"episode": 5568,
"epoch": 1.0507642951500282,
"eps": 0,
"loss/policy_avg": -0.03639592230319977,
"loss/value_avg": 0.010131916962563992,
"lr": 3.8215488215488214e-07,
"objective/entropy": -700.35693359375,
"objective/kl": 13.447929382324219,
"objective/non_score_reward": -0.40343791246414185,
"objective/rlhf_reward": -0.0006058886647224426,
"objective/scores": 0.40234375,
"policy/approxkl_avg": 0.00040239907684735954,
"policy/clipfrac_avg": 0.00921421404927969,
"policy/entropy_avg": 0.17262396216392517,
"step": 87,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 50,
"val/ratio": 1.0000275373458862,
"val/ratio_var": 6.002900931889599e-07
},
{
"episode": 5632,
"epoch": 1.0628420456689942,
"eps": 0,
"loss/policy_avg": -0.01775004342198372,
"loss/value_avg": 0.010766441933810711,
"lr": 3.8047138047138045e-07,
"objective/entropy": -757.93701171875,
"objective/kl": 12.896736145019531,
"objective/non_score_reward": -0.3869020938873291,
"objective/rlhf_reward": 0.0408322811126709,
"objective/scores": 0.427734375,
"policy/approxkl_avg": 0.0003993526042904705,
"policy/clipfrac_avg": 0.00875895470380783,
"policy/entropy_avg": 0.158355712890625,
"step": 88,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 61,
"val/ratio": 0.9999960660934448,
"val/ratio_var": 5.071574946668989e-07
},
{
"episode": 5696,
"epoch": 1.07491979618796,
"eps": 0,
"loss/policy_avg": -0.019584549590945244,
"loss/value_avg": 0.009444857016205788,
"lr": 3.7878787878787876e-07,
"objective/entropy": -716.8355102539062,
"objective/kl": 12.824501037597656,
"objective/non_score_reward": -0.3847350478172302,
"objective/rlhf_reward": 0.03274543583393097,
"objective/scores": 0.41796875,
"policy/approxkl_avg": 0.00036994865513406694,
"policy/clipfrac_avg": 0.008531251922249794,
"policy/entropy_avg": 0.1638692319393158,
"step": 89,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 43,
"val/ratio": 1.000080943107605,
"val/ratio_var": 4.814820613319171e-07
},
{
"episode": 5760,
"epoch": 1.086997546706926,
"eps": 0,
"loss/policy_avg": -0.03423365205526352,
"loss/value_avg": 0.009232178330421448,
"lr": 3.7710437710437707e-07,
"objective/entropy": -693.8837280273438,
"objective/kl": 14.44405746459961,
"objective/non_score_reward": -0.4333217144012451,
"objective/rlhf_reward": 0.0012486129999160767,
"objective/scores": 0.43359375,
"policy/approxkl_avg": 0.0004389469395391643,
"policy/clipfrac_avg": 0.009360795840620995,
"policy/entropy_avg": 0.17364120483398438,
"step": 90,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 55,
"val/ratio": 1.000040054321289,
"val/ratio_var": 6.926705395926547e-07
},
{
"episode": 5824,
"epoch": 1.0990752972258917,
"eps": 0,
"loss/policy_avg": -0.032273001968860626,
"loss/value_avg": 0.008706326596438885,
"lr": 3.7542087542087543e-07,
"objective/entropy": -689.3602294921875,
"objective/kl": 12.720474243164062,
"objective/non_score_reward": -0.38161420822143555,
"objective/rlhf_reward": 0.004616260528564453,
"objective/scores": 0.38671875,
"policy/approxkl_avg": 0.0006273721810430288,
"policy/clipfrac_avg": 0.009181271307170391,
"policy/entropy_avg": 0.18628311157226562,
"step": 91,
"val/clipfrac_avg": 1.6534391761524603e-05,
"val/num_eos_tokens": 41,
"val/ratio": 1.0000529289245605,
"val/ratio_var": 8.713162742424174e-07
},
{
"episode": 5888,
"epoch": 1.1111530477448575,
"eps": 0,
"loss/policy_avg": -0.02137349173426628,
"loss/value_avg": 0.009642090648412704,
"lr": 3.7373737373737374e-07,
"objective/entropy": -692.1001586914062,
"objective/kl": 14.837923049926758,
"objective/non_score_reward": -0.4451376795768738,
"objective/rlhf_reward": -0.06671970337629318,
"objective/scores": 0.37890625,
"policy/approxkl_avg": 0.00043404646567068994,
"policy/clipfrac_avg": 0.009237932972609997,
"policy/entropy_avg": 0.17236074805259705,
"step": 92,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 56,
"val/ratio": 1.0002210140228271,
"val/ratio_var": 5.661090085595788e-07
},
{
"episode": 5952,
"epoch": 1.1232307982638234,
"eps": 0,
"loss/policy_avg": -0.02585110068321228,
"loss/value_avg": 0.008299533277750015,
"lr": 3.7205387205387205e-07,
"objective/entropy": -694.7158813476562,
"objective/kl": 13.083290100097656,
"objective/non_score_reward": -0.3924986720085144,
"objective/rlhf_reward": -0.0001646876335144043,
"objective/scores": 0.392578125,
"policy/approxkl_avg": 0.00045136193512007594,
"policy/clipfrac_avg": 0.009170552715659142,
"policy/entropy_avg": 0.1658935546875,
"step": 93,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 47,
"val/ratio": 1.0000104904174805,
"val/ratio_var": 6.102125666984648e-07
},
{
"episode": 6016,
"epoch": 1.1353085487827892,
"eps": 0,
"loss/policy_avg": -0.009908072650432587,
"loss/value_avg": 0.008584199473261833,
"lr": 3.703703703703703e-07,
"objective/entropy": -704.2706298828125,
"objective/kl": 13.47339916229248,
"objective/non_score_reward": -0.4042019844055176,
"objective/rlhf_reward": -0.041409000754356384,
"objective/scores": 0.36328125,
"policy/approxkl_avg": 0.0004217799287289381,
"policy/clipfrac_avg": 0.009384381584823132,
"policy/entropy_avg": 0.15645718574523926,
"step": 94,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 51,
"val/ratio": 0.9999314546585083,
"val/ratio_var": 5.486367058438191e-07
},
{
"episode": 6080,
"epoch": 1.147386299301755,
"eps": 0,
"loss/policy_avg": -0.017584126442670822,
"loss/value_avg": 0.008460859768092632,
"lr": 3.686868686868687e-07,
"objective/entropy": -668.5169677734375,
"objective/kl": 13.884815216064453,
"objective/non_score_reward": -0.41654446721076965,
"objective/rlhf_reward": -0.041544459760189056,
"objective/scores": 0.375,
"policy/approxkl_avg": 0.00046371493954211473,
"policy/clipfrac_avg": 0.0087856724858284,
"policy/entropy_avg": 0.17994818091392517,
"step": 95,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 49,
"val/ratio": 1.0000855922698975,
"val/ratio_var": 8.110731641863822e-07
},
{
"episode": 6144,
"epoch": 1.159464049820721,
"eps": 0,
"loss/policy_avg": -0.044272422790527344,
"loss/value_avg": 0.008409342728555202,
"lr": 3.67003367003367e-07,
"objective/entropy": -681.5213623046875,
"objective/kl": 12.985597610473633,
"objective/non_score_reward": -0.38956788182258606,
"objective/rlhf_reward": 0.06941649317741394,
"objective/scores": 0.458984375,
"policy/approxkl_avg": 0.0004158633528277278,
"policy/clipfrac_avg": 0.008287884294986725,
"policy/entropy_avg": 0.17576727271080017,
"step": 96,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 50,
"val/ratio": 0.9998923540115356,
"val/ratio_var": 7.354411764026736e-07
},
{
"episode": 6208,
"epoch": 1.1715418003396867,
"eps": 0,
"loss/policy_avg": -0.021865837275981903,
"loss/value_avg": 0.008920140564441681,
"lr": 3.653198653198653e-07,
"objective/entropy": -668.6990966796875,
"objective/kl": 13.717606544494629,
"objective/non_score_reward": -0.41152817010879517,
"objective/rlhf_reward": -0.03311019390821457,
"objective/scores": 0.37890625,
"policy/approxkl_avg": 0.0005852892645634711,
"policy/clipfrac_avg": 0.009737148880958557,
"policy/entropy_avg": 0.17390570044517517,
"step": 97,
"val/clipfrac_avg": 5.780614628747571e-06,
"val/num_eos_tokens": 49,
"val/ratio": 1.0000433921813965,
"val/ratio_var": 7.945446327539685e-07
},
{
"episode": 6272,
"epoch": 1.1836195508586527,
"eps": 0,
"loss/policy_avg": -0.01734349876642227,
"loss/value_avg": 0.008430123329162598,
"lr": 3.636363636363636e-07,
"objective/entropy": -680.8672485351562,
"objective/kl": 13.248254776000977,
"objective/non_score_reward": -0.3974476158618927,
"objective/rlhf_reward": -0.0346546545624733,
"objective/scores": 0.36328125,
"policy/approxkl_avg": 0.0004206518060527742,
"policy/clipfrac_avg": 0.008977975696325302,
"policy/entropy_avg": 0.18427658081054688,
"step": 98,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 49,
"val/ratio": 0.9999732971191406,
"val/ratio_var": 7.201407470347476e-07
},
{
"episode": 6336,
"epoch": 1.1956973013776184,
"eps": 0,
"loss/policy_avg": -0.01861773617565632,
"loss/value_avg": 0.007797658443450928,
"lr": 3.6195286195286197e-07,
"objective/entropy": -750.1210327148438,
"objective/kl": 12.454809188842773,
"objective/non_score_reward": -0.37364426255226135,
"objective/rlhf_reward": 0.08729323744773865,
"objective/scores": 0.4609375,
"policy/approxkl_avg": 0.00039422509144060314,
"policy/clipfrac_avg": 0.008532920852303505,
"policy/entropy_avg": 0.15262095630168915,
"step": 99,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 51,
"val/ratio": 1.0000927448272705,
"val/ratio_var": 6.317893621599069e-07
},
{
"episode": 6400,
"epoch": 1.2077750518965842,
"eps": 0,
"loss/policy_avg": -0.018518339842557907,
"loss/value_avg": 0.008531475439667702,
"lr": 3.602693602693603e-07,
"objective/entropy": -659.4075317382812,
"objective/kl": 14.026962280273438,
"objective/non_score_reward": -0.4208088517189026,
"objective/rlhf_reward": -0.014070577919483185,
"objective/scores": 0.40625,
"policy/approxkl_avg": 0.0004532616585493088,
"policy/clipfrac_avg": 0.009551241993904114,
"policy/entropy_avg": 0.1776987761259079,
"step": 100,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 49,
"val/ratio": 1.0001015663146973,
"val/ratio_var": 7.798981869200361e-07
},
{
"episode": 6464,
"epoch": 1.2198528024155502,
"eps": 0,
"loss/policy_avg": -0.016511594876646996,
"loss/value_avg": 0.007756595965474844,
"lr": 3.5858585858585854e-07,
"objective/entropy": -679.53662109375,
"objective/kl": 11.8580322265625,
"objective/non_score_reward": -0.3557409346103668,
"objective/rlhf_reward": 0.003634057939052582,
"objective/scores": 0.359375,
"policy/approxkl_avg": 0.000443882163381204,
"policy/clipfrac_avg": 0.008907586336135864,
"policy/entropy_avg": 0.16290760040283203,
"step": 101,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 1.0000029802322388,
"val/ratio_var": 6.948280315555166e-07
},
{
"episode": 6528,
"epoch": 1.231930552934516,
"eps": 0,
"loss/policy_avg": -0.0391608364880085,
"loss/value_avg": 0.00805463083088398,
"lr": 3.5690235690235685e-07,
"objective/entropy": -706.1781616210938,
"objective/kl": 12.108580589294434,
"objective/non_score_reward": -0.36325740814208984,
"objective/rlhf_reward": 0.11525820195674896,
"objective/scores": 0.478515625,
"policy/approxkl_avg": 0.00041044512181542814,
"policy/clipfrac_avg": 0.009424544870853424,
"policy/entropy_avg": 0.18370692431926727,
"step": 102,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 44,
"val/ratio": 0.9999408721923828,
"val/ratio_var": 7.074789323269215e-07
},
{
"episode": 6592,
"epoch": 1.2440083034534817,
"eps": 0,
"loss/policy_avg": -0.028079848736524582,
"loss/value_avg": 0.007725001312792301,
"lr": 3.552188552188552e-07,
"objective/entropy": -759.4559326171875,
"objective/kl": 10.86036491394043,
"objective/non_score_reward": -0.3258109390735626,
"objective/rlhf_reward": 0.17907187342643738,
"objective/scores": 0.50390625,
"policy/approxkl_avg": 0.0003799691912718117,
"policy/clipfrac_avg": 0.009264815598726273,
"policy/entropy_avg": 0.16744868457317352,
"step": 103,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 57,
"val/ratio": 1.0000367164611816,
"val/ratio_var": 8.271580327345873e-07
},
{
"episode": 6656,
"epoch": 1.2560860539724477,
"eps": 0,
"loss/policy_avg": -0.029258184134960175,
"loss/value_avg": 0.008032194338738918,
"lr": 3.535353535353535e-07,
"objective/entropy": -731.3373413085938,
"objective/kl": 12.31856632232666,
"objective/non_score_reward": -0.3695569932460785,
"objective/rlhf_reward": 0.0728258341550827,
"objective/scores": 0.44140625,
"policy/approxkl_avg": 0.00038161594420671463,
"policy/clipfrac_avg": 0.008145595900714397,
"policy/entropy_avg": 0.15793482959270477,
"step": 104,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 49,
"val/ratio": 0.9998757839202881,
"val/ratio_var": 5.09608128140826e-07
},
{
"episode": 6720,
"epoch": 1.2681638044914134,
"eps": 0,
"loss/policy_avg": -0.02511785924434662,
"loss/value_avg": 0.007496064528822899,
"lr": 3.5185185185185183e-07,
"objective/entropy": -748.223388671875,
"objective/kl": 10.859156608581543,
"objective/non_score_reward": -0.3257746994495392,
"objective/rlhf_reward": 0.12442061305046082,
"objective/scores": 0.44921875,
"policy/approxkl_avg": 0.00035941399983130395,
"policy/clipfrac_avg": 0.007973343133926392,
"policy/entropy_avg": 0.1588083952665329,
"step": 105,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 49,
"val/ratio": 0.9999687075614929,
"val/ratio_var": 5.749298566115613e-07
},
{
"episode": 6784,
"epoch": 1.2802415550103794,
"eps": 0,
"loss/policy_avg": -0.016160570085048676,
"loss/value_avg": 0.0075116828083992004,
"lr": 3.5016835016835014e-07,
"objective/entropy": -734.2767333984375,
"objective/kl": 10.657108306884766,
"objective/non_score_reward": -0.3197132647037506,
"objective/rlhf_reward": 0.08360705524682999,
"objective/scores": 0.40234375,
"policy/approxkl_avg": 0.0003991243429481983,
"policy/clipfrac_avg": 0.008354730904102325,
"policy/entropy_avg": 0.17373785376548767,
"step": 106,
"val/clipfrac_avg": 6.916777692822507e-06,
"val/num_eos_tokens": 48,
"val/ratio": 0.9999769926071167,
"val/ratio_var": 2.0136023977102013e-06
},
{
"episode": 6848,
"epoch": 1.2923193055293452,
"eps": 0,
"loss/policy_avg": -0.031159965321421623,
"loss/value_avg": 0.007707377430051565,
"lr": 3.484848484848485e-07,
"objective/entropy": -751.6048583984375,
"objective/kl": 10.399733543395996,
"objective/non_score_reward": -0.31199198961257935,
"objective/rlhf_reward": 0.16603532433509827,
"objective/scores": 0.478515625,
"policy/approxkl_avg": 0.0003584410878829658,
"policy/clipfrac_avg": 0.0083873700350523,
"policy/entropy_avg": 0.17502593994140625,
"step": 107,
"val/clipfrac_avg": 4.006410563306417e-06,
"val/num_eos_tokens": 59,
"val/ratio": 1.0000163316726685,
"val/ratio_var": 4.1057577959691116e-07
},
{
"episode": 6912,
"epoch": 1.304397056048311,
"eps": 0,
"loss/policy_avg": -0.028454942628741264,
"loss/value_avg": 0.007068981416523457,
"lr": 3.4680134680134676e-07,
"objective/entropy": -743.0848388671875,
"objective/kl": 11.543351173400879,
"objective/non_score_reward": -0.34630051255226135,
"objective/rlhf_reward": 0.08485182374715805,
"objective/scores": 0.431640625,
"policy/approxkl_avg": 0.00046511981054209173,
"policy/clipfrac_avg": 0.00906567182391882,
"policy/entropy_avg": 0.17058055102825165,
"step": 108,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 49,
"val/ratio": 1.0001753568649292,
"val/ratio_var": 5.996980121381057e-07
},
{
"episode": 6976,
"epoch": 1.316474806567277,
"eps": 0,
"loss/policy_avg": -0.025691799819469452,
"loss/value_avg": 0.007134787738323212,
"lr": 3.451178451178451e-07,
"objective/entropy": -738.3368530273438,
"objective/kl": 12.694337844848633,
"objective/non_score_reward": -0.38083016872406006,
"objective/rlhf_reward": 0.022490166127681732,
"objective/scores": 0.40234375,
"policy/approxkl_avg": 0.0003700514789670706,
"policy/clipfrac_avg": 0.008202668279409409,
"policy/entropy_avg": 0.16368231177330017,
"step": 109,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 0.9999978542327881,
"val/ratio_var": 7.537715305261372e-07
},
{
"episode": 7040,
"epoch": 1.3285525570862426,
"eps": 0,
"loss/policy_avg": -0.02102075144648552,
"loss/value_avg": 0.0069115618243813515,
"lr": 3.434343434343434e-07,
"objective/entropy": -674.031494140625,
"objective/kl": 11.79364013671875,
"objective/non_score_reward": -0.3538092076778412,
"objective/rlhf_reward": 0.0021478235721588135,
"objective/scores": 0.35546875,
"policy/approxkl_avg": 0.0004363355692476034,
"policy/clipfrac_avg": 0.009443921968340874,
"policy/entropy_avg": 0.18636958301067352,
"step": 110,
"val/clipfrac_avg": 1.0153373295906931e-05,
"val/num_eos_tokens": 40,
"val/ratio": 1.0000288486480713,
"val/ratio_var": 8.098888883978361e-07
},
{
"episode": 7104,
"epoch": 1.3406303076052086,
"eps": 0,
"loss/policy_avg": -0.032738231122493744,
"loss/value_avg": 0.006995133124291897,
"lr": 3.4175084175084175e-07,
"objective/entropy": -694.0885620117188,
"objective/kl": 11.100361824035645,
"objective/non_score_reward": -0.33301082253456116,
"objective/rlhf_reward": 0.11034853756427765,
"objective/scores": 0.443359375,
"policy/approxkl_avg": 0.0004124289262108505,
"policy/clipfrac_avg": 0.009696437045931816,
"policy/entropy_avg": 0.16990280151367188,
"step": 111,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 41,
"val/ratio": 1.000012993812561,
"val/ratio_var": 6.493988848887966e-07
},
{
"episode": 7168,
"epoch": 1.3527080581241744,
"eps": 0,
"loss/policy_avg": -0.014131966978311539,
"loss/value_avg": 0.0068663340061903,
"lr": 3.4006734006734006e-07,
"objective/entropy": -708.8604125976562,
"objective/kl": 12.0945405960083,
"objective/non_score_reward": -0.36283618211746216,
"objective/rlhf_reward": 0.07319895178079605,
"objective/scores": 0.435546875,
"policy/approxkl_avg": 0.0003713433106895536,
"policy/clipfrac_avg": 0.008440559729933739,
"policy/entropy_avg": 0.17221546173095703,
"step": 112,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 42,
"val/ratio": 0.9999561309814453,
"val/ratio_var": 7.880616976763122e-07
},
{
"episode": 7232,
"epoch": 1.3647858086431401,
"eps": 0,
"loss/policy_avg": -0.025116082280874252,
"loss/value_avg": 0.006925811991095543,
"lr": 3.3838383838383837e-07,
"objective/entropy": -764.6071166992188,
"objective/kl": 9.985912322998047,
"objective/non_score_reward": -0.29957738518714905,
"objective/rlhf_reward": 0.20384058356285095,
"objective/scores": 0.50390625,
"policy/approxkl_avg": 0.00035295903217047453,
"policy/clipfrac_avg": 0.008869624696671963,
"policy/entropy_avg": 0.14903895556926727,
"step": 113,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 56,
"val/ratio": 0.9999246597290039,
"val/ratio_var": 6.762850830455136e-07
},
{
"episode": 7296,
"epoch": 1.3768635591621061,
"eps": 0,
"loss/policy_avg": -0.03208712860941887,
"loss/value_avg": 0.006576072424650192,
"lr": 3.3670033670033673e-07,
"objective/entropy": -727.884521484375,
"objective/kl": 10.056183815002441,
"objective/non_score_reward": -0.30168551206588745,
"objective/rlhf_reward": 0.20173244178295135,
"objective/scores": 0.50390625,
"policy/approxkl_avg": 0.00036313707823865116,
"policy/clipfrac_avg": 0.008787401020526886,
"policy/entropy_avg": 0.16502508521080017,
"step": 114,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 55,
"val/ratio": 1.000067114830017,
"val/ratio_var": 6.396308549483365e-07
},
{
"episode": 7360,
"epoch": 1.3889413096810719,
"eps": 0,
"loss/policy_avg": -0.012382835149765015,
"loss/value_avg": 0.007463869638741016,
"lr": 3.35016835016835e-07,
"objective/entropy": -742.7560424804688,
"objective/kl": 11.383095741271973,
"objective/non_score_reward": -0.3414928615093231,
"objective/rlhf_reward": 0.06329229474067688,
"objective/scores": 0.404296875,
"policy/approxkl_avg": 0.0003729486488737166,
"policy/clipfrac_avg": 0.008608178235590458,
"policy/entropy_avg": 0.15799586474895477,
"step": 115,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 49,
"val/ratio": 1.0000958442687988,
"val/ratio_var": 5.714954340874101e-07
},
{
"episode": 7424,
"epoch": 1.4010190602000376,
"eps": 0,
"loss/policy_avg": -0.025664834305644035,
"loss/value_avg": 0.007761640008538961,
"lr": 3.333333333333333e-07,
"objective/entropy": -698.7653198242188,
"objective/kl": 11.45352554321289,
"objective/non_score_reward": -0.34360575675964355,
"objective/rlhf_reward": 0.14565207064151764,
"objective/scores": 0.48828125,
"policy/approxkl_avg": 0.0004009153926745057,
"policy/clipfrac_avg": 0.009290624409914017,
"policy/entropy_avg": 0.17402777075767517,
"step": 116,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 42,
"val/ratio": 1.0000454187393188,
"val/ratio_var": 5.306055754772387e-07
},
{
"episode": 7488,
"epoch": 1.4130968107190036,
"eps": 0,
"loss/policy_avg": 0.002953662071377039,
"loss/value_avg": 0.008123742416501045,
"lr": 3.316498316498316e-07,
"objective/entropy": -682.9542236328125,
"objective/kl": 11.626581192016602,
"objective/non_score_reward": -0.348797470331192,
"objective/rlhf_reward": -0.012859970331192017,
"objective/scores": 0.3359375,
"policy/approxkl_avg": 0.00040126274689100683,
"policy/clipfrac_avg": 0.008363310247659683,
"policy/entropy_avg": 0.17609915137290955,
"step": 117,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 48,
"val/ratio": 1.0000150203704834,
"val/ratio_var": 5.960429234619369e-07
},
{
"episode": 7552,
"epoch": 1.4251745612379694,
"eps": 0,
"loss/policy_avg": -0.024843420833349228,
"loss/value_avg": 0.008115454576909542,
"lr": 3.2996632996633e-07,
"objective/entropy": -706.5005493164062,
"objective/kl": 10.046842575073242,
"objective/non_score_reward": -0.30140525102615356,
"objective/rlhf_reward": 0.17369240522384644,
"objective/scores": 0.474609375,
"policy/approxkl_avg": 0.0003913644468411803,
"policy/clipfrac_avg": 0.008209867402911186,
"policy/entropy_avg": 0.16899840533733368,
"step": 118,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 50,
"val/ratio": 1.000002145767212,
"val/ratio_var": 4.6929091013225843e-07
},
{
"episode": 7616,
"epoch": 1.4372523117569354,
"eps": 0,
"loss/policy_avg": -0.027740802615880966,
"loss/value_avg": 0.008931613527238369,
"lr": 3.282828282828283e-07,
"objective/entropy": -657.9352416992188,
"objective/kl": 13.040338516235352,
"objective/non_score_reward": -0.39121013879776,
"objective/rlhf_reward": -0.047460153698921204,
"objective/scores": 0.34375,
"policy/approxkl_avg": 0.0005606787162832916,
"policy/clipfrac_avg": 0.010201944038271904,
"policy/entropy_avg": 0.19644801318645477,
"step": 119,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 51,
"val/ratio": 0.9999899864196777,
"val/ratio_var": 7.442695277859457e-07
},
{
"episode": 7680,
"epoch": 1.449330062275901,
"eps": 0,
"loss/policy_avg": -0.020683161914348602,
"loss/value_avg": 0.009700208902359009,
"lr": 3.265993265993266e-07,
"objective/entropy": -690.8984375,
"objective/kl": 11.422820091247559,
"objective/non_score_reward": -0.3426845967769623,
"objective/rlhf_reward": 0.07919040322303772,
"objective/scores": 0.421875,
"policy/approxkl_avg": 0.0004005637892987579,
"policy/clipfrac_avg": 0.008436471223831177,
"policy/entropy_avg": 0.17758052051067352,
"step": 120,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 47,
"val/ratio": 1.00006103515625,
"val/ratio_var": 6.303826580733585e-07
},
{
"episode": 7744,
"epoch": 1.4614078127948669,
"eps": 0,
"loss/policy_avg": -0.013201544992625713,
"loss/value_avg": 0.008843690156936646,
"lr": 3.249158249158249e-07,
"objective/entropy": -661.7539672851562,
"objective/kl": 11.296271324157715,
"objective/non_score_reward": -0.33888810873031616,
"objective/rlhf_reward": 0.023416556417942047,
"objective/scores": 0.36328125,
"policy/approxkl_avg": 0.00045110570499673486,
"policy/clipfrac_avg": 0.008799891918897629,
"policy/entropy_avg": 0.1949361264705658,
"step": 121,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 50,
"val/ratio": 0.9999641180038452,
"val/ratio_var": 8.055627063185966e-07
},
{
"episode": 7808,
"epoch": 1.4734855633138328,
"eps": 0,
"loss/policy_avg": -0.017284566536545753,
"loss/value_avg": 0.011503017507493496,
"lr": 3.2323232323232327e-07,
"objective/entropy": -659.644287109375,
"objective/kl": 11.284911155700684,
"objective/non_score_reward": -0.33854734897613525,
"objective/rlhf_reward": 0.04646243155002594,
"objective/scores": 0.384765625,
"policy/approxkl_avg": 0.00042937506805174053,
"policy/clipfrac_avg": 0.008618071675300598,
"policy/entropy_avg": 0.19123205542564392,
"step": 122,
"val/clipfrac_avg": 0.00022032562992535532,
"val/num_eos_tokens": 43,
"val/ratio": 1.0000629425048828,
"val/ratio_var": 6.566247634509637e-07
},
{
"episode": 7872,
"epoch": 1.4855633138327986,
"eps": 0,
"loss/policy_avg": -0.021937822923064232,
"loss/value_avg": 0.009018287062644958,
"lr": 3.2154882154882153e-07,
"objective/entropy": -657.0106811523438,
"objective/kl": 10.135122299194336,
"objective/non_score_reward": -0.3040536642074585,
"objective/rlhf_reward": 0.10219632089138031,
"objective/scores": 0.40625,
"policy/approxkl_avg": 0.0003951935505028814,
"policy/clipfrac_avg": 0.008559124544262886,
"policy/entropy_avg": 0.17734527587890625,
"step": 123,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 49,
"val/ratio": 1.0001037120819092,
"val/ratio_var": 6.916661732248031e-07
},
{
"episode": 7936,
"epoch": 1.4976410643517646,
"eps": 0,
"loss/policy_avg": -0.013917829841375351,
"loss/value_avg": 0.009599328972399235,
"lr": 3.1986531986531984e-07,
"objective/entropy": -703.0108642578125,
"objective/kl": 11.34697151184082,
"objective/non_score_reward": -0.34040915966033936,
"objective/rlhf_reward": 0.048262715339660645,
"objective/scores": 0.388671875,
"policy/approxkl_avg": 0.0003819286357611418,
"policy/clipfrac_avg": 0.008146028965711594,
"policy/entropy_avg": 0.1836903989315033,
"step": 124,
"val/clipfrac_avg": 8.00051202531904e-06,
"val/num_eos_tokens": 60,
"val/ratio": 1.0001184940338135,
"val/ratio_var": 8.107883218144707e-07
},
{
"episode": 8000,
"epoch": 1.5097188148707303,
"eps": 0,
"loss/policy_avg": -0.02694123610854149,
"loss/value_avg": 0.00868251547217369,
"lr": 3.1818181818181815e-07,
"objective/entropy": -613.013671875,
"objective/kl": 13.164669036865234,
"objective/non_score_reward": -0.394940048456192,
"objective/rlhf_reward": -0.043865837156772614,
"objective/scores": 0.3515625,
"policy/approxkl_avg": 0.0005462130066007376,
"policy/clipfrac_avg": 0.008205180056393147,
"policy/entropy_avg": 0.20569229125976562,
"step": 125,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 1.0000652074813843,
"val/ratio_var": 7.626629781043448e-07
},
{
"episode": 8064,
"epoch": 1.521796565389696,
"eps": 0,
"loss/policy_avg": -0.021035056561231613,
"loss/value_avg": 0.008920412510633469,
"lr": 3.164983164983165e-07,
"objective/entropy": -658.7114868164062,
"objective/kl": 11.970376968383789,
"objective/non_score_reward": -0.35911130905151367,
"objective/rlhf_reward": 0.07545901089906693,
"objective/scores": 0.43359375,
"policy/approxkl_avg": 0.00043813008232973516,
"policy/clipfrac_avg": 0.00897503923624754,
"policy/entropy_avg": 0.1892774999141693,
"step": 126,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 44,
"val/ratio": 0.9998860359191895,
"val/ratio_var": 5.986601081531262e-07
},
{
"episode": 8128,
"epoch": 1.533874315908662,
"eps": 0,
"loss/policy_avg": -0.009907988831400871,
"loss/value_avg": 0.007388514932245016,
"lr": 3.148148148148148e-07,
"objective/entropy": -683.1361083984375,
"objective/kl": 11.096467971801758,
"objective/non_score_reward": -0.3328940272331238,
"objective/rlhf_reward": 0.006461434066295624,
"objective/scores": 0.33984375,
"policy/approxkl_avg": 0.0004163893754594028,
"policy/clipfrac_avg": 0.00852059293538332,
"policy/entropy_avg": 0.1878509521484375,
"step": 127,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 1.000105381011963,
"val/ratio_var": 7.102952963577991e-07
},
{
"episode": 8192,
"epoch": 1.5459520664276278,
"eps": 0,
"loss/policy_avg": -0.026054969057440758,
"loss/value_avg": 0.0072658974677324295,
"lr": 3.1313131313131313e-07,
"objective/entropy": -708.2806396484375,
"objective/kl": 10.58319091796875,
"objective/non_score_reward": -0.31749576330184937,
"objective/rlhf_reward": 0.10144957154989243,
"objective/scores": 0.41796875,
"policy/approxkl_avg": 0.0004081852675881237,
"policy/clipfrac_avg": 0.008715375326573849,
"policy/entropy_avg": 0.17965063452720642,
"step": 128,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 42,
"val/ratio": 1.0000120401382446,
"val/ratio_var": 7.822923180356156e-07
},
{
"episode": 8256,
"epoch": 1.5580298169465936,
"eps": 0,
"loss/policy_avg": -0.0229483749717474,
"loss/value_avg": 0.007459428161382675,
"lr": 3.1144781144781144e-07,
"objective/entropy": -659.6595458984375,
"objective/kl": 10.650728225708008,
"objective/non_score_reward": -0.31952184438705444,
"objective/rlhf_reward": 0.08672817051410675,
"objective/scores": 0.40625,
"policy/approxkl_avg": 0.00047252658987417817,
"policy/clipfrac_avg": 0.008922006003558636,
"policy/entropy_avg": 0.18110594153404236,
"step": 129,
"val/clipfrac_avg": 4.006410563306417e-06,
"val/num_eos_tokens": 41,
"val/ratio": 1.0000056028366089,
"val/ratio_var": 6.670750281045912e-07
},
{
"episode": 8320,
"epoch": 1.5701075674655596,
"eps": 0,
"loss/policy_avg": -0.02617826871573925,
"loss/value_avg": 0.007010661065578461,
"lr": 3.0976430976430975e-07,
"objective/entropy": -678.2298583984375,
"objective/kl": 12.300437927246094,
"objective/non_score_reward": -0.36901313066482544,
"objective/rlhf_reward": 0.03870171308517456,
"objective/scores": 0.408203125,
"policy/approxkl_avg": 0.0006277774227783084,
"policy/clipfrac_avg": 0.009144840762019157,
"policy/entropy_avg": 0.19200897216796875,
"step": 130,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 43,
"val/ratio": 1.0000146627426147,
"val/ratio_var": 6.612851848331047e-07
},
{
"episode": 8384,
"epoch": 1.5821853179845253,
"eps": 0,
"loss/policy_avg": -0.010964921675622463,
"loss/value_avg": 0.006414837669581175,
"lr": 3.0808080808080806e-07,
"objective/entropy": -716.4149169921875,
"objective/kl": 10.809772491455078,
"objective/non_score_reward": -0.3242931365966797,
"objective/rlhf_reward": 0.038011543452739716,
"objective/scores": 0.36328125,
"policy/approxkl_avg": 0.0003922901814803481,
"policy/clipfrac_avg": 0.00839821808040142,
"policy/entropy_avg": 0.18431854248046875,
"step": 131,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 52,
"val/ratio": 0.9999863505363464,
"val/ratio_var": 6.125242180132773e-07
},
{
"episode": 8448,
"epoch": 1.5942630685034913,
"eps": 0,
"loss/policy_avg": -0.02889040671288967,
"loss/value_avg": 0.00650613009929657,
"lr": 3.063973063973064e-07,
"objective/entropy": -713.104736328125,
"objective/kl": 10.613985061645508,
"objective/non_score_reward": -0.31841954588890076,
"objective/rlhf_reward": 0.11712733656167984,
"objective/scores": 0.435546875,
"policy/approxkl_avg": 0.00041002966463565826,
"policy/clipfrac_avg": 0.008486878126859665,
"policy/entropy_avg": 0.1799418181180954,
"step": 132,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 44,
"val/ratio": 1.0001921653747559,
"val/ratio_var": 7.679034865759604e-07
},
{
"episode": 8512,
"epoch": 1.606340819022457,
"eps": 0,
"loss/policy_avg": -0.017285965383052826,
"loss/value_avg": 0.005914734210819006,
"lr": 3.047138047138047e-07,
"objective/entropy": -597.8193359375,
"objective/kl": 10.843616485595703,
"objective/non_score_reward": -0.3253084719181061,
"objective/rlhf_reward": 0.024300895631313324,
"objective/scores": 0.349609375,
"policy/approxkl_avg": 0.0005316430469974875,
"policy/clipfrac_avg": 0.008745117112994194,
"policy/entropy_avg": 0.18398921191692352,
"step": 133,
"val/clipfrac_avg": 1.3069845408608671e-05,
"val/num_eos_tokens": 43,
"val/ratio": 1.0001111030578613,
"val/ratio_var": 1.0886411700994358e-06
},
{
"episode": 8576,
"epoch": 1.6184185695414228,
"eps": 0,
"loss/policy_avg": -0.0031869204249233007,
"loss/value_avg": 0.0062155104242265224,
"lr": 3.0303030303030305e-07,
"objective/entropy": -668.35791015625,
"objective/kl": 11.458172798156738,
"objective/non_score_reward": -0.3437451720237732,
"objective/rlhf_reward": 0.018315374851226807,
"objective/scores": 0.361328125,
"policy/approxkl_avg": 0.00043556466698646545,
"policy/clipfrac_avg": 0.009600733406841755,
"policy/entropy_avg": 0.1877404898405075,
"step": 134,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 44,
"val/ratio": 1.0000466108322144,
"val/ratio_var": 5.28957230017113e-07
},
{
"episode": 8640,
"epoch": 1.6304963200603888,
"eps": 0,
"loss/policy_avg": -0.02297011762857437,
"loss/value_avg": 0.005568277090787888,
"lr": 3.0134680134680136e-07,
"objective/entropy": -664.0026245117188,
"objective/kl": 10.111173629760742,
"objective/non_score_reward": -0.30333518981933594,
"objective/rlhf_reward": 0.15467262268066406,
"objective/scores": 0.45703125,
"policy/approxkl_avg": 0.0005884866695851088,
"policy/clipfrac_avg": 0.008443444035947323,
"policy/entropy_avg": 0.1873784065246582,
"step": 135,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 49,
"val/ratio": 0.9999123811721802,
"val/ratio_var": 6.365415856635082e-07
},
{
"episode": 8704,
"epoch": 1.6425740705793546,
"eps": 0,
"loss/policy_avg": -0.032759517431259155,
"loss/value_avg": 0.005268210079520941,
"lr": 2.9966329966329967e-07,
"objective/entropy": -600.26708984375,
"objective/kl": 11.405153274536133,
"objective/non_score_reward": -0.3421545624732971,
"objective/rlhf_reward": 0.05628291517496109,
"objective/scores": 0.3984375,
"policy/approxkl_avg": 0.0004980469821020961,
"policy/clipfrac_avg": 0.00921311229467392,
"policy/entropy_avg": 0.1832377165555954,
"step": 136,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 1.0000090599060059,
"val/ratio_var": 8.463471772302e-07
},
{
"episode": 8768,
"epoch": 1.6546518210983205,
"eps": 0,
"loss/policy_avg": -0.02364802360534668,
"loss/value_avg": 0.005613654851913452,
"lr": 2.9797979797979793e-07,
"objective/entropy": -710.4251708984375,
"objective/kl": 10.040770530700684,
"objective/non_score_reward": -0.30122309923171997,
"objective/rlhf_reward": 0.18656986951828003,
"objective/scores": 0.48828125,
"policy/approxkl_avg": 0.00038759977906011045,
"policy/clipfrac_avg": 0.008407797664403915,
"policy/entropy_avg": 0.17165374755859375,
"step": 137,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 48,
"val/ratio": 1.0000735521316528,
"val/ratio_var": 5.960384896752657e-07
},
{
"episode": 8832,
"epoch": 1.6667295716172863,
"eps": 0,
"loss/policy_avg": -0.02203012816607952,
"loss/value_avg": 0.00595112843438983,
"lr": 2.962962962962963e-07,
"objective/entropy": -672.9989624023438,
"objective/kl": 10.036592483520508,
"objective/non_score_reward": -0.3010977804660797,
"objective/rlhf_reward": 0.07243738323450089,
"objective/scores": 0.373046875,
"policy/approxkl_avg": 0.00040325592271983624,
"policy/clipfrac_avg": 0.008593715727329254,
"policy/entropy_avg": 0.17470209300518036,
"step": 138,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 49,
"val/ratio": 0.9999663233757019,
"val/ratio_var": 5.66750088637491e-07
},
{
"episode": 8896,
"epoch": 1.678807322136252,
"eps": 0,
"loss/policy_avg": -0.026858514174818993,
"loss/value_avg": 0.005622117780148983,
"lr": 2.946127946127946e-07,
"objective/entropy": -661.4876708984375,
"objective/kl": 11.446596145629883,
"objective/non_score_reward": -0.343397855758667,
"objective/rlhf_reward": 0.11412166804075241,
"objective/scores": 0.45703125,
"policy/approxkl_avg": 0.00048249890096485615,
"policy/clipfrac_avg": 0.008328979834914207,
"policy/entropy_avg": 0.18089675903320312,
"step": 139,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 42,
"val/ratio": 1.0000536441802979,
"val/ratio_var": 7.740272849332541e-07
},
{
"episode": 8960,
"epoch": 1.690885072655218,
"eps": 0,
"loss/policy_avg": -0.012071679346263409,
"loss/value_avg": 0.005673854611814022,
"lr": 2.929292929292929e-07,
"objective/entropy": -716.2989501953125,
"objective/kl": 9.62091064453125,
"objective/non_score_reward": -0.2886272668838501,
"objective/rlhf_reward": 0.07660708576440811,
"objective/scores": 0.365234375,
"policy/approxkl_avg": 0.0004490650608204305,
"policy/clipfrac_avg": 0.008078145794570446,
"policy/entropy_avg": 0.17590078711509705,
"step": 140,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 44,
"val/ratio": 0.9998876452445984,
"val/ratio_var": 7.121066687432176e-07
},
{
"episode": 9024,
"epoch": 1.7029628231741838,
"eps": 0,
"loss/policy_avg": -0.01407882571220398,
"loss/value_avg": 0.005721048917621374,
"lr": 2.912457912457912e-07,
"objective/entropy": -635.1033325195312,
"objective/kl": 11.491706848144531,
"objective/non_score_reward": -0.34475117921829224,
"objective/rlhf_reward": 0.012670673429965973,
"objective/scores": 0.357421875,
"policy/approxkl_avg": 0.00046376598766073585,
"policy/clipfrac_avg": 0.008426757529377937,
"policy/entropy_avg": 0.18351492285728455,
"step": 141,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 0.9998941421508789,
"val/ratio_var": 7.447699772455962e-07
},
{
"episode": 9088,
"epoch": 1.7150405736931496,
"eps": 0,
"loss/policy_avg": -0.026968976482748985,
"loss/value_avg": 0.005361597985029221,
"lr": 2.895622895622896e-07,
"objective/entropy": -688.4439086914062,
"objective/kl": 10.791741371154785,
"objective/non_score_reward": -0.323752224445343,
"objective/rlhf_reward": 0.10886494815349579,
"objective/scores": 0.43359375,
"policy/approxkl_avg": 0.00042141028097830713,
"policy/clipfrac_avg": 0.008808997459709644,
"policy/entropy_avg": 0.18024954199790955,
"step": 142,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 1.0000135898590088,
"val/ratio_var": 5.709226229555497e-07
},
{
"episode": 9152,
"epoch": 1.7271183242121155,
"eps": 0,
"loss/policy_avg": -0.027867591008543968,
"loss/value_avg": 0.005742911249399185,
"lr": 2.878787878787879e-07,
"objective/entropy": -663.561279296875,
"objective/kl": 9.091588973999023,
"objective/non_score_reward": -0.272747665643692,
"objective/rlhf_reward": 0.17207655310630798,
"objective/scores": 0.4453125,
"policy/approxkl_avg": 0.00043330591870471835,
"policy/clipfrac_avg": 0.008460369892418385,
"policy/entropy_avg": 0.19001516699790955,
"step": 143,
"val/clipfrac_avg": 6.127450888016028e-06,
"val/num_eos_tokens": 39,
"val/ratio": 1.0000602006912231,
"val/ratio_var": 6.067602953407913e-07
},
{
"episode": 9216,
"epoch": 1.7391960747310813,
"eps": 0,
"loss/policy_avg": -0.048003654927015305,
"loss/value_avg": 0.004765670746564865,
"lr": 2.8619528619528615e-07,
"objective/entropy": -609.2054443359375,
"objective/kl": 11.087499618530273,
"objective/non_score_reward": -0.33262500166893005,
"objective/rlhf_reward": 0.15663282573223114,
"objective/scores": 0.48828125,
"policy/approxkl_avg": 0.00043542124330997467,
"policy/clipfrac_avg": 0.008409352041780949,
"policy/entropy_avg": 0.18917052447795868,
"step": 144,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 42,
"val/ratio": 0.9998970031738281,
"val/ratio_var": 7.055933224364708e-07
},
{
"episode": 9280,
"epoch": 1.7512738252500473,
"eps": 0,
"loss/policy_avg": -0.022776823490858078,
"loss/value_avg": 0.005445465445518494,
"lr": 2.8451178451178446e-07,
"objective/entropy": -649.3797607421875,
"objective/kl": 11.225455284118652,
"objective/non_score_reward": -0.3367636799812317,
"objective/rlhf_reward": 0.056791022419929504,
"objective/scores": 0.39453125,
"policy/approxkl_avg": 0.00043666589772328734,
"policy/clipfrac_avg": 0.008527948521077633,
"policy/entropy_avg": 0.1882273405790329,
"step": 145,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 42,
"val/ratio": 1.0000823736190796,
"val/ratio_var": 7.544516051893879e-07
},
{
"episode": 9344,
"epoch": 1.763351575769013,
"eps": 0,
"loss/policy_avg": -0.02756238356232643,
"loss/value_avg": 0.005087685771286488,
"lr": 2.8282828282828283e-07,
"objective/entropy": -622.69384765625,
"objective/kl": 10.821589469909668,
"objective/non_score_reward": -0.3246476650238037,
"objective/rlhf_reward": 0.08306717872619629,
"objective/scores": 0.408203125,
"policy/approxkl_avg": 0.00043378135887905955,
"policy/clipfrac_avg": 0.008366484194993973,
"policy/entropy_avg": 0.18392562866210938,
"step": 146,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 45,
"val/ratio": 0.9999205470085144,
"val/ratio_var": 7.217399229375587e-07
},
{
"episode": 9408,
"epoch": 1.7754293262879788,
"eps": 0,
"loss/policy_avg": -0.01545548252761364,
"loss/value_avg": 0.005610906984657049,
"lr": 2.8114478114478114e-07,
"objective/entropy": -710.6151733398438,
"objective/kl": 10.56408977508545,
"objective/non_score_reward": -0.316922664642334,
"objective/rlhf_reward": 0.09079217165708542,
"objective/scores": 0.408203125,
"policy/approxkl_avg": 0.00047942213132046163,
"policy/clipfrac_avg": 0.008930440992116928,
"policy/entropy_avg": 0.1723581999540329,
"step": 147,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 48,
"val/ratio": 0.9999326467514038,
"val/ratio_var": 5.841548045282252e-07
},
{
"episode": 9472,
"epoch": 1.7875070768069448,
"eps": 0,
"loss/policy_avg": -0.030797995626926422,
"loss/value_avg": 0.005243232008069754,
"lr": 2.7946127946127945e-07,
"objective/entropy": -729.7215576171875,
"objective/kl": 7.677038192749023,
"objective/non_score_reward": -0.23031114041805267,
"objective/rlhf_reward": 0.26822400093078613,
"objective/scores": 0.498046875,
"policy/approxkl_avg": 0.0003217764897271991,
"policy/clipfrac_avg": 0.007711475715041161,
"policy/entropy_avg": 0.15406641364097595,
"step": 148,
"val/clipfrac_avg": 2.40384615608491e-05,
"val/num_eos_tokens": 51,
"val/ratio": 0.9999207854270935,
"val/ratio_var": 6.207331466612231e-07
},
{
"episode": 9536,
"epoch": 1.7995848273259105,
"eps": 0,
"loss/policy_avg": -0.013873748481273651,
"loss/value_avg": 0.005824836902320385,
"lr": 2.7777777777777776e-07,
"objective/entropy": -724.1422729492188,
"objective/kl": 9.410755157470703,
"objective/non_score_reward": -0.28232264518737793,
"objective/rlhf_reward": 0.10390782356262207,
"objective/scores": 0.38671875,
"policy/approxkl_avg": 0.00034999821218661964,
"policy/clipfrac_avg": 0.00804916676133871,
"policy/entropy_avg": 0.1864827573299408,
"step": 149,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 49,
"val/ratio": 1.0000414848327637,
"val/ratio_var": 6.957430400689191e-07
},
{
"episode": 9600,
"epoch": 1.8116625778448765,
"eps": 0,
"loss/policy_avg": -0.03579283133149147,
"loss/value_avg": 0.0054255155846476555,
"lr": 2.760942760942761e-07,
"objective/entropy": -673.954833984375,
"objective/kl": 11.36821174621582,
"objective/non_score_reward": -0.3410463333129883,
"objective/rlhf_reward": 0.10768412053585052,
"objective/scores": 0.44921875,
"policy/approxkl_avg": 0.00042447494342923164,
"policy/clipfrac_avg": 0.00855330191552639,
"policy/entropy_avg": 0.19662603735923767,
"step": 150,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 53,
"val/ratio": 1.0001200437545776,
"val/ratio_var": 7.517347171415167e-07
},
{
"episode": 9664,
"epoch": 1.8237403283638423,
"eps": 0,
"loss/policy_avg": -0.008654760196805,
"loss/value_avg": 0.005138866137713194,
"lr": 2.7441077441077443e-07,
"objective/entropy": -660.7220458984375,
"objective/kl": 11.443527221679688,
"objective/non_score_reward": -0.3433057963848114,
"objective/rlhf_reward": 0.031938336789608,
"objective/scores": 0.375,
"policy/approxkl_avg": 0.00044237799011170864,
"policy/clipfrac_avg": 0.008529680781066418,
"policy/entropy_avg": 0.18802008032798767,
"step": 151,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 47,
"val/ratio": 1.0000139474868774,
"val/ratio_var": 7.119359111129597e-07
},
{
"episode": 9728,
"epoch": 1.835818078882808,
"eps": 0,
"loss/policy_avg": -0.02010141685605049,
"loss/value_avg": 0.0053723035380244255,
"lr": 2.727272727272727e-07,
"objective/entropy": -687.9390869140625,
"objective/kl": 9.019115447998047,
"objective/non_score_reward": -0.2705734670162201,
"objective/rlhf_reward": 0.0761062279343605,
"objective/scores": 0.34765625,
"policy/approxkl_avg": 0.00040762912249192595,
"policy/clipfrac_avg": 0.008179357275366783,
"policy/entropy_avg": 0.18950526416301727,
"step": 152,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 47,
"val/ratio": 0.9998996257781982,
"val/ratio_var": 6.454416165979637e-07
},
{
"episode": 9792,
"epoch": 1.847895829401774,
"eps": 0,
"loss/policy_avg": -0.003955461550503969,
"loss/value_avg": 0.0057320622727274895,
"lr": 2.71043771043771e-07,
"objective/entropy": -709.93115234375,
"objective/kl": 9.436538696289062,
"objective/non_score_reward": -0.2830961346626282,
"objective/rlhf_reward": 0.09629838913679123,
"objective/scores": 0.37890625,
"policy/approxkl_avg": 0.00038871431024745107,
"policy/clipfrac_avg": 0.0074014379642903805,
"policy/entropy_avg": 0.18319067358970642,
"step": 153,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 48,
"val/ratio": 1.0000646114349365,
"val/ratio_var": 5.15572935455566e-07
},
{
"episode": 9856,
"epoch": 1.8599735799207398,
"eps": 0,
"loss/policy_avg": -0.02044486068189144,
"loss/value_avg": 0.00510798767209053,
"lr": 2.6936026936026936e-07,
"objective/entropy": -618.0216064453125,
"objective/kl": 10.58153247833252,
"objective/non_score_reward": -0.3174459636211395,
"objective/rlhf_reward": 0.05462434142827988,
"objective/scores": 0.37109375,
"policy/approxkl_avg": 0.00045742533984594047,
"policy/clipfrac_avg": 0.008799334987998009,
"policy/entropy_avg": 0.2110799252986908,
"step": 154,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 43,
"val/ratio": 0.9999656677246094,
"val/ratio_var": 6.85510656239785e-07
},
{
"episode": 9920,
"epoch": 1.8720513304397055,
"eps": 0,
"loss/policy_avg": -0.030187513679265976,
"loss/value_avg": 0.0049795545637607574,
"lr": 2.676767676767677e-07,
"objective/entropy": -632.8074951171875,
"objective/kl": 10.401787757873535,
"objective/non_score_reward": -0.3120536208152771,
"objective/rlhf_reward": 0.12495807558298111,
"objective/scores": 0.4375,
"policy/approxkl_avg": 0.0004211895284242928,
"policy/clipfrac_avg": 0.007907616905868053,
"policy/entropy_avg": 0.20108795166015625,
"step": 155,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 47,
"val/ratio": 0.999963641166687,
"val/ratio_var": 6.760049586773675e-07
},
{
"episode": 9984,
"epoch": 1.8841290809586715,
"eps": 0,
"loss/policy_avg": -0.029912468045949936,
"loss/value_avg": 0.005053409840911627,
"lr": 2.65993265993266e-07,
"objective/entropy": -617.66943359375,
"objective/kl": 11.245973587036133,
"objective/non_score_reward": -0.33737921714782715,
"objective/rlhf_reward": 0.07375361770391464,
"objective/scores": 0.41015625,
"policy/approxkl_avg": 0.00048606080235913396,
"policy/clipfrac_avg": 0.00850730575621128,
"policy/entropy_avg": 0.2064310759305954,
"step": 156,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 40,
"val/ratio": 1.000071406364441,
"val/ratio_var": 9.804023193282774e-07
},
{
"episode": 10048,
"epoch": 1.8962068314776372,
"eps": 0,
"loss/policy_avg": -0.04771365970373154,
"loss/value_avg": 0.004816756118088961,
"lr": 2.643097643097643e-07,
"objective/entropy": -655.9871826171875,
"objective/kl": 9.88708782196045,
"objective/non_score_reward": -0.2966126501560211,
"objective/rlhf_reward": 0.19313344359397888,
"objective/scores": 0.490234375,
"policy/approxkl_avg": 0.00041826663073152304,
"policy/clipfrac_avg": 0.00776095874607563,
"policy/entropy_avg": 0.18035888671875,
"step": 157,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 43,
"val/ratio": 1.0000017881393433,
"val/ratio_var": 6.541795869452471e-07
},
{
"episode": 10112,
"epoch": 1.9082845819966032,
"eps": 0,
"loss/policy_avg": -0.015434409491717815,
"loss/value_avg": 0.004857035353779793,
"lr": 2.6262626262626266e-07,
"objective/entropy": -639.9647216796875,
"objective/kl": 9.765108108520508,
"objective/non_score_reward": -0.2929532527923584,
"objective/rlhf_reward": 0.09498622268438339,
"objective/scores": 0.388671875,
"policy/approxkl_avg": 0.00043778051622211933,
"policy/clipfrac_avg": 0.009346296079456806,
"policy/entropy_avg": 0.1973876953125,
"step": 158,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 42,
"val/ratio": 1.0000224113464355,
"val/ratio_var": 8.540955036551168e-07
},
{
"episode": 10176,
"epoch": 1.920362332515569,
"eps": 0,
"loss/policy_avg": -0.029195090755820274,
"loss/value_avg": 0.004645572509616613,
"lr": 2.609427609427609e-07,
"objective/entropy": -655.7578125,
"objective/kl": 11.296720504760742,
"objective/non_score_reward": -0.3389016389846802,
"objective/rlhf_reward": 0.10152805596590042,
"objective/scores": 0.44140625,
"policy/approxkl_avg": 0.0004221507697366178,
"policy/clipfrac_avg": 0.008524060249328613,
"policy/entropy_avg": 0.1910552978515625,
"step": 159,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 48,
"val/ratio": 0.999944806098938,
"val/ratio_var": 5.19986315339338e-07
},
{
"episode": 10240,
"epoch": 1.9324400830345347,
"eps": 0,
"loss/policy_avg": -0.026694564148783684,
"loss/value_avg": 0.0052330996841192245,
"lr": 2.5925925925925923e-07,
"objective/entropy": -705.4793701171875,
"objective/kl": 9.390169143676758,
"objective/non_score_reward": -0.2817050516605377,
"objective/rlhf_reward": 0.17459377646446228,
"objective/scores": 0.45703125,
"policy/approxkl_avg": 0.000462901167338714,
"policy/clipfrac_avg": 0.00837808009237051,
"policy/entropy_avg": 0.18092474341392517,
"step": 160,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 40,
"val/ratio": 1.0000178813934326,
"val/ratio_var": 6.675589929727721e-07
},
{
"episode": 10304,
"epoch": 1.9445178335535007,
"eps": 0,
"loss/policy_avg": -0.03287532925605774,
"loss/value_avg": 0.004949102643877268,
"lr": 2.5757575757575754e-07,
"objective/entropy": -731.1031494140625,
"objective/kl": 9.570659637451172,
"objective/non_score_reward": -0.2871198058128357,
"objective/rlhf_reward": 0.2748919129371643,
"objective/scores": 0.5625,
"policy/approxkl_avg": 0.00036840554093942046,
"policy/clipfrac_avg": 0.008850205689668655,
"policy/entropy_avg": 0.17508062720298767,
"step": 161,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 0.9999874830245972,
"val/ratio_var": 6.138056392046565e-07
},
{
"episode": 10368,
"epoch": 1.9565955840724665,
"eps": 0,
"loss/policy_avg": -0.028138628229498863,
"loss/value_avg": 0.004971574060618877,
"lr": 2.558922558922559e-07,
"objective/entropy": -692.5169677734375,
"objective/kl": 9.982763290405273,
"objective/non_score_reward": -0.29948288202285767,
"objective/rlhf_reward": 0.18977493047714233,
"objective/scores": 0.48828125,
"policy/approxkl_avg": 0.0013259215047582984,
"policy/clipfrac_avg": 0.007515524979680777,
"policy/entropy_avg": 0.18165206909179688,
"step": 162,
"val/clipfrac_avg": 4.633748631022172e-06,
"val/num_eos_tokens": 40,
"val/ratio": 0.9999215006828308,
"val/ratio_var": 6.52264759537502e-07
},
{
"episode": 10432,
"epoch": 1.9686733345914325,
"eps": 0,
"loss/policy_avg": -0.007500559091567993,
"loss/value_avg": 0.005513847805559635,
"lr": 2.542087542087542e-07,
"objective/entropy": -709.478515625,
"objective/kl": 8.840448379516602,
"objective/non_score_reward": -0.2652134299278259,
"objective/rlhf_reward": 0.15617327392101288,
"objective/scores": 0.421875,
"policy/approxkl_avg": 0.00044637074461206794,
"policy/clipfrac_avg": 0.007994147948920727,
"policy/entropy_avg": 0.185394287109375,
"step": 163,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 45,
"val/ratio": 1.0000178813934326,
"val/ratio_var": 6.560539986821823e-07
},
{
"episode": 10496,
"epoch": 1.9807510851103982,
"eps": 0,
"loss/policy_avg": -0.012414928525686264,
"loss/value_avg": 0.004942988511174917,
"lr": 2.525252525252525e-07,
"objective/entropy": -699.0042724609375,
"objective/kl": 9.015774726867676,
"objective/non_score_reward": -0.2704732418060303,
"objective/rlhf_reward": 0.17386269569396973,
"objective/scores": 0.4453125,
"policy/approxkl_avg": 0.0003955226275138557,
"policy/clipfrac_avg": 0.00792492926120758,
"policy/entropy_avg": 0.18563461303710938,
"step": 164,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 48,
"val/ratio": 1.0000154972076416,
"val/ratio_var": 7.629991500834876e-07
},
{
"episode": 10560,
"epoch": 1.992828835629364,
"eps": 0,
"loss/policy_avg": -0.009898051619529724,
"loss/value_avg": 0.004960807505995035,
"lr": 2.5084175084175083e-07,
"objective/entropy": -661.8831787109375,
"objective/kl": 9.828740119934082,
"objective/non_score_reward": -0.29486221075057983,
"objective/rlhf_reward": 0.04888780415058136,
"objective/scores": 0.34375,
"policy/approxkl_avg": 0.00043535567237995565,
"policy/clipfrac_avg": 0.008342267014086246,
"policy/entropy_avg": 0.1956939697265625,
"step": 165,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 42,
"val/ratio": 0.9997992515563965,
"val/ratio_var": 8.213489763875259e-07
},
{
"episode": 10624,
"epoch": 2.00490658614833,
"eps": 0,
"loss/policy_avg": 0.0006142702768556774,
"loss/value_avg": 0.0057089244946837425,
"lr": 2.4915824915824914e-07,
"objective/entropy": -616.60546875,
"objective/kl": 11.599992752075195,
"objective/non_score_reward": -0.34799978137016296,
"objective/rlhf_reward": -0.018409937620162964,
"objective/scores": 0.330078125,
"policy/approxkl_avg": 0.000447861006250605,
"policy/clipfrac_avg": 0.008485405705869198,
"policy/entropy_avg": 0.19893011450767517,
"step": 166,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 35,
"val/ratio": 0.9999897480010986,
"val/ratio_var": 8.826224302538321e-07
},
{
"episode": 10688,
"epoch": 2.0169843366672957,
"eps": 0,
"loss/policy_avg": -0.029148969799280167,
"loss/value_avg": 0.0051203519105911255,
"lr": 2.4747474747474745e-07,
"objective/entropy": -707.4173583984375,
"objective/kl": 9.282992362976074,
"objective/non_score_reward": -0.27848976850509644,
"objective/rlhf_reward": 0.21418601274490356,
"objective/scores": 0.4921875,
"policy/approxkl_avg": 0.00035875054891221225,
"policy/clipfrac_avg": 0.007683487143367529,
"policy/entropy_avg": 0.15778478980064392,
"step": 167,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 51,
"val/ratio": 0.9999899864196777,
"val/ratio_var": 5.554063591262093e-07
},
{
"episode": 10752,
"epoch": 2.0290620871862615,
"eps": 0,
"loss/policy_avg": -0.020282533019781113,
"loss/value_avg": 0.00484459986910224,
"lr": 2.4579124579124576e-07,
"objective/entropy": -612.2288818359375,
"objective/kl": 10.996728897094727,
"objective/non_score_reward": -0.32990187406539917,
"objective/rlhf_reward": 0.05437546968460083,
"objective/scores": 0.384765625,
"policy/approxkl_avg": 0.0004697911790572107,
"policy/clipfrac_avg": 0.00856415368616581,
"policy/entropy_avg": 0.19417700171470642,
"step": 168,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 42,
"val/ratio": 1.0000133514404297,
"val/ratio_var": 9.346218803329975e-07
},
{
"episode": 10816,
"epoch": 2.0411398377052272,
"eps": 0,
"loss/policy_avg": -0.0368703156709671,
"loss/value_avg": 0.005465418100357056,
"lr": 2.441077441077441e-07,
"objective/entropy": -655.9403076171875,
"objective/kl": 10.055724143981934,
"objective/non_score_reward": -0.30167171359062195,
"objective/rlhf_reward": 0.18416813015937805,
"objective/scores": 0.486328125,
"policy/approxkl_avg": 0.00042185792699456215,
"policy/clipfrac_avg": 0.00785021297633648,
"policy/entropy_avg": 0.17223486304283142,
"step": 169,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 41,
"val/ratio": 1.0000656843185425,
"val/ratio_var": 6.886172627673659e-07
},
{
"episode": 10880,
"epoch": 2.0532175882241934,
"eps": 0,
"loss/policy_avg": -0.05005396902561188,
"loss/value_avg": 0.00523067032918334,
"lr": 2.4242424242424244e-07,
"objective/entropy": -686.6971435546875,
"objective/kl": 10.104284286499023,
"objective/non_score_reward": -0.30312851071357727,
"objective/rlhf_reward": 0.20663711428642273,
"objective/scores": 0.5078125,
"policy/approxkl_avg": 0.0010287510231137276,
"policy/clipfrac_avg": 0.0077257584780454636,
"policy/entropy_avg": 0.16867446899414062,
"step": 170,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 49,
"val/ratio": 1.0000211000442505,
"val/ratio_var": 8.372552429136704e-07
},
{
"episode": 10944,
"epoch": 2.065295338743159,
"eps": 0,
"loss/policy_avg": -0.025773359462618828,
"loss/value_avg": 0.004905715584754944,
"lr": 2.407407407407407e-07,
"objective/entropy": -692.7886352539062,
"objective/kl": 8.267000198364258,
"objective/non_score_reward": -0.24800997972488403,
"objective/rlhf_reward": 0.18021267652511597,
"objective/scores": 0.427734375,
"policy/approxkl_avg": 0.0003900658048223704,
"policy/clipfrac_avg": 0.008449830114841461,
"policy/entropy_avg": 0.16828536987304688,
"step": 171,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 50,
"val/ratio": 1.000011682510376,
"val/ratio_var": 7.061549354148156e-07
},
{
"episode": 11008,
"epoch": 2.077373089262125,
"eps": 0,
"loss/policy_avg": -0.013189585879445076,
"loss/value_avg": 0.005143987946212292,
"lr": 2.3905723905723906e-07,
"objective/entropy": -726.6845703125,
"objective/kl": 9.84701156616211,
"objective/non_score_reward": -0.2954103648662567,
"objective/rlhf_reward": 0.2207029163837433,
"objective/scores": 0.515625,
"policy/approxkl_avg": 0.000674139242619276,
"policy/clipfrac_avg": 0.008698908612132072,
"policy/entropy_avg": 0.17170843482017517,
"step": 172,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 44,
"val/ratio": 1.000104308128357,
"val/ratio_var": 6.811029038544802e-07
},
{
"episode": 11072,
"epoch": 2.0894508397810907,
"eps": 0,
"loss/policy_avg": -0.015965035185217857,
"loss/value_avg": 0.004617646336555481,
"lr": 2.3737373737373737e-07,
"objective/entropy": -650.63671875,
"objective/kl": 11.198604583740234,
"objective/non_score_reward": -0.3359581232070923,
"objective/rlhf_reward": 0.009256713092327118,
"objective/scores": 0.345703125,
"policy/approxkl_avg": 0.0004264616873115301,
"policy/clipfrac_avg": 0.008644884452223778,
"policy/entropy_avg": 0.1950274407863617,
"step": 173,
"val/clipfrac_avg": 5.36388643013197e-06,
"val/num_eos_tokens": 48,
"val/ratio": 1.000180721282959,
"val/ratio_var": 1.0091738431583508e-06
},
{
"episode": 11136,
"epoch": 2.1015285903000565,
"eps": 0,
"loss/policy_avg": -0.034316565841436386,
"loss/value_avg": 0.004706418141722679,
"lr": 2.3569023569023568e-07,
"objective/entropy": -705.0397338867188,
"objective/kl": 8.905685424804688,
"objective/non_score_reward": -0.26717060804367065,
"objective/rlhf_reward": 0.27775126695632935,
"objective/scores": 0.546875,
"policy/approxkl_avg": 0.0004036706523038447,
"policy/clipfrac_avg": 0.007710058242082596,
"policy/entropy_avg": 0.1774342954158783,
"step": 174,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 41,
"val/ratio": 0.9999071955680847,
"val/ratio_var": 6.414345534722088e-07
},
{
"episode": 11200,
"epoch": 2.1136063408190227,
"eps": 0,
"loss/policy_avg": -0.02365967631340027,
"loss/value_avg": 0.004380353260785341,
"lr": 2.34006734006734e-07,
"objective/entropy": -633.821044921875,
"objective/kl": 9.5961332321167,
"objective/non_score_reward": -0.287883996963501,
"objective/rlhf_reward": 0.11690116673707962,
"objective/scores": 0.404296875,
"policy/approxkl_avg": 0.00041495892219245434,
"policy/clipfrac_avg": 0.008334731683135033,
"policy/entropy_avg": 0.18314361572265625,
"step": 175,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 40,
"val/ratio": 1.0000516176223755,
"val/ratio_var": 5.880065145902336e-07
},
{
"episode": 11264,
"epoch": 2.1256840913379884,
"eps": 0,
"loss/policy_avg": -0.012720011174678802,
"loss/value_avg": 0.004893209785223007,
"lr": 2.323232323232323e-07,
"objective/entropy": -651.2650146484375,
"objective/kl": 9.453258514404297,
"objective/non_score_reward": -0.28359776735305786,
"objective/rlhf_reward": 0.10409756004810333,
"objective/scores": 0.38671875,
"policy/approxkl_avg": 0.0007613954949192703,
"policy/clipfrac_avg": 0.007399224676191807,
"policy/entropy_avg": 0.17717742919921875,
"step": 176,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 1.0000202655792236,
"val/ratio_var": 6.240634888854402e-07
},
{
"episode": 11328,
"epoch": 2.137761841856954,
"eps": 0,
"loss/policy_avg": -0.027373038232326508,
"loss/value_avg": 0.0046966951340436935,
"lr": 2.3063973063973064e-07,
"objective/entropy": -717.2123413085938,
"objective/kl": 8.953085899353027,
"objective/non_score_reward": -0.26859256625175476,
"objective/rlhf_reward": 0.21578243374824524,
"objective/scores": 0.484375,
"policy/approxkl_avg": 0.00035203900188207626,
"policy/clipfrac_avg": 0.007572174072265625,
"policy/entropy_avg": 0.159637451171875,
"step": 177,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 54,
"val/ratio": 1.0000574588775635,
"val/ratio_var": 5.12606447955477e-07
},
{
"episode": 11392,
"epoch": 2.14983959237592,
"eps": 0,
"loss/policy_avg": -0.006380847655236721,
"loss/value_avg": 0.0049251774325966835,
"lr": 2.2895622895622895e-07,
"objective/entropy": -712.037841796875,
"objective/kl": 8.245269775390625,
"objective/non_score_reward": -0.24735809862613678,
"objective/rlhf_reward": 0.13057157397270203,
"objective/scores": 0.37890625,
"policy/approxkl_avg": 0.0003549880930222571,
"policy/clipfrac_avg": 0.007925866171717644,
"policy/entropy_avg": 0.1665090024471283,
"step": 178,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 1.0001332759857178,
"val/ratio_var": 5.663550268764084e-07
},
{
"episode": 11456,
"epoch": 2.1619173428948857,
"eps": 0,
"loss/policy_avg": -0.0014782699290663004,
"loss/value_avg": 0.004644377622753382,
"lr": 2.2727272727272726e-07,
"objective/entropy": -662.6466064453125,
"objective/kl": 9.573324203491211,
"objective/non_score_reward": -0.2871997356414795,
"objective/rlhf_reward": 0.0892651155591011,
"objective/scores": 0.376953125,
"policy/approxkl_avg": 0.0003882443706970662,
"policy/clipfrac_avg": 0.007613882422447205,
"policy/entropy_avg": 0.17233356833457947,
"step": 179,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 45,
"val/ratio": 1.0000505447387695,
"val/ratio_var": 5.133804279466858e-07
},
{
"episode": 11520,
"epoch": 2.173995093413852,
"eps": 0,
"loss/policy_avg": -0.01099494006484747,
"loss/value_avg": 0.00442532729357481,
"lr": 2.2558922558922557e-07,
"objective/entropy": -641.4156494140625,
"objective/kl": 9.477804183959961,
"objective/non_score_reward": -0.28433412313461304,
"objective/rlhf_reward": 0.05966003239154816,
"objective/scores": 0.34375,
"policy/approxkl_avg": 0.0004196200461592525,
"policy/clipfrac_avg": 0.008064374327659607,
"policy/entropy_avg": 0.1929067075252533,
"step": 180,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 40,
"val/ratio": 0.9999755620956421,
"val/ratio_var": 6.541202992593753e-07
},
{
"episode": 11584,
"epoch": 2.1860728439328176,
"eps": 0,
"loss/policy_avg": -0.0352904237806797,
"loss/value_avg": 0.004265302326530218,
"lr": 2.239057239057239e-07,
"objective/entropy": -642.61669921875,
"objective/kl": 10.230189323425293,
"objective/non_score_reward": -0.30690568685531616,
"objective/rlhf_reward": 0.11008650809526443,
"objective/scores": 0.41796875,
"policy/approxkl_avg": 0.0004299771972000599,
"policy/clipfrac_avg": 0.008415701799094677,
"policy/entropy_avg": 0.18710581958293915,
"step": 181,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 49,
"val/ratio": 1.0000258684158325,
"val/ratio_var": 7.682069735892583e-07
},
{
"episode": 11648,
"epoch": 2.1981505944517834,
"eps": 0,
"loss/policy_avg": -0.018043210729956627,
"loss/value_avg": 0.004413206595927477,
"lr": 2.222222222222222e-07,
"objective/entropy": -659.050537109375,
"objective/kl": 9.415782928466797,
"objective/non_score_reward": -0.28247350454330444,
"objective/rlhf_reward": 0.09106165170669556,
"objective/scores": 0.373046875,
"policy/approxkl_avg": 0.00045144298928789794,
"policy/clipfrac_avg": 0.008088944479823112,
"policy/entropy_avg": 0.18143844604492188,
"step": 182,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 39,
"val/ratio": 1.000122308731079,
"val/ratio_var": 6.821082365604525e-07
},
{
"episode": 11712,
"epoch": 2.210228344970749,
"eps": 0,
"loss/policy_avg": -0.01957491599023342,
"loss/value_avg": 0.004746252205222845,
"lr": 2.2053872053872053e-07,
"objective/entropy": -673.1300659179688,
"objective/kl": 10.322259902954102,
"objective/non_score_reward": -0.30966776609420776,
"objective/rlhf_reward": 0.12490253895521164,
"objective/scores": 0.43359375,
"policy/approxkl_avg": 0.00038717369898222387,
"policy/clipfrac_avg": 0.007530445232987404,
"policy/entropy_avg": 0.18404261767864227,
"step": 183,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 47,
"val/ratio": 1.000051498413086,
"val/ratio_var": 7.273123401319026e-07
},
{
"episode": 11776,
"epoch": 2.222306095489715,
"eps": 0,
"loss/policy_avg": 0.01077682338654995,
"loss/value_avg": 0.004722831770777702,
"lr": 2.1885521885521884e-07,
"objective/entropy": -616.521484375,
"objective/kl": 10.416351318359375,
"objective/non_score_reward": -0.3124905228614807,
"objective/rlhf_reward": -0.03417021036148071,
"objective/scores": 0.27734375,
"policy/approxkl_avg": 0.0005021474789828062,
"policy/clipfrac_avg": 0.008746202103793621,
"policy/entropy_avg": 0.2025197446346283,
"step": 184,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 42,
"val/ratio": 0.9999576807022095,
"val/ratio_var": 6.454907293118595e-07
},
{
"episode": 11840,
"epoch": 2.2343838460086807,
"eps": 0,
"loss/policy_avg": -0.0027505457401275635,
"loss/value_avg": 0.0048131197690963745,
"lr": 2.1717171717171718e-07,
"objective/entropy": -694.5650634765625,
"objective/kl": 9.623019218444824,
"objective/non_score_reward": -0.28869056701660156,
"objective/rlhf_reward": 0.10217857360839844,
"objective/scores": 0.390625,
"policy/approxkl_avg": 0.0003909420920535922,
"policy/clipfrac_avg": 0.009193172678351402,
"policy/entropy_avg": 0.17480087280273438,
"step": 185,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 49,
"val/ratio": 1.00017249584198,
"val/ratio_var": 6.435092814172094e-07
},
{
"episode": 11904,
"epoch": 2.246461596527647,
"eps": 0,
"loss/policy_avg": -0.02041536569595337,
"loss/value_avg": 0.005415412597358227,
"lr": 2.1548821548821546e-07,
"objective/entropy": -636.8676147460938,
"objective/kl": 10.659719467163086,
"objective/non_score_reward": -0.3197915852069855,
"objective/rlhf_reward": 0.11331389844417572,
"objective/scores": 0.43359375,
"policy/approxkl_avg": 0.00040606613038107753,
"policy/clipfrac_avg": 0.008012184873223305,
"policy/entropy_avg": 0.18035762012004852,
"step": 186,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 49,
"val/ratio": 0.9999368190765381,
"val/ratio_var": 5.389789521359489e-07
},
{
"episode": 11968,
"epoch": 2.2585393470466126,
"eps": 0,
"loss/policy_avg": -0.00026329857064411044,
"loss/value_avg": 0.0059758638963103294,
"lr": 2.138047138047138e-07,
"objective/entropy": -671.1956787109375,
"objective/kl": 10.020427703857422,
"objective/non_score_reward": -0.30061283707618713,
"objective/rlhf_reward": 0.10319577157497406,
"objective/scores": 0.404296875,
"policy/approxkl_avg": 0.0004317883576732129,
"policy/clipfrac_avg": 0.007914026267826557,
"policy/entropy_avg": 0.19041061401367188,
"step": 187,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 44,
"val/ratio": 1.0000234842300415,
"val/ratio_var": 6.567967147930176e-07
},
{
"episode": 12032,
"epoch": 2.2706170975655784,
"eps": 0,
"loss/policy_avg": -0.016351381316781044,
"loss/value_avg": 0.004221225157380104,
"lr": 2.121212121212121e-07,
"objective/entropy": -581.1964721679688,
"objective/kl": 11.140705108642578,
"objective/non_score_reward": -0.3342211842536926,
"objective/rlhf_reward": 0.006599150598049164,
"objective/scores": 0.33984375,
"policy/approxkl_avg": 0.0005099625559523702,
"policy/clipfrac_avg": 0.008483211509883404,
"policy/entropy_avg": 0.2137502133846283,
"step": 188,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 1.00002121925354,
"val/ratio_var": 7.082234105837415e-07
},
{
"episode": 12096,
"epoch": 2.282694848084544,
"eps": 0,
"loss/policy_avg": -0.004529799334704876,
"loss/value_avg": 0.00449700653553009,
"lr": 2.1043771043771044e-07,
"objective/entropy": -733.6138305664062,
"objective/kl": 9.21728229522705,
"objective/non_score_reward": -0.27651846408843994,
"objective/rlhf_reward": 0.08773934096097946,
"objective/scores": 0.36328125,
"policy/approxkl_avg": 0.00036096826079301536,
"policy/clipfrac_avg": 0.0077320970594882965,
"policy/entropy_avg": 0.16572698950767517,
"step": 189,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 48,
"val/ratio": 0.9999610781669617,
"val/ratio_var": 5.408356287261995e-07
},
{
"episode": 12160,
"epoch": 2.29477259860351,
"eps": 0,
"loss/policy_avg": -0.011266498826444149,
"loss/value_avg": 0.004761071410030127,
"lr": 2.0875420875420873e-07,
"objective/entropy": -695.8765869140625,
"objective/kl": 10.30718994140625,
"objective/non_score_reward": -0.3092157244682312,
"objective/rlhf_reward": 0.1111944392323494,
"objective/scores": 0.419921875,
"policy/approxkl_avg": 0.0003880340082105249,
"policy/clipfrac_avg": 0.008478551171720028,
"policy/entropy_avg": 0.1890207976102829,
"step": 190,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 51,
"val/ratio": 0.9999769926071167,
"val/ratio_var": 6.441308642024524e-07
},
{
"episode": 12224,
"epoch": 2.306850349122476,
"eps": 0,
"loss/policy_avg": -0.020808562636375427,
"loss/value_avg": 0.00449121231213212,
"lr": 2.0707070707070707e-07,
"objective/entropy": -632.7673950195312,
"objective/kl": 9.880966186523438,
"objective/non_score_reward": -0.29642897844314575,
"objective/rlhf_reward": 0.15034836530685425,
"objective/scores": 0.447265625,
"policy/approxkl_avg": 0.0005901949480175972,
"policy/clipfrac_avg": 0.007830065675079823,
"policy/entropy_avg": 0.1974080502986908,
"step": 191,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 43,
"val/ratio": 0.9999300241470337,
"val/ratio_var": 5.447765261124005e-07
},
{
"episode": 12288,
"epoch": 2.318928099641442,
"eps": 0,
"loss/policy_avg": -0.01821664161980152,
"loss/value_avg": 0.004452117718756199,
"lr": 2.0538720538720538e-07,
"objective/entropy": -589.8839721679688,
"objective/kl": 11.97592544555664,
"objective/non_score_reward": -0.35927775502204895,
"objective/rlhf_reward": -0.04506877064704895,
"objective/scores": 0.314453125,
"policy/approxkl_avg": 0.000494088395498693,
"policy/clipfrac_avg": 0.008692565374076366,
"policy/entropy_avg": 0.20995458960533142,
"step": 192,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 45,
"val/ratio": 1.000139594078064,
"val/ratio_var": 7.803449193488632e-07
},
{
"episode": 12352,
"epoch": 2.3310058501604076,
"eps": 0,
"loss/policy_avg": 0.004255164880305529,
"loss/value_avg": 0.004434296861290932,
"lr": 2.0370370370370369e-07,
"objective/entropy": -708.4127197265625,
"objective/kl": 9.581413269042969,
"objective/non_score_reward": -0.2874424159526825,
"objective/rlhf_reward": 0.0768154114484787,
"objective/scores": 0.36328125,
"policy/approxkl_avg": 0.0004709034110419452,
"policy/clipfrac_avg": 0.007526259869337082,
"policy/entropy_avg": 0.19600550830364227,
"step": 193,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 52,
"val/ratio": 0.9999573230743408,
"val/ratio_var": 6.966297405597288e-07
},
{
"episode": 12416,
"epoch": 2.3430836006793734,
"eps": 0,
"loss/policy_avg": -0.030452650040388107,
"loss/value_avg": 0.004531817510724068,
"lr": 2.02020202020202e-07,
"objective/entropy": -637.3931274414062,
"objective/kl": 10.20181941986084,
"objective/non_score_reward": -0.30605456233024597,
"objective/rlhf_reward": 0.14560559391975403,
"objective/scores": 0.451171875,
"policy/approxkl_avg": 0.0004737515118904412,
"policy/clipfrac_avg": 0.00787600688636303,
"policy/entropy_avg": 0.1870168149471283,
"step": 194,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 44,
"val/ratio": 1.0000483989715576,
"val/ratio_var": 8.018863582037739e-07
},
{
"episode": 12480,
"epoch": 2.355161351198339,
"eps": 0,
"loss/policy_avg": -0.014566441997885704,
"loss/value_avg": 0.004195361863821745,
"lr": 2.0033670033670033e-07,
"objective/entropy": -724.8233642578125,
"objective/kl": 8.487796783447266,
"objective/non_score_reward": -0.25463390350341797,
"objective/rlhf_reward": 0.19556137919425964,
"objective/scores": 0.44921875,
"policy/approxkl_avg": 0.00040911376709118485,
"policy/clipfrac_avg": 0.008261054754257202,
"policy/entropy_avg": 0.1738535612821579,
"step": 195,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 49,
"val/ratio": 1.0000691413879395,
"val/ratio_var": 5.904955173718918e-07
},
{
"episode": 12544,
"epoch": 2.3672391017173053,
"eps": 0,
"loss/policy_avg": -0.022108733654022217,
"loss/value_avg": 0.0041517410427331924,
"lr": 1.9865319865319864e-07,
"objective/entropy": -605.9757080078125,
"objective/kl": 10.502775192260742,
"objective/non_score_reward": -0.31508326530456543,
"objective/rlhf_reward": 0.07261204719543457,
"objective/scores": 0.38671875,
"policy/approxkl_avg": 0.00047133685438893735,
"policy/clipfrac_avg": 0.007839309982955456,
"policy/entropy_avg": 0.19527944922447205,
"step": 196,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 45,
"val/ratio": 0.9999372959136963,
"val/ratio_var": 8.094798431557138e-07
},
{
"episode": 12608,
"epoch": 2.379316852236271,
"eps": 0,
"loss/policy_avg": -0.02246645651757717,
"loss/value_avg": 0.004199231043457985,
"lr": 1.9696969696969696e-07,
"objective/entropy": -639.3452758789062,
"objective/kl": 9.353677749633789,
"objective/non_score_reward": -0.2806103229522705,
"objective/rlhf_reward": 0.0772998258471489,
"objective/scores": 0.357421875,
"policy/approxkl_avg": 0.0004011366399936378,
"policy/clipfrac_avg": 0.007909499108791351,
"policy/entropy_avg": 0.1853407323360443,
"step": 197,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 0.9999942779541016,
"val/ratio_var": 5.837881076331541e-07
},
{
"episode": 12672,
"epoch": 2.391394602755237,
"eps": 0,
"loss/policy_avg": -0.035098060965538025,
"loss/value_avg": 0.004067492671310902,
"lr": 1.9528619528619527e-07,
"objective/entropy": -698.6280517578125,
"objective/kl": 8.177114486694336,
"objective/non_score_reward": -0.24531343579292297,
"objective/rlhf_reward": 0.18925687670707703,
"objective/scores": 0.43359375,
"policy/approxkl_avg": 0.00038718507857993245,
"policy/clipfrac_avg": 0.008279062807559967,
"policy/entropy_avg": 0.18160438537597656,
"step": 198,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 47,
"val/ratio": 1.0000050067901611,
"val/ratio_var": 5.738119170928258e-07
},
{
"episode": 12736,
"epoch": 2.4034723532742026,
"eps": 0,
"loss/policy_avg": -0.022469520568847656,
"loss/value_avg": 0.00446331687271595,
"lr": 1.936026936026936e-07,
"objective/entropy": -676.8287963867188,
"objective/kl": 7.68222713470459,
"objective/non_score_reward": -0.2304668128490448,
"objective/rlhf_reward": 0.1865253746509552,
"objective/scores": 0.41796875,
"policy/approxkl_avg": 0.00038739325827918947,
"policy/clipfrac_avg": 0.008100518956780434,
"policy/entropy_avg": 0.17807134985923767,
"step": 199,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 0.9999980926513672,
"val/ratio_var": 7.030320716694405e-07
},
{
"episode": 12800,
"epoch": 2.4155501037931684,
"eps": 0,
"loss/policy_avg": -0.008693840354681015,
"loss/value_avg": 0.004308072850108147,
"lr": 1.9191919191919189e-07,
"objective/entropy": -678.0347900390625,
"objective/kl": 8.073586463928223,
"objective/non_score_reward": -0.2422075867652893,
"objective/rlhf_reward": 0.1515912413597107,
"objective/scores": 0.39453125,
"policy/approxkl_avg": 0.00046054236008785665,
"policy/clipfrac_avg": 0.008118792437016964,
"policy/entropy_avg": 0.19109344482421875,
"step": 200,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 45,
"val/ratio": 0.999969482421875,
"val/ratio_var": 5.471772510645678e-07
},
{
"episode": 12864,
"epoch": 2.4276278543121346,
"eps": 0,
"loss/policy_avg": -0.02215806394815445,
"loss/value_avg": 0.00444747693836689,
"lr": 1.9023569023569022e-07,
"objective/entropy": -715.016357421875,
"objective/kl": 8.792640686035156,
"objective/non_score_reward": -0.2637792229652405,
"objective/rlhf_reward": 0.22645515203475952,
"objective/scores": 0.490234375,
"policy/approxkl_avg": 0.0003587045648600906,
"policy/clipfrac_avg": 0.0076547968201339245,
"policy/entropy_avg": 0.1849416196346283,
"step": 201,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 49,
"val/ratio": 1.000044584274292,
"val/ratio_var": 5.405810838965408e-07
},
{
"episode": 12928,
"epoch": 2.4397056048311003,
"eps": 0,
"loss/policy_avg": 0.004962640814483166,
"loss/value_avg": 0.004433006979525089,
"lr": 1.8855218855218853e-07,
"objective/entropy": -646.1973876953125,
"objective/kl": 9.110560417175293,
"objective/non_score_reward": -0.27331680059432983,
"objective/rlhf_reward": 0.10168319940567017,
"objective/scores": 0.375,
"policy/approxkl_avg": 0.0004163091944064945,
"policy/clipfrac_avg": 0.007584965787827969,
"policy/entropy_avg": 0.18780645728111267,
"step": 202,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 51,
"val/ratio": 0.9999338984489441,
"val/ratio_var": 9.266473739444336e-07
},
{
"episode": 12992,
"epoch": 2.451783355350066,
"eps": 0,
"loss/policy_avg": -0.011171831749379635,
"loss/value_avg": 0.0047411127015948296,
"lr": 1.8686868686868687e-07,
"objective/entropy": -690.5891723632812,
"objective/kl": 8.7337646484375,
"objective/non_score_reward": -0.26201295852661133,
"objective/rlhf_reward": 0.13154172897338867,
"objective/scores": 0.39453125,
"policy/approxkl_avg": 0.00038152310298755765,
"policy/clipfrac_avg": 0.007856165990233421,
"policy/entropy_avg": 0.1933492124080658,
"step": 203,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 47,
"val/ratio": 0.9999629855155945,
"val/ratio_var": 6.461675070568162e-07
},
{
"episode": 13056,
"epoch": 2.463861105869032,
"eps": 0,
"loss/policy_avg": -0.03274771571159363,
"loss/value_avg": 0.004181091673672199,
"lr": 1.8518518518518516e-07,
"objective/entropy": -734.372802734375,
"objective/kl": 6.88037109375,
"objective/non_score_reward": -0.20641113817691803,
"objective/rlhf_reward": 0.25159668922424316,
"objective/scores": 0.45703125,
"policy/approxkl_avg": 0.0003489043447189033,
"policy/clipfrac_avg": 0.007448253687471151,
"policy/entropy_avg": 0.17730967700481415,
"step": 204,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 53,
"val/ratio": 1.0000073909759521,
"val/ratio_var": 6.07091635629331e-07
},
{
"episode": 13120,
"epoch": 2.4759388563879976,
"eps": 0,
"loss/policy_avg": -0.016816487535834312,
"loss/value_avg": 0.0042554219253361225,
"lr": 1.835016835016835e-07,
"objective/entropy": -794.4978637695312,
"objective/kl": 5.602289199829102,
"objective/non_score_reward": -0.16806866228580475,
"objective/rlhf_reward": 0.39443135261535645,
"objective/scores": 0.5625,
"policy/approxkl_avg": 0.0002819629153236747,
"policy/clipfrac_avg": 0.006934004835784435,
"policy/entropy_avg": 0.15758514404296875,
"step": 205,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 48,
"val/ratio": 0.9999727010726929,
"val/ratio_var": 4.5193257847131463e-07
},
{
"episode": 13184,
"epoch": 2.4880166069069634,
"eps": 0,
"loss/policy_avg": -0.017609162256121635,
"loss/value_avg": 0.00427992781624198,
"lr": 1.818181818181818e-07,
"objective/entropy": -712.7396850585938,
"objective/kl": 8.044666290283203,
"objective/non_score_reward": -0.24133998155593872,
"objective/rlhf_reward": 0.19616001844406128,
"objective/scores": 0.4375,
"policy/approxkl_avg": 0.000342213868862018,
"policy/clipfrac_avg": 0.006520026829093695,
"policy/entropy_avg": 0.17752330005168915,
"step": 206,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 47,
"val/ratio": 0.9999613165855408,
"val/ratio_var": 5.999673931000871e-07
},
{
"episode": 13248,
"epoch": 2.5000943574259296,
"eps": 0,
"loss/policy_avg": -0.03822872042655945,
"loss/value_avg": 0.0038146479055285454,
"lr": 1.8013468013468014e-07,
"objective/entropy": -676.6316528320312,
"objective/kl": 8.43276596069336,
"objective/non_score_reward": -0.2529829740524292,
"objective/rlhf_reward": 0.2226029485464096,
"objective/scores": 0.4765625,
"policy/approxkl_avg": 0.00040029053343459964,
"policy/clipfrac_avg": 0.007741398643702269,
"policy/entropy_avg": 0.18157577514648438,
"step": 207,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 38,
"val/ratio": 1.0000382661819458,
"val/ratio_var": 7.777077826176537e-07
},
{
"episode": 13312,
"epoch": 2.5121721079448953,
"eps": 0,
"loss/policy_avg": -0.017532743513584137,
"loss/value_avg": 0.004521360620856285,
"lr": 1.7845117845117842e-07,
"objective/entropy": -642.2452392578125,
"objective/kl": 9.599320411682129,
"objective/non_score_reward": -0.2879796028137207,
"objective/rlhf_reward": 0.0982508733868599,
"objective/scores": 0.38671875,
"policy/approxkl_avg": 0.0005058823153376579,
"policy/clipfrac_avg": 0.007621736731380224,
"policy/entropy_avg": 0.19717535376548767,
"step": 208,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 45,
"val/ratio": 0.9999377727508545,
"val/ratio_var": 6.733653208357282e-07
},
{
"episode": 13376,
"epoch": 2.524249858463861,
"eps": 0,
"loss/policy_avg": -0.012456863187253475,
"loss/value_avg": 0.004330017603933811,
"lr": 1.7676767676767676e-07,
"objective/entropy": -684.7198486328125,
"objective/kl": 8.890013694763184,
"objective/non_score_reward": -0.2667003870010376,
"objective/rlhf_reward": 0.1131824254989624,
"objective/scores": 0.37890625,
"policy/approxkl_avg": 0.0003934592823497951,
"policy/clipfrac_avg": 0.008775051683187485,
"policy/entropy_avg": 0.18945693969726562,
"step": 209,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 44,
"val/ratio": 1.0001251697540283,
"val/ratio_var": 6.748607574991183e-07
},
{
"episode": 13440,
"epoch": 2.536327608982827,
"eps": 0,
"loss/policy_avg": -0.01945299468934536,
"loss/value_avg": 0.0042161582969129086,
"lr": 1.7508417508417507e-07,
"objective/entropy": -704.7137451171875,
"objective/kl": 9.013429641723633,
"objective/non_score_reward": -0.27040284872055054,
"objective/rlhf_reward": 0.13438229262828827,
"objective/scores": 0.404296875,
"policy/approxkl_avg": 0.0003603402874432504,
"policy/clipfrac_avg": 0.008057435974478722,
"policy/entropy_avg": 0.18650183081626892,
"step": 210,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 50,
"val/ratio": 1.0000133514404297,
"val/ratio_var": 5.646013505611336e-07
},
{
"episode": 13504,
"epoch": 2.5484053595017926,
"eps": 0,
"loss/policy_avg": -0.01828945055603981,
"loss/value_avg": 0.004183897748589516,
"lr": 1.7340067340067338e-07,
"objective/entropy": -715.1771240234375,
"objective/kl": 7.647032737731934,
"objective/non_score_reward": -0.22941097617149353,
"objective/rlhf_reward": 0.21931949257850647,
"objective/scores": 0.44921875,
"policy/approxkl_avg": 0.0005932983476668596,
"policy/clipfrac_avg": 0.006059914827346802,
"policy/entropy_avg": 0.17059580981731415,
"step": 211,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 51,
"val/ratio": 0.9999650716781616,
"val/ratio_var": 4.5039382712275255e-07
},
{
"episode": 13568,
"epoch": 2.560483110020759,
"eps": 0,
"loss/policy_avg": -0.01205519214272499,
"loss/value_avg": 0.004262065049260855,
"lr": 1.717171717171717e-07,
"objective/entropy": -680.666015625,
"objective/kl": 9.011266708374023,
"objective/non_score_reward": -0.2703379988670349,
"objective/rlhf_reward": 0.14860734343528748,
"objective/scores": 0.41796875,
"policy/approxkl_avg": 0.0003913857217412442,
"policy/clipfrac_avg": 0.008144761435687542,
"policy/entropy_avg": 0.1854146420955658,
"step": 212,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 47,
"val/ratio": 1.000110387802124,
"val/ratio_var": 4.5980641516507603e-07
},
{
"episode": 13632,
"epoch": 2.5725608605397245,
"eps": 0,
"loss/policy_avg": -0.0006327772280201316,
"loss/value_avg": 0.004324252717196941,
"lr": 1.7003367003367003e-07,
"objective/entropy": -680.5126953125,
"objective/kl": 8.441967010498047,
"objective/non_score_reward": -0.25325900316238403,
"objective/rlhf_reward": 0.11051052808761597,
"objective/scores": 0.36328125,
"policy/approxkl_avg": 0.0003816906246356666,
"policy/clipfrac_avg": 0.007939379662275314,
"policy/entropy_avg": 0.19525527954101562,
"step": 213,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 1.0000630617141724,
"val/ratio_var": 6.067442654966726e-07
},
{
"episode": 13696,
"epoch": 2.5846386110586903,
"eps": 0,
"loss/policy_avg": -0.03070930764079094,
"loss/value_avg": 0.0039781369268894196,
"lr": 1.6835016835016837e-07,
"objective/entropy": -648.4881591796875,
"objective/kl": 8.808910369873047,
"objective/non_score_reward": -0.26426729559898376,
"objective/rlhf_reward": 0.16542020440101624,
"objective/scores": 0.4296875,
"policy/approxkl_avg": 0.0004112160240765661,
"policy/clipfrac_avg": 0.008430849760770798,
"policy/entropy_avg": 0.18458303809165955,
"step": 214,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 41,
"val/ratio": 1.0000827312469482,
"val/ratio_var": 5.764974844169046e-07
},
{
"episode": 13760,
"epoch": 2.596716361577656,
"eps": 0,
"loss/policy_avg": -0.01275802031159401,
"loss/value_avg": 0.004106822889298201,
"lr": 1.6666666666666665e-07,
"objective/entropy": -706.819580078125,
"objective/kl": 8.268199920654297,
"objective/non_score_reward": -0.24804598093032837,
"objective/rlhf_reward": 0.18847745656967163,
"objective/scores": 0.4375,
"policy/approxkl_avg": 0.0003638950875028968,
"policy/clipfrac_avg": 0.00869040098041296,
"policy/entropy_avg": 0.18547821044921875,
"step": 215,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 1.0000922679901123,
"val/ratio_var": 5.714567805625848e-07
},
{
"episode": 13824,
"epoch": 2.608794112096622,
"eps": 0,
"loss/policy_avg": -0.014732430689036846,
"loss/value_avg": 0.004122150130569935,
"lr": 1.64983164983165e-07,
"objective/entropy": -678.539306640625,
"objective/kl": 8.51124382019043,
"objective/non_score_reward": -0.25533732771873474,
"objective/rlhf_reward": 0.13821735978126526,
"objective/scores": 0.39453125,
"policy/approxkl_avg": 0.0003934210108127445,
"policy/clipfrac_avg": 0.008303534239530563,
"policy/entropy_avg": 0.19069163501262665,
"step": 216,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 1.0001001358032227,
"val/ratio_var": 6.961653866710549e-07
},
{
"episode": 13888,
"epoch": 2.620871862615588,
"eps": 0,
"loss/policy_avg": -0.0114736994728446,
"loss/value_avg": 0.004468954633921385,
"lr": 1.632996632996633e-07,
"objective/entropy": -760.7581787109375,
"objective/kl": 6.764780044555664,
"objective/non_score_reward": -0.20294338464736938,
"objective/rlhf_reward": 0.2609238028526306,
"objective/scores": 0.46484375,
"policy/approxkl_avg": 0.000336907512973994,
"policy/clipfrac_avg": 0.007351381238549948,
"policy/entropy_avg": 0.17353948950767517,
"step": 217,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 50,
"val/ratio": 1.000031590461731,
"val/ratio_var": 5.147477963873826e-07
},
{
"episode": 13952,
"epoch": 2.632949613134554,
"eps": 0,
"loss/policy_avg": -0.025034895166754723,
"loss/value_avg": 0.004216045141220093,
"lr": 1.6161616161616163e-07,
"objective/entropy": -698.8699951171875,
"objective/kl": 7.570901870727539,
"objective/non_score_reward": -0.2271270602941513,
"objective/rlhf_reward": 0.2767791748046875,
"objective/scores": 0.50390625,
"policy/approxkl_avg": 0.0004006924282293767,
"policy/clipfrac_avg": 0.007912924513220787,
"policy/entropy_avg": 0.17928314208984375,
"step": 218,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 52,
"val/ratio": 1.0000429153442383,
"val/ratio_var": 5.616009843834036e-07
},
{
"episode": 14016,
"epoch": 2.6450273636535195,
"eps": 0,
"loss/policy_avg": -0.010850876569747925,
"loss/value_avg": 0.004759899340569973,
"lr": 1.5993265993265992e-07,
"objective/entropy": -677.7619018554688,
"objective/kl": 9.129847526550293,
"objective/non_score_reward": -0.27389541268348694,
"objective/rlhf_reward": 0.10403427481651306,
"objective/scores": 0.37890625,
"policy/approxkl_avg": 0.00039389575249515474,
"policy/clipfrac_avg": 0.008119095116853714,
"policy/entropy_avg": 0.19026947021484375,
"step": 219,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 43,
"val/ratio": 0.9999043345451355,
"val/ratio_var": 6.120080797700211e-07
},
{
"episode": 14080,
"epoch": 2.6571051141724853,
"eps": 0,
"loss/policy_avg": 0.0019542532972991467,
"loss/value_avg": 0.004348535090684891,
"lr": 1.5824915824915826e-07,
"objective/entropy": -694.8416748046875,
"objective/kl": 7.038139343261719,
"objective/non_score_reward": -0.21114417910575867,
"objective/rlhf_reward": 0.17752769589424133,
"objective/scores": 0.388671875,
"policy/approxkl_avg": 0.0003559057950042188,
"policy/clipfrac_avg": 0.007189783733338118,
"policy/entropy_avg": 0.16869863867759705,
"step": 220,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 38,
"val/ratio": 0.9999446868896484,
"val/ratio_var": 6.206274747455609e-07
},
{
"episode": 14144,
"epoch": 2.669182864691451,
"eps": 0,
"loss/policy_avg": -0.013687599450349808,
"loss/value_avg": 0.004020463675260544,
"lr": 1.5656565656565657e-07,
"objective/entropy": -699.5345458984375,
"objective/kl": 8.55117416381836,
"objective/non_score_reward": -0.25653523206710815,
"objective/rlhf_reward": 0.11699993908405304,
"objective/scores": 0.373046875,
"policy/approxkl_avg": 0.00037346064345911145,
"policy/clipfrac_avg": 0.008171791210770607,
"policy/entropy_avg": 0.19686762988567352,
"step": 221,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 49,
"val/ratio": 1.0000358819961548,
"val/ratio_var": 5.536150524676486e-07
},
{
"episode": 14208,
"epoch": 2.6812606152104173,
"eps": 0,
"loss/policy_avg": -0.016610831022262573,
"loss/value_avg": 0.003959144465625286,
"lr": 1.5488215488215488e-07,
"objective/entropy": -733.2947998046875,
"objective/kl": 7.419867515563965,
"objective/non_score_reward": -0.22259601950645447,
"objective/rlhf_reward": 0.25543132424354553,
"objective/scores": 0.478515625,
"policy/approxkl_avg": 0.0003535112482495606,
"policy/clipfrac_avg": 0.007308521773666143,
"policy/entropy_avg": 0.1968231201171875,
"step": 222,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 50,
"val/ratio": 0.9999250173568726,
"val/ratio_var": 4.929243573315034e-07
},
{
"episode": 14272,
"epoch": 2.693338365729383,
"eps": 0,
"loss/policy_avg": -0.03384992107748985,
"loss/value_avg": 0.003912989050149918,
"lr": 1.531986531986532e-07,
"objective/entropy": -646.734130859375,
"objective/kl": 8.461997985839844,
"objective/non_score_reward": -0.25385990738868713,
"objective/rlhf_reward": 0.16215571761131287,
"objective/scores": 0.416015625,
"policy/approxkl_avg": 0.0005014871712774038,
"policy/clipfrac_avg": 0.00828520953655243,
"policy/entropy_avg": 0.19067637622356415,
"step": 223,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 45,
"val/ratio": 1.0000280141830444,
"val/ratio_var": 7.016036533968872e-07
},
{
"episode": 14336,
"epoch": 2.7054161162483488,
"eps": 0,
"loss/policy_avg": -0.0035064732655882835,
"loss/value_avg": 0.003849966451525688,
"lr": 1.5151515151515152e-07,
"objective/entropy": -686.4854736328125,
"objective/kl": 8.041788101196289,
"objective/non_score_reward": -0.24125364422798157,
"objective/rlhf_reward": 0.12544558942317963,
"objective/scores": 0.3671875,
"policy/approxkl_avg": 0.00039225397631525993,
"policy/clipfrac_avg": 0.00783010758459568,
"policy/entropy_avg": 0.19433467090129852,
"step": 224,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 43,
"val/ratio": 0.9999794960021973,
"val/ratio_var": 5.356313295123982e-07
},
{
"episode": 14400,
"epoch": 2.7174938667673145,
"eps": 0,
"loss/policy_avg": -0.005889839958399534,
"loss/value_avg": 0.004009313881397247,
"lr": 1.4983164983164983e-07,
"objective/entropy": -688.8871459960938,
"objective/kl": 7.107503890991211,
"objective/non_score_reward": -0.2132251262664795,
"objective/rlhf_reward": 0.1681225299835205,
"objective/scores": 0.380859375,
"policy/approxkl_avg": 0.00035966013092547655,
"policy/clipfrac_avg": 0.0072424449026584625,
"policy/entropy_avg": 0.1872762143611908,
"step": 225,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 44,
"val/ratio": 1.0000863075256348,
"val/ratio_var": 6.394271281351394e-07
},
{
"episode": 14464,
"epoch": 2.7295716172862803,
"eps": 0,
"loss/policy_avg": -0.03164386376738548,
"loss/value_avg": 0.003974028863012791,
"lr": 1.4814814814814815e-07,
"objective/entropy": -636.7869873046875,
"objective/kl": 9.086867332458496,
"objective/non_score_reward": -0.2726060152053833,
"objective/rlhf_reward": 0.1607435941696167,
"objective/scores": 0.43359375,
"policy/approxkl_avg": 0.0004679976846091449,
"policy/clipfrac_avg": 0.007549212779849768,
"policy/entropy_avg": 0.1928914487361908,
"step": 226,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 42,
"val/ratio": 0.9999264478683472,
"val/ratio_var": 5.871561938874947e-07
},
{
"episode": 14528,
"epoch": 2.7416493678052465,
"eps": 0,
"loss/policy_avg": -0.029706722125411034,
"loss/value_avg": 0.003999405540525913,
"lr": 1.4646464646464646e-07,
"objective/entropy": -714.131103515625,
"objective/kl": 8.953620910644531,
"objective/non_score_reward": -0.26860859990119934,
"objective/rlhf_reward": 0.22016091644763947,
"objective/scores": 0.48828125,
"policy/approxkl_avg": 0.0004997976357117295,
"policy/clipfrac_avg": 0.0073518408462405205,
"policy/entropy_avg": 0.19288381934165955,
"step": 227,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 50,
"val/ratio": 0.999913215637207,
"val/ratio_var": 6.02664385951357e-07
},
{
"episode": 14592,
"epoch": 2.7537271183242122,
"eps": 0,
"loss/policy_avg": -0.024763260036706924,
"loss/value_avg": 0.003725615097209811,
"lr": 1.447811447811448e-07,
"objective/entropy": -598.0142822265625,
"objective/kl": 8.618000030517578,
"objective/non_score_reward": -0.2585400342941284,
"objective/rlhf_reward": 0.17554199695587158,
"objective/scores": 0.43359375,
"policy/approxkl_avg": 0.00044109029113315046,
"policy/clipfrac_avg": 0.008174901828169823,
"policy/entropy_avg": 0.207733154296875,
"step": 228,
"val/clipfrac_avg": 1.2475050425564405e-05,
"val/num_eos_tokens": 35,
"val/ratio": 0.9999794363975525,
"val/ratio_var": 7.014916718617314e-07
},
{
"episode": 14656,
"epoch": 2.765804868843178,
"eps": 0,
"loss/policy_avg": -0.025254826992750168,
"loss/value_avg": 0.004053793381899595,
"lr": 1.4309764309764308e-07,
"objective/entropy": -670.363037109375,
"objective/kl": 7.579680442810059,
"objective/non_score_reward": -0.22739042341709137,
"objective/rlhf_reward": 0.23550020158290863,
"objective/scores": 0.462890625,
"policy/approxkl_avg": 0.00038262922316789627,
"policy/clipfrac_avg": 0.007922390475869179,
"policy/entropy_avg": 0.18486277759075165,
"step": 229,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 36,
"val/ratio": 1.0000662803649902,
"val/ratio_var": 6.002863983667339e-07
},
{
"episode": 14720,
"epoch": 2.7778826193621438,
"eps": 0,
"loss/policy_avg": -0.009143723174929619,
"loss/value_avg": 0.004357549827545881,
"lr": 1.4141414141414141e-07,
"objective/entropy": -646.9794921875,
"objective/kl": 9.054327964782715,
"objective/non_score_reward": -0.27162984013557434,
"objective/rlhf_reward": 0.09873150289058685,
"objective/scores": 0.37109375,
"policy/approxkl_avg": 0.00039568787906318903,
"policy/clipfrac_avg": 0.007754582446068525,
"policy/entropy_avg": 0.1911672055721283,
"step": 230,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 51,
"val/ratio": 1.000026822090149,
"val/ratio_var": 7.080183195284917e-07
},
{
"episode": 14784,
"epoch": 2.7899603698811095,
"eps": 0,
"loss/policy_avg": -0.03357026353478432,
"loss/value_avg": 0.00450306199491024,
"lr": 1.3973063973063972e-07,
"objective/entropy": -694.3631591796875,
"objective/kl": 7.8914995193481445,
"objective/non_score_reward": -0.2367449700832367,
"objective/rlhf_reward": 0.2432354986667633,
"objective/scores": 0.48046875,
"policy/approxkl_avg": 0.00037845989572815597,
"policy/clipfrac_avg": 0.007996518164873123,
"policy/entropy_avg": 0.19372813403606415,
"step": 231,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 0.9998430013656616,
"val/ratio_var": 5.800552003165649e-07
},
{
"episode": 14848,
"epoch": 2.8020381204000753,
"eps": 0,
"loss/policy_avg": 0.001023156102746725,
"loss/value_avg": 0.004101088736206293,
"lr": 1.3804713804713806e-07,
"objective/entropy": -682.6402587890625,
"objective/kl": 9.124707221984863,
"objective/non_score_reward": -0.273741215467453,
"objective/rlhf_reward": 0.102479487657547,
"objective/scores": 0.376953125,
"policy/approxkl_avg": 0.0006178760668262839,
"policy/clipfrac_avg": 0.007577064447104931,
"policy/entropy_avg": 0.19171142578125,
"step": 232,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 53,
"val/ratio": 0.9999958276748657,
"val/ratio_var": 4.763281822306453e-07
},
{
"episode": 14912,
"epoch": 2.8141158709190415,
"eps": 0,
"loss/policy_avg": -0.011374952271580696,
"loss/value_avg": 0.004087153356522322,
"lr": 1.3636363636363635e-07,
"objective/entropy": -650.131103515625,
"objective/kl": 9.105212211608887,
"objective/non_score_reward": -0.27315637469291687,
"objective/rlhf_reward": 0.15897253155708313,
"objective/scores": 0.431640625,
"policy/approxkl_avg": 0.00038859708001837134,
"policy/clipfrac_avg": 0.007399349473416805,
"policy/entropy_avg": 0.17907333374023438,
"step": 233,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 43,
"val/ratio": 1.0000715255737305,
"val/ratio_var": 5.53652057533327e-07
},
{
"episode": 14976,
"epoch": 2.8261936214380072,
"eps": 0,
"loss/policy_avg": -0.01846359483897686,
"loss/value_avg": 0.003997947089374065,
"lr": 1.3468013468013468e-07,
"objective/entropy": -721.1434326171875,
"objective/kl": 7.176656723022461,
"objective/non_score_reward": -0.21529969573020935,
"objective/rlhf_reward": 0.24759092926979065,
"objective/scores": 0.462890625,
"policy/approxkl_avg": 0.0003538678865879774,
"policy/clipfrac_avg": 0.00723436800763011,
"policy/entropy_avg": 0.17639541625976562,
"step": 234,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 51,
"val/ratio": 1.0001486539840698,
"val/ratio_var": 5.433992669168219e-07
},
{
"episode": 15040,
"epoch": 2.838271371956973,
"eps": 0,
"loss/policy_avg": -0.003030909225344658,
"loss/value_avg": 0.0041738273575901985,
"lr": 1.32996632996633e-07,
"objective/entropy": -593.6438598632812,
"objective/kl": 10.389270782470703,
"objective/non_score_reward": -0.31167811155319214,
"objective/rlhf_reward": 0.017911747097969055,
"objective/scores": 0.330078125,
"policy/approxkl_avg": 0.00048703886568546295,
"policy/clipfrac_avg": 0.007803687360137701,
"policy/entropy_avg": 0.20614878833293915,
"step": 235,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 41,
"val/ratio": 0.9999415278434753,
"val/ratio_var": 8.815354703983758e-07
},
{
"episode": 15104,
"epoch": 2.8503491224759387,
"eps": 0,
"loss/policy_avg": -0.028734426945447922,
"loss/value_avg": 0.00377194257453084,
"lr": 1.3131313131313133e-07,
"objective/entropy": -750.08837890625,
"objective/kl": 7.2935791015625,
"objective/non_score_reward": -0.2188073694705963,
"objective/rlhf_reward": 0.2689855992794037,
"objective/scores": 0.48828125,
"policy/approxkl_avg": 0.0003561212797649205,
"policy/clipfrac_avg": 0.007725189905613661,
"policy/entropy_avg": 0.1868082731962204,
"step": 236,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 54,
"val/ratio": 0.9999493360519409,
"val/ratio_var": 5.775621048087487e-07
},
{
"episode": 15168,
"epoch": 2.8624268729949045,
"eps": 0,
"loss/policy_avg": -0.02059931308031082,
"loss/value_avg": 0.004160116892307997,
"lr": 1.2962962962962961e-07,
"objective/entropy": -689.4764404296875,
"objective/kl": 7.785543441772461,
"objective/non_score_reward": -0.2335663139820099,
"objective/rlhf_reward": 0.1804961860179901,
"objective/scores": 0.4140625,
"policy/approxkl_avg": 0.0003638736379798502,
"policy/clipfrac_avg": 0.007504904642701149,
"policy/entropy_avg": 0.19126257300376892,
"step": 237,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 45,
"val/ratio": 1.000117540359497,
"val/ratio_var": 5.907762670176453e-07
},
{
"episode": 15232,
"epoch": 2.8745046235138707,
"eps": 0,
"loss/policy_avg": -0.002399355173110962,
"loss/value_avg": 0.003902244148775935,
"lr": 1.2794612794612795e-07,
"objective/entropy": -693.060791015625,
"objective/kl": 6.487215042114258,
"objective/non_score_reward": -0.19461645185947418,
"objective/rlhf_reward": 0.18038354814052582,
"objective/scores": 0.375,
"policy/approxkl_avg": 0.00036661222111433744,
"policy/clipfrac_avg": 0.007351069711148739,
"policy/entropy_avg": 0.1860555112361908,
"step": 238,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 44,
"val/ratio": 1.0000566244125366,
"val/ratio_var": 6.291702447924763e-07
},
{
"episode": 15296,
"epoch": 2.8865823740328365,
"eps": 0,
"loss/policy_avg": -0.007437670137733221,
"loss/value_avg": 0.004013408906757832,
"lr": 1.2626262626262626e-07,
"objective/entropy": -704.9505615234375,
"objective/kl": 6.939170837402344,
"objective/non_score_reward": -0.20817512273788452,
"objective/rlhf_reward": 0.17024284601211548,
"objective/scores": 0.37890625,
"policy/approxkl_avg": 0.00033546844497323036,
"policy/clipfrac_avg": 0.007042970508337021,
"policy/entropy_avg": 0.1695149838924408,
"step": 239,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 49,
"val/ratio": 1.0000756978988647,
"val/ratio_var": 6.734092039550887e-07
},
{
"episode": 15360,
"epoch": 2.898660124551802,
"eps": 0,
"loss/policy_avg": -0.028576653450727463,
"loss/value_avg": 0.004010652657598257,
"lr": 1.2457912457912457e-07,
"objective/entropy": -654.0175170898438,
"objective/kl": 8.195779800415039,
"objective/non_score_reward": -0.24587342143058777,
"objective/rlhf_reward": 0.18869690597057343,
"objective/scores": 0.43359375,
"policy/approxkl_avg": 0.0004402039048727602,
"policy/clipfrac_avg": 0.007189431693404913,
"policy/entropy_avg": 0.17950567603111267,
"step": 240,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 43,
"val/ratio": 0.9998716115951538,
"val/ratio_var": 6.682057573925704e-07
},
{
"episode": 15424,
"epoch": 2.910737875070768,
"eps": 0,
"loss/policy_avg": -0.005296625196933746,
"loss/value_avg": 0.0039635319262743,
"lr": 1.2289562289562288e-07,
"objective/entropy": -656.5180053710938,
"objective/kl": 8.551060676574707,
"objective/non_score_reward": -0.2565317749977112,
"objective/rlhf_reward": 0.13604632019996643,
"objective/scores": 0.392578125,
"policy/approxkl_avg": 0.00038327404763549566,
"policy/clipfrac_avg": 0.007787951733916998,
"policy/entropy_avg": 0.17977142333984375,
"step": 241,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 1.0001020431518555,
"val/ratio_var": 5.759193868470902e-07
},
{
"episode": 15488,
"epoch": 2.9228156255897337,
"eps": 0,
"loss/policy_avg": -0.009168568067252636,
"loss/value_avg": 0.004365907050669193,
"lr": 1.2121212121212122e-07,
"objective/entropy": -702.2899169921875,
"objective/kl": 7.23637056350708,
"objective/non_score_reward": -0.21709111332893372,
"objective/rlhf_reward": 0.22870966792106628,
"objective/scores": 0.4453125,
"policy/approxkl_avg": 0.0003652493469417095,
"policy/clipfrac_avg": 0.007043258287012577,
"policy/entropy_avg": 0.1999460905790329,
"step": 242,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 50,
"val/ratio": 0.9999537467956543,
"val/ratio_var": 4.7287392135331174e-07
},
{
"episode": 15552,
"epoch": 2.9348933761087,
"eps": 0,
"loss/policy_avg": -0.03705034777522087,
"loss/value_avg": 0.0035443564411252737,
"lr": 1.1952861952861953e-07,
"objective/entropy": -629.838623046875,
"objective/kl": 8.425390243530273,
"objective/non_score_reward": -0.25276172161102295,
"objective/rlhf_reward": 0.22135938704013824,
"objective/scores": 0.474609375,
"policy/approxkl_avg": 0.00040427930071018636,
"policy/clipfrac_avg": 0.00702214939519763,
"policy/entropy_avg": 0.19646072387695312,
"step": 243,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 43,
"val/ratio": 1.0001170635223389,
"val/ratio_var": 6.153204026304593e-07
},
{
"episode": 15616,
"epoch": 2.9469711266276657,
"eps": 0,
"loss/policy_avg": -0.03318122774362564,
"loss/value_avg": 0.003984754905104637,
"lr": 1.1784511784511784e-07,
"objective/entropy": -658.1822509765625,
"objective/kl": 8.162740707397461,
"objective/non_score_reward": -0.2448822408914566,
"objective/rlhf_reward": 0.2568267583847046,
"objective/scores": 0.5,
"policy/approxkl_avg": 0.00037857884308323264,
"policy/clipfrac_avg": 0.0069054896011948586,
"policy/entropy_avg": 0.18754324316978455,
"step": 244,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 37,
"val/ratio": 0.9999241828918457,
"val/ratio_var": 6.279624926719407e-07
},
{
"episode": 15680,
"epoch": 2.9590488771466315,
"eps": 0,
"loss/policy_avg": -0.019931495189666748,
"loss/value_avg": 0.003651971695944667,
"lr": 1.1616161616161615e-07,
"objective/entropy": -606.1823120117188,
"objective/kl": 7.644600868225098,
"objective/non_score_reward": -0.22933802008628845,
"objective/rlhf_reward": 0.14859166741371155,
"objective/scores": 0.37890625,
"policy/approxkl_avg": 0.0005081476992927492,
"policy/clipfrac_avg": 0.007128735538572073,
"policy/entropy_avg": 0.19517645239830017,
"step": 245,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 40,
"val/ratio": 1.0000133514404297,
"val/ratio_var": 6.433290309360018e-07
},
{
"episode": 15744,
"epoch": 2.971126627665597,
"eps": 0,
"loss/policy_avg": -0.044989436864852905,
"loss/value_avg": 0.004129257518798113,
"lr": 1.1447811447811447e-07,
"objective/entropy": -718.4443359375,
"objective/kl": 6.58664608001709,
"objective/non_score_reward": -0.197599396109581,
"objective/rlhf_reward": 0.3522053062915802,
"objective/scores": 0.55078125,
"policy/approxkl_avg": 0.00035908090649172664,
"policy/clipfrac_avg": 0.006561779882758856,
"policy/entropy_avg": 0.17469915747642517,
"step": 246,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 43,
"val/ratio": 1.000040888786316,
"val/ratio_var": 6.177034492793609e-07
},
{
"episode": 15808,
"epoch": 2.983204378184563,
"eps": 0,
"loss/policy_avg": -0.006306433584541082,
"loss/value_avg": 0.00377975357696414,
"lr": 1.1279461279461279e-07,
"objective/entropy": -669.0179443359375,
"objective/kl": 7.104648590087891,
"objective/non_score_reward": -0.21313944458961487,
"objective/rlhf_reward": 0.19604022800922394,
"objective/scores": 0.41015625,
"policy/approxkl_avg": 0.0003768262395169586,
"policy/clipfrac_avg": 0.007034813519567251,
"policy/entropy_avg": 0.1779836118221283,
"step": 247,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 47,
"val/ratio": 0.999952495098114,
"val/ratio_var": 5.565264018514426e-07
},
{
"episode": 15872,
"epoch": 2.995282128703529,
"eps": 0,
"loss/policy_avg": -0.030956070870161057,
"loss/value_avg": 0.0035534966737031937,
"lr": 1.111111111111111e-07,
"objective/entropy": -611.2562255859375,
"objective/kl": 8.399555206298828,
"objective/non_score_reward": -0.25198665261268616,
"objective/rlhf_reward": 0.16500553488731384,
"objective/scores": 0.41796875,
"policy/approxkl_avg": 0.00039667676901444793,
"policy/clipfrac_avg": 0.007198335137218237,
"policy/entropy_avg": 0.19819514453411102,
"step": 248,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 42,
"val/ratio": 0.999969482421875,
"val/ratio_var": 4.946619469592406e-07
},
{
"episode": 15936,
"epoch": 3.007359879222495,
"eps": 0,
"loss/policy_avg": -0.017340319231152534,
"loss/value_avg": 0.004106181673705578,
"lr": 1.0942760942760942e-07,
"objective/entropy": -661.902099609375,
"objective/kl": 7.087057113647461,
"objective/non_score_reward": -0.21261171996593475,
"objective/rlhf_reward": 0.24832576513290405,
"objective/scores": 0.4609375,
"policy/approxkl_avg": 0.00036943715531378984,
"policy/clipfrac_avg": 0.0072855958715081215,
"policy/entropy_avg": 0.19068431854248047,
"step": 249,
"val/clipfrac_avg": 5.44804743185523e-06,
"val/num_eos_tokens": 46,
"val/ratio": 1.0000860691070557,
"val/ratio_var": 6.541467314491456e-07
},
{
"episode": 16000,
"epoch": 3.0194376297414607,
"eps": 0,
"loss/policy_avg": -0.006075289100408554,
"loss/value_avg": 0.0037019317969679832,
"lr": 1.0774410774410773e-07,
"objective/entropy": -622.1962890625,
"objective/kl": 8.35627555847168,
"objective/non_score_reward": -0.25068825483322144,
"objective/rlhf_reward": 0.055464085191488266,
"objective/scores": 0.306640625,
"policy/approxkl_avg": 0.0004216305387672037,
"policy/clipfrac_avg": 0.00783085823059082,
"policy/entropy_avg": 0.2043101042509079,
"step": 250,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 33,
"val/ratio": 0.9999951124191284,
"val/ratio_var": 7.26553423646692e-07
},
{
"episode": 16064,
"epoch": 3.0315153802604264,
"eps": 0,
"loss/policy_avg": -0.02081982046365738,
"loss/value_avg": 0.0035106416326016188,
"lr": 1.0606060606060605e-07,
"objective/entropy": -641.5849609375,
"objective/kl": 8.316513061523438,
"objective/non_score_reward": -0.24949535727500916,
"objective/rlhf_reward": 0.19825854897499084,
"objective/scores": 0.447265625,
"policy/approxkl_avg": 0.00040991941932588816,
"policy/clipfrac_avg": 0.007065493613481522,
"policy/entropy_avg": 0.19249090552330017,
"step": 251,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 43,
"val/ratio": 0.999900221824646,
"val/ratio_var": 8.348190476681339e-07
},
{
"episode": 16128,
"epoch": 3.043593130779392,
"eps": 0,
"loss/policy_avg": -0.03310645744204521,
"loss/value_avg": 0.003968073055148125,
"lr": 1.0437710437710436e-07,
"objective/entropy": -636.48974609375,
"objective/kl": 7.7286529541015625,
"objective/non_score_reward": -0.23185959458351135,
"objective/rlhf_reward": 0.17146071791648865,
"objective/scores": 0.40234375,
"policy/approxkl_avg": 0.00042435387149453163,
"policy/clipfrac_avg": 0.007897584699094296,
"policy/entropy_avg": 0.20297622680664062,
"step": 252,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 47,
"val/ratio": 0.99993896484375,
"val/ratio_var": 8.238701525442593e-07
},
{
"episode": 16192,
"epoch": 3.0556708812983584,
"eps": 0,
"loss/policy_avg": -0.01020126324146986,
"loss/value_avg": 0.0036760650109499693,
"lr": 1.0269360269360269e-07,
"objective/entropy": -679.7139892578125,
"objective/kl": 7.155096054077148,
"objective/non_score_reward": -0.2146528959274292,
"objective/rlhf_reward": 0.216499462723732,
"objective/scores": 0.431640625,
"policy/approxkl_avg": 0.0003815985983237624,
"policy/clipfrac_avg": 0.007189772091805935,
"policy/entropy_avg": 0.19544348120689392,
"step": 253,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 0.9999157190322876,
"val/ratio_var": 5.615696636596113e-07
},
{
"episode": 16256,
"epoch": 3.067748631817324,
"eps": 0,
"loss/policy_avg": -0.005997738800942898,
"loss/value_avg": 0.004006213508546352,
"lr": 1.01010101010101e-07,
"objective/entropy": -635.3106689453125,
"objective/kl": 8.364971160888672,
"objective/non_score_reward": -0.2509491443634033,
"objective/rlhf_reward": 0.12844537198543549,
"objective/scores": 0.37890625,
"policy/approxkl_avg": 0.000423733436036855,
"policy/clipfrac_avg": 0.006593957543373108,
"policy/entropy_avg": 0.2101338803768158,
"step": 254,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 0.9999582171440125,
"val/ratio_var": 5.447363378152659e-07
},
{
"episode": 16320,
"epoch": 3.07982638233629,
"eps": 0,
"loss/policy_avg": -0.016920043155550957,
"loss/value_avg": 0.0037770867347717285,
"lr": 9.932659932659932e-08,
"objective/entropy": -681.9376220703125,
"objective/kl": 7.062074661254883,
"objective/non_score_reward": -0.2118622362613678,
"objective/rlhf_reward": 0.2236846387386322,
"objective/scores": 0.435546875,
"policy/approxkl_avg": 0.00037374263047240674,
"policy/clipfrac_avg": 0.0069709960371255875,
"policy/entropy_avg": 0.18284988403320312,
"step": 255,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 40,
"val/ratio": 1.0000779628753662,
"val/ratio_var": 6.587182497241884e-07
},
{
"episode": 16384,
"epoch": 3.0919041328552557,
"eps": 0,
"loss/policy_avg": -0.01712847873568535,
"loss/value_avg": 0.0041097188368439674,
"lr": 9.764309764309763e-08,
"objective/entropy": -683.023193359375,
"objective/kl": 6.437891960144043,
"objective/non_score_reward": -0.19313675165176392,
"objective/rlhf_reward": 0.18479293584823608,
"objective/scores": 0.37890625,
"policy/approxkl_avg": 0.0003696854109875858,
"policy/clipfrac_avg": 0.007503229193389416,
"policy/entropy_avg": 0.18361155688762665,
"step": 256,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 47,
"val/ratio": 1.0000218152999878,
"val/ratio_var": 5.72627527617442e-07
},
{
"episode": 16448,
"epoch": 3.1039818833742214,
"eps": 0,
"loss/policy_avg": -0.03213302046060562,
"loss/value_avg": 0.0037582411896437407,
"lr": 9.595959595959594e-08,
"objective/entropy": -648.9036254882812,
"objective/kl": 7.319805145263672,
"objective/non_score_reward": -0.21959413588047028,
"objective/rlhf_reward": 0.22278867661952972,
"objective/scores": 0.44140625,
"policy/approxkl_avg": 0.0003951989929191768,
"policy/clipfrac_avg": 0.006181157194077969,
"policy/entropy_avg": 0.18743896484375,
"step": 257,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 42,
"val/ratio": 1.0000240802764893,
"val/ratio_var": 6.705196256007184e-07
},
{
"episode": 16512,
"epoch": 3.116059633893187,
"eps": 0,
"loss/policy_avg": -0.026423348113894463,
"loss/value_avg": 0.00365483108907938,
"lr": 9.427609427609427e-08,
"objective/entropy": -658.4329223632812,
"objective/kl": 6.591666221618652,
"objective/non_score_reward": -0.19774997234344482,
"objective/rlhf_reward": 0.23535549640655518,
"objective/scores": 0.43359375,
"policy/approxkl_avg": 0.0003852533991448581,
"policy/clipfrac_avg": 0.006929041352123022,
"policy/entropy_avg": 0.18116506934165955,
"step": 258,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 49,
"val/ratio": 1.000004768371582,
"val/ratio_var": 6.329533448479197e-07
},
{
"episode": 16576,
"epoch": 3.1281373844121534,
"eps": 0,
"loss/policy_avg": -0.01229805313050747,
"loss/value_avg": 0.004078000318259001,
"lr": 9.259259259259258e-08,
"objective/entropy": -706.8790893554688,
"objective/kl": 6.781729698181152,
"objective/non_score_reward": -0.20345187187194824,
"objective/rlhf_reward": 0.25748562812805176,
"objective/scores": 0.4609375,
"policy/approxkl_avg": 0.00035077554639428854,
"policy/clipfrac_avg": 0.00699991500005126,
"policy/entropy_avg": 0.18130874633789062,
"step": 259,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 0.9999758005142212,
"val/ratio_var": 5.301695296111575e-07
},
{
"episode": 16640,
"epoch": 3.140215134931119,
"eps": 0,
"loss/policy_avg": -0.03154832124710083,
"loss/value_avg": 0.003632882609963417,
"lr": 9.09090909090909e-08,
"objective/entropy": -613.3504638671875,
"objective/kl": 7.657683372497559,
"objective/non_score_reward": -0.2297305017709732,
"objective/rlhf_reward": 0.148199200630188,
"objective/scores": 0.37890625,
"policy/approxkl_avg": 0.0005249691312201321,
"policy/clipfrac_avg": 0.007619872223585844,
"policy/entropy_avg": 0.18793997168540955,
"step": 260,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 1.000046730041504,
"val/ratio_var": 7.239053161356424e-07
},
{
"episode": 16704,
"epoch": 3.152292885450085,
"eps": 0,
"loss/policy_avg": -0.01213142555207014,
"loss/value_avg": 0.0035843513906002045,
"lr": 8.922558922558921e-08,
"objective/entropy": -649.6549072265625,
"objective/kl": 9.2880277633667,
"objective/non_score_reward": -0.27864083647727966,
"objective/rlhf_reward": 0.12467947602272034,
"objective/scores": 0.40234375,
"policy/approxkl_avg": 0.00038886774564161897,
"policy/clipfrac_avg": 0.006718775257468224,
"policy/entropy_avg": 0.18620681762695312,
"step": 261,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 45,
"val/ratio": 0.999856173992157,
"val/ratio_var": 4.936819664180803e-07
},
{
"episode": 16768,
"epoch": 3.1643706359690507,
"eps": 0,
"loss/policy_avg": -0.007179616950452328,
"loss/value_avg": 0.0035246573388576508,
"lr": 8.754208754208754e-08,
"objective/entropy": -623.578857421875,
"objective/kl": 8.410058975219727,
"objective/non_score_reward": -0.25230175256729126,
"objective/rlhf_reward": 0.08363573253154755,
"objective/scores": 0.3359375,
"policy/approxkl_avg": 0.00044147943845018744,
"policy/clipfrac_avg": 0.007476884871721268,
"policy/entropy_avg": 0.20351791381835938,
"step": 262,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 41,
"val/ratio": 1.0000280141830444,
"val/ratio_var": 6.352038326440379e-07
},
{
"episode": 16832,
"epoch": 3.1764483864880164,
"eps": 0,
"loss/policy_avg": -0.003754607168957591,
"loss/value_avg": 0.003930999897420406,
"lr": 8.585858585858585e-08,
"objective/entropy": -696.397705078125,
"objective/kl": 8.027302742004395,
"objective/non_score_reward": -0.2408190667629242,
"objective/rlhf_reward": 0.0904797613620758,
"objective/scores": 0.33203125,
"policy/approxkl_avg": 0.0003533074341248721,
"policy/clipfrac_avg": 0.00765426829457283,
"policy/entropy_avg": 0.18895339965820312,
"step": 263,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 0.99993896484375,
"val/ratio_var": 4.5711240659329633e-07
},
{
"episode": 16896,
"epoch": 3.1885261370069826,
"eps": 0,
"loss/policy_avg": -0.017033755779266357,
"loss/value_avg": 0.003304037032648921,
"lr": 8.417508417508418e-08,
"objective/entropy": -598.6932373046875,
"objective/kl": 7.669593811035156,
"objective/non_score_reward": -0.23008780181407928,
"objective/rlhf_reward": 0.13075204193592072,
"objective/scores": 0.361328125,
"policy/approxkl_avg": 0.00047474654274992645,
"policy/clipfrac_avg": 0.007650506682693958,
"policy/entropy_avg": 0.20692571997642517,
"step": 264,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 39,
"val/ratio": 1.0001184940338135,
"val/ratio_var": 9.710288395581301e-07
},
{
"episode": 16960,
"epoch": 3.2006038875259484,
"eps": 0,
"loss/policy_avg": -0.02458018623292446,
"loss/value_avg": 0.00406844075769186,
"lr": 8.24915824915825e-08,
"objective/entropy": -673.213134765625,
"objective/kl": 6.648907661437988,
"objective/non_score_reward": -0.1994672268629074,
"objective/rlhf_reward": 0.2600054144859314,
"objective/scores": 0.458984375,
"policy/approxkl_avg": 0.0003697554930113256,
"policy/clipfrac_avg": 0.0070952074602246284,
"policy/entropy_avg": 0.1852405071258545,
"step": 265,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 53,
"val/ratio": 1.0000247955322266,
"val/ratio_var": 5.141792485119367e-07
},
{
"episode": 17024,
"epoch": 3.212681638044914,
"eps": 0,
"loss/policy_avg": -0.009238027967512608,
"loss/value_avg": 0.004165910184383392,
"lr": 8.080808080808082e-08,
"objective/entropy": -735.2806396484375,
"objective/kl": 6.235321998596191,
"objective/non_score_reward": -0.1870596557855606,
"objective/rlhf_reward": 0.2616708278656006,
"objective/scores": 0.44921875,
"policy/approxkl_avg": 0.00032986787846311927,
"policy/clipfrac_avg": 0.006603958085179329,
"policy/entropy_avg": 0.17215602099895477,
"step": 266,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 42,
"val/ratio": 0.9999458193778992,
"val/ratio_var": 5.301350824993278e-07
},
{
"episode": 17088,
"epoch": 3.22475938856388,
"eps": 0,
"loss/policy_avg": -0.01698639616370201,
"loss/value_avg": 0.0037531605921685696,
"lr": 7.912457912457913e-08,
"objective/entropy": -652.947265625,
"objective/kl": 7.257780075073242,
"objective/non_score_reward": -0.21773339807987213,
"objective/rlhf_reward": 0.21537207067012787,
"objective/scores": 0.43359375,
"policy/approxkl_avg": 0.000403220416046679,
"policy/clipfrac_avg": 0.006984008476138115,
"policy/entropy_avg": 0.1990203857421875,
"step": 267,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 48,
"val/ratio": 1.0000309944152832,
"val/ratio_var": 6.781556862733851e-07
},
{
"episode": 17152,
"epoch": 3.2368371390828456,
"eps": 0,
"loss/policy_avg": -0.028039943426847458,
"loss/value_avg": 0.003373341169208288,
"lr": 7.744107744107744e-08,
"objective/entropy": -569.8634033203125,
"objective/kl": 7.230891227722168,
"objective/non_score_reward": -0.21692675352096558,
"objective/rlhf_reward": 0.14879590272903442,
"objective/scores": 0.365234375,
"policy/approxkl_avg": 0.0004610806645359844,
"policy/clipfrac_avg": 0.007782801054418087,
"policy/entropy_avg": 0.21935272216796875,
"step": 268,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 38,
"val/ratio": 0.9999889135360718,
"val/ratio_var": 6.67467986659176e-07
},
{
"episode": 17216,
"epoch": 3.248914889601812,
"eps": 0,
"loss/policy_avg": 0.012302754446864128,
"loss/value_avg": 0.0036267773248255253,
"lr": 7.575757575757576e-08,
"objective/entropy": -655.1212768554688,
"objective/kl": 8.060359954833984,
"objective/non_score_reward": -0.24181079864501953,
"objective/rlhf_reward": 0.03614329546689987,
"objective/scores": 0.27734375,
"policy/approxkl_avg": 0.0003768009482882917,
"policy/clipfrac_avg": 0.0069151511415839195,
"policy/entropy_avg": 0.20404815673828125,
"step": 269,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 45,
"val/ratio": 0.9999961256980896,
"val/ratio_var": 5.263832463242579e-07
},
{
"episode": 17280,
"epoch": 3.2609926401207776,
"eps": 0,
"loss/policy_avg": -0.004253086168318987,
"loss/value_avg": 0.0037821203004568815,
"lr": 7.407407407407407e-08,
"objective/entropy": -601.41015625,
"objective/kl": 7.641479015350342,
"objective/non_score_reward": -0.2292443811893463,
"objective/rlhf_reward": 0.1254919469356537,
"objective/scores": 0.35546875,
"policy/approxkl_avg": 0.0004039438790641725,
"policy/clipfrac_avg": 0.0064240507781505585,
"policy/entropy_avg": 0.19796499609947205,
"step": 270,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 41,
"val/ratio": 0.9998639822006226,
"val/ratio_var": 5.743943347624736e-07
},
{
"episode": 17344,
"epoch": 3.2730703906397434,
"eps": 0,
"loss/policy_avg": -0.03380737453699112,
"loss/value_avg": 0.0034328820183873177,
"lr": 7.23905723905724e-08,
"objective/entropy": -647.0220947265625,
"objective/kl": 7.08984899520874,
"objective/non_score_reward": -0.21269546449184418,
"objective/rlhf_reward": 0.27851545810699463,
"objective/scores": 0.4921875,
"policy/approxkl_avg": 0.00040110870031639934,
"policy/clipfrac_avg": 0.007048811763525009,
"policy/entropy_avg": 0.19513702392578125,
"step": 271,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 49,
"val/ratio": 0.9998549222946167,
"val/ratio_var": 6.118988267189707e-07
},
{
"episode": 17408,
"epoch": 3.285148141158709,
"eps": 0,
"loss/policy_avg": -0.014850600622594357,
"loss/value_avg": 0.0036329745780676603,
"lr": 7.070707070707071e-08,
"objective/entropy": -586.4075317382812,
"objective/kl": 8.249979019165039,
"objective/non_score_reward": -0.24749934673309326,
"objective/rlhf_reward": 0.15679752826690674,
"objective/scores": 0.404296875,
"policy/approxkl_avg": 0.00042898274841718376,
"policy/clipfrac_avg": 0.0069151753559708595,
"policy/entropy_avg": 0.19608816504478455,
"step": 272,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 41,
"val/ratio": 1.0001020431518555,
"val/ratio_var": 8.127553314807301e-07
},
{
"episode": 17472,
"epoch": 3.297225891677675,
"eps": 0,
"loss/policy_avg": -0.01862741820514202,
"loss/value_avg": 0.0038365069776773453,
"lr": 6.902356902356903e-08,
"objective/entropy": -696.0003051757812,
"objective/kl": 7.6856584548950195,
"objective/non_score_reward": -0.2305697500705719,
"objective/rlhf_reward": 0.2308560311794281,
"objective/scores": 0.4609375,
"policy/approxkl_avg": 0.00037589838029816747,
"policy/clipfrac_avg": 0.007162098772823811,
"policy/entropy_avg": 0.186614990234375,
"step": 273,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 42,
"val/ratio": 1.0000542402267456,
"val/ratio_var": 5.687811608368065e-07
},
{
"episode": 17536,
"epoch": 3.309303642196641,
"eps": 0,
"loss/policy_avg": -0.02315349504351616,
"loss/value_avg": 0.0040380991995334625,
"lr": 6.734006734006734e-08,
"objective/entropy": -722.7449951171875,
"objective/kl": 6.226775646209717,
"objective/non_score_reward": -0.18680325150489807,
"objective/rlhf_reward": 0.24581393599510193,
"objective/scores": 0.43359375,
"policy/approxkl_avg": 0.0003334844659548253,
"policy/clipfrac_avg": 0.0064841099083423615,
"policy/entropy_avg": 0.18703460693359375,
"step": 274,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 49,
"val/ratio": 1.0000557899475098,
"val/ratio_var": 5.662852800014662e-07
},
{
"episode": 17600,
"epoch": 3.321381392715607,
"eps": 0,
"loss/policy_avg": -0.02682194858789444,
"loss/value_avg": 0.0037449407391250134,
"lr": 6.565656565656566e-08,
"objective/entropy": -635.6344604492188,
"objective/kl": 7.442312240600586,
"objective/non_score_reward": -0.22326935827732086,
"objective/rlhf_reward": 0.22106656432151794,
"objective/scores": 0.4453125,
"policy/approxkl_avg": 0.00040927709778770804,
"policy/clipfrac_avg": 0.006474938243627548,
"policy/entropy_avg": 0.19021479785442352,
"step": 275,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 42,
"val/ratio": 1.0002195835113525,
"val/ratio_var": 6.032373676134739e-07
},
{
"episode": 17664,
"epoch": 3.3334591432345726,
"eps": 0,
"loss/policy_avg": -0.005783136934041977,
"loss/value_avg": 0.003839105134829879,
"lr": 6.397306397306398e-08,
"objective/entropy": -667.8414306640625,
"objective/kl": 6.772984504699707,
"objective/non_score_reward": -0.20318952202796936,
"objective/rlhf_reward": 0.18743547797203064,
"objective/scores": 0.390625,
"policy/approxkl_avg": 0.0003651580773293972,
"policy/clipfrac_avg": 0.007003391161561012,
"policy/entropy_avg": 0.19083023071289062,
"step": 276,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 47,
"val/ratio": 1.0000221729278564,
"val/ratio_var": 5.736771981901256e-07
},
{
"episode": 17728,
"epoch": 3.3455368937535384,
"eps": 0,
"loss/policy_avg": -0.006362794898450375,
"loss/value_avg": 0.003617867361754179,
"lr": 6.228956228956229e-08,
"objective/entropy": -669.930419921875,
"objective/kl": 7.383747100830078,
"objective/non_score_reward": -0.22151240706443787,
"objective/rlhf_reward": 0.18522588908672333,
"objective/scores": 0.40625,
"policy/approxkl_avg": 0.00036430690670385957,
"policy/clipfrac_avg": 0.007155537139624357,
"policy/entropy_avg": 0.18641917407512665,
"step": 277,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 45,
"val/ratio": 0.9999706745147705,
"val/ratio_var": 6.842114999017213e-07
},
{
"episode": 17792,
"epoch": 3.357614644272504,
"eps": 0,
"loss/policy_avg": -0.03245866298675537,
"loss/value_avg": 0.003861590987071395,
"lr": 6.060606060606061e-08,
"objective/entropy": -712.3450927734375,
"objective/kl": 6.383450984954834,
"objective/non_score_reward": -0.19150352478027344,
"objective/rlhf_reward": 0.31337928771972656,
"objective/scores": 0.50390625,
"policy/approxkl_avg": 0.00036257284227758646,
"policy/clipfrac_avg": 0.006432620342820883,
"policy/entropy_avg": 0.18203863501548767,
"step": 278,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 0.9999188184738159,
"val/ratio_var": 3.687934793106251e-07
},
{
"episode": 17856,
"epoch": 3.36969239479147,
"eps": 0,
"loss/policy_avg": -0.025677090510725975,
"loss/value_avg": 0.003610721556469798,
"lr": 5.892255892255892e-08,
"objective/entropy": -635.7150268554688,
"objective/kl": 6.59326171875,
"objective/non_score_reward": -0.19779784977436066,
"objective/rlhf_reward": 0.20503418147563934,
"objective/scores": 0.40234375,
"policy/approxkl_avg": 0.0003899303264915943,
"policy/clipfrac_avg": 0.006962133105844259,
"policy/entropy_avg": 0.19839096069335938,
"step": 279,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 47,
"val/ratio": 1.0000261068344116,
"val/ratio_var": 5.185344775782141e-07
},
{
"episode": 17920,
"epoch": 3.381770145310436,
"eps": 0,
"loss/policy_avg": -0.028471484780311584,
"loss/value_avg": 0.0032405236270278692,
"lr": 5.723905723905724e-08,
"objective/entropy": -642.4448852539062,
"objective/kl": 6.222779750823975,
"objective/non_score_reward": -0.18668338656425476,
"objective/rlhf_reward": 0.26058220863342285,
"objective/scores": 0.447265625,
"policy/approxkl_avg": 0.0004811930702999234,
"policy/clipfrac_avg": 0.007410434540361166,
"policy/entropy_avg": 0.201324462890625,
"step": 280,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 49,
"val/ratio": 0.9999183416366577,
"val/ratio_var": 6.145901920717733e-07
},
{
"episode": 17984,
"epoch": 3.393847895829402,
"eps": 0,
"loss/policy_avg": -0.018460756167769432,
"loss/value_avg": 0.0036118782591074705,
"lr": 5.555555555555555e-08,
"objective/entropy": -647.02392578125,
"objective/kl": 7.956416130065918,
"objective/non_score_reward": -0.2386924773454666,
"objective/rlhf_reward": 0.1660926640033722,
"objective/scores": 0.404296875,
"policy/approxkl_avg": 0.00041467935079708695,
"policy/clipfrac_avg": 0.007317427080124617,
"policy/entropy_avg": 0.20193736255168915,
"step": 281,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 42,
"val/ratio": 1.0000567436218262,
"val/ratio_var": 5.499433655131725e-07
},
{
"episode": 18048,
"epoch": 3.4059256463483676,
"eps": 0,
"loss/policy_avg": -0.012457543984055519,
"loss/value_avg": 0.0038994457572698593,
"lr": 5.3872053872053865e-08,
"objective/entropy": -716.0316162109375,
"objective/kl": 5.913008689880371,
"objective/non_score_reward": -0.17739026248455048,
"objective/rlhf_reward": 0.30991441011428833,
"objective/scores": 0.48828125,
"policy/approxkl_avg": 0.0003301530086901039,
"policy/clipfrac_avg": 0.0064779892563819885,
"policy/entropy_avg": 0.18742243945598602,
"step": 282,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 1.0000253915786743,
"val/ratio_var": 4.4198901605341234e-07
},
{
"episode": 18112,
"epoch": 3.4180033968673333,
"eps": 0,
"loss/policy_avg": -0.013042353093624115,
"loss/value_avg": 0.0037472937256097794,
"lr": 5.218855218855218e-08,
"objective/entropy": -752.298828125,
"objective/kl": 6.022947311401367,
"objective/non_score_reward": -0.18068841099739075,
"objective/rlhf_reward": 0.35056155920028687,
"objective/scores": 0.53125,
"policy/approxkl_avg": 0.0003260195953771472,
"policy/clipfrac_avg": 0.0073262769728899,
"policy/entropy_avg": 0.193359375,
"step": 283,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 58,
"val/ratio": 1.00008225440979,
"val/ratio_var": 5.816585826323717e-07
},
{
"episode": 18176,
"epoch": 3.430081147386299,
"eps": 0,
"loss/policy_avg": -0.022922195494174957,
"loss/value_avg": 0.003924447111785412,
"lr": 5.05050505050505e-08,
"objective/entropy": -727.8096923828125,
"objective/kl": 6.566383361816406,
"objective/non_score_reward": -0.19699150323867798,
"objective/rlhf_reward": 0.274688184261322,
"objective/scores": 0.47265625,
"policy/approxkl_avg": 0.0003402878064662218,
"policy/clipfrac_avg": 0.006386288907378912,
"policy/entropy_avg": 0.17614874243736267,
"step": 284,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 56,
"val/ratio": 1.0001291036605835,
"val/ratio_var": 7.245762958518753e-07
},
{
"episode": 18240,
"epoch": 3.4421588979052653,
"eps": 0,
"loss/policy_avg": -0.005379597656428814,
"loss/value_avg": 0.003668400924652815,
"lr": 4.8821548821548816e-08,
"objective/entropy": -657.02294921875,
"objective/kl": 7.11133337020874,
"objective/non_score_reward": -0.21333999931812286,
"objective/rlhf_reward": 0.26126939058303833,
"objective/scores": 0.474609375,
"policy/approxkl_avg": 0.0003993379359599203,
"policy/clipfrac_avg": 0.00656835176050663,
"policy/entropy_avg": 0.19129817187786102,
"step": 285,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 39,
"val/ratio": 1.0000909566879272,
"val/ratio_var": 4.280297787317977e-07
},
{
"episode": 18304,
"epoch": 3.454236648424231,
"eps": 0,
"loss/policy_avg": -0.0007903016521595418,
"loss/value_avg": 0.003940465394407511,
"lr": 4.7138047138047134e-08,
"objective/entropy": -678.7791748046875,
"objective/kl": 7.29849100112915,
"objective/non_score_reward": -0.21895474195480347,
"objective/rlhf_reward": 0.15433627367019653,
"objective/scores": 0.373046875,
"policy/approxkl_avg": 0.0003655221953522414,
"policy/clipfrac_avg": 0.006983469240367413,
"policy/entropy_avg": 0.19457626342773438,
"step": 286,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 41,
"val/ratio": 1.0000255107879639,
"val/ratio_var": 6.261829526010843e-07
},
{
"episode": 18368,
"epoch": 3.466314398943197,
"eps": 0,
"loss/policy_avg": -0.013323694467544556,
"loss/value_avg": 0.003565411549061537,
"lr": 4.545454545454545e-08,
"objective/entropy": -666.1871948242188,
"objective/kl": 6.453955173492432,
"objective/non_score_reward": -0.19361865520477295,
"objective/rlhf_reward": 0.23753370344638824,
"objective/scores": 0.431640625,
"policy/approxkl_avg": 0.0003667096607387066,
"policy/clipfrac_avg": 0.0066523477435112,
"policy/entropy_avg": 0.19250616431236267,
"step": 287,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 41,
"val/ratio": 0.9998884201049805,
"val/ratio_var": 4.2977848124792217e-07
},
{
"episode": 18432,
"epoch": 3.4783921494621626,
"eps": 0,
"loss/policy_avg": -0.01416645385324955,
"loss/value_avg": 0.0035817499738186598,
"lr": 4.377104377104377e-08,
"objective/entropy": -700.9827270507812,
"objective/kl": 6.070189476013184,
"objective/non_score_reward": -0.1821056753396988,
"objective/rlhf_reward": 0.2661365270614624,
"objective/scores": 0.44921875,
"policy/approxkl_avg": 0.0003335383953526616,
"policy/clipfrac_avg": 0.006237865425646305,
"policy/entropy_avg": 0.18711933493614197,
"step": 288,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 44,
"val/ratio": 1.000044584274292,
"val/ratio_var": 4.85237990233145e-07
},
{
"episode": 18496,
"epoch": 3.4904698999811283,
"eps": 0,
"loss/policy_avg": -0.022722497582435608,
"loss/value_avg": 0.0033240667544305325,
"lr": 4.208754208754209e-08,
"objective/entropy": -738.82373046875,
"objective/kl": 5.70413064956665,
"objective/non_score_reward": -0.1711239218711853,
"objective/rlhf_reward": 0.3274112343788147,
"objective/scores": 0.498046875,
"policy/approxkl_avg": 0.0003393371298443526,
"policy/clipfrac_avg": 0.007512836717069149,
"policy/entropy_avg": 0.1853078305721283,
"step": 289,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 44,
"val/ratio": 0.9999982714653015,
"val/ratio_var": 4.772057877744373e-07
},
{
"episode": 18560,
"epoch": 3.5025476505000945,
"eps": 0,
"loss/policy_avg": -0.0007028168765828013,
"loss/value_avg": 0.0038722134195268154,
"lr": 4.040404040404041e-08,
"objective/entropy": -688.8043212890625,
"objective/kl": 7.602431297302246,
"objective/non_score_reward": -0.2280729115009308,
"objective/rlhf_reward": 0.1908724009990692,
"objective/scores": 0.41796875,
"policy/approxkl_avg": 0.00037353829247877,
"policy/clipfrac_avg": 0.007045034319162369,
"policy/entropy_avg": 0.19284312427043915,
"step": 290,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 42,
"val/ratio": 0.9999067187309265,
"val/ratio_var": 6.341031166812172e-07
},
{
"episode": 18624,
"epoch": 3.5146254010190603,
"eps": 0,
"loss/policy_avg": -0.013677339069545269,
"loss/value_avg": 0.0034771780483424664,
"lr": 3.872053872053872e-08,
"objective/entropy": -623.31884765625,
"objective/kl": 6.28436803817749,
"objective/non_score_reward": -0.1885310411453247,
"objective/rlhf_reward": 0.1708439588546753,
"objective/scores": 0.359375,
"policy/approxkl_avg": 0.00041524547850713134,
"policy/clipfrac_avg": 0.006799499504268169,
"policy/entropy_avg": 0.182159423828125,
"step": 291,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 35,
"val/ratio": 1.0000431537628174,
"val/ratio_var": 6.771080052203615e-07
},
{
"episode": 18688,
"epoch": 3.526703151538026,
"eps": 0,
"loss/policy_avg": -0.02629309892654419,
"loss/value_avg": 0.003640729933977127,
"lr": 3.7037037037037036e-08,
"objective/entropy": -668.0806884765625,
"objective/kl": 7.632940292358398,
"objective/non_score_reward": -0.22898820042610168,
"objective/rlhf_reward": 0.24610945582389832,
"objective/scores": 0.474609375,
"policy/approxkl_avg": 0.00038289197254925966,
"policy/clipfrac_avg": 0.0064846850000321865,
"policy/entropy_avg": 0.19228872656822205,
"step": 292,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 45,
"val/ratio": 1.0000791549682617,
"val/ratio_var": 6.841331696705311e-07
},
{
"episode": 18752,
"epoch": 3.538780902056992,
"eps": 0,
"loss/policy_avg": -0.012860393151640892,
"loss/value_avg": 0.004311088938266039,
"lr": 3.5353535353535353e-08,
"objective/entropy": -699.3877563476562,
"objective/kl": 6.807313919067383,
"objective/non_score_reward": -0.20421940088272095,
"objective/rlhf_reward": 0.27234309911727905,
"objective/scores": 0.4765625,
"policy/approxkl_avg": 0.00037193228490650654,
"policy/clipfrac_avg": 0.006834262516349554,
"policy/entropy_avg": 0.17558543384075165,
"step": 293,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 40,
"val/ratio": 0.9999904632568359,
"val/ratio_var": 5.597553922598308e-07
},
{
"episode": 18816,
"epoch": 3.5508586525759576,
"eps": 0,
"loss/policy_avg": -0.03740035742521286,
"loss/value_avg": 0.003467664122581482,
"lr": 3.367003367003367e-08,
"objective/entropy": -682.0465698242188,
"objective/kl": 7.4768147468566895,
"objective/non_score_reward": -0.2243044376373291,
"objective/rlhf_reward": 0.2590939998626709,
"objective/scores": 0.484375,
"policy/approxkl_avg": 0.0004095996846444905,
"policy/clipfrac_avg": 0.006977352779358625,
"policy/entropy_avg": 0.19226329028606415,
"step": 294,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 39,
"val/ratio": 0.9999417066574097,
"val/ratio_var": 6.161255896586226e-07
},
{
"episode": 18880,
"epoch": 3.5629364030949238,
"eps": 0,
"loss/policy_avg": -0.02602003514766693,
"loss/value_avg": 0.003619457595050335,
"lr": 3.198653198653199e-08,
"objective/entropy": -725.4286499023438,
"objective/kl": 6.233606338500977,
"objective/non_score_reward": -0.18700820207595825,
"objective/rlhf_reward": 0.21924179792404175,
"objective/scores": 0.40625,
"policy/approxkl_avg": 0.00038076151395216584,
"policy/clipfrac_avg": 0.006789907813072205,
"policy/entropy_avg": 0.1863047331571579,
"step": 295,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 44,
"val/ratio": 1.0000081062316895,
"val/ratio_var": 6.039576874172781e-07
},
{
"episode": 18944,
"epoch": 3.5750141536138895,
"eps": 0,
"loss/policy_avg": -0.02581647038459778,
"loss/value_avg": 0.003386137541383505,
"lr": 3.0303030303030305e-08,
"objective/entropy": -664.8191528320312,
"objective/kl": 7.328958988189697,
"objective/non_score_reward": -0.21986877918243408,
"objective/rlhf_reward": 0.17368590831756592,
"objective/scores": 0.39453125,
"policy/approxkl_avg": 0.0003837857802864164,
"policy/clipfrac_avg": 0.006847723387181759,
"policy/entropy_avg": 0.2102610319852829,
"step": 296,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 48,
"val/ratio": 0.9997448921203613,
"val/ratio_var": 6.460118697759754e-07
},
{
"episode": 19008,
"epoch": 3.5870919041328553,
"eps": 0,
"loss/policy_avg": -0.01845662109553814,
"loss/value_avg": 0.0036777183413505554,
"lr": 2.861952861952862e-08,
"objective/entropy": -644.3194580078125,
"objective/kl": 6.178158760070801,
"objective/non_score_reward": -0.18534475564956665,
"objective/rlhf_reward": 0.24483102560043335,
"objective/scores": 0.4296875,
"policy/approxkl_avg": 0.0003848365449812263,
"policy/clipfrac_avg": 0.007219640072435141,
"policy/entropy_avg": 0.18966802954673767,
"step": 297,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 34,
"val/ratio": 1.0000991821289062,
"val/ratio_var": 4.2051803461617965e-07
},
{
"episode": 19072,
"epoch": 3.599169654651821,
"eps": 0,
"loss/policy_avg": -0.03636598959565163,
"loss/value_avg": 0.0033596050925552845,
"lr": 2.6936026936026933e-08,
"objective/entropy": -675.494384765625,
"objective/kl": 6.287810802459717,
"objective/non_score_reward": -0.18863432109355927,
"objective/rlhf_reward": 0.34456878900527954,
"objective/scores": 0.53125,
"policy/approxkl_avg": 0.000387437641620636,
"policy/clipfrac_avg": 0.006644147448241711,
"policy/entropy_avg": 0.2019907683134079,
"step": 298,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 45,
"val/ratio": 0.9999553561210632,
"val/ratio_var": 5.573217549681431e-07
},
{
"episode": 19136,
"epoch": 3.611247405170787,
"eps": 0,
"loss/policy_avg": -0.00651153177022934,
"loss/value_avg": 0.003799569560214877,
"lr": 2.525252525252525e-08,
"objective/entropy": -662.1578369140625,
"objective/kl": 6.4900946617126465,
"objective/non_score_reward": -0.1947028487920761,
"objective/rlhf_reward": 0.1832268387079239,
"objective/scores": 0.37890625,
"policy/approxkl_avg": 0.0003824663581326604,
"policy/clipfrac_avg": 0.006935178767889738,
"policy/entropy_avg": 0.18524932861328125,
"step": 299,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 0.9999604225158691,
"val/ratio_var": 5.727119400944503e-07
},
{
"episode": 19200,
"epoch": 3.623325155689753,
"eps": 0,
"loss/policy_avg": 4.80624566989718e-06,
"loss/value_avg": 0.003957442473620176,
"lr": 2.3569023569023567e-08,
"objective/entropy": -613.0647583007812,
"objective/kl": 7.1408891677856445,
"objective/non_score_reward": -0.21422669291496277,
"objective/rlhf_reward": 0.14368346333503723,
"objective/scores": 0.357421875,
"policy/approxkl_avg": 0.0004018023028038442,
"policy/clipfrac_avg": 0.007539949379861355,
"policy/entropy_avg": 0.18967437744140625,
"step": 300,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 1.0000495910644531,
"val/ratio_var": 7.146343250497011e-07
},
{
"episode": 19264,
"epoch": 3.6354029062087188,
"eps": 0,
"loss/policy_avg": -0.019861344248056412,
"loss/value_avg": 0.0036555491387844086,
"lr": 2.1885521885521884e-08,
"objective/entropy": -695.1436767578125,
"objective/kl": 5.617988586425781,
"objective/non_score_reward": -0.16853967308998108,
"objective/rlhf_reward": 0.2938627004623413,
"objective/scores": 0.462890625,
"policy/approxkl_avg": 0.00036370521411299706,
"policy/clipfrac_avg": 0.00675880815833807,
"policy/entropy_avg": 0.19301223754882812,
"step": 301,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 53,
"val/ratio": 0.9999930262565613,
"val/ratio_var": 4.901943952972942e-07
},
{
"episode": 19328,
"epoch": 3.6474806567276845,
"eps": 0,
"loss/policy_avg": 6.916312031535199e-06,
"loss/value_avg": 0.003553304821252823,
"lr": 2.0202020202020204e-08,
"objective/entropy": -603.779541015625,
"objective/kl": 7.373934745788574,
"objective/non_score_reward": -0.2212180495262146,
"objective/rlhf_reward": 0.1664772629737854,
"objective/scores": 0.38671875,
"policy/approxkl_avg": 0.0004275983665138483,
"policy/clipfrac_avg": 0.006720641162246466,
"policy/entropy_avg": 0.20763906836509705,
"step": 302,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 42,
"val/ratio": 1.0000828504562378,
"val/ratio_var": 5.606044624073547e-07
},
{
"episode": 19392,
"epoch": 3.6595584072466503,
"eps": 0,
"loss/policy_avg": -0.016103968024253845,
"loss/value_avg": 0.0041246358305215836,
"lr": 1.8518518518518518e-08,
"objective/entropy": -669.2377319335938,
"objective/kl": 5.863403797149658,
"objective/non_score_reward": -0.1759021282196045,
"objective/rlhf_reward": 0.2698986530303955,
"objective/scores": 0.4453125,
"policy/approxkl_avg": 0.00045897456584498286,
"policy/clipfrac_avg": 0.005576698109507561,
"policy/entropy_avg": 0.17554092407226562,
"step": 303,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 45,
"val/ratio": 1.0000157356262207,
"val/ratio_var": 6.06298158345453e-07
},
{
"episode": 19456,
"epoch": 3.671636157765616,
"eps": 0,
"loss/policy_avg": -0.014837692491710186,
"loss/value_avg": 0.0034644536208361387,
"lr": 1.6835016835016835e-08,
"objective/entropy": -638.3107299804688,
"objective/kl": 6.990595817565918,
"objective/non_score_reward": -0.20971786975860596,
"objective/rlhf_reward": 0.19067275524139404,
"objective/scores": 0.400390625,
"policy/approxkl_avg": 0.00039926738827489316,
"policy/clipfrac_avg": 0.0072073861956596375,
"policy/entropy_avg": 0.19615554809570312,
"step": 304,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 55,
"val/ratio": 0.9998650550842285,
"val/ratio_var": 6.70040776640235e-07
},
{
"episode": 19520,
"epoch": 3.683713908284582,
"eps": 0,
"loss/policy_avg": -0.03533481806516647,
"loss/value_avg": 0.003643455682322383,
"lr": 1.5151515151515152e-08,
"objective/entropy": -708.4891357421875,
"objective/kl": 6.058835506439209,
"objective/non_score_reward": -0.18176504969596863,
"objective/rlhf_reward": 0.36803963780403137,
"objective/scores": 0.55078125,
"policy/approxkl_avg": 0.00035262128221802413,
"policy/clipfrac_avg": 0.006895636674016714,
"policy/entropy_avg": 0.18742243945598602,
"step": 305,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 49,
"val/ratio": 1.0000662803649902,
"val/ratio_var": 4.963605420016393e-07
},
{
"episode": 19584,
"epoch": 3.695791658803548,
"eps": 0,
"loss/policy_avg": -0.02189006470143795,
"loss/value_avg": 0.0033562961034476757,
"lr": 1.3468013468013466e-08,
"objective/entropy": -635.7755737304688,
"objective/kl": 7.733546257019043,
"objective/non_score_reward": -0.23200638592243195,
"objective/rlhf_reward": 0.15861861407756805,
"objective/scores": 0.390625,
"policy/approxkl_avg": 0.00040599549538455904,
"policy/clipfrac_avg": 0.006881616078317165,
"policy/entropy_avg": 0.19137954711914062,
"step": 306,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 50,
"val/ratio": 1.0000026226043701,
"val/ratio_var": 6.138251933407446e-07
},
{
"episode": 19648,
"epoch": 3.7078694093225137,
"eps": 0,
"loss/policy_avg": -0.03934434801340103,
"loss/value_avg": 0.004052530974149704,
"lr": 1.1784511784511783e-08,
"objective/entropy": -738.7037353515625,
"objective/kl": 6.419657230377197,
"objective/non_score_reward": -0.19258970022201538,
"objective/rlhf_reward": 0.3157110810279846,
"objective/scores": 0.5078125,
"policy/approxkl_avg": 0.0003348543250467628,
"policy/clipfrac_avg": 0.006614835001528263,
"policy/entropy_avg": 0.1747385710477829,
"step": 307,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 50,
"val/ratio": 1.0000545978546143,
"val/ratio_var": 4.5710487484029727e-07
},
{
"episode": 19712,
"epoch": 3.7199471598414795,
"eps": 0,
"loss/policy_avg": -0.01892966404557228,
"loss/value_avg": 0.0036056172102689743,
"lr": 1.0101010101010102e-08,
"objective/entropy": -665.5927124023438,
"objective/kl": 6.821819305419922,
"objective/non_score_reward": -0.20465457439422607,
"objective/rlhf_reward": 0.2562829256057739,
"objective/scores": 0.4609375,
"policy/approxkl_avg": 0.0003650089493021369,
"policy/clipfrac_avg": 0.006426130421459675,
"policy/entropy_avg": 0.19108709692955017,
"step": 308,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 46,
"val/ratio": 0.9999858736991882,
"val/ratio_var": 4.877447850049066e-07
},
{
"episode": 19776,
"epoch": 3.7320249103604453,
"eps": 0,
"loss/policy_avg": -0.012006578966975212,
"loss/value_avg": 0.003470144933089614,
"lr": 8.417508417508418e-09,
"objective/entropy": -619.7354736328125,
"objective/kl": 8.15022087097168,
"objective/non_score_reward": -0.2445066124200821,
"objective/rlhf_reward": 0.11438010632991791,
"objective/scores": 0.359375,
"policy/approxkl_avg": 0.00041244737803936005,
"policy/clipfrac_avg": 0.007321036886423826,
"policy/entropy_avg": 0.19420623779296875,
"step": 309,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 42,
"val/ratio": 1.000077486038208,
"val/ratio_var": 6.490017199212161e-07
},
{
"episode": 19840,
"epoch": 3.744102660879411,
"eps": 0,
"loss/policy_avg": -0.015695635229349136,
"loss/value_avg": 0.00352904899045825,
"lr": 6.734006734006733e-09,
"objective/entropy": -641.0952758789062,
"objective/kl": 7.442835330963135,
"objective/non_score_reward": -0.22328504920005798,
"objective/rlhf_reward": 0.20689073204994202,
"objective/scores": 0.4296875,
"policy/approxkl_avg": 0.00039796438068151474,
"policy/clipfrac_avg": 0.007473438512533903,
"policy/entropy_avg": 0.19350814819335938,
"step": 310,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 43,
"val/ratio": 0.9999562501907349,
"val/ratio_var": 4.872820227319608e-07
},
{
"episode": 19904,
"epoch": 3.756180411398377,
"eps": 0,
"loss/policy_avg": 0.0016891969135031104,
"loss/value_avg": 0.003535608295351267,
"lr": 5.050505050505051e-09,
"objective/entropy": -627.7642822265625,
"objective/kl": 6.384559631347656,
"objective/non_score_reward": -0.1915367841720581,
"objective/rlhf_reward": 0.1114417240023613,
"objective/scores": 0.302734375,
"policy/approxkl_avg": 0.0004028166295029223,
"policy/clipfrac_avg": 0.006718785967677832,
"policy/entropy_avg": 0.2012532651424408,
"step": 311,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 45,
"val/ratio": 1.0000300407409668,
"val/ratio_var": 6.633476914430503e-07
},
{
"episode": 19968,
"epoch": 3.768258161917343,
"eps": 0,
"loss/policy_avg": -0.012200575321912766,
"loss/value_avg": 0.003601629287004471,
"lr": 3.3670033670033666e-09,
"objective/entropy": -709.2785034179688,
"objective/kl": 6.045020580291748,
"objective/non_score_reward": -0.18135061860084534,
"objective/rlhf_reward": 0.14872750639915466,
"objective/scores": 0.330078125,
"policy/approxkl_avg": 0.00035006398684345186,
"policy/clipfrac_avg": 0.006922123488038778,
"policy/entropy_avg": 0.1940714567899704,
"step": 312,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 52,
"val/ratio": 0.9998501539230347,
"val/ratio_var": 5.097400048725831e-07
},
{
"episode": 20032,
"epoch": 3.7803359124363087,
"eps": 0,
"loss/policy_avg": -0.005085974466055632,
"loss/value_avg": 0.003475761041045189,
"lr": 1.6835016835016833e-09,
"objective/entropy": -692.533203125,
"objective/kl": 6.632614612579346,
"objective/non_score_reward": -0.19897842407226562,
"objective/rlhf_reward": 0.22387313842773438,
"objective/scores": 0.421875,
"policy/approxkl_avg": 0.000355798052623868,
"policy/clipfrac_avg": 0.006758556701242924,
"policy/entropy_avg": 0.17910131812095642,
"step": 313,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 48,
"val/ratio": 1.0001018047332764,
"val/ratio_var": 6.0731622397725e-07
}
],
"logging_steps": 500,
"max_steps": 313,
"num_input_tokens_seen": 0,
"num_train_epochs": 3.774297037176826,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": true,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0,
"train_batch_size": null,
"trial_name": null,
"trial_params": null
}