super-voice-875 / checkpoint-582 /trainer_state.json
Evan-Lin's picture
Training in progress, step 582, checkpoint
c634f88 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"episode": 9312,
"epoch": 0.2554452186317002,
"eval_steps": 500,
"global_step": 582,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"episode": 16,
"epoch": 0.00043890931036374606,
"loss/policy_avg": -0.012268156744539738,
"lr": 3e-05,
"objective/entropy": 167.71548461914062,
"objective/kl": 0.21667994558811188,
"objective/non_score_reward": -0.021668005734682083,
"objective/rlhf_reward": 4.313327980786562,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.7804256677627563,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.7981860041618347,
"step": 0,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 1,
"val/ratio": 2.002265691757202
},
{
"episode": 32,
"epoch": 0.0008778186207274921,
"loss/policy_avg": 0.28600460290908813,
"lr": 2.9973684210526316e-05,
"objective/entropy": 178.8330535888672,
"objective/kl": 12.701642990112305,
"objective/non_score_reward": -1.2701644897460938,
"objective/rlhf_reward": -0.6806579291820523,
"objective/scores": 1.1,
"policy/approxkl_avg": 300.53009033203125,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7146598100662231,
"step": 1,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 2.010622978210449
},
{
"episode": 48,
"epoch": 0.0013167279310912384,
"loss/policy_avg": -0.22723183035850525,
"lr": 2.994736842105263e-05,
"objective/entropy": -15.617034912109375,
"objective/kl": 6.93002986907959,
"objective/non_score_reward": -0.6930029392242432,
"objective/rlhf_reward": -2.37201172709465,
"objective/scores": 0.1,
"policy/approxkl_avg": 79.79334259033203,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7464854717254639,
"step": 2,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 1.9983209371566772
},
{
"episode": 64,
"epoch": 0.0017556372414549843,
"loss/policy_avg": 0.8261077404022217,
"lr": 2.992105263157895e-05,
"objective/entropy": 117.23881530761719,
"objective/kl": 12.038005828857422,
"objective/non_score_reward": -1.2038006782531738,
"objective/rlhf_reward": -0.4152026534080502,
"objective/scores": 1.1,
"policy/approxkl_avg": 105.13363647460938,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.49201399087905884,
"step": 3,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 1.9965429306030273
},
{
"episode": 80,
"epoch": 0.0021945465518187304,
"loss/policy_avg": 0.11680221557617188,
"lr": 2.9894736842105264e-05,
"objective/entropy": 126.83998107910156,
"objective/kl": 22.160707473754883,
"objective/non_score_reward": -2.2160706520080566,
"objective/rlhf_reward": -4.46428314447403,
"objective/scores": 1.1,
"policy/approxkl_avg": 121.82243347167969,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.680046796798706,
"step": 4,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 1.9965834617614746
},
{
"episode": 96,
"epoch": 0.0026334558621824767,
"loss/policy_avg": 0.6160796284675598,
"lr": 2.986842105263158e-05,
"objective/entropy": 109.90502166748047,
"objective/kl": 18.486812591552734,
"objective/non_score_reward": -1.8486812114715576,
"objective/rlhf_reward": -2.9947244882583615,
"objective/scores": 1.1,
"policy/approxkl_avg": 164.01425170898438,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6090853810310364,
"step": 5,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 1.9926069974899292
},
{
"episode": 112,
"epoch": 0.0030723651725462226,
"loss/policy_avg": 0.4473002552986145,
"lr": 2.9842105263157894e-05,
"objective/entropy": 171.5350341796875,
"objective/kl": 17.367782592773438,
"objective/non_score_reward": -1.7367782592773438,
"objective/rlhf_reward": -6.547113275527954,
"objective/scores": 0.1,
"policy/approxkl_avg": 34.7076301574707,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.65798020362854,
"step": 6,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 1.9968626499176025
},
{
"episode": 128,
"epoch": 0.0035112744829099685,
"loss/policy_avg": 0.19109274446964264,
"lr": 2.9815789473684212e-05,
"objective/entropy": 134.9733428955078,
"objective/kl": 25.905290603637695,
"objective/non_score_reward": -2.590528964996338,
"objective/rlhf_reward": -8.414705256895957,
"objective/scores": 0.4868528072345416,
"policy/approxkl_avg": 122.29130554199219,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.5049231052398682,
"step": 7,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 1.9954898357391357
},
{
"episode": 144,
"epoch": 0.003950183793273714,
"loss/policy_avg": -0.12436092644929886,
"lr": 2.9789473684210527e-05,
"objective/entropy": 199.00608825683594,
"objective/kl": 45.45051574707031,
"objective/non_score_reward": -4.545051574707031,
"objective/rlhf_reward": -13.78020534515381,
"objective/scores": 1.1,
"policy/approxkl_avg": 3.8954527378082275,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.7475897073745728,
"step": 8,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 2.000213623046875
},
{
"episode": 160,
"epoch": 0.004389093103637461,
"loss/policy_avg": 0.7675253748893738,
"lr": 2.9763157894736842e-05,
"objective/entropy": 142.65106201171875,
"objective/kl": 58.533485412597656,
"objective/non_score_reward": -5.853349208831787,
"objective/rlhf_reward": -23.01339683532715,
"objective/scores": 0.1,
"policy/approxkl_avg": 63.6219596862793,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.5712227821350098,
"step": 9,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 1.9968998432159424
},
{
"episode": 176,
"epoch": 0.004828002414001207,
"loss/policy_avg": 0.6077067852020264,
"lr": 2.9736842105263157e-05,
"objective/entropy": 187.40093994140625,
"objective/kl": 55.569950103759766,
"objective/non_score_reward": -5.556994915008545,
"objective/rlhf_reward": -17.82797966003418,
"objective/scores": 1.1,
"policy/approxkl_avg": 6.850739002227783,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.7330949902534485,
"step": 10,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 1.9972755908966064
},
{
"episode": 192,
"epoch": 0.005266911724364953,
"loss/policy_avg": 0.9427204132080078,
"lr": 2.9710526315789472e-05,
"objective/entropy": 169.49903869628906,
"objective/kl": 43.81541442871094,
"objective/non_score_reward": -4.3815412521362305,
"objective/rlhf_reward": -19.526166915893555,
"objective/scores": -0.5,
"policy/approxkl_avg": 1.3010352849960327,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.637176513671875,
"step": 11,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 2.003171920776367
},
{
"episode": 208,
"epoch": 0.005705821034728699,
"loss/policy_avg": 0.3998699188232422,
"lr": 2.968421052631579e-05,
"objective/entropy": 136.76266479492188,
"objective/kl": 48.54095458984375,
"objective/non_score_reward": -4.854095935821533,
"objective/rlhf_reward": -15.016384339332582,
"objective/scores": 1.1,
"policy/approxkl_avg": 33.5024528503418,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.5481820106506348,
"step": 12,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 1.9996936321258545
},
{
"episode": 224,
"epoch": 0.006144730345092445,
"loss/policy_avg": 0.1956150382757187,
"lr": 2.9657894736842106e-05,
"objective/entropy": 136.42724609375,
"objective/kl": 57.16869354248047,
"objective/non_score_reward": -5.716869354248047,
"objective/rlhf_reward": -18.46747741699219,
"objective/scores": 1.1,
"policy/approxkl_avg": 7.101135730743408,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.5565441250801086,
"step": 13,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 1,
"val/ratio": 2.000586748123169
},
{
"episode": 240,
"epoch": 0.0065836396554561916,
"loss/policy_avg": 1.4479000568389893,
"lr": 2.963157894736842e-05,
"objective/entropy": 180.2125244140625,
"objective/kl": 33.086238861083984,
"objective/non_score_reward": -3.308624029159546,
"objective/rlhf_reward": -15.234495162963867,
"objective/scores": -0.5,
"policy/approxkl_avg": 3.269646406173706,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.726157546043396,
"step": 14,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 2.0014841556549072
},
{
"episode": 256,
"epoch": 0.007022548965819937,
"loss/policy_avg": -0.2384345531463623,
"lr": 2.9605263157894735e-05,
"objective/entropy": 148.7078399658203,
"objective/kl": 47.15727996826172,
"objective/non_score_reward": -4.715727806091309,
"objective/rlhf_reward": -16.740206422583135,
"objective/scores": 0.5306765580733931,
"policy/approxkl_avg": 148.09915161132812,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.5780054330825806,
"step": 15,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 1.997875690460205
},
{
"episode": 272,
"epoch": 0.007461458276183683,
"loss/policy_avg": 0.8147653341293335,
"lr": 2.957894736842105e-05,
"objective/entropy": 171.16403198242188,
"objective/kl": 43.11994934082031,
"objective/non_score_reward": -4.311994552612305,
"objective/rlhf_reward": -19.24797821044922,
"objective/scores": -0.5,
"policy/approxkl_avg": 8.860128402709961,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6768981218338013,
"step": 16,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 1.9993420839309692
},
{
"episode": 288,
"epoch": 0.007900367586547429,
"loss/policy_avg": 0.5893005728721619,
"lr": 2.955263157894737e-05,
"objective/entropy": 123.02507781982422,
"objective/kl": 49.891517639160156,
"objective/non_score_reward": -4.9891510009765625,
"objective/rlhf_reward": -17.032885228039,
"objective/scores": 0.7309297535714575,
"policy/approxkl_avg": 6.687857151031494,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.4659116864204407,
"step": 17,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 2.002612352371216
},
{
"episode": 304,
"epoch": 0.008339276896911175,
"loss/policy_avg": 2.1211345195770264,
"lr": 2.9526315789473684e-05,
"objective/entropy": -174.25625610351562,
"objective/kl": 35.093326568603516,
"objective/non_score_reward": -3.5093321800231934,
"objective/rlhf_reward": -16.037328720092773,
"objective/scores": -0.5,
"policy/approxkl_avg": 1.4923981428146362,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.6755027770996094,
"step": 18,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 1.9988504648208618
},
{
"episode": 320,
"epoch": 0.008778186207274922,
"loss/policy_avg": 4.73185396194458,
"lr": 2.95e-05,
"objective/entropy": 155.82408142089844,
"objective/kl": 64.35771179199219,
"objective/non_score_reward": -6.4357709884643555,
"objective/rlhf_reward": -25.343085741996767,
"objective/scores": 0.1,
"policy/approxkl_avg": 19.07648468017578,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6366579532623291,
"step": 19,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 1,
"val/ratio": 2.0000834465026855
},
{
"episode": 336,
"epoch": 0.009217095517638668,
"loss/policy_avg": 0.7567238211631775,
"lr": 2.9473684210526314e-05,
"objective/entropy": 134.41641235351562,
"objective/kl": 29.073949813842773,
"objective/non_score_reward": -2.9073948860168457,
"objective/rlhf_reward": -13.629579544067383,
"objective/scores": -0.5,
"policy/approxkl_avg": 550.8251953125,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.5258337259292603,
"step": 20,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 1.9921021461486816
},
{
"episode": 352,
"epoch": 0.009656004828002414,
"loss/policy_avg": 0.7278516888618469,
"lr": 2.9447368421052635e-05,
"objective/entropy": 156.93792724609375,
"objective/kl": 41.105464935302734,
"objective/non_score_reward": -4.110546588897705,
"objective/rlhf_reward": -18.44218635559082,
"objective/scores": -0.5,
"policy/approxkl_avg": 242.4786376953125,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6374942064285278,
"step": 21,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 1.9990363121032715
},
{
"episode": 368,
"epoch": 0.01009491413836616,
"loss/policy_avg": -0.12158381938934326,
"lr": 2.942105263157895e-05,
"objective/entropy": 162.37814331054688,
"objective/kl": 47.4278450012207,
"objective/non_score_reward": -4.74278450012207,
"objective/rlhf_reward": -20.97113800048828,
"objective/scores": -0.5,
"policy/approxkl_avg": 1.3796932697296143,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6018426418304443,
"step": 22,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 2.00181245803833
},
{
"episode": 384,
"epoch": 0.010533823448729907,
"loss/policy_avg": 1.9867658615112305,
"lr": 2.9394736842105265e-05,
"objective/entropy": 164.5052947998047,
"objective/kl": 66.38179779052734,
"objective/non_score_reward": -6.638179779052734,
"objective/rlhf_reward": -22.1527184009552,
"objective/scores": 1.1,
"policy/approxkl_avg": 35.980594635009766,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.6241673231124878,
"step": 23,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 1.9957327842712402
},
{
"episode": 400,
"epoch": 0.010972732759093651,
"loss/policy_avg": -0.5419085025787354,
"lr": 2.936842105263158e-05,
"objective/entropy": 145.57208251953125,
"objective/kl": 43.03409957885742,
"objective/non_score_reward": -4.303410053253174,
"objective/rlhf_reward": -15.551780229032623,
"objective/scores": 0.41546487678572874,
"policy/approxkl_avg": 163.27047729492188,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.547484815120697,
"step": 24,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 2.0120882987976074
},
{
"episode": 416,
"epoch": 0.011411642069457398,
"loss/policy_avg": -0.24992617964744568,
"lr": 2.9342105263157895e-05,
"objective/entropy": 127.15754699707031,
"objective/kl": 41.97152328491211,
"objective/non_score_reward": -4.197152137756348,
"objective/rlhf_reward": -18.78860855102539,
"objective/scores": -0.5,
"policy/approxkl_avg": 3.1116750240325928,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.5002591609954834,
"step": 25,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 2.001451015472412
},
{
"episode": 432,
"epoch": 0.011850551379821144,
"loss/policy_avg": 1.5105552673339844,
"lr": 2.9315789473684214e-05,
"objective/entropy": 197.1464080810547,
"objective/kl": 56.54367446899414,
"objective/non_score_reward": -5.654367923736572,
"objective/rlhf_reward": -24.61747169494629,
"objective/scores": -0.5,
"policy/approxkl_avg": 3.033173084259033,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7858256101608276,
"step": 26,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 1.9994834661483765
},
{
"episode": 448,
"epoch": 0.01228946069018489,
"loss/policy_avg": 1.556326985359192,
"lr": 2.928947368421053e-05,
"objective/entropy": 142.51620483398438,
"objective/kl": 45.598716735839844,
"objective/non_score_reward": -4.559871673583984,
"objective/rlhf_reward": -13.83948621749878,
"objective/scores": 1.1,
"policy/approxkl_avg": 282.253173828125,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.6140550374984741,
"step": 27,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 1.998161792755127
},
{
"episode": 464,
"epoch": 0.012728370000548637,
"loss/policy_avg": 0.05400470644235611,
"lr": 2.9263157894736844e-05,
"objective/entropy": 133.95001220703125,
"objective/kl": 27.514503479003906,
"objective/non_score_reward": -2.751450538635254,
"objective/rlhf_reward": -13.005802154541016,
"objective/scores": -0.5,
"policy/approxkl_avg": 1.0789940357208252,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.6318522691726685,
"step": 28,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 1,
"val/ratio": 2.0006635189056396
},
{
"episode": 480,
"epoch": 0.013167279310912383,
"loss/policy_avg": -0.2949943244457245,
"lr": 2.923684210526316e-05,
"objective/entropy": 147.1396942138672,
"objective/kl": 54.152565002441406,
"objective/non_score_reward": -5.415256500244141,
"objective/rlhf_reward": -21.261026477813722,
"objective/scores": 0.1,
"policy/approxkl_avg": 14.0246000289917,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.5865544676780701,
"step": 29,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 2.000847339630127
},
{
"episode": 496,
"epoch": 0.01360618862127613,
"loss/policy_avg": 0.7968569397926331,
"lr": 2.9210526315789474e-05,
"objective/entropy": 171.741943359375,
"objective/kl": 41.975685119628906,
"objective/non_score_reward": -4.197568893432617,
"objective/rlhf_reward": -18.79027557373047,
"objective/scores": -0.5,
"policy/approxkl_avg": 186.8196563720703,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6779192090034485,
"step": 30,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 2.001958131790161
},
{
"episode": 512,
"epoch": 0.014045097931639874,
"loss/policy_avg": 0.456326961517334,
"lr": 2.9184210526315792e-05,
"objective/entropy": 131.41111755371094,
"objective/kl": 44.016998291015625,
"objective/non_score_reward": -4.401699542999268,
"objective/rlhf_reward": -13.206798887252809,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.3007960319519043,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6488356590270996,
"step": 31,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 1,
"val/ratio": 1.9991300106048584
},
{
"episode": 528,
"epoch": 0.01448400724200362,
"loss/policy_avg": 0.5430006980895996,
"lr": 2.9157894736842107e-05,
"objective/entropy": 190.74668884277344,
"objective/kl": 42.17557907104492,
"objective/non_score_reward": -4.217557907104492,
"objective/rlhf_reward": -18.87023162841797,
"objective/scores": -0.5,
"policy/approxkl_avg": 224.27468872070312,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.6937891840934753,
"step": 32,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 1,
"val/ratio": 1.999834656715393
},
{
"episode": 544,
"epoch": 0.014922916552367367,
"loss/policy_avg": 0.9642138481140137,
"lr": 2.9131578947368422e-05,
"objective/entropy": 192.7371826171875,
"objective/kl": 72.31754302978516,
"objective/non_score_reward": -7.231754302978516,
"objective/rlhf_reward": -24.527017211914064,
"objective/scores": 1.1,
"policy/approxkl_avg": 4.005494117736816,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7295612692832947,
"step": 33,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 1.9996287822723389
},
{
"episode": 560,
"epoch": 0.015361825862731113,
"loss/policy_avg": 0.09143030643463135,
"lr": 2.9105263157894737e-05,
"objective/entropy": 226.00430297851562,
"objective/kl": 47.70861053466797,
"objective/non_score_reward": -4.7708611488342285,
"objective/rlhf_reward": -21.083444595336914,
"objective/scores": -0.5,
"policy/approxkl_avg": 8.21639633178711,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.8707884550094604,
"step": 34,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 2.0034539699554443
},
{
"episode": 576,
"epoch": 0.015800735173094858,
"loss/policy_avg": 0.8646840453147888,
"lr": 2.9078947368421055e-05,
"objective/entropy": 120.56684875488281,
"objective/kl": 43.689491271972656,
"objective/non_score_reward": -4.3689494132995605,
"objective/rlhf_reward": -19.475799560546875,
"objective/scores": -0.5,
"policy/approxkl_avg": 3.5479981899261475,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6465896964073181,
"step": 35,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 1.9970377683639526
},
{
"episode": 592,
"epoch": 0.016239644483458604,
"loss/policy_avg": -0.38367098569869995,
"lr": 2.905263157894737e-05,
"objective/entropy": 154.9678955078125,
"objective/kl": 35.52610397338867,
"objective/non_score_reward": -3.5526108741760254,
"objective/rlhf_reward": -9.810442781448366,
"objective/scores": 1.1,
"policy/approxkl_avg": 4.522809982299805,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.6246684789657593,
"step": 36,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 1,
"val/ratio": 2.0017809867858887
},
{
"episode": 608,
"epoch": 0.01667855379382235,
"loss/policy_avg": 0.15106165409088135,
"lr": 2.9026315789473685e-05,
"objective/entropy": 146.68177795410156,
"objective/kl": 53.049625396728516,
"objective/non_score_reward": -5.304962635040283,
"objective/rlhf_reward": -16.819850540161134,
"objective/scores": 1.1,
"policy/approxkl_avg": 3.3007020950317383,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7446657419204712,
"step": 37,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 2.0011558532714844
},
{
"episode": 624,
"epoch": 0.017117463104186097,
"loss/policy_avg": -0.546560525894165,
"lr": 2.9e-05,
"objective/entropy": 173.34613037109375,
"objective/kl": 52.43684005737305,
"objective/non_score_reward": -5.243683815002441,
"objective/rlhf_reward": -22.974735260009766,
"objective/scores": -0.5,
"policy/approxkl_avg": 169.8699188232422,
"policy/clipfrac_avg": 1.75,
"policy/entropy_avg": 0.6890407800674438,
"step": 38,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 1,
"val/ratio": 2.007899761199951
},
{
"episode": 640,
"epoch": 0.017556372414549843,
"loss/policy_avg": 0.015053331851959229,
"lr": 2.8973684210526315e-05,
"objective/entropy": 173.69979858398438,
"objective/kl": 40.57459259033203,
"objective/non_score_reward": -4.057459354400635,
"objective/rlhf_reward": -18.22983741760254,
"objective/scores": -0.5,
"policy/approxkl_avg": 7.52875280380249,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.9174096584320068,
"step": 39,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 1.9973030090332031
},
{
"episode": 656,
"epoch": 0.01799528172491359,
"loss/policy_avg": -0.5114358067512512,
"lr": 2.8947368421052634e-05,
"objective/entropy": -104.62008666992188,
"objective/kl": 22.244895935058594,
"objective/non_score_reward": -2.224490165710449,
"objective/rlhf_reward": -4.49795994758606,
"objective/scores": 1.1,
"policy/approxkl_avg": 138.92034912109375,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7481294274330139,
"step": 40,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.997785210609436
},
{
"episode": 672,
"epoch": 0.018434191035277336,
"loss/policy_avg": 0.7354981899261475,
"lr": 2.892105263157895e-05,
"objective/entropy": 135.70901489257812,
"objective/kl": 39.70330047607422,
"objective/non_score_reward": -3.970329999923706,
"objective/rlhf_reward": -15.481319880485536,
"objective/scores": 0.1,
"policy/approxkl_avg": 192.59478759765625,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7106625437736511,
"step": 41,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 1.9986684322357178
},
{
"episode": 688,
"epoch": 0.018873100345641082,
"loss/policy_avg": -0.014256155118346214,
"lr": 2.8894736842105263e-05,
"objective/entropy": 128.75439453125,
"objective/kl": 49.399986267089844,
"objective/non_score_reward": -4.939998626708984,
"objective/rlhf_reward": -19.359994983673097,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.645370364189148,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7817822098731995,
"step": 42,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 2.00180983543396
},
{
"episode": 704,
"epoch": 0.01931200965600483,
"loss/policy_avg": 1.0903524160385132,
"lr": 2.886842105263158e-05,
"objective/entropy": 124.43045043945312,
"objective/kl": 33.735557556152344,
"objective/non_score_reward": -3.3735556602478027,
"objective/rlhf_reward": -9.094222164154052,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.955012559890747,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.6910259127616882,
"step": 43,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 1,
"val/ratio": 2.001138210296631
},
{
"episode": 720,
"epoch": 0.019750918966368575,
"loss/policy_avg": 12.43572998046875,
"lr": 2.8842105263157897e-05,
"objective/entropy": 2.1484909057617188,
"objective/kl": 39.105247497558594,
"objective/non_score_reward": -3.910524845123291,
"objective/rlhf_reward": -15.242099142074586,
"objective/scores": 0.1,
"policy/approxkl_avg": 32.037132263183594,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.6875019669532776,
"step": 44,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 2.000812530517578
},
{
"episode": 736,
"epoch": 0.02018982827673232,
"loss/policy_avg": 2.7653656005859375,
"lr": 2.8815789473684212e-05,
"objective/entropy": 151.14181518554688,
"objective/kl": 39.08449935913086,
"objective/non_score_reward": -3.9084503650665283,
"objective/rlhf_reward": -17.633800506591797,
"objective/scores": -0.5,
"policy/approxkl_avg": 12.080581665039062,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.6674988865852356,
"step": 45,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 1,
"val/ratio": 2.001593589782715
},
{
"episode": 752,
"epoch": 0.020628737587096067,
"loss/policy_avg": 0.26068630814552307,
"lr": 2.8789473684210527e-05,
"objective/entropy": 131.2846221923828,
"objective/kl": 46.933719635009766,
"objective/non_score_reward": -4.6933722496032715,
"objective/rlhf_reward": -14.37348852157593,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.599510669708252,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.6592761874198914,
"step": 46,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 2.000056028366089
},
{
"episode": 768,
"epoch": 0.021067646897459814,
"loss/policy_avg": 0.36868762969970703,
"lr": 2.876315789473684e-05,
"objective/entropy": 168.22177124023438,
"objective/kl": 59.16243362426758,
"objective/non_score_reward": -5.916243076324463,
"objective/rlhf_reward": -25.66497230529785,
"objective/scores": -0.5,
"policy/approxkl_avg": 22.434894561767578,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7536077499389648,
"step": 47,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 1,
"val/ratio": 1.9961597919464111
},
{
"episode": 784,
"epoch": 0.02150655620782356,
"loss/policy_avg": -0.34539520740509033,
"lr": 2.8736842105263157e-05,
"objective/entropy": 28.4791259765625,
"objective/kl": 55.672943115234375,
"objective/non_score_reward": -5.567294597625732,
"objective/rlhf_reward": -24.269176483154297,
"objective/scores": -0.5,
"policy/approxkl_avg": 92.67494201660156,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.768832802772522,
"step": 48,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 1,
"val/ratio": 2.0010273456573486
},
{
"episode": 800,
"epoch": 0.021945465518187303,
"loss/policy_avg": 2.7739362716674805,
"lr": 2.8710526315789475e-05,
"objective/entropy": 152.84979248046875,
"objective/kl": 74.48204803466797,
"objective/non_score_reward": -7.44820499420166,
"objective/rlhf_reward": -27.67011326767591,
"objective/scores": 0.5306765580733931,
"policy/approxkl_avg": 20.374004364013672,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7515650987625122,
"step": 49,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 2.0020508766174316
},
{
"episode": 816,
"epoch": 0.02238437482855105,
"loss/policy_avg": 0.8626595735549927,
"lr": 2.868421052631579e-05,
"objective/entropy": 91.1478271484375,
"objective/kl": 45.17556381225586,
"objective/non_score_reward": -4.517556667327881,
"objective/rlhf_reward": -15.146507178188536,
"objective/scores": 0.7309297535714575,
"policy/approxkl_avg": 10.44746208190918,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6176143884658813,
"step": 50,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 2.000572681427002
},
{
"episode": 832,
"epoch": 0.022823284138914796,
"loss/policy_avg": 3.834225654602051,
"lr": 2.8657894736842105e-05,
"objective/entropy": 79.80493927001953,
"objective/kl": 56.04777526855469,
"objective/non_score_reward": -5.6047773361206055,
"objective/rlhf_reward": -20.68577505747477,
"objective/scores": 0.43333333333333335,
"policy/approxkl_avg": 16.849597930908203,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.7000266909599304,
"step": 51,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 1.9981000423431396
},
{
"episode": 848,
"epoch": 0.023262193449278542,
"loss/policy_avg": 1.604856252670288,
"lr": 2.863157894736842e-05,
"objective/entropy": 91.78755187988281,
"objective/kl": 62.33799362182617,
"objective/non_score_reward": -6.233799934387207,
"objective/rlhf_reward": -26.935199737548828,
"objective/scores": -0.5,
"policy/approxkl_avg": 156.7439422607422,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6846381425857544,
"step": 52,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 1.9976816177368164
},
{
"episode": 864,
"epoch": 0.023701102759642288,
"loss/policy_avg": 0.5365209579467773,
"lr": 2.8605263157894735e-05,
"objective/entropy": -6.100943565368652,
"objective/kl": 42.75518798828125,
"objective/non_score_reward": -4.275519371032715,
"objective/rlhf_reward": -16.70207724571228,
"objective/scores": 0.1,
"policy/approxkl_avg": 314.5965270996094,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6669777631759644,
"step": 53,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.99760103225708
},
{
"episode": 880,
"epoch": 0.024140012070006035,
"loss/policy_avg": -0.05100756883621216,
"lr": 2.8578947368421053e-05,
"objective/entropy": 120.99382781982422,
"objective/kl": 45.327186584472656,
"objective/non_score_reward": -4.532718658447266,
"objective/rlhf_reward": -13.730874633789064,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.7438147068023682,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8100802898406982,
"step": 54,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 2.0016448497772217
},
{
"episode": 896,
"epoch": 0.02457892138036978,
"loss/policy_avg": 0.9957195520401001,
"lr": 2.8552631578947368e-05,
"objective/entropy": 171.9380645751953,
"objective/kl": 78.47545623779297,
"objective/non_score_reward": -7.847545623779297,
"objective/rlhf_reward": -30.99018201828003,
"objective/scores": 0.1,
"policy/approxkl_avg": 112.00227355957031,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6961182355880737,
"step": 55,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 1,
"val/ratio": 1.9984160661697388
},
{
"episode": 912,
"epoch": 0.025017830690733527,
"loss/policy_avg": 0.5854477286338806,
"lr": 2.8526315789473683e-05,
"objective/entropy": 101.01473236083984,
"objective/kl": 57.770263671875,
"objective/non_score_reward": -5.777026176452637,
"objective/rlhf_reward": -25.108104705810547,
"objective/scores": -0.5,
"policy/approxkl_avg": 10.684706687927246,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.8344231843948364,
"step": 56,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 1.999055027961731
},
{
"episode": 928,
"epoch": 0.025456740001097274,
"loss/policy_avg": 1.005487322807312,
"lr": 2.8499999999999998e-05,
"objective/entropy": -35.803977966308594,
"objective/kl": 30.102699279785156,
"objective/non_score_reward": -3.010270118713379,
"objective/rlhf_reward": -11.641080474853517,
"objective/scores": 0.1,
"policy/approxkl_avg": 27.260387420654297,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.9162927865982056,
"step": 57,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 1.9970905780792236
},
{
"episode": 944,
"epoch": 0.02589564931146102,
"loss/policy_avg": -0.5104620456695557,
"lr": 2.8473684210526317e-05,
"objective/entropy": 131.767822265625,
"objective/kl": 40.71240234375,
"objective/non_score_reward": -4.071239948272705,
"objective/rlhf_reward": -11.884960508346559,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.580596685409546,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.8555915355682373,
"step": 58,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 2.0032856464385986
},
{
"episode": 960,
"epoch": 0.026334558621824766,
"loss/policy_avg": -0.0323227159678936,
"lr": 2.844736842105263e-05,
"objective/entropy": 95.74278259277344,
"objective/kl": 50.46125793457031,
"objective/non_score_reward": -5.046125888824463,
"objective/rlhf_reward": -19.784503555297853,
"objective/scores": 0.1,
"policy/approxkl_avg": 346.01953125,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.7172888517379761,
"step": 59,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 2.000260591506958
},
{
"episode": 976,
"epoch": 0.026773467932188513,
"loss/policy_avg": 0.30933094024658203,
"lr": 2.8421052631578946e-05,
"objective/entropy": -6.390421390533447,
"objective/kl": 42.69683074951172,
"objective/non_score_reward": -4.269682884216309,
"objective/rlhf_reward": -12.678731775283813,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.9299302101135254,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.7608700394630432,
"step": 60,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9992871284484863
},
{
"episode": 992,
"epoch": 0.02721237724255226,
"loss/policy_avg": 0.9537198543548584,
"lr": 2.839473684210526e-05,
"objective/entropy": 144.63778686523438,
"objective/kl": 42.113922119140625,
"objective/non_score_reward": -4.211391925811768,
"objective/rlhf_reward": -18.84556770324707,
"objective/scores": -0.5,
"policy/approxkl_avg": 14.351249694824219,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6079916954040527,
"step": 61,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 1,
"val/ratio": 1.9990580081939697
},
{
"episode": 1008,
"epoch": 0.027651286552916005,
"loss/policy_avg": -0.13979224860668182,
"lr": 2.836842105263158e-05,
"objective/entropy": 52.73601531982422,
"objective/kl": 42.785003662109375,
"objective/non_score_reward": -4.278500556945801,
"objective/rlhf_reward": -12.714001274108888,
"objective/scores": 1.1,
"policy/approxkl_avg": 3.0111756324768066,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.5955897569656372,
"step": 62,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0026631355285645
},
{
"episode": 1024,
"epoch": 0.028090195863279748,
"loss/policy_avg": 0.7119593620300293,
"lr": 2.8342105263157898e-05,
"objective/entropy": 37.45541763305664,
"objective/kl": 48.38474655151367,
"objective/non_score_reward": -4.838474750518799,
"objective/rlhf_reward": -17.62056566874186,
"objective/scores": 0.43333333333333335,
"policy/approxkl_avg": 1.2988009452819824,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7282307147979736,
"step": 63,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9981870651245117
},
{
"episode": 1040,
"epoch": 0.028529105173643494,
"loss/policy_avg": 0.22343069314956665,
"lr": 2.8315789473684213e-05,
"objective/entropy": -0.5227775573730469,
"objective/kl": 44.62450408935547,
"objective/non_score_reward": -4.4624505043029785,
"objective/rlhf_reward": -17.449802970886232,
"objective/scores": 0.1,
"policy/approxkl_avg": 8.680784225463867,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.47706353664398193,
"step": 64,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 1.9991333484649658
},
{
"episode": 1056,
"epoch": 0.02896801448400724,
"loss/policy_avg": 2.4624288082122803,
"lr": 2.8289473684210528e-05,
"objective/entropy": 44.281429290771484,
"objective/kl": 56.82603454589844,
"objective/non_score_reward": -5.682603359222412,
"objective/rlhf_reward": -20.33041343688965,
"objective/scores": 0.6,
"policy/approxkl_avg": 190.14102172851562,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6275349259376526,
"step": 65,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 1.9954755306243896
},
{
"episode": 1072,
"epoch": 0.029406923794370987,
"loss/policy_avg": 3.7873077392578125,
"lr": 2.8263157894736843e-05,
"objective/entropy": 21.29256820678711,
"objective/kl": 61.06785202026367,
"objective/non_score_reward": -6.106784820556641,
"objective/rlhf_reward": -24.02714011669159,
"objective/scores": 0.1,
"policy/approxkl_avg": 13.861494064331055,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.5628820657730103,
"step": 66,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 1.9979462623596191
},
{
"episode": 1088,
"epoch": 0.029845833104734733,
"loss/policy_avg": 1.122948169708252,
"lr": 2.823684210526316e-05,
"objective/entropy": 199.56639099121094,
"objective/kl": 67.14576721191406,
"objective/non_score_reward": -6.714576244354248,
"objective/rlhf_reward": -22.458304977416994,
"objective/scores": 1.1,
"policy/approxkl_avg": 260.06036376953125,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.9761279225349426,
"step": 67,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 1.9947412014007568
},
{
"episode": 1104,
"epoch": 0.03028474241509848,
"loss/policy_avg": -0.40396255254745483,
"lr": 2.8210526315789476e-05,
"objective/entropy": 143.4803924560547,
"objective/kl": 73.37672424316406,
"objective/non_score_reward": -7.337671279907227,
"objective/rlhf_reward": -31.350685119628906,
"objective/scores": -0.5,
"policy/approxkl_avg": 65.98458862304688,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8562291860580444,
"step": 68,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 2.0034923553466797
},
{
"episode": 1120,
"epoch": 0.030723651725462226,
"loss/policy_avg": -0.6217052936553955,
"lr": 2.818421052631579e-05,
"objective/entropy": 61.76605224609375,
"objective/kl": 55.31629180908203,
"objective/non_score_reward": -5.53162956237793,
"objective/rlhf_reward": -19.202798043132994,
"objective/scores": 0.7309297535714575,
"policy/approxkl_avg": 102.99107360839844,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7685708999633789,
"step": 69,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 2.0019445419311523
},
{
"episode": 1136,
"epoch": 0.031162561035825972,
"loss/policy_avg": 0.9746075868606567,
"lr": 2.8157894736842106e-05,
"objective/entropy": 182.8162078857422,
"objective/kl": 73.46858215332031,
"objective/non_score_reward": -7.346858501434326,
"objective/rlhf_reward": -28.987434005737306,
"objective/scores": 0.1,
"policy/approxkl_avg": 309.7725830078125,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.9557604193687439,
"step": 70,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 1.999297857284546
},
{
"episode": 1152,
"epoch": 0.031601470346189715,
"loss/policy_avg": 0.011278927326202393,
"lr": 2.813157894736842e-05,
"objective/entropy": 177.80795288085938,
"objective/kl": 79.76069641113281,
"objective/non_score_reward": -7.976069927215576,
"objective/rlhf_reward": -31.504280185699464,
"objective/scores": 0.1,
"policy/approxkl_avg": 8.640438079833984,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.7453687787055969,
"step": 71,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 1,
"val/ratio": 2.0047388076782227
},
{
"episode": 1168,
"epoch": 0.032040379656553465,
"loss/policy_avg": 0.2741953730583191,
"lr": 2.810526315789474e-05,
"objective/entropy": 8.855690002441406,
"objective/kl": 39.5793342590332,
"objective/non_score_reward": -3.9579336643218994,
"objective/rlhf_reward": -15.43173418045044,
"objective/scores": 0.1,
"policy/approxkl_avg": 1.3167859315872192,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.6513254642486572,
"step": 72,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 1.9987623691558838
},
{
"episode": 1184,
"epoch": 0.03247928896691721,
"loss/policy_avg": 2.404010772705078,
"lr": 2.8078947368421055e-05,
"objective/entropy": 104.23138427734375,
"objective/kl": 71.9098892211914,
"objective/non_score_reward": -7.1909894943237305,
"objective/rlhf_reward": -30.763957977294922,
"objective/scores": -0.5,
"policy/approxkl_avg": 6.468600273132324,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6281869411468506,
"step": 73,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 1.9987789392471313
},
{
"episode": 1200,
"epoch": 0.03291819827728096,
"loss/policy_avg": 1.7633247375488281,
"lr": 2.805263157894737e-05,
"objective/entropy": 90.97634887695312,
"objective/kl": 81.6705322265625,
"objective/non_score_reward": -8.16705322265625,
"objective/rlhf_reward": -29.744492922664854,
"objective/scores": 0.7309297535714575,
"policy/approxkl_avg": 318.6219482421875,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7491735219955444,
"step": 74,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 1.9995994567871094
},
{
"episode": 1216,
"epoch": 0.0333571075876447,
"loss/policy_avg": 0.7622026801109314,
"lr": 2.8026315789473685e-05,
"objective/entropy": 109.73323059082031,
"objective/kl": 59.60847854614258,
"objective/non_score_reward": -5.960847854614258,
"objective/rlhf_reward": -25.84339141845703,
"objective/scores": -0.5,
"policy/approxkl_avg": 54.609107971191406,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6811318397521973,
"step": 75,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 1.9944725036621094
},
{
"episode": 1232,
"epoch": 0.03379601689800845,
"loss/policy_avg": -0.2917378544807434,
"lr": 2.8e-05,
"objective/entropy": 185.174072265625,
"objective/kl": 68.6063232421875,
"objective/non_score_reward": -6.860632419586182,
"objective/rlhf_reward": -29.442529678344727,
"objective/scores": -0.5,
"policy/approxkl_avg": 4.8592119216918945,
"policy/clipfrac_avg": 1.75,
"policy/entropy_avg": 0.8129308223724365,
"step": 76,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 1.9995498657226562
},
{
"episode": 1248,
"epoch": 0.03423492620837219,
"loss/policy_avg": -0.41292351484298706,
"lr": 2.7973684210526318e-05,
"objective/entropy": 140.15478515625,
"objective/kl": 64.91577911376953,
"objective/non_score_reward": -6.491578102111816,
"objective/rlhf_reward": -25.56631193161011,
"objective/scores": 0.1,
"policy/approxkl_avg": 2.9902713298797607,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.5978768467903137,
"step": 77,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 2.0020546913146973
},
{
"episode": 1264,
"epoch": 0.03467383551873594,
"loss/policy_avg": 0.9187192916870117,
"lr": 2.7947368421052633e-05,
"objective/entropy": 35.76869201660156,
"objective/kl": 52.17973327636719,
"objective/non_score_reward": -5.217973709106445,
"objective/rlhf_reward": -20.471893405914308,
"objective/scores": 0.1,
"policy/approxkl_avg": 117.89864349365234,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.6329915523529053,
"step": 78,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9939618110656738
},
{
"episode": 1280,
"epoch": 0.035112744829099686,
"loss/policy_avg": 0.5058541297912598,
"lr": 2.7921052631578948e-05,
"objective/entropy": 125.1489028930664,
"objective/kl": 48.87464904785156,
"objective/non_score_reward": -4.88746452331543,
"objective/rlhf_reward": -21.54985809326172,
"objective/scores": -0.5,
"policy/approxkl_avg": 9.59840202331543,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7014065384864807,
"step": 79,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 1.9995161294937134
},
{
"episode": 1296,
"epoch": 0.035551654139463436,
"loss/policy_avg": 0.38564199209213257,
"lr": 2.7894736842105263e-05,
"objective/entropy": 20.80208969116211,
"objective/kl": 59.019775390625,
"objective/non_score_reward": -5.901978015899658,
"objective/rlhf_reward": -19.207911586761476,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.5623955726623535,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6976651549339294,
"step": 80,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9989688396453857
},
{
"episode": 1312,
"epoch": 0.03599056344982718,
"loss/policy_avg": 0.5554705858230591,
"lr": 2.786842105263158e-05,
"objective/entropy": -90.24222564697266,
"objective/kl": 38.30194854736328,
"objective/non_score_reward": -3.8301947116851807,
"objective/rlhf_reward": -17.320777893066406,
"objective/scores": -0.5,
"policy/approxkl_avg": 201.749267578125,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7411468029022217,
"step": 81,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9972386360168457
},
{
"episode": 1328,
"epoch": 0.03642947276019093,
"loss/policy_avg": 0.0963105857372284,
"lr": 2.7842105263157896e-05,
"objective/entropy": -7.672168731689453,
"objective/kl": 51.06816101074219,
"objective/non_score_reward": -5.106816291809082,
"objective/rlhf_reward": -16.02726492881775,
"objective/scores": 1.1,
"policy/approxkl_avg": 4.051983833312988,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7723426818847656,
"step": 82,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9965218305587769
},
{
"episode": 1344,
"epoch": 0.03686838207055467,
"loss/policy_avg": 0.019892290234565735,
"lr": 2.781578947368421e-05,
"objective/entropy": 64.86529541015625,
"objective/kl": 33.41741943359375,
"objective/non_score_reward": -3.341742515563965,
"objective/rlhf_reward": -8.966969466209413,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.7174110412597656,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6913268566131592,
"step": 83,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 1.9992713928222656
},
{
"episode": 1360,
"epoch": 0.03730729138091842,
"loss/policy_avg": -0.16097909212112427,
"lr": 2.7789473684210526e-05,
"objective/entropy": 100.59518432617188,
"objective/kl": 54.423255920410156,
"objective/non_score_reward": -5.442325592041016,
"objective/rlhf_reward": -17.36930379867554,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.42956307530403137,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.7463341951370239,
"step": 84,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 2.0013723373413086
},
{
"episode": 1376,
"epoch": 0.037746200691282164,
"loss/policy_avg": 1.31124746799469,
"lr": 2.776315789473684e-05,
"objective/entropy": 29.62479591369629,
"objective/kl": 52.713645935058594,
"objective/non_score_reward": -5.271364688873291,
"objective/rlhf_reward": -20.685458755493165,
"objective/scores": 0.1,
"policy/approxkl_avg": 17.273426055908203,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.9209033250808716,
"step": 85,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.000173568725586
},
{
"episode": 1392,
"epoch": 0.03818511000164591,
"loss/policy_avg": 0.1676180362701416,
"lr": 2.773684210526316e-05,
"objective/entropy": 9.287511825561523,
"objective/kl": 51.08624267578125,
"objective/non_score_reward": -5.108624458312988,
"objective/rlhf_reward": -18.91872652748459,
"objective/scores": 0.37894294565112985,
"policy/approxkl_avg": 202.3461456298828,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7541092038154602,
"step": 86,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.999753475189209
},
{
"episode": 1408,
"epoch": 0.03862401931200966,
"loss/policy_avg": 0.9255491495132446,
"lr": 2.7710526315789474e-05,
"objective/entropy": 17.094314575195312,
"objective/kl": 48.69655990600586,
"objective/non_score_reward": -4.869655609130859,
"objective/rlhf_reward": -19.078624105453493,
"objective/scores": 0.1,
"policy/approxkl_avg": 88.98983764648438,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8517658114433289,
"step": 87,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9959460496902466
},
{
"episode": 1424,
"epoch": 0.0390629286223734,
"loss/policy_avg": 2.683220624923706,
"lr": 2.768421052631579e-05,
"objective/entropy": 122.18838500976562,
"objective/kl": 57.215179443359375,
"objective/non_score_reward": -5.721518516540527,
"objective/rlhf_reward": -22.486073350906373,
"objective/scores": 0.1,
"policy/approxkl_avg": 8.430204391479492,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.6825472116470337,
"step": 88,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9960262775421143
},
{
"episode": 1440,
"epoch": 0.03950183793273715,
"loss/policy_avg": 1.1903173923492432,
"lr": 2.7657894736842104e-05,
"objective/entropy": 56.404361724853516,
"objective/kl": 67.56021118164062,
"objective/non_score_reward": -6.756021499633789,
"objective/rlhf_reward": -26.624085998535158,
"objective/scores": 0.1,
"policy/approxkl_avg": 13.910991668701172,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.6265389919281006,
"step": 89,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9977799654006958
},
{
"episode": 1456,
"epoch": 0.03994074724310089,
"loss/policy_avg": 0.586733877658844,
"lr": 2.7631578947368423e-05,
"objective/entropy": -15.051795959472656,
"objective/kl": 43.27064514160156,
"objective/non_score_reward": -4.327064514160156,
"objective/rlhf_reward": -16.908258533477785,
"objective/scores": 0.1,
"policy/approxkl_avg": 1.233497142791748,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.7934204936027527,
"step": 90,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0031657218933105
},
{
"episode": 1472,
"epoch": 0.04037965655346464,
"loss/policy_avg": 1.8827378749847412,
"lr": 2.7605263157894738e-05,
"objective/entropy": 49.19090270996094,
"objective/kl": 60.559024810791016,
"objective/non_score_reward": -6.055902481079102,
"objective/rlhf_reward": -19.823610877990724,
"objective/scores": 1.1,
"policy/approxkl_avg": 17.631580352783203,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 1.0130832195281982,
"step": 91,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9948384761810303
},
{
"episode": 1488,
"epoch": 0.040818565863828385,
"loss/policy_avg": -0.12379920482635498,
"lr": 2.7578947368421053e-05,
"objective/entropy": -266.6684265136719,
"objective/kl": 19.84296417236328,
"objective/non_score_reward": -1.9842965602874756,
"objective/rlhf_reward": -7.537186002731324,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.5475006103515625,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.7140687704086304,
"step": 92,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.9999473094940186
},
{
"episode": 1504,
"epoch": 0.041257475174192135,
"loss/policy_avg": 2.4287641048431396,
"lr": 2.7552631578947368e-05,
"objective/entropy": 61.6163444519043,
"objective/kl": 75.09857940673828,
"objective/non_score_reward": -7.509858131408691,
"objective/rlhf_reward": -29.63943181037903,
"objective/scores": 0.1,
"policy/approxkl_avg": 205.6100616455078,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7793152928352356,
"step": 93,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 1.9955990314483643
},
{
"episode": 1520,
"epoch": 0.04169638448455588,
"loss/policy_avg": 0.1086156815290451,
"lr": 2.7526315789473683e-05,
"objective/entropy": -23.803211212158203,
"objective/kl": 41.80360412597656,
"objective/non_score_reward": -4.180360794067383,
"objective/rlhf_reward": -12.321443176269533,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.97320556640625,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.5993040800094604,
"step": 94,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9978277683258057
},
{
"episode": 1536,
"epoch": 0.04213529379491963,
"loss/policy_avg": -0.9118296504020691,
"lr": 2.75e-05,
"objective/entropy": -129.44325256347656,
"objective/kl": 56.03433609008789,
"objective/non_score_reward": -5.603433609008789,
"objective/rlhf_reward": -20.680401102701822,
"objective/scores": 0.43333333333333335,
"policy/approxkl_avg": 161.11831665039062,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8422608375549316,
"step": 95,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.0000882148742676
},
{
"episode": 1552,
"epoch": 0.04257420310528337,
"loss/policy_avg": 0.2865448296070099,
"lr": 2.7473684210526316e-05,
"objective/entropy": 9.751440048217773,
"objective/kl": 50.09703063964844,
"objective/non_score_reward": -5.009703159332275,
"objective/rlhf_reward": -17.916106405035528,
"objective/scores": 0.5306765580733931,
"policy/approxkl_avg": 297.1142272949219,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8490245342254639,
"step": 96,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9978770017623901
},
{
"episode": 1568,
"epoch": 0.04301311241564712,
"loss/policy_avg": 1.2864903211593628,
"lr": 2.744736842105263e-05,
"objective/entropy": -15.848587036132812,
"objective/kl": 72.71647644042969,
"objective/non_score_reward": -7.271647930145264,
"objective/rlhf_reward": -26.162872706295225,
"objective/scores": 0.7309297535714575,
"policy/approxkl_avg": 9.334358215332031,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7372955679893494,
"step": 97,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0022706985473633
},
{
"episode": 1584,
"epoch": 0.04345202172601086,
"loss/policy_avg": -0.0980822816491127,
"lr": 2.7421052631578946e-05,
"objective/entropy": -245.53875732421875,
"objective/kl": 20.317668914794922,
"objective/non_score_reward": -2.031766891479492,
"objective/rlhf_reward": -3.7270678043365475,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.7220726013183594,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.9007136821746826,
"step": 98,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.9996488094329834
},
{
"episode": 1600,
"epoch": 0.043890931036374606,
"loss/policy_avg": -0.025203801691532135,
"lr": 2.739473684210526e-05,
"objective/entropy": -198.08126831054688,
"objective/kl": 38.61351776123047,
"objective/non_score_reward": -3.86135196685791,
"objective/rlhf_reward": -11.04540786743164,
"objective/scores": 1.1,
"policy/approxkl_avg": 74.81521606445312,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.9700057506561279,
"step": 99,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9973889589309692
},
{
"episode": 1616,
"epoch": 0.044329840346738356,
"loss/policy_avg": 0.768902063369751,
"lr": 2.736842105263158e-05,
"objective/entropy": 72.47509765625,
"objective/kl": 53.5683479309082,
"objective/non_score_reward": -5.35683536529541,
"objective/rlhf_reward": -17.02733979225159,
"objective/scores": 1.1,
"policy/approxkl_avg": 3.6070337295532227,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8009264469146729,
"step": 100,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 2.0045204162597656
},
{
"episode": 1632,
"epoch": 0.0447687496571021,
"loss/policy_avg": 0.12479447573423386,
"lr": 2.7342105263157894e-05,
"objective/entropy": -71.50438690185547,
"objective/kl": 46.817623138427734,
"objective/non_score_reward": -4.681761741638184,
"objective/rlhf_reward": -18.32704839706421,
"objective/scores": 0.1,
"policy/approxkl_avg": 1.7676063776016235,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6292232275009155,
"step": 101,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.99995756149292
},
{
"episode": 1648,
"epoch": 0.04520765896746585,
"loss/policy_avg": -0.7869347929954529,
"lr": 2.7315789473684213e-05,
"objective/entropy": -67.84761047363281,
"objective/kl": 38.6337890625,
"objective/non_score_reward": -3.8633787631988525,
"objective/rlhf_reward": -11.053514814376832,
"objective/scores": 1.1,
"policy/approxkl_avg": 65.58836364746094,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6918781399726868,
"step": 102,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0002877712249756
},
{
"episode": 1664,
"epoch": 0.04564656827782959,
"loss/policy_avg": 1.888732671737671,
"lr": 2.7289473684210528e-05,
"objective/entropy": 45.654727935791016,
"objective/kl": 65.4364013671875,
"objective/non_score_reward": -6.54364013671875,
"objective/rlhf_reward": -25.77456102371216,
"objective/scores": 0.1,
"policy/approxkl_avg": 158.3827667236328,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.7598006725311279,
"step": 103,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 1.995233416557312
},
{
"episode": 1680,
"epoch": 0.04608547758819334,
"loss/policy_avg": 0.5027464032173157,
"lr": 2.7263157894736846e-05,
"objective/entropy": -67.26053619384766,
"objective/kl": 46.42061996459961,
"objective/non_score_reward": -4.642061710357666,
"objective/rlhf_reward": -14.168247556686403,
"objective/scores": 1.1,
"policy/approxkl_avg": 409.32861328125,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7045955657958984,
"step": 104,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9950696229934692
},
{
"episode": 1696,
"epoch": 0.046524386898557084,
"loss/policy_avg": 1.1565163135528564,
"lr": 2.723684210526316e-05,
"objective/entropy": 21.27078628540039,
"objective/kl": 54.91717529296875,
"objective/non_score_reward": -5.491717338562012,
"objective/rlhf_reward": -17.566870307922365,
"objective/scores": 1.1,
"policy/approxkl_avg": 169.4134521484375,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.817981481552124,
"step": 105,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.997206449508667
},
{
"episode": 1712,
"epoch": 0.046963296208920834,
"loss/policy_avg": 1.359951138496399,
"lr": 2.7210526315789476e-05,
"objective/entropy": 73.11656951904297,
"objective/kl": 42.99860382080078,
"objective/non_score_reward": -4.29986047744751,
"objective/rlhf_reward": -16.79944190979004,
"objective/scores": 0.1,
"policy/approxkl_avg": 160.0785369873047,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7364996671676636,
"step": 106,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 1.999701976776123
},
{
"episode": 1728,
"epoch": 0.047402205519284576,
"loss/policy_avg": 0.6170438528060913,
"lr": 2.718421052631579e-05,
"objective/entropy": 18.827709197998047,
"objective/kl": 59.75545120239258,
"objective/non_score_reward": -5.975545406341553,
"objective/rlhf_reward": -25.90218162536621,
"objective/scores": -0.5,
"policy/approxkl_avg": 12.630553245544434,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8481165170669556,
"step": 107,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0005557537078857
},
{
"episode": 1744,
"epoch": 0.047841114829648326,
"loss/policy_avg": 0.6389247179031372,
"lr": 2.7157894736842106e-05,
"objective/entropy": -15.529239654541016,
"objective/kl": 29.512508392333984,
"objective/non_score_reward": -2.9512510299682617,
"objective/rlhf_reward": -7.405003643035888,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.9877429008483887,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.9178982973098755,
"step": 108,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.0009372234344482
},
{
"episode": 1760,
"epoch": 0.04828002414001207,
"loss/policy_avg": 0.5083436965942383,
"lr": 2.7131578947368424e-05,
"objective/entropy": -31.206829071044922,
"objective/kl": 58.42845916748047,
"objective/non_score_reward": -5.842845916748047,
"objective/rlhf_reward": -18.971384143829347,
"objective/scores": 1.1,
"policy/approxkl_avg": 246.38031005859375,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.8161805868148804,
"step": 109,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9968533515930176
},
{
"episode": 1776,
"epoch": 0.04871893345037582,
"loss/policy_avg": -0.40812578797340393,
"lr": 2.710526315789474e-05,
"objective/entropy": 31.53291130065918,
"objective/kl": 29.56689453125,
"objective/non_score_reward": -2.9566893577575684,
"objective/rlhf_reward": -13.826757431030273,
"objective/scores": -0.5,
"policy/approxkl_avg": 10.323970794677734,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7529888153076172,
"step": 110,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 2.0055394172668457
},
{
"episode": 1792,
"epoch": 0.04915784276073956,
"loss/policy_avg": 1.3316415548324585,
"lr": 2.7078947368421054e-05,
"objective/entropy": 12.768815994262695,
"objective/kl": 54.86576461791992,
"objective/non_score_reward": -5.486576557159424,
"objective/rlhf_reward": -17.546306228637697,
"objective/scores": 1.1,
"policy/approxkl_avg": 3.3613033294677734,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.7278101444244385,
"step": 111,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9984724521636963
},
{
"episode": 1808,
"epoch": 0.04959675207110331,
"loss/policy_avg": -0.11871880292892456,
"lr": 2.705263157894737e-05,
"objective/entropy": -7.827598571777344,
"objective/kl": 39.736026763916016,
"objective/non_score_reward": -3.973602771759033,
"objective/rlhf_reward": -11.494411563873292,
"objective/scores": 1.1,
"policy/approxkl_avg": 5.851396560668945,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7495956420898438,
"step": 112,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9964585304260254
},
{
"episode": 1824,
"epoch": 0.050035661381467054,
"loss/policy_avg": 1.761976957321167,
"lr": 2.7026315789473684e-05,
"objective/entropy": 128.9440460205078,
"objective/kl": 119.60091400146484,
"objective/non_score_reward": -11.960092544555664,
"objective/rlhf_reward": -47.440368270874025,
"objective/scores": 0.1,
"policy/approxkl_avg": 141.6197967529297,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8293796181678772,
"step": 113,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 1.9977341890335083
},
{
"episode": 1840,
"epoch": 0.0504745706918308,
"loss/policy_avg": -0.7025829553604126,
"lr": 2.7000000000000002e-05,
"objective/entropy": 41.96453857421875,
"objective/kl": 52.472015380859375,
"objective/non_score_reward": -5.247201919555664,
"objective/rlhf_reward": -18.06508771026251,
"objective/scores": 0.7309297535714575,
"policy/approxkl_avg": 4.035026550292969,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.7776132822036743,
"step": 114,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0039517879486084
},
{
"episode": 1856,
"epoch": 0.05091348000219455,
"loss/policy_avg": 0.7849704623222351,
"lr": 2.6973684210526317e-05,
"objective/entropy": -171.7941131591797,
"objective/kl": 46.76850128173828,
"objective/non_score_reward": -4.676850318908691,
"objective/rlhf_reward": -16.30740032196045,
"objective/scores": 0.6,
"policy/approxkl_avg": 233.18243408203125,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7888088822364807,
"step": 115,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9987388849258423
},
{
"episode": 1872,
"epoch": 0.05135238931255829,
"loss/policy_avg": -0.09497790038585663,
"lr": 2.6947368421052632e-05,
"objective/entropy": -106.61396789550781,
"objective/kl": 29.52130699157715,
"objective/non_score_reward": -2.9521307945251465,
"objective/rlhf_reward": -7.4085229396820065,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.43795135617256165,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.8204243183135986,
"step": 116,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.000943183898926
},
{
"episode": 1888,
"epoch": 0.05179129862292204,
"loss/policy_avg": 0.32255253195762634,
"lr": 2.6921052631578947e-05,
"objective/entropy": -91.3974838256836,
"objective/kl": 43.5883903503418,
"objective/non_score_reward": -4.35883903503418,
"objective/rlhf_reward": -13.035355901718141,
"objective/scores": 1.1,
"policy/approxkl_avg": 5.264688968658447,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7290156483650208,
"step": 117,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0012741088867188
},
{
"episode": 1904,
"epoch": 0.05223020793328578,
"loss/policy_avg": -0.016802702099084854,
"lr": 2.6894736842105266e-05,
"objective/entropy": -53.469398498535156,
"objective/kl": 59.04955291748047,
"objective/non_score_reward": -5.9049553871154785,
"objective/rlhf_reward": -19.219821548461915,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.983826756477356,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.742132306098938,
"step": 118,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.000786542892456
},
{
"episode": 1920,
"epoch": 0.05266911724364953,
"loss/policy_avg": 1.3971296548843384,
"lr": 2.686842105263158e-05,
"objective/entropy": -29.92015266418457,
"objective/kl": 50.994483947753906,
"objective/non_score_reward": -5.099448204040527,
"objective/rlhf_reward": -15.99779305458069,
"objective/scores": 1.1,
"policy/approxkl_avg": 112.21920776367188,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.884390652179718,
"step": 119,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9959502220153809
},
{
"episode": 1936,
"epoch": 0.053108026554013275,
"loss/policy_avg": 0.22061039507389069,
"lr": 2.6842105263157896e-05,
"objective/entropy": -107.27847290039062,
"objective/kl": 49.88347244262695,
"objective/non_score_reward": -4.988347053527832,
"objective/rlhf_reward": -15.553388690948488,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.4191887378692627,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7550037503242493,
"step": 120,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9977598190307617
},
{
"episode": 1952,
"epoch": 0.053546935864377025,
"loss/policy_avg": 0.3377835750579834,
"lr": 2.681578947368421e-05,
"objective/entropy": -58.49903869628906,
"objective/kl": 37.71872329711914,
"objective/non_score_reward": -3.7718722820281982,
"objective/rlhf_reward": -10.687489128112793,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.646287441253662,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.6332191228866577,
"step": 121,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9996297359466553
},
{
"episode": 1968,
"epoch": 0.05398584517474077,
"loss/policy_avg": -0.24410778284072876,
"lr": 2.6789473684210526e-05,
"objective/entropy": -20.791318893432617,
"objective/kl": 42.270294189453125,
"objective/non_score_reward": -4.227029800415039,
"objective/rlhf_reward": -15.08328973797233,
"objective/scores": 0.4562071871080222,
"policy/approxkl_avg": 2.9298739433288574,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.8968937397003174,
"step": 122,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0011134147644043
},
{
"episode": 1984,
"epoch": 0.05442475448510452,
"loss/policy_avg": -0.020455077290534973,
"lr": 2.6763157894736844e-05,
"objective/entropy": 53.92317199707031,
"objective/kl": 44.21133804321289,
"objective/non_score_reward": -4.421133518218994,
"objective/rlhf_reward": -14.760815297008726,
"objective/scores": 0.7309297535714575,
"policy/approxkl_avg": 146.8616943359375,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8489948511123657,
"step": 123,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9991151094436646
},
{
"episode": 2000,
"epoch": 0.05486366379546826,
"loss/policy_avg": -0.6532711982727051,
"lr": 2.673684210526316e-05,
"objective/entropy": 95.89879608154297,
"objective/kl": 70.10568237304688,
"objective/non_score_reward": -7.0105671882629395,
"objective/rlhf_reward": -25.118550692440245,
"objective/scores": 0.7309297535714575,
"policy/approxkl_avg": 1.0512518882751465,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.7362265586853027,
"step": 124,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 2.0022377967834473
},
{
"episode": 2016,
"epoch": 0.05530257310583201,
"loss/policy_avg": -0.532859206199646,
"lr": 2.6710526315789474e-05,
"objective/entropy": -134.69949340820312,
"objective/kl": 26.79535675048828,
"objective/non_score_reward": -2.6795358657836914,
"objective/rlhf_reward": -6.318143224716187,
"objective/scores": 1.1,
"policy/approxkl_avg": 131.03668212890625,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7895087599754333,
"step": 125,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.000466823577881
},
{
"episode": 2032,
"epoch": 0.05574148241619575,
"loss/policy_avg": -0.6944832801818848,
"lr": 2.668421052631579e-05,
"objective/entropy": -70.2201919555664,
"objective/kl": 39.88362121582031,
"objective/non_score_reward": -3.9883623123168945,
"objective/rlhf_reward": -17.953449249267578,
"objective/scores": -0.5,
"policy/approxkl_avg": 142.29962158203125,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7750124335289001,
"step": 126,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9993183612823486
},
{
"episode": 2048,
"epoch": 0.056180391726559496,
"loss/policy_avg": 0.4301793575286865,
"lr": 2.6657894736842107e-05,
"objective/entropy": 41.7333984375,
"objective/kl": 50.46965789794922,
"objective/non_score_reward": -5.046966075897217,
"objective/rlhf_reward": -19.787864542007448,
"objective/scores": 0.1,
"policy/approxkl_avg": 8.3101167678833,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7716444730758667,
"step": 127,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.000278949737549
},
{
"episode": 2064,
"epoch": 0.056619301036923246,
"loss/policy_avg": 0.20978103578090668,
"lr": 2.6631578947368422e-05,
"objective/entropy": -39.1937370300293,
"objective/kl": 60.34259033203125,
"objective/non_score_reward": -6.034258842468262,
"objective/rlhf_reward": -23.737036323547365,
"objective/scores": 0.1,
"policy/approxkl_avg": 6.2546772956848145,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6956591010093689,
"step": 128,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.0086145401000977
},
{
"episode": 2080,
"epoch": 0.05705821034728699,
"loss/policy_avg": 1.5271130800247192,
"lr": 2.6605263157894737e-05,
"objective/entropy": -15.993308067321777,
"objective/kl": 42.31775665283203,
"objective/non_score_reward": -4.231775760650635,
"objective/rlhf_reward": -12.527102565765382,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.8814833164215088,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.839857280254364,
"step": 129,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0023293495178223
},
{
"episode": 2096,
"epoch": 0.05749711965765074,
"loss/policy_avg": 1.5832982063293457,
"lr": 2.6578947368421052e-05,
"objective/entropy": 23.444358825683594,
"objective/kl": 67.05261993408203,
"objective/non_score_reward": -6.705262184143066,
"objective/rlhf_reward": -26.421048736572267,
"objective/scores": 0.1,
"policy/approxkl_avg": 343.2151184082031,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6558287143707275,
"step": 130,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.003596067428589
},
{
"episode": 2112,
"epoch": 0.05793602896801448,
"loss/policy_avg": 4.708208084106445,
"lr": 2.6552631578947367e-05,
"objective/entropy": -47.06990432739258,
"objective/kl": 54.011878967285156,
"objective/non_score_reward": -5.401188373565674,
"objective/rlhf_reward": -21.204753494262697,
"objective/scores": 0.1,
"policy/approxkl_avg": 10.141319274902344,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.7859508991241455,
"step": 131,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.996903896331787
},
{
"episode": 2128,
"epoch": 0.05837493827837823,
"loss/policy_avg": 0.5339508056640625,
"lr": 2.6526315789473685e-05,
"objective/entropy": 1.606778621673584,
"objective/kl": 49.70311737060547,
"objective/non_score_reward": -4.970311641693115,
"objective/rlhf_reward": -15.481246566772462,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.387594223022461,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.8247156143188477,
"step": 132,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9987622499465942
},
{
"episode": 2144,
"epoch": 0.058813847588741974,
"loss/policy_avg": 0.5222880840301514,
"lr": 2.65e-05,
"objective/entropy": -71.43392944335938,
"objective/kl": 67.5316162109375,
"objective/non_score_reward": -6.753161907196045,
"objective/rlhf_reward": -26.612646675109865,
"objective/scores": 0.1,
"policy/approxkl_avg": 3.4301528930664062,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.6953130960464478,
"step": 133,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9979286193847656
},
{
"episode": 2160,
"epoch": 0.059252756899105724,
"loss/policy_avg": 0.3055562674999237,
"lr": 2.6473684210526315e-05,
"objective/entropy": -49.49654769897461,
"objective/kl": 45.72821044921875,
"objective/non_score_reward": -4.572821140289307,
"objective/rlhf_reward": -17.891284561157228,
"objective/scores": 0.1,
"policy/approxkl_avg": 3.059330463409424,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.5824675559997559,
"step": 134,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0010061264038086
},
{
"episode": 2176,
"epoch": 0.05969166620946947,
"loss/policy_avg": 0.23712539672851562,
"lr": 2.644736842105263e-05,
"objective/entropy": -80.8898696899414,
"objective/kl": 64.44762420654297,
"objective/non_score_reward": -6.44476318359375,
"objective/rlhf_reward": -25.379051303863527,
"objective/scores": 0.1,
"policy/approxkl_avg": 200.35958862304688,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8632474541664124,
"step": 135,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9975115060806274
},
{
"episode": 2192,
"epoch": 0.06013057551983322,
"loss/policy_avg": -0.052902236580848694,
"lr": 2.6421052631578945e-05,
"objective/entropy": -6.771919250488281,
"objective/kl": 57.73787307739258,
"objective/non_score_reward": -5.773787498474121,
"objective/rlhf_reward": -18.695149517059328,
"objective/scores": 1.1,
"policy/approxkl_avg": 10.29161262512207,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7512908577919006,
"step": 136,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9991791248321533
},
{
"episode": 2208,
"epoch": 0.06056948483019696,
"loss/policy_avg": -0.9740344285964966,
"lr": 2.6394736842105264e-05,
"objective/entropy": -20.03300666809082,
"objective/kl": 39.889991760253906,
"objective/non_score_reward": -3.988999366760254,
"objective/rlhf_reward": -14.008585999684271,
"objective/scores": 0.4868528072345416,
"policy/approxkl_avg": 1.1283724308013916,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.7417764663696289,
"step": 137,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0027053356170654
},
{
"episode": 2224,
"epoch": 0.06100839414056071,
"loss/policy_avg": 0.14382201433181763,
"lr": 2.636842105263158e-05,
"objective/entropy": -43.97955322265625,
"objective/kl": 37.197532653808594,
"objective/non_score_reward": -3.7197535037994385,
"objective/rlhf_reward": -10.479013776779176,
"objective/scores": 1.1,
"policy/approxkl_avg": 5.3562726974487305,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7394824028015137,
"step": 138,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.999990463256836
},
{
"episode": 2240,
"epoch": 0.06144730345092445,
"loss/policy_avg": 1.9168037176132202,
"lr": 2.6342105263157894e-05,
"objective/entropy": 7.118698596954346,
"objective/kl": 42.77621078491211,
"objective/non_score_reward": -4.277621269226074,
"objective/rlhf_reward": -12.71048483848572,
"objective/scores": 1.1,
"policy/approxkl_avg": 7.454709053039551,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7237066030502319,
"step": 139,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9988505840301514
},
{
"episode": 2256,
"epoch": 0.0618862127612882,
"loss/policy_avg": 1.2066419124603271,
"lr": 2.631578947368421e-05,
"objective/entropy": 4.71856689453125,
"objective/kl": 43.93148422241211,
"objective/non_score_reward": -4.393148422241211,
"objective/rlhf_reward": -13.172594642639162,
"objective/scores": 1.1,
"policy/approxkl_avg": 6.769146919250488,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.7349320650100708,
"step": 140,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 1.9952998161315918
},
{
"episode": 2272,
"epoch": 0.062325122071651945,
"loss/policy_avg": -0.01632899045944214,
"lr": 2.6289473684210527e-05,
"objective/entropy": 23.46413230895996,
"objective/kl": 56.494224548339844,
"objective/non_score_reward": -5.649422645568848,
"objective/rlhf_reward": -18.19769105911255,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.6812927722930908,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.6015833616256714,
"step": 141,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 2.0010690689086914
},
{
"episode": 2288,
"epoch": 0.0627640313820157,
"loss/policy_avg": -0.19681307673454285,
"lr": 2.6263157894736842e-05,
"objective/entropy": -24.22689437866211,
"objective/kl": 40.63903045654297,
"objective/non_score_reward": -4.063903331756592,
"objective/rlhf_reward": -11.85561261177063,
"objective/scores": 1.1,
"policy/approxkl_avg": 5.7771735191345215,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.675006628036499,
"step": 142,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0051536560058594
},
{
"episode": 2304,
"epoch": 0.06320294069237943,
"loss/policy_avg": -0.12234362959861755,
"lr": 2.623684210526316e-05,
"objective/entropy": -47.442970275878906,
"objective/kl": 35.8116455078125,
"objective/non_score_reward": -3.581164836883545,
"objective/rlhf_reward": -9.9246591091156,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.1870553493499756,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8034932017326355,
"step": 143,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9990580081939697
},
{
"episode": 2320,
"epoch": 0.06364185000274318,
"loss/policy_avg": 2.628115177154541,
"lr": 2.6210526315789475e-05,
"objective/entropy": -15.868587493896484,
"objective/kl": 60.868995666503906,
"objective/non_score_reward": -6.086899280548096,
"objective/rlhf_reward": -19.947597599029542,
"objective/scores": 1.1,
"policy/approxkl_avg": 13.940417289733887,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.5825701951980591,
"step": 144,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9940749406814575
},
{
"episode": 2336,
"epoch": 0.06408075931310693,
"loss/policy_avg": 1.0517263412475586,
"lr": 2.618421052631579e-05,
"objective/entropy": -58.0733528137207,
"objective/kl": 43.175994873046875,
"objective/non_score_reward": -4.317599296569824,
"objective/rlhf_reward": -12.870397424697877,
"objective/scores": 1.1,
"policy/approxkl_avg": 285.7511901855469,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6195497512817383,
"step": 145,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9964275360107422
},
{
"episode": 2352,
"epoch": 0.06451966862347068,
"loss/policy_avg": 0.2179277539253235,
"lr": 2.615789473684211e-05,
"objective/entropy": 30.72907829284668,
"objective/kl": 58.64939880371094,
"objective/non_score_reward": -5.864940643310547,
"objective/rlhf_reward": -19.059761619567873,
"objective/scores": 1.1,
"policy/approxkl_avg": 10.522979736328125,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.739693820476532,
"step": 146,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9976598024368286
},
{
"episode": 2368,
"epoch": 0.06495857793383442,
"loss/policy_avg": 0.25930777192115784,
"lr": 2.6131578947368424e-05,
"objective/entropy": -27.965259552001953,
"objective/kl": 49.60761642456055,
"objective/non_score_reward": -4.960761070251465,
"objective/rlhf_reward": -16.919326697231504,
"objective/scores": 0.7309297535714575,
"policy/approxkl_avg": 1.7648733854293823,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.5860817432403564,
"step": 147,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9973572492599487
},
{
"episode": 2384,
"epoch": 0.06539748724419817,
"loss/policy_avg": 1.8243403434753418,
"lr": 2.610526315789474e-05,
"objective/entropy": -18.670982360839844,
"objective/kl": 60.018455505371094,
"objective/non_score_reward": -6.001845836639404,
"objective/rlhf_reward": -23.60738286972046,
"objective/scores": 0.1,
"policy/approxkl_avg": 1.463629126548767,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6600210070610046,
"step": 148,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 2.0028176307678223
},
{
"episode": 2400,
"epoch": 0.06583639655456192,
"loss/policy_avg": 0.2627159059047699,
"lr": 2.6078947368421053e-05,
"objective/entropy": -24.919069290161133,
"objective/kl": 38.713436126708984,
"objective/non_score_reward": -3.8713436126708984,
"objective/rlhf_reward": -11.085374450683593,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.5450901985168457,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7100886702537537,
"step": 149,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.99765944480896
},
{
"episode": 2416,
"epoch": 0.06627530586492567,
"loss/policy_avg": -0.16113287210464478,
"lr": 2.605263157894737e-05,
"objective/entropy": -233.61676025390625,
"objective/kl": 25.49152183532715,
"objective/non_score_reward": -2.549152374267578,
"objective/rlhf_reward": -12.196609497070312,
"objective/scores": -0.5,
"policy/approxkl_avg": 1.3775527477264404,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.572956919670105,
"step": 150,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0008134841918945
},
{
"episode": 2432,
"epoch": 0.0667142151752894,
"loss/policy_avg": 1.506213665008545,
"lr": 2.6026315789473687e-05,
"objective/entropy": 17.05105972290039,
"objective/kl": 88.19961547851562,
"objective/non_score_reward": -8.819961547851562,
"objective/rlhf_reward": -34.87984714508057,
"objective/scores": 0.1,
"policy/approxkl_avg": 127.97711181640625,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8008217811584473,
"step": 151,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9965453147888184
},
{
"episode": 2448,
"epoch": 0.06715312448565315,
"loss/policy_avg": 0.3404674828052521,
"lr": 2.6000000000000002e-05,
"objective/entropy": -31.00487518310547,
"objective/kl": 70.51848602294922,
"objective/non_score_reward": -7.051849365234375,
"objective/rlhf_reward": -23.807396507263185,
"objective/scores": 1.1,
"policy/approxkl_avg": 9.886850357055664,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7234030961990356,
"step": 152,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9982991218566895
},
{
"episode": 2464,
"epoch": 0.0675920337960169,
"loss/policy_avg": 1.5734370946884155,
"lr": 2.5973684210526317e-05,
"objective/entropy": -27.52907943725586,
"objective/kl": 56.56928634643555,
"objective/non_score_reward": -5.656929016113281,
"objective/rlhf_reward": -18.227714633941652,
"objective/scores": 1.1,
"policy/approxkl_avg": 5.53957462310791,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.5384600162506104,
"step": 153,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9993629455566406
},
{
"episode": 2480,
"epoch": 0.06803094310638065,
"loss/policy_avg": 0.3518854081630707,
"lr": 2.5947368421052632e-05,
"objective/entropy": -261.23638916015625,
"objective/kl": 18.554153442382812,
"objective/non_score_reward": -1.8554154634475708,
"objective/rlhf_reward": -3.021661853790283,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.608569860458374,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.6520816087722778,
"step": 154,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.999368667602539
},
{
"episode": 2496,
"epoch": 0.06846985241674439,
"loss/policy_avg": -0.8293436169624329,
"lr": 2.592105263157895e-05,
"objective/entropy": -169.88430786132812,
"objective/kl": 61.74763488769531,
"objective/non_score_reward": -6.1747636795043945,
"objective/rlhf_reward": -24.29905471801758,
"objective/scores": 0.1,
"policy/approxkl_avg": 108.35233306884766,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6774731278419495,
"step": 155,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9995229244232178
},
{
"episode": 2512,
"epoch": 0.06890876172710814,
"loss/policy_avg": 0.14700853824615479,
"lr": 2.5894736842105265e-05,
"objective/entropy": -85.36224365234375,
"objective/kl": 55.885929107666016,
"objective/non_score_reward": -5.588593482971191,
"objective/rlhf_reward": -17.95437297821045,
"objective/scores": 1.1,
"policy/approxkl_avg": 4.13096284866333,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6663363575935364,
"step": 156,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0007362365722656
},
{
"episode": 2528,
"epoch": 0.06934767103747189,
"loss/policy_avg": -0.4546607434749603,
"lr": 2.586842105263158e-05,
"objective/entropy": -116.53895568847656,
"objective/kl": 44.87690734863281,
"objective/non_score_reward": -4.4876909255981445,
"objective/rlhf_reward": -13.550763225555421,
"objective/scores": 1.1,
"policy/approxkl_avg": 135.5211944580078,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7589835524559021,
"step": 157,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.99733304977417
},
{
"episode": 2544,
"epoch": 0.06978658034783562,
"loss/policy_avg": 6.644524574279785,
"lr": 2.5842105263157895e-05,
"objective/entropy": 224.932373046875,
"objective/kl": 77.01321411132812,
"objective/non_score_reward": -7.701322078704834,
"objective/rlhf_reward": -30.405288791656496,
"objective/scores": 0.1,
"policy/approxkl_avg": 16.4545955657959,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.9313648343086243,
"step": 158,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 1.9967166185379028
},
{
"episode": 2560,
"epoch": 0.07022548965819937,
"loss/policy_avg": -0.7700405120849609,
"lr": 2.581578947368421e-05,
"objective/entropy": -299.10150146484375,
"objective/kl": 37.77906036376953,
"objective/non_score_reward": -3.7779064178466797,
"objective/rlhf_reward": -14.711626148223878,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.6640864610671997,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7569674849510193,
"step": 159,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 8,
"val/ratio": 1.9998410940170288
},
{
"episode": 2576,
"epoch": 0.07066439896856312,
"loss/policy_avg": -1.5657492876052856,
"lr": 2.578947368421053e-05,
"objective/entropy": -37.76366424560547,
"objective/kl": 66.01406860351562,
"objective/non_score_reward": -6.601407051086426,
"objective/rlhf_reward": -22.005628204345705,
"objective/scores": 1.1,
"policy/approxkl_avg": 97.5102310180664,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8600292205810547,
"step": 160,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.004817485809326
},
{
"episode": 2592,
"epoch": 0.07110330827892687,
"loss/policy_avg": 0.8615214228630066,
"lr": 2.5763157894736843e-05,
"objective/entropy": -48.53110885620117,
"objective/kl": 50.90060806274414,
"objective/non_score_reward": -5.090060234069824,
"objective/rlhf_reward": -19.960242366790773,
"objective/scores": 0.1,
"policy/approxkl_avg": 271.98052978515625,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7262279987335205,
"step": 161,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9973949193954468
},
{
"episode": 2608,
"epoch": 0.07154221758929061,
"loss/policy_avg": 0.14006884396076202,
"lr": 2.5736842105263158e-05,
"objective/entropy": -87.22216796875,
"objective/kl": 48.421382904052734,
"objective/non_score_reward": -4.842138290405273,
"objective/rlhf_reward": -14.968552684783937,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.7606146335601807,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.6789655685424805,
"step": 162,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9983569383621216
},
{
"episode": 2624,
"epoch": 0.07198112689965436,
"loss/policy_avg": 0.11756910383701324,
"lr": 2.5710526315789473e-05,
"objective/entropy": -90.09169006347656,
"objective/kl": 48.95092010498047,
"objective/non_score_reward": -4.895092010498047,
"objective/rlhf_reward": -15.180367088317873,
"objective/scores": 1.1,
"policy/approxkl_avg": 5.360994815826416,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.673121988773346,
"step": 163,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9987850189208984
},
{
"episode": 2640,
"epoch": 0.07242003621001811,
"loss/policy_avg": 0.6505356431007385,
"lr": 2.568421052631579e-05,
"objective/entropy": -105.59286499023438,
"objective/kl": 57.51460647583008,
"objective/non_score_reward": -5.751461029052734,
"objective/rlhf_reward": -18.605842685699464,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.9482333064079285,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6159911751747131,
"step": 164,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.00016188621521
},
{
"episode": 2656,
"epoch": 0.07285894552038186,
"loss/policy_avg": 0.4072788655757904,
"lr": 2.5657894736842107e-05,
"objective/entropy": 30.31393814086914,
"objective/kl": 62.92792510986328,
"objective/non_score_reward": -6.292792320251465,
"objective/rlhf_reward": -20.77116928100586,
"objective/scores": 1.1,
"policy/approxkl_avg": 41.36127471923828,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.7414021492004395,
"step": 165,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 1.994492769241333
},
{
"episode": 2672,
"epoch": 0.07329785483074559,
"loss/policy_avg": 0.6451830863952637,
"lr": 2.563157894736842e-05,
"objective/entropy": -93.72520446777344,
"objective/kl": 68.08780670166016,
"objective/non_score_reward": -6.808781147003174,
"objective/rlhf_reward": -22.83512411117554,
"objective/scores": 1.1,
"policy/approxkl_avg": 380.79705810546875,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.778990626335144,
"step": 166,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0007405281066895
},
{
"episode": 2688,
"epoch": 0.07373676414110934,
"loss/policy_avg": 0.30001240968704224,
"lr": 2.5605263157894737e-05,
"objective/entropy": -105.16179656982422,
"objective/kl": 41.9052619934082,
"objective/non_score_reward": -4.190526008605957,
"objective/rlhf_reward": -16.362104749679567,
"objective/scores": 0.1,
"policy/approxkl_avg": 3.618799924850464,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.7631393074989319,
"step": 167,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0000078678131104
},
{
"episode": 2704,
"epoch": 0.07417567345147309,
"loss/policy_avg": 0.49671614170074463,
"lr": 2.557894736842105e-05,
"objective/entropy": -53.44061279296875,
"objective/kl": 57.806434631347656,
"objective/non_score_reward": -5.780643463134766,
"objective/rlhf_reward": -18.722572898864748,
"objective/scores": 1.1,
"policy/approxkl_avg": 6.265882968902588,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7590473294258118,
"step": 168,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9990757703781128
},
{
"episode": 2720,
"epoch": 0.07461458276183684,
"loss/policy_avg": 1.5006479024887085,
"lr": 2.555263157894737e-05,
"objective/entropy": -62.24983596801758,
"objective/kl": 45.70757293701172,
"objective/non_score_reward": -4.570757865905762,
"objective/rlhf_reward": -13.883030509948732,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.5961132049560547,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7392877340316772,
"step": 169,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9993603229522705
},
{
"episode": 2736,
"epoch": 0.07505349207220058,
"loss/policy_avg": -0.012972153723239899,
"lr": 2.5526315789473685e-05,
"objective/entropy": -65.46833801269531,
"objective/kl": 66.47633361816406,
"objective/non_score_reward": -6.647633075714111,
"objective/rlhf_reward": -22.190532779693605,
"objective/scores": 1.1,
"policy/approxkl_avg": 3.807072162628174,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.64030921459198,
"step": 170,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.000433921813965
},
{
"episode": 2752,
"epoch": 0.07549240138256433,
"loss/policy_avg": 0.2781289219856262,
"lr": 2.55e-05,
"objective/entropy": 16.252883911132812,
"objective/kl": 39.9376106262207,
"objective/non_score_reward": -3.9937610626220703,
"objective/rlhf_reward": -15.575044488906862,
"objective/scores": 0.1,
"policy/approxkl_avg": 123.21941375732422,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7487983107566833,
"step": 171,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 1.999727487564087
},
{
"episode": 2768,
"epoch": 0.07593131069292808,
"loss/policy_avg": 0.4936387538909912,
"lr": 2.5473684210526315e-05,
"objective/entropy": -25.21156883239746,
"objective/kl": 54.01347351074219,
"objective/non_score_reward": -5.4013471603393555,
"objective/rlhf_reward": -17.20538911819458,
"objective/scores": 1.1,
"policy/approxkl_avg": 8.337639808654785,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.828056812286377,
"step": 172,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9998791217803955
},
{
"episode": 2784,
"epoch": 0.07637022000329181,
"loss/policy_avg": 0.9954125881195068,
"lr": 2.544736842105263e-05,
"objective/entropy": -17.563995361328125,
"objective/kl": 53.30859375,
"objective/non_score_reward": -5.330860137939453,
"objective/rlhf_reward": -20.923440074920656,
"objective/scores": 0.1,
"policy/approxkl_avg": 3.786358118057251,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7179369926452637,
"step": 173,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9982266426086426
},
{
"episode": 2800,
"epoch": 0.07680912931365556,
"loss/policy_avg": 0.03012019395828247,
"lr": 2.5421052631578948e-05,
"objective/entropy": 5.954471588134766,
"objective/kl": 54.192142486572266,
"objective/non_score_reward": -5.419214248657227,
"objective/rlhf_reward": -21.27685651779175,
"objective/scores": 0.1,
"policy/approxkl_avg": 13.544754028320312,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8271291255950928,
"step": 174,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.997292399406433
},
{
"episode": 2816,
"epoch": 0.07724803862401931,
"loss/policy_avg": -0.3252931833267212,
"lr": 2.5394736842105263e-05,
"objective/entropy": -62.24835205078125,
"objective/kl": 52.21865463256836,
"objective/non_score_reward": -5.221865653991699,
"objective/rlhf_reward": -16.48746213912964,
"objective/scores": 1.1,
"policy/approxkl_avg": 83.18733215332031,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8194577693939209,
"step": 175,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9963982105255127
},
{
"episode": 2832,
"epoch": 0.07768694793438306,
"loss/policy_avg": -0.061586543917655945,
"lr": 2.5368421052631578e-05,
"objective/entropy": -232.89175415039062,
"objective/kl": 31.382434844970703,
"objective/non_score_reward": -3.1382434368133545,
"objective/rlhf_reward": -14.552973747253418,
"objective/scores": -0.5,
"policy/approxkl_avg": 1.4224392175674438,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.638608455657959,
"step": 176,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.998449683189392
},
{
"episode": 2848,
"epoch": 0.0781258572447468,
"loss/policy_avg": 2.2170538902282715,
"lr": 2.5342105263157893e-05,
"objective/entropy": 50.82858657836914,
"objective/kl": 76.44434356689453,
"objective/non_score_reward": -7.644434928894043,
"objective/rlhf_reward": -26.177738761901857,
"objective/scores": 1.1,
"policy/approxkl_avg": 268.76092529296875,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7145341634750366,
"step": 177,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.000556707382202
},
{
"episode": 2864,
"epoch": 0.07856476655511055,
"loss/policy_avg": 0.03309518098831177,
"lr": 2.531578947368421e-05,
"objective/entropy": -38.59127426147461,
"objective/kl": 41.92137908935547,
"objective/non_score_reward": -4.192137718200684,
"objective/rlhf_reward": -14.645845117346319,
"objective/scores": 0.5306765580733931,
"policy/approxkl_avg": 2.4393868446350098,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7346012592315674,
"step": 178,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.000610828399658
},
{
"episode": 2880,
"epoch": 0.0790036758654743,
"loss/policy_avg": -0.08147536963224411,
"lr": 2.5289473684210526e-05,
"objective/entropy": 22.475852966308594,
"objective/kl": 39.314239501953125,
"objective/non_score_reward": -3.931424379348755,
"objective/rlhf_reward": -17.725696563720703,
"objective/scores": -0.5,
"policy/approxkl_avg": 3.216597080230713,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6433309316635132,
"step": 179,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9997375011444092
},
{
"episode": 2896,
"epoch": 0.07944258517583805,
"loss/policy_avg": 1.5342857837677002,
"lr": 2.526315789473684e-05,
"objective/entropy": 12.662029266357422,
"objective/kl": 44.605552673339844,
"objective/non_score_reward": -4.460555553436279,
"objective/rlhf_reward": -17.44222221374512,
"objective/scores": 0.1,
"policy/approxkl_avg": 5.570178031921387,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.9360150098800659,
"step": 180,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9990198612213135
},
{
"episode": 2912,
"epoch": 0.07988149448620178,
"loss/policy_avg": 1.0636322498321533,
"lr": 2.5236842105263156e-05,
"objective/entropy": -20.420055389404297,
"objective/kl": 43.089805603027344,
"objective/non_score_reward": -4.308980941772461,
"objective/rlhf_reward": -12.835923767089845,
"objective/scores": 1.1,
"policy/approxkl_avg": 8.459303855895996,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.6522048711776733,
"step": 181,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.998940348625183
},
{
"episode": 2928,
"epoch": 0.08032040379656553,
"loss/policy_avg": 1.1111366748809814,
"lr": 2.521052631578947e-05,
"objective/entropy": -286.44207763671875,
"objective/kl": 25.992015838623047,
"objective/non_score_reward": -2.5992014408111572,
"objective/rlhf_reward": -5.9968057632446286,
"objective/scores": 1.1,
"policy/approxkl_avg": 5.6512064933776855,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.7038178443908691,
"step": 182,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 7,
"val/ratio": 1.9989608526229858
},
{
"episode": 2944,
"epoch": 0.08075931310692928,
"loss/policy_avg": 0.3593832850456238,
"lr": 2.518421052631579e-05,
"objective/entropy": -23.417577743530273,
"objective/kl": 55.88574981689453,
"objective/non_score_reward": -5.58857536315918,
"objective/rlhf_reward": -17.954300975799562,
"objective/scores": 1.1,
"policy/approxkl_avg": 7.021892070770264,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.6839828491210938,
"step": 183,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.998976230621338
},
{
"episode": 2960,
"epoch": 0.08119822241729302,
"loss/policy_avg": 1.0649125576019287,
"lr": 2.5157894736842108e-05,
"objective/entropy": -28.054828643798828,
"objective/kl": 52.482826232910156,
"objective/non_score_reward": -5.248283386230469,
"objective/rlhf_reward": -16.593132114410402,
"objective/scores": 1.1,
"policy/approxkl_avg": 6.408839225769043,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.6410462856292725,
"step": 184,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9986329078674316
},
{
"episode": 2976,
"epoch": 0.08163713172765677,
"loss/policy_avg": 0.09679454565048218,
"lr": 2.5131578947368423e-05,
"objective/entropy": 16.912199020385742,
"objective/kl": 57.1039924621582,
"objective/non_score_reward": -5.710399627685547,
"objective/rlhf_reward": -18.441597557067873,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.548750638961792,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7216329574584961,
"step": 185,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.000427722930908
},
{
"episode": 2992,
"epoch": 0.08207604103802052,
"loss/policy_avg": 2.137105941772461,
"lr": 2.5105263157894738e-05,
"objective/entropy": 4.954294204711914,
"objective/kl": 45.99482727050781,
"objective/non_score_reward": -4.599482536315918,
"objective/rlhf_reward": -13.997929906845094,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.8815433979034424,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.7191178798675537,
"step": 186,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9966540336608887
},
{
"episode": 3008,
"epoch": 0.08251495034838427,
"loss/policy_avg": 1.5447564125061035,
"lr": 2.5078947368421056e-05,
"objective/entropy": 72.04828643798828,
"objective/kl": 52.15823745727539,
"objective/non_score_reward": -5.215824127197266,
"objective/rlhf_reward": -16.463295555114748,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.3372530937194824,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7812240123748779,
"step": 187,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 1.9983248710632324
},
{
"episode": 3024,
"epoch": 0.082953859658748,
"loss/policy_avg": -0.10642924904823303,
"lr": 2.505263157894737e-05,
"objective/entropy": -7.903879165649414,
"objective/kl": 40.129364013671875,
"objective/non_score_reward": -4.012936115264893,
"objective/rlhf_reward": -11.651745176315309,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.9247307777404785,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.6996808052062988,
"step": 188,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.999596118927002
},
{
"episode": 3040,
"epoch": 0.08339276896911176,
"loss/policy_avg": -0.6931325793266296,
"lr": 2.5026315789473686e-05,
"objective/entropy": -112.98082733154297,
"objective/kl": 45.408485412597656,
"objective/non_score_reward": -4.540848255157471,
"objective/rlhf_reward": -13.7633939743042,
"objective/scores": 1.1,
"policy/approxkl_avg": 85.60306549072266,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.9481509923934937,
"step": 189,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.99983811378479
},
{
"episode": 3056,
"epoch": 0.0838316782794755,
"loss/policy_avg": 0.543519139289856,
"lr": 2.5e-05,
"objective/entropy": -18.81544303894043,
"objective/kl": 71.94432830810547,
"objective/non_score_reward": -7.194432735443115,
"objective/rlhf_reward": -24.37773141860962,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.9382588863372803,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8083455562591553,
"step": 190,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9974308013916016
},
{
"episode": 3072,
"epoch": 0.08427058758983925,
"loss/policy_avg": 0.05518569424748421,
"lr": 2.4973684210526316e-05,
"objective/entropy": 60.97806167602539,
"objective/kl": 76.83258819580078,
"objective/non_score_reward": -7.6832594871521,
"objective/rlhf_reward": -26.3330379486084,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.5206363797187805,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.67827308177948,
"step": 191,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 2.001342535018921
},
{
"episode": 3088,
"epoch": 0.08470949690020299,
"loss/policy_avg": 0.1450192928314209,
"lr": 2.4947368421052635e-05,
"objective/entropy": 81.1869125366211,
"objective/kl": 51.66636276245117,
"objective/non_score_reward": -5.1666364669799805,
"objective/rlhf_reward": -22.666545867919922,
"objective/scores": -0.5,
"policy/approxkl_avg": 125.81236267089844,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8107389807701111,
"step": 192,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 2.0004963874816895
},
{
"episode": 3104,
"epoch": 0.08514840621056674,
"loss/policy_avg": -0.575951099395752,
"lr": 2.492105263157895e-05,
"objective/entropy": -129.48512268066406,
"objective/kl": 38.8513298034668,
"objective/non_score_reward": -3.8851335048675537,
"objective/rlhf_reward": -11.140533542633058,
"objective/scores": 1.1,
"policy/approxkl_avg": 232.32797241210938,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.6589675545692444,
"step": 193,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9972124099731445
},
{
"episode": 3120,
"epoch": 0.08558731552093049,
"loss/policy_avg": -0.8726249933242798,
"lr": 2.4894736842105264e-05,
"objective/entropy": -98.82183837890625,
"objective/kl": 38.976009368896484,
"objective/non_score_reward": -3.8976006507873535,
"objective/rlhf_reward": -11.190402841567995,
"objective/scores": 1.1,
"policy/approxkl_avg": 231.56631469726562,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7292333245277405,
"step": 194,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.000427722930908
},
{
"episode": 3136,
"epoch": 0.08602622483129424,
"loss/policy_avg": -0.5889065861701965,
"lr": 2.486842105263158e-05,
"objective/entropy": -3.191420078277588,
"objective/kl": 56.826507568359375,
"objective/non_score_reward": -5.682650566101074,
"objective/rlhf_reward": -19.80688277328131,
"objective/scores": 0.7309297535714575,
"policy/approxkl_avg": 8.074029922485352,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7311458587646484,
"step": 195,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.999056100845337
},
{
"episode": 3152,
"epoch": 0.08646513414165798,
"loss/policy_avg": 0.0854165256023407,
"lr": 2.4842105263157894e-05,
"objective/entropy": 110.20230102539062,
"objective/kl": 75.08110046386719,
"objective/non_score_reward": -7.508110523223877,
"objective/rlhf_reward": -25.632442569732667,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.1298476457595825,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.7107164263725281,
"step": 196,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 1.9999678134918213
},
{
"episode": 3168,
"epoch": 0.08690404345202173,
"loss/policy_avg": 0.2902490496635437,
"lr": 2.4815789473684213e-05,
"objective/entropy": 72.5558853149414,
"objective/kl": 70.04791259765625,
"objective/non_score_reward": -7.004791736602783,
"objective/rlhf_reward": -23.619165992736818,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.1979445219039917,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7555328607559204,
"step": 197,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.000892400741577
},
{
"episode": 3184,
"epoch": 0.08734295276238548,
"loss/policy_avg": -1.6277761459350586,
"lr": 2.4789473684210528e-05,
"objective/entropy": 7.621055603027344,
"objective/kl": 61.288002014160156,
"objective/non_score_reward": -6.128800868988037,
"objective/rlhf_reward": -26.51520347595215,
"objective/scores": -0.5,
"policy/approxkl_avg": 187.9342041015625,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.8816299438476562,
"step": 198,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 1.9992430210113525
},
{
"episode": 3200,
"epoch": 0.08778186207274921,
"loss/policy_avg": 0.22336675226688385,
"lr": 2.4763157894736843e-05,
"objective/entropy": 140.80392456054688,
"objective/kl": 71.45215606689453,
"objective/non_score_reward": -7.145215034484863,
"objective/rlhf_reward": -24.180860137939455,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.8151704668998718,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.844842791557312,
"step": 199,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9999359846115112
},
{
"episode": 3216,
"epoch": 0.08822077138311296,
"loss/policy_avg": -0.7593020796775818,
"lr": 2.4736842105263158e-05,
"objective/entropy": -4.5475921630859375,
"objective/kl": 52.44508361816406,
"objective/non_score_reward": -5.244508743286133,
"objective/rlhf_reward": -16.57803592681885,
"objective/scores": 1.1,
"policy/approxkl_avg": 52.09981155395508,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8364603519439697,
"step": 200,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.000638484954834
},
{
"episode": 3232,
"epoch": 0.08865968069347671,
"loss/policy_avg": 0.44816046953201294,
"lr": 2.4710526315789476e-05,
"objective/entropy": 92.43020629882812,
"objective/kl": 97.43000793457031,
"objective/non_score_reward": -9.743000984191895,
"objective/rlhf_reward": -34.572002506256105,
"objective/scores": 1.1,
"policy/approxkl_avg": 7.917519569396973,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6928500533103943,
"step": 201,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 2.000586986541748
},
{
"episode": 3248,
"epoch": 0.08909859000384046,
"loss/policy_avg": -0.15180611610412598,
"lr": 2.468421052631579e-05,
"objective/entropy": 115.11419677734375,
"objective/kl": 68.31903076171875,
"objective/non_score_reward": -6.831902980804443,
"objective/rlhf_reward": -29.327611923217773,
"objective/scores": -0.5,
"policy/approxkl_avg": 1.6768676042556763,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.7814561128616333,
"step": 202,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 2.0007996559143066
},
{
"episode": 3264,
"epoch": 0.0895374993142042,
"loss/policy_avg": 2.1507365703582764,
"lr": 2.4657894736842106e-05,
"objective/entropy": 131.67218017578125,
"objective/kl": 87.50892639160156,
"objective/non_score_reward": -8.750892639160156,
"objective/rlhf_reward": -34.60357151031494,
"objective/scores": 0.1,
"policy/approxkl_avg": 9.72176742553711,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.8055911064147949,
"step": 203,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9976446628570557
},
{
"episode": 3280,
"epoch": 0.08997640862456795,
"loss/policy_avg": 0.9095442891120911,
"lr": 2.463157894736842e-05,
"objective/entropy": -201.7689208984375,
"objective/kl": 39.41615295410156,
"objective/non_score_reward": -3.941615581512451,
"objective/rlhf_reward": -17.766462326049805,
"objective/scores": -0.5,
"policy/approxkl_avg": 2.0174264907836914,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.743715763092041,
"step": 204,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9988057613372803
},
{
"episode": 3296,
"epoch": 0.0904153179349317,
"loss/policy_avg": 2.7806615829467773,
"lr": 2.4605263157894736e-05,
"objective/entropy": 70.67823028564453,
"objective/kl": 75.30030822753906,
"objective/non_score_reward": -7.530030250549316,
"objective/rlhf_reward": -25.720120763778688,
"objective/scores": 1.1,
"policy/approxkl_avg": 23.9725399017334,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8070772290229797,
"step": 205,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9965420961380005
},
{
"episode": 3312,
"epoch": 0.09085422724529545,
"loss/policy_avg": -0.4679667055606842,
"lr": 2.4578947368421054e-05,
"objective/entropy": -221.43614196777344,
"objective/kl": 38.17326736450195,
"objective/non_score_reward": -3.8173270225524902,
"objective/rlhf_reward": -10.86930856704712,
"objective/scores": 1.1,
"policy/approxkl_avg": 7.382453918457031,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.7196519374847412,
"step": 206,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 2.0000643730163574
},
{
"episode": 3328,
"epoch": 0.09129313655565918,
"loss/policy_avg": 0.7069023847579956,
"lr": 2.455263157894737e-05,
"objective/entropy": 46.485408782958984,
"objective/kl": 52.93321228027344,
"objective/non_score_reward": -5.29332160949707,
"objective/rlhf_reward": -20.773285484313966,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.7949851155281067,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8590609431266785,
"step": 207,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.002488374710083
},
{
"episode": 3344,
"epoch": 0.09173204586602293,
"loss/policy_avg": -0.6623063683509827,
"lr": 2.4526315789473684e-05,
"objective/entropy": 45.29491424560547,
"objective/kl": 68.79707336425781,
"objective/non_score_reward": -6.879707336425781,
"objective/rlhf_reward": -27.118828868865968,
"objective/scores": 0.1,
"policy/approxkl_avg": 9.599810600280762,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.7060703039169312,
"step": 208,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0040082931518555
},
{
"episode": 3360,
"epoch": 0.09217095517638668,
"loss/policy_avg": 0.8098376989364624,
"lr": 2.45e-05,
"objective/entropy": 71.23324584960938,
"objective/kl": 51.93505096435547,
"objective/non_score_reward": -5.193504810333252,
"objective/rlhf_reward": -16.37401876449585,
"objective/scores": 1.1,
"policy/approxkl_avg": 4.758597373962402,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7107915878295898,
"step": 209,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 1.996649980545044
},
{
"episode": 3376,
"epoch": 0.09260986448675043,
"loss/policy_avg": -0.045729756355285645,
"lr": 2.4473684210526318e-05,
"objective/entropy": 68.11517333984375,
"objective/kl": 71.54877471923828,
"objective/non_score_reward": -7.154877662658691,
"objective/rlhf_reward": -24.219510650634767,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.4835081100463867,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.938834547996521,
"step": 210,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.003654956817627
},
{
"episode": 3392,
"epoch": 0.09304877379711417,
"loss/policy_avg": -0.398199200630188,
"lr": 2.4447368421052633e-05,
"objective/entropy": 22.62654685974121,
"objective/kl": 52.52642059326172,
"objective/non_score_reward": -5.252641677856445,
"objective/rlhf_reward": -16.6105676651001,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.71232008934021,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7540408372879028,
"step": 211,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0004401206970215
},
{
"episode": 3408,
"epoch": 0.09348768310747792,
"loss/policy_avg": 0.09533184766769409,
"lr": 2.4421052631578948e-05,
"objective/entropy": 80.2608642578125,
"objective/kl": 69.30525207519531,
"objective/non_score_reward": -6.930525779724121,
"objective/rlhf_reward": -23.32210216522217,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.8454056978225708,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.9530247449874878,
"step": 212,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.998732566833496
},
{
"episode": 3424,
"epoch": 0.09392659241784167,
"loss/policy_avg": 0.5622943639755249,
"lr": 2.4394736842105262e-05,
"objective/entropy": 5.986083984375,
"objective/kl": 52.243106842041016,
"objective/non_score_reward": -5.224310398101807,
"objective/rlhf_reward": -18.77453631378797,
"objective/scores": 0.5306765580733931,
"policy/approxkl_avg": 185.1870880126953,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.9663141965866089,
"step": 213,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9961705207824707
},
{
"episode": 3440,
"epoch": 0.0943655017282054,
"loss/policy_avg": 0.1358652114868164,
"lr": 2.4368421052631577e-05,
"objective/entropy": 42.141456604003906,
"objective/kl": 73.0504150390625,
"objective/non_score_reward": -7.305041313171387,
"objective/rlhf_reward": -28.82016525268555,
"objective/scores": 0.1,
"policy/approxkl_avg": 1.950047254562378,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6339312791824341,
"step": 214,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0002665519714355
},
{
"episode": 3456,
"epoch": 0.09480441103856915,
"loss/policy_avg": 0.0715370774269104,
"lr": 2.4342105263157896e-05,
"objective/entropy": 42.893985748291016,
"objective/kl": 65.23382568359375,
"objective/non_score_reward": -6.523383140563965,
"objective/rlhf_reward": -25.69353303909302,
"objective/scores": 0.1,
"policy/approxkl_avg": 87.95655822753906,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7504739165306091,
"step": 215,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9980251789093018
},
{
"episode": 3472,
"epoch": 0.0952433203489329,
"loss/policy_avg": 8.101757049560547,
"lr": 2.431578947368421e-05,
"objective/entropy": 85.4518051147461,
"objective/kl": 72.85520935058594,
"objective/non_score_reward": -7.285521030426025,
"objective/rlhf_reward": -31.1420841217041,
"objective/scores": -0.5,
"policy/approxkl_avg": 16.570175170898438,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.7926974296569824,
"step": 216,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 2.0041887760162354
},
{
"episode": 3488,
"epoch": 0.09568222965929665,
"loss/policy_avg": 0.8208998441696167,
"lr": 2.4289473684210526e-05,
"objective/entropy": 54.86122131347656,
"objective/kl": 60.157432556152344,
"objective/non_score_reward": -6.015743732452393,
"objective/rlhf_reward": -19.662973976135255,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.3636417388916016,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.9655085206031799,
"step": 217,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0007495880126953
},
{
"episode": 3504,
"epoch": 0.09612113896966039,
"loss/policy_avg": -0.5492388010025024,
"lr": 2.426315789473684e-05,
"objective/entropy": -81.31658935546875,
"objective/kl": 44.720726013183594,
"objective/non_score_reward": -4.472072601318359,
"objective/rlhf_reward": -17.48829040527344,
"objective/scores": 0.1,
"policy/approxkl_avg": 60.92509078979492,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.843732476234436,
"step": 218,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9994643926620483
},
{
"episode": 3520,
"epoch": 0.09656004828002414,
"loss/policy_avg": 1.4506752490997314,
"lr": 2.4236842105263156e-05,
"objective/entropy": 45.367652893066406,
"objective/kl": 65.80734252929688,
"objective/non_score_reward": -6.580735206604004,
"objective/rlhf_reward": -21.922941303253175,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.5029066801071167,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.7406171560287476,
"step": 219,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9987130165100098
},
{
"episode": 3536,
"epoch": 0.09699895759038789,
"loss/policy_avg": 0.04402471333742142,
"lr": 2.4210526315789474e-05,
"objective/entropy": -187.66488647460938,
"objective/kl": 39.494014739990234,
"objective/non_score_reward": -3.949401378631592,
"objective/rlhf_reward": -15.397605514526369,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.20192278921604156,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.8446219563484192,
"step": 220,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0014162063598633
},
{
"episode": 3552,
"epoch": 0.09743786690075164,
"loss/policy_avg": -0.6722557544708252,
"lr": 2.418421052631579e-05,
"objective/entropy": 132.76797485351562,
"objective/kl": 84.23394775390625,
"objective/non_score_reward": -8.423394203186035,
"objective/rlhf_reward": -33.293578958511354,
"objective/scores": 0.1,
"policy/approxkl_avg": 120.05321502685547,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.9575395584106445,
"step": 221,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.997431755065918
},
{
"episode": 3568,
"epoch": 0.09787677621111537,
"loss/policy_avg": -0.47086307406425476,
"lr": 2.4157894736842104e-05,
"objective/entropy": -79.6368408203125,
"objective/kl": 54.54154968261719,
"objective/non_score_reward": -5.454154968261719,
"objective/rlhf_reward": -17.416619873046876,
"objective/scores": 1.1,
"policy/approxkl_avg": 54.655574798583984,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8305655717849731,
"step": 222,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.997011661529541
},
{
"episode": 3584,
"epoch": 0.09831568552147912,
"loss/policy_avg": -0.0004381835460662842,
"lr": 2.413157894736842e-05,
"objective/entropy": 61.53660583496094,
"objective/kl": 74.82394409179688,
"objective/non_score_reward": -7.482394218444824,
"objective/rlhf_reward": -25.529577827453615,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.7358994483947754,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.8543496131896973,
"step": 223,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9997599124908447
},
{
"episode": 3600,
"epoch": 0.09875459483184287,
"loss/policy_avg": 0.07636167109012604,
"lr": 2.410526315789474e-05,
"objective/entropy": 142.87892150878906,
"objective/kl": 79.087890625,
"objective/non_score_reward": -7.90878963470459,
"objective/rlhf_reward": -27.235158061981203,
"objective/scores": 1.1,
"policy/approxkl_avg": 7.760828971862793,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.9195695519447327,
"step": 224,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0002522468566895
},
{
"episode": 3616,
"epoch": 0.09919350414220662,
"loss/policy_avg": 0.6181021928787231,
"lr": 2.4078947368421056e-05,
"objective/entropy": 55.48746871948242,
"objective/kl": 55.08421325683594,
"objective/non_score_reward": -5.508421421051025,
"objective/rlhf_reward": -21.633685207366945,
"objective/scores": 0.1,
"policy/approxkl_avg": 2.1698455810546875,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.8132386207580566,
"step": 225,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9993584156036377
},
{
"episode": 3632,
"epoch": 0.09963241345257036,
"loss/policy_avg": 0.6648176908493042,
"lr": 2.405263157894737e-05,
"objective/entropy": -195.64773559570312,
"objective/kl": 44.247493743896484,
"objective/non_score_reward": -4.424749374389648,
"objective/rlhf_reward": -13.298997497558595,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.9296014904975891,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.8615956902503967,
"step": 226,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9994629621505737
},
{
"episode": 3648,
"epoch": 0.10007132276293411,
"loss/policy_avg": 1.097282886505127,
"lr": 2.4026315789473686e-05,
"objective/entropy": 28.437664031982422,
"objective/kl": 63.49753952026367,
"objective/non_score_reward": -6.3497538566589355,
"objective/rlhf_reward": -20.999015426635744,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.303382635116577,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 1.031259298324585,
"step": 227,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9980473518371582
},
{
"episode": 3664,
"epoch": 0.10051023207329786,
"loss/policy_avg": 0.3590712547302246,
"lr": 2.4e-05,
"objective/entropy": -5.089465141296387,
"objective/kl": 73.07501983642578,
"objective/non_score_reward": -7.307501792907715,
"objective/rlhf_reward": -24.830005264282228,
"objective/scores": 1.1,
"policy/approxkl_avg": 3.681659698486328,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8207137584686279,
"step": 228,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0043158531188965
},
{
"episode": 3680,
"epoch": 0.1009491413836616,
"loss/policy_avg": -0.26667794585227966,
"lr": 2.397368421052632e-05,
"objective/entropy": 4.93023681640625,
"objective/kl": 48.020545959472656,
"objective/non_score_reward": -4.802054405212402,
"objective/rlhf_reward": -14.808218097686769,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.0555940866470337,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 1.0309653282165527,
"step": 229,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.000304698944092
},
{
"episode": 3696,
"epoch": 0.10138805069402534,
"loss/policy_avg": -1.0239133834838867,
"lr": 2.3947368421052634e-05,
"objective/entropy": -117.7593994140625,
"objective/kl": 66.29994201660156,
"objective/non_score_reward": -6.6299943923950195,
"objective/rlhf_reward": -22.11997756958008,
"objective/scores": 1.1,
"policy/approxkl_avg": 126.52104187011719,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.9115900993347168,
"step": 230,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9966094493865967
},
{
"episode": 3712,
"epoch": 0.1018269600043891,
"loss/policy_avg": -0.3394533395767212,
"lr": 2.392105263157895e-05,
"objective/entropy": -26.78044891357422,
"objective/kl": 74.928466796875,
"objective/non_score_reward": -7.492847442626953,
"objective/rlhf_reward": -25.571388816833498,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.7065678834915161,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8705970048904419,
"step": 231,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9999688863754272
},
{
"episode": 3728,
"epoch": 0.10226586931475284,
"loss/policy_avg": 0.10001493245363235,
"lr": 2.3894736842105264e-05,
"objective/entropy": -13.011394500732422,
"objective/kl": 45.67135238647461,
"objective/non_score_reward": -4.567135334014893,
"objective/rlhf_reward": -13.86854157447815,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.24752560257911682,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.810563325881958,
"step": 232,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.000312566757202
},
{
"episode": 3744,
"epoch": 0.10270477862511658,
"loss/policy_avg": 0.6616812944412231,
"lr": 2.386842105263158e-05,
"objective/entropy": 7.605260848999023,
"objective/kl": 51.76225280761719,
"objective/non_score_reward": -5.176225662231445,
"objective/rlhf_reward": -16.304901218414308,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.828341007232666,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 1.038901448249817,
"step": 233,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0011584758758545
},
{
"episode": 3760,
"epoch": 0.10314368793548033,
"loss/policy_avg": 0.9933425784111023,
"lr": 2.3842105263157897e-05,
"objective/entropy": -255.31588745117188,
"objective/kl": 24.543170928955078,
"objective/non_score_reward": -2.454317092895508,
"objective/rlhf_reward": -8.336315753872753,
"objective/scores": 0.3702381544273198,
"policy/approxkl_avg": 3.459949493408203,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.9003795385360718,
"step": 234,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.9975780248641968
},
{
"episode": 3776,
"epoch": 0.10358259724584408,
"loss/policy_avg": 2.148421049118042,
"lr": 2.3815789473684212e-05,
"objective/entropy": 90.86304473876953,
"objective/kl": 81.66354370117188,
"objective/non_score_reward": -8.166354179382324,
"objective/rlhf_reward": -32.265417671203615,
"objective/scores": 0.1,
"policy/approxkl_avg": 23.5389347076416,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.9693481922149658,
"step": 235,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 1.998524785041809
},
{
"episode": 3792,
"epoch": 0.10402150655620783,
"loss/policy_avg": 1.1006348133087158,
"lr": 2.3789473684210527e-05,
"objective/entropy": 13.458210945129395,
"objective/kl": 63.5147590637207,
"objective/non_score_reward": -6.351475715637207,
"objective/rlhf_reward": -25.005903816223146,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.5866320729255676,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.84203040599823,
"step": 236,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0002055168151855
},
{
"episode": 3808,
"epoch": 0.10446041586657157,
"loss/policy_avg": 0.1537262499332428,
"lr": 2.3763157894736842e-05,
"objective/entropy": -280.88623046875,
"objective/kl": 25.253185272216797,
"objective/non_score_reward": -2.5253186225891113,
"objective/rlhf_reward": -5.701274490356445,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.43143346905708313,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.7017552256584167,
"step": 237,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.0003671646118164
},
{
"episode": 3824,
"epoch": 0.10489932517693532,
"loss/policy_avg": -1.110314130783081,
"lr": 2.373684210526316e-05,
"objective/entropy": -110.31336975097656,
"objective/kl": 62.17368698120117,
"objective/non_score_reward": -6.217369079589844,
"objective/rlhf_reward": -20.469476318359376,
"objective/scores": 1.1,
"policy/approxkl_avg": 41.373291015625,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7687387466430664,
"step": 238,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9982497692108154
},
{
"episode": 3840,
"epoch": 0.10533823448729907,
"loss/policy_avg": 0.35918915271759033,
"lr": 2.3710526315789475e-05,
"objective/entropy": 8.541339874267578,
"objective/kl": 67.1448974609375,
"objective/non_score_reward": -6.714488983154297,
"objective/rlhf_reward": -26.457956886291505,
"objective/scores": 0.1,
"policy/approxkl_avg": 3.951134204864502,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8946194052696228,
"step": 239,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9990932941436768
},
{
"episode": 3856,
"epoch": 0.10577714379766281,
"loss/policy_avg": -0.18191561102867126,
"lr": 2.368421052631579e-05,
"objective/entropy": 26.545223236083984,
"objective/kl": 42.12986755371094,
"objective/non_score_reward": -4.212986946105957,
"objective/rlhf_reward": -12.451948022842407,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.1634092330932617,
"policy/clipfrac_avg": 1.75,
"policy/entropy_avg": 0.8075993657112122,
"step": 240,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0009357929229736
},
{
"episode": 3872,
"epoch": 0.10621605310802655,
"loss/policy_avg": -0.21272988617420197,
"lr": 2.3657894736842105e-05,
"objective/entropy": -43.799278259277344,
"objective/kl": 57.01869583129883,
"objective/non_score_reward": -5.701869487762451,
"objective/rlhf_reward": -18.407477951049806,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.31846147775650024,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.9049383401870728,
"step": 241,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.00126051902771
},
{
"episode": 3888,
"epoch": 0.1066549624183903,
"loss/policy_avg": 0.45139366388320923,
"lr": 2.363157894736842e-05,
"objective/entropy": -8.316286087036133,
"objective/kl": 66.7007064819336,
"objective/non_score_reward": -6.670070648193359,
"objective/rlhf_reward": -22.280281162261964,
"objective/scores": 1.1,
"policy/approxkl_avg": 3.046037197113037,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.8059285879135132,
"step": 242,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9966351985931396
},
{
"episode": 3904,
"epoch": 0.10709387172875405,
"loss/policy_avg": 0.02522527426481247,
"lr": 2.360526315789474e-05,
"objective/entropy": -59.79475402832031,
"objective/kl": 43.16062927246094,
"objective/non_score_reward": -4.316062927246094,
"objective/rlhf_reward": -16.864251947402956,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.9877862930297852,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.7408619523048401,
"step": 243,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9999268054962158
},
{
"episode": 3920,
"epoch": 0.10753278103911779,
"loss/policy_avg": -0.4486180543899536,
"lr": 2.3578947368421054e-05,
"objective/entropy": -58.7701530456543,
"objective/kl": 48.37897491455078,
"objective/non_score_reward": -4.837897300720215,
"objective/rlhf_reward": -14.95158920288086,
"objective/scores": 1.1,
"policy/approxkl_avg": 108.83811950683594,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.847912073135376,
"step": 244,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9981459379196167
},
{
"episode": 3936,
"epoch": 0.10797169034948154,
"loss/policy_avg": 1.2412970066070557,
"lr": 2.355263157894737e-05,
"objective/entropy": -6.253645896911621,
"objective/kl": 60.986305236816406,
"objective/non_score_reward": -6.098630905151367,
"objective/rlhf_reward": -23.994523143768312,
"objective/scores": 0.1,
"policy/approxkl_avg": 5.727560997009277,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.8397929668426514,
"step": 245,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.997018814086914
},
{
"episode": 3952,
"epoch": 0.10841059965984529,
"loss/policy_avg": 0.993728518486023,
"lr": 2.3526315789473684e-05,
"objective/entropy": -7.986781120300293,
"objective/kl": 75.30459594726562,
"objective/non_score_reward": -7.530459880828857,
"objective/rlhf_reward": -25.72183952331543,
"objective/scores": 1.1,
"policy/approxkl_avg": 207.10345458984375,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6667543649673462,
"step": 246,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9957817792892456
},
{
"episode": 3968,
"epoch": 0.10884950897020904,
"loss/policy_avg": -0.0747595876455307,
"lr": 2.3500000000000002e-05,
"objective/entropy": -87.14791870117188,
"objective/kl": 53.44728469848633,
"objective/non_score_reward": -5.344728469848633,
"objective/rlhf_reward": -20.978913164138795,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.7891916036605835,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.9153791069984436,
"step": 247,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9984283447265625
},
{
"episode": 3984,
"epoch": 0.10928841828057277,
"loss/policy_avg": 1.0588536262512207,
"lr": 2.3473684210526317e-05,
"objective/entropy": -253.19564819335938,
"objective/kl": 25.629247665405273,
"objective/non_score_reward": -2.562924861907959,
"objective/rlhf_reward": -9.851699447631837,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.8195428848266602,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.7758696675300598,
"step": 248,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 2.0022389888763428
},
{
"episode": 4000,
"epoch": 0.10972732759093652,
"loss/policy_avg": 0.5859658718109131,
"lr": 2.3447368421052632e-05,
"objective/entropy": -90.75294494628906,
"objective/kl": 59.36740493774414,
"objective/non_score_reward": -5.936740875244141,
"objective/rlhf_reward": -23.346962547302248,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.5736149549484253,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7139400243759155,
"step": 249,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.002814292907715
},
{
"episode": 4016,
"epoch": 0.11016623690130027,
"loss/policy_avg": 1.2698756456375122,
"lr": 2.3421052631578947e-05,
"objective/entropy": 8.61981201171875,
"objective/kl": 45.71005630493164,
"objective/non_score_reward": -4.571005821228027,
"objective/rlhf_reward": -13.884023761749269,
"objective/scores": 1.1,
"policy/approxkl_avg": 4.179641246795654,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.9620411396026611,
"step": 250,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9981392621994019
},
{
"episode": 4032,
"epoch": 0.11060514621166402,
"loss/policy_avg": -0.1636083722114563,
"lr": 2.3394736842105262e-05,
"objective/entropy": -46.70410919189453,
"objective/kl": 52.554317474365234,
"objective/non_score_reward": -5.25543212890625,
"objective/rlhf_reward": -16.621727204322816,
"objective/scores": 1.1,
"policy/approxkl_avg": 185.56044006347656,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7868727445602417,
"step": 251,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.0018248558044434
},
{
"episode": 4048,
"epoch": 0.11104405552202776,
"loss/policy_avg": 0.20726332068443298,
"lr": 2.336842105263158e-05,
"objective/entropy": 42.283023834228516,
"objective/kl": 60.69983673095703,
"objective/non_score_reward": -6.06998348236084,
"objective/rlhf_reward": -23.879934883117677,
"objective/scores": 0.1,
"policy/approxkl_avg": 299.7823486328125,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8600380420684814,
"step": 252,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.001103401184082
},
{
"episode": 4064,
"epoch": 0.1114829648323915,
"loss/policy_avg": -0.7184450030326843,
"lr": 2.3342105263157895e-05,
"objective/entropy": -167.64480590820312,
"objective/kl": 39.79289245605469,
"objective/non_score_reward": -3.9792890548706055,
"objective/rlhf_reward": -13.517155742645265,
"objective/scores": 0.6,
"policy/approxkl_avg": 35.35560607910156,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7276349663734436,
"step": 253,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.0005733966827393
},
{
"episode": 4080,
"epoch": 0.11192187414275526,
"loss/policy_avg": 0.17252863943576813,
"lr": 2.331578947368421e-05,
"objective/entropy": -28.098251342773438,
"objective/kl": 77.68197631835938,
"objective/non_score_reward": -7.768197536468506,
"objective/rlhf_reward": -30.67278919219971,
"objective/scores": 0.1,
"policy/approxkl_avg": 190.54446411132812,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7322374582290649,
"step": 254,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.996679663658142
},
{
"episode": 4096,
"epoch": 0.11236078345311899,
"loss/policy_avg": 0.4479329288005829,
"lr": 2.3289473684210525e-05,
"objective/entropy": -36.64995193481445,
"objective/kl": 59.990604400634766,
"objective/non_score_reward": -5.999059677124023,
"objective/rlhf_reward": -21.07252112472174,
"objective/scores": 0.7309297535714575,
"policy/approxkl_avg": 2.55191969871521,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8982864618301392,
"step": 255,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9995880126953125
},
{
"episode": 4112,
"epoch": 0.11279969276348274,
"loss/policy_avg": 0.10965825617313385,
"lr": 2.326315789473684e-05,
"objective/entropy": -107.83656311035156,
"objective/kl": 57.40251922607422,
"objective/non_score_reward": -5.740252494812012,
"objective/rlhf_reward": -18.56100950241089,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.5369773507118225,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.9820707440376282,
"step": 256,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9997625350952148
},
{
"episode": 4128,
"epoch": 0.11323860207384649,
"loss/policy_avg": 1.0212502479553223,
"lr": 2.323684210526316e-05,
"objective/entropy": -31.868837356567383,
"objective/kl": 72.33842468261719,
"objective/non_score_reward": -7.233841896057129,
"objective/rlhf_reward": -24.535368537902833,
"objective/scores": 1.1,
"policy/approxkl_avg": 5.164497375488281,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.8438643217086792,
"step": 257,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9965769052505493
},
{
"episode": 4144,
"epoch": 0.11367751138421024,
"loss/policy_avg": 0.17471346259117126,
"lr": 2.3210526315789473e-05,
"objective/entropy": -30.013629913330078,
"objective/kl": 78.91902160644531,
"objective/non_score_reward": -7.891902923583984,
"objective/rlhf_reward": -31.16761121749878,
"objective/scores": 0.1,
"policy/approxkl_avg": 4.068915367126465,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7715609073638916,
"step": 258,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.999107003211975
},
{
"episode": 4160,
"epoch": 0.11411642069457398,
"loss/policy_avg": 0.03435403108596802,
"lr": 2.318421052631579e-05,
"objective/entropy": -285.7454833984375,
"objective/kl": 18.901094436645508,
"objective/non_score_reward": -1.8901095390319824,
"objective/rlhf_reward": -3.1604381561279293,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.1570599377155304,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.9358274936676025,
"step": 259,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 2.000042676925659
},
{
"episode": 4176,
"epoch": 0.11455533000493773,
"loss/policy_avg": 0.253903329372406,
"lr": 2.3157894736842103e-05,
"objective/entropy": -43.559417724609375,
"objective/kl": 62.325313568115234,
"objective/non_score_reward": -6.2325310707092285,
"objective/rlhf_reward": -24.530125236511232,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.8882020711898804,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 1.0222952365875244,
"step": 260,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.002045154571533
},
{
"episode": 4192,
"epoch": 0.11499423931530148,
"loss/policy_avg": -0.259467214345932,
"lr": 2.3131578947368422e-05,
"objective/entropy": -50.13906478881836,
"objective/kl": 56.37471008300781,
"objective/non_score_reward": -5.6374711990356445,
"objective/rlhf_reward": -22.149885749816896,
"objective/scores": 0.1,
"policy/approxkl_avg": 1.73235023021698,
"policy/clipfrac_avg": 1.75,
"policy/entropy_avg": 0.9029992818832397,
"step": 261,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.000865936279297
},
{
"episode": 4208,
"epoch": 0.11543314862566523,
"loss/policy_avg": -0.007709167897701263,
"lr": 2.3105263157894737e-05,
"objective/entropy": -27.86676788330078,
"objective/kl": 55.32978439331055,
"objective/non_score_reward": -5.532978534698486,
"objective/rlhf_reward": -21.731915092468263,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.949405312538147,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7955001592636108,
"step": 262,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0025434494018555
},
{
"episode": 4224,
"epoch": 0.11587205793602896,
"loss/policy_avg": 0.07850819826126099,
"lr": 2.3078947368421052e-05,
"objective/entropy": -71.25007629394531,
"objective/kl": 45.41839599609375,
"objective/non_score_reward": -4.541839599609375,
"objective/rlhf_reward": -17.767357921600343,
"objective/scores": 0.1,
"policy/approxkl_avg": 3.0701725482940674,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.690671980381012,
"step": 263,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.001460313796997
},
{
"episode": 4240,
"epoch": 0.11631096724639271,
"loss/policy_avg": -0.4324986934661865,
"lr": 2.3052631578947367e-05,
"objective/entropy": -28.658288955688477,
"objective/kl": 58.944786071777344,
"objective/non_score_reward": -5.894477844238281,
"objective/rlhf_reward": -23.177911853790285,
"objective/scores": 0.1,
"policy/approxkl_avg": 3.1384832859039307,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.7700298428535461,
"step": 264,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0013980865478516
},
{
"episode": 4256,
"epoch": 0.11674987655675646,
"loss/policy_avg": 0.03035491704940796,
"lr": 2.3026315789473685e-05,
"objective/entropy": -80.87973022460938,
"objective/kl": 47.47106170654297,
"objective/non_score_reward": -4.747106075286865,
"objective/rlhf_reward": -18.588424301147462,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.1451961100101471,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.7782449722290039,
"step": 265,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0004281997680664
},
{
"episode": 4272,
"epoch": 0.11718878586712021,
"loss/policy_avg": 0.15458990633487701,
"lr": 2.3000000000000003e-05,
"objective/entropy": -125.90150451660156,
"objective/kl": 68.26533508300781,
"objective/non_score_reward": -6.826533317565918,
"objective/rlhf_reward": -22.90613374710083,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.2503981590270996,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6700058579444885,
"step": 266,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9996408224105835
},
{
"episode": 4288,
"epoch": 0.11762769517748395,
"loss/policy_avg": -0.07002025842666626,
"lr": 2.297368421052632e-05,
"objective/entropy": -39.36412811279297,
"objective/kl": 64.72374725341797,
"objective/non_score_reward": -6.47237491607666,
"objective/rlhf_reward": -21.489498233795167,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.8438606262207031,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7219110727310181,
"step": 267,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.002565860748291
},
{
"episode": 4304,
"epoch": 0.1180666044878477,
"loss/policy_avg": 2.583950996398926,
"lr": 2.2947368421052633e-05,
"objective/entropy": -39.74678421020508,
"objective/kl": 60.672142028808594,
"objective/non_score_reward": -6.067214012145996,
"objective/rlhf_reward": -19.868857002258302,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.409726142883301,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7915390729904175,
"step": 268,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0011162757873535
},
{
"episode": 4320,
"epoch": 0.11850551379821145,
"loss/policy_avg": -0.9337702989578247,
"lr": 2.292105263157895e-05,
"objective/entropy": -6.267377853393555,
"objective/kl": 55.53129959106445,
"objective/non_score_reward": -5.553130149841309,
"objective/rlhf_reward": -21.812520599365236,
"objective/scores": 0.1,
"policy/approxkl_avg": 1.0464932918548584,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.8756027221679688,
"step": 269,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0002498626708984
},
{
"episode": 4336,
"epoch": 0.11894442310857518,
"loss/policy_avg": 0.41067397594451904,
"lr": 2.2894736842105263e-05,
"objective/entropy": -10.552501678466797,
"objective/kl": 59.17110824584961,
"objective/non_score_reward": -5.917110443115234,
"objective/rlhf_reward": -19.268443202972414,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.47551509737968445,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.6988734006881714,
"step": 270,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0003862380981445
},
{
"episode": 4352,
"epoch": 0.11938333241893893,
"loss/policy_avg": 0.19564735889434814,
"lr": 2.286842105263158e-05,
"objective/entropy": -27.893587112426758,
"objective/kl": 63.20972442626953,
"objective/non_score_reward": -6.320972442626953,
"objective/rlhf_reward": -20.883888816833498,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.4529670476913452,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6984528303146362,
"step": 271,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0030579566955566
},
{
"episode": 4368,
"epoch": 0.11982224172930268,
"loss/policy_avg": -0.5308524370193481,
"lr": 2.2842105263157897e-05,
"objective/entropy": -49.50511169433594,
"objective/kl": 54.02558135986328,
"objective/non_score_reward": -5.402558326721191,
"objective/rlhf_reward": -17.21023235321045,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.7996609210968018,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.7930437326431274,
"step": 272,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.001534938812256
},
{
"episode": 4384,
"epoch": 0.12026115103966643,
"loss/policy_avg": 0.7527059316635132,
"lr": 2.281578947368421e-05,
"objective/entropy": -54.087158203125,
"objective/kl": 74.53093719482422,
"objective/non_score_reward": -7.453094482421875,
"objective/rlhf_reward": -25.412376976013185,
"objective/scores": 1.1,
"policy/approxkl_avg": 4.522714138031006,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.84335857629776,
"step": 273,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9970676898956299
},
{
"episode": 4400,
"epoch": 0.12070006035003017,
"loss/policy_avg": 2.354644536972046,
"lr": 2.2789473684210527e-05,
"objective/entropy": -57.67284393310547,
"objective/kl": 61.635345458984375,
"objective/non_score_reward": -6.163534164428711,
"objective/rlhf_reward": -20.25413808822632,
"objective/scores": 1.1,
"policy/approxkl_avg": 3.8387179374694824,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.7114452123641968,
"step": 274,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9973610639572144
},
{
"episode": 4416,
"epoch": 0.12113896966039392,
"loss/policy_avg": 0.33613622188568115,
"lr": 2.2763157894736845e-05,
"objective/entropy": -16.483137130737305,
"objective/kl": 80.16790771484375,
"objective/non_score_reward": -8.016791343688965,
"objective/rlhf_reward": -30.119753668980536,
"objective/scores": 0.4868528072345416,
"policy/approxkl_avg": 384.392578125,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7436220645904541,
"step": 275,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.999522089958191
},
{
"episode": 4432,
"epoch": 0.12157787897075767,
"loss/policy_avg": 1.4383784532546997,
"lr": 2.273684210526316e-05,
"objective/entropy": -57.518455505371094,
"objective/kl": 60.987579345703125,
"objective/non_score_reward": -6.098758220672607,
"objective/rlhf_reward": -22.27232569672254,
"objective/scores": 0.5306765580733931,
"policy/approxkl_avg": 4.061164855957031,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6441831588745117,
"step": 276,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9998172521591187
},
{
"episode": 4448,
"epoch": 0.12201678828112142,
"loss/policy_avg": 1.3634085655212402,
"lr": 2.2710526315789475e-05,
"objective/entropy": -76.76998901367188,
"objective/kl": 48.89331817626953,
"objective/non_score_reward": -4.889332294464111,
"objective/rlhf_reward": -15.157328701019289,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.423219919204712,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.7695807814598083,
"step": 277,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9966275691986084
},
{
"episode": 4464,
"epoch": 0.12245569759148515,
"loss/policy_avg": 0.3571290373802185,
"lr": 2.268421052631579e-05,
"objective/entropy": -40.89424514770508,
"objective/kl": 55.5726432800293,
"objective/non_score_reward": -5.55726432800293,
"objective/rlhf_reward": -17.829056835174562,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.3838818073272705,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7596747875213623,
"step": 278,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.000396728515625
},
{
"episode": 4480,
"epoch": 0.1228946069018489,
"loss/policy_avg": 0.5862584710121155,
"lr": 2.2657894736842105e-05,
"objective/entropy": -71.57767486572266,
"objective/kl": 64.86282348632812,
"objective/non_score_reward": -6.4862823486328125,
"objective/rlhf_reward": -21.545130825042726,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.7939838171005249,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6686462163925171,
"step": 279,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.001072883605957
},
{
"episode": 4496,
"epoch": 0.12333351621221265,
"loss/policy_avg": -0.3135361969470978,
"lr": 2.2631578947368423e-05,
"objective/entropy": -197.05056762695312,
"objective/kl": 54.46437072753906,
"objective/non_score_reward": -5.446436882019043,
"objective/rlhf_reward": -17.385747528076173,
"objective/scores": 1.1,
"policy/approxkl_avg": 128.93673706054688,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.8125574588775635,
"step": 280,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.997973918914795
},
{
"episode": 4512,
"epoch": 0.1237724255225764,
"loss/policy_avg": 0.19447273015975952,
"lr": 2.2605263157894738e-05,
"objective/entropy": -70.06492614746094,
"objective/kl": 65.292724609375,
"objective/non_score_reward": -6.52927303314209,
"objective/rlhf_reward": -23.71709213256836,
"objective/scores": 0.6,
"policy/approxkl_avg": 4.314207553863525,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8678346872329712,
"step": 281,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.997523546218872
},
{
"episode": 4528,
"epoch": 0.12421133483294014,
"loss/policy_avg": 1.5640308856964111,
"lr": 2.2578947368421053e-05,
"objective/entropy": 5.301499366760254,
"objective/kl": 56.73728942871094,
"objective/non_score_reward": -5.673728942871094,
"objective/rlhf_reward": -24.694915771484375,
"objective/scores": -0.5,
"policy/approxkl_avg": 221.279052734375,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.539535403251648,
"step": 282,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 2,
"val/ratio": 1.9981220960617065
},
{
"episode": 4544,
"epoch": 0.12465024414330389,
"loss/policy_avg": -0.15352113544940948,
"lr": 2.2552631578947368e-05,
"objective/entropy": -116.27902221679688,
"objective/kl": 70.43946838378906,
"objective/non_score_reward": -7.043946743011475,
"objective/rlhf_reward": -27.77578649520874,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.8236973881721497,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.5488464832305908,
"step": 283,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 2.0008316040039062
},
{
"episode": 4560,
"epoch": 0.12508915345366764,
"loss/policy_avg": -0.3702731728553772,
"lr": 2.2526315789473686e-05,
"objective/entropy": -167.31336975097656,
"objective/kl": 39.96023178100586,
"objective/non_score_reward": -3.996023178100586,
"objective/rlhf_reward": -15.584092235565187,
"objective/scores": 0.1,
"policy/approxkl_avg": 56.35317611694336,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.9425455331802368,
"step": 284,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9981685876846313
},
{
"episode": 4576,
"epoch": 0.1255280627640314,
"loss/policy_avg": 0.20314961671829224,
"lr": 2.25e-05,
"objective/entropy": -204.00357055664062,
"objective/kl": 47.920372009277344,
"objective/non_score_reward": -4.792037010192871,
"objective/rlhf_reward": -14.768148040771486,
"objective/scores": 1.1,
"policy/approxkl_avg": 79.69387817382812,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.8037855625152588,
"step": 285,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9953657388687134
},
{
"episode": 4592,
"epoch": 0.12596697207439514,
"loss/policy_avg": 1.1937806606292725,
"lr": 2.2473684210526316e-05,
"objective/entropy": -74.75482177734375,
"objective/kl": 61.91727828979492,
"objective/non_score_reward": -6.191727638244629,
"objective/rlhf_reward": -22.644205751196417,
"objective/scores": 0.5306765580733931,
"policy/approxkl_avg": 0.780440628528595,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.9212397933006287,
"step": 286,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.000117540359497
},
{
"episode": 4608,
"epoch": 0.12640588138475886,
"loss/policy_avg": -0.05165944993495941,
"lr": 2.244736842105263e-05,
"objective/entropy": -65.29678344726562,
"objective/kl": 58.81901168823242,
"objective/non_score_reward": -5.881901264190674,
"objective/rlhf_reward": -19.127605056762697,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.41273033618927,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.5402939915657043,
"step": 287,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.998548984527588
},
{
"episode": 4624,
"epoch": 0.1268447906951226,
"loss/policy_avg": 0.4261413514614105,
"lr": 2.2421052631578946e-05,
"objective/entropy": -260.0628662109375,
"objective/kl": 54.19302749633789,
"objective/non_score_reward": -5.419302940368652,
"objective/rlhf_reward": -21.27721176147461,
"objective/scores": 0.1,
"policy/approxkl_avg": 1.7060222625732422,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.6860594749450684,
"step": 288,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 2.0012943744659424
},
{
"episode": 4640,
"epoch": 0.12728370000548636,
"loss/policy_avg": 0.21566912531852722,
"lr": 2.2394736842105265e-05,
"objective/entropy": -55.46482849121094,
"objective/kl": 62.80320739746094,
"objective/non_score_reward": -6.28032112121582,
"objective/rlhf_reward": -24.721283531188966,
"objective/scores": 0.1,
"policy/approxkl_avg": 1.185427188873291,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.9172104597091675,
"step": 289,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9994546175003052
},
{
"episode": 4656,
"epoch": 0.1277226093158501,
"loss/policy_avg": 0.4541619122028351,
"lr": 2.236842105263158e-05,
"objective/entropy": -61.278629302978516,
"objective/kl": 46.581844329833984,
"objective/non_score_reward": -4.658184051513672,
"objective/rlhf_reward": -14.232737159729005,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.593280076980591,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.6413055658340454,
"step": 290,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9979616403579712
},
{
"episode": 4672,
"epoch": 0.12816151862621386,
"loss/policy_avg": 2.0460634231567383,
"lr": 2.2342105263157895e-05,
"objective/entropy": -106.77190399169922,
"objective/kl": 54.43701934814453,
"objective/non_score_reward": -5.443702220916748,
"objective/rlhf_reward": -17.374808883666994,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.7687790393829346,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.7337887287139893,
"step": 291,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9968886375427246
},
{
"episode": 4688,
"epoch": 0.1286004279365776,
"loss/policy_avg": 0.13285712897777557,
"lr": 2.231578947368421e-05,
"objective/entropy": -54.86618423461914,
"objective/kl": 64.16263580322266,
"objective/non_score_reward": -6.416263580322266,
"objective/rlhf_reward": -21.265054798126222,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.0963797569274902,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7351999282836914,
"step": 292,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9985344409942627
},
{
"episode": 4704,
"epoch": 0.12903933724694136,
"loss/policy_avg": 0.5066732168197632,
"lr": 2.2289473684210525e-05,
"objective/entropy": -5.0934295654296875,
"objective/kl": 55.561546325683594,
"objective/non_score_reward": -5.556154727935791,
"objective/rlhf_reward": -17.824618911743165,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.9651141166687012,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.736973762512207,
"step": 293,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9991990327835083
},
{
"episode": 4720,
"epoch": 0.1294782465573051,
"loss/policy_avg": -0.49058061838150024,
"lr": 2.2263157894736843e-05,
"objective/entropy": -31.643150329589844,
"objective/kl": 91.68218994140625,
"objective/non_score_reward": -9.168219566345215,
"objective/rlhf_reward": -32.27287635803223,
"objective/scores": 1.1,
"policy/approxkl_avg": 50.992034912109375,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8162568807601929,
"step": 294,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.000028133392334
},
{
"episode": 4736,
"epoch": 0.12991715586766883,
"loss/policy_avg": 0.280429482460022,
"lr": 2.2236842105263158e-05,
"objective/entropy": -37.46349334716797,
"objective/kl": 56.00715637207031,
"objective/non_score_reward": -5.600715637207031,
"objective/rlhf_reward": -18.002863502502443,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.3156410455703735,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8110268115997314,
"step": 295,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9986287355422974
},
{
"episode": 4752,
"epoch": 0.13035606517803258,
"loss/policy_avg": 0.46459612250328064,
"lr": 2.2210526315789473e-05,
"objective/entropy": -301.3935241699219,
"objective/kl": 36.17216110229492,
"objective/non_score_reward": -3.617216110229492,
"objective/rlhf_reward": -14.06886444091797,
"objective/scores": 0.1,
"policy/approxkl_avg": 1.247771978378296,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.8073223233222961,
"step": 296,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.9980659484863281
},
{
"episode": 4768,
"epoch": 0.13079497448839633,
"loss/policy_avg": -0.6589095592498779,
"lr": 2.2184210526315788e-05,
"objective/entropy": -87.08273315429688,
"objective/kl": 53.673248291015625,
"objective/non_score_reward": -5.3673248291015625,
"objective/rlhf_reward": -17.06929979324341,
"objective/scores": 1.1,
"policy/approxkl_avg": 41.84059143066406,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.8697096705436707,
"step": 297,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 1.999763011932373
},
{
"episode": 4784,
"epoch": 0.13123388379876008,
"loss/policy_avg": 0.5723211765289307,
"lr": 2.2157894736842106e-05,
"objective/entropy": -102.64724731445312,
"objective/kl": 60.75276184082031,
"objective/non_score_reward": -6.075276851654053,
"objective/rlhf_reward": -19.901106929779054,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.05747127532959,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7324357032775879,
"step": 298,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9990253448486328
},
{
"episode": 4800,
"epoch": 0.13167279310912383,
"loss/policy_avg": 0.4579891562461853,
"lr": 2.213157894736842e-05,
"objective/entropy": -297.0541687011719,
"objective/kl": 35.25475311279297,
"objective/non_score_reward": -3.525475263595581,
"objective/rlhf_reward": -12.4977810717264,
"objective/scores": 0.40102999566398123,
"policy/approxkl_avg": 0.13317279517650604,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.7126107811927795,
"step": 299,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 7,
"val/ratio": 1.9996960163116455
},
{
"episode": 4816,
"epoch": 0.13211170241948758,
"loss/policy_avg": 5.555450439453125,
"lr": 2.2105263157894736e-05,
"objective/entropy": -8.623800277709961,
"objective/kl": 106.37024688720703,
"objective/non_score_reward": -10.637025833129883,
"objective/rlhf_reward": -42.148100471496576,
"objective/scores": 0.1,
"policy/approxkl_avg": 105.52049255371094,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7556277513504028,
"step": 300,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9987742900848389
},
{
"episode": 4832,
"epoch": 0.13255061172985133,
"loss/policy_avg": 1.14644193649292,
"lr": 2.207894736842105e-05,
"objective/entropy": -50.783111572265625,
"objective/kl": 64.76817321777344,
"objective/non_score_reward": -6.4768171310424805,
"objective/rlhf_reward": -21.507267570495607,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.36064863204956055,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7552889585494995,
"step": 301,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0014567375183105
},
{
"episode": 4848,
"epoch": 0.13298952104021505,
"loss/policy_avg": -1.4459398984909058,
"lr": 2.2052631578947366e-05,
"objective/entropy": -131.6910858154297,
"objective/kl": 60.966819763183594,
"objective/non_score_reward": -6.096682548522949,
"objective/rlhf_reward": -19.986729240417482,
"objective/scores": 1.1,
"policy/approxkl_avg": 84.77787780761719,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6685715913772583,
"step": 302,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.9999291896820068
},
{
"episode": 4864,
"epoch": 0.1334284303505788,
"loss/policy_avg": 2.2287280559539795,
"lr": 2.2026315789473684e-05,
"objective/entropy": -86.05450439453125,
"objective/kl": 63.66607666015625,
"objective/non_score_reward": -6.366608619689941,
"objective/rlhf_reward": -25.06643376350403,
"objective/scores": 0.1,
"policy/approxkl_avg": 118.51611328125,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8448888063430786,
"step": 303,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9962316751480103
},
{
"episode": 4880,
"epoch": 0.13386733966094255,
"loss/policy_avg": -0.04208461940288544,
"lr": 2.2e-05,
"objective/entropy": -136.94601440429688,
"objective/kl": 46.04341506958008,
"objective/non_score_reward": -4.604341506958008,
"objective/rlhf_reward": -14.01736650466919,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.618884325027466,
"policy/clipfrac_avg": 1.75,
"policy/entropy_avg": 0.6870962381362915,
"step": 304,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.000941038131714
},
{
"episode": 4896,
"epoch": 0.1343062489713063,
"loss/policy_avg": 0.08847332000732422,
"lr": 2.1973684210526314e-05,
"objective/entropy": -113.82402801513672,
"objective/kl": 73.18362426757812,
"objective/non_score_reward": -7.318362236022949,
"objective/rlhf_reward": -24.873450374603273,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.0231006145477295,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8040682077407837,
"step": 305,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9995307922363281
},
{
"episode": 4912,
"epoch": 0.13474515828167005,
"loss/policy_avg": -0.6677903532981873,
"lr": 2.1947368421052633e-05,
"objective/entropy": -184.19952392578125,
"objective/kl": 45.321136474609375,
"objective/non_score_reward": -4.532113075256348,
"objective/rlhf_reward": -15.728453254699708,
"objective/scores": 0.6,
"policy/approxkl_avg": 19.28260040283203,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8651016354560852,
"step": 306,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.9984790086746216
},
{
"episode": 4928,
"epoch": 0.1351840675920338,
"loss/policy_avg": -0.6809933185577393,
"lr": 2.192105263157895e-05,
"objective/entropy": -110.36442565917969,
"objective/kl": 64.39363098144531,
"objective/non_score_reward": -6.4393630027771,
"objective/rlhf_reward": -21.3574520111084,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.8575007915496826,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.7239686250686646,
"step": 307,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9992657899856567
},
{
"episode": 4944,
"epoch": 0.13562297690239755,
"loss/policy_avg": 0.12154096364974976,
"lr": 2.1894736842105266e-05,
"objective/entropy": -269.3300476074219,
"objective/kl": 36.127044677734375,
"objective/non_score_reward": -3.6127047538757324,
"objective/rlhf_reward": -11.527099524379942,
"objective/scores": 0.7309297535714575,
"policy/approxkl_avg": 0.3908675014972687,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.8144656419754028,
"step": 308,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.9992928504943848
},
{
"episode": 4960,
"epoch": 0.1360618862127613,
"loss/policy_avg": -0.45784494280815125,
"lr": 2.186842105263158e-05,
"objective/entropy": -53.075233459472656,
"objective/kl": 55.653831481933594,
"objective/non_score_reward": -5.565382957458496,
"objective/rlhf_reward": -17.861531829833986,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.0932250022888184,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6408039331436157,
"step": 309,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9994187355041504
},
{
"episode": 4976,
"epoch": 0.13650079552312502,
"loss/policy_avg": 1.0032461881637573,
"lr": 2.1842105263157896e-05,
"objective/entropy": -32.92076873779297,
"objective/kl": 68.55870056152344,
"objective/non_score_reward": -6.855870246887207,
"objective/rlhf_reward": -27.02348051071167,
"objective/scores": 0.1,
"policy/approxkl_avg": 3.289876699447632,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8963852524757385,
"step": 310,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9983676671981812
},
{
"episode": 4992,
"epoch": 0.13693970483348877,
"loss/policy_avg": -0.9319555759429932,
"lr": 2.181578947368421e-05,
"objective/entropy": -155.17787170410156,
"objective/kl": 57.721900939941406,
"objective/non_score_reward": -5.772191047668457,
"objective/rlhf_reward": -18.68876419067383,
"objective/scores": 1.1,
"policy/approxkl_avg": 104.70545959472656,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.9497686624526978,
"step": 311,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.9971158504486084
},
{
"episode": 5008,
"epoch": 0.13737861414385252,
"loss/policy_avg": 4.808812141418457,
"lr": 2.178947368421053e-05,
"objective/entropy": 16.19580841064453,
"objective/kl": 73.25709533691406,
"objective/non_score_reward": -7.325709342956543,
"objective/rlhf_reward": -28.902838027477266,
"objective/scores": 0.1,
"policy/approxkl_avg": 66.1017837524414,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7183545827865601,
"step": 312,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 1.9997210502624512
},
{
"episode": 5024,
"epoch": 0.13781752345421627,
"loss/policy_avg": -0.00417676568031311,
"lr": 2.1763157894736844e-05,
"objective/entropy": -36.66968536376953,
"objective/kl": 69.7296142578125,
"objective/non_score_reward": -6.972960948944092,
"objective/rlhf_reward": -24.968125258327696,
"objective/scores": 0.7309297535714575,
"policy/approxkl_avg": 0.3599999248981476,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8020345568656921,
"step": 313,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0006017684936523
},
{
"episode": 5040,
"epoch": 0.13825643276458002,
"loss/policy_avg": 1.2215163707733154,
"lr": 2.173684210526316e-05,
"objective/entropy": 31.91278648376465,
"objective/kl": 60.51493453979492,
"objective/non_score_reward": -6.0514936447143555,
"objective/rlhf_reward": -22.690202319415743,
"objective/scores": 0.37894294565112985,
"policy/approxkl_avg": 3.8936374187469482,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.9649239778518677,
"step": 314,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9966028928756714
},
{
"episode": 5056,
"epoch": 0.13869534207494377,
"loss/policy_avg": 0.34088611602783203,
"lr": 2.1710526315789474e-05,
"objective/entropy": -262.01800537109375,
"objective/kl": 25.127941131591797,
"objective/non_score_reward": -2.512794256210327,
"objective/rlhf_reward": -9.65117702484131,
"objective/scores": 0.1,
"policy/approxkl_avg": 1.9674937725067139,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 1.1907947063446045,
"step": 315,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 7,
"val/ratio": 1.9991812705993652
},
{
"episode": 5072,
"epoch": 0.13913425138530752,
"loss/policy_avg": 0.8795095086097717,
"lr": 2.168421052631579e-05,
"objective/entropy": -25.965755462646484,
"objective/kl": 67.08163452148438,
"objective/non_score_reward": -6.708163738250732,
"objective/rlhf_reward": -26.43265542984009,
"objective/scores": 0.1,
"policy/approxkl_avg": 1.6068958044052124,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.9191763401031494,
"step": 316,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9982707500457764
},
{
"episode": 5088,
"epoch": 0.13957316069567124,
"loss/policy_avg": 4.1575822830200195,
"lr": 2.1657894736842108e-05,
"objective/entropy": 75.68894958496094,
"objective/kl": 100.57203674316406,
"objective/non_score_reward": -10.057204246520996,
"objective/rlhf_reward": -42.228816986083984,
"objective/scores": -0.5,
"policy/approxkl_avg": 114.35267639160156,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.9920985698699951,
"step": 317,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 1.9952189922332764
},
{
"episode": 5104,
"epoch": 0.140012070006035,
"loss/policy_avg": 5.1927056312561035,
"lr": 2.1631578947368423e-05,
"objective/entropy": -85.5977783203125,
"objective/kl": 47.872772216796875,
"objective/non_score_reward": -4.787276744842529,
"objective/rlhf_reward": -21.149106979370117,
"objective/scores": -0.5,
"policy/approxkl_avg": 68.31144714355469,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.7419743537902832,
"step": 318,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 1.9981746673583984
},
{
"episode": 5120,
"epoch": 0.14045097931639874,
"loss/policy_avg": 0.17260417342185974,
"lr": 2.1605263157894738e-05,
"objective/entropy": 5.903343200683594,
"objective/kl": 58.757911682128906,
"objective/non_score_reward": -5.875791072845459,
"objective/rlhf_reward": -23.103164291381837,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.40057724714279175,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.9249540567398071,
"step": 319,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.001283645629883
},
{
"episode": 5136,
"epoch": 0.1408898886267625,
"loss/policy_avg": 0.7069985866546631,
"lr": 2.1578947368421053e-05,
"objective/entropy": -40.48529052734375,
"objective/kl": 66.60336303710938,
"objective/non_score_reward": -6.660336494445801,
"objective/rlhf_reward": -26.24134693145752,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.6525074243545532,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6802330613136292,
"step": 320,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0002126693725586
},
{
"episode": 5152,
"epoch": 0.14132879793712624,
"loss/policy_avg": 1.3065457344055176,
"lr": 2.155263157894737e-05,
"objective/entropy": -42.642120361328125,
"objective/kl": 65.40618896484375,
"objective/non_score_reward": -6.540618419647217,
"objective/rlhf_reward": -21.762474632263185,
"objective/scores": 1.1,
"policy/approxkl_avg": 87.9366455078125,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7430741190910339,
"step": 321,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9962996244430542
},
{
"episode": 5168,
"epoch": 0.14176770724749,
"loss/policy_avg": 1.3831791877746582,
"lr": 2.1526315789473686e-05,
"objective/entropy": -79.70892333984375,
"objective/kl": 57.3241081237793,
"objective/non_score_reward": -5.732410907745361,
"objective/rlhf_reward": -18.529643630981447,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.5671920776367188,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7802742719650269,
"step": 322,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.002181053161621
},
{
"episode": 5184,
"epoch": 0.14220661655785374,
"loss/policy_avg": 0.1442442387342453,
"lr": 2.15e-05,
"objective/entropy": -68.2364273071289,
"objective/kl": 58.37102127075195,
"objective/non_score_reward": -5.837101936340332,
"objective/rlhf_reward": -18.948408222198488,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.7003833055496216,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.8646501302719116,
"step": 323,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0002686977386475
},
{
"episode": 5200,
"epoch": 0.1426455258682175,
"loss/policy_avg": 0.2340829223394394,
"lr": 2.1473684210526316e-05,
"objective/entropy": 40.878807067871094,
"objective/kl": 64.59869384765625,
"objective/non_score_reward": -6.459869384765625,
"objective/rlhf_reward": -21.439478492736818,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.2884216904640198,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.8738260269165039,
"step": 324,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.000551223754883
},
{
"episode": 5216,
"epoch": 0.14308443517858122,
"loss/policy_avg": 1.2863520383834839,
"lr": 2.144736842105263e-05,
"objective/entropy": -44.597232818603516,
"objective/kl": 65.76396179199219,
"objective/non_score_reward": -6.5763959884643555,
"objective/rlhf_reward": -25.90558443069458,
"objective/scores": 0.1,
"policy/approxkl_avg": 75.11996459960938,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.9290659427642822,
"step": 325,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9984740018844604
},
{
"episode": 5232,
"epoch": 0.14352334448894496,
"loss/policy_avg": -0.5511414408683777,
"lr": 2.142105263157895e-05,
"objective/entropy": -234.2386474609375,
"objective/kl": 39.21967315673828,
"objective/non_score_reward": -3.921967029571533,
"objective/rlhf_reward": -17.687868118286133,
"objective/scores": -0.5,
"policy/approxkl_avg": 0.12211395800113678,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.897687554359436,
"step": 326,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.0014986991882324
},
{
"episode": 5248,
"epoch": 0.14396225379930871,
"loss/policy_avg": 0.07166372239589691,
"lr": 2.1394736842105264e-05,
"objective/entropy": -281.99237060546875,
"objective/kl": 26.73297691345215,
"objective/non_score_reward": -2.673297882080078,
"objective/rlhf_reward": -6.293191528320313,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.3802681267261505,
"policy/clipfrac_avg": 0.0,
"policy/entropy_avg": 0.7150620818138123,
"step": 327,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 2.000077247619629
},
{
"episode": 5264,
"epoch": 0.14440116310967246,
"loss/policy_avg": 0.7263858318328857,
"lr": 2.136842105263158e-05,
"objective/entropy": -50.91541290283203,
"objective/kl": 56.45762634277344,
"objective/non_score_reward": -5.6457624435424805,
"objective/rlhf_reward": -18.18305025100708,
"objective/scores": 1.1,
"policy/approxkl_avg": 3.6975631713867188,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.981597900390625,
"step": 328,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9971437454223633
},
{
"episode": 5280,
"epoch": 0.14484007242003621,
"loss/policy_avg": -0.8948829174041748,
"lr": 2.1342105263157894e-05,
"objective/entropy": -104.03028869628906,
"objective/kl": 64.45817565917969,
"objective/non_score_reward": -6.445817947387695,
"objective/rlhf_reward": -23.660566510931524,
"objective/scores": 0.5306765580733931,
"policy/approxkl_avg": 73.4510498046875,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8826221227645874,
"step": 329,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9989889860153198
},
{
"episode": 5296,
"epoch": 0.14527898173039996,
"loss/policy_avg": -1.3438373804092407,
"lr": 2.1315789473684212e-05,
"objective/entropy": -59.31224060058594,
"objective/kl": 93.35623168945312,
"objective/non_score_reward": -9.335622787475586,
"objective/rlhf_reward": -36.94249258041381,
"objective/scores": 0.1,
"policy/approxkl_avg": 14.088260650634766,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.9897010326385498,
"step": 330,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0010061264038086
},
{
"episode": 5312,
"epoch": 0.14571789104076371,
"loss/policy_avg": 0.26144880056381226,
"lr": 2.1289473684210527e-05,
"objective/entropy": -51.645599365234375,
"objective/kl": 65.5647201538086,
"objective/non_score_reward": -6.556471824645996,
"objective/rlhf_reward": -21.825886821746828,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.4604711532592773,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.8496096134185791,
"step": 331,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0012714862823486
},
{
"episode": 5328,
"epoch": 0.14615680035112744,
"loss/policy_avg": 1.6116310358047485,
"lr": 2.1263157894736842e-05,
"objective/entropy": -2.5801925659179688,
"objective/kl": 90.74581909179688,
"objective/non_score_reward": -9.074581146240234,
"objective/rlhf_reward": -31.898327445983888,
"objective/scores": 1.1,
"policy/approxkl_avg": 161.30035400390625,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8422311544418335,
"step": 332,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9967368841171265
},
{
"episode": 5344,
"epoch": 0.14659570966149119,
"loss/policy_avg": 0.12108969688415527,
"lr": 2.1236842105263157e-05,
"objective/entropy": -26.530567169189453,
"objective/kl": 51.3416748046875,
"objective/non_score_reward": -5.134167194366455,
"objective/rlhf_reward": -16.13666877746582,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.6582493782043457,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8019247651100159,
"step": 333,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9982410669326782
},
{
"episode": 5360,
"epoch": 0.14703461897185494,
"loss/policy_avg": 0.024289570748806,
"lr": 2.1210526315789472e-05,
"objective/entropy": -281.92706298828125,
"objective/kl": 22.00769805908203,
"objective/non_score_reward": -2.2007696628570557,
"objective/rlhf_reward": -6.680372419134651,
"objective/scores": 0.5306765580733931,
"policy/approxkl_avg": 0.024459868669509888,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.8491325974464417,
"step": 334,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 2.000946521759033
},
{
"episode": 5376,
"epoch": 0.14747352828221869,
"loss/policy_avg": 0.25369805097579956,
"lr": 2.118421052631579e-05,
"objective/entropy": -22.282852172851562,
"objective/kl": 61.02508544921875,
"objective/non_score_reward": -6.102509021759033,
"objective/rlhf_reward": -20.010036087036134,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.20567560195922852,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.861437201499939,
"step": 335,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.000655174255371
},
{
"episode": 5392,
"epoch": 0.14791243759258244,
"loss/policy_avg": -1.4644429683685303,
"lr": 2.1157894736842106e-05,
"objective/entropy": -44.415863037109375,
"objective/kl": 66.73587036132812,
"objective/non_score_reward": -6.673586368560791,
"objective/rlhf_reward": -22.294345951080324,
"objective/scores": 1.1,
"policy/approxkl_avg": 34.78900146484375,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 1.028685212135315,
"step": 336,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9993712902069092
},
{
"episode": 5408,
"epoch": 0.14835134690294619,
"loss/policy_avg": 0.9412789344787598,
"lr": 2.113157894736842e-05,
"objective/entropy": -100.75498962402344,
"objective/kl": 66.7685775756836,
"objective/non_score_reward": -6.676857948303223,
"objective/rlhf_reward": -22.307430839538576,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.902991533279419,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.8672486543655396,
"step": 337,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9970085620880127
},
{
"episode": 5424,
"epoch": 0.14879025621330993,
"loss/policy_avg": -0.2066299468278885,
"lr": 2.1105263157894736e-05,
"objective/entropy": -302.25445556640625,
"objective/kl": 27.72308921813965,
"objective/non_score_reward": -2.7723090648651123,
"objective/rlhf_reward": -10.689236021041872,
"objective/scores": 0.1,
"policy/approxkl_avg": 1.4395582675933838,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.6688382029533386,
"step": 338,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 7,
"val/ratio": 2.000229835510254
},
{
"episode": 5440,
"epoch": 0.14922916552367368,
"loss/policy_avg": 0.2728902995586395,
"lr": 2.107894736842105e-05,
"objective/entropy": -61.32304382324219,
"objective/kl": 47.950096130371094,
"objective/non_score_reward": -4.795009613037109,
"objective/rlhf_reward": -14.780039167404176,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.8936206102371216,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.9472643733024597,
"step": 339,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0016930103302
},
{
"episode": 5456,
"epoch": 0.1496680748340374,
"loss/policy_avg": -0.455877423286438,
"lr": 2.105263157894737e-05,
"objective/entropy": -281.2245788574219,
"objective/kl": 46.91923522949219,
"objective/non_score_reward": -4.69192361831665,
"objective/rlhf_reward": -20.7676944732666,
"objective/scores": -0.5,
"policy/approxkl_avg": 2.1040773391723633,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.751963198184967,
"step": 340,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.999725103378296
},
{
"episode": 5472,
"epoch": 0.15010698414440116,
"loss/policy_avg": 0.18102796375751495,
"lr": 2.1026315789473684e-05,
"objective/entropy": -39.01849365234375,
"objective/kl": 57.12322998046875,
"objective/non_score_reward": -5.712323188781738,
"objective/rlhf_reward": -18.449292278289796,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.9643039703369141,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.9084256887435913,
"step": 341,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.998090147972107
},
{
"episode": 5488,
"epoch": 0.1505458934547649,
"loss/policy_avg": -0.9690545201301575,
"lr": 2.1e-05,
"objective/entropy": 34.1061897277832,
"objective/kl": 62.09593200683594,
"objective/non_score_reward": -6.209592819213867,
"objective/rlhf_reward": -20.438370800018312,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.2767698764801025,
"policy/clipfrac_avg": 1.75,
"policy/entropy_avg": 0.8822450637817383,
"step": 342,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0016417503356934
},
{
"episode": 5504,
"epoch": 0.15098480276512866,
"loss/policy_avg": -1.0985370874404907,
"lr": 2.0973684210526314e-05,
"objective/entropy": -98.5644302368164,
"objective/kl": 60.704673767089844,
"objective/non_score_reward": -6.070467948913574,
"objective/rlhf_reward": -23.881870365142824,
"objective/scores": 0.1,
"policy/approxkl_avg": 60.376220703125,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.9388371706008911,
"step": 343,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 2.000264883041382
},
{
"episode": 5520,
"epoch": 0.1514237120754924,
"loss/policy_avg": 0.04451874643564224,
"lr": 2.0947368421052632e-05,
"objective/entropy": -291.84930419921875,
"objective/kl": 31.81546401977539,
"objective/non_score_reward": -3.181546688079834,
"objective/rlhf_reward": -12.32618627548218,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.256332665681839,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.9044315814971924,
"step": 344,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.999337911605835
},
{
"episode": 5536,
"epoch": 0.15186262138585616,
"loss/policy_avg": 0.33280524611473083,
"lr": 2.0921052631578947e-05,
"objective/entropy": -297.44451904296875,
"objective/kl": 35.726036071777344,
"objective/non_score_reward": -3.572603702545166,
"objective/rlhf_reward": -16.290414810180664,
"objective/scores": -0.5,
"policy/approxkl_avg": 0.19790010154247284,
"policy/clipfrac_avg": 0.0,
"policy/entropy_avg": 0.7775914072990417,
"step": 345,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 2.0004537105560303
},
{
"episode": 5552,
"epoch": 0.1523015306962199,
"loss/policy_avg": 0.2544030249118805,
"lr": 2.0894736842105262e-05,
"objective/entropy": 17.410404205322266,
"objective/kl": 64.44038391113281,
"objective/non_score_reward": -6.444038391113281,
"objective/rlhf_reward": -21.37615261077881,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.9380027651786804,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8260934352874756,
"step": 346,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9999771118164062
},
{
"episode": 5568,
"epoch": 0.15274044000658363,
"loss/policy_avg": 0.10114617645740509,
"lr": 2.086842105263158e-05,
"objective/entropy": -246.6290283203125,
"objective/kl": 27.598003387451172,
"objective/non_score_reward": -2.759800434112549,
"objective/rlhf_reward": -8.639202213287355,
"objective/scores": 0.6,
"policy/approxkl_avg": 0.4102931618690491,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.7962213754653931,
"step": 347,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.999171257019043
},
{
"episode": 5584,
"epoch": 0.15317934931694738,
"loss/policy_avg": -0.27266669273376465,
"lr": 2.0842105263157895e-05,
"objective/entropy": -11.595855712890625,
"objective/kl": 87.985595703125,
"objective/non_score_reward": -8.798559188842773,
"objective/rlhf_reward": -30.794237232208253,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.9808310270309448,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7044839262962341,
"step": 348,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0007858276367188
},
{
"episode": 5600,
"epoch": 0.15361825862731113,
"loss/policy_avg": -0.88209068775177,
"lr": 2.0815789473684214e-05,
"objective/entropy": -190.73699951171875,
"objective/kl": 50.43879318237305,
"objective/non_score_reward": -5.04387903213501,
"objective/rlhf_reward": -15.77551612854004,
"objective/scores": 1.1,
"policy/approxkl_avg": 156.59344482421875,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.677651047706604,
"step": 349,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9986798763275146
},
{
"episode": 5616,
"epoch": 0.15405716793767488,
"loss/policy_avg": 0.14822402596473694,
"lr": 2.078947368421053e-05,
"objective/entropy": -283.6138916015625,
"objective/kl": 20.200992584228516,
"objective/non_score_reward": -2.02009916305542,
"objective/rlhf_reward": -3.6803968906402584,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.2338528633117676,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.8573083877563477,
"step": 350,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 2.0005390644073486
},
{
"episode": 5632,
"epoch": 0.15449607724803863,
"loss/policy_avg": 0.0547795295715332,
"lr": 2.0763157894736844e-05,
"objective/entropy": -284.02154541015625,
"objective/kl": 32.1505241394043,
"objective/non_score_reward": -3.215052604675293,
"objective/rlhf_reward": -8.460210895538331,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.7220900654792786,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.7125214338302612,
"step": 351,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.999941110610962
},
{
"episode": 5648,
"epoch": 0.15493498655840238,
"loss/policy_avg": 0.5199474692344666,
"lr": 2.073684210526316e-05,
"objective/entropy": 1.3252010345458984,
"objective/kl": 80.30017852783203,
"objective/non_score_reward": -8.030017852783203,
"objective/rlhf_reward": -27.720070457458498,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.922848701477051,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8046101331710815,
"step": 352,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 2.0003271102905273
},
{
"episode": 5664,
"epoch": 0.15537389586876613,
"loss/policy_avg": 0.06510049104690552,
"lr": 2.0710526315789474e-05,
"objective/entropy": -4.9978132247924805,
"objective/kl": 74.41607666015625,
"objective/non_score_reward": -7.441607475280762,
"objective/rlhf_reward": -29.366430377960206,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.37788572907447815,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.9078077077865601,
"step": 353,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0011749267578125
},
{
"episode": 5680,
"epoch": 0.15581280517912985,
"loss/policy_avg": -0.5775541663169861,
"lr": 2.0684210526315792e-05,
"objective/entropy": -262.08123779296875,
"objective/kl": 39.923824310302734,
"objective/non_score_reward": -3.992382526397705,
"objective/rlhf_reward": -15.569530105590822,
"objective/scores": 0.1,
"policy/approxkl_avg": 1.2174054384231567,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.7082418203353882,
"step": 354,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 2.0004000663757324
},
{
"episode": 5696,
"epoch": 0.1562517144894936,
"loss/policy_avg": -0.7071057558059692,
"lr": 2.0657894736842107e-05,
"objective/entropy": -120.38491821289062,
"objective/kl": 57.850921630859375,
"objective/non_score_reward": -5.785092353820801,
"objective/rlhf_reward": -21.53624895579012,
"objective/scores": 0.40102999566398123,
"policy/approxkl_avg": 30.95258331298828,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7739803194999695,
"step": 355,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9983584880828857
},
{
"episode": 5712,
"epoch": 0.15669062379985735,
"loss/policy_avg": -0.019969038665294647,
"lr": 2.0631578947368422e-05,
"objective/entropy": -7.5037031173706055,
"objective/kl": 49.77817153930664,
"objective/non_score_reward": -4.977817058563232,
"objective/rlhf_reward": -15.51126847267151,
"objective/scores": 1.1,
"policy/approxkl_avg": 4.679505825042725,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7911888360977173,
"step": 356,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9971606731414795
},
{
"episode": 5728,
"epoch": 0.1571295331102211,
"loss/policy_avg": -0.15620523691177368,
"lr": 2.0605263157894737e-05,
"objective/entropy": -52.470314025878906,
"objective/kl": 50.46641540527344,
"objective/non_score_reward": -5.0466413497924805,
"objective/rlhf_reward": -15.786566829681398,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.52745521068573,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.7481960654258728,
"step": 357,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9998908042907715
},
{
"episode": 5744,
"epoch": 0.15756844242058485,
"loss/policy_avg": 0.10630068182945251,
"lr": 2.0578947368421055e-05,
"objective/entropy": -234.93450927734375,
"objective/kl": 25.975557327270508,
"objective/non_score_reward": -2.5975558757781982,
"objective/rlhf_reward": -5.990223503112793,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.9941245317459106,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.7744235396385193,
"step": 358,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.9997868537902832
},
{
"episode": 5760,
"epoch": 0.1580073517309486,
"loss/policy_avg": 1.8786392211914062,
"lr": 2.055263157894737e-05,
"objective/entropy": -9.861099243164062,
"objective/kl": 97.31732940673828,
"objective/non_score_reward": -9.731733322143555,
"objective/rlhf_reward": -38.52693424224854,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.8872553110122681,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.6962687969207764,
"step": 359,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9990880489349365
},
{
"episode": 5776,
"epoch": 0.15844626104131235,
"loss/policy_avg": 0.09301671385765076,
"lr": 2.0526315789473685e-05,
"objective/entropy": 40.30408477783203,
"objective/kl": 54.91043472290039,
"objective/non_score_reward": -5.4910430908203125,
"objective/rlhf_reward": -17.564173793792726,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.559497356414795,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8001800179481506,
"step": 360,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.998398780822754
},
{
"episode": 5792,
"epoch": 0.1588851703516761,
"loss/policy_avg": -0.1022261455655098,
"lr": 2.05e-05,
"objective/entropy": -116.18830108642578,
"objective/kl": 57.52555465698242,
"objective/non_score_reward": -5.752555847167969,
"objective/rlhf_reward": -18.61022243499756,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.282173216342926,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.720761239528656,
"step": 361,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.001110076904297
},
{
"episode": 5808,
"epoch": 0.15932407966203982,
"loss/policy_avg": -1.0800527334213257,
"lr": 2.0473684210526315e-05,
"objective/entropy": -125.45774841308594,
"objective/kl": 58.299625396728516,
"objective/non_score_reward": -5.829962253570557,
"objective/rlhf_reward": -22.919849491119386,
"objective/scores": 0.1,
"policy/approxkl_avg": 22.697772979736328,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6915639638900757,
"step": 362,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0005617141723633
},
{
"episode": 5824,
"epoch": 0.15976298897240357,
"loss/policy_avg": 0.93153977394104,
"lr": 2.0447368421052634e-05,
"objective/entropy": -43.68049621582031,
"objective/kl": 73.64720153808594,
"objective/non_score_reward": -7.364720821380615,
"objective/rlhf_reward": -25.058883285522462,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.5285642147064209,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8717088103294373,
"step": 363,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.001168727874756
},
{
"episode": 5840,
"epoch": 0.16020189828276732,
"loss/policy_avg": -0.5737454295158386,
"lr": 2.042105263157895e-05,
"objective/entropy": -144.2489013671875,
"objective/kl": 56.47989273071289,
"objective/non_score_reward": -5.647989273071289,
"objective/rlhf_reward": -18.191957569122316,
"objective/scores": 1.1,
"policy/approxkl_avg": 48.41154479980469,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6994253993034363,
"step": 364,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.0013179779052734
},
{
"episode": 5856,
"epoch": 0.16064080759313107,
"loss/policy_avg": -0.5543741583824158,
"lr": 2.0394736842105264e-05,
"objective/entropy": -153.12057495117188,
"objective/kl": 63.56580352783203,
"objective/non_score_reward": -6.356581211090088,
"objective/rlhf_reward": -25.026324844360353,
"objective/scores": 0.1,
"policy/approxkl_avg": 32.669097900390625,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.7884219288825989,
"step": 365,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9991981983184814
},
{
"episode": 5872,
"epoch": 0.16107971690349482,
"loss/policy_avg": 0.06399692595005035,
"lr": 2.036842105263158e-05,
"objective/entropy": -10.777188301086426,
"objective/kl": 75.06257629394531,
"objective/non_score_reward": -7.506258487701416,
"objective/rlhf_reward": -29.625033473968507,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.7900074124336243,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.8327258229255676,
"step": 366,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9993977546691895
},
{
"episode": 5888,
"epoch": 0.16151862621385857,
"loss/policy_avg": 0.1322728395462036,
"lr": 2.0342105263157897e-05,
"objective/entropy": -25.25229835510254,
"objective/kl": 55.4914665222168,
"objective/non_score_reward": -5.54914665222168,
"objective/rlhf_reward": -17.79658708572388,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.7297406196594238,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.7416528463363647,
"step": 367,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9996728897094727
},
{
"episode": 5904,
"epoch": 0.16195753552422232,
"loss/policy_avg": 0.13267464935779572,
"lr": 2.0315789473684212e-05,
"objective/entropy": -74.82369995117188,
"objective/kl": 50.54753494262695,
"objective/non_score_reward": -5.054753303527832,
"objective/rlhf_reward": -19.819014644622804,
"objective/scores": 0.1,
"policy/approxkl_avg": 1.003377914428711,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.7704557776451111,
"step": 368,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.002319574356079
},
{
"episode": 5920,
"epoch": 0.16239644483458604,
"loss/policy_avg": 0.1616273820400238,
"lr": 2.0289473684210527e-05,
"objective/entropy": -311.86871337890625,
"objective/kl": 34.5989990234375,
"objective/non_score_reward": -3.459900140762329,
"objective/rlhf_reward": -9.439600563049318,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.149490624666214,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.7689305543899536,
"step": 369,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 8,
"val/ratio": 2.000070810317993
},
{
"episode": 5936,
"epoch": 0.1628353541449498,
"loss/policy_avg": -0.322374165058136,
"lr": 2.0263157894736842e-05,
"objective/entropy": -186.23793029785156,
"objective/kl": 45.43787384033203,
"objective/non_score_reward": -4.543787002563477,
"objective/rlhf_reward": -13.775148963928224,
"objective/scores": 1.1,
"policy/approxkl_avg": 36.16376495361328,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8869510889053345,
"step": 370,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9983677864074707
},
{
"episode": 5952,
"epoch": 0.16327426345531354,
"loss/policy_avg": 0.10794153809547424,
"lr": 2.0236842105263157e-05,
"objective/entropy": -311.2743225097656,
"objective/kl": 34.24767303466797,
"objective/non_score_reward": -3.4247677326202393,
"objective/rlhf_reward": -15.699070930480957,
"objective/scores": -0.5,
"policy/approxkl_avg": 0.39599934220314026,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.7125707268714905,
"step": 371,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0000691413879395
},
{
"episode": 5968,
"epoch": 0.1637131727656773,
"loss/policy_avg": -1.7316912412643433,
"lr": 2.0210526315789475e-05,
"objective/entropy": -150.6534423828125,
"objective/kl": 53.55550003051758,
"objective/non_score_reward": -5.3555498123168945,
"objective/rlhf_reward": -17.022200679779054,
"objective/scores": 1.1,
"policy/approxkl_avg": 51.17546081542969,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.7921953797340393,
"step": 372,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9999009370803833
},
{
"episode": 5984,
"epoch": 0.16415208207604104,
"loss/policy_avg": -0.33673471212387085,
"lr": 2.018421052631579e-05,
"objective/entropy": -80.92591094970703,
"objective/kl": 67.56401062011719,
"objective/non_score_reward": -6.756401062011719,
"objective/rlhf_reward": -22.625604724884035,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.118476152420044,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6927839517593384,
"step": 373,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9994900226593018
},
{
"episode": 6000,
"epoch": 0.1645909913864048,
"loss/policy_avg": -0.3108822703361511,
"lr": 2.0157894736842105e-05,
"objective/entropy": -145.98187255859375,
"objective/kl": 50.68169021606445,
"objective/non_score_reward": -5.068169116973877,
"objective/rlhf_reward": -15.872675991058351,
"objective/scores": 1.1,
"policy/approxkl_avg": 41.044952392578125,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.9443225860595703,
"step": 374,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.997762680053711
},
{
"episode": 6016,
"epoch": 0.16502990069676854,
"loss/policy_avg": 0.3729490339756012,
"lr": 2.013157894736842e-05,
"objective/entropy": -270.9046630859375,
"objective/kl": 54.419471740722656,
"objective/non_score_reward": -5.441946983337402,
"objective/rlhf_reward": -17.367788887023927,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.4963139295578003,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.8751029968261719,
"step": 375,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.9989949464797974
},
{
"episode": 6032,
"epoch": 0.1654688100071323,
"loss/policy_avg": 0.10866791009902954,
"lr": 2.0105263157894735e-05,
"objective/entropy": -292.86962890625,
"objective/kl": 46.78014373779297,
"objective/non_score_reward": -4.678014278411865,
"objective/rlhf_reward": -20.71205711364746,
"objective/scores": -0.5,
"policy/approxkl_avg": 2.655266046524048,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.8966959714889526,
"step": 376,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 8,
"val/ratio": 2.0000298023223877
},
{
"episode": 6048,
"epoch": 0.165907719317496,
"loss/policy_avg": -0.5902384519577026,
"lr": 2.0078947368421053e-05,
"objective/entropy": -193.88943481445312,
"objective/kl": 53.728702545166016,
"objective/non_score_reward": -5.372870445251465,
"objective/rlhf_reward": -17.09148178100586,
"objective/scores": 1.1,
"policy/approxkl_avg": 75.42146301269531,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.986204981803894,
"step": 377,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.0006065368652344
},
{
"episode": 6064,
"epoch": 0.16634662862785976,
"loss/policy_avg": -0.6812242865562439,
"lr": 2.0052631578947368e-05,
"objective/entropy": -107.44984436035156,
"objective/kl": 87.9330062866211,
"objective/non_score_reward": -8.79330062866211,
"objective/rlhf_reward": -34.773203468322755,
"objective/scores": 0.1,
"policy/approxkl_avg": 2.6081008911132812,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7108145952224731,
"step": 378,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0003364086151123
},
{
"episode": 6080,
"epoch": 0.1667855379382235,
"loss/policy_avg": 0.2358405739068985,
"lr": 2.0026315789473683e-05,
"objective/entropy": -33.92523956298828,
"objective/kl": 57.368141174316406,
"objective/non_score_reward": -5.736814022064209,
"objective/rlhf_reward": -18.547257041931154,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.742258608341217,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7838228940963745,
"step": 379,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0000722408294678
},
{
"episode": 6096,
"epoch": 0.16722444724858726,
"loss/policy_avg": 0.5731798410415649,
"lr": 1.9999999999999998e-05,
"objective/entropy": -114.15296936035156,
"objective/kl": 51.251705169677734,
"objective/non_score_reward": -5.125170707702637,
"objective/rlhf_reward": -16.10068235397339,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.509912371635437,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.7274678945541382,
"step": 380,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.0001626014709473
},
{
"episode": 6112,
"epoch": 0.167663356558951,
"loss/policy_avg": 0.10960416495800018,
"lr": 1.9973684210526317e-05,
"objective/entropy": -75.13426208496094,
"objective/kl": 71.34457397460938,
"objective/non_score_reward": -7.134457588195801,
"objective/rlhf_reward": -28.137829875946046,
"objective/scores": 0.1,
"policy/approxkl_avg": 1.02106511592865,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8250123262405396,
"step": 381,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9984166622161865
},
{
"episode": 6128,
"epoch": 0.16810226586931476,
"loss/policy_avg": 1.2399119138717651,
"lr": 1.994736842105263e-05,
"objective/entropy": -34.498966217041016,
"objective/kl": 60.02134704589844,
"objective/non_score_reward": -6.002135276794434,
"objective/rlhf_reward": -21.08482113921759,
"objective/scores": 0.7309297535714575,
"policy/approxkl_avg": 1.7387114763259888,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8664690256118774,
"step": 382,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.001873731613159
},
{
"episode": 6144,
"epoch": 0.1685411751796785,
"loss/policy_avg": 0.9964497089385986,
"lr": 1.9921052631578947e-05,
"objective/entropy": -114.55979919433594,
"objective/kl": 48.91990661621094,
"objective/non_score_reward": -4.891991138458252,
"objective/rlhf_reward": -19.16796455383301,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.6877788305282593,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7417900562286377,
"step": 383,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9994585514068604
},
{
"episode": 6160,
"epoch": 0.16898008449004223,
"loss/policy_avg": -0.14047479629516602,
"lr": 1.989473684210526e-05,
"objective/entropy": -73.96463012695312,
"objective/kl": 70.61947631835938,
"objective/non_score_reward": -7.061947345733643,
"objective/rlhf_reward": -25.32407036864874,
"objective/scores": 0.7309297535714575,
"policy/approxkl_avg": 0.5930333733558655,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8347901105880737,
"step": 384,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0012753009796143
},
{
"episode": 6176,
"epoch": 0.16941899380040598,
"loss/policy_avg": -0.8877277374267578,
"lr": 1.9868421052631576e-05,
"objective/entropy": -26.141101837158203,
"objective/kl": 70.60355377197266,
"objective/non_score_reward": -7.0603556632995605,
"objective/rlhf_reward": -27.841422653198244,
"objective/scores": 0.1,
"policy/approxkl_avg": 2.1280431747436523,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8992083668708801,
"step": 385,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0029306411743164
},
{
"episode": 6192,
"epoch": 0.16985790311076973,
"loss/policy_avg": -0.07291531562805176,
"lr": 1.9842105263157895e-05,
"objective/entropy": -157.21595764160156,
"objective/kl": 46.20996856689453,
"objective/non_score_reward": -4.620996475219727,
"objective/rlhf_reward": -15.560267840267393,
"objective/scores": 0.7309297535714575,
"policy/approxkl_avg": 82.55020141601562,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8402323722839355,
"step": 386,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.0009355545043945
},
{
"episode": 6208,
"epoch": 0.17029681242113348,
"loss/policy_avg": -0.2972855269908905,
"lr": 1.9815789473684213e-05,
"objective/entropy": -103.44124603271484,
"objective/kl": 63.92302703857422,
"objective/non_score_reward": -6.392302989959717,
"objective/rlhf_reward": -21.16921195983887,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.8578600883483887,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.6705944538116455,
"step": 387,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.0004239082336426
},
{
"episode": 6224,
"epoch": 0.17073572173149723,
"loss/policy_avg": 0.29273730516433716,
"lr": 1.9789473684210528e-05,
"objective/entropy": -67.3672866821289,
"objective/kl": 58.002750396728516,
"objective/non_score_reward": -5.800274848937988,
"objective/rlhf_reward": -21.644841520991875,
"objective/scores": 0.38906482631788786,
"policy/approxkl_avg": 1.0682644844055176,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7647286057472229,
"step": 388,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9988304376602173
},
{
"episode": 6240,
"epoch": 0.17117463104186098,
"loss/policy_avg": -0.1294967234134674,
"lr": 1.9763157894736843e-05,
"objective/entropy": -200.57763671875,
"objective/kl": 68.5810317993164,
"objective/non_score_reward": -6.858103275299072,
"objective/rlhf_reward": -25.91664179542893,
"objective/scores": 0.37894294565112985,
"policy/approxkl_avg": 55.38726806640625,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7542761564254761,
"step": 389,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9999221563339233
},
{
"episode": 6256,
"epoch": 0.17161354035222473,
"loss/policy_avg": 0.58003830909729,
"lr": 1.9736842105263158e-05,
"objective/entropy": -17.135601043701172,
"objective/kl": 76.6408920288086,
"objective/non_score_reward": -7.664089679718018,
"objective/rlhf_reward": -30.256357288360597,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.9818647503852844,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7245423793792725,
"step": 390,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.002509117126465
},
{
"episode": 6272,
"epoch": 0.17205244966258848,
"loss/policy_avg": -0.8786243796348572,
"lr": 1.9710526315789476e-05,
"objective/entropy": -170.98220825195312,
"objective/kl": 59.838829040527344,
"objective/non_score_reward": -5.983882904052734,
"objective/rlhf_reward": -25.935531616210938,
"objective/scores": -0.5,
"policy/approxkl_avg": 76.37075805664062,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8124063014984131,
"step": 391,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9987341165542603
},
{
"episode": 6288,
"epoch": 0.1724913589729522,
"loss/policy_avg": 0.4738132357597351,
"lr": 1.968421052631579e-05,
"objective/entropy": -26.218795776367188,
"objective/kl": 57.189857482910156,
"objective/non_score_reward": -5.7189860343933105,
"objective/rlhf_reward": -18.475943660736085,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.8383350372314453,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.8988721966743469,
"step": 392,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.999671459197998
},
{
"episode": 6304,
"epoch": 0.17293026828331595,
"loss/policy_avg": 0.6148244738578796,
"lr": 1.9657894736842106e-05,
"objective/entropy": -279.04248046875,
"objective/kl": 27.996761322021484,
"objective/non_score_reward": -2.7996764183044434,
"objective/rlhf_reward": -13.198705673217773,
"objective/scores": -0.5,
"policy/approxkl_avg": 4.403238773345947,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.955359935760498,
"step": 393,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.998351812362671
},
{
"episode": 6320,
"epoch": 0.1733691775936797,
"loss/policy_avg": 0.5641376972198486,
"lr": 1.963157894736842e-05,
"objective/entropy": -278.6837463378906,
"objective/kl": 31.391254425048828,
"objective/non_score_reward": -3.139125347137451,
"objective/rlhf_reward": -8.156501865386964,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.7701074481010437,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.9388406276702881,
"step": 394,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 7,
"val/ratio": 1.9990991353988647
},
{
"episode": 6336,
"epoch": 0.17380808690404345,
"loss/policy_avg": -0.5742035508155823,
"lr": 1.960526315789474e-05,
"objective/entropy": -31.725059509277344,
"objective/kl": 70.22592163085938,
"objective/non_score_reward": -7.022592544555664,
"objective/rlhf_reward": -25.166651640773985,
"objective/scores": 0.7309297535714575,
"policy/approxkl_avg": 1.2569208145141602,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7669047117233276,
"step": 395,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.998677134513855
},
{
"episode": 6352,
"epoch": 0.1742469962144072,
"loss/policy_avg": -0.42700430750846863,
"lr": 1.9578947368421055e-05,
"objective/entropy": -337.1484680175781,
"objective/kl": 34.36861801147461,
"objective/non_score_reward": -3.436861991882324,
"objective/rlhf_reward": -9.34744749069214,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.8713659644126892,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.6901314854621887,
"step": 396,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 7,
"val/ratio": 2.0013864040374756
},
{
"episode": 6368,
"epoch": 0.17468590552477095,
"loss/policy_avg": 0.5263411402702332,
"lr": 1.955263157894737e-05,
"objective/entropy": -352.5277099609375,
"objective/kl": 33.26110076904297,
"objective/non_score_reward": -3.326110363006592,
"objective/rlhf_reward": -8.90444097518921,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.5582944750785828,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.7719149589538574,
"step": 397,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 7,
"val/ratio": 1.9988962411880493
},
{
"episode": 6384,
"epoch": 0.1751248148351347,
"loss/policy_avg": 0.5565706491470337,
"lr": 1.9526315789473685e-05,
"objective/entropy": -280.6494445800781,
"objective/kl": 28.447463989257812,
"objective/non_score_reward": -2.8447465896606445,
"objective/rlhf_reward": -6.9789863586425795,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.1234630346298218,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 1.0970081090927124,
"step": 398,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 7,
"val/ratio": 1.9993960857391357
},
{
"episode": 6400,
"epoch": 0.17556372414549842,
"loss/policy_avg": 0.9028674364089966,
"lr": 1.95e-05,
"objective/entropy": -307.14703369140625,
"objective/kl": 34.96384048461914,
"objective/non_score_reward": -3.4963841438293457,
"objective/rlhf_reward": -9.585536575317384,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.13779303431510925,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.849014163017273,
"step": 399,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 2.000701427459717
},
{
"episode": 6416,
"epoch": 0.17600263345586217,
"loss/policy_avg": 0.09404882788658142,
"lr": 1.9473684210526318e-05,
"objective/entropy": -319.993896484375,
"objective/kl": 26.75142478942871,
"objective/non_score_reward": -2.675142765045166,
"objective/rlhf_reward": -6.300570583343506,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.08198696374893188,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.8347125053405762,
"step": 400,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 8,
"val/ratio": 1.999713659286499
},
{
"episode": 6432,
"epoch": 0.17644154276622592,
"loss/policy_avg": -0.5681114196777344,
"lr": 1.9447368421052633e-05,
"objective/entropy": 5.593948841094971,
"objective/kl": 81.45667266845703,
"objective/non_score_reward": -8.145668029785156,
"objective/rlhf_reward": -28.182670211791994,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.9207358360290527,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.9207296371459961,
"step": 401,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9994817972183228
},
{
"episode": 6448,
"epoch": 0.17688045207658967,
"loss/policy_avg": -0.19854994118213654,
"lr": 1.9421052631578948e-05,
"objective/entropy": -295.1614074707031,
"objective/kl": 35.05091857910156,
"objective/non_score_reward": -3.505092144012451,
"objective/rlhf_reward": -13.620368099212648,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.7516396045684814,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.9505492448806763,
"step": 402,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 2.001508951187134
},
{
"episode": 6464,
"epoch": 0.17731936138695342,
"loss/policy_avg": -0.1034904420375824,
"lr": 1.9394736842105263e-05,
"objective/entropy": -260.1594543457031,
"objective/kl": 30.833967208862305,
"objective/non_score_reward": -3.0833964347839355,
"objective/rlhf_reward": -14.333585739135742,
"objective/scores": -0.5,
"policy/approxkl_avg": 0.2162019908428192,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.8382043838500977,
"step": 403,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 7,
"val/ratio": 2.000133752822876
},
{
"episode": 6480,
"epoch": 0.17775827069731717,
"loss/policy_avg": -0.9927380681037903,
"lr": 1.936842105263158e-05,
"objective/entropy": -140.57418823242188,
"objective/kl": 60.026126861572266,
"objective/non_score_reward": -6.002613067626953,
"objective/rlhf_reward": -23.610451316833498,
"objective/scores": 0.1,
"policy/approxkl_avg": 16.303983688354492,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 1.1447197198867798,
"step": 404,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.9993970394134521
},
{
"episode": 6496,
"epoch": 0.17819718000768092,
"loss/policy_avg": 3.3291196823120117,
"lr": 1.9342105263157896e-05,
"objective/entropy": -25.995943069458008,
"objective/kl": 84.37167358398438,
"objective/non_score_reward": -8.43716812133789,
"objective/rlhf_reward": -33.348669624328615,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.6764692664146423,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.9281838536262512,
"step": 405,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0005123615264893
},
{
"episode": 6512,
"epoch": 0.17863608931804467,
"loss/policy_avg": -0.6397604942321777,
"lr": 1.931578947368421e-05,
"objective/entropy": -198.416748046875,
"objective/kl": 60.98023223876953,
"objective/non_score_reward": -6.098023414611816,
"objective/rlhf_reward": -19.992093658447267,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.191061019897461,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8035732507705688,
"step": 406,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.0002763271331787
},
{
"episode": 6528,
"epoch": 0.1790749986284084,
"loss/policy_avg": -0.08131340146064758,
"lr": 1.9289473684210526e-05,
"objective/entropy": -295.3315124511719,
"objective/kl": 36.48168182373047,
"objective/non_score_reward": -3.6481685638427734,
"objective/rlhf_reward": -16.592674255371094,
"objective/scores": -0.5,
"policy/approxkl_avg": 1.0901284217834473,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.833161473274231,
"step": 407,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 2.001061201095581
},
{
"episode": 6544,
"epoch": 0.17951390793877214,
"loss/policy_avg": 0.4617496728897095,
"lr": 1.926315789473684e-05,
"objective/entropy": -66.05003356933594,
"objective/kl": 48.92518997192383,
"objective/non_score_reward": -4.892518997192383,
"objective/rlhf_reward": -15.170076227188112,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.639676809310913,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7556887865066528,
"step": 408,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.0015039443969727
},
{
"episode": 6560,
"epoch": 0.1799528172491359,
"loss/policy_avg": 0.08634293079376221,
"lr": 1.923684210526316e-05,
"objective/entropy": -70.57524108886719,
"objective/kl": 73.93417358398438,
"objective/non_score_reward": -7.393417835235596,
"objective/rlhf_reward": -25.173671340942384,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.5882269144058228,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8048997521400452,
"step": 409,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9990966320037842
},
{
"episode": 6576,
"epoch": 0.18039172655949964,
"loss/policy_avg": 0.033458832651376724,
"lr": 1.9210526315789474e-05,
"objective/entropy": -297.69122314453125,
"objective/kl": 40.81647491455078,
"objective/non_score_reward": -4.081647872924805,
"objective/rlhf_reward": -11.926590538024904,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.15943774580955505,
"policy/clipfrac_avg": 0.0,
"policy/entropy_avg": 0.9514541029930115,
"step": 410,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.9999594688415527
},
{
"episode": 6592,
"epoch": 0.1808306358698634,
"loss/policy_avg": 0.6446475982666016,
"lr": 1.918421052631579e-05,
"objective/entropy": -141.07623291015625,
"objective/kl": 61.068145751953125,
"objective/non_score_reward": -6.106815338134766,
"objective/rlhf_reward": -20.027261352539064,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.7178910970687866,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8605374693870544,
"step": 411,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 2.0003395080566406
},
{
"episode": 6608,
"epoch": 0.18126954518022714,
"loss/policy_avg": 0.2619036138057709,
"lr": 1.9157894736842104e-05,
"objective/entropy": -278.59210205078125,
"objective/kl": 27.3831787109375,
"objective/non_score_reward": -2.7383179664611816,
"objective/rlhf_reward": -6.553271865844727,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.32318031787872314,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.8319592475891113,
"step": 412,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.9992783069610596
},
{
"episode": 6624,
"epoch": 0.1817084544905909,
"loss/policy_avg": 0.2453235387802124,
"lr": 1.913157894736842e-05,
"objective/entropy": -309.0776672363281,
"objective/kl": 25.942588806152344,
"objective/non_score_reward": -2.5942587852478027,
"objective/rlhf_reward": -5.9770351409912115,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.0987516641616821,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 1.0278055667877197,
"step": 413,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 8,
"val/ratio": 1.9998819828033447
},
{
"episode": 6640,
"epoch": 0.18214736380095461,
"loss/policy_avg": -0.6481926441192627,
"lr": 1.9105263157894738e-05,
"objective/entropy": -180.14288330078125,
"objective/kl": 47.951805114746094,
"objective/non_score_reward": -4.795180797576904,
"objective/rlhf_reward": -14.780723190307619,
"objective/scores": 1.1,
"policy/approxkl_avg": 54.561119079589844,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 1.0198578834533691,
"step": 414,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9996154308319092
},
{
"episode": 6656,
"epoch": 0.18258627311131836,
"loss/policy_avg": 0.03629662096500397,
"lr": 1.9078947368421053e-05,
"objective/entropy": -9.615211486816406,
"objective/kl": 65.70553588867188,
"objective/non_score_reward": -6.570553779602051,
"objective/rlhf_reward": -25.882215595245363,
"objective/scores": 0.1,
"policy/approxkl_avg": 3.8190488815307617,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.9529430866241455,
"step": 415,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9984461069107056
},
{
"episode": 6672,
"epoch": 0.18302518242168211,
"loss/policy_avg": -0.17124596238136292,
"lr": 1.9052631578947368e-05,
"objective/entropy": -296.9742736816406,
"objective/kl": 31.674793243408203,
"objective/non_score_reward": -3.1674790382385254,
"objective/rlhf_reward": -8.269916391372682,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.6360380053520203,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.9433181881904602,
"step": 416,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.000105142593384
},
{
"episode": 6688,
"epoch": 0.18346409173204586,
"loss/policy_avg": -0.10602147877216339,
"lr": 1.9026315789473683e-05,
"objective/entropy": 26.670429229736328,
"objective/kl": 58.556495666503906,
"objective/non_score_reward": -5.855649948120117,
"objective/rlhf_reward": -23.02259979248047,
"objective/scores": 0.1,
"policy/approxkl_avg": 1.7600481510162354,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 1.0238035917282104,
"step": 417,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.000761032104492
},
{
"episode": 6704,
"epoch": 0.18390300104240961,
"loss/policy_avg": 0.16803205013275146,
"lr": 1.9e-05,
"objective/entropy": -16.96014976501465,
"objective/kl": 62.71726989746094,
"objective/non_score_reward": -6.271727085113525,
"objective/rlhf_reward": -20.686907863616945,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.8721787929534912,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.9119787216186523,
"step": 418,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0002262592315674
},
{
"episode": 6720,
"epoch": 0.18434191035277336,
"loss/policy_avg": 0.8746110200881958,
"lr": 1.8973684210526316e-05,
"objective/entropy": 6.184518814086914,
"objective/kl": 61.64329528808594,
"objective/non_score_reward": -6.164329528808594,
"objective/rlhf_reward": -24.25731716156006,
"objective/scores": 0.1,
"policy/approxkl_avg": 1.2244923114776611,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.9992606043815613,
"step": 419,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9991247653961182
},
{
"episode": 6736,
"epoch": 0.1847808196631371,
"loss/policy_avg": 0.014954586513340473,
"lr": 1.894736842105263e-05,
"objective/entropy": -267.61376953125,
"objective/kl": 27.085674285888672,
"objective/non_score_reward": -2.7085673809051514,
"objective/rlhf_reward": -6.434269523620606,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.14659488201141357,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.9907639026641846,
"step": 420,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 2.0013108253479004
},
{
"episode": 6752,
"epoch": 0.18521972897350086,
"loss/policy_avg": 0.8699166178703308,
"lr": 1.8921052631578946e-05,
"objective/entropy": -77.01543426513672,
"objective/kl": 48.32474899291992,
"objective/non_score_reward": -4.832475185394287,
"objective/rlhf_reward": -14.929901218414308,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.799189805984497,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7855837345123291,
"step": 421,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.999213457107544
},
{
"episode": 6768,
"epoch": 0.18565863828386459,
"loss/policy_avg": -0.27239370346069336,
"lr": 1.889473684210526e-05,
"objective/entropy": -10.540843963623047,
"objective/kl": 58.51908874511719,
"objective/non_score_reward": -5.851909637451172,
"objective/rlhf_reward": -19.007637119293214,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.3941487073898315,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8914047479629517,
"step": 422,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0000998973846436
},
{
"episode": 6784,
"epoch": 0.18609754759422834,
"loss/policy_avg": 0.5779774785041809,
"lr": 1.886842105263158e-05,
"objective/entropy": -37.48634338378906,
"objective/kl": 60.117984771728516,
"objective/non_score_reward": -6.011798858642578,
"objective/rlhf_reward": -19.64719400405884,
"objective/scores": 1.1,
"policy/approxkl_avg": 3.9815831184387207,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 1.0184441804885864,
"step": 423,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.998115062713623
},
{
"episode": 6800,
"epoch": 0.18653645690459209,
"loss/policy_avg": 0.49414119124412537,
"lr": 1.8842105263157894e-05,
"objective/entropy": 50.2642707824707,
"objective/kl": 78.58097076416016,
"objective/non_score_reward": -7.858097076416016,
"objective/rlhf_reward": -31.032388305664064,
"objective/scores": 0.1,
"policy/approxkl_avg": 141.35354614257812,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8828184604644775,
"step": 424,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9957654476165771
},
{
"episode": 6816,
"epoch": 0.18697536621495583,
"loss/policy_avg": 0.014625292271375656,
"lr": 1.881578947368421e-05,
"objective/entropy": -72.2008285522461,
"objective/kl": 67.50498962402344,
"objective/non_score_reward": -6.7504987716674805,
"objective/rlhf_reward": -22.601995086669923,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.5733897686004639,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8627150654792786,
"step": 425,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0004444122314453
},
{
"episode": 6832,
"epoch": 0.18741427552531958,
"loss/policy_avg": 0.3940482437610626,
"lr": 1.8789473684210524e-05,
"objective/entropy": -109.4649429321289,
"objective/kl": 57.591156005859375,
"objective/non_score_reward": -5.759115219116211,
"objective/rlhf_reward": -18.63646183013916,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.5976098775863647,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6642797589302063,
"step": 426,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9996377229690552
},
{
"episode": 6848,
"epoch": 0.18785318483568333,
"loss/policy_avg": 0.6501445174217224,
"lr": 1.8763157894736843e-05,
"objective/entropy": -93.68934631347656,
"objective/kl": 52.76403045654297,
"objective/non_score_reward": -5.276403427124023,
"objective/rlhf_reward": -16.70561275482178,
"objective/scores": 1.1,
"policy/approxkl_avg": 93.70457458496094,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8544291853904724,
"step": 427,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.000861406326294
},
{
"episode": 6864,
"epoch": 0.18829209414604708,
"loss/policy_avg": -0.7759616374969482,
"lr": 1.873684210526316e-05,
"objective/entropy": -127.28284454345703,
"objective/kl": 52.26542282104492,
"objective/non_score_reward": -5.2265424728393555,
"objective/rlhf_reward": -16.50617036819458,
"objective/scores": 1.1,
"policy/approxkl_avg": 44.17460250854492,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 1.0000516176223755,
"step": 428,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9975546598434448
},
{
"episode": 6880,
"epoch": 0.1887310034564108,
"loss/policy_avg": -0.11000016331672668,
"lr": 1.8710526315789476e-05,
"objective/entropy": -19.487102508544922,
"objective/kl": 55.983280181884766,
"objective/non_score_reward": -5.598328113555908,
"objective/rlhf_reward": -17.993312454223634,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.473930597305298,
"policy/clipfrac_avg": 1.75,
"policy/entropy_avg": 0.9288405179977417,
"step": 429,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.000844717025757
},
{
"episode": 6896,
"epoch": 0.18916991276677456,
"loss/policy_avg": -0.06470927596092224,
"lr": 1.868421052631579e-05,
"objective/entropy": -254.17868041992188,
"objective/kl": 34.887691497802734,
"objective/non_score_reward": -3.4887688159942627,
"objective/rlhf_reward": -13.555075263977052,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.26364630460739136,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 1.0304815769195557,
"step": 430,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.0000686645507812
},
{
"episode": 6912,
"epoch": 0.1896088220771383,
"loss/policy_avg": 0.21356704831123352,
"lr": 1.8657894736842106e-05,
"objective/entropy": -261.9208679199219,
"objective/kl": 31.915531158447266,
"objective/non_score_reward": -3.1915531158447266,
"objective/rlhf_reward": -8.36621198654175,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.789092540740967,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.8163321018218994,
"step": 431,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.9990161657333374
},
{
"episode": 6928,
"epoch": 0.19004773138750206,
"loss/policy_avg": -1.0231928825378418,
"lr": 1.8631578947368424e-05,
"objective/entropy": -168.65093994140625,
"objective/kl": 51.64683532714844,
"objective/non_score_reward": -5.1646833419799805,
"objective/rlhf_reward": -22.658733367919922,
"objective/scores": -0.5,
"policy/approxkl_avg": 32.607357025146484,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.8486956357955933,
"step": 432,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9985682964324951
},
{
"episode": 6944,
"epoch": 0.1904866406978658,
"loss/policy_avg": -0.6227490901947021,
"lr": 1.860526315789474e-05,
"objective/entropy": -65.97626495361328,
"objective/kl": 63.84180450439453,
"objective/non_score_reward": -6.384180545806885,
"objective/rlhf_reward": -25.1367226600647,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.5096147656440735,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6590231657028198,
"step": 433,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0007688999176025
},
{
"episode": 6960,
"epoch": 0.19092555000822956,
"loss/policy_avg": 0.4701375365257263,
"lr": 1.8578947368421054e-05,
"objective/entropy": -10.970794677734375,
"objective/kl": 62.22464370727539,
"objective/non_score_reward": -6.222464561462402,
"objective/rlhf_reward": -20.489859199523927,
"objective/scores": 1.1,
"policy/approxkl_avg": 173.43112182617188,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.9498796463012695,
"step": 434,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9947084188461304
},
{
"episode": 6976,
"epoch": 0.1913644593185933,
"loss/policy_avg": 0.099261075258255,
"lr": 1.855263157894737e-05,
"objective/entropy": -267.3575134277344,
"objective/kl": 27.131763458251953,
"objective/non_score_reward": -2.7131762504577637,
"objective/rlhf_reward": -12.852705001831055,
"objective/scores": -0.5,
"policy/approxkl_avg": 0.024509262293577194,
"policy/clipfrac_avg": 0.0,
"policy/entropy_avg": 0.6670312881469727,
"step": 435,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0010085105895996
},
{
"episode": 6992,
"epoch": 0.19180336862895705,
"loss/policy_avg": -0.4680057466030121,
"lr": 1.8526315789473684e-05,
"objective/entropy": -42.91291427612305,
"objective/kl": 66.02111053466797,
"objective/non_score_reward": -6.602110862731934,
"objective/rlhf_reward": -22.008442974090578,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.3675117492675781,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.6913959980010986,
"step": 436,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9998745918273926
},
{
"episode": 7008,
"epoch": 0.19224227793932078,
"loss/policy_avg": 2.381070137023926,
"lr": 1.8500000000000002e-05,
"objective/entropy": -19.4685115814209,
"objective/kl": 47.43255615234375,
"objective/non_score_reward": -4.743255615234375,
"objective/rlhf_reward": -14.573023653030397,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.7034322619438171,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.837364673614502,
"step": 437,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.003263473510742
},
{
"episode": 7024,
"epoch": 0.19268118724968453,
"loss/policy_avg": -0.018530648201704025,
"lr": 1.8473684210526317e-05,
"objective/entropy": -32.53235626220703,
"objective/kl": 59.6943244934082,
"objective/non_score_reward": -5.969432830810547,
"objective/rlhf_reward": -19.477729892730714,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.5509717464447021,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8037828207015991,
"step": 438,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0012922286987305
},
{
"episode": 7040,
"epoch": 0.19312009656004828,
"loss/policy_avg": -0.009570784866809845,
"lr": 1.8447368421052632e-05,
"objective/entropy": -50.37032699584961,
"objective/kl": 56.81880187988281,
"objective/non_score_reward": -5.681879997253418,
"objective/rlhf_reward": -18.327521419525148,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.6729666590690613,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8526865839958191,
"step": 439,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9991214275360107
},
{
"episode": 7056,
"epoch": 0.19355900587041203,
"loss/policy_avg": -0.930114209651947,
"lr": 1.8421052631578947e-05,
"objective/entropy": -178.06288146972656,
"objective/kl": 45.6494026184082,
"objective/non_score_reward": -4.564940452575684,
"objective/rlhf_reward": -13.859762287139894,
"objective/scores": 1.1,
"policy/approxkl_avg": 123.52664184570312,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8203810453414917,
"step": 440,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9983216524124146
},
{
"episode": 7072,
"epoch": 0.19399791518077578,
"loss/policy_avg": 1.49681556224823,
"lr": 1.8394736842105266e-05,
"objective/entropy": -21.60431671142578,
"objective/kl": 63.327247619628906,
"objective/non_score_reward": -6.332725524902344,
"objective/rlhf_reward": -20.930900669097902,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.75593900680542,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7101706266403198,
"step": 441,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.001399040222168
},
{
"episode": 7088,
"epoch": 0.19443682449113953,
"loss/policy_avg": 0.45096832513809204,
"lr": 1.836842105263158e-05,
"objective/entropy": -5.946907043457031,
"objective/kl": 75.59844970703125,
"objective/non_score_reward": -7.559844970703125,
"objective/rlhf_reward": -29.839378929138185,
"objective/scores": 0.1,
"policy/approxkl_avg": 1.7098190784454346,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.766690731048584,
"step": 442,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9985554218292236
},
{
"episode": 7104,
"epoch": 0.19487573380150328,
"loss/policy_avg": 0.263136088848114,
"lr": 1.8342105263157896e-05,
"objective/entropy": -17.822843551635742,
"objective/kl": 51.40583038330078,
"objective/non_score_reward": -5.140583515167236,
"objective/rlhf_reward": -16.16233334541321,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.2211589813232422,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.5617699027061462,
"step": 443,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 1.9987659454345703
},
{
"episode": 7120,
"epoch": 0.195314643111867,
"loss/policy_avg": -0.007531791925430298,
"lr": 1.831578947368421e-05,
"objective/entropy": -16.001230239868164,
"objective/kl": 80.43534851074219,
"objective/non_score_reward": -8.043535232543945,
"objective/rlhf_reward": -27.77414140701294,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.3839130401611328,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.8183437585830688,
"step": 444,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0007972717285156
},
{
"episode": 7136,
"epoch": 0.19575355242223075,
"loss/policy_avg": 1.0919990539550781,
"lr": 1.8289473684210526e-05,
"objective/entropy": -254.75714111328125,
"objective/kl": 31.01543426513672,
"objective/non_score_reward": -3.101543426513672,
"objective/rlhf_reward": -8.006174182891847,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.4650940895080566,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.6774981021881104,
"step": 445,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9985923767089844
},
{
"episode": 7152,
"epoch": 0.1961924617325945,
"loss/policy_avg": -0.019719071686267853,
"lr": 1.8263157894736844e-05,
"objective/entropy": -237.97215270996094,
"objective/kl": 38.12819290161133,
"objective/non_score_reward": -3.812819480895996,
"objective/rlhf_reward": -10.851277446746828,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.24818766117095947,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.8377676606178284,
"step": 446,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.999452829360962
},
{
"episode": 7168,
"epoch": 0.19663137104295825,
"loss/policy_avg": 0.27718091011047363,
"lr": 1.823684210526316e-05,
"objective/entropy": -8.873601913452148,
"objective/kl": 49.01349639892578,
"objective/non_score_reward": -4.9013495445251465,
"objective/rlhf_reward": -15.205398416519166,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.9532676935195923,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6659794449806213,
"step": 447,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9996001720428467
},
{
"episode": 7184,
"epoch": 0.197070280353322,
"loss/policy_avg": -0.06615559011697769,
"lr": 1.8210526315789474e-05,
"objective/entropy": -44.73011016845703,
"objective/kl": 54.297813415527344,
"objective/non_score_reward": -5.429781913757324,
"objective/rlhf_reward": -21.31912717819214,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.26587027311325073,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.712435245513916,
"step": 448,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0008835792541504
},
{
"episode": 7200,
"epoch": 0.19750918966368575,
"loss/policy_avg": 0.9200335741043091,
"lr": 1.818421052631579e-05,
"objective/entropy": -270.44720458984375,
"objective/kl": 34.97774124145508,
"objective/non_score_reward": -3.497774362564087,
"objective/rlhf_reward": -13.591097450256349,
"objective/scores": 0.1,
"policy/approxkl_avg": 2.7731072902679443,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.7718012928962708,
"step": 449,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 2.001753807067871
},
{
"episode": 7216,
"epoch": 0.1979480989740495,
"loss/policy_avg": -0.2271648347377777,
"lr": 1.8157894736842107e-05,
"objective/entropy": 17.676624298095703,
"objective/kl": 70.4654769897461,
"objective/non_score_reward": -7.046547889709473,
"objective/rlhf_reward": -25.786191082000734,
"objective/scores": 0.6,
"policy/approxkl_avg": 2.0250697135925293,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.7303136587142944,
"step": 450,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9997398853302002
},
{
"episode": 7232,
"epoch": 0.19838700828441325,
"loss/policy_avg": 1.256649136543274,
"lr": 1.8131578947368422e-05,
"objective/entropy": -0.7466411590576172,
"objective/kl": 61.498382568359375,
"objective/non_score_reward": -6.149837970733643,
"objective/rlhf_reward": -20.19935188293457,
"objective/scores": 1.1,
"policy/approxkl_avg": 70.4373550415039,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.712023138999939,
"step": 451,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.994720697402954
},
{
"episode": 7248,
"epoch": 0.19882591759477697,
"loss/policy_avg": -0.515707790851593,
"lr": 1.8105263157894737e-05,
"objective/entropy": -30.733827590942383,
"objective/kl": 75.01179504394531,
"objective/non_score_reward": -7.5011796951293945,
"objective/rlhf_reward": -25.60471830368042,
"objective/scores": 1.1,
"policy/approxkl_avg": 8.555524826049805,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6430870294570923,
"step": 452,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.000035047531128
},
{
"episode": 7264,
"epoch": 0.19926482690514072,
"loss/policy_avg": 0.050750162452459335,
"lr": 1.8078947368421052e-05,
"objective/entropy": -46.318687438964844,
"objective/kl": 74.29559326171875,
"objective/non_score_reward": -7.429559230804443,
"objective/rlhf_reward": -25.318237400054933,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.9510049819946289,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6830945014953613,
"step": 453,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9981689453125
},
{
"episode": 7280,
"epoch": 0.19970373621550447,
"loss/policy_avg": 0.30346107482910156,
"lr": 1.8052631578947367e-05,
"objective/entropy": -264.3983459472656,
"objective/kl": 43.06207275390625,
"objective/non_score_reward": -4.306207180023193,
"objective/rlhf_reward": -16.824828243255617,
"objective/scores": 0.1,
"policy/approxkl_avg": 1.0833698511123657,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.7494672536849976,
"step": 454,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.9986538887023926
},
{
"episode": 7296,
"epoch": 0.20014264552586822,
"loss/policy_avg": 1.5475409030914307,
"lr": 1.8026315789473685e-05,
"objective/entropy": 29.105066299438477,
"objective/kl": 46.912784576416016,
"objective/non_score_reward": -4.691278457641602,
"objective/rlhf_reward": -20.765113830566406,
"objective/scores": -0.5,
"policy/approxkl_avg": 94.84734344482422,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.5901237726211548,
"step": 455,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 1.9971956014633179
},
{
"episode": 7312,
"epoch": 0.20058155483623197,
"loss/policy_avg": 0.40649881958961487,
"lr": 1.8e-05,
"objective/entropy": -20.784526824951172,
"objective/kl": 64.50088500976562,
"objective/non_score_reward": -6.4500885009765625,
"objective/rlhf_reward": -21.40035400390625,
"objective/scores": 1.1,
"policy/approxkl_avg": 5.087962627410889,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.680627703666687,
"step": 456,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9978127479553223
},
{
"episode": 7328,
"epoch": 0.20102046414659572,
"loss/policy_avg": 0.028734341263771057,
"lr": 1.7973684210526315e-05,
"objective/entropy": -282.66912841796875,
"objective/kl": 29.973798751831055,
"objective/non_score_reward": -2.997379779815674,
"objective/rlhf_reward": -9.065800104976866,
"objective/scores": 0.7309297535714575,
"policy/approxkl_avg": 0.2122015655040741,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.6922552585601807,
"step": 457,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 2.0007247924804688
},
{
"episode": 7344,
"epoch": 0.20145937345695947,
"loss/policy_avg": -0.22284090518951416,
"lr": 1.794736842105263e-05,
"objective/entropy": -73.5594711303711,
"objective/kl": 46.40571975708008,
"objective/non_score_reward": -4.6405720710754395,
"objective/rlhf_reward": -14.162287569046022,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.1293284893035889,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8148968815803528,
"step": 458,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0002856254577637
},
{
"episode": 7360,
"epoch": 0.2018982827673232,
"loss/policy_avg": -0.16610336303710938,
"lr": 1.7921052631578945e-05,
"objective/entropy": -63.282432556152344,
"objective/kl": 54.72997283935547,
"objective/non_score_reward": -5.472997665405273,
"objective/rlhf_reward": -17.491990184783937,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.9718881845474243,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6093668341636658,
"step": 459,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9995301961898804
},
{
"episode": 7376,
"epoch": 0.20233719207768694,
"loss/policy_avg": -0.5106132626533508,
"lr": 1.7894736842105264e-05,
"objective/entropy": -32.18394088745117,
"objective/kl": 60.436309814453125,
"objective/non_score_reward": -6.043630599975586,
"objective/rlhf_reward": -19.774522638320924,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.607424736022949,
"policy/clipfrac_avg": 1.75,
"policy/entropy_avg": 0.6205296516418457,
"step": 460,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9996771812438965
},
{
"episode": 7392,
"epoch": 0.2027761013880507,
"loss/policy_avg": -0.11848235130310059,
"lr": 1.786842105263158e-05,
"objective/entropy": -250.755859375,
"objective/kl": 24.229875564575195,
"objective/non_score_reward": -2.422987461090088,
"objective/rlhf_reward": -11.691949844360352,
"objective/scores": -0.5,
"policy/approxkl_avg": 0.8377648591995239,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.8346691131591797,
"step": 461,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.99947988986969
},
{
"episode": 7408,
"epoch": 0.20321501069841444,
"loss/policy_avg": 0.08530762791633606,
"lr": 1.7842105263157894e-05,
"objective/entropy": 6.682670593261719,
"objective/kl": 51.59654998779297,
"objective/non_score_reward": -5.159655570983887,
"objective/rlhf_reward": -16.238621330261232,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.4281816482543945,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.535011351108551,
"step": 462,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.998945951461792
},
{
"episode": 7424,
"epoch": 0.2036539200087782,
"loss/policy_avg": 1.1254315376281738,
"lr": 1.781578947368421e-05,
"objective/entropy": -52.357872009277344,
"objective/kl": 50.13798522949219,
"objective/non_score_reward": -5.013798713684082,
"objective/rlhf_reward": -15.655193901062013,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.8155397772789001,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6176036596298218,
"step": 463,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0029804706573486
},
{
"episode": 7440,
"epoch": 0.20409282931914194,
"loss/policy_avg": 1.055740237236023,
"lr": 1.7789473684210527e-05,
"objective/entropy": -6.067316055297852,
"objective/kl": 59.782745361328125,
"objective/non_score_reward": -5.978274345397949,
"objective/rlhf_reward": -23.513098812103273,
"objective/scores": 0.1,
"policy/approxkl_avg": 5.5082688331604,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.6876584887504578,
"step": 464,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.000401258468628
},
{
"episode": 7456,
"epoch": 0.2045317386295057,
"loss/policy_avg": 0.6007944345474243,
"lr": 1.7763157894736842e-05,
"objective/entropy": -19.558338165283203,
"objective/kl": 60.92905044555664,
"objective/non_score_reward": -6.092905044555664,
"objective/rlhf_reward": -22.248913469091924,
"objective/scores": 0.5306765580733931,
"policy/approxkl_avg": 1.0873606204986572,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.5196924209594727,
"step": 465,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9983136653900146
},
{
"episode": 7472,
"epoch": 0.20497064793986944,
"loss/policy_avg": -0.22906312346458435,
"lr": 1.7736842105263157e-05,
"objective/entropy": -40.17732238769531,
"objective/kl": 65.0643310546875,
"objective/non_score_reward": -6.50643253326416,
"objective/rlhf_reward": -21.62573108673096,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.7022728323936462,
"policy/clipfrac_avg": 1.75,
"policy/entropy_avg": 0.6174463033676147,
"step": 466,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0009613037109375
},
{
"episode": 7488,
"epoch": 0.20540955725023316,
"loss/policy_avg": -0.7405965328216553,
"lr": 1.7710526315789472e-05,
"objective/entropy": -157.75514221191406,
"objective/kl": 49.42311096191406,
"objective/non_score_reward": -4.9423112869262695,
"objective/rlhf_reward": -19.369244194030763,
"objective/scores": 0.1,
"policy/approxkl_avg": 65.77047729492188,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.5402215719223022,
"step": 467,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9999876022338867
},
{
"episode": 7504,
"epoch": 0.2058484665605969,
"loss/policy_avg": 0.04996461793780327,
"lr": 1.7684210526315787e-05,
"objective/entropy": -275.37030029296875,
"objective/kl": 33.15436553955078,
"objective/non_score_reward": -3.315436840057373,
"objective/rlhf_reward": -11.139041127935918,
"objective/scores": 0.5306765580733931,
"policy/approxkl_avg": 0.5296927094459534,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.6478952169418335,
"step": 468,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.999742865562439
},
{
"episode": 7520,
"epoch": 0.20628737587096066,
"loss/policy_avg": 0.7616822123527527,
"lr": 1.765789473684211e-05,
"objective/entropy": -32.41627502441406,
"objective/kl": 87.37818145751953,
"objective/non_score_reward": -8.737818717956543,
"objective/rlhf_reward": -34.55127391815186,
"objective/scores": 0.1,
"policy/approxkl_avg": 106.83147430419922,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6894011497497559,
"step": 469,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9993350505828857
},
{
"episode": 7536,
"epoch": 0.2067262851813244,
"loss/policy_avg": -0.7901378273963928,
"lr": 1.7631578947368424e-05,
"objective/entropy": -130.1688232421875,
"objective/kl": 46.754058837890625,
"objective/non_score_reward": -4.675406455993652,
"objective/rlhf_reward": -15.777905856014463,
"objective/scores": 0.7309297535714575,
"policy/approxkl_avg": 8.254480361938477,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.5577253103256226,
"step": 470,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9982564449310303
},
{
"episode": 7552,
"epoch": 0.20716519449168816,
"loss/policy_avg": 0.019479047507047653,
"lr": 1.760526315789474e-05,
"objective/entropy": -301.51361083984375,
"objective/kl": 30.93421173095703,
"objective/non_score_reward": -3.093421220779419,
"objective/rlhf_reward": -14.373684883117676,
"objective/scores": -0.5,
"policy/approxkl_avg": 1.615177869796753,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.5213165283203125,
"step": 471,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.9974719285964966
},
{
"episode": 7568,
"epoch": 0.2076041038020519,
"loss/policy_avg": 0.01017309445887804,
"lr": 1.7578947368421054e-05,
"objective/entropy": -243.85569763183594,
"objective/kl": 34.48377227783203,
"objective/non_score_reward": -3.4483771324157715,
"objective/rlhf_reward": -9.393509006500246,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.7559428215026855,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.8406962156295776,
"step": 472,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.999021291732788
},
{
"episode": 7584,
"epoch": 0.20804301311241566,
"loss/policy_avg": 1.2839194536209106,
"lr": 1.755263157894737e-05,
"objective/entropy": -78.8662109375,
"objective/kl": 60.683692932128906,
"objective/non_score_reward": -6.0683698654174805,
"objective/rlhf_reward": -23.873478507995607,
"objective/scores": 0.1,
"policy/approxkl_avg": 1.5784714221954346,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.543493926525116,
"step": 473,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.998354434967041
},
{
"episode": 7600,
"epoch": 0.20848192242277938,
"loss/policy_avg": 0.1725112497806549,
"lr": 1.7526315789473687e-05,
"objective/entropy": 12.784589767456055,
"objective/kl": 58.08510971069336,
"objective/non_score_reward": -5.808510780334473,
"objective/rlhf_reward": -18.83404407501221,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.7866100072860718,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.5890865921974182,
"step": 474,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9998912811279297
},
{
"episode": 7616,
"epoch": 0.20892083173314313,
"loss/policy_avg": -1.373124361038208,
"lr": 1.7500000000000002e-05,
"objective/entropy": -33.494956970214844,
"objective/kl": 69.98075103759766,
"objective/non_score_reward": -6.998075485229492,
"objective/rlhf_reward": -29.99230194091797,
"objective/scores": -0.5,
"policy/approxkl_avg": 54.548370361328125,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.64041668176651,
"step": 475,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 1.9978668689727783
},
{
"episode": 7632,
"epoch": 0.20935974104350688,
"loss/policy_avg": 0.07895299792289734,
"lr": 1.7473684210526317e-05,
"objective/entropy": -11.28443431854248,
"objective/kl": 53.7340087890625,
"objective/non_score_reward": -5.373400688171387,
"objective/rlhf_reward": -17.09360227584839,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.7228981256484985,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6447443962097168,
"step": 476,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.998573660850525
},
{
"episode": 7648,
"epoch": 0.20979865035387063,
"loss/policy_avg": -1.7310700416564941,
"lr": 1.7447368421052632e-05,
"objective/entropy": -142.7119903564453,
"objective/kl": 53.508262634277344,
"objective/non_score_reward": -5.350826740264893,
"objective/rlhf_reward": -23.40330696105957,
"objective/scores": -0.5,
"policy/approxkl_avg": 84.33183288574219,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.5232048034667969,
"step": 477,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.999740719795227
},
{
"episode": 7664,
"epoch": 0.21023755966423438,
"loss/policy_avg": 0.08662360906600952,
"lr": 1.742105263157895e-05,
"objective/entropy": -298.08477783203125,
"objective/kl": 29.51595687866211,
"objective/non_score_reward": -2.9515957832336426,
"objective/rlhf_reward": -11.406383132934572,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.8490124940872192,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.620236873626709,
"step": 478,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 2.0023481845855713
},
{
"episode": 7680,
"epoch": 0.21067646897459813,
"loss/policy_avg": 4.753633975982666,
"lr": 1.7394736842105265e-05,
"objective/entropy": 4.008540153503418,
"objective/kl": 50.286949157714844,
"objective/non_score_reward": -5.028695106506348,
"objective/rlhf_reward": -17.714780187606813,
"objective/scores": 0.6,
"policy/approxkl_avg": 2.038093090057373,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.6810034513473511,
"step": 479,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9995425939559937
},
{
"episode": 7696,
"epoch": 0.21111537828496188,
"loss/policy_avg": -0.013529837131500244,
"lr": 1.736842105263158e-05,
"objective/entropy": -62.08034133911133,
"objective/kl": 98.79060363769531,
"objective/non_score_reward": -9.879060745239258,
"objective/rlhf_reward": -35.116244888305665,
"objective/scores": 1.1,
"policy/approxkl_avg": 8.784490585327148,
"policy/clipfrac_avg": 1.75,
"policy/entropy_avg": 0.6048997640609741,
"step": 480,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0008130073547363
},
{
"episode": 7712,
"epoch": 0.21155428759532563,
"loss/policy_avg": -0.44553932547569275,
"lr": 1.7342105263157895e-05,
"objective/entropy": -287.85150146484375,
"objective/kl": 18.73577117919922,
"objective/non_score_reward": -1.8735771179199219,
"objective/rlhf_reward": -3.094308471679687,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.242295041680336,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.5571756362915039,
"step": 481,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 2.0010554790496826
},
{
"episode": 7728,
"epoch": 0.21199319690568935,
"loss/policy_avg": -0.17571601271629333,
"lr": 1.731578947368421e-05,
"objective/entropy": -26.461956024169922,
"objective/kl": 72.46046447753906,
"objective/non_score_reward": -7.24604606628418,
"objective/rlhf_reward": -24.584185218811037,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.8817163705825806,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6041799187660217,
"step": 482,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0004825592041016
},
{
"episode": 7744,
"epoch": 0.2124321062160531,
"loss/policy_avg": 0.004539132118225098,
"lr": 1.728947368421053e-05,
"objective/entropy": -290.8310546875,
"objective/kl": 24.83023452758789,
"objective/non_score_reward": -2.4830234050750732,
"objective/rlhf_reward": -9.532093620300294,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.04867856949567795,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.5443418622016907,
"step": 483,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.0004634857177734
},
{
"episode": 7760,
"epoch": 0.21287101552641685,
"loss/policy_avg": 0.8424244523048401,
"lr": 1.7263157894736843e-05,
"objective/entropy": -43.61247253417969,
"objective/kl": 67.6912841796875,
"objective/non_score_reward": -6.769128799438477,
"objective/rlhf_reward": -26.67651472091675,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.6011160016059875,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.5028409957885742,
"step": 484,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0024876594543457
},
{
"episode": 7776,
"epoch": 0.2133099248367806,
"loss/policy_avg": 0.05304345488548279,
"lr": 1.723684210526316e-05,
"objective/entropy": -58.04655456542969,
"objective/kl": 60.33159637451172,
"objective/non_score_reward": -6.033159255981445,
"objective/rlhf_reward": -19.7326379776001,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.44679179787635803,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.5934137105941772,
"step": 485,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9990103244781494
},
{
"episode": 7792,
"epoch": 0.21374883414714435,
"loss/policy_avg": -0.1699434518814087,
"lr": 1.7210526315789473e-05,
"objective/entropy": -118.589599609375,
"objective/kl": 57.44417953491211,
"objective/non_score_reward": -5.744418144226074,
"objective/rlhf_reward": -18.577671623229982,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.6742945313453674,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.5328694581985474,
"step": 486,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.00083065032959
},
{
"episode": 7808,
"epoch": 0.2141877434575081,
"loss/policy_avg": 0.04833655059337616,
"lr": 1.718421052631579e-05,
"objective/entropy": -38.469444274902344,
"objective/kl": 45.26972198486328,
"objective/non_score_reward": -4.52697229385376,
"objective/rlhf_reward": -17.707888698577882,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.6370303630828857,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.6407510638237,
"step": 487,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9995996952056885
},
{
"episode": 7824,
"epoch": 0.21462665276787185,
"loss/policy_avg": 0.1121654063463211,
"lr": 1.7157894736842107e-05,
"objective/entropy": -55.31531524658203,
"objective/kl": 48.76139831542969,
"objective/non_score_reward": -4.876140117645264,
"objective/rlhf_reward": -15.104560470581056,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.7145019769668579,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.5792022943496704,
"step": 488,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9987767934799194
},
{
"episode": 7840,
"epoch": 0.21506556207823557,
"loss/policy_avg": 0.07800394296646118,
"lr": 1.713157894736842e-05,
"objective/entropy": -19.052385330200195,
"objective/kl": 85.97078704833984,
"objective/non_score_reward": -8.597079277038574,
"objective/rlhf_reward": -29.988315677642824,
"objective/scores": 1.1,
"policy/approxkl_avg": 3.3856687545776367,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7141093015670776,
"step": 489,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.00225567817688
},
{
"episode": 7856,
"epoch": 0.21550447138859932,
"loss/policy_avg": 2.34645414352417,
"lr": 1.7105263157894737e-05,
"objective/entropy": 11.276354789733887,
"objective/kl": 74.60820007324219,
"objective/non_score_reward": -7.460819721221924,
"objective/rlhf_reward": -27.720573606268438,
"objective/scores": 0.5306765580733931,
"policy/approxkl_avg": 177.46253967285156,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6245430707931519,
"step": 490,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 1.9974071979522705
},
{
"episode": 7872,
"epoch": 0.21594338069896307,
"loss/policy_avg": 1.0574601888656616,
"lr": 1.707894736842105e-05,
"objective/entropy": -76.87249755859375,
"objective/kl": 66.18588256835938,
"objective/non_score_reward": -6.618587970733643,
"objective/rlhf_reward": -22.07435188293457,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.6070568561553955,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.5290701389312744,
"step": 491,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9992444515228271
},
{
"episode": 7888,
"epoch": 0.21638229000932682,
"loss/policy_avg": -0.13727782666683197,
"lr": 1.705263157894737e-05,
"objective/entropy": -46.14100646972656,
"objective/kl": 64.03702545166016,
"objective/non_score_reward": -6.403702735900879,
"objective/rlhf_reward": -25.214810943603517,
"objective/scores": 0.1,
"policy/approxkl_avg": 160.15899658203125,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.5764521360397339,
"step": 492,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.996391773223877
},
{
"episode": 7904,
"epoch": 0.21682119931969057,
"loss/policy_avg": -1.2387105226516724,
"lr": 1.7026315789473685e-05,
"objective/entropy": -138.5601043701172,
"objective/kl": 58.91579818725586,
"objective/non_score_reward": -5.891580104827881,
"objective/rlhf_reward": -25.566320419311523,
"objective/scores": -0.5,
"policy/approxkl_avg": 82.98390197753906,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7035410404205322,
"step": 493,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9981887340545654
},
{
"episode": 7920,
"epoch": 0.21726010863005432,
"loss/policy_avg": -0.45713070034980774,
"lr": 1.7e-05,
"objective/entropy": -44.855960845947266,
"objective/kl": 51.5128173828125,
"objective/non_score_reward": -5.151281833648682,
"objective/rlhf_reward": -20.205127334594728,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.6437796354293823,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6304531097412109,
"step": 494,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.00028657913208
},
{
"episode": 7936,
"epoch": 0.21769901794041807,
"loss/policy_avg": 0.4890076220035553,
"lr": 1.6973684210526315e-05,
"objective/entropy": -85.00923156738281,
"objective/kl": 56.18220520019531,
"objective/non_score_reward": -5.618220329284668,
"objective/rlhf_reward": -22.07288179397583,
"objective/scores": 0.1,
"policy/approxkl_avg": 1.5769182443618774,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6442561149597168,
"step": 495,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0013489723205566
},
{
"episode": 7952,
"epoch": 0.2181379272507818,
"loss/policy_avg": -0.9529117345809937,
"lr": 1.694736842105263e-05,
"objective/entropy": -68.37138366699219,
"objective/kl": 56.5475959777832,
"objective/non_score_reward": -5.654759407043457,
"objective/rlhf_reward": -18.219038820266725,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.1973304748535156,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.5495026707649231,
"step": 496,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.000663995742798
},
{
"episode": 7968,
"epoch": 0.21857683656114554,
"loss/policy_avg": 0.12810270488262177,
"lr": 1.6921052631578948e-05,
"objective/entropy": -281.0614929199219,
"objective/kl": 24.029754638671875,
"objective/non_score_reward": -2.402975559234619,
"objective/rlhf_reward": -5.211901998519897,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.1585916429758072,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.5986806154251099,
"step": 497,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.99949049949646
},
{
"episode": 7984,
"epoch": 0.2190157458715093,
"loss/policy_avg": -1.2483512163162231,
"lr": 1.6894736842105263e-05,
"objective/entropy": -125.2818374633789,
"objective/kl": 48.96977996826172,
"objective/non_score_reward": -4.89697790145874,
"objective/rlhf_reward": -15.1879123210907,
"objective/scores": 1.1,
"policy/approxkl_avg": 38.351585388183594,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.5701537132263184,
"step": 498,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9982032775878906
},
{
"episode": 8000,
"epoch": 0.21945465518187304,
"loss/policy_avg": 0.2582131624221802,
"lr": 1.6868421052631578e-05,
"objective/entropy": -43.63482666015625,
"objective/kl": 83.76309967041016,
"objective/non_score_reward": -8.376310348510742,
"objective/rlhf_reward": -33.10524425506592,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.616728663444519,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.6804277896881104,
"step": 499,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0020856857299805
},
{
"episode": 8016,
"epoch": 0.2198935644922368,
"loss/policy_avg": -0.2930386960506439,
"lr": 1.6842105263157893e-05,
"objective/entropy": -47.91590118408203,
"objective/kl": 48.19269561767578,
"objective/non_score_reward": -4.819270133972168,
"objective/rlhf_reward": -14.877080774307252,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.4650521278381348,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7425109148025513,
"step": 500,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.0012712478637695
},
{
"episode": 8032,
"epoch": 0.22033247380260054,
"loss/policy_avg": -1.611896276473999,
"lr": 1.681578947368421e-05,
"objective/entropy": -103.2376480102539,
"objective/kl": 59.12322998046875,
"objective/non_score_reward": -5.912322998046875,
"objective/rlhf_reward": -19.24929246902466,
"objective/scores": 1.1,
"policy/approxkl_avg": 150.0416259765625,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6628369092941284,
"step": 501,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9990419149398804
},
{
"episode": 8048,
"epoch": 0.2207713831129643,
"loss/policy_avg": 0.293116956949234,
"lr": 1.6789473684210526e-05,
"objective/entropy": -84.19717407226562,
"objective/kl": 43.60019302368164,
"objective/non_score_reward": -4.360019207000732,
"objective/rlhf_reward": -13.040077304840088,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.25065314769744873,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.5742840766906738,
"step": 502,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0006816387176514
},
{
"episode": 8064,
"epoch": 0.22121029242332804,
"loss/policy_avg": -0.2314561903476715,
"lr": 1.676315789473684e-05,
"objective/entropy": -20.76449966430664,
"objective/kl": 50.151161193847656,
"objective/non_score_reward": -5.015115737915039,
"objective/rlhf_reward": -15.660464620590211,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.5777597427368164,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6621568202972412,
"step": 503,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.001661777496338
},
{
"episode": 8080,
"epoch": 0.22164920173369176,
"loss/policy_avg": 0.26323428750038147,
"lr": 1.6736842105263156e-05,
"objective/entropy": -250.92178344726562,
"objective/kl": 26.257488250732422,
"objective/non_score_reward": -2.625748872756958,
"objective/rlhf_reward": -6.102995491027832,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.509668231010437,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.6861035823822021,
"step": 504,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.9993526935577393
},
{
"episode": 8096,
"epoch": 0.22208811104405551,
"loss/policy_avg": 0.7368256449699402,
"lr": 1.671052631578947e-05,
"objective/entropy": -49.587276458740234,
"objective/kl": 54.560203552246094,
"objective/non_score_reward": -5.456020355224609,
"objective/rlhf_reward": -21.424079990386964,
"objective/scores": 0.1,
"policy/approxkl_avg": 40.43186950683594,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.5872527956962585,
"step": 505,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.002885103225708
},
{
"episode": 8112,
"epoch": 0.22252702035441926,
"loss/policy_avg": -0.13276471197605133,
"lr": 1.668421052631579e-05,
"objective/entropy": -29.59518814086914,
"objective/kl": 59.4058837890625,
"objective/non_score_reward": -5.940588474273682,
"objective/rlhf_reward": -19.362353897094728,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.369570016860962,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.6469683647155762,
"step": 506,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0005695819854736
},
{
"episode": 8128,
"epoch": 0.222965929664783,
"loss/policy_avg": 0.15404559671878815,
"lr": 1.6657894736842105e-05,
"objective/entropy": -288.75341796875,
"objective/kl": 16.826412200927734,
"objective/non_score_reward": -1.6826412677764893,
"objective/rlhf_reward": -2.3305650711059567,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.13429397344589233,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.6506304740905762,
"step": 507,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 2.001171827316284
},
{
"episode": 8144,
"epoch": 0.22340483897514676,
"loss/policy_avg": 0.06800419837236404,
"lr": 1.663157894736842e-05,
"objective/entropy": -30.991519927978516,
"objective/kl": 53.74003601074219,
"objective/non_score_reward": -5.3740034103393555,
"objective/rlhf_reward": -17.09601459503174,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.6434616446495056,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.6338897347450256,
"step": 508,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9999275207519531
},
{
"episode": 8160,
"epoch": 0.2238437482855105,
"loss/policy_avg": 0.010863058269023895,
"lr": 1.6605263157894738e-05,
"objective/entropy": -300.00469970703125,
"objective/kl": 22.95432472229004,
"objective/non_score_reward": -2.2954325675964355,
"objective/rlhf_reward": -8.781729793548585,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.30962803959846497,
"policy/clipfrac_avg": 0.0,
"policy/entropy_avg": 0.7447618246078491,
"step": 509,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.999950647354126
},
{
"episode": 8176,
"epoch": 0.22428265759587426,
"loss/policy_avg": -0.005725979804992676,
"lr": 1.6578947368421053e-05,
"objective/entropy": -72.19743347167969,
"objective/kl": 67.5067138671875,
"objective/non_score_reward": -6.75067138671875,
"objective/rlhf_reward": -22.602685070037843,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.3745484352111816,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7517286539077759,
"step": 510,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9990617036819458
},
{
"episode": 8192,
"epoch": 0.22472156690623799,
"loss/policy_avg": -0.49819329380989075,
"lr": 1.655263157894737e-05,
"objective/entropy": -100.77090454101562,
"objective/kl": 52.09931945800781,
"objective/non_score_reward": -5.20993185043335,
"objective/rlhf_reward": -16.439727878570558,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.40050771832466125,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.6259418725967407,
"step": 511,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0012173652648926
},
{
"episode": 8208,
"epoch": 0.22516047621660173,
"loss/policy_avg": -0.2459656298160553,
"lr": 1.6526315789473686e-05,
"objective/entropy": -64.88304138183594,
"objective/kl": 54.07443618774414,
"objective/non_score_reward": -5.407444000244141,
"objective/rlhf_reward": -21.22977457046509,
"objective/scores": 0.1,
"policy/approxkl_avg": 1.1960783004760742,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.71941077709198,
"step": 512,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9999362230300903
},
{
"episode": 8224,
"epoch": 0.22559938552696548,
"loss/policy_avg": 0.015581846237182617,
"lr": 1.65e-05,
"objective/entropy": -86.62329864501953,
"objective/kl": 63.60594177246094,
"objective/non_score_reward": -6.360594749450684,
"objective/rlhf_reward": -21.042378520965578,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.669529914855957,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.5121915936470032,
"step": 513,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9988899230957031
},
{
"episode": 8240,
"epoch": 0.22603829483732923,
"loss/policy_avg": -1.0670742988586426,
"lr": 1.6473684210526316e-05,
"objective/entropy": -126.68997955322266,
"objective/kl": 48.829551696777344,
"objective/non_score_reward": -4.882955074310303,
"objective/rlhf_reward": -15.131820535659791,
"objective/scores": 1.1,
"policy/approxkl_avg": 43.48120880126953,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.5280458927154541,
"step": 514,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9984208345413208
},
{
"episode": 8256,
"epoch": 0.22647720414769298,
"loss/policy_avg": -1.7157282829284668,
"lr": 1.6447368421052635e-05,
"objective/entropy": -78.12884521484375,
"objective/kl": 58.25613784790039,
"objective/non_score_reward": -5.8256144523620605,
"objective/rlhf_reward": -18.90245876312256,
"objective/scores": 1.1,
"policy/approxkl_avg": 66.62440490722656,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.6942804455757141,
"step": 515,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0003674030303955
},
{
"episode": 8272,
"epoch": 0.22691611345805673,
"loss/policy_avg": 1.5511988401412964,
"lr": 1.642105263157895e-05,
"objective/entropy": -47.62477111816406,
"objective/kl": 81.21244812011719,
"objective/non_score_reward": -8.121244430541992,
"objective/rlhf_reward": -28.08497772216797,
"objective/scores": 1.1,
"policy/approxkl_avg": 58.765281677246094,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.632847785949707,
"step": 516,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9979219436645508
},
{
"episode": 8288,
"epoch": 0.22735502276842048,
"loss/policy_avg": -0.03130514919757843,
"lr": 1.6394736842105265e-05,
"objective/entropy": -84.12074279785156,
"objective/kl": 70.67733764648438,
"objective/non_score_reward": -7.067734241485596,
"objective/rlhf_reward": -27.870936012268068,
"objective/scores": 0.1,
"policy/approxkl_avg": 2.3221986293792725,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7155718803405762,
"step": 517,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.999363899230957
},
{
"episode": 8304,
"epoch": 0.22779393207878423,
"loss/policy_avg": 0.8689873218536377,
"lr": 1.636842105263158e-05,
"objective/entropy": -128.02854919433594,
"objective/kl": 57.08744812011719,
"objective/non_score_reward": -5.708744525909424,
"objective/rlhf_reward": -19.911259089351866,
"objective/scores": 0.7309297535714575,
"policy/approxkl_avg": 1.4211647510528564,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.6031744480133057,
"step": 518,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9977145195007324
},
{
"episode": 8320,
"epoch": 0.22823284138914796,
"loss/policy_avg": -0.5332244634628296,
"lr": 1.6342105263157894e-05,
"objective/entropy": -185.8155059814453,
"objective/kl": 34.196075439453125,
"objective/non_score_reward": -3.419607639312744,
"objective/rlhf_reward": -15.678430557250977,
"objective/scores": -0.5,
"policy/approxkl_avg": 91.76868438720703,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.666742205619812,
"step": 519,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9985761642456055
},
{
"episode": 8336,
"epoch": 0.2286717506995117,
"loss/policy_avg": -0.2599008083343506,
"lr": 1.6315789473684213e-05,
"objective/entropy": -39.81787109375,
"objective/kl": 55.39878845214844,
"objective/non_score_reward": -5.539878845214844,
"objective/rlhf_reward": -21.759516334533693,
"objective/scores": 0.1,
"policy/approxkl_avg": 1.2191312313079834,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.7273707389831543,
"step": 520,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.000648021697998
},
{
"episode": 8352,
"epoch": 0.22911066000987546,
"loss/policy_avg": 1.1705384254455566,
"lr": 1.6289473684210528e-05,
"objective/entropy": 14.593412399291992,
"objective/kl": 67.65731811523438,
"objective/non_score_reward": -6.765731334686279,
"objective/rlhf_reward": -29.062925338745117,
"objective/scores": -0.5,
"policy/approxkl_avg": 108.43498229980469,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.689389705657959,
"step": 521,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 1.9988114833831787
},
{
"episode": 8368,
"epoch": 0.2295495693202392,
"loss/policy_avg": -0.0511559396982193,
"lr": 1.6263157894736843e-05,
"objective/entropy": -110.55036926269531,
"objective/kl": 51.608421325683594,
"objective/non_score_reward": -5.160842418670654,
"objective/rlhf_reward": -16.243369913101198,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.193710446357727,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6105633974075317,
"step": 522,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.000009536743164
},
{
"episode": 8384,
"epoch": 0.22998847863060295,
"loss/policy_avg": -1.15142822265625,
"lr": 1.6236842105263158e-05,
"objective/entropy": -194.12693786621094,
"objective/kl": 60.674110412597656,
"objective/non_score_reward": -6.067410469055176,
"objective/rlhf_reward": -23.86964330673218,
"objective/scores": 0.1,
"policy/approxkl_avg": 37.457462310791016,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7633137106895447,
"step": 523,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.9969725608825684
},
{
"episode": 8400,
"epoch": 0.2304273879409667,
"loss/policy_avg": -0.3951420783996582,
"lr": 1.6210526315789476e-05,
"objective/entropy": -70.58984375,
"objective/kl": 50.954105377197266,
"objective/non_score_reward": -5.0954108238220215,
"objective/rlhf_reward": -15.98164281845093,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.5035173892974854,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6034559011459351,
"step": 524,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0016956329345703
},
{
"episode": 8416,
"epoch": 0.23086629725133045,
"loss/policy_avg": -0.012867942452430725,
"lr": 1.618421052631579e-05,
"objective/entropy": -262.4700012207031,
"objective/kl": 26.607215881347656,
"objective/non_score_reward": -2.660721778869629,
"objective/rlhf_reward": -6.242887115478516,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.36631155014038086,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.6922171115875244,
"step": 525,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.0013232231140137
},
{
"episode": 8432,
"epoch": 0.23130520656169418,
"loss/policy_avg": 0.1977730691432953,
"lr": 1.6157894736842106e-05,
"objective/entropy": -9.29283332824707,
"objective/kl": 75.08651733398438,
"objective/non_score_reward": -7.5086517333984375,
"objective/rlhf_reward": -29.634605979919435,
"objective/scores": 0.1,
"policy/approxkl_avg": 2.5576517581939697,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.8048279285430908,
"step": 526,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9986631870269775
},
{
"episode": 8448,
"epoch": 0.23174411587205793,
"loss/policy_avg": 0.09318825602531433,
"lr": 1.613157894736842e-05,
"objective/entropy": -24.686296463012695,
"objective/kl": 62.26502990722656,
"objective/non_score_reward": -6.226503372192383,
"objective/rlhf_reward": -20.506013011932374,
"objective/scores": 1.1,
"policy/approxkl_avg": 107.91770935058594,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.58833909034729,
"step": 527,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0000178813934326
},
{
"episode": 8464,
"epoch": 0.23218302518242168,
"loss/policy_avg": 0.030599527060985565,
"lr": 1.6105263157894736e-05,
"objective/entropy": -306.718017578125,
"objective/kl": 33.813175201416016,
"objective/non_score_reward": -3.381317615509033,
"objective/rlhf_reward": -15.525270462036133,
"objective/scores": -0.5,
"policy/approxkl_avg": 1.0647120475769043,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.7297744154930115,
"step": 528,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.998337745666504
},
{
"episode": 8480,
"epoch": 0.23262193449278543,
"loss/policy_avg": 0.49856454133987427,
"lr": 1.6078947368421054e-05,
"objective/entropy": -59.88057327270508,
"objective/kl": 62.05354309082031,
"objective/non_score_reward": -6.205354690551758,
"objective/rlhf_reward": -20.421417808532716,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.5137290358543396,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7658047676086426,
"step": 529,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9996016025543213
},
{
"episode": 8496,
"epoch": 0.23306084380314918,
"loss/policy_avg": 1.4718656539916992,
"lr": 1.605263157894737e-05,
"objective/entropy": -23.841388702392578,
"objective/kl": 50.9389762878418,
"objective/non_score_reward": -5.093897819519043,
"objective/rlhf_reward": -15.975590801239015,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.899733304977417,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.6414656639099121,
"step": 530,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9971559047698975
},
{
"episode": 8512,
"epoch": 0.23349975311351293,
"loss/policy_avg": -0.14778290688991547,
"lr": 1.6026315789473684e-05,
"objective/entropy": -280.56488037109375,
"objective/kl": 39.72999572753906,
"objective/non_score_reward": -3.9729995727539062,
"objective/rlhf_reward": -11.491998291015626,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.9470717310905457,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6742441654205322,
"step": 531,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.9999909400939941
},
{
"episode": 8528,
"epoch": 0.23393866242387668,
"loss/policy_avg": -0.8402799367904663,
"lr": 1.6e-05,
"objective/entropy": -146.46067810058594,
"objective/kl": 55.393707275390625,
"objective/non_score_reward": -5.539371013641357,
"objective/rlhf_reward": -17.757483100891115,
"objective/scores": 1.1,
"policy/approxkl_avg": 65.72979736328125,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.6336046457290649,
"step": 532,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.0008506774902344
},
{
"episode": 8544,
"epoch": 0.23437757173424043,
"loss/policy_avg": 0.10066896677017212,
"lr": 1.5973684210526314e-05,
"objective/entropy": -54.2482795715332,
"objective/kl": 59.967933654785156,
"objective/non_score_reward": -5.996793270111084,
"objective/rlhf_reward": -19.587173080444337,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.9665546417236328,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6454309225082397,
"step": 533,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.002523899078369
},
{
"episode": 8560,
"epoch": 0.23481648104460415,
"loss/policy_avg": 0.96723473072052,
"lr": 1.5947368421052633e-05,
"objective/entropy": -75.4139633178711,
"objective/kl": 53.41941452026367,
"objective/non_score_reward": -5.3419413566589355,
"objective/rlhf_reward": -16.967765903472902,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.5414352416992188,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6739579439163208,
"step": 534,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9997044801712036
},
{
"episode": 8576,
"epoch": 0.2352553903549679,
"loss/policy_avg": 0.21909461915493011,
"lr": 1.5921052631578948e-05,
"objective/entropy": -55.3682861328125,
"objective/kl": 55.079891204833984,
"objective/non_score_reward": -5.507988929748535,
"objective/rlhf_reward": -17.631956911087038,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.1223952770233154,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6993370056152344,
"step": 535,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9986274242401123
},
{
"episode": 8592,
"epoch": 0.23569429966533165,
"loss/policy_avg": 0.16766348481178284,
"lr": 1.5894736842105263e-05,
"objective/entropy": -38.26525115966797,
"objective/kl": 53.806514739990234,
"objective/non_score_reward": -5.380651473999023,
"objective/rlhf_reward": -17.122606372833253,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.6757107973098755,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.6718076467514038,
"step": 536,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.001012086868286
},
{
"episode": 8608,
"epoch": 0.2361332089756954,
"loss/policy_avg": -0.13367152214050293,
"lr": 1.5868421052631578e-05,
"objective/entropy": -64.89817810058594,
"objective/kl": 70.96744537353516,
"objective/non_score_reward": -7.096744537353516,
"objective/rlhf_reward": -23.98697957992554,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.0274198055267334,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6000114679336548,
"step": 537,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9985002279281616
},
{
"episode": 8624,
"epoch": 0.23657211828605915,
"loss/policy_avg": 0.6538009643554688,
"lr": 1.5842105263157896e-05,
"objective/entropy": -54.44984436035156,
"objective/kl": 61.42521667480469,
"objective/non_score_reward": -6.142521858215332,
"objective/rlhf_reward": -20.170088386535646,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.1359775066375732,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.6809964179992676,
"step": 538,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.000568389892578
},
{
"episode": 8640,
"epoch": 0.2370110275964229,
"loss/policy_avg": -0.14812606573104858,
"lr": 1.581578947368421e-05,
"objective/entropy": -267.172607421875,
"objective/kl": 30.55646324157715,
"objective/non_score_reward": -3.0556464195251465,
"objective/rlhf_reward": -14.222585678100586,
"objective/scores": -0.5,
"policy/approxkl_avg": 1.3976173400878906,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.7651552557945251,
"step": 539,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.000613212585449
},
{
"episode": 8656,
"epoch": 0.23744993690678665,
"loss/policy_avg": -0.5527013540267944,
"lr": 1.5789473684210526e-05,
"objective/entropy": -38.92097091674805,
"objective/kl": 90.69183349609375,
"objective/non_score_reward": -9.069183349609375,
"objective/rlhf_reward": -35.87673292160034,
"objective/scores": 0.1,
"policy/approxkl_avg": 3.2288694381713867,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.7611846923828125,
"step": 540,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0006866455078125
},
{
"episode": 8672,
"epoch": 0.23788884621715037,
"loss/policy_avg": 0.3605126142501831,
"lr": 1.576315789473684e-05,
"objective/entropy": 8.301453590393066,
"objective/kl": 55.14949417114258,
"objective/non_score_reward": -5.514949798583984,
"objective/rlhf_reward": -21.65979919433594,
"objective/scores": 0.1,
"policy/approxkl_avg": 1.4897167682647705,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.6536012887954712,
"step": 541,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9994428157806396
},
{
"episode": 8688,
"epoch": 0.23832775552751412,
"loss/policy_avg": -0.628774106502533,
"lr": 1.5736842105263156e-05,
"objective/entropy": -87.08699035644531,
"objective/kl": 42.880611419677734,
"objective/non_score_reward": -4.288061141967773,
"objective/rlhf_reward": -15.54812458521517,
"objective/scores": 0.40102999566398123,
"policy/approxkl_avg": 122.45773315429688,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.741459846496582,
"step": 542,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9991590976715088
},
{
"episode": 8704,
"epoch": 0.23876666483787787,
"loss/policy_avg": 0.4420792758464813,
"lr": 1.5710526315789474e-05,
"objective/entropy": -31.224285125732422,
"objective/kl": 41.453887939453125,
"objective/non_score_reward": -4.145388603210449,
"objective/rlhf_reward": -12.181554651260377,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.1506004333496094,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7803224325180054,
"step": 543,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9997355937957764
},
{
"episode": 8720,
"epoch": 0.23920557414824162,
"loss/policy_avg": 0.09505406767129898,
"lr": 1.568421052631579e-05,
"objective/entropy": -271.14013671875,
"objective/kl": 28.504985809326172,
"objective/non_score_reward": -2.850498676300049,
"objective/rlhf_reward": -7.001994705200196,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.29007285833358765,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.856664776802063,
"step": 544,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.9991320371627808
},
{
"episode": 8736,
"epoch": 0.23964448345860537,
"loss/policy_avg": 0.429214209318161,
"lr": 1.5657894736842104e-05,
"objective/entropy": -288.30877685546875,
"objective/kl": 22.891695022583008,
"objective/non_score_reward": -2.2891695499420166,
"objective/rlhf_reward": -7.331849451335977,
"objective/scores": 0.4562071871080222,
"policy/approxkl_avg": 0.47718560695648193,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.6876716613769531,
"step": 545,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.9994748830795288
},
{
"episode": 8752,
"epoch": 0.24008339276896912,
"loss/policy_avg": -0.6105255484580994,
"lr": 1.563157894736842e-05,
"objective/entropy": -192.4208984375,
"objective/kl": 39.915897369384766,
"objective/non_score_reward": -3.9915900230407715,
"objective/rlhf_reward": -11.566360092163087,
"objective/scores": 1.1,
"policy/approxkl_avg": 62.829139709472656,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.5915755033493042,
"step": 546,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9990711212158203
},
{
"episode": 8768,
"epoch": 0.24052230207933287,
"loss/policy_avg": -0.9540656208992004,
"lr": 1.5605263157894737e-05,
"objective/entropy": -123.78657531738281,
"objective/kl": 41.60905075073242,
"objective/non_score_reward": -4.160904884338379,
"objective/rlhf_reward": -16.243620014190675,
"objective/scores": 0.1,
"policy/approxkl_avg": 84.0649185180664,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.5818493366241455,
"step": 547,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0009255409240723
},
{
"episode": 8784,
"epoch": 0.24096121138969662,
"loss/policy_avg": -0.5903968811035156,
"lr": 1.5578947368421052e-05,
"objective/entropy": -53.922210693359375,
"objective/kl": 70.95716857910156,
"objective/non_score_reward": -7.095716953277588,
"objective/rlhf_reward": -27.982867813110353,
"objective/scores": 0.1,
"policy/approxkl_avg": 1.5617876052856445,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.6649601459503174,
"step": 548,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.999706745147705
},
{
"episode": 8800,
"epoch": 0.24140012070006034,
"loss/policy_avg": 0.12424759566783905,
"lr": 1.5552631578947367e-05,
"objective/entropy": -291.2635498046875,
"objective/kl": 25.438182830810547,
"objective/non_score_reward": -2.543818235397339,
"objective/rlhf_reward": -5.775272941589355,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.44543614983558655,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.8033074736595154,
"step": 549,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.9994637966156006
},
{
"episode": 8816,
"epoch": 0.2418390300104241,
"loss/policy_avg": -1.257480263710022,
"lr": 1.5526315789473686e-05,
"objective/entropy": -137.553466796875,
"objective/kl": 48.276798248291016,
"objective/non_score_reward": -4.827679634094238,
"objective/rlhf_reward": -16.910718774795534,
"objective/scores": 0.6,
"policy/approxkl_avg": 121.47811126708984,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.5976001024246216,
"step": 550,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9993959665298462
},
{
"episode": 8832,
"epoch": 0.24227793932078784,
"loss/policy_avg": -0.5651368498802185,
"lr": 1.55e-05,
"objective/entropy": -161.20584106445312,
"objective/kl": 46.88820266723633,
"objective/non_score_reward": -4.688819885253906,
"objective/rlhf_reward": -16.930451269420693,
"objective/scores": 0.4562071871080222,
"policy/approxkl_avg": 53.305084228515625,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.7755599021911621,
"step": 551,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.000209331512451
},
{
"episode": 8848,
"epoch": 0.2427168486311516,
"loss/policy_avg": -0.052873387932777405,
"lr": 1.547368421052632e-05,
"objective/entropy": -93.70918273925781,
"objective/kl": 56.23017883300781,
"objective/non_score_reward": -5.62301778793335,
"objective/rlhf_reward": -18.092072105407716,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.4745566844940186,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.7499211430549622,
"step": 552,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.999940037727356
},
{
"episode": 8864,
"epoch": 0.24315575794151534,
"loss/policy_avg": -0.07624203711748123,
"lr": 1.5447368421052634e-05,
"objective/entropy": -70.65289306640625,
"objective/kl": 51.36430358886719,
"objective/non_score_reward": -5.136430740356445,
"objective/rlhf_reward": -16.145722961425783,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.9020864963531494,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.5754582285881042,
"step": 553,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.999817132949829
},
{
"episode": 8880,
"epoch": 0.2435946672518791,
"loss/policy_avg": -0.33316770195961,
"lr": 1.542105263157895e-05,
"objective/entropy": -100.49559020996094,
"objective/kl": 47.4459228515625,
"objective/non_score_reward": -4.744592666625977,
"objective/rlhf_reward": -14.578370666503908,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.17755243182182312,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.5925395488739014,
"step": 554,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0002267360687256
},
{
"episode": 8896,
"epoch": 0.24403357656224284,
"loss/policy_avg": -0.24473661184310913,
"lr": 1.5394736842105264e-05,
"objective/entropy": -85.58175659179688,
"objective/kl": 60.863525390625,
"objective/non_score_reward": -6.086352348327637,
"objective/rlhf_reward": -23.94540939331055,
"objective/scores": 0.1,
"policy/approxkl_avg": 1.0000783205032349,
"policy/clipfrac_avg": 1.75,
"policy/entropy_avg": 0.6594964861869812,
"step": 555,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.999789834022522
},
{
"episode": 8912,
"epoch": 0.24447248587260656,
"loss/policy_avg": 0.9693965911865234,
"lr": 1.536842105263158e-05,
"objective/entropy": -82.48194885253906,
"objective/kl": 73.32645416259766,
"objective/non_score_reward": -7.332645416259766,
"objective/rlhf_reward": -24.930581188201906,
"objective/scores": 1.1,
"policy/approxkl_avg": 134.82119750976562,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6412348747253418,
"step": 556,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9974418878555298
},
{
"episode": 8928,
"epoch": 0.2449113951829703,
"loss/policy_avg": -0.02250886708498001,
"lr": 1.5342105263157897e-05,
"objective/entropy": -291.52117919921875,
"objective/kl": 33.25413513183594,
"objective/non_score_reward": -3.325413703918457,
"objective/rlhf_reward": -15.301654815673828,
"objective/scores": -0.5,
"policy/approxkl_avg": 0.8158763647079468,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.5648782253265381,
"step": 557,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 1.999756932258606
},
{
"episode": 8944,
"epoch": 0.24535030449333406,
"loss/policy_avg": -1.834641695022583,
"lr": 1.5315789473684212e-05,
"objective/entropy": -98.22699737548828,
"objective/kl": 65.66693115234375,
"objective/non_score_reward": -6.566692352294922,
"objective/rlhf_reward": -21.866769886016847,
"objective/scores": 1.1,
"policy/approxkl_avg": 60.248626708984375,
"policy/clipfrac_avg": 1.25,
"policy/entropy_avg": 0.6188458204269409,
"step": 558,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9987174272537231
},
{
"episode": 8960,
"epoch": 0.2457892138036978,
"loss/policy_avg": 0.5259836316108704,
"lr": 1.5289473684210527e-05,
"objective/entropy": -80.89381408691406,
"objective/kl": 49.18425750732422,
"objective/non_score_reward": -4.918426513671875,
"objective/rlhf_reward": -15.273705101013185,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.9190274477005005,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.6806643009185791,
"step": 559,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9992485046386719
},
{
"episode": 8976,
"epoch": 0.24622812311406156,
"loss/policy_avg": 0.2278147041797638,
"lr": 1.5263157894736842e-05,
"objective/entropy": -60.12748718261719,
"objective/kl": 55.052490234375,
"objective/non_score_reward": -5.5052490234375,
"objective/rlhf_reward": -21.62099657058716,
"objective/scores": 0.1,
"policy/approxkl_avg": 0.8754554986953735,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.5408790707588196,
"step": 560,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9984455108642578
},
{
"episode": 8992,
"epoch": 0.2466670324244253,
"loss/policy_avg": 0.31462305784225464,
"lr": 1.5236842105263159e-05,
"objective/entropy": -285.16705322265625,
"objective/kl": 36.19426727294922,
"objective/non_score_reward": -3.6194264888763428,
"objective/rlhf_reward": -12.652877207073281,
"objective/scores": 0.4562071871080222,
"policy/approxkl_avg": 1.1670053005218506,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.6850937008857727,
"step": 561,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9999622106552124
},
{
"episode": 9008,
"epoch": 0.24710594173478906,
"loss/policy_avg": 0.40112099051475525,
"lr": 1.5210526315789476e-05,
"objective/entropy": -69.30557250976562,
"objective/kl": 55.801090240478516,
"objective/non_score_reward": -5.580108642578125,
"objective/rlhf_reward": -17.920436000823976,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.1320338249206543,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.8245863318443298,
"step": 562,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0004453659057617
},
{
"episode": 9024,
"epoch": 0.2475448510451528,
"loss/policy_avg": 1.3591008186340332,
"lr": 1.518421052631579e-05,
"objective/entropy": -142.46173095703125,
"objective/kl": 52.2000617980957,
"objective/non_score_reward": -5.220005989074707,
"objective/rlhf_reward": -16.480024433135988,
"objective/scores": 1.1,
"policy/approxkl_avg": 61.359336853027344,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.7486914396286011,
"step": 563,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9949495792388916
},
{
"episode": 9040,
"epoch": 0.24798376035551653,
"loss/policy_avg": -0.2928787171840668,
"lr": 1.5157894736842105e-05,
"objective/entropy": -53.85539245605469,
"objective/kl": 69.98612213134766,
"objective/non_score_reward": -6.998612403869629,
"objective/rlhf_reward": -23.5944486618042,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.6392881870269775,
"policy/clipfrac_avg": 1.5,
"policy/entropy_avg": 0.6351510286331177,
"step": 564,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9990606307983398
},
{
"episode": 9056,
"epoch": 0.24842266966588028,
"loss/policy_avg": 0.3248935341835022,
"lr": 1.5131578947368422e-05,
"objective/entropy": -45.36878204345703,
"objective/kl": 59.32015609741211,
"objective/non_score_reward": -5.932015419006348,
"objective/rlhf_reward": -23.32806262969971,
"objective/scores": 0.1,
"policy/approxkl_avg": 181.79592895507812,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.675167441368103,
"step": 565,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9971544742584229
},
{
"episode": 9072,
"epoch": 0.24886157897624403,
"loss/policy_avg": 0.10387852787971497,
"lr": 1.5105263157894737e-05,
"objective/entropy": -40.72087860107422,
"objective/kl": 57.553802490234375,
"objective/non_score_reward": -5.755380630493164,
"objective/rlhf_reward": -18.62152156829834,
"objective/scores": 1.1,
"policy/approxkl_avg": 2.47540283203125,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.588182806968689,
"step": 566,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9985510110855103
},
{
"episode": 9088,
"epoch": 0.24930048828660778,
"loss/policy_avg": 0.3820185959339142,
"lr": 1.5078947368421054e-05,
"objective/entropy": -82.16580200195312,
"objective/kl": 60.49824523925781,
"objective/non_score_reward": -6.049825191497803,
"objective/rlhf_reward": -19.799300289154054,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.949872612953186,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.6572426557540894,
"step": 567,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9995747804641724
},
{
"episode": 9104,
"epoch": 0.24973939759697153,
"loss/policy_avg": 1.8019391298294067,
"lr": 1.5052631578947369e-05,
"objective/entropy": -60.920955657958984,
"objective/kl": 69.83343505859375,
"objective/non_score_reward": -6.983344078063965,
"objective/rlhf_reward": -23.533374881744386,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.9492524862289429,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.6470024585723877,
"step": 568,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9977750778198242
},
{
"episode": 9120,
"epoch": 0.2501783069073353,
"loss/policy_avg": 0.1746143400669098,
"lr": 1.5026315789473685e-05,
"objective/entropy": -273.22198486328125,
"objective/kl": 20.09288215637207,
"objective/non_score_reward": -2.0092883110046387,
"objective/rlhf_reward": -10.037153244018555,
"objective/scores": -0.5,
"policy/approxkl_avg": 0.23779484629631042,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.6256122589111328,
"step": 569,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 1.9998234510421753
},
{
"episode": 9136,
"epoch": 0.250617216217699,
"loss/policy_avg": -1.2724919319152832,
"lr": 1.5e-05,
"objective/entropy": -167.3118896484375,
"objective/kl": 67.96356201171875,
"objective/non_score_reward": -6.796355724334717,
"objective/rlhf_reward": -26.78542289733887,
"objective/scores": 0.1,
"policy/approxkl_avg": 129.8609619140625,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.5831791162490845,
"step": 570,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.0005290508270264
},
{
"episode": 9152,
"epoch": 0.2510561255280628,
"loss/policy_avg": -0.5978893041610718,
"lr": 1.4973684210526315e-05,
"objective/entropy": -162.8898468017578,
"objective/kl": 44.88909912109375,
"objective/non_score_reward": -4.48891019821167,
"objective/rlhf_reward": -13.555640316009523,
"objective/scores": 1.1,
"policy/approxkl_avg": 58.46027374267578,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6906276941299438,
"step": 571,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 5,
"val/ratio": 2.0012764930725098
},
{
"episode": 9168,
"epoch": 0.2514950348384265,
"loss/policy_avg": 0.5245893597602844,
"lr": 1.4947368421052632e-05,
"objective/entropy": -91.91587829589844,
"objective/kl": 55.85387420654297,
"objective/non_score_reward": -5.585387229919434,
"objective/rlhf_reward": -17.941549396514894,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.2534908056259155,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.5883944630622864,
"step": 572,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9999537467956543
},
{
"episode": 9184,
"epoch": 0.2519339441487903,
"loss/policy_avg": -0.6780993342399597,
"lr": 1.4921052631578947e-05,
"objective/entropy": -72.03091430664062,
"objective/kl": 63.12275695800781,
"objective/non_score_reward": -6.3122758865356445,
"objective/rlhf_reward": -22.84910306930542,
"objective/scores": 0.6,
"policy/approxkl_avg": 0.9321353435516357,
"policy/clipfrac_avg": 1.75,
"policy/entropy_avg": 0.7479265332221985,
"step": 573,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0003271102905273
},
{
"episode": 9200,
"epoch": 0.252372853459154,
"loss/policy_avg": 0.025867890566587448,
"lr": 1.4894736842105264e-05,
"objective/entropy": -305.865966796875,
"objective/kl": 30.30899429321289,
"objective/non_score_reward": -3.0308995246887207,
"objective/rlhf_reward": -14.123598098754883,
"objective/scores": -0.5,
"policy/approxkl_avg": 0.22707101702690125,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.6340222358703613,
"step": 574,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9998149871826172
},
{
"episode": 9216,
"epoch": 0.2528117627695177,
"loss/policy_avg": 0.2892727851867676,
"lr": 1.4868421052631579e-05,
"objective/entropy": -4.849845886230469,
"objective/kl": 57.78361511230469,
"objective/non_score_reward": -5.7783613204956055,
"objective/rlhf_reward": -18.713445281982423,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.7950266003608704,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.7515757083892822,
"step": 575,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0013017654418945
},
{
"episode": 9232,
"epoch": 0.2532506720798815,
"loss/policy_avg": 3.0340328216552734,
"lr": 1.4842105263157895e-05,
"objective/entropy": -41.5596923828125,
"objective/kl": 67.7565689086914,
"objective/non_score_reward": -6.775656700134277,
"objective/rlhf_reward": -22.70262727737427,
"objective/scores": 1.1,
"policy/approxkl_avg": 1.2664908170700073,
"policy/clipfrac_avg": 0.5,
"policy/entropy_avg": 0.609063982963562,
"step": 576,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9999446868896484
},
{
"episode": 9248,
"epoch": 0.2536895813902452,
"loss/policy_avg": 0.24365383386611938,
"lr": 1.481578947368421e-05,
"objective/entropy": -62.568878173828125,
"objective/kl": 73.23529052734375,
"objective/non_score_reward": -7.3235297203063965,
"objective/rlhf_reward": -24.894118881225587,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.7512961626052856,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6471009850502014,
"step": 577,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 2.0002264976501465
},
{
"episode": 9264,
"epoch": 0.254128490700609,
"loss/policy_avg": 1.6541283130645752,
"lr": 1.4789473684210525e-05,
"objective/entropy": -75.43449401855469,
"objective/kl": 60.49390411376953,
"objective/non_score_reward": -6.0493903160095215,
"objective/rlhf_reward": -21.273842249752256,
"objective/scores": 0.7309297535714575,
"policy/approxkl_avg": 0.5560513734817505,
"policy/clipfrac_avg": 0.25,
"policy/entropy_avg": 0.5372088551521301,
"step": 578,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9995195865631104
},
{
"episode": 9280,
"epoch": 0.2545674000109727,
"loss/policy_avg": -0.0851164162158966,
"lr": 1.4763157894736842e-05,
"objective/entropy": -289.71875,
"objective/kl": 32.05172348022461,
"objective/non_score_reward": -3.205172538757324,
"objective/rlhf_reward": -8.420690155029298,
"objective/scores": 1.1,
"policy/approxkl_avg": 0.7391265034675598,
"policy/clipfrac_avg": 0.75,
"policy/entropy_avg": 0.6136614084243774,
"step": 579,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 6,
"val/ratio": 2.000051975250244
},
{
"episode": 9296,
"epoch": 0.2550063093213365,
"loss/policy_avg": 4.458627223968506,
"lr": 1.4736842105263157e-05,
"objective/entropy": 24.38788414001465,
"objective/kl": 78.81938171386719,
"objective/non_score_reward": -7.881937503814697,
"objective/rlhf_reward": -31.12775049209595,
"objective/scores": 0.1,
"policy/approxkl_avg": 237.8883514404297,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6564638018608093,
"step": 580,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 3,
"val/ratio": 1.9961477518081665
},
{
"episode": 9312,
"epoch": 0.2554452186317002,
"loss/policy_avg": 0.8149951100349426,
"lr": 1.4710526315789475e-05,
"objective/entropy": -61.854007720947266,
"objective/kl": 52.87870407104492,
"objective/non_score_reward": -5.287870407104492,
"objective/rlhf_reward": -20.75148162841797,
"objective/scores": 0.1,
"policy/approxkl_avg": 2.4551239013671875,
"policy/clipfrac_avg": 1.0,
"policy/entropy_avg": 0.6414899230003357,
"step": 581,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 4,
"val/ratio": 1.9990167617797852
}
],
"logging_steps": 500,
"max_steps": 570,
"num_input_tokens_seen": 0,
"num_train_epochs": 1.0,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0,
"train_batch_size": null,
"trial_name": null,
"trial_params": null
}