ppo_model_final / checkpoint-107 /trainer_state.json
MMattaparthy's picture
Upload sfo model
f80f695 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"episode": 1712,
"epoch": 3.0035087719298246,
"eval_steps": 500,
"global_step": 107,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"episode": 16,
"epoch": 0.028070175438596492,
"eps": 2,
"loss/policy_avg": 0.040254347026348114,
"loss/value_avg": 4.365694046020508,
"lr": 1.41e-05,
"objective/entropy": 20.665752410888672,
"objective/kl": 38.31879425048828,
"objective/non_score_reward": -1.9159398078918457,
"objective/rlhf_reward": -0.5565648078918457,
"objective/scores": 1.359375,
"policy/approxkl_avg": 5.953035354614258,
"policy/clipfrac_avg": 0.2399764209985733,
"policy/entropy_avg": 0.6761884689331055,
"step": 1,
"val/clipfrac_avg": 0.26650944352149963,
"val/num_eos_tokens": 0,
"val/ratio": 0.8525064587593079,
"val/ratio_var": 0.0006113838753663003
},
{
"episode": 32,
"epoch": 0.056140350877192984,
"eps": 2,
"loss/policy_avg": 0.0446447990834713,
"loss/value_avg": 3.0273280143737793,
"lr": 1.3968224299065421e-05,
"objective/entropy": 21.37685203552246,
"objective/kl": 73.40159606933594,
"objective/non_score_reward": -3.6700797080993652,
"objective/rlhf_reward": -2.5294547080993652,
"objective/scores": 1.140625,
"policy/approxkl_avg": 6.8038129806518555,
"policy/clipfrac_avg": 0.22936320304870605,
"policy/entropy_avg": 0.6141112446784973,
"step": 2,
"val/clipfrac_avg": 0.2458726465702057,
"val/num_eos_tokens": 0,
"val/ratio": 0.8650269508361816,
"val/ratio_var": 0.0003330775070935488
},
{
"episode": 48,
"epoch": 0.08421052631578947,
"eps": 3,
"loss/policy_avg": 0.07702315598726273,
"loss/value_avg": 2.1064209938049316,
"lr": 1.3836448598130842e-05,
"objective/entropy": 26.10182762145996,
"objective/kl": 74.03025817871094,
"objective/non_score_reward": -3.7015130519866943,
"objective/rlhf_reward": -2.6546380519866943,
"objective/scores": 1.046875,
"policy/approxkl_avg": 5.6432695388793945,
"policy/clipfrac_avg": 0.2087264060974121,
"policy/entropy_avg": 0.7391780614852905,
"step": 3,
"val/clipfrac_avg": 0.21639150381088257,
"val/num_eos_tokens": 0,
"val/ratio": 0.8773487210273743,
"val/ratio_var": 0.00039911610656417906
},
{
"episode": 64,
"epoch": 0.11228070175438597,
"eps": 3,
"loss/policy_avg": 0.03978118300437927,
"loss/value_avg": 1.5315862894058228,
"lr": 1.3704672897196262e-05,
"objective/entropy": 20.781639099121094,
"objective/kl": 68.94087219238281,
"objective/non_score_reward": -3.4470434188842773,
"objective/rlhf_reward": -2.4470434188842773,
"objective/scores": 1.0,
"policy/approxkl_avg": 4.726678848266602,
"policy/clipfrac_avg": 0.20341980457305908,
"policy/entropy_avg": 0.7119085788726807,
"step": 4,
"val/clipfrac_avg": 0.16096699237823486,
"val/num_eos_tokens": 0,
"val/ratio": 0.8485063314437866,
"val/ratio_var": 0.0003752955235540867
},
{
"episode": 80,
"epoch": 0.14035087719298245,
"eps": 3,
"loss/policy_avg": 0.06184825301170349,
"loss/value_avg": 0.9904976487159729,
"lr": 1.3572897196261683e-05,
"objective/entropy": 22.48508071899414,
"objective/kl": 75.60542297363281,
"objective/non_score_reward": -3.780271291732788,
"objective/rlhf_reward": -3.272458791732788,
"objective/scores": 0.5078125,
"policy/approxkl_avg": 4.352260589599609,
"policy/clipfrac_avg": 0.2146226465702057,
"policy/entropy_avg": 0.7200251817703247,
"step": 5,
"val/clipfrac_avg": 0.09669811278581619,
"val/num_eos_tokens": 0,
"val/ratio": 0.8546276092529297,
"val/ratio_var": 0.00010198648669756949
},
{
"episode": 96,
"epoch": 0.16842105263157894,
"eps": 3,
"loss/policy_avg": 0.046236515045166016,
"loss/value_avg": 1.2219572067260742,
"lr": 1.3441121495327103e-05,
"objective/entropy": 22.197113037109375,
"objective/kl": 82.56051635742188,
"objective/non_score_reward": -4.128026008605957,
"objective/rlhf_reward": -3.253026008605957,
"objective/scores": 0.875,
"policy/approxkl_avg": 3.8577029705047607,
"policy/clipfrac_avg": 0.22641509771347046,
"policy/entropy_avg": 0.7238099575042725,
"step": 6,
"val/clipfrac_avg": 0.0383254736661911,
"val/num_eos_tokens": 0,
"val/ratio": 0.8292837738990784,
"val/ratio_var": 6.185401434777305e-05
},
{
"episode": 112,
"epoch": 0.19649122807017544,
"eps": 3,
"loss/policy_avg": 0.06606701016426086,
"loss/value_avg": 1.325202226638794,
"lr": 1.3309345794392524e-05,
"objective/entropy": 27.022249221801758,
"objective/kl": 101.50897216796875,
"objective/non_score_reward": -5.075448513031006,
"objective/rlhf_reward": -3.653573513031006,
"objective/scores": 1.421875,
"policy/approxkl_avg": 5.585095405578613,
"policy/clipfrac_avg": 0.24174529314041138,
"policy/entropy_avg": 0.8859995603561401,
"step": 7,
"val/clipfrac_avg": 0.028301887214183807,
"val/num_eos_tokens": 0,
"val/ratio": 0.8486474752426147,
"val/ratio_var": 0.0005388148711062968
},
{
"episode": 128,
"epoch": 0.22456140350877193,
"eps": 3,
"loss/policy_avg": 0.06467999517917633,
"loss/value_avg": 0.9481044411659241,
"lr": 1.3177570093457945e-05,
"objective/entropy": 26.585155487060547,
"objective/kl": 116.76457214355469,
"objective/non_score_reward": -5.838229179382324,
"objective/rlhf_reward": -4.111666679382324,
"objective/scores": 1.7265625,
"policy/approxkl_avg": 5.351860046386719,
"policy/clipfrac_avg": 0.24469339847564697,
"policy/entropy_avg": 0.8991943597793579,
"step": 8,
"val/clipfrac_avg": 0.08785377442836761,
"val/num_eos_tokens": 0,
"val/ratio": 0.8524093627929688,
"val/ratio_var": 3.493872281978838e-05
},
{
"episode": 144,
"epoch": 0.25263157894736843,
"eps": 3,
"loss/policy_avg": 0.03949737548828125,
"loss/value_avg": 0.9386715888977051,
"lr": 1.3045794392523365e-05,
"objective/entropy": 29.664459228515625,
"objective/kl": 132.19940185546875,
"objective/non_score_reward": -6.6099700927734375,
"objective/rlhf_reward": -5.6763763427734375,
"objective/scores": 0.93359375,
"policy/approxkl_avg": 5.967764377593994,
"policy/clipfrac_avg": 0.2057783007621765,
"policy/entropy_avg": 1.0122931003570557,
"step": 9,
"val/clipfrac_avg": 0.017099056392908096,
"val/num_eos_tokens": 0,
"val/ratio": 0.8205329179763794,
"val/ratio_var": 7.763963367324322e-05
},
{
"episode": 160,
"epoch": 0.2807017543859649,
"eps": 3,
"loss/policy_avg": 0.0211679395288229,
"loss/value_avg": 1.213599681854248,
"lr": 1.2914018691588786e-05,
"objective/entropy": 31.076860427856445,
"objective/kl": 135.477294921875,
"objective/non_score_reward": -6.77386474609375,
"objective/rlhf_reward": -5.00042724609375,
"objective/scores": 1.7734375,
"policy/approxkl_avg": 3.5569186210632324,
"policy/clipfrac_avg": 0.21304652094841003,
"policy/entropy_avg": 1.1141610145568848,
"step": 10,
"val/clipfrac_avg": 0.06719152629375458,
"val/num_eos_tokens": 0,
"val/ratio": 0.8466310501098633,
"val/ratio_var": 0.0001414915022905916
},
{
"episode": 176,
"epoch": 0.3087719298245614,
"eps": 3,
"loss/policy_avg": 0.0240048635751009,
"loss/value_avg": 1.3511584997177124,
"lr": 1.2782242990654206e-05,
"objective/entropy": 32.90422058105469,
"objective/kl": 146.03787231445312,
"objective/non_score_reward": -7.301893711090088,
"objective/rlhf_reward": -5.630018711090088,
"objective/scores": 1.671875,
"policy/approxkl_avg": 5.625584125518799,
"policy/clipfrac_avg": 0.20400942862033844,
"policy/entropy_avg": 1.1266499757766724,
"step": 11,
"val/clipfrac_avg": 0.05483490601181984,
"val/num_eos_tokens": 0,
"val/ratio": 0.8710312843322754,
"val/ratio_var": 0.00014447471767198294
},
{
"episode": 192,
"epoch": 0.3368421052631579,
"eps": 3,
"loss/policy_avg": 0.05039631202816963,
"loss/value_avg": 1.4507163763046265,
"lr": 1.2650467289719627e-05,
"objective/entropy": 34.778846740722656,
"objective/kl": 152.17945861816406,
"objective/non_score_reward": -7.608973503112793,
"objective/rlhf_reward": -5.726161003112793,
"objective/scores": 1.8828125,
"policy/approxkl_avg": 4.985030174255371,
"policy/clipfrac_avg": 0.1987028270959854,
"policy/entropy_avg": 1.1480720043182373,
"step": 12,
"val/clipfrac_avg": 0.05188679322600365,
"val/num_eos_tokens": 0,
"val/ratio": 0.8588758111000061,
"val/ratio_var": 3.4296579542569816e-05
},
{
"episode": 208,
"epoch": 0.3649122807017544,
"eps": 3,
"loss/policy_avg": 0.037938639521598816,
"loss/value_avg": 1.5314528942108154,
"lr": 1.2518691588785048e-05,
"objective/entropy": 45.65214538574219,
"objective/kl": 132.1256866455078,
"objective/non_score_reward": -6.606284141540527,
"objective/rlhf_reward": -6.803549766540527,
"objective/scores": -0.197265625,
"policy/approxkl_avg": 5.0863566398620605,
"policy/clipfrac_avg": 0.18867924809455872,
"policy/entropy_avg": 1.3655226230621338,
"step": 13,
"val/clipfrac_avg": 0.06367924809455872,
"val/num_eos_tokens": 0,
"val/ratio": 0.8282334804534912,
"val/ratio_var": 0.0004519254434853792
},
{
"episode": 224,
"epoch": 0.3929824561403509,
"eps": 3,
"loss/policy_avg": 0.036732763051986694,
"loss/value_avg": 2.189356803894043,
"lr": 1.2386915887850468e-05,
"objective/entropy": 38.35211944580078,
"objective/kl": 113.13496398925781,
"objective/non_score_reward": -5.656748294830322,
"objective/rlhf_reward": -4.539560794830322,
"objective/scores": 1.1171875,
"policy/approxkl_avg": 4.408792972564697,
"policy/clipfrac_avg": 0.21201542019844055,
"policy/entropy_avg": 1.2410473823547363,
"step": 14,
"val/clipfrac_avg": 0.10222071409225464,
"val/num_eos_tokens": 0,
"val/ratio": 0.8754662275314331,
"val/ratio_var": 0.0007480247295461595
},
{
"episode": 240,
"epoch": 0.42105263157894735,
"eps": 3,
"loss/policy_avg": 0.03690744563937187,
"loss/value_avg": 1.4401739835739136,
"lr": 1.2255140186915889e-05,
"objective/entropy": 38.69694519042969,
"objective/kl": 115.77085876464844,
"objective/non_score_reward": -5.788543224334717,
"objective/rlhf_reward": -5.627410411834717,
"objective/scores": 0.1611328125,
"policy/approxkl_avg": 4.673148155212402,
"policy/clipfrac_avg": 0.1833726465702057,
"policy/entropy_avg": 1.2108347415924072,
"step": 15,
"val/clipfrac_avg": 0.01179245300590992,
"val/num_eos_tokens": 0,
"val/ratio": 0.8487275838851929,
"val/ratio_var": 0.0008522844291292131
},
{
"episode": 256,
"epoch": 0.44912280701754387,
"eps": 3,
"loss/policy_avg": 0.019956424832344055,
"loss/value_avg": 1.5383408069610596,
"lr": 1.212336448598131e-05,
"objective/entropy": 32.02195739746094,
"objective/kl": 122.87109375,
"objective/non_score_reward": -6.1435546875,
"objective/rlhf_reward": -5.7470703125,
"objective/scores": 0.396484375,
"policy/approxkl_avg": 5.454257965087891,
"policy/clipfrac_avg": 0.22314535081386566,
"policy/entropy_avg": 1.1098759174346924,
"step": 16,
"val/clipfrac_avg": 0.03384597226977348,
"val/num_eos_tokens": 0,
"val/ratio": 0.8839770555496216,
"val/ratio_var": 0.0004538022622000426
},
{
"episode": 272,
"epoch": 0.47719298245614034,
"eps": 3,
"loss/policy_avg": 0.016002152115106583,
"loss/value_avg": 1.5554126501083374,
"lr": 1.199158878504673e-05,
"objective/entropy": 28.698740005493164,
"objective/kl": 129.80386352539062,
"objective/non_score_reward": -6.4901933670043945,
"objective/rlhf_reward": -6.1796464920043945,
"objective/scores": 0.310546875,
"policy/approxkl_avg": 3.0268969535827637,
"policy/clipfrac_avg": 0.19969519972801208,
"policy/entropy_avg": 0.9890843629837036,
"step": 17,
"val/clipfrac_avg": 0.07565668225288391,
"val/num_eos_tokens": 0,
"val/ratio": 0.8847370743751526,
"val/ratio_var": 0.000195541579159908
},
{
"episode": 288,
"epoch": 0.5052631578947369,
"eps": 3,
"loss/policy_avg": 0.013964798301458359,
"loss/value_avg": 1.590545892715454,
"lr": 1.185981308411215e-05,
"objective/entropy": 12.885665893554688,
"objective/kl": 107.77202606201172,
"objective/non_score_reward": -5.388601303100586,
"objective/rlhf_reward": -4.169851303100586,
"objective/scores": 1.21875,
"policy/approxkl_avg": 5.9386420249938965,
"policy/clipfrac_avg": 0.18009786307811737,
"policy/entropy_avg": 0.6803157925605774,
"step": 18,
"val/clipfrac_avg": 0.12902730703353882,
"val/num_eos_tokens": 0,
"val/ratio": 0.9129149913787842,
"val/ratio_var": 0.0004151359898969531
},
{
"episode": 304,
"epoch": 0.5333333333333333,
"eps": 3,
"loss/policy_avg": 0.02675745077431202,
"loss/value_avg": 1.745023488998413,
"lr": 1.1728037383177571e-05,
"objective/entropy": 22.75409698486328,
"objective/kl": 119.34423828125,
"objective/non_score_reward": -5.967212200164795,
"objective/rlhf_reward": -5.197680950164795,
"objective/scores": 0.76953125,
"policy/approxkl_avg": 6.532215118408203,
"policy/clipfrac_avg": 0.1759602427482605,
"policy/entropy_avg": 0.8050931692123413,
"step": 19,
"val/clipfrac_avg": 0.0052770450711250305,
"val/num_eos_tokens": 0,
"val/ratio": 0.9205665588378906,
"val/ratio_var": 0.0005024418351240456
},
{
"episode": 320,
"epoch": 0.5614035087719298,
"eps": 3,
"loss/policy_avg": 0.014609305188059807,
"loss/value_avg": 1.7434797286987305,
"lr": 1.159626168224299e-05,
"objective/entropy": 29.933460235595703,
"objective/kl": 144.45672607421875,
"objective/non_score_reward": -7.222836494445801,
"objective/rlhf_reward": -6.675961494445801,
"objective/scores": 0.546875,
"policy/approxkl_avg": 7.733211517333984,
"policy/clipfrac_avg": 0.1762971729040146,
"policy/entropy_avg": 0.8804416656494141,
"step": 20,
"val/clipfrac_avg": 0.026533018797636032,
"val/num_eos_tokens": 0,
"val/ratio": 0.8693416714668274,
"val/ratio_var": 0.000557584862690419
},
{
"episode": 336,
"epoch": 0.5894736842105263,
"eps": 3,
"loss/policy_avg": 0.019340746104717255,
"loss/value_avg": 1.471176266670227,
"lr": 1.146448598130841e-05,
"objective/entropy": 23.77098846435547,
"objective/kl": 149.46974182128906,
"objective/non_score_reward": -7.473487377166748,
"objective/rlhf_reward": -5.668799877166748,
"objective/scores": 1.8046875,
"policy/approxkl_avg": 5.010880470275879,
"policy/clipfrac_avg": 0.18341976404190063,
"policy/entropy_avg": 0.973499596118927,
"step": 21,
"val/clipfrac_avg": 0.016483133658766747,
"val/num_eos_tokens": 0,
"val/ratio": 0.8893705606460571,
"val/ratio_var": 0.0012118957238271832
},
{
"episode": 352,
"epoch": 0.6175438596491228,
"eps": 3,
"loss/policy_avg": 0.009950436651706696,
"loss/value_avg": 1.1944406032562256,
"lr": 1.1332710280373831e-05,
"objective/entropy": 23.91471290588379,
"objective/kl": 155.48843383789062,
"objective/non_score_reward": -7.774421691894531,
"objective/rlhf_reward": -7.125984191894531,
"objective/scores": 0.6484375,
"policy/approxkl_avg": 6.8445940017700195,
"policy/clipfrac_avg": 0.16214622557163239,
"policy/entropy_avg": 0.837052047252655,
"step": 22,
"val/clipfrac_avg": 0.09257075190544128,
"val/num_eos_tokens": 0,
"val/ratio": 0.9027042984962463,
"val/ratio_var": 0.00019580482330638915
},
{
"episode": 368,
"epoch": 0.6456140350877193,
"eps": 3,
"loss/policy_avg": 0.021877210587263107,
"loss/value_avg": 1.0937385559082031,
"lr": 1.1200934579439252e-05,
"objective/entropy": 23.552492141723633,
"objective/kl": 172.6247100830078,
"objective/non_score_reward": -8.631235122680664,
"objective/rlhf_reward": -6.787485122680664,
"objective/scores": 1.84375,
"policy/approxkl_avg": 6.227967262268066,
"policy/clipfrac_avg": 0.16203010082244873,
"policy/entropy_avg": 0.8671172857284546,
"step": 23,
"val/clipfrac_avg": 0.018410665914416313,
"val/num_eos_tokens": 0,
"val/ratio": 0.8919187784194946,
"val/ratio_var": 0.001309347921051085
},
{
"episode": 384,
"epoch": 0.6736842105263158,
"eps": 3,
"loss/policy_avg": 0.036975178867578506,
"loss/value_avg": 1.1198090314865112,
"lr": 1.1069158878504672e-05,
"objective/entropy": 20.281631469726562,
"objective/kl": 170.099365234375,
"objective/non_score_reward": -8.504968643188477,
"objective/rlhf_reward": -7.262781143188477,
"objective/scores": 1.2421875,
"policy/approxkl_avg": 6.825319766998291,
"policy/clipfrac_avg": 0.14049983024597168,
"policy/entropy_avg": 0.802283525466919,
"step": 24,
"val/clipfrac_avg": 0.0785929411649704,
"val/num_eos_tokens": 0,
"val/ratio": 0.8865500092506409,
"val/ratio_var": 0.0006475243135355413
},
{
"episode": 400,
"epoch": 0.7017543859649122,
"eps": 3,
"loss/policy_avg": 0.0237587820738554,
"loss/value_avg": 1.3415908813476562,
"lr": 1.0937383177570093e-05,
"objective/entropy": 21.36954689025879,
"objective/kl": 179.03819274902344,
"objective/non_score_reward": -8.951910018920898,
"objective/rlhf_reward": -7.334722518920898,
"objective/scores": 1.6171875,
"policy/approxkl_avg": 7.667660713195801,
"policy/clipfrac_avg": 0.11468379199504852,
"policy/entropy_avg": 0.7623839974403381,
"step": 25,
"val/clipfrac_avg": 0.27072370052337646,
"val/num_eos_tokens": 0,
"val/ratio": 0.8912988901138306,
"val/ratio_var": 2.9881122827646323e-05
},
{
"episode": 416,
"epoch": 0.7298245614035088,
"eps": 3,
"loss/policy_avg": 0.018890127539634705,
"loss/value_avg": 1.2341463565826416,
"lr": 1.0805607476635514e-05,
"objective/entropy": 16.852466583251953,
"objective/kl": 179.8382568359375,
"objective/non_score_reward": -8.991912841796875,
"objective/rlhf_reward": -8.312225341796875,
"objective/scores": 0.6796875,
"policy/approxkl_avg": 7.357123374938965,
"policy/clipfrac_avg": 0.11944779008626938,
"policy/entropy_avg": 0.7555092573165894,
"step": 26,
"val/clipfrac_avg": 0.2508935034275055,
"val/num_eos_tokens": 0,
"val/ratio": 0.8802620768547058,
"val/ratio_var": 0.00018670025747269392
},
{
"episode": 432,
"epoch": 0.7578947368421053,
"eps": 3,
"loss/policy_avg": 0.021100062876939774,
"loss/value_avg": 0.9362931847572327,
"lr": 1.0673831775700934e-05,
"objective/entropy": 21.68787384033203,
"objective/kl": 183.68655395507812,
"objective/non_score_reward": -9.184328079223633,
"objective/rlhf_reward": -7.199953079223633,
"objective/scores": 1.984375,
"policy/approxkl_avg": 5.103993892669678,
"policy/clipfrac_avg": 0.12146226316690445,
"policy/entropy_avg": 0.7987968921661377,
"step": 27,
"val/clipfrac_avg": 0.05424528568983078,
"val/num_eos_tokens": 0,
"val/ratio": 0.8928566575050354,
"val/ratio_var": 0.00013602118997368962
},
{
"episode": 448,
"epoch": 0.7859649122807018,
"eps": 3,
"loss/policy_avg": 0.011967534199357033,
"loss/value_avg": 1.0544021129608154,
"lr": 1.0542056074766355e-05,
"objective/entropy": 23.05614471435547,
"objective/kl": 182.8275146484375,
"objective/non_score_reward": -9.141375541687012,
"objective/rlhf_reward": -7.789813041687012,
"objective/scores": 1.3515625,
"policy/approxkl_avg": 4.963105201721191,
"policy/clipfrac_avg": 0.14268869161605835,
"policy/entropy_avg": 0.8151739835739136,
"step": 28,
"val/clipfrac_avg": 0.2146226465702057,
"val/num_eos_tokens": 0,
"val/ratio": 0.8715541362762451,
"val/ratio_var": 7.266430475283414e-05
},
{
"episode": 464,
"epoch": 0.8140350877192982,
"eps": 3,
"loss/policy_avg": 0.011134720407426357,
"loss/value_avg": 0.775411069393158,
"lr": 1.0410280373831775e-05,
"objective/entropy": 25.063724517822266,
"objective/kl": 188.51422119140625,
"objective/non_score_reward": -9.425710678100586,
"objective/rlhf_reward": -8.394460678100586,
"objective/scores": 1.03125,
"policy/approxkl_avg": 7.372692108154297,
"policy/clipfrac_avg": 0.1320754736661911,
"policy/entropy_avg": 0.8475193977355957,
"step": 29,
"val/clipfrac_avg": 0.04245283082127571,
"val/num_eos_tokens": 0,
"val/ratio": 0.891329824924469,
"val/ratio_var": 3.0236831207730575e-06
},
{
"episode": 480,
"epoch": 0.8421052631578947,
"eps": 3,
"loss/policy_avg": 0.017270730808377266,
"loss/value_avg": 0.7942019701004028,
"lr": 1.0278504672897196e-05,
"objective/entropy": 22.402624130249023,
"objective/kl": 185.72421264648438,
"objective/non_score_reward": -9.286211013793945,
"objective/rlhf_reward": -7.606523513793945,
"objective/scores": 1.6796875,
"policy/approxkl_avg": 8.755260467529297,
"policy/clipfrac_avg": 0.11261792480945587,
"policy/entropy_avg": 0.799101710319519,
"step": 30,
"val/clipfrac_avg": 0.014740565791726112,
"val/num_eos_tokens": 0,
"val/ratio": 0.8892796635627747,
"val/ratio_var": 6.262218084884807e-05
},
{
"episode": 496,
"epoch": 0.8701754385964913,
"eps": 3,
"loss/policy_avg": 0.019226763397455215,
"loss/value_avg": 0.6826229095458984,
"lr": 1.0146728971962616e-05,
"objective/entropy": 23.43070411682129,
"objective/kl": 201.67799377441406,
"objective/non_score_reward": -10.083900451660156,
"objective/rlhf_reward": -9.154212951660156,
"objective/scores": 0.9296875,
"policy/approxkl_avg": 8.144369125366211,
"policy/clipfrac_avg": 0.12264151126146317,
"policy/entropy_avg": 0.8400179147720337,
"step": 31,
"val/clipfrac_avg": 0.2228773534297943,
"val/num_eos_tokens": 0,
"val/ratio": 0.8812745213508606,
"val/ratio_var": 0.00010276544344378635
},
{
"episode": 512,
"epoch": 0.8982456140350877,
"eps": 3,
"loss/policy_avg": 0.005793810822069645,
"loss/value_avg": 0.8381754159927368,
"lr": 1.0014953271028037e-05,
"objective/entropy": 21.22824478149414,
"objective/kl": 185.61614990234375,
"objective/non_score_reward": -9.280807495117188,
"objective/rlhf_reward": -7.3511199951171875,
"objective/scores": 1.9296875,
"policy/approxkl_avg": 7.533528804779053,
"policy/clipfrac_avg": 0.12205187976360321,
"policy/entropy_avg": 0.7801576256752014,
"step": 32,
"val/clipfrac_avg": 0.09787735342979431,
"val/num_eos_tokens": 0,
"val/ratio": 0.8987904787063599,
"val/ratio_var": 0.00018136559810955077
},
{
"episode": 528,
"epoch": 0.9263157894736842,
"eps": 3,
"loss/policy_avg": 0.02254486456513405,
"loss/value_avg": 0.8877236843109131,
"lr": 9.883177570093458e-06,
"objective/entropy": 23.442203521728516,
"objective/kl": 188.65457153320312,
"objective/non_score_reward": -9.432729721069336,
"objective/rlhf_reward": -7.534292221069336,
"objective/scores": 1.8984375,
"policy/approxkl_avg": 5.329720497131348,
"policy/clipfrac_avg": 0.12323113530874252,
"policy/entropy_avg": 0.8007351160049438,
"step": 33,
"val/clipfrac_avg": 0.036556605249643326,
"val/num_eos_tokens": 0,
"val/ratio": 0.8945379257202148,
"val/ratio_var": 0.0001225806336151436
},
{
"episode": 544,
"epoch": 0.9543859649122807,
"eps": 3,
"loss/policy_avg": 0.01662730611860752,
"loss/value_avg": 0.637324869632721,
"lr": 9.751401869158878e-06,
"objective/entropy": 20.620216369628906,
"objective/kl": 186.25180053710938,
"objective/non_score_reward": -9.312589645385742,
"objective/rlhf_reward": -7.851652145385742,
"objective/scores": 1.4609375,
"policy/approxkl_avg": 8.322406768798828,
"policy/clipfrac_avg": 0.125,
"policy/entropy_avg": 0.7402328848838806,
"step": 34,
"val/clipfrac_avg": 0.002358490601181984,
"val/num_eos_tokens": 0,
"val/ratio": 0.8899158239364624,
"val/ratio_var": 0.0002540757122915238
},
{
"episode": 560,
"epoch": 0.9824561403508771,
"eps": 3,
"loss/policy_avg": 0.0003616265021264553,
"loss/value_avg": 0.6015822291374207,
"lr": 9.619626168224299e-06,
"objective/entropy": 21.12371063232422,
"objective/kl": 186.5103302001953,
"objective/non_score_reward": -9.325516700744629,
"objective/rlhf_reward": -7.817704200744629,
"objective/scores": 1.5078125,
"policy/approxkl_avg": 6.682015419006348,
"policy/clipfrac_avg": 0.13443395495414734,
"policy/entropy_avg": 0.7727504372596741,
"step": 35,
"val/clipfrac_avg": 0.037146229296922684,
"val/num_eos_tokens": 0,
"val/ratio": 0.8780511021614075,
"val/ratio_var": 6.755981303285807e-05
},
{
"episode": 576,
"epoch": 1.0105263157894737,
"eps": 3,
"loss/policy_avg": 0.009338408708572388,
"loss/value_avg": 0.6681860685348511,
"lr": 9.48785046728972e-06,
"objective/entropy": 21.184894561767578,
"objective/kl": 192.3019256591797,
"objective/non_score_reward": -9.615096092224121,
"objective/rlhf_reward": -7.443221092224121,
"objective/scores": 2.171875,
"policy/approxkl_avg": 6.053627967834473,
"policy/clipfrac_avg": 0.1179245263338089,
"policy/entropy_avg": 0.7489176392555237,
"step": 36,
"val/clipfrac_avg": 0.15683962404727936,
"val/num_eos_tokens": 0,
"val/ratio": 0.8922820091247559,
"val/ratio_var": 0.00026253468240611255
},
{
"episode": 592,
"epoch": 1.03859649122807,
"eps": 3,
"loss/policy_avg": 0.006294197402894497,
"loss/value_avg": 0.8030184507369995,
"lr": 9.35607476635514e-06,
"objective/entropy": 20.368942260742188,
"objective/kl": 190.313232421875,
"objective/non_score_reward": -9.51566219329834,
"objective/rlhf_reward": -8.02347469329834,
"objective/scores": 1.4921875,
"policy/approxkl_avg": 6.838142395019531,
"policy/clipfrac_avg": 0.11615566164255142,
"policy/entropy_avg": 0.7397478222846985,
"step": 37,
"val/clipfrac_avg": 0.003537735901772976,
"val/num_eos_tokens": 0,
"val/ratio": 0.8849948644638062,
"val/ratio_var": 8.990589412860572e-05
},
{
"episode": 608,
"epoch": 1.0666666666666667,
"eps": 3,
"loss/policy_avg": 0.016940509900450706,
"loss/value_avg": 0.5882998704910278,
"lr": 9.22429906542056e-06,
"objective/entropy": 23.183269500732422,
"objective/kl": 186.7628631591797,
"objective/non_score_reward": -9.338143348693848,
"objective/rlhf_reward": -7.564705848693848,
"objective/scores": 1.7734375,
"policy/approxkl_avg": 3.9737157821655273,
"policy/clipfrac_avg": 0.13089622557163239,
"policy/entropy_avg": 0.7805944681167603,
"step": 38,
"val/clipfrac_avg": 0.004127358552068472,
"val/num_eos_tokens": 0,
"val/ratio": 0.8967232704162598,
"val/ratio_var": 0.00010195688810199499
},
{
"episode": 624,
"epoch": 1.0947368421052632,
"eps": 3,
"loss/policy_avg": 0.011485239490866661,
"loss/value_avg": 0.5787074565887451,
"lr": 9.092523364485981e-06,
"objective/entropy": 17.85469627380371,
"objective/kl": 187.50631713867188,
"objective/non_score_reward": -9.375316619873047,
"objective/rlhf_reward": -7.203441619873047,
"objective/scores": 2.171875,
"policy/approxkl_avg": 5.274375915527344,
"policy/clipfrac_avg": 0.11556603759527206,
"policy/entropy_avg": 0.7150323390960693,
"step": 39,
"val/clipfrac_avg": 0.16509434580802917,
"val/num_eos_tokens": 0,
"val/ratio": 0.8858749866485596,
"val/ratio_var": 0.0001809587120078504
},
{
"episode": 640,
"epoch": 1.1228070175438596,
"eps": 3,
"loss/policy_avg": 0.017689252272248268,
"loss/value_avg": 0.7021454572677612,
"lr": 8.960747663551402e-06,
"objective/entropy": 21.994365692138672,
"objective/kl": 182.05810546875,
"objective/non_score_reward": -9.1029052734375,
"objective/rlhf_reward": -6.8841552734375,
"objective/scores": 2.21875,
"policy/approxkl_avg": 4.578630447387695,
"policy/clipfrac_avg": 0.12558962404727936,
"policy/entropy_avg": 0.7208189964294434,
"step": 40,
"val/clipfrac_avg": 0.06426886469125748,
"val/num_eos_tokens": 0,
"val/ratio": 0.8957177400588989,
"val/ratio_var": 7.126481068553403e-05
},
{
"episode": 656,
"epoch": 1.1508771929824562,
"eps": 3,
"loss/policy_avg": 0.005923585034906864,
"loss/value_avg": 0.5532969236373901,
"lr": 8.828971962616822e-06,
"objective/entropy": 18.056703567504883,
"objective/kl": 170.11863708496094,
"objective/non_score_reward": -8.505931854248047,
"objective/rlhf_reward": -6.380931854248047,
"objective/scores": 2.125,
"policy/approxkl_avg": 3.3685710430145264,
"policy/clipfrac_avg": 0.12028301507234573,
"policy/entropy_avg": 0.6859033107757568,
"step": 41,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.8874868154525757,
"val/ratio_var": 4.332972093834542e-05
},
{
"episode": 672,
"epoch": 1.1789473684210525,
"eps": 3,
"loss/policy_avg": 0.012987145222723484,
"loss/value_avg": 0.632028341293335,
"lr": 8.697196261682243e-06,
"objective/entropy": 19.865787506103516,
"objective/kl": 177.42971801757812,
"objective/non_score_reward": -8.871485710144043,
"objective/rlhf_reward": -6.527735710144043,
"objective/scores": 2.34375,
"policy/approxkl_avg": 6.186136245727539,
"policy/clipfrac_avg": 0.13325470685958862,
"policy/entropy_avg": 0.6923173069953918,
"step": 42,
"val/clipfrac_avg": 0.125,
"val/num_eos_tokens": 0,
"val/ratio": 0.8991193771362305,
"val/ratio_var": 0.0009607934043742716
},
{
"episode": 688,
"epoch": 1.207017543859649,
"eps": 3,
"loss/policy_avg": 0.011711956933140755,
"loss/value_avg": 0.4474850296974182,
"lr": 8.565420560747664e-06,
"objective/entropy": 19.79934310913086,
"objective/kl": 176.83395385742188,
"objective/non_score_reward": -8.84169864654541,
"objective/rlhf_reward": -7.02138614654541,
"objective/scores": 1.8203125,
"policy/approxkl_avg": 5.904331207275391,
"policy/clipfrac_avg": 0.13089622557163239,
"policy/entropy_avg": 0.6940422654151917,
"step": 43,
"val/clipfrac_avg": 0.000589622650295496,
"val/num_eos_tokens": 0,
"val/ratio": 0.8869062066078186,
"val/ratio_var": 0.0008940807892940938
},
{
"episode": 704,
"epoch": 1.2350877192982457,
"eps": 3,
"loss/policy_avg": 0.014619714580476284,
"loss/value_avg": 0.625725507736206,
"lr": 8.433644859813084e-06,
"objective/entropy": 22.385128021240234,
"objective/kl": 181.2716064453125,
"objective/non_score_reward": -9.063579559326172,
"objective/rlhf_reward": -7.493267059326172,
"objective/scores": 1.5703125,
"policy/approxkl_avg": 4.966976165771484,
"policy/clipfrac_avg": 0.14799527823925018,
"policy/entropy_avg": 0.7825338840484619,
"step": 44,
"val/clipfrac_avg": 0.14681604504585266,
"val/num_eos_tokens": 0,
"val/ratio": 0.883050799369812,
"val/ratio_var": 1.7675925846560858e-05
},
{
"episode": 720,
"epoch": 1.263157894736842,
"eps": 3,
"loss/policy_avg": 0.024550937116146088,
"loss/value_avg": 0.7726404070854187,
"lr": 8.301869158878505e-06,
"objective/entropy": 19.87116050720215,
"objective/kl": 172.38674926757812,
"objective/non_score_reward": -8.619338035583496,
"objective/rlhf_reward": -6.572463035583496,
"objective/scores": 2.046875,
"policy/approxkl_avg": 4.95554256439209,
"policy/clipfrac_avg": 0.13089622557163239,
"policy/entropy_avg": 0.6934086680412292,
"step": 45,
"val/clipfrac_avg": 0.009433962404727936,
"val/num_eos_tokens": 0,
"val/ratio": 0.8879209756851196,
"val/ratio_var": 0.0003393842780496925
},
{
"episode": 736,
"epoch": 1.2912280701754386,
"eps": 3,
"loss/policy_avg": 0.019363895058631897,
"loss/value_avg": 0.5529497861862183,
"lr": 8.170093457943925e-06,
"objective/entropy": 17.47795295715332,
"objective/kl": 174.22576904296875,
"objective/non_score_reward": -8.711288452148438,
"objective/rlhf_reward": -6.3206634521484375,
"objective/scores": 2.390625,
"policy/approxkl_avg": 3.8177051544189453,
"policy/clipfrac_avg": 0.11851415038108826,
"policy/entropy_avg": 0.6579099297523499,
"step": 46,
"val/clipfrac_avg": 0.001179245300590992,
"val/num_eos_tokens": 0,
"val/ratio": 0.9020024538040161,
"val/ratio_var": 0.00011536524834809825
},
{
"episode": 752,
"epoch": 1.3192982456140352,
"eps": 3,
"loss/policy_avg": 0.012542858719825745,
"loss/value_avg": 0.6548900604248047,
"lr": 8.038317757009346e-06,
"objective/entropy": 17.823394775390625,
"objective/kl": 171.56602478027344,
"objective/non_score_reward": -8.578301429748535,
"objective/rlhf_reward": -6.828301429748535,
"objective/scores": 1.75,
"policy/approxkl_avg": 5.147519111633301,
"policy/clipfrac_avg": 0.14563679695129395,
"policy/entropy_avg": 0.6417368650436401,
"step": 47,
"val/clipfrac_avg": 0.004127358552068472,
"val/num_eos_tokens": 0,
"val/ratio": 0.8848338723182678,
"val/ratio_var": 0.000510354817379266
},
{
"episode": 768,
"epoch": 1.3473684210526315,
"eps": 3,
"loss/policy_avg": 0.013358336873352528,
"loss/value_avg": 0.3859623968601227,
"lr": 7.906542056074766e-06,
"objective/entropy": 16.128374099731445,
"objective/kl": 168.9840087890625,
"objective/non_score_reward": -8.449200630187988,
"objective/rlhf_reward": -6.847638130187988,
"objective/scores": 1.6015625,
"policy/approxkl_avg": 4.781813144683838,
"policy/clipfrac_avg": 0.12028302252292633,
"policy/entropy_avg": 0.5887731313705444,
"step": 48,
"val/clipfrac_avg": 0.01179245300590992,
"val/num_eos_tokens": 0,
"val/ratio": 0.9012677669525146,
"val/ratio_var": 0.00041765146306715906
},
{
"episode": 784,
"epoch": 1.3754385964912281,
"eps": 3,
"loss/policy_avg": 0.017281489446759224,
"loss/value_avg": 0.5614770650863647,
"lr": 7.774766355140187e-06,
"objective/entropy": 20.177621841430664,
"objective/kl": 166.16497802734375,
"objective/non_score_reward": -8.308249473571777,
"objective/rlhf_reward": -7.081686973571777,
"objective/scores": 1.2265625,
"policy/approxkl_avg": 3.1813056468963623,
"policy/clipfrac_avg": 0.1432783007621765,
"policy/entropy_avg": 0.7520714998245239,
"step": 49,
"val/clipfrac_avg": 0.1845518946647644,
"val/num_eos_tokens": 0,
"val/ratio": 0.8848077654838562,
"val/ratio_var": 0.0005292592104524374
},
{
"episode": 800,
"epoch": 1.4035087719298245,
"eps": 3,
"loss/policy_avg": 0.015007663518190384,
"loss/value_avg": 0.918763279914856,
"lr": 7.642990654205608e-06,
"objective/entropy": 17.82724380493164,
"objective/kl": 179.90628051757812,
"objective/non_score_reward": -8.99531364440918,
"objective/rlhf_reward": -7.37812614440918,
"objective/scores": 1.6171875,
"policy/approxkl_avg": 5.0764312744140625,
"policy/clipfrac_avg": 0.1149764209985733,
"policy/entropy_avg": 0.6353041529655457,
"step": 50,
"val/clipfrac_avg": 0.014740565791726112,
"val/num_eos_tokens": 0,
"val/ratio": 0.9066751599311829,
"val/ratio_var": 0.00023197307018563151
},
{
"episode": 816,
"epoch": 1.431578947368421,
"eps": 3,
"loss/policy_avg": 0.013827439397573471,
"loss/value_avg": 0.6559504270553589,
"lr": 7.511214953271027e-06,
"objective/entropy": 17.98678207397461,
"objective/kl": 173.85345458984375,
"objective/non_score_reward": -8.692672729492188,
"objective/rlhf_reward": -7.0911102294921875,
"objective/scores": 1.6015625,
"policy/approxkl_avg": 4.863851547241211,
"policy/clipfrac_avg": 0.12382075190544128,
"policy/entropy_avg": 0.6412214040756226,
"step": 51,
"val/clipfrac_avg": 0.03419811278581619,
"val/num_eos_tokens": 0,
"val/ratio": 0.8822081089019775,
"val/ratio_var": 5.379146841733018e-06
},
{
"episode": 832,
"epoch": 1.4596491228070176,
"eps": 3,
"loss/policy_avg": 0.009551126509904861,
"loss/value_avg": 0.46615320444107056,
"lr": 7.379439252336448e-06,
"objective/entropy": 14.611004829406738,
"objective/kl": 169.49464416503906,
"objective/non_score_reward": -8.4747314453125,
"objective/rlhf_reward": -7.0137939453125,
"objective/scores": 1.4609375,
"policy/approxkl_avg": 4.765644073486328,
"policy/clipfrac_avg": 0.09964622557163239,
"policy/entropy_avg": 0.5645979642868042,
"step": 52,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9012112021446228,
"val/ratio_var": 1.3516472790797707e-05
},
{
"episode": 848,
"epoch": 1.487719298245614,
"eps": 3,
"loss/policy_avg": 0.009903261438012123,
"loss/value_avg": 1.024169683456421,
"lr": 7.2476635514018685e-06,
"objective/entropy": 16.012420654296875,
"objective/kl": 173.99917602539062,
"objective/non_score_reward": -8.699958801269531,
"objective/rlhf_reward": -6.371833801269531,
"objective/scores": 2.328125,
"policy/approxkl_avg": 4.889924049377441,
"policy/clipfrac_avg": 0.11261792480945587,
"policy/entropy_avg": 0.5950597524642944,
"step": 53,
"val/clipfrac_avg": 0.20518869161605835,
"val/num_eos_tokens": 0,
"val/ratio": 0.8948594331741333,
"val/ratio_var": 6.705896521452814e-05
},
{
"episode": 864,
"epoch": 1.5157894736842106,
"eps": 3,
"loss/policy_avg": -0.007397271227091551,
"loss/value_avg": 0.4296436011791229,
"lr": 7.115887850467289e-06,
"objective/entropy": 12.315929412841797,
"objective/kl": 175.46511840820312,
"objective/non_score_reward": -8.773256301879883,
"objective/rlhf_reward": -6.570131301879883,
"objective/scores": 2.203125,
"policy/approxkl_avg": 5.532078742980957,
"policy/clipfrac_avg": 0.09375,
"policy/entropy_avg": 0.5138251781463623,
"step": 54,
"val/clipfrac_avg": 0.003537735901772976,
"val/num_eos_tokens": 0,
"val/ratio": 0.9130541086196899,
"val/ratio_var": 8.332962897839025e-05
},
{
"episode": 880,
"epoch": 1.543859649122807,
"eps": 3,
"loss/policy_avg": 0.0055680545046925545,
"loss/value_avg": 0.44329357147216797,
"lr": 6.9841121495327106e-06,
"objective/entropy": 13.753085136413574,
"objective/kl": 162.1046905517578,
"objective/non_score_reward": -8.10523509979248,
"objective/rlhf_reward": -6.1052350997924805,
"objective/scores": 2.0,
"policy/approxkl_avg": 4.2778730392456055,
"policy/clipfrac_avg": 0.10200471431016922,
"policy/entropy_avg": 0.538252592086792,
"step": 55,
"val/clipfrac_avg": 0.000589622650295496,
"val/num_eos_tokens": 0,
"val/ratio": 0.89923495054245,
"val/ratio_var": 0.00013667276652995497
},
{
"episode": 896,
"epoch": 1.5719298245614035,
"eps": 3,
"loss/policy_avg": 0.0029763877391815186,
"loss/value_avg": 0.5969923734664917,
"lr": 6.852336448598131e-06,
"objective/entropy": 10.386423110961914,
"objective/kl": 170.64817810058594,
"objective/non_score_reward": -8.53240966796875,
"objective/rlhf_reward": -5.84490966796875,
"objective/scores": 2.6875,
"policy/approxkl_avg": 5.515145301818848,
"policy/clipfrac_avg": 0.0695754736661911,
"policy/entropy_avg": 0.4759911596775055,
"step": 56,
"val/clipfrac_avg": 0.22051887214183807,
"val/num_eos_tokens": 0,
"val/ratio": 0.9136029481887817,
"val/ratio_var": 0.00015939133299980313
},
{
"episode": 912,
"epoch": 1.6,
"eps": 3,
"loss/policy_avg": -0.0002519981935620308,
"loss/value_avg": 0.6188120245933533,
"lr": 6.720560747663552e-06,
"objective/entropy": 9.047847747802734,
"objective/kl": 162.95162963867188,
"objective/non_score_reward": -8.147581100463867,
"objective/rlhf_reward": -5.835081100463867,
"objective/scores": 2.3125,
"policy/approxkl_avg": 5.942928314208984,
"policy/clipfrac_avg": 0.06721697747707367,
"policy/entropy_avg": 0.43892478942871094,
"step": 57,
"val/clipfrac_avg": 0.03242924436926842,
"val/num_eos_tokens": 0,
"val/ratio": 0.9175019264221191,
"val/ratio_var": 2.5528926926199347e-05
},
{
"episode": 928,
"epoch": 1.6280701754385964,
"eps": 3,
"loss/policy_avg": -0.004241641610860825,
"loss/value_avg": 0.6380342245101929,
"lr": 6.588785046728972e-06,
"objective/entropy": 10.172576904296875,
"objective/kl": 172.64210510253906,
"objective/non_score_reward": -8.632105827331543,
"objective/rlhf_reward": -6.085230827331543,
"objective/scores": 2.546875,
"policy/approxkl_avg": 5.1512861251831055,
"policy/clipfrac_avg": 0.09669811278581619,
"policy/entropy_avg": 0.44444799423217773,
"step": 58,
"val/clipfrac_avg": 0.00294811325147748,
"val/num_eos_tokens": 0,
"val/ratio": 0.9051207304000854,
"val/ratio_var": 9.685073746368289e-05
},
{
"episode": 944,
"epoch": 1.656140350877193,
"eps": 3,
"loss/policy_avg": 0.005844447761774063,
"loss/value_avg": 0.46530038118362427,
"lr": 6.457009345794393e-06,
"objective/entropy": 11.34018611907959,
"objective/kl": 167.05087280273438,
"objective/non_score_reward": -8.352543830871582,
"objective/rlhf_reward": -5.368168830871582,
"objective/scores": 2.984375,
"policy/approxkl_avg": 4.73173713684082,
"policy/clipfrac_avg": 0.06898584961891174,
"policy/entropy_avg": 0.4987587630748749,
"step": 59,
"val/clipfrac_avg": 0.003537735901772976,
"val/num_eos_tokens": 0,
"val/ratio": 0.9096221327781677,
"val/ratio_var": 0.0002903940330725163
},
{
"episode": 960,
"epoch": 1.6842105263157894,
"eps": 3,
"loss/policy_avg": 0.0015796682564541698,
"loss/value_avg": 0.5465973615646362,
"lr": 6.3252336448598135e-06,
"objective/entropy": 10.832345962524414,
"objective/kl": 166.35125732421875,
"objective/non_score_reward": -8.317562103271484,
"objective/rlhf_reward": -5.114437103271484,
"objective/scores": 3.203125,
"policy/approxkl_avg": 4.080867767333984,
"policy/clipfrac_avg": 0.08726415038108826,
"policy/entropy_avg": 0.46615108847618103,
"step": 60,
"val/clipfrac_avg": 0.018278302624821663,
"val/num_eos_tokens": 0,
"val/ratio": 0.9084208011627197,
"val/ratio_var": 1.8292890672455542e-05
},
{
"episode": 976,
"epoch": 1.712280701754386,
"eps": 3,
"loss/policy_avg": -0.0016184533014893532,
"loss/value_avg": 0.6316072344779968,
"lr": 6.193457943925234e-06,
"objective/entropy": 9.0885648727417,
"objective/kl": 172.646240234375,
"objective/non_score_reward": -8.632311820983887,
"objective/rlhf_reward": -5.194811820983887,
"objective/scores": 3.4375,
"policy/approxkl_avg": 4.502593994140625,
"policy/clipfrac_avg": 0.06603773683309555,
"policy/entropy_avg": 0.41100969910621643,
"step": 61,
"val/clipfrac_avg": 0.044811319559812546,
"val/num_eos_tokens": 0,
"val/ratio": 0.9256702661514282,
"val/ratio_var": 7.894221198512241e-05
},
{
"episode": 992,
"epoch": 1.7403508771929825,
"eps": 3,
"loss/policy_avg": -0.0019415542483329773,
"loss/value_avg": 0.6046911478042603,
"lr": 6.061682242990655e-06,
"objective/entropy": 9.12926197052002,
"objective/kl": 169.4315185546875,
"objective/non_score_reward": -8.471575736999512,
"objective/rlhf_reward": -5.424700736999512,
"objective/scores": 3.046875,
"policy/approxkl_avg": 5.609973907470703,
"policy/clipfrac_avg": 0.09198112785816193,
"policy/entropy_avg": 0.4236205816268921,
"step": 62,
"val/clipfrac_avg": 0.001768867950886488,
"val/num_eos_tokens": 0,
"val/ratio": 0.9198966026306152,
"val/ratio_var": 6.228529673535377e-05
},
{
"episode": 1008,
"epoch": 1.768421052631579,
"eps": 3,
"loss/policy_avg": -0.007835395634174347,
"loss/value_avg": 0.6853305697441101,
"lr": 5.929906542056075e-06,
"objective/entropy": 8.566083908081055,
"objective/kl": 163.68191528320312,
"objective/non_score_reward": -8.18409538269043,
"objective/rlhf_reward": -4.09034538269043,
"objective/scores": 4.09375,
"policy/approxkl_avg": 3.7664973735809326,
"policy/clipfrac_avg": 0.07429245114326477,
"policy/entropy_avg": 0.41426771879196167,
"step": 63,
"val/clipfrac_avg": 0.007665094453841448,
"val/num_eos_tokens": 0,
"val/ratio": 0.9395467042922974,
"val/ratio_var": 0.00018259203352499753
},
{
"episode": 1024,
"epoch": 1.7964912280701755,
"eps": 3,
"loss/policy_avg": 0.0056846365332603455,
"loss/value_avg": 0.8050791621208191,
"lr": 5.798130841121495e-06,
"objective/entropy": 7.867904186248779,
"objective/kl": 176.44961547851562,
"objective/non_score_reward": -8.822481155395508,
"objective/rlhf_reward": -4.931856155395508,
"objective/scores": 3.890625,
"policy/approxkl_avg": 4.615470886230469,
"policy/clipfrac_avg": 0.07016509771347046,
"policy/entropy_avg": 0.40076911449432373,
"step": 64,
"val/clipfrac_avg": 0.1179245263338089,
"val/num_eos_tokens": 0,
"val/ratio": 0.9168256521224976,
"val/ratio_var": 1.1071170774812344e-05
},
{
"episode": 1040,
"epoch": 1.8245614035087718,
"eps": 3,
"loss/policy_avg": -0.004829235374927521,
"loss/value_avg": 0.7683409452438354,
"lr": 5.666355140186916e-06,
"objective/entropy": 8.73065185546875,
"objective/kl": 165.93441772460938,
"objective/non_score_reward": -8.296720504760742,
"objective/rlhf_reward": -4.531095504760742,
"objective/scores": 3.765625,
"policy/approxkl_avg": 4.037623882293701,
"policy/clipfrac_avg": 0.0625,
"policy/entropy_avg": 0.38483142852783203,
"step": 65,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9364030361175537,
"val/ratio_var": 0.0001283105229958892
},
{
"episode": 1056,
"epoch": 1.8526315789473684,
"eps": 3,
"loss/policy_avg": -0.002082128543406725,
"loss/value_avg": 0.8781827688217163,
"lr": 5.534579439252336e-06,
"objective/entropy": 6.81689977645874,
"objective/kl": 173.76760864257812,
"objective/non_score_reward": -8.688380241394043,
"objective/rlhf_reward": -5.454005241394043,
"objective/scores": 3.234375,
"policy/approxkl_avg": 5.1825032234191895,
"policy/clipfrac_avg": 0.07488207519054413,
"policy/entropy_avg": 0.3771995007991791,
"step": 66,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9298049211502075,
"val/ratio_var": 0.00015954735863488168
},
{
"episode": 1072,
"epoch": 1.880701754385965,
"eps": 3,
"loss/policy_avg": 0.005034355446696281,
"loss/value_avg": 1.0226874351501465,
"lr": 5.402803738317757e-06,
"objective/entropy": 5.308557510375977,
"objective/kl": 171.6015167236328,
"objective/non_score_reward": -8.580076217651367,
"objective/rlhf_reward": -4.236326217651367,
"objective/scores": 4.34375,
"policy/approxkl_avg": 5.336367607116699,
"policy/clipfrac_avg": 0.04599056765437126,
"policy/entropy_avg": 0.34400177001953125,
"step": 67,
"val/clipfrac_avg": 0.000589622650295496,
"val/num_eos_tokens": 0,
"val/ratio": 0.9246993064880371,
"val/ratio_var": 9.672338592281449e-07
},
{
"episode": 1088,
"epoch": 1.9087719298245613,
"eps": 3,
"loss/policy_avg": 0.023275576531887054,
"loss/value_avg": 0.6750494241714478,
"lr": 5.271028037383177e-06,
"objective/entropy": 7.23941707611084,
"objective/kl": 166.45547485351562,
"objective/non_score_reward": -8.322773933410645,
"objective/rlhf_reward": -4.6352739334106445,
"objective/scores": 3.6875,
"policy/approxkl_avg": 3.1369752883911133,
"policy/clipfrac_avg": 0.05837263911962509,
"policy/entropy_avg": 0.3996211886405945,
"step": 68,
"val/clipfrac_avg": 0.000589622650295496,
"val/num_eos_tokens": 0,
"val/ratio": 0.9343756437301636,
"val/ratio_var": 0.00011849942529806867
},
{
"episode": 1104,
"epoch": 1.936842105263158,
"eps": 3,
"loss/policy_avg": 0.001583978533744812,
"loss/value_avg": 0.7364473342895508,
"lr": 5.139252336448598e-06,
"objective/entropy": 8.292254447937012,
"objective/kl": 174.10446166992188,
"objective/non_score_reward": -8.705223083496094,
"objective/rlhf_reward": -4.517723083496094,
"objective/scores": 4.1875,
"policy/approxkl_avg": 5.407079696655273,
"policy/clipfrac_avg": 0.06780660152435303,
"policy/entropy_avg": 0.3910168409347534,
"step": 69,
"val/clipfrac_avg": 0.001179245300590992,
"val/num_eos_tokens": 0,
"val/ratio": 0.9262620210647583,
"val/ratio_var": 8.509035978931934e-06
},
{
"episode": 1120,
"epoch": 1.9649122807017543,
"eps": 3,
"loss/policy_avg": 0.014011572115123272,
"loss/value_avg": 0.49188750982284546,
"lr": 5.0074766355140185e-06,
"objective/entropy": 4.73923397064209,
"objective/kl": 170.3909912109375,
"objective/non_score_reward": -8.519549369812012,
"objective/rlhf_reward": -4.535174369812012,
"objective/scores": 3.984375,
"policy/approxkl_avg": 4.553505897521973,
"policy/clipfrac_avg": 0.04658018797636032,
"policy/entropy_avg": 0.314146488904953,
"step": 70,
"val/clipfrac_avg": 0.001768867950886488,
"val/num_eos_tokens": 0,
"val/ratio": 0.9345220327377319,
"val/ratio_var": 0.00011590120993787423
},
{
"episode": 1136,
"epoch": 1.9929824561403509,
"eps": 3,
"loss/policy_avg": 0.014443885535001755,
"loss/value_avg": 0.8583539724349976,
"lr": 4.875700934579439e-06,
"objective/entropy": 6.110556602478027,
"objective/kl": 168.1246337890625,
"objective/non_score_reward": -8.406231880187988,
"objective/rlhf_reward": -4.781231880187988,
"objective/scores": 3.625,
"policy/approxkl_avg": 3.3112387657165527,
"policy/clipfrac_avg": 0.04716981202363968,
"policy/entropy_avg": 0.3741912841796875,
"step": 71,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9311293363571167,
"val/ratio_var": 1.7129657862824388e-05
},
{
"episode": 1152,
"epoch": 2.0210526315789474,
"eps": 3,
"loss/policy_avg": 0.0069357771426439285,
"loss/value_avg": 0.6024092435836792,
"lr": 4.74392523364486e-06,
"objective/entropy": 1.9080017805099487,
"objective/kl": 175.54367065429688,
"objective/non_score_reward": -8.777183532714844,
"objective/rlhf_reward": -3.7771835327148438,
"objective/scores": 5.0,
"policy/approxkl_avg": 6.433887004852295,
"policy/clipfrac_avg": 0.03655660152435303,
"policy/entropy_avg": 0.2685927748680115,
"step": 72,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9363144040107727,
"val/ratio_var": 2.7767631763708778e-05
},
{
"episode": 1168,
"epoch": 2.049122807017544,
"eps": 3,
"loss/policy_avg": 0.004744451027363539,
"loss/value_avg": 0.6521505117416382,
"lr": 4.61214953271028e-06,
"objective/entropy": 2.584568500518799,
"objective/kl": 171.61709594726562,
"objective/non_score_reward": -8.580854415893555,
"objective/rlhf_reward": -3.1121044158935547,
"objective/scores": 5.46875,
"policy/approxkl_avg": 4.509120941162109,
"policy/clipfrac_avg": 0.03478773683309555,
"policy/entropy_avg": 0.2757822573184967,
"step": 73,
"val/clipfrac_avg": 0.2900943160057068,
"val/num_eos_tokens": 0,
"val/ratio": 0.9448769092559814,
"val/ratio_var": 1.143478584708646e-05
},
{
"episode": 1184,
"epoch": 2.07719298245614,
"eps": 3,
"loss/policy_avg": 0.004101068712770939,
"loss/value_avg": 0.44417738914489746,
"lr": 4.480373831775701e-06,
"objective/entropy": 3.265643835067749,
"objective/kl": 179.89352416992188,
"objective/non_score_reward": -8.99467658996582,
"objective/rlhf_reward": -4.11967658996582,
"objective/scores": 4.875,
"policy/approxkl_avg": 5.798920154571533,
"policy/clipfrac_avg": 0.028891509398818016,
"policy/entropy_avg": 0.29206639528274536,
"step": 74,
"val/clipfrac_avg": 0.05188679322600365,
"val/num_eos_tokens": 0,
"val/ratio": 0.9267533421516418,
"val/ratio_var": 0.0001095838742912747
},
{
"episode": 1200,
"epoch": 2.1052631578947367,
"eps": 3,
"loss/policy_avg": 0.004760343115776777,
"loss/value_avg": 0.3549901843070984,
"lr": 4.3485981308411215e-06,
"objective/entropy": 2.9447989463806152,
"objective/kl": 175.41961669921875,
"objective/non_score_reward": -8.770980834960938,
"objective/rlhf_reward": -3.2084808349609375,
"objective/scores": 5.5625,
"policy/approxkl_avg": 4.80606746673584,
"policy/clipfrac_avg": 0.03419811278581619,
"policy/entropy_avg": 0.30916815996170044,
"step": 75,
"val/clipfrac_avg": 0.004127358552068472,
"val/num_eos_tokens": 0,
"val/ratio": 0.9180092215538025,
"val/ratio_var": 2.3069829694577493e-05
},
{
"episode": 1216,
"epoch": 2.1333333333333333,
"eps": 3,
"loss/policy_avg": 0.010298425331711769,
"loss/value_avg": 0.15927723050117493,
"lr": 4.216822429906542e-06,
"objective/entropy": 1.4227180480957031,
"objective/kl": 176.0067138671875,
"objective/non_score_reward": -8.800336837768555,
"objective/rlhf_reward": -2.6753368377685547,
"objective/scores": 6.125,
"policy/approxkl_avg": 4.99057149887085,
"policy/clipfrac_avg": 0.028301887214183807,
"policy/entropy_avg": 0.2751670479774475,
"step": 76,
"val/clipfrac_avg": 0.10495282709598541,
"val/num_eos_tokens": 0,
"val/ratio": 0.9201045036315918,
"val/ratio_var": 5.366753157431958e-06
},
{
"episode": 1232,
"epoch": 2.16140350877193,
"eps": 3,
"loss/policy_avg": -0.005462624132633209,
"loss/value_avg": 0.28704196214675903,
"lr": 4.085046728971963e-06,
"objective/entropy": 1.6171071529388428,
"objective/kl": 176.83685302734375,
"objective/non_score_reward": -8.841842651367188,
"objective/rlhf_reward": -3.1543426513671875,
"objective/scores": 5.6875,
"policy/approxkl_avg": 5.847208023071289,
"policy/clipfrac_avg": 0.028891509398818016,
"policy/entropy_avg": 0.286138117313385,
"step": 77,
"val/clipfrac_avg": 0.07075471431016922,
"val/num_eos_tokens": 0,
"val/ratio": 0.9195102453231812,
"val/ratio_var": 9.577343917044345e-06
},
{
"episode": 1248,
"epoch": 2.1894736842105265,
"eps": 3,
"loss/policy_avg": 0.0010141655802726746,
"loss/value_avg": 0.8408201932907104,
"lr": 3.953271028037383e-06,
"objective/entropy": 7.40260124206543,
"objective/kl": 177.4427490234375,
"objective/non_score_reward": -8.872137069702148,
"objective/rlhf_reward": -4.903387069702148,
"objective/scores": 3.96875,
"policy/approxkl_avg": 5.285105228424072,
"policy/clipfrac_avg": 0.04658018797636032,
"policy/entropy_avg": 0.4253733158111572,
"step": 78,
"val/clipfrac_avg": 0.000589622650295496,
"val/num_eos_tokens": 0,
"val/ratio": 0.9203585982322693,
"val/ratio_var": 1.65566543728346e-05
},
{
"episode": 1264,
"epoch": 2.2175438596491226,
"eps": 3,
"loss/policy_avg": -0.004624534398317337,
"loss/value_avg": 0.7719740271568298,
"lr": 3.821495327102804e-06,
"objective/entropy": 4.648886203765869,
"objective/kl": 181.51986694335938,
"objective/non_score_reward": -9.075994491577148,
"objective/rlhf_reward": -4.700994491577148,
"objective/scores": 4.375,
"policy/approxkl_avg": 4.547338485717773,
"policy/clipfrac_avg": 0.0383254736661911,
"policy/entropy_avg": 0.3513880968093872,
"step": 79,
"val/clipfrac_avg": 0.1291273534297943,
"val/num_eos_tokens": 0,
"val/ratio": 0.9286638498306274,
"val/ratio_var": 6.422147998819128e-05
},
{
"episode": 1280,
"epoch": 2.245614035087719,
"eps": 3,
"loss/policy_avg": 0.012380128726363182,
"loss/value_avg": 0.40563684701919556,
"lr": 3.689719626168224e-06,
"objective/entropy": 7.685408115386963,
"objective/kl": 168.90484619140625,
"objective/non_score_reward": -8.445242881774902,
"objective/rlhf_reward": -3.4452428817749023,
"objective/scores": 5.0,
"policy/approxkl_avg": 3.4143970012664795,
"policy/clipfrac_avg": 0.041273586452007294,
"policy/entropy_avg": 0.4100227355957031,
"step": 80,
"val/clipfrac_avg": 0.015330187976360321,
"val/num_eos_tokens": 0,
"val/ratio": 0.9217012524604797,
"val/ratio_var": 7.061174983391538e-05
},
{
"episode": 1296,
"epoch": 2.2736842105263158,
"eps": 3,
"loss/policy_avg": 0.011339722201228142,
"loss/value_avg": 0.3490160405635834,
"lr": 3.5579439252336446e-06,
"objective/entropy": 4.046834945678711,
"objective/kl": 177.92718505859375,
"objective/non_score_reward": -8.89635944366455,
"objective/rlhf_reward": -3.958859443664551,
"objective/scores": 4.9375,
"policy/approxkl_avg": 5.583766460418701,
"policy/clipfrac_avg": 0.03478773683309555,
"policy/entropy_avg": 0.3193933963775635,
"step": 81,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9363265037536621,
"val/ratio_var": 5.8069101214641705e-05
},
{
"episode": 1312,
"epoch": 2.3017543859649123,
"eps": 3,
"loss/policy_avg": 0.007465606089681387,
"loss/value_avg": 0.3137081563472748,
"lr": 3.4261682242990656e-06,
"objective/entropy": 3.293423652648926,
"objective/kl": 173.18377685546875,
"objective/non_score_reward": -8.659189224243164,
"objective/rlhf_reward": -3.065439224243164,
"objective/scores": 5.59375,
"policy/approxkl_avg": 4.794089317321777,
"policy/clipfrac_avg": 0.0383254736661911,
"policy/entropy_avg": 0.29685914516448975,
"step": 82,
"val/clipfrac_avg": 0.000589622650295496,
"val/num_eos_tokens": 0,
"val/ratio": 0.9458816647529602,
"val/ratio_var": 0.00023432180751115084
},
{
"episode": 1328,
"epoch": 2.329824561403509,
"eps": 3,
"loss/policy_avg": -0.00093865767121315,
"loss/value_avg": 0.9402576684951782,
"lr": 3.294392523364486e-06,
"objective/entropy": 5.09280252456665,
"objective/kl": 173.88351440429688,
"objective/non_score_reward": -8.694175720214844,
"objective/rlhf_reward": -4.866050720214844,
"objective/scores": 3.828125,
"policy/approxkl_avg": 3.8168904781341553,
"policy/clipfrac_avg": 0.03891509398818016,
"policy/entropy_avg": 0.35750845074653625,
"step": 83,
"val/clipfrac_avg": 0.000589622650295496,
"val/num_eos_tokens": 0,
"val/ratio": 0.9232673645019531,
"val/ratio_var": 4.1539384255884215e-05
},
{
"episode": 1344,
"epoch": 2.357894736842105,
"eps": 3,
"loss/policy_avg": -0.007357731461524963,
"loss/value_avg": 0.36178284883499146,
"lr": 3.1626168224299067e-06,
"objective/entropy": 5.281716346740723,
"objective/kl": 179.2125701904297,
"objective/non_score_reward": -8.960628509521484,
"objective/rlhf_reward": -3.9918785095214844,
"objective/scores": 4.96875,
"policy/approxkl_avg": 4.461269378662109,
"policy/clipfrac_avg": 0.05365566164255142,
"policy/entropy_avg": 0.35694169998168945,
"step": 84,
"val/clipfrac_avg": 0.000589622650295496,
"val/num_eos_tokens": 0,
"val/ratio": 0.9178085923194885,
"val/ratio_var": 4.949720823788084e-05
},
{
"episode": 1360,
"epoch": 2.3859649122807016,
"eps": 3,
"loss/policy_avg": 0.0004696398973464966,
"loss/value_avg": 0.30143094062805176,
"lr": 3.0308411214953273e-06,
"objective/entropy": 2.755769729614258,
"objective/kl": 173.08140563964844,
"objective/non_score_reward": -8.654069900512695,
"objective/rlhf_reward": -2.9665699005126953,
"objective/scores": 5.6875,
"policy/approxkl_avg": 5.356992721557617,
"policy/clipfrac_avg": 0.03242924436926842,
"policy/entropy_avg": 0.282896488904953,
"step": 85,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9419326782226562,
"val/ratio_var": 3.9359238144243136e-05
},
{
"episode": 1376,
"epoch": 2.414035087719298,
"eps": 3,
"loss/policy_avg": 0.0008706599473953247,
"loss/value_avg": 0.5158276557922363,
"lr": 2.8990654205607475e-06,
"objective/entropy": 4.149250507354736,
"objective/kl": 173.71505737304688,
"objective/non_score_reward": -8.685752868652344,
"objective/rlhf_reward": -3.6857528686523438,
"objective/scores": 5.0,
"policy/approxkl_avg": 5.095344066619873,
"policy/clipfrac_avg": 0.030070755630731583,
"policy/entropy_avg": 0.3076534867286682,
"step": 86,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.93389892578125,
"val/ratio_var": 9.108168342208955e-06
},
{
"episode": 1392,
"epoch": 2.442105263157895,
"eps": 3,
"loss/policy_avg": -0.0013641200494021177,
"loss/value_avg": 0.46665364503860474,
"lr": 2.767289719626168e-06,
"objective/entropy": 3.9847404956817627,
"objective/kl": 171.97903442382812,
"objective/non_score_reward": -8.59895133972168,
"objective/rlhf_reward": -3.4114513397216797,
"objective/scores": 5.1875,
"policy/approxkl_avg": 4.758839130401611,
"policy/clipfrac_avg": 0.02771226316690445,
"policy/entropy_avg": 0.3068329691886902,
"step": 87,
"val/clipfrac_avg": 0.002358490601181984,
"val/num_eos_tokens": 0,
"val/ratio": 0.9303795099258423,
"val/ratio_var": 2.7705931643140502e-05
},
{
"episode": 1408,
"epoch": 2.4701754385964914,
"eps": 3,
"loss/policy_avg": -0.009293105453252792,
"loss/value_avg": 0.1374308168888092,
"lr": 2.6355140186915887e-06,
"objective/entropy": 2.8504319190979004,
"objective/kl": 178.8887176513672,
"objective/non_score_reward": -8.944437026977539,
"objective/rlhf_reward": -2.975687026977539,
"objective/scores": 5.96875,
"policy/approxkl_avg": 5.254701614379883,
"policy/clipfrac_avg": 0.026533018797636032,
"policy/entropy_avg": 0.2935040593147278,
"step": 88,
"val/clipfrac_avg": 0.002358490601181984,
"val/num_eos_tokens": 0,
"val/ratio": 0.9373711943626404,
"val/ratio_var": 1.766591776686255e-05
},
{
"episode": 1424,
"epoch": 2.498245614035088,
"eps": 3,
"loss/policy_avg": 0.00495288148522377,
"loss/value_avg": 0.20061969757080078,
"lr": 2.5037383177570093e-06,
"objective/entropy": 4.51104211807251,
"objective/kl": 169.7410125732422,
"objective/non_score_reward": -8.487051010131836,
"objective/rlhf_reward": -2.768301010131836,
"objective/scores": 5.71875,
"policy/approxkl_avg": 4.481791019439697,
"policy/clipfrac_avg": 0.03478773683309555,
"policy/entropy_avg": 0.3265501856803894,
"step": 89,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9414163827896118,
"val/ratio_var": 0.00011632378300419077
},
{
"episode": 1440,
"epoch": 2.526315789473684,
"eps": 3,
"loss/policy_avg": 0.00646105594933033,
"loss/value_avg": 0.42740941047668457,
"lr": 2.37196261682243e-06,
"objective/entropy": 3.2403650283813477,
"objective/kl": 176.238037109375,
"objective/non_score_reward": -8.811902046203613,
"objective/rlhf_reward": -3.9994020462036133,
"objective/scores": 4.8125,
"policy/approxkl_avg": 5.44842529296875,
"policy/clipfrac_avg": 0.01591981202363968,
"policy/entropy_avg": 0.2933845520019531,
"step": 90,
"val/clipfrac_avg": 0.000589622650295496,
"val/num_eos_tokens": 0,
"val/ratio": 0.9381667971611023,
"val/ratio_var": 9.369335020892322e-05
},
{
"episode": 1456,
"epoch": 2.5543859649122806,
"eps": 3,
"loss/policy_avg": -0.005389541387557983,
"loss/value_avg": 0.4948211908340454,
"lr": 2.2401869158878504e-06,
"objective/entropy": 2.898387908935547,
"objective/kl": 173.48486328125,
"objective/non_score_reward": -8.674242973327637,
"objective/rlhf_reward": -3.5179929733276367,
"objective/scores": 5.15625,
"policy/approxkl_avg": 4.66801643371582,
"policy/clipfrac_avg": 0.020636793226003647,
"policy/entropy_avg": 0.2913670837879181,
"step": 91,
"val/clipfrac_avg": 0.001179245300590992,
"val/num_eos_tokens": 0,
"val/ratio": 0.9378049373626709,
"val/ratio_var": 1.2327662261668593e-05
},
{
"episode": 1472,
"epoch": 2.5824561403508772,
"eps": 3,
"loss/policy_avg": -0.010267895646393299,
"loss/value_avg": 0.26834648847579956,
"lr": 2.108411214953271e-06,
"objective/entropy": 4.616816997528076,
"objective/kl": 171.12762451171875,
"objective/non_score_reward": -8.556382179260254,
"objective/rlhf_reward": -3.587632179260254,
"objective/scores": 4.96875,
"policy/approxkl_avg": 4.146580219268799,
"policy/clipfrac_avg": 0.041273586452007294,
"policy/entropy_avg": 0.34417253732681274,
"step": 92,
"val/clipfrac_avg": 0.000589622650295496,
"val/num_eos_tokens": 0,
"val/ratio": 0.9241670370101929,
"val/ratio_var": 4.9057460273616016e-05
},
{
"episode": 1488,
"epoch": 2.610526315789474,
"eps": 3,
"loss/policy_avg": 0.0006395354866981506,
"loss/value_avg": 0.7872554063796997,
"lr": 1.9766355140186916e-06,
"objective/entropy": 5.483046531677246,
"objective/kl": 169.1695098876953,
"objective/non_score_reward": -8.458476066589355,
"objective/rlhf_reward": -4.4741010665893555,
"objective/scores": 3.984375,
"policy/approxkl_avg": 2.8852078914642334,
"policy/clipfrac_avg": 0.032429248094558716,
"policy/entropy_avg": 0.35312554240226746,
"step": 93,
"val/clipfrac_avg": 0.001768867950886488,
"val/num_eos_tokens": 0,
"val/ratio": 0.9286659955978394,
"val/ratio_var": 2.8370095606078394e-05
},
{
"episode": 1504,
"epoch": 2.6385964912280704,
"eps": 3,
"loss/policy_avg": -0.00652042031288147,
"loss/value_avg": 0.17014235258102417,
"lr": 1.844859813084112e-06,
"objective/entropy": 2.737617015838623,
"objective/kl": 178.22747802734375,
"objective/non_score_reward": -8.911375045776367,
"objective/rlhf_reward": -3.286375045776367,
"objective/scores": 5.625,
"policy/approxkl_avg": 5.498225688934326,
"policy/clipfrac_avg": 0.028891511261463165,
"policy/entropy_avg": 0.28995558619499207,
"step": 94,
"val/clipfrac_avg": 0.001768867950886488,
"val/num_eos_tokens": 0,
"val/ratio": 0.9345540404319763,
"val/ratio_var": 1.2709216434814152e-06
},
{
"episode": 1520,
"epoch": 2.6666666666666665,
"eps": 3,
"loss/policy_avg": 0.007195580750703812,
"loss/value_avg": 0.40153437852859497,
"lr": 1.7130841121495328e-06,
"objective/entropy": 4.30942440032959,
"objective/kl": 176.95938110351562,
"objective/non_score_reward": -8.847970008850098,
"objective/rlhf_reward": -3.8479700088500977,
"objective/scores": 5.0,
"policy/approxkl_avg": 5.267421245574951,
"policy/clipfrac_avg": 0.03125,
"policy/entropy_avg": 0.31008654832839966,
"step": 95,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9336869716644287,
"val/ratio_var": 4.966981941834092e-05
},
{
"episode": 1536,
"epoch": 2.694736842105263,
"eps": 3,
"loss/policy_avg": 0.012591801583766937,
"loss/value_avg": 0.3597390055656433,
"lr": 1.5813084112149534e-06,
"objective/entropy": 5.459916591644287,
"objective/kl": 172.44110107421875,
"objective/non_score_reward": -8.622055053710938,
"objective/rlhf_reward": -3.6220550537109375,
"objective/scores": 5.0,
"policy/approxkl_avg": 4.5339765548706055,
"policy/clipfrac_avg": 0.03537736088037491,
"policy/entropy_avg": 0.3411254286766052,
"step": 96,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9462149739265442,
"val/ratio_var": 3.8459929783130065e-05
},
{
"episode": 1552,
"epoch": 2.7228070175438597,
"eps": 3,
"loss/policy_avg": -0.003356472123414278,
"loss/value_avg": 0.6434417963027954,
"lr": 1.4495327102803737e-06,
"objective/entropy": 5.633913516998291,
"objective/kl": 172.26502990722656,
"objective/non_score_reward": -8.613250732421875,
"objective/rlhf_reward": -4.363250732421875,
"objective/scores": 4.25,
"policy/approxkl_avg": 3.585165500640869,
"policy/clipfrac_avg": 0.03537735715508461,
"policy/entropy_avg": 0.34199586510658264,
"step": 97,
"val/clipfrac_avg": 0.001179245300590992,
"val/num_eos_tokens": 0,
"val/ratio": 0.9311625957489014,
"val/ratio_var": 6.935850979061797e-05
},
{
"episode": 1568,
"epoch": 2.7508771929824563,
"eps": 3,
"loss/policy_avg": -0.003898909315466881,
"loss/value_avg": 0.36550819873809814,
"lr": 1.3177570093457943e-06,
"objective/entropy": 4.281040191650391,
"objective/kl": 174.15972900390625,
"objective/non_score_reward": -8.707986831665039,
"objective/rlhf_reward": -3.614236831665039,
"objective/scores": 5.09375,
"policy/approxkl_avg": 5.1715850830078125,
"policy/clipfrac_avg": 0.02712264284491539,
"policy/entropy_avg": 0.3137935698032379,
"step": 98,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9350335597991943,
"val/ratio_var": 1.248767239303561e-05
},
{
"episode": 1584,
"epoch": 2.778947368421053,
"eps": 3,
"loss/policy_avg": 0.0017306804656982422,
"loss/value_avg": 0.2737918496131897,
"lr": 1.185981308411215e-06,
"objective/entropy": 5.210065841674805,
"objective/kl": 173.97068786621094,
"objective/non_score_reward": -8.69853401184082,
"objective/rlhf_reward": -3.8860340118408203,
"objective/scores": 4.8125,
"policy/approxkl_avg": 5.011469841003418,
"policy/clipfrac_avg": 0.04304245114326477,
"policy/entropy_avg": 0.3419041931629181,
"step": 99,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9419320225715637,
"val/ratio_var": 8.726042324269656e-06
},
{
"episode": 1600,
"epoch": 2.807017543859649,
"eps": 3,
"loss/policy_avg": -0.006221463903784752,
"loss/value_avg": 0.3625496029853821,
"lr": 1.0542056074766355e-06,
"objective/entropy": 3.721562623977661,
"objective/kl": 175.773193359375,
"objective/non_score_reward": -8.78865909576416,
"objective/rlhf_reward": -3.47615909576416,
"objective/scores": 5.3125,
"policy/approxkl_avg": 5.388751029968262,
"policy/clipfrac_avg": 0.03419811278581619,
"policy/entropy_avg": 0.29315799474716187,
"step": 100,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.949840784072876,
"val/ratio_var": 3.881535303662531e-05
},
{
"episode": 1616,
"epoch": 2.8350877192982455,
"eps": 3,
"loss/policy_avg": -0.005170758813619614,
"loss/value_avg": 0.4136154055595398,
"lr": 9.22429906542056e-07,
"objective/entropy": 5.907715320587158,
"objective/kl": 171.47390747070312,
"objective/non_score_reward": -8.573695182800293,
"objective/rlhf_reward": -4.167445182800293,
"objective/scores": 4.40625,
"policy/approxkl_avg": 5.403087615966797,
"policy/clipfrac_avg": 0.03478773683309555,
"policy/entropy_avg": 0.33564049005508423,
"step": 101,
"val/clipfrac_avg": 0.000589622650295496,
"val/num_eos_tokens": 0,
"val/ratio": 0.9340347051620483,
"val/ratio_var": 3.0960076401242986e-05
},
{
"episode": 1632,
"epoch": 2.863157894736842,
"eps": 3,
"loss/policy_avg": 0.002457182854413986,
"loss/value_avg": 0.27742013335227966,
"lr": 7.906542056074767e-07,
"objective/entropy": 5.222499370574951,
"objective/kl": 176.05380249023438,
"objective/non_score_reward": -8.802690505981445,
"objective/rlhf_reward": -3.6151905059814453,
"objective/scores": 5.1875,
"policy/approxkl_avg": 4.675132751464844,
"policy/clipfrac_avg": 0.04304245114326477,
"policy/entropy_avg": 0.3403066396713257,
"step": 102,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9404515027999878,
"val/ratio_var": 7.289678615052253e-05
},
{
"episode": 1648,
"epoch": 2.8912280701754387,
"eps": 3,
"loss/policy_avg": -0.006544323638081551,
"loss/value_avg": 0.29731422662734985,
"lr": 6.588785046728972e-07,
"objective/entropy": 4.219725608825684,
"objective/kl": 178.1557159423828,
"objective/non_score_reward": -8.90778636932373,
"objective/rlhf_reward": -3.8452863693237305,
"objective/scores": 5.0625,
"policy/approxkl_avg": 5.953890800476074,
"policy/clipfrac_avg": 0.0383254699409008,
"policy/entropy_avg": 0.31199511885643005,
"step": 103,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9407525658607483,
"val/ratio_var": 5.8047560742124915e-05
},
{
"episode": 1664,
"epoch": 2.9192982456140353,
"eps": 3,
"loss/policy_avg": -0.0011880630627274513,
"loss/value_avg": 0.21903052926063538,
"lr": 5.271028037383178e-07,
"objective/entropy": 5.557653427124023,
"objective/kl": 170.518798828125,
"objective/non_score_reward": -8.52593994140625,
"objective/rlhf_reward": -3.05718994140625,
"objective/scores": 5.46875,
"policy/approxkl_avg": 4.447786331176758,
"policy/clipfrac_avg": 0.0383254699409008,
"policy/entropy_avg": 0.33431151509284973,
"step": 104,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9298925399780273,
"val/ratio_var": 9.144405339611694e-06
},
{
"episode": 1680,
"epoch": 2.9473684210526314,
"eps": 3,
"loss/policy_avg": -0.0007513905875384808,
"loss/value_avg": 0.21037007868289948,
"lr": 3.9532710280373834e-07,
"objective/entropy": 3.900575876235962,
"objective/kl": 174.99456787109375,
"objective/non_score_reward": -8.74972915649414,
"objective/rlhf_reward": -3.5309791564941406,
"objective/scores": 5.21875,
"policy/approxkl_avg": 5.165627479553223,
"policy/clipfrac_avg": 0.028301887214183807,
"policy/entropy_avg": 0.2935909032821655,
"step": 105,
"val/clipfrac_avg": 0.001179245300590992,
"val/num_eos_tokens": 0,
"val/ratio": 0.9422957897186279,
"val/ratio_var": 2.6614558009896427e-05
},
{
"episode": 1696,
"epoch": 2.975438596491228,
"eps": 3,
"loss/policy_avg": -0.005426734685897827,
"loss/value_avg": 0.21496959030628204,
"lr": 2.635514018691589e-07,
"objective/entropy": 4.556634902954102,
"objective/kl": 173.05136108398438,
"objective/non_score_reward": -8.652568817138672,
"objective/rlhf_reward": -2.933818817138672,
"objective/scores": 5.71875,
"policy/approxkl_avg": 4.820314884185791,
"policy/clipfrac_avg": 0.04304245486855507,
"policy/entropy_avg": 0.31966692209243774,
"step": 106,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9327390193939209,
"val/ratio_var": 8.508096652803943e-05
},
{
"episode": 1712,
"epoch": 3.0035087719298246,
"eps": 3,
"loss/policy_avg": 0.0001406269147992134,
"loss/value_avg": 0.186610609292984,
"lr": 1.3177570093457944e-07,
"objective/entropy": 4.999897003173828,
"objective/kl": 173.25045776367188,
"objective/non_score_reward": -8.66252326965332,
"objective/rlhf_reward": -2.9437732696533203,
"objective/scores": 5.71875,
"policy/approxkl_avg": 4.337066173553467,
"policy/clipfrac_avg": 0.04716981202363968,
"policy/entropy_avg": 0.3558363914489746,
"step": 107,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9243938326835632,
"val/ratio_var": 0.000128799001686275
}
],
"logging_steps": 10,
"max_steps": 107,
"num_input_tokens_seen": 0,
"num_train_epochs": 3.0,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0,
"train_batch_size": null,
"trial_name": null,
"trial_params": null
}