output_iter1_ckpt / trainer_state.json
DavieLion's picture
Upload 11 files
40b19df verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.0,
"eval_steps": 100,
"global_step": 1650,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0036363636363636364,
"grad_norm": 1914.0669841335673,
"learning_rate": 3.03030303030303e-09,
"logits/generated": 2.8247313499450684,
"logits/real": 2.0536062717437744,
"logps/generated": -351.4994201660156,
"logps/real": -477.549072265625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/generated": 0.0,
"rewards/margins": 0.0,
"rewards/real": 0.0,
"step": 1
},
{
"epoch": 0.03636363636363636,
"grad_norm": 957.205299846335,
"learning_rate": 3.0303030303030305e-08,
"logits/generated": 2.8234548568725586,
"logits/real": 2.13620662689209,
"logps/generated": -728.49853515625,
"logps/real": -385.6576843261719,
"loss": 0.6834,
"rewards/accuracies": 0.4861111044883728,
"rewards/generated": -0.06369408965110779,
"rewards/margins": 0.06537667661905289,
"rewards/real": 0.001682590926066041,
"step": 10
},
{
"epoch": 0.07272727272727272,
"grad_norm": 186.10374621715866,
"learning_rate": 6.060606060606061e-08,
"logits/generated": 3.0520665645599365,
"logits/real": 2.3358449935913086,
"logps/generated": -814.5985717773438,
"logps/real": -449.64654541015625,
"loss": 0.5796,
"rewards/accuracies": 0.75,
"rewards/generated": -0.590421199798584,
"rewards/margins": 0.5598743557929993,
"rewards/real": -0.030546903610229492,
"step": 20
},
{
"epoch": 0.10909090909090909,
"grad_norm": 89.49394324770884,
"learning_rate": 9.09090909090909e-08,
"logits/generated": 3.1544127464294434,
"logits/real": 2.341243028640747,
"logps/generated": -614.533203125,
"logps/real": -494.87823486328125,
"loss": 0.4,
"rewards/accuracies": 0.949999988079071,
"rewards/generated": -1.4715439081192017,
"rewards/margins": 1.2540919780731201,
"rewards/real": -0.21745213866233826,
"step": 30
},
{
"epoch": 0.14545454545454545,
"grad_norm": 56.92903236170165,
"learning_rate": 1.2121212121212122e-07,
"logits/generated": 3.0897538661956787,
"logits/real": 2.354968309402466,
"logps/generated": -777.1187744140625,
"logps/real": -518.1546020507812,
"loss": 0.2207,
"rewards/accuracies": 1.0,
"rewards/generated": -4.063536167144775,
"rewards/margins": 3.547344923019409,
"rewards/real": -0.5161911845207214,
"step": 40
},
{
"epoch": 0.18181818181818182,
"grad_norm": 22.146909469971494,
"learning_rate": 1.5151515151515152e-07,
"logits/generated": 3.1673903465270996,
"logits/real": 2.656967878341675,
"logps/generated": -958.2801513671875,
"logps/real": -417.46026611328125,
"loss": 0.1115,
"rewards/accuracies": 1.0,
"rewards/generated": -9.105631828308105,
"rewards/margins": 8.369694709777832,
"rewards/real": -0.7359374165534973,
"step": 50
},
{
"epoch": 0.21818181818181817,
"grad_norm": 6.710513441399776,
"learning_rate": 1.818181818181818e-07,
"logits/generated": 3.143266439437866,
"logits/real": 2.533066749572754,
"logps/generated": -990.8375244140625,
"logps/real": -449.7086486816406,
"loss": 0.0388,
"rewards/accuracies": 1.0,
"rewards/generated": -12.80955696105957,
"rewards/margins": 11.602985382080078,
"rewards/real": -1.2065709829330444,
"step": 60
},
{
"epoch": 0.2545454545454545,
"grad_norm": 10.81059007018425,
"learning_rate": 2.121212121212121e-07,
"logits/generated": 3.335444688796997,
"logits/real": 2.734952449798584,
"logps/generated": -1207.6993408203125,
"logps/real": -473.37615966796875,
"loss": 0.0235,
"rewards/accuracies": 1.0,
"rewards/generated": -17.868305206298828,
"rewards/margins": 16.526086807250977,
"rewards/real": -1.3422199487686157,
"step": 70
},
{
"epoch": 0.2909090909090909,
"grad_norm": 5.322710291928728,
"learning_rate": 2.4242424242424244e-07,
"logits/generated": 3.3226451873779297,
"logits/real": 2.602600336074829,
"logps/generated": -1233.799560546875,
"logps/real": -419.0763244628906,
"loss": 0.0111,
"rewards/accuracies": 1.0,
"rewards/generated": -21.38240623474121,
"rewards/margins": 19.921634674072266,
"rewards/real": -1.4607731103897095,
"step": 80
},
{
"epoch": 0.32727272727272727,
"grad_norm": 2.803596256476958,
"learning_rate": 2.727272727272727e-07,
"logits/generated": 2.8833084106445312,
"logits/real": 2.695115804672241,
"logps/generated": -637.830078125,
"logps/real": -467.0171813964844,
"loss": 0.005,
"rewards/accuracies": 1.0,
"rewards/generated": -18.157733917236328,
"rewards/margins": 15.862768173217773,
"rewards/real": -2.2949652671813965,
"step": 90
},
{
"epoch": 0.36363636363636365,
"grad_norm": 1.5269803313685761,
"learning_rate": 3.0303030303030305e-07,
"logits/generated": 3.1768739223480225,
"logits/real": 2.9757938385009766,
"logps/generated": -777.4149169921875,
"logps/real": -472.0936584472656,
"loss": 0.0028,
"rewards/accuracies": 1.0,
"rewards/generated": -21.743295669555664,
"rewards/margins": 19.30459976196289,
"rewards/real": -2.438692808151245,
"step": 100
},
{
"epoch": 0.4,
"grad_norm": 0.8658527456401439,
"learning_rate": 3.333333333333333e-07,
"logits/generated": 3.320415496826172,
"logits/real": 2.9949562549591064,
"logps/generated": -881.67431640625,
"logps/real": -489.31378173828125,
"loss": 0.0013,
"rewards/accuracies": 1.0,
"rewards/generated": -26.766021728515625,
"rewards/margins": 24.18350601196289,
"rewards/real": -2.5825135707855225,
"step": 110
},
{
"epoch": 0.43636363636363634,
"grad_norm": 0.7431319758468002,
"learning_rate": 3.636363636363636e-07,
"logits/generated": 3.732408046722412,
"logits/real": 3.3026363849639893,
"logps/generated": -1488.354736328125,
"logps/real": -397.7535400390625,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/generated": -43.310401916503906,
"rewards/margins": 41.15972137451172,
"rewards/real": -2.1506762504577637,
"step": 120
},
{
"epoch": 0.4727272727272727,
"grad_norm": 0.3539310109847719,
"learning_rate": 3.939393939393939e-07,
"logits/generated": 3.5819716453552246,
"logits/real": 3.207610607147217,
"logps/generated": -1109.115478515625,
"logps/real": -468.93304443359375,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/generated": -35.37150955200195,
"rewards/margins": 33.080970764160156,
"rewards/real": -2.290536403656006,
"step": 130
},
{
"epoch": 0.509090909090909,
"grad_norm": 0.36998938203803255,
"learning_rate": 4.242424242424242e-07,
"logits/generated": 3.399541139602661,
"logits/real": 3.217557430267334,
"logps/generated": -786.1734619140625,
"logps/real": -448.0440368652344,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/generated": -24.668045043945312,
"rewards/margins": 22.3218994140625,
"rewards/real": -2.3461461067199707,
"step": 140
},
{
"epoch": 0.5454545454545454,
"grad_norm": 0.031003857964457825,
"learning_rate": 4.545454545454545e-07,
"logits/generated": 3.6583428382873535,
"logits/real": 3.1666476726531982,
"logps/generated": -1667.064453125,
"logps/real": -445.8917541503906,
"loss": 0.0008,
"rewards/accuracies": 1.0,
"rewards/generated": -56.291542053222656,
"rewards/margins": 53.78043746948242,
"rewards/real": -2.5111021995544434,
"step": 150
},
{
"epoch": 0.5818181818181818,
"grad_norm": 0.10216382354780827,
"learning_rate": 4.848484848484849e-07,
"logits/generated": 3.6542396545410156,
"logits/real": 3.454103469848633,
"logps/generated": -1330.52587890625,
"logps/real": -446.6952209472656,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/generated": -44.972652435302734,
"rewards/margins": 42.243167877197266,
"rewards/real": -2.7294909954071045,
"step": 160
},
{
"epoch": 0.6181818181818182,
"grad_norm": 0.5393067289689158,
"learning_rate": 4.983164983164983e-07,
"logits/generated": 3.8200392723083496,
"logits/real": 3.27642560005188,
"logps/generated": -1152.6976318359375,
"logps/real": -457.31158447265625,
"loss": 0.0047,
"rewards/accuracies": 1.0,
"rewards/generated": -40.51918029785156,
"rewards/margins": 37.82313537597656,
"rewards/real": -2.696047306060791,
"step": 170
},
{
"epoch": 0.6545454545454545,
"grad_norm": 0.1428440835513627,
"learning_rate": 4.949494949494949e-07,
"logits/generated": 3.628065586090088,
"logits/real": 3.3550992012023926,
"logps/generated": -922.5845947265625,
"logps/real": -401.19830322265625,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/generated": -32.044578552246094,
"rewards/margins": 29.7742977142334,
"rewards/real": -2.270280599594116,
"step": 180
},
{
"epoch": 0.6909090909090909,
"grad_norm": 0.3679331054154446,
"learning_rate": 4.915824915824915e-07,
"logits/generated": 3.8065967559814453,
"logits/real": 3.234556198120117,
"logps/generated": -1263.3641357421875,
"logps/real": -417.69512939453125,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/generated": -43.110328674316406,
"rewards/margins": 40.8560676574707,
"rewards/real": -2.254263401031494,
"step": 190
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.3074657387957313,
"learning_rate": 4.882154882154882e-07,
"logits/generated": 3.6317858695983887,
"logits/real": 3.2310867309570312,
"logps/generated": -977.9903564453125,
"logps/real": -551.634033203125,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/generated": -34.79268264770508,
"rewards/margins": 31.959827423095703,
"rewards/real": -2.832855701446533,
"step": 200
},
{
"epoch": 0.7636363636363637,
"grad_norm": 0.059293841086296165,
"learning_rate": 4.848484848484849e-07,
"logits/generated": 3.605592727661133,
"logits/real": 3.393489122390747,
"logps/generated": -1668.871337890625,
"logps/real": -449.1474609375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/generated": -60.01628494262695,
"rewards/margins": 57.65550994873047,
"rewards/real": -2.360779285430908,
"step": 210
},
{
"epoch": 0.8,
"grad_norm": 0.08236734575362854,
"learning_rate": 4.814814814814814e-07,
"logits/generated": 3.7269718647003174,
"logits/real": 3.6311733722686768,
"logps/generated": -1147.8819580078125,
"logps/real": -363.1935729980469,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/generated": -41.762962341308594,
"rewards/margins": 39.59125518798828,
"rewards/real": -2.1717114448547363,
"step": 220
},
{
"epoch": 0.8363636363636363,
"grad_norm": 0.06506668417151372,
"learning_rate": 4.781144781144781e-07,
"logits/generated": 3.6163909435272217,
"logits/real": 3.610844373703003,
"logps/generated": -1436.1513671875,
"logps/real": -456.0074157714844,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/generated": -52.98537063598633,
"rewards/margins": 49.76090621948242,
"rewards/real": -3.224468231201172,
"step": 230
},
{
"epoch": 0.8727272727272727,
"grad_norm": 0.03507021137362982,
"learning_rate": 4.7474747474747474e-07,
"logits/generated": 3.553302764892578,
"logits/real": 3.47314715385437,
"logps/generated": -1206.565185546875,
"logps/real": -426.70867919921875,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/generated": -46.37542724609375,
"rewards/margins": 43.096046447753906,
"rewards/real": -3.279383420944214,
"step": 240
},
{
"epoch": 0.9090909090909091,
"grad_norm": 3.960989766677836,
"learning_rate": 4.7138047138047136e-07,
"logits/generated": 3.4441192150115967,
"logits/real": 3.2921531200408936,
"logps/generated": -875.3850708007812,
"logps/real": -449.03936767578125,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/generated": -33.70497131347656,
"rewards/margins": 31.499170303344727,
"rewards/real": -2.2058050632476807,
"step": 250
},
{
"epoch": 0.9454545454545454,
"grad_norm": 0.1118397976534182,
"learning_rate": 4.68013468013468e-07,
"logits/generated": 3.552502393722534,
"logits/real": 3.365135669708252,
"logps/generated": -925.8385009765625,
"logps/real": -420.988525390625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/generated": -35.2255744934082,
"rewards/margins": 33.324729919433594,
"rewards/real": -1.9008433818817139,
"step": 260
},
{
"epoch": 0.9818181818181818,
"grad_norm": 0.0031177562577058694,
"learning_rate": 4.646464646464646e-07,
"logits/generated": 4.00087833404541,
"logits/real": 3.5007052421569824,
"logps/generated": -1584.2337646484375,
"logps/real": -434.68572998046875,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/generated": -60.17878341674805,
"rewards/margins": 57.7464714050293,
"rewards/real": -2.4323067665100098,
"step": 270
},
{
"epoch": 1.018181818181818,
"grad_norm": 0.05555093819949706,
"learning_rate": 4.612794612794613e-07,
"logits/generated": 3.8233509063720703,
"logits/real": 3.6865200996398926,
"logps/generated": -1354.489501953125,
"logps/real": -416.71820068359375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -50.17599868774414,
"rewards/margins": 48.03873825073242,
"rewards/real": -2.1372580528259277,
"step": 280
},
{
"epoch": 1.0545454545454545,
"grad_norm": 0.017890113281613322,
"learning_rate": 4.579124579124579e-07,
"logits/generated": 3.7898383140563965,
"logits/real": 3.3585867881774902,
"logps/generated": -840.5850830078125,
"logps/real": -517.4666748046875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -31.330394744873047,
"rewards/margins": 28.912433624267578,
"rewards/real": -2.4179651737213135,
"step": 290
},
{
"epoch": 1.0909090909090908,
"grad_norm": 0.02423097085406043,
"learning_rate": 4.545454545454545e-07,
"logits/generated": 3.695669651031494,
"logits/real": 3.6764883995056152,
"logps/generated": -1163.597412109375,
"logps/real": -468.89691162109375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/generated": -45.13127899169922,
"rewards/margins": 42.545928955078125,
"rewards/real": -2.5853497982025146,
"step": 300
},
{
"epoch": 1.1272727272727272,
"grad_norm": 0.0049484467104669565,
"learning_rate": 4.5117845117845114e-07,
"logits/generated": 3.732903242111206,
"logits/real": 3.7502872943878174,
"logps/generated": -1116.242919921875,
"logps/real": -363.4823303222656,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -42.95992660522461,
"rewards/margins": 40.67012023925781,
"rewards/real": -2.289808511734009,
"step": 310
},
{
"epoch": 1.1636363636363636,
"grad_norm": 0.009030461223581908,
"learning_rate": 4.478114478114478e-07,
"logits/generated": 3.9101524353027344,
"logits/real": 3.1595616340637207,
"logps/generated": -1227.79638671875,
"logps/real": -436.49359130859375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -46.5848503112793,
"rewards/margins": 44.355323791503906,
"rewards/real": -2.2295217514038086,
"step": 320
},
{
"epoch": 1.2,
"grad_norm": 0.003881099666210952,
"learning_rate": 4.444444444444444e-07,
"logits/generated": 3.598174571990967,
"logits/real": 3.6226792335510254,
"logps/generated": -1062.3441162109375,
"logps/real": -417.962158203125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -41.58740234375,
"rewards/margins": 39.050086975097656,
"rewards/real": -2.53731369972229,
"step": 330
},
{
"epoch": 1.2363636363636363,
"grad_norm": 0.11244938516399426,
"learning_rate": 4.4107744107744106e-07,
"logits/generated": 3.567678451538086,
"logits/real": 3.7249159812927246,
"logps/generated": -961.2346801757812,
"logps/real": -434.4712829589844,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -38.949554443359375,
"rewards/margins": 36.227928161621094,
"rewards/real": -2.721627950668335,
"step": 340
},
{
"epoch": 1.2727272727272727,
"grad_norm": 0.023911125609004182,
"learning_rate": 4.377104377104377e-07,
"logits/generated": 3.596735715866089,
"logits/real": 3.5958075523376465,
"logps/generated": -1215.8427734375,
"logps/real": -439.3638610839844,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -47.29825210571289,
"rewards/margins": 44.59672164916992,
"rewards/real": -2.701531171798706,
"step": 350
},
{
"epoch": 1.309090909090909,
"grad_norm": 0.004609330889680118,
"learning_rate": 4.3434343434343435e-07,
"logits/generated": 3.857773542404175,
"logits/real": 3.0869548320770264,
"logps/generated": -1652.0277099609375,
"logps/real": -462.31866455078125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -65.08594512939453,
"rewards/margins": 62.09027099609375,
"rewards/real": -2.995673656463623,
"step": 360
},
{
"epoch": 1.3454545454545455,
"grad_norm": 0.021695313180074092,
"learning_rate": 4.309764309764309e-07,
"logits/generated": 3.8632941246032715,
"logits/real": 3.208986759185791,
"logps/generated": -1602.880615234375,
"logps/real": -492.37872314453125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -63.631866455078125,
"rewards/margins": 60.887290954589844,
"rewards/real": -2.7445733547210693,
"step": 370
},
{
"epoch": 1.3818181818181818,
"grad_norm": 0.08009450934560616,
"learning_rate": 4.276094276094276e-07,
"logits/generated": 3.9127426147460938,
"logits/real": 3.5540339946746826,
"logps/generated": -1360.1510009765625,
"logps/real": -459.3062438964844,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -53.524253845214844,
"rewards/margins": 50.88030242919922,
"rewards/real": -2.6439599990844727,
"step": 380
},
{
"epoch": 1.4181818181818182,
"grad_norm": 0.015895453959011013,
"learning_rate": 4.242424242424242e-07,
"logits/generated": 3.659101963043213,
"logits/real": 3.691251277923584,
"logps/generated": -759.4293212890625,
"logps/real": -525.3631591796875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -31.192474365234375,
"rewards/margins": 28.309253692626953,
"rewards/real": -2.883220672607422,
"step": 390
},
{
"epoch": 1.4545454545454546,
"grad_norm": 0.002626726892908866,
"learning_rate": 4.208754208754209e-07,
"logits/generated": 3.7086901664733887,
"logits/real": 3.498619794845581,
"logps/generated": -1179.454833984375,
"logps/real": -460.157958984375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -48.267391204833984,
"rewards/margins": 45.48485565185547,
"rewards/real": -2.782532215118408,
"step": 400
},
{
"epoch": 1.490909090909091,
"grad_norm": 0.005161280214896955,
"learning_rate": 4.1750841750841746e-07,
"logits/generated": 3.730764389038086,
"logits/real": 3.284414291381836,
"logps/generated": -1389.023193359375,
"logps/real": -472.274658203125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -56.40633010864258,
"rewards/margins": 53.044944763183594,
"rewards/real": -3.3613810539245605,
"step": 410
},
{
"epoch": 1.5272727272727273,
"grad_norm": 0.01203825929109908,
"learning_rate": 4.1414141414141413e-07,
"logits/generated": 3.4772918224334717,
"logits/real": 3.7250218391418457,
"logps/generated": -1553.1036376953125,
"logps/real": -465.3892517089844,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -63.161415100097656,
"rewards/margins": 59.865386962890625,
"rewards/real": -3.29602313041687,
"step": 420
},
{
"epoch": 1.5636363636363635,
"grad_norm": 0.00954567611305199,
"learning_rate": 4.1077441077441075e-07,
"logits/generated": 4.080082893371582,
"logits/real": 3.5112476348876953,
"logps/generated": -1215.934326171875,
"logps/real": -410.541748046875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -47.794490814208984,
"rewards/margins": 45.22086715698242,
"rewards/real": -2.5736231803894043,
"step": 430
},
{
"epoch": 1.6,
"grad_norm": 0.0021520111594958385,
"learning_rate": 4.0740740740740737e-07,
"logits/generated": 3.6889781951904297,
"logits/real": 3.5634422302246094,
"logps/generated": -1326.2265625,
"logps/real": -418.804931640625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -53.98466110229492,
"rewards/margins": 51.10773849487305,
"rewards/real": -2.876925230026245,
"step": 440
},
{
"epoch": 1.6363636363636362,
"grad_norm": 0.0063222673485985,
"learning_rate": 4.04040404040404e-07,
"logits/generated": 3.561480760574341,
"logits/real": 3.5089659690856934,
"logps/generated": -1199.6884765625,
"logps/real": -423.364990234375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -48.4073486328125,
"rewards/margins": 45.80794143676758,
"rewards/real": -2.599404811859131,
"step": 450
},
{
"epoch": 1.6727272727272728,
"grad_norm": 0.000768791949879652,
"learning_rate": 4.0067340067340067e-07,
"logits/generated": 3.9211220741271973,
"logits/real": 3.7459423542022705,
"logps/generated": -1202.2457275390625,
"logps/real": -396.33935546875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -50.28257751464844,
"rewards/margins": 48.02632522583008,
"rewards/real": -2.256255865097046,
"step": 460
},
{
"epoch": 1.709090909090909,
"grad_norm": 0.01600856274211648,
"learning_rate": 3.973063973063973e-07,
"logits/generated": 3.9792861938476562,
"logits/real": 3.7603824138641357,
"logps/generated": -1093.778564453125,
"logps/real": -392.31646728515625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -43.18585205078125,
"rewards/margins": 40.816978454589844,
"rewards/real": -2.368870973587036,
"step": 470
},
{
"epoch": 1.7454545454545456,
"grad_norm": 0.006151439686373968,
"learning_rate": 3.939393939393939e-07,
"logits/generated": 4.160303115844727,
"logits/real": 3.79850435256958,
"logps/generated": -1032.3426513671875,
"logps/real": -418.443359375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -39.8023681640625,
"rewards/margins": 37.66193389892578,
"rewards/real": -2.1404361724853516,
"step": 480
},
{
"epoch": 1.7818181818181817,
"grad_norm": 0.0011853963068352326,
"learning_rate": 3.9057239057239053e-07,
"logits/generated": 3.737189531326294,
"logits/real": 3.491539478302002,
"logps/generated": -1278.316650390625,
"logps/real": -399.8204040527344,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -50.095821380615234,
"rewards/margins": 47.98601531982422,
"rewards/real": -2.1098055839538574,
"step": 490
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.005555711556241692,
"learning_rate": 3.872053872053872e-07,
"logits/generated": 3.7458102703094482,
"logits/real": 3.904841184616089,
"logps/generated": -1269.622802734375,
"logps/real": -502.83538818359375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -51.42805862426758,
"rewards/margins": 49.014373779296875,
"rewards/real": -2.4136805534362793,
"step": 500
},
{
"epoch": 1.8545454545454545,
"grad_norm": 0.02264974580958711,
"learning_rate": 3.8383838383838377e-07,
"logits/generated": 4.089097023010254,
"logits/real": 4.16144323348999,
"logps/generated": -1649.2330322265625,
"logps/real": -356.6659851074219,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -65.34940338134766,
"rewards/margins": 63.115943908691406,
"rewards/real": -2.2334671020507812,
"step": 510
},
{
"epoch": 1.8909090909090909,
"grad_norm": 0.0030795735098584744,
"learning_rate": 3.8047138047138045e-07,
"logits/generated": 3.60687255859375,
"logits/real": 3.62739634513855,
"logps/generated": -1154.067626953125,
"logps/real": -519.19677734375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -45.407779693603516,
"rewards/margins": 42.44924545288086,
"rewards/real": -2.958531379699707,
"step": 520
},
{
"epoch": 1.9272727272727272,
"grad_norm": 0.00743718788451984,
"learning_rate": 3.7710437710437707e-07,
"logits/generated": 4.003333568572998,
"logits/real": 3.6320443153381348,
"logps/generated": -1574.7633056640625,
"logps/real": -430.81146240234375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -63.63165283203125,
"rewards/margins": 60.943687438964844,
"rewards/real": -2.6879489421844482,
"step": 530
},
{
"epoch": 1.9636363636363636,
"grad_norm": 0.003581692627886563,
"learning_rate": 3.7373737373737374e-07,
"logits/generated": 4.060280799865723,
"logits/real": 3.6592605113983154,
"logps/generated": -1586.013916015625,
"logps/real": -422.39892578125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -64.56217956542969,
"rewards/margins": 61.98358917236328,
"rewards/real": -2.5785882472991943,
"step": 540
},
{
"epoch": 2.0,
"grad_norm": 0.004087463581422144,
"learning_rate": 3.703703703703703e-07,
"logits/generated": 3.9546761512756348,
"logits/real": 3.698378801345825,
"logps/generated": -1893.251220703125,
"logps/real": -402.0331726074219,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -77.40522003173828,
"rewards/margins": 74.73800659179688,
"rewards/real": -2.6672160625457764,
"step": 550
},
{
"epoch": 2.036363636363636,
"grad_norm": 0.0026569888763763455,
"learning_rate": 3.67003367003367e-07,
"logits/generated": 3.850944995880127,
"logits/real": 3.5789389610290527,
"logps/generated": -1203.314453125,
"logps/real": -477.9483337402344,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -51.28553009033203,
"rewards/margins": 47.97153854370117,
"rewards/real": -3.3139939308166504,
"step": 560
},
{
"epoch": 2.0727272727272728,
"grad_norm": 0.002300572924014686,
"learning_rate": 3.636363636363636e-07,
"logits/generated": 3.9447109699249268,
"logits/real": 3.6901111602783203,
"logps/generated": -1685.8695068359375,
"logps/real": -463.8533630371094,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -68.90098571777344,
"rewards/margins": 65.87489318847656,
"rewards/real": -3.0261025428771973,
"step": 570
},
{
"epoch": 2.109090909090909,
"grad_norm": 0.003583503892977523,
"learning_rate": 3.602693602693603e-07,
"logits/generated": 4.006188869476318,
"logits/real": 4.006190299987793,
"logps/generated": -1552.8453369140625,
"logps/real": -402.8440856933594,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -64.53579711914062,
"rewards/margins": 61.512725830078125,
"rewards/real": -3.0230753421783447,
"step": 580
},
{
"epoch": 2.1454545454545455,
"grad_norm": 0.007705162380818939,
"learning_rate": 3.5690235690235685e-07,
"logits/generated": 4.073629379272461,
"logits/real": 3.875983476638794,
"logps/generated": -1343.013427734375,
"logps/real": -382.87689208984375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -55.713523864746094,
"rewards/margins": 52.94609832763672,
"rewards/real": -2.7674267292022705,
"step": 590
},
{
"epoch": 2.1818181818181817,
"grad_norm": 0.013172669628685494,
"learning_rate": 3.535353535353535e-07,
"logits/generated": 4.253252983093262,
"logits/real": 3.769242525100708,
"logps/generated": -1818.175048828125,
"logps/real": -398.22308349609375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -75.47774505615234,
"rewards/margins": 72.8851318359375,
"rewards/real": -2.5926156044006348,
"step": 600
},
{
"epoch": 2.2181818181818183,
"grad_norm": 0.007024294960501395,
"learning_rate": 3.5016835016835014e-07,
"logits/generated": 3.699592113494873,
"logits/real": 3.579563856124878,
"logps/generated": -889.1650390625,
"logps/real": -469.04815673828125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -37.172508239746094,
"rewards/margins": 33.740169525146484,
"rewards/real": -3.432338237762451,
"step": 610
},
{
"epoch": 2.2545454545454544,
"grad_norm": 0.0009392103287631958,
"learning_rate": 3.4680134680134676e-07,
"logits/generated": 4.1535139083862305,
"logits/real": 3.8194522857666016,
"logps/generated": -1222.1712646484375,
"logps/real": -433.2972106933594,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -51.39905548095703,
"rewards/margins": 48.590370178222656,
"rewards/real": -2.808685541152954,
"step": 620
},
{
"epoch": 2.290909090909091,
"grad_norm": 0.0054695000716611505,
"learning_rate": 3.434343434343434e-07,
"logits/generated": 4.002943992614746,
"logits/real": 3.655449628829956,
"logps/generated": -1462.0904541015625,
"logps/real": -442.56365966796875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -60.3794059753418,
"rewards/margins": 57.272552490234375,
"rewards/real": -3.106855630874634,
"step": 630
},
{
"epoch": 2.327272727272727,
"grad_norm": 0.011303853467808675,
"learning_rate": 3.4006734006734006e-07,
"logits/generated": 3.58341908454895,
"logits/real": 3.253676176071167,
"logps/generated": -1504.0076904296875,
"logps/real": -466.627685546875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -62.34798049926758,
"rewards/margins": 59.16129684448242,
"rewards/real": -3.1866836547851562,
"step": 640
},
{
"epoch": 2.3636363636363638,
"grad_norm": 0.0009684613032072848,
"learning_rate": 3.3670033670033673e-07,
"logits/generated": 4.34421443939209,
"logits/real": 3.901693820953369,
"logps/generated": -1832.802490234375,
"logps/real": -452.336181640625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -76.98825073242188,
"rewards/margins": 74.16451263427734,
"rewards/real": -2.823739767074585,
"step": 650
},
{
"epoch": 2.4,
"grad_norm": 0.00016207364423209973,
"learning_rate": 3.333333333333333e-07,
"logits/generated": 3.6365978717803955,
"logits/real": 3.4315154552459717,
"logps/generated": -1143.705078125,
"logps/real": -484.4818420410156,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -48.88311004638672,
"rewards/margins": 45.60661315917969,
"rewards/real": -3.2764992713928223,
"step": 660
},
{
"epoch": 2.4363636363636365,
"grad_norm": 0.010502550164526083,
"learning_rate": 3.2996632996633e-07,
"logits/generated": 3.862056016921997,
"logits/real": 3.24928617477417,
"logps/generated": -1533.475830078125,
"logps/real": -453.9549255371094,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -63.62640380859375,
"rewards/margins": 60.355133056640625,
"rewards/real": -3.271270275115967,
"step": 670
},
{
"epoch": 2.4727272727272727,
"grad_norm": 0.0002909304249002409,
"learning_rate": 3.265993265993266e-07,
"logits/generated": 3.5648250579833984,
"logits/real": 3.6869399547576904,
"logps/generated": -912.8480224609375,
"logps/real": -476.197509765625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -40.917945861816406,
"rewards/margins": 37.436744689941406,
"rewards/real": -3.4812045097351074,
"step": 680
},
{
"epoch": 2.509090909090909,
"grad_norm": 0.00032313616431939676,
"learning_rate": 3.2323232323232327e-07,
"logits/generated": 3.8342528343200684,
"logits/real": 3.8705894947052,
"logps/generated": -854.0946044921875,
"logps/real": -459.92529296875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -38.8852653503418,
"rewards/margins": 35.70844650268555,
"rewards/real": -3.1768181324005127,
"step": 690
},
{
"epoch": 2.5454545454545454,
"grad_norm": 0.0021219902906988715,
"learning_rate": 3.1986531986531984e-07,
"logits/generated": 3.783543109893799,
"logits/real": 3.4980130195617676,
"logps/generated": -1084.9097900390625,
"logps/real": -487.141845703125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -46.62445068359375,
"rewards/margins": 43.05109786987305,
"rewards/real": -3.5733554363250732,
"step": 700
},
{
"epoch": 2.581818181818182,
"grad_norm": 0.0014337911425730536,
"learning_rate": 3.164983164983165e-07,
"logits/generated": 3.6583492755889893,
"logits/real": 3.749937057495117,
"logps/generated": -1316.029052734375,
"logps/real": -508.31683349609375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -56.47169876098633,
"rewards/margins": 52.8726806640625,
"rewards/real": -3.5990149974823,
"step": 710
},
{
"epoch": 2.618181818181818,
"grad_norm": 0.0028510300745991563,
"learning_rate": 3.1313131313131313e-07,
"logits/generated": 3.9335789680480957,
"logits/real": 3.828601837158203,
"logps/generated": -1295.112060546875,
"logps/real": -437.5403747558594,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -56.59820556640625,
"rewards/margins": 53.75188064575195,
"rewards/real": -2.846327066421509,
"step": 720
},
{
"epoch": 2.6545454545454543,
"grad_norm": 0.003930759640413619,
"learning_rate": 3.0976430976430975e-07,
"logits/generated": 3.507559299468994,
"logits/real": 3.472404956817627,
"logps/generated": -1357.832275390625,
"logps/real": -477.1338806152344,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -57.733192443847656,
"rewards/margins": 54.20581817626953,
"rewards/real": -3.527372360229492,
"step": 730
},
{
"epoch": 2.690909090909091,
"grad_norm": 0.0017241967233385743,
"learning_rate": 3.063973063973064e-07,
"logits/generated": 3.7288355827331543,
"logits/real": 3.64813232421875,
"logps/generated": -1540.7183837890625,
"logps/real": -501.73480224609375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -63.22703170776367,
"rewards/margins": 59.9117546081543,
"rewards/real": -3.3152732849121094,
"step": 740
},
{
"epoch": 2.7272727272727275,
"grad_norm": 0.007579448915312288,
"learning_rate": 3.0303030303030305e-07,
"logits/generated": 3.859321117401123,
"logits/real": 3.7037596702575684,
"logps/generated": -1236.6575927734375,
"logps/real": -546.4544067382812,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -51.599647521972656,
"rewards/margins": 47.99446105957031,
"rewards/real": -3.605181932449341,
"step": 750
},
{
"epoch": 2.7636363636363637,
"grad_norm": 0.00014409344199114584,
"learning_rate": 2.9966329966329967e-07,
"logits/generated": 3.5292091369628906,
"logits/real": 3.565258741378784,
"logps/generated": -1531.6552734375,
"logps/real": -448.81268310546875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -65.52954864501953,
"rewards/margins": 62.135093688964844,
"rewards/real": -3.394456386566162,
"step": 760
},
{
"epoch": 2.8,
"grad_norm": 0.0013684000564848744,
"learning_rate": 2.962962962962963e-07,
"logits/generated": 4.109580039978027,
"logits/real": 3.457909345626831,
"logps/generated": -1383.8349609375,
"logps/real": -425.76324462890625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -59.45878982543945,
"rewards/margins": 56.1872444152832,
"rewards/real": -3.2715511322021484,
"step": 770
},
{
"epoch": 2.8363636363636364,
"grad_norm": 0.005777678000195546,
"learning_rate": 2.929292929292929e-07,
"logits/generated": 3.7321064472198486,
"logits/real": 3.8243510723114014,
"logps/generated": -1196.93115234375,
"logps/real": -456.53228759765625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -51.86643600463867,
"rewards/margins": 48.43678665161133,
"rewards/real": -3.429652452468872,
"step": 780
},
{
"epoch": 2.8727272727272726,
"grad_norm": 0.0009321131480379054,
"learning_rate": 2.895622895622896e-07,
"logits/generated": 3.882500171661377,
"logits/real": 3.6975486278533936,
"logps/generated": -1355.469482421875,
"logps/real": -484.16058349609375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -60.43659591674805,
"rewards/margins": 56.8776741027832,
"rewards/real": -3.558910369873047,
"step": 790
},
{
"epoch": 2.909090909090909,
"grad_norm": 0.001991138099404004,
"learning_rate": 2.8619528619528615e-07,
"logits/generated": 3.718451976776123,
"logits/real": 3.651914596557617,
"logps/generated": -1605.8145751953125,
"logps/real": -450.0298767089844,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -70.13053131103516,
"rewards/margins": 66.21720886230469,
"rewards/real": -3.9133262634277344,
"step": 800
},
{
"epoch": 2.9454545454545453,
"grad_norm": 7.359425704162107e-05,
"learning_rate": 2.8282828282828283e-07,
"logits/generated": 3.7316231727600098,
"logits/real": 3.4421298503875732,
"logps/generated": -1398.3856201171875,
"logps/real": -456.47723388671875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -61.99951171875,
"rewards/margins": 58.09382247924805,
"rewards/real": -3.905689239501953,
"step": 810
},
{
"epoch": 2.981818181818182,
"grad_norm": 0.0001666930627991202,
"learning_rate": 2.7946127946127945e-07,
"logits/generated": 3.904003858566284,
"logits/real": 3.768629789352417,
"logps/generated": -1664.2711181640625,
"logps/real": -424.12860107421875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -72.53042602539062,
"rewards/margins": 69.21732330322266,
"rewards/real": -3.313105821609497,
"step": 820
},
{
"epoch": 3.018181818181818,
"grad_norm": 0.0015541320296847036,
"learning_rate": 2.760942760942761e-07,
"logits/generated": 4.119956970214844,
"logits/real": 4.344512939453125,
"logps/generated": -1267.0272216796875,
"logps/real": -452.1845703125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -55.162574768066406,
"rewards/margins": 51.5660514831543,
"rewards/real": -3.5965213775634766,
"step": 830
},
{
"epoch": 3.0545454545454547,
"grad_norm": 0.00011859762362161128,
"learning_rate": 2.727272727272727e-07,
"logits/generated": 4.090231895446777,
"logits/real": 4.092189788818359,
"logps/generated": -1745.3656005859375,
"logps/real": -447.81494140625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -74.62300872802734,
"rewards/margins": 70.8793716430664,
"rewards/real": -3.7436347007751465,
"step": 840
},
{
"epoch": 3.090909090909091,
"grad_norm": 0.0006239804571906667,
"learning_rate": 2.6936026936026936e-07,
"logits/generated": 3.666217803955078,
"logits/real": 3.422201633453369,
"logps/generated": -1149.569580078125,
"logps/real": -458.9976501464844,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -51.963539123535156,
"rewards/margins": 48.148681640625,
"rewards/real": -3.8148579597473145,
"step": 850
},
{
"epoch": 3.1272727272727274,
"grad_norm": 0.00015905022058128955,
"learning_rate": 2.65993265993266e-07,
"logits/generated": 4.04958963394165,
"logits/real": 3.5136730670928955,
"logps/generated": -1752.6058349609375,
"logps/real": -387.2957458496094,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -75.6007308959961,
"rewards/margins": 71.88221740722656,
"rewards/real": -3.7185134887695312,
"step": 860
},
{
"epoch": 3.1636363636363636,
"grad_norm": 0.002004494912082122,
"learning_rate": 2.6262626262626266e-07,
"logits/generated": 4.12989616394043,
"logits/real": 3.6296894550323486,
"logps/generated": -1105.5179443359375,
"logps/real": -386.453369140625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -48.736148834228516,
"rewards/margins": 45.57777786254883,
"rewards/real": -3.1583666801452637,
"step": 870
},
{
"epoch": 3.2,
"grad_norm": 0.0012155564568384315,
"learning_rate": 2.5925925925925923e-07,
"logits/generated": 3.8862648010253906,
"logits/real": 3.694721221923828,
"logps/generated": -1257.2886962890625,
"logps/real": -369.3799743652344,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -54.6549186706543,
"rewards/margins": 51.277061462402344,
"rewards/real": -3.377854824066162,
"step": 880
},
{
"epoch": 3.2363636363636363,
"grad_norm": 0.006589071175646788,
"learning_rate": 2.558922558922559e-07,
"logits/generated": 3.9840781688690186,
"logits/real": 3.4879355430603027,
"logps/generated": -1746.123779296875,
"logps/real": -440.9857482910156,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -76.70772552490234,
"rewards/margins": 72.97891998291016,
"rewards/real": -3.7288193702697754,
"step": 890
},
{
"epoch": 3.2727272727272725,
"grad_norm": 0.0030722271284426565,
"learning_rate": 2.525252525252525e-07,
"logits/generated": 3.954289674758911,
"logits/real": 3.8279597759246826,
"logps/generated": -1498.404296875,
"logps/real": -479.84222412109375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -65.67853546142578,
"rewards/margins": 61.83612060546875,
"rewards/real": -3.8424181938171387,
"step": 900
},
{
"epoch": 3.309090909090909,
"grad_norm": 0.0001753130329635927,
"learning_rate": 2.4915824915824914e-07,
"logits/generated": 4.019095420837402,
"logits/real": 3.8157622814178467,
"logps/generated": -1326.3746337890625,
"logps/real": -370.5709228515625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -59.56660842895508,
"rewards/margins": 55.939353942871094,
"rewards/real": -3.6272518634796143,
"step": 910
},
{
"epoch": 3.3454545454545457,
"grad_norm": 0.004363807101820759,
"learning_rate": 2.4579124579124576e-07,
"logits/generated": 4.095259189605713,
"logits/real": 3.8618171215057373,
"logps/generated": -1519.919189453125,
"logps/real": -453.9547424316406,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -67.54315948486328,
"rewards/margins": 63.63945388793945,
"rewards/real": -3.9037089347839355,
"step": 920
},
{
"epoch": 3.381818181818182,
"grad_norm": 0.0014407534499616213,
"learning_rate": 2.4242424242424244e-07,
"logits/generated": 3.8879623413085938,
"logits/real": 3.7041187286376953,
"logps/generated": -1519.233154296875,
"logps/real": -458.47625732421875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -65.14590454101562,
"rewards/margins": 61.69196701049805,
"rewards/real": -3.453932285308838,
"step": 930
},
{
"epoch": 3.418181818181818,
"grad_norm": 0.00018436218899847752,
"learning_rate": 2.3905723905723906e-07,
"logits/generated": 3.6360068321228027,
"logits/real": 3.7878594398498535,
"logps/generated": -1373.73828125,
"logps/real": -496.204345703125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -61.42437744140625,
"rewards/margins": 57.20280075073242,
"rewards/real": -4.221576690673828,
"step": 940
},
{
"epoch": 3.4545454545454546,
"grad_norm": 9.983852335117753e-05,
"learning_rate": 2.3569023569023568e-07,
"logits/generated": 3.9673848152160645,
"logits/real": 4.219111919403076,
"logps/generated": -1223.6982421875,
"logps/real": -411.6839904785156,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -54.2238655090332,
"rewards/margins": 50.91444778442383,
"rewards/real": -3.309422731399536,
"step": 950
},
{
"epoch": 3.4909090909090907,
"grad_norm": 0.0007948451957228228,
"learning_rate": 2.323232323232323e-07,
"logits/generated": 3.7442917823791504,
"logits/real": 3.6274819374084473,
"logps/generated": -1369.765625,
"logps/real": -449.767578125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -61.11466598510742,
"rewards/margins": 57.37229537963867,
"rewards/real": -3.742368698120117,
"step": 960
},
{
"epoch": 3.5272727272727273,
"grad_norm": 0.0030600399757966723,
"learning_rate": 2.2895622895622895e-07,
"logits/generated": 3.8217289447784424,
"logits/real": 3.576320171356201,
"logps/generated": -1459.0333251953125,
"logps/real": -448.5374450683594,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -63.0662727355957,
"rewards/margins": 59.22290802001953,
"rewards/real": -3.8433589935302734,
"step": 970
},
{
"epoch": 3.5636363636363635,
"grad_norm": 0.004163869838731672,
"learning_rate": 2.2558922558922557e-07,
"logits/generated": 4.298056125640869,
"logits/real": 3.4841365814208984,
"logps/generated": -1740.793701171875,
"logps/real": -441.3580017089844,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -75.46671295166016,
"rewards/margins": 71.69257354736328,
"rewards/real": -3.7741341590881348,
"step": 980
},
{
"epoch": 3.6,
"grad_norm": 0.0019080007177829318,
"learning_rate": 2.222222222222222e-07,
"logits/generated": 3.853611469268799,
"logits/real": 3.6462883949279785,
"logps/generated": -1441.735595703125,
"logps/real": -473.94879150390625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -63.321319580078125,
"rewards/margins": 59.3939094543457,
"rewards/real": -3.92741322517395,
"step": 990
},
{
"epoch": 3.6363636363636362,
"grad_norm": 0.000704839046472418,
"learning_rate": 2.1885521885521884e-07,
"logits/generated": 4.106793403625488,
"logits/real": 3.7004337310791016,
"logps/generated": -1413.923583984375,
"logps/real": -472.50238037109375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -62.2875862121582,
"rewards/margins": 58.44181442260742,
"rewards/real": -3.8457672595977783,
"step": 1000
},
{
"epoch": 3.672727272727273,
"grad_norm": 0.000858452546126466,
"learning_rate": 2.1548821548821546e-07,
"logits/generated": 4.079923629760742,
"logits/real": 3.8820438385009766,
"logps/generated": -1516.7833251953125,
"logps/real": -461.62078857421875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -67.4074478149414,
"rewards/margins": 63.33385467529297,
"rewards/real": -4.073586940765381,
"step": 1010
},
{
"epoch": 3.709090909090909,
"grad_norm": 0.0010031363875996075,
"learning_rate": 2.121212121212121e-07,
"logits/generated": 3.767723798751831,
"logits/real": 3.8114075660705566,
"logps/generated": -1460.832275390625,
"logps/real": -497.56201171875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -64.61216735839844,
"rewards/margins": 60.64111328125,
"rewards/real": -3.971052885055542,
"step": 1020
},
{
"epoch": 3.7454545454545456,
"grad_norm": 0.0024116334439129978,
"learning_rate": 2.0875420875420873e-07,
"logits/generated": 4.043444633483887,
"logits/real": 3.8120369911193848,
"logps/generated": -1331.077880859375,
"logps/real": -454.9960021972656,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -59.663169860839844,
"rewards/margins": 55.51136016845703,
"rewards/real": -4.151822090148926,
"step": 1030
},
{
"epoch": 3.7818181818181817,
"grad_norm": 0.0015199595459823992,
"learning_rate": 2.0538720538720538e-07,
"logits/generated": 3.829423189163208,
"logits/real": 3.5104575157165527,
"logps/generated": -1270.5301513671875,
"logps/real": -429.19122314453125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -55.3685417175293,
"rewards/margins": 51.80549240112305,
"rewards/real": -3.5630505084991455,
"step": 1040
},
{
"epoch": 3.8181818181818183,
"grad_norm": 0.0005563199954904515,
"learning_rate": 2.02020202020202e-07,
"logits/generated": 3.9071640968322754,
"logits/real": 3.7608370780944824,
"logps/generated": -1778.2320556640625,
"logps/real": -423.9949645996094,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -80.55699157714844,
"rewards/margins": 76.7548828125,
"rewards/real": -3.802103042602539,
"step": 1050
},
{
"epoch": 3.8545454545454545,
"grad_norm": 0.004755420237394011,
"learning_rate": 1.9865319865319864e-07,
"logits/generated": 4.1636857986450195,
"logits/real": 3.379361629486084,
"logps/generated": -1448.9886474609375,
"logps/real": -389.80328369140625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -65.82633209228516,
"rewards/margins": 62.258392333984375,
"rewards/real": -3.5679428577423096,
"step": 1060
},
{
"epoch": 3.8909090909090907,
"grad_norm": 3.783155401402485e-05,
"learning_rate": 1.9528619528619527e-07,
"logits/generated": 3.940641403198242,
"logits/real": 4.008017063140869,
"logps/generated": -1480.460693359375,
"logps/real": -451.5081481933594,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -66.01498413085938,
"rewards/margins": 62.06439971923828,
"rewards/real": -3.9505832195281982,
"step": 1070
},
{
"epoch": 3.9272727272727272,
"grad_norm": 0.0012735410485099623,
"learning_rate": 1.9191919191919189e-07,
"logits/generated": 3.90619158744812,
"logits/real": 3.8133018016815186,
"logps/generated": -1917.876708984375,
"logps/real": -466.3075256347656,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -87.14842224121094,
"rewards/margins": 83.11375427246094,
"rewards/real": -4.034660816192627,
"step": 1080
},
{
"epoch": 3.963636363636364,
"grad_norm": 0.0007351500072224141,
"learning_rate": 1.8855218855218853e-07,
"logits/generated": 3.9110474586486816,
"logits/real": 3.5808777809143066,
"logps/generated": -1701.5618896484375,
"logps/real": -473.091064453125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -75.6848373413086,
"rewards/margins": 71.86578369140625,
"rewards/real": -3.8190536499023438,
"step": 1090
},
{
"epoch": 4.0,
"grad_norm": 5.6591036032243704e-05,
"learning_rate": 1.8518518518518516e-07,
"logits/generated": 3.9563984870910645,
"logits/real": 3.6396121978759766,
"logps/generated": -1587.2015380859375,
"logps/real": -424.1458435058594,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -74.37104797363281,
"rewards/margins": 70.43859100341797,
"rewards/real": -3.9324498176574707,
"step": 1100
},
{
"epoch": 4.036363636363636,
"grad_norm": 0.0004206570610130193,
"learning_rate": 1.818181818181818e-07,
"logits/generated": 3.7221055030822754,
"logits/real": 3.831826686859131,
"logps/generated": -1685.308349609375,
"logps/real": -519.84130859375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -74.87862396240234,
"rewards/margins": 70.57380676269531,
"rewards/real": -4.304826736450195,
"step": 1110
},
{
"epoch": 4.072727272727272,
"grad_norm": 0.001160372097406167,
"learning_rate": 1.7845117845117842e-07,
"logits/generated": 3.6212050914764404,
"logits/real": 3.7864627838134766,
"logps/generated": -1583.37158203125,
"logps/real": -511.188232421875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -74.45861053466797,
"rewards/margins": 69.92201232910156,
"rewards/real": -4.53659725189209,
"step": 1120
},
{
"epoch": 4.109090909090909,
"grad_norm": 8.539568574380086e-05,
"learning_rate": 1.7508417508417507e-07,
"logits/generated": 3.8307089805603027,
"logits/real": 3.671238422393799,
"logps/generated": -1238.8209228515625,
"logps/real": -452.064208984375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -55.9775505065918,
"rewards/margins": 52.53010940551758,
"rewards/real": -3.4474411010742188,
"step": 1130
},
{
"epoch": 4.1454545454545455,
"grad_norm": 0.0008660641077104146,
"learning_rate": 1.717171717171717e-07,
"logits/generated": 3.8134942054748535,
"logits/real": 3.990009307861328,
"logps/generated": -1304.6759033203125,
"logps/real": -445.03826904296875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -59.402122497558594,
"rewards/margins": 55.251441955566406,
"rewards/real": -4.150691032409668,
"step": 1140
},
{
"epoch": 4.181818181818182,
"grad_norm": 0.0001382904157899484,
"learning_rate": 1.6835016835016837e-07,
"logits/generated": 4.140042304992676,
"logits/real": 3.885005474090576,
"logps/generated": -1764.208984375,
"logps/real": -455.87921142578125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -79.93865203857422,
"rewards/margins": 75.69930267333984,
"rewards/real": -4.239354133605957,
"step": 1150
},
{
"epoch": 4.218181818181818,
"grad_norm": 0.0004243586857810705,
"learning_rate": 1.64983164983165e-07,
"logits/generated": 3.5448012351989746,
"logits/real": 3.717078447341919,
"logps/generated": -1162.357666015625,
"logps/real": -518.4683227539062,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -51.90681076049805,
"rewards/margins": 47.858604431152344,
"rewards/real": -4.048200607299805,
"step": 1160
},
{
"epoch": 4.254545454545455,
"grad_norm": 0.000987239513688741,
"learning_rate": 1.6161616161616163e-07,
"logits/generated": 4.015206336975098,
"logits/real": 3.729823350906372,
"logps/generated": -2077.96533203125,
"logps/real": -495.60015869140625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -91.87440490722656,
"rewards/margins": 87.61203002929688,
"rewards/real": -4.262367248535156,
"step": 1170
},
{
"epoch": 4.290909090909091,
"grad_norm": 0.00037696538230314736,
"learning_rate": 1.5824915824915826e-07,
"logits/generated": 3.7414791584014893,
"logits/real": 3.636801242828369,
"logps/generated": -1549.16796875,
"logps/real": -531.6744384765625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -70.65663146972656,
"rewards/margins": 65.79484558105469,
"rewards/real": -4.86180305480957,
"step": 1180
},
{
"epoch": 4.327272727272727,
"grad_norm": 0.00010522895913070764,
"learning_rate": 1.5488215488215488e-07,
"logits/generated": 3.7180187702178955,
"logits/real": 4.073853492736816,
"logps/generated": -1401.19775390625,
"logps/real": -431.75274658203125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -67.09441375732422,
"rewards/margins": 62.73823165893555,
"rewards/real": -4.356192111968994,
"step": 1190
},
{
"epoch": 4.363636363636363,
"grad_norm": 0.000493590978374745,
"learning_rate": 1.5151515151515152e-07,
"logits/generated": 3.5975699424743652,
"logits/real": 3.689765214920044,
"logps/generated": -1326.864501953125,
"logps/real": -542.3211059570312,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -59.640357971191406,
"rewards/margins": 55.317596435546875,
"rewards/real": -4.322758674621582,
"step": 1200
},
{
"epoch": 4.4,
"grad_norm": 0.00025492883366324984,
"learning_rate": 1.4814814814814815e-07,
"logits/generated": 3.959843873977661,
"logits/real": 3.6517300605773926,
"logps/generated": -1539.507080078125,
"logps/real": -475.3280334472656,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -69.64662170410156,
"rewards/margins": 65.52012634277344,
"rewards/real": -4.12650203704834,
"step": 1210
},
{
"epoch": 4.4363636363636365,
"grad_norm": 0.00022938298497810698,
"learning_rate": 1.447811447811448e-07,
"logits/generated": 3.842700958251953,
"logits/real": 3.8824355602264404,
"logps/generated": -1134.2059326171875,
"logps/real": -459.9798278808594,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -51.40458297729492,
"rewards/margins": 47.1477165222168,
"rewards/real": -4.256866931915283,
"step": 1220
},
{
"epoch": 4.472727272727273,
"grad_norm": 0.0004052529610557066,
"learning_rate": 1.4141414141414141e-07,
"logits/generated": 3.9025471210479736,
"logits/real": 3.9180073738098145,
"logps/generated": -1886.6363525390625,
"logps/real": -467.0265197753906,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -85.01991271972656,
"rewards/margins": 80.82940673828125,
"rewards/real": -4.190503120422363,
"step": 1230
},
{
"epoch": 4.509090909090909,
"grad_norm": 0.0005280290781930737,
"learning_rate": 1.3804713804713806e-07,
"logits/generated": 4.019129753112793,
"logits/real": 3.748079776763916,
"logps/generated": -1440.4344482421875,
"logps/real": -405.66583251953125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -67.43157958984375,
"rewards/margins": 63.11574172973633,
"rewards/real": -4.315838813781738,
"step": 1240
},
{
"epoch": 4.545454545454545,
"grad_norm": 5.366248566220936e-06,
"learning_rate": 1.3468013468013468e-07,
"logits/generated": 4.056360244750977,
"logits/real": 3.848876953125,
"logps/generated": -1692.2427978515625,
"logps/real": -400.6857604980469,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -76.63723754882812,
"rewards/margins": 72.62677764892578,
"rewards/real": -4.010465145111084,
"step": 1250
},
{
"epoch": 4.581818181818182,
"grad_norm": 0.00047402245995597456,
"learning_rate": 1.3131313131313133e-07,
"logits/generated": 3.9295859336853027,
"logits/real": 3.7341480255126953,
"logps/generated": -1847.7982177734375,
"logps/real": -472.1689453125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -85.64716339111328,
"rewards/margins": 80.90381622314453,
"rewards/real": -4.743343830108643,
"step": 1260
},
{
"epoch": 4.618181818181818,
"grad_norm": 0.00128487736629245,
"learning_rate": 1.2794612794612795e-07,
"logits/generated": 3.9448304176330566,
"logits/real": 3.9632275104522705,
"logps/generated": -1450.473388671875,
"logps/real": -479.2431640625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -66.07244110107422,
"rewards/margins": 61.695777893066406,
"rewards/real": -4.376659870147705,
"step": 1270
},
{
"epoch": 4.654545454545454,
"grad_norm": 0.00013514069974056552,
"learning_rate": 1.2457912457912457e-07,
"logits/generated": 3.9676432609558105,
"logits/real": 3.487008571624756,
"logps/generated": -1180.776123046875,
"logps/real": -429.3609924316406,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -57.64728927612305,
"rewards/margins": 53.328086853027344,
"rewards/real": -4.319206714630127,
"step": 1280
},
{
"epoch": 4.690909090909091,
"grad_norm": 0.0006303733396897195,
"learning_rate": 1.2121212121212122e-07,
"logits/generated": 4.023508548736572,
"logits/real": 3.998694658279419,
"logps/generated": -1272.932861328125,
"logps/real": -501.510986328125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -59.267669677734375,
"rewards/margins": 54.53044891357422,
"rewards/real": -4.737218379974365,
"step": 1290
},
{
"epoch": 4.7272727272727275,
"grad_norm": 1.57814039841833e-05,
"learning_rate": 1.1784511784511784e-07,
"logits/generated": 3.772751569747925,
"logits/real": 3.721607208251953,
"logps/generated": -1451.5894775390625,
"logps/real": -440.76751708984375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -66.0102767944336,
"rewards/margins": 61.77167892456055,
"rewards/real": -4.238595008850098,
"step": 1300
},
{
"epoch": 4.763636363636364,
"grad_norm": 6.235887123859738e-05,
"learning_rate": 1.1447811447811447e-07,
"logits/generated": 3.9907233715057373,
"logits/real": 3.906350612640381,
"logps/generated": -1149.9796142578125,
"logps/real": -501.718505859375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -51.936546325683594,
"rewards/margins": 47.381919860839844,
"rewards/real": -4.554632186889648,
"step": 1310
},
{
"epoch": 4.8,
"grad_norm": 0.0005202534829377769,
"learning_rate": 1.111111111111111e-07,
"logits/generated": 3.2805778980255127,
"logits/real": 3.3483023643493652,
"logps/generated": -896.7745971679688,
"logps/real": -473.29718017578125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -43.71897506713867,
"rewards/margins": 39.41306686401367,
"rewards/real": -4.305908203125,
"step": 1320
},
{
"epoch": 4.836363636363636,
"grad_norm": 6.406018245203921e-05,
"learning_rate": 1.0774410774410773e-07,
"logits/generated": 3.2299580574035645,
"logits/real": 3.425579071044922,
"logps/generated": -923.6653442382812,
"logps/real": -446.96832275390625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -45.724266052246094,
"rewards/margins": 41.449378967285156,
"rewards/real": -4.274886131286621,
"step": 1330
},
{
"epoch": 4.872727272727273,
"grad_norm": 0.00014454501073033914,
"learning_rate": 1.0437710437710436e-07,
"logits/generated": 3.5981407165527344,
"logits/real": 4.00681734085083,
"logps/generated": -918.5948486328125,
"logps/real": -426.62921142578125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -43.384376525878906,
"rewards/margins": 38.89148712158203,
"rewards/real": -4.492888450622559,
"step": 1340
},
{
"epoch": 4.909090909090909,
"grad_norm": 0.0006254585346169349,
"learning_rate": 1.01010101010101e-07,
"logits/generated": 3.715498685836792,
"logits/real": 3.8721745014190674,
"logps/generated": -1560.0103759765625,
"logps/real": -510.93927001953125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -72.44107055664062,
"rewards/margins": 67.83012390136719,
"rewards/real": -4.610942363739014,
"step": 1350
},
{
"epoch": 4.945454545454545,
"grad_norm": 0.0008769643975252069,
"learning_rate": 9.764309764309763e-08,
"logits/generated": 3.9168152809143066,
"logits/real": 3.5084102153778076,
"logps/generated": -1468.501220703125,
"logps/real": -501.52032470703125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -69.21226501464844,
"rewards/margins": 64.94023132324219,
"rewards/real": -4.272032737731934,
"step": 1360
},
{
"epoch": 4.9818181818181815,
"grad_norm": 0.0012166612736284428,
"learning_rate": 9.427609427609427e-08,
"logits/generated": 3.681574583053589,
"logits/real": 4.426842212677002,
"logps/generated": -1462.860595703125,
"logps/real": -457.4916076660156,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -68.63682556152344,
"rewards/margins": 64.71060180664062,
"rewards/real": -3.9262146949768066,
"step": 1370
},
{
"epoch": 5.0181818181818185,
"grad_norm": 0.0003307362200277528,
"learning_rate": 9.09090909090909e-08,
"logits/generated": 3.945413112640381,
"logits/real": 3.938251495361328,
"logps/generated": -1445.0692138671875,
"logps/real": -426.0289611816406,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -66.72593688964844,
"rewards/margins": 62.89970016479492,
"rewards/real": -3.8262367248535156,
"step": 1380
},
{
"epoch": 5.054545454545455,
"grad_norm": 0.000133121896992743,
"learning_rate": 8.754208754208754e-08,
"logits/generated": 3.6478161811828613,
"logits/real": 3.513685941696167,
"logps/generated": -1289.758056640625,
"logps/real": -466.66119384765625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -60.87055206298828,
"rewards/margins": 56.79083251953125,
"rewards/real": -4.07971715927124,
"step": 1390
},
{
"epoch": 5.090909090909091,
"grad_norm": 0.0006093163562234791,
"learning_rate": 8.417508417508418e-08,
"logits/generated": 3.8004722595214844,
"logits/real": 3.8927619457244873,
"logps/generated": -1247.7264404296875,
"logps/real": -516.7521362304688,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -59.13682174682617,
"rewards/margins": 54.87421417236328,
"rewards/real": -4.262602806091309,
"step": 1400
},
{
"epoch": 5.127272727272727,
"grad_norm": 0.0007397523574107219,
"learning_rate": 8.080808080808082e-08,
"logits/generated": 3.6465377807617188,
"logits/real": 3.726672649383545,
"logps/generated": -1440.296630859375,
"logps/real": -475.47283935546875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -68.43962860107422,
"rewards/margins": 63.84468460083008,
"rewards/real": -4.594945430755615,
"step": 1410
},
{
"epoch": 5.163636363636364,
"grad_norm": 0.0003301105358851929,
"learning_rate": 7.744107744107744e-08,
"logits/generated": 4.112045764923096,
"logits/real": 3.6040217876434326,
"logps/generated": -1812.4189453125,
"logps/real": -512.8524169921875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -84.6473617553711,
"rewards/margins": 80.17106628417969,
"rewards/real": -4.476306915283203,
"step": 1420
},
{
"epoch": 5.2,
"grad_norm": 6.415611492466389e-05,
"learning_rate": 7.407407407407407e-08,
"logits/generated": 3.957354784011841,
"logits/real": 4.061827659606934,
"logps/generated": -1631.3790283203125,
"logps/real": -501.46453857421875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -73.66567993164062,
"rewards/margins": 69.13175964355469,
"rewards/real": -4.533923149108887,
"step": 1430
},
{
"epoch": 5.236363636363636,
"grad_norm": 6.0549193113947094e-05,
"learning_rate": 7.070707070707071e-08,
"logits/generated": 3.871743679046631,
"logits/real": 3.737473249435425,
"logps/generated": -1661.531005859375,
"logps/real": -478.5572814941406,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -76.69551086425781,
"rewards/margins": 72.16094207763672,
"rewards/real": -4.534560203552246,
"step": 1440
},
{
"epoch": 5.2727272727272725,
"grad_norm": 4.5226258894634027e-05,
"learning_rate": 6.734006734006734e-08,
"logits/generated": 3.7419967651367188,
"logits/real": 4.010978698730469,
"logps/generated": -1400.2882080078125,
"logps/real": -471.0364685058594,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -65.46446228027344,
"rewards/margins": 61.05268478393555,
"rewards/real": -4.41178035736084,
"step": 1450
},
{
"epoch": 5.309090909090909,
"grad_norm": 0.00017288177287966272,
"learning_rate": 6.397306397306398e-08,
"logits/generated": 3.5792031288146973,
"logits/real": 3.6540589332580566,
"logps/generated": -1383.195068359375,
"logps/real": -501.170166015625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -64.13642883300781,
"rewards/margins": 59.7711296081543,
"rewards/real": -4.365292549133301,
"step": 1460
},
{
"epoch": 5.345454545454546,
"grad_norm": 2.4760789132422138e-05,
"learning_rate": 6.060606060606061e-08,
"logits/generated": 3.975567579269409,
"logits/real": 3.8065619468688965,
"logps/generated": -1486.880126953125,
"logps/real": -466.724609375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -69.9581298828125,
"rewards/margins": 64.9647445678711,
"rewards/real": -4.993378162384033,
"step": 1470
},
{
"epoch": 5.381818181818182,
"grad_norm": 0.0005286526078682218,
"learning_rate": 5.723905723905724e-08,
"logits/generated": 3.6327576637268066,
"logits/real": 3.6307568550109863,
"logps/generated": -984.1024169921875,
"logps/real": -434.69171142578125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -47.59702682495117,
"rewards/margins": 43.50556182861328,
"rewards/real": -4.091464519500732,
"step": 1480
},
{
"epoch": 5.418181818181818,
"grad_norm": 0.0005548002338588267,
"learning_rate": 5.3872053872053865e-08,
"logits/generated": 3.9311721324920654,
"logits/real": 3.8737149238586426,
"logps/generated": -1967.173828125,
"logps/real": -438.94915771484375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -94.0683364868164,
"rewards/margins": 89.73558807373047,
"rewards/real": -4.332737922668457,
"step": 1490
},
{
"epoch": 5.454545454545454,
"grad_norm": 0.00022247062485129725,
"learning_rate": 5.05050505050505e-08,
"logits/generated": 3.693833827972412,
"logits/real": 3.730213165283203,
"logps/generated": -1566.4849853515625,
"logps/real": -507.9705505371094,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -74.78486633300781,
"rewards/margins": 69.65180206298828,
"rewards/real": -5.133058071136475,
"step": 1500
},
{
"epoch": 5.490909090909091,
"grad_norm": 0.00014823349149112444,
"learning_rate": 4.7138047138047134e-08,
"logits/generated": 4.056867599487305,
"logits/real": 3.9997782707214355,
"logps/generated": -1185.4552001953125,
"logps/real": -487.16693115234375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -56.62416458129883,
"rewards/margins": 52.01531982421875,
"rewards/real": -4.608843803405762,
"step": 1510
},
{
"epoch": 5.527272727272727,
"grad_norm": 0.0002456658050929149,
"learning_rate": 4.377104377104377e-08,
"logits/generated": 3.7220072746276855,
"logits/real": 3.7891058921813965,
"logps/generated": -1615.204833984375,
"logps/real": -501.6162109375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -76.18266296386719,
"rewards/margins": 71.68074035644531,
"rewards/real": -4.50192403793335,
"step": 1520
},
{
"epoch": 5.5636363636363635,
"grad_norm": 0.0006055537891314523,
"learning_rate": 4.040404040404041e-08,
"logits/generated": 3.833317995071411,
"logits/real": 3.712054491043091,
"logps/generated": -1525.6534423828125,
"logps/real": -478.32720947265625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -70.79685974121094,
"rewards/margins": 66.09513854980469,
"rewards/real": -4.701727390289307,
"step": 1530
},
{
"epoch": 5.6,
"grad_norm": 6.131079180902863e-05,
"learning_rate": 3.7037037037037036e-08,
"logits/generated": 3.6760215759277344,
"logits/real": 3.605074405670166,
"logps/generated": -1132.375732421875,
"logps/real": -488.8179626464844,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -54.268714904785156,
"rewards/margins": 49.204002380371094,
"rewards/real": -5.064711093902588,
"step": 1540
},
{
"epoch": 5.636363636363637,
"grad_norm": 8.1504638837413e-05,
"learning_rate": 3.367003367003367e-08,
"logits/generated": 3.844359874725342,
"logits/real": 3.853445529937744,
"logps/generated": -1274.287353515625,
"logps/real": -484.49102783203125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -60.99640655517578,
"rewards/margins": 55.981048583984375,
"rewards/real": -5.015349388122559,
"step": 1550
},
{
"epoch": 5.672727272727273,
"grad_norm": 0.00024942794031913856,
"learning_rate": 3.0303030303030305e-08,
"logits/generated": 3.7670280933380127,
"logits/real": 3.671916961669922,
"logps/generated": -1370.51611328125,
"logps/real": -470.8055114746094,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -66.9756088256836,
"rewards/margins": 62.2355842590332,
"rewards/real": -4.740030765533447,
"step": 1560
},
{
"epoch": 5.709090909090909,
"grad_norm": 0.0001492803952467262,
"learning_rate": 2.6936026936026933e-08,
"logits/generated": 3.5178914070129395,
"logits/real": 3.622554063796997,
"logps/generated": -1014.978515625,
"logps/real": -516.9625244140625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -50.696678161621094,
"rewards/margins": 45.91947555541992,
"rewards/real": -4.777202129364014,
"step": 1570
},
{
"epoch": 5.745454545454545,
"grad_norm": 0.0006824322144998329,
"learning_rate": 2.3569023569023567e-08,
"logits/generated": 3.5519561767578125,
"logits/real": 3.5069594383239746,
"logps/generated": -1010.9691162109375,
"logps/real": -470.0237731933594,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -53.67352294921875,
"rewards/margins": 49.31608963012695,
"rewards/real": -4.357430934906006,
"step": 1580
},
{
"epoch": 5.781818181818182,
"grad_norm": 0.000186341266970004,
"learning_rate": 2.0202020202020204e-08,
"logits/generated": 3.713832139968872,
"logits/real": 3.9890835285186768,
"logps/generated": -1338.298583984375,
"logps/real": -486.15679931640625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -65.81551361083984,
"rewards/margins": 61.16944122314453,
"rewards/real": -4.646066665649414,
"step": 1590
},
{
"epoch": 5.818181818181818,
"grad_norm": 0.00025122871712599865,
"learning_rate": 1.6835016835016835e-08,
"logits/generated": 3.836996555328369,
"logits/real": 3.388111114501953,
"logps/generated": -1553.741455078125,
"logps/real": -477.65765380859375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -75.15855407714844,
"rewards/margins": 70.58387756347656,
"rewards/real": -4.574671268463135,
"step": 1600
},
{
"epoch": 5.8545454545454545,
"grad_norm": 0.00011562643100957245,
"learning_rate": 1.3468013468013466e-08,
"logits/generated": 4.382861614227295,
"logits/real": 3.5439746379852295,
"logps/generated": -2293.033935546875,
"logps/real": -465.66790771484375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -106.3143081665039,
"rewards/margins": 102.07095336914062,
"rewards/real": -4.243343830108643,
"step": 1610
},
{
"epoch": 5.890909090909091,
"grad_norm": 0.0005696740525733873,
"learning_rate": 1.0101010101010102e-08,
"logits/generated": 3.699605941772461,
"logits/real": 3.266279697418213,
"logps/generated": -1303.4429931640625,
"logps/real": -535.8048095703125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -64.61190795898438,
"rewards/margins": 59.79120635986328,
"rewards/real": -4.820700645446777,
"step": 1620
},
{
"epoch": 5.927272727272728,
"grad_norm": 0.00021057172282241802,
"learning_rate": 6.734006734006733e-09,
"logits/generated": 3.7417564392089844,
"logits/real": 3.6985526084899902,
"logps/generated": -1704.074462890625,
"logps/real": -515.9856567382812,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -78.1255111694336,
"rewards/margins": 73.12936401367188,
"rewards/real": -4.99615478515625,
"step": 1630
},
{
"epoch": 5.963636363636364,
"grad_norm": 0.0004341167683210516,
"learning_rate": 3.3670033670033666e-09,
"logits/generated": 3.8730645179748535,
"logits/real": 3.839846134185791,
"logps/generated": -1266.2689208984375,
"logps/real": -423.4774475097656,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -60.56571578979492,
"rewards/margins": 56.362342834472656,
"rewards/real": -4.20337438583374,
"step": 1640
},
{
"epoch": 6.0,
"grad_norm": 0.0004156988432161806,
"learning_rate": 0.0,
"logits/generated": 3.8449695110321045,
"logits/real": 3.563386917114258,
"logps/generated": -1621.296142578125,
"logps/real": -509.37469482421875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/generated": -76.79975891113281,
"rewards/margins": 70.92039489746094,
"rewards/real": -5.879361629486084,
"step": 1650
},
{
"epoch": 6.0,
"step": 1650,
"total_flos": 0.0,
"train_loss": 0.012667954623552208,
"train_runtime": 21434.6493,
"train_samples_per_second": 4.927,
"train_steps_per_second": 0.077
}
],
"logging_steps": 10,
"max_steps": 1650,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}