{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996033320111067, "eval_steps": 500, "global_step": 210, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004760015866719556, "grad_norm": 2.6129452623698715, "learning_rate": 4.761904761904761e-09, "logits/chosen": 1.8391927480697632, "logits/rejected": NaN, "logps/chosen": -1184.0, "logps/rejected": -355.4166564941406, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.009520031733439112, "grad_norm": 2.5770813189869757, "learning_rate": 9.523809523809522e-09, "logits/chosen": 1.822265625, "logits/rejected": NaN, "logps/chosen": -1286.0, "logps/rejected": -413.0833435058594, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.014280047600158666, "grad_norm": 2.7660310366335357, "learning_rate": 1.4285714285714284e-08, "logits/chosen": 1.8483072519302368, "logits/rejected": 1.1949056386947632, "logps/chosen": -1194.6666259765625, "logps/rejected": -369.75, "loss": 0.6947, "rewards/accuracies": 0.1666666716337204, "rewards/chosen": -0.0016682943096384406, "rewards/margins": -0.0037485759239643812, "rewards/rejected": 0.0020853679161518812, "step": 3 }, { "epoch": 0.019040063466878223, "grad_norm": 2.691530161920512, "learning_rate": 1.9047619047619045e-08, "logits/chosen": 1.6959635019302368, "logits/rejected": NaN, "logps/chosen": -1280.0, "logps/rejected": -184.3333282470703, "loss": 0.6918, "rewards/accuracies": 0.2083333283662796, "rewards/chosen": 0.00250244140625, "rewards/margins": 0.0008341471548192203, "rewards/rejected": 0.0016682943096384406, "step": 4 }, { "epoch": 0.02380007933359778, "grad_norm": 3.010246381817704, "learning_rate": 2.3809523809523807e-08, "logits/chosen": 1.8460286855697632, "logits/rejected": NaN, "logps/chosen": 
-1411.6666259765625, "logps/rejected": -199.0, "loss": 0.6932, "rewards/accuracies": 0.2291666716337204, "rewards/chosen": -0.0041707358323037624, "rewards/margins": -0.0033467609900981188, "rewards/rejected": -0.0008341471548192203, "step": 5 }, { "epoch": 0.028560095200317333, "grad_norm": 2.659135397165267, "learning_rate": 2.857142857142857e-08, "logits/chosen": 1.8626302480697632, "logits/rejected": 1.3460286855697632, "logps/chosen": -1142.3333740234375, "logps/rejected": -213.5833282470703, "loss": 0.6898, "rewards/accuracies": 0.3125, "rewards/chosen": 0.006256103515625, "rewards/margins": 0.0063578286208212376, "rewards/rejected": -0.00010426839435240254, "step": 6 }, { "epoch": 0.03332011106703689, "grad_norm": 3.0961108718066885, "learning_rate": 3.333333333333333e-08, "logits/chosen": 1.9749349355697632, "logits/rejected": NaN, "logps/chosen": -1445.0, "logps/rejected": -273.3333435058594, "loss": 0.6849, "rewards/accuracies": 0.2708333432674408, "rewards/chosen": 0.015411376953125, "rewards/margins": 0.01598103903234005, "rewards/rejected": -0.0005213419790379703, "step": 7 }, { "epoch": 0.038080126933756446, "grad_norm": 3.398529990595042, "learning_rate": 3.809523809523809e-08, "logits/chosen": 2.0989582538604736, "logits/rejected": NaN, "logps/chosen": -1522.0, "logps/rejected": -731.1666870117188, "loss": 0.6917, "rewards/accuracies": 0.2708333432674408, "rewards/chosen": 0.0029195148963481188, "rewards/margins": 0.003326416015625, "rewards/rejected": -0.00041707357740961015, "step": 8 }, { "epoch": 0.042840142800476, "grad_norm": 2.8555673284882, "learning_rate": 4.285714285714285e-08, "logits/chosen": 1.7252603769302368, "logits/rejected": NaN, "logps/chosen": -1250.6666259765625, "logps/rejected": -429.1666564941406, "loss": 0.6873, "rewards/accuracies": 0.3125, "rewards/chosen": 0.009175618179142475, "rewards/margins": 0.008351643569767475, "rewards/rejected": 0.0008341471548192203, "step": 9 }, { "epoch": 0.04760015866719556, "grad_norm": 
3.039275077247777, "learning_rate": 4.7619047619047613e-08, "logits/chosen": 1.8528646230697632, "logits/rejected": NaN, "logps/chosen": -1399.0, "logps/rejected": -494.5, "loss": 0.6919, "rewards/accuracies": 0.2916666567325592, "rewards/chosen": -0.0020853679161518812, "rewards/margins": -0.0016682943096384406, "rewards/rejected": -0.00041707357740961015, "step": 10 }, { "epoch": 0.052360174533915116, "grad_norm": 2.7307097358717765, "learning_rate": 5.238095238095238e-08, "logits/chosen": 1.8912760019302368, "logits/rejected": NaN, "logps/chosen": -1295.0, "logps/rejected": -192.0, "loss": 0.6868, "rewards/accuracies": 0.2708333432674408, "rewards/chosen": 0.008738200180232525, "rewards/margins": 0.00872802734375, "rewards/rejected": 0.0, "step": 11 }, { "epoch": 0.057120190400634666, "grad_norm": 2.858482330959412, "learning_rate": 5.714285714285714e-08, "logits/chosen": 2.0084636211395264, "logits/rejected": NaN, "logps/chosen": -1321.3333740234375, "logps/rejected": -203.4166717529297, "loss": 0.6875, "rewards/accuracies": 0.2916666567325592, "rewards/chosen": 0.0066731772385537624, "rewards/margins": 0.0054219565354287624, "rewards/rejected": 0.001251220703125, "step": 12 }, { "epoch": 0.06188020626735422, "grad_norm": 3.436218339030269, "learning_rate": 6.19047619047619e-08, "logits/chosen": 2.0472004413604736, "logits/rejected": NaN, "logps/chosen": -1647.6666259765625, "logps/rejected": -269.6666564941406, "loss": 0.6932, "rewards/accuracies": 0.25, "rewards/chosen": -0.00250244140625, "rewards/margins": -0.0033416748046875, "rewards/rejected": 0.0008341471548192203, "step": 13 }, { "epoch": 0.06664022213407378, "grad_norm": 2.734000397750556, "learning_rate": 6.666666666666665e-08, "logits/chosen": 1.9791666269302368, "logits/rejected": NaN, "logps/chosen": -1338.6666259765625, "logps/rejected": -475.9166564941406, "loss": 0.694, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 0.0020853679161518812, "rewards/margins": -0.0031280517578125, 
"rewards/rejected": 0.0052134194411337376, "step": 14 }, { "epoch": 0.07140023800079334, "grad_norm": 2.781191186330521, "learning_rate": 7.142857142857142e-08, "logits/chosen": 2.01171875, "logits/rejected": NaN, "logps/chosen": -1366.3333740234375, "logps/rejected": -614.0833129882812, "loss": 0.6967, "rewards/accuracies": 0.1458333283662796, "rewards/chosen": -0.009175618179142475, "rewards/margins": -0.011052449233829975, "rewards/rejected": 0.0018768310546875, "step": 15 }, { "epoch": 0.07616025386751289, "grad_norm": 3.296898067532412, "learning_rate": 7.619047619047618e-08, "logits/chosen": 2.0338542461395264, "logits/rejected": NaN, "logps/chosen": -1591.0, "logps/rejected": -361.1666564941406, "loss": 0.6959, "rewards/accuracies": 0.3125, "rewards/chosen": -0.0041707358323037624, "rewards/margins": -0.0056254067458212376, "rewards/rejected": 0.0014597574481740594, "step": 16 }, { "epoch": 0.08092026973423244, "grad_norm": 3.56304516688633, "learning_rate": 8.095238095238095e-08, "logits/chosen": 2.1790363788604736, "logits/rejected": NaN, "logps/chosen": -1688.0, "logps/rejected": -361.0, "loss": 0.692, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 0.0008341471548192203, "rewards/margins": 0.0008392333984375, "rewards/rejected": 0.0, "step": 17 }, { "epoch": 0.085680285600952, "grad_norm": 3.1437723398960795, "learning_rate": 8.57142857142857e-08, "logits/chosen": 1.9895833730697632, "logits/rejected": NaN, "logps/chosen": -1379.0, "logps/rejected": -372.3333435058594, "loss": 0.69, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.001251220703125, "rewards/margins": -1.0172526344831567e-05, "rewards/rejected": -0.001251220703125, "step": 18 }, { "epoch": 0.09044030146767155, "grad_norm": 2.9085087039360804, "learning_rate": 9.047619047619047e-08, "logits/chosen": 1.962890625, "logits/rejected": NaN, "logps/chosen": -1443.6666259765625, "logps/rejected": -299.9166564941406, "loss": 0.6905, "rewards/accuracies": 
0.3333333432674408, "rewards/chosen": 0.00041707357740961015, "rewards/margins": 0.0039723715744912624, "rewards/rejected": -0.0035451252479106188, "step": 19 }, { "epoch": 0.09520031733439112, "grad_norm": 3.046632619071066, "learning_rate": 9.523809523809523e-08, "logits/chosen": 2.1256511211395264, "logits/rejected": NaN, "logps/chosen": -1336.6666259765625, "logps/rejected": -334.5833435058594, "loss": 0.6921, "rewards/accuracies": 0.3958333432674408, "rewards/chosen": 0.00041707357740961015, "rewards/margins": 0.0023040771484375, "rewards/rejected": -0.0018768310546875, "step": 20 }, { "epoch": 0.09996033320111067, "grad_norm": 2.8826119418741762, "learning_rate": 1e-07, "logits/chosen": 2.072265625, "logits/rejected": NaN, "logps/chosen": -1300.0, "logps/rejected": -194.4166717529297, "loss": 0.6877, "rewards/accuracies": 0.3541666567325592, "rewards/chosen": 0.010416666977107525, "rewards/margins": 0.008550007827579975, "rewards/rejected": 0.0018768310546875, "step": 21 }, { "epoch": 0.10472034906783023, "grad_norm": 2.8488586675307856, "learning_rate": 9.999309273455527e-08, "logits/chosen": 1.8509114980697632, "logits/rejected": NaN, "logps/chosen": -1268.6666259765625, "logps/rejected": -404.0833435058594, "loss": 0.6915, "rewards/accuracies": 0.375, "rewards/chosen": -0.0012715657940134406, "rewards/margins": 0.0008494059438817203, "rewards/rejected": -0.0020853679161518812, "step": 22 }, { "epoch": 0.10948036493454978, "grad_norm": 3.496537373458162, "learning_rate": 9.997237284663377e-08, "logits/chosen": 2.130859375, "logits/rejected": NaN, "logps/chosen": -1653.6666259765625, "logps/rejected": -229.25, "loss": 0.6876, "rewards/accuracies": 0.4791666567325592, "rewards/chosen": 0.006256103515625, "rewards/margins": 0.009714762680232525, "rewards/rejected": -0.0034383137244731188, "step": 23 }, { "epoch": 0.11424038080126933, "grad_norm": 3.1101740687622885, "learning_rate": 9.993784606094611e-08, "logits/chosen": 2.0432941913604736, "logits/rejected": 
NaN, "logps/chosen": -1445.3333740234375, "logps/rejected": -414.5833435058594, "loss": 0.686, "rewards/accuracies": 0.4583333432674408, "rewards/chosen": 0.014180500991642475, "rewards/margins": 0.01689656637609005, "rewards/rejected": -0.0027109782677143812, "step": 24 }, { "epoch": 0.1190003966679889, "grad_norm": 3.443772746353538, "learning_rate": 9.988952191691924e-08, "logits/chosen": 2.2135417461395264, "logits/rejected": NaN, "logps/chosen": -1725.6666259765625, "logps/rejected": -335.4166564941406, "loss": 0.6915, "rewards/accuracies": 0.3125, "rewards/chosen": -0.0020853679161518812, "rewards/margins": 0.0006205241079442203, "rewards/rejected": -0.0027109782677143812, "step": 25 }, { "epoch": 0.12376041253470844, "grad_norm": 2.5776558770728704, "learning_rate": 9.982741376606077e-08, "logits/chosen": 2.0091145038604736, "logits/rejected": NaN, "logps/chosen": -1190.0, "logps/rejected": -194.75, "loss": 0.6879, "rewards/accuracies": 0.4166666567325592, "rewards/chosen": 0.0050048828125, "rewards/margins": 0.0075022378005087376, "rewards/rejected": -0.00250244140625, "step": 26 }, { "epoch": 0.128520428401428, "grad_norm": 3.0490668633558244, "learning_rate": 9.975153876827008e-08, "logits/chosen": 1.8876953125, "logits/rejected": NaN, "logps/chosen": -1605.3333740234375, "logps/rejected": -379.0416564941406, "loss": 0.6835, "rewards/accuracies": 0.4166666567325592, "rewards/chosen": 0.014180500991642475, "rewards/margins": 0.01519775390625, "rewards/rejected": -0.0010426839580759406, "step": 27 }, { "epoch": 0.13328044426814756, "grad_norm": 3.0844768372489675, "learning_rate": 9.966191788709714e-08, "logits/chosen": 2.40625, "logits/rejected": NaN, "logps/chosen": -1512.6666259765625, "logps/rejected": -516.8333129882812, "loss": 0.6835, "rewards/accuracies": 0.4166666567325592, "rewards/chosen": 0.011271159164607525, "rewards/margins": 0.0121002197265625, "rewards/rejected": -0.0008341471548192203, "step": 28 }, { "epoch": 0.13804046013486712, 
"grad_norm": 2.9796492194683073, "learning_rate": 9.955857588395063e-08, "logits/chosen": 2.0364582538604736, "logits/rejected": NaN, "logps/chosen": -1308.6666259765625, "logps/rejected": -206.4166717529297, "loss": 0.6828, "rewards/accuracies": 0.5625, "rewards/chosen": 0.01751708984375, "rewards/margins": 0.02065022848546505, "rewards/rejected": -0.0031280517578125, "step": 29 }, { "epoch": 0.14280047600158668, "grad_norm": 2.9021818802869466, "learning_rate": 9.944154131125642e-08, "logits/chosen": 1.9016927480697632, "logits/rejected": NaN, "logps/chosen": -1332.0, "logps/rejected": -271.5, "loss": 0.6818, "rewards/accuracies": 0.5833333134651184, "rewards/chosen": 0.013346354477107525, "rewards/margins": 0.01731363870203495, "rewards/rejected": -0.0039621987380087376, "step": 30 }, { "epoch": 0.14756049186830622, "grad_norm": 3.7103574249848617, "learning_rate": 9.931084650456892e-08, "logits/chosen": 2.2962238788604736, "logits/rejected": NaN, "logps/chosen": -1784.6666259765625, "logps/rejected": -222.375, "loss": 0.683, "rewards/accuracies": 0.4375, "rewards/chosen": 0.01793416403234005, "rewards/margins": 0.0197270717471838, "rewards/rejected": -0.0017725626239553094, "step": 31 }, { "epoch": 0.15232050773502578, "grad_norm": 2.8083484443590043, "learning_rate": 9.916652757363697e-08, "logits/chosen": 1.9866536855697632, "logits/rejected": NaN, "logps/chosen": -1528.6666259765625, "logps/rejected": -199.3333282470703, "loss": 0.6799, "rewards/accuracies": 0.5416666865348816, "rewards/chosen": 0.02545166015625, "rewards/margins": 0.02791849710047245, "rewards/rejected": -0.00250244140625, "step": 32 }, { "epoch": 0.15708052360174535, "grad_norm": 3.195298894130754, "learning_rate": 9.900862439242718e-08, "logits/chosen": 2.1595051288604736, "logits/rejected": NaN, "logps/chosen": -1539.6666259765625, "logps/rejected": -337.3333435058594, "loss": 0.6778, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.02418009378015995, "rewards/margins": 
0.02998860739171505, "rewards/rejected": -0.0058390297926962376, "step": 33 }, { "epoch": 0.16184053946846488, "grad_norm": 3.3044326013571346, "learning_rate": 9.883718058810706e-08, "logits/chosen": 1.95703125, "logits/rejected": NaN, "logps/chosen": -1542.6666259765625, "logps/rejected": -503.9166564941406, "loss": 0.6765, "rewards/accuracies": 0.6458333134651184, "rewards/chosen": 0.02669270895421505, "rewards/margins": 0.0319112129509449, "rewards/rejected": -0.0052134194411337376, "step": 34 }, { "epoch": 0.16660055533518445, "grad_norm": 2.995151668350896, "learning_rate": 9.865224352899118e-08, "logits/chosen": 1.7864583730697632, "logits/rejected": NaN, "logps/chosen": -1488.6666259765625, "logps/rejected": -686.9166870117188, "loss": 0.6766, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.024200439453125, "rewards/margins": 0.0314687080681324, "rewards/rejected": -0.0072987875901162624, "step": 35 }, { "epoch": 0.171360571201904, "grad_norm": 2.8084038402424976, "learning_rate": 9.84538643114539e-08, "logits/chosen": 1.8294271230697632, "logits/rejected": NaN, "logps/chosen": -1364.6666259765625, "logps/rejected": -272.5, "loss": 0.6661, "rewards/accuracies": 0.75, "rewards/chosen": 0.045074462890625, "rewards/margins": 0.04962158203125, "rewards/rejected": -0.0045878090895712376, "step": 36 }, { "epoch": 0.17612058706862357, "grad_norm": 2.9054773433025387, "learning_rate": 9.824209774581174e-08, "logits/chosen": 2.0826823711395264, "logits/rejected": NaN, "logps/chosen": -1418.3333740234375, "logps/rejected": -362.8333435058594, "loss": 0.671, "rewards/accuracies": 0.75, "rewards/chosen": 0.042999267578125, "rewards/margins": 0.0488077811896801, "rewards/rejected": -0.0058390297926962376, "step": 37 }, { "epoch": 0.1808806029353431, "grad_norm": 2.8670036159141534, "learning_rate": 9.801700234117999e-08, "logits/chosen": 1.9225260019302368, "logits/rejected": NaN, "logps/chosen": -1426.0, "logps/rejected": -433.0, "loss": 0.6672, 
"rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.037933349609375, "rewards/margins": 0.0521748848259449, "rewards/rejected": -0.014165242202579975, "step": 38 }, { "epoch": 0.18564061880206267, "grad_norm": 2.8679096733310594, "learning_rate": 9.777864028930704e-08, "logits/chosen": 2.0826823711395264, "logits/rejected": NaN, "logps/chosen": -1393.0, "logps/rejected": -193.5, "loss": 0.6719, "rewards/accuracies": 0.7708333134651184, "rewards/chosen": 0.03546142578125, "rewards/margins": 0.04315185546875, "rewards/rejected": -0.0077158608473837376, "step": 39 }, { "epoch": 0.19040063466878224, "grad_norm": 2.7627109082266226, "learning_rate": 9.752707744739145e-08, "logits/chosen": 1.9762369394302368, "logits/rejected": NaN, "logps/chosen": -1240.0, "logps/rejected": -183.2916717529297, "loss": 0.6674, "rewards/accuracies": 0.9583333134651184, "rewards/chosen": 0.0434061698615551, "rewards/margins": 0.0514933280646801, "rewards/rejected": -0.0081329345703125, "step": 40 }, { "epoch": 0.19516065053550177, "grad_norm": 2.6388334579169563, "learning_rate": 9.726238331988623e-08, "logits/chosen": 1.9010416269302368, "logits/rejected": NaN, "logps/chosen": -1267.0, "logps/rejected": -245.5833282470703, "loss": 0.6681, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.04217529296875, "rewards/margins": 0.0500335693359375, "rewards/rejected": -0.007924397476017475, "step": 41 }, { "epoch": 0.19992066640222134, "grad_norm": 2.9593554398274113, "learning_rate": 9.698463103929542e-08, "logits/chosen": 1.8557943105697632, "logits/rejected": NaN, "logps/chosen": -1371.0, "logps/rejected": -711.0416870117188, "loss": 0.6645, "rewards/accuracies": 0.8125, "rewards/chosen": 0.04425048828125, "rewards/margins": 0.0631612166762352, "rewards/rejected": -0.019012451171875, "step": 42 }, { "epoch": 0.2046806822689409, "grad_norm": 3.042969595506808, "learning_rate": 9.669389734596817e-08, "logits/chosen": 2.0110676288604736, "logits/rejected": NaN, 
"logps/chosen": -1616.6666259765625, "logps/rejected": -491.0, "loss": 0.6688, "rewards/accuracies": 0.75, "rewards/chosen": 0.035888671875, "rewards/margins": 0.04876708984375, "rewards/rejected": -0.0129241943359375, "step": 43 }, { "epoch": 0.20944069813566046, "grad_norm": 3.630337496903089, "learning_rate": 9.639026256689626e-08, "logits/chosen": 2.490234375, "logits/rejected": NaN, "logps/chosen": -1765.3333740234375, "logps/rejected": -429.5833435058594, "loss": 0.6577, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.04840087890625, "rewards/margins": 0.0708109512925148, "rewards/rejected": -0.02239990234375, "step": 44 }, { "epoch": 0.21420071400238, "grad_norm": 3.1793801997133273, "learning_rate": 9.607381059352038e-08, "logits/chosen": 2.3997395038604736, "logits/rejected": NaN, "logps/chosen": -1595.0, "logps/rejected": -361.125, "loss": 0.6697, "rewards/accuracies": 0.7708333134651184, "rewards/chosen": 0.0383707694709301, "rewards/margins": 0.0468241386115551, "rewards/rejected": -0.008550007827579975, "step": 45 }, { "epoch": 0.21896072986909956, "grad_norm": 3.2332464466304356, "learning_rate": 9.574462885855172e-08, "logits/chosen": 2.4381511211395264, "logits/rejected": NaN, "logps/chosen": -1500.0, "logps/rejected": -399.75, "loss": 0.6653, "rewards/accuracies": 0.7708333134651184, "rewards/chosen": 0.0367024727165699, "rewards/margins": 0.0567220039665699, "rewards/rejected": -0.02001953125, "step": 46 }, { "epoch": 0.22372074573581913, "grad_norm": 2.770044306282577, "learning_rate": 9.540280831181524e-08, "logits/chosen": 1.8899739980697632, "logits/rejected": NaN, "logps/chosen": -1376.6666259765625, "logps/rejected": -205.4166717529297, "loss": 0.662, "rewards/accuracies": 0.875, "rewards/chosen": 0.0479838065803051, "rewards/margins": 0.0577799491584301, "rewards/rejected": -0.009801228530704975, "step": 47 }, { "epoch": 0.22848076160253866, "grad_norm": 2.7921454090980418, "learning_rate": 9.504844339512095e-08, 
"logits/chosen": 1.8723958730697632, "logits/rejected": NaN, "logps/chosen": -1307.3333740234375, "logps/rejected": -644.2083129882812, "loss": 0.6586, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0417378731071949, "rewards/margins": 0.0626017227768898, "rewards/rejected": -0.02083333395421505, "step": 48 }, { "epoch": 0.23324077746925823, "grad_norm": 2.682362712903166, "learning_rate": 9.468163201617062e-08, "logits/chosen": 1.8743489980697632, "logits/rejected": NaN, "logps/chosen": -1292.0, "logps/rejected": -209.5, "loss": 0.6641, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.0467325858771801, "rewards/margins": 0.05859375, "rewards/rejected": -0.011891682632267475, "step": 49 }, { "epoch": 0.2380007933359778, "grad_norm": 2.8024075453444026, "learning_rate": 9.430247552150672e-08, "logits/chosen": 1.8948568105697632, "logits/rejected": NaN, "logps/chosen": -1348.6666259765625, "logps/rejected": -197.0833282470703, "loss": 0.6694, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.0388387031853199, "rewards/margins": 0.0472005195915699, "rewards/rejected": -0.008443196304142475, "step": 50 }, { "epoch": 0.24276080920269735, "grad_norm": 2.8973595776612586, "learning_rate": 9.391107866851143e-08, "logits/chosen": 1.484375, "logits/rejected": NaN, "logps/chosen": -1479.3333740234375, "logps/rejected": -207.1666717529297, "loss": 0.6632, "rewards/accuracies": 0.875, "rewards/chosen": 0.0480244942009449, "rewards/margins": 0.05792236328125, "rewards/rejected": -0.010014851577579975, "step": 51 }, { "epoch": 0.2475208250694169, "grad_norm": 2.8257249326867346, "learning_rate": 9.350754959646305e-08, "logits/chosen": 1.8541666269302368, "logits/rejected": NaN, "logps/chosen": -1380.6666259765625, "logps/rejected": -368.3333435058594, "loss": 0.6635, "rewards/accuracies": 0.8958333134651184, "rewards/chosen": 0.0508626289665699, "rewards/margins": 0.0612589530646801, "rewards/rejected": -0.01043701171875, "step": 52 }, { "epoch": 
0.25228084093613645, "grad_norm": 2.986625683949948, "learning_rate": 9.30919997966582e-08, "logits/chosen": 2.048828125, "logits/rejected": NaN, "logps/chosen": -1540.0, "logps/rejected": -188.9166717529297, "loss": 0.6523, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0700785294175148, "rewards/margins": 0.0830078125, "rewards/rejected": -0.012929280288517475, "step": 53 }, { "epoch": 0.257040856802856, "grad_norm": 2.5763656997654762, "learning_rate": 9.266454408160778e-08, "logits/chosen": 1.9134114980697632, "logits/rejected": NaN, "logps/chosen": -1294.3333740234375, "logps/rejected": -198.6666717529297, "loss": 0.6475, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.0808512344956398, "rewards/margins": 0.0957234725356102, "rewards/rejected": -0.014806111343204975, "step": 54 }, { "epoch": 0.2618008726695756, "grad_norm": 2.787476695165978, "learning_rate": 9.222530055331539e-08, "logits/chosen": 2.0227863788604736, "logits/rejected": NaN, "logps/chosen": -1496.3333740234375, "logps/rejected": -546.8333129882812, "loss": 0.6325, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.0945231094956398, "rewards/margins": 0.1277669221162796, "rewards/rejected": -0.0333506278693676, "step": 55 }, { "epoch": 0.2665608885362951, "grad_norm": 2.780436919632163, "learning_rate": 9.177439057064682e-08, "logits/chosen": 1.9576822519302368, "logits/rejected": NaN, "logps/chosen": -1275.3333740234375, "logps/rejected": -240.7916717529297, "loss": 0.6396, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.0884602889418602, "rewards/margins": 0.111083984375, "rewards/rejected": -0.02263387106359005, "step": 56 }, { "epoch": 0.27132090440301465, "grad_norm": 2.694503560733566, "learning_rate": 9.131193871579974e-08, "logits/chosen": 2.001953125, "logits/rejected": NaN, "logps/chosen": -1475.0, "logps/rejected": -257.75, "loss": 0.6303, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.1059977188706398, "rewards/margins": 
0.1266682893037796, "rewards/rejected": -0.02086385153234005, "step": 57 }, { "epoch": 0.27608092026973424, "grad_norm": 2.6164739225559663, "learning_rate": 9.083807275988283e-08, "logits/chosen": 1.9791666269302368, "logits/rejected": NaN, "logps/chosen": -1329.6666259765625, "logps/rejected": -335.5833435058594, "loss": 0.6262, "rewards/accuracies": 0.9375, "rewards/chosen": 0.10791015625, "rewards/margins": 0.1393636018037796, "rewards/rejected": -0.03131103515625, "step": 58 }, { "epoch": 0.2808409361364538, "grad_norm": 2.596903683235154, "learning_rate": 9.03529236276138e-08, "logits/chosen": 1.9534505605697632, "logits/rejected": NaN, "logps/chosen": -1323.6666259765625, "logps/rejected": -594.5, "loss": 0.6238, "rewards/accuracies": 0.9583333134651184, "rewards/chosen": 0.1062825545668602, "rewards/margins": 0.1328938752412796, "rewards/rejected": -0.02649942971765995, "step": 59 }, { "epoch": 0.28560095200317337, "grad_norm": 2.848334529424154, "learning_rate": 8.985662536114613e-08, "logits/chosen": 2.0240886211395264, "logits/rejected": NaN, "logps/chosen": -1506.3333740234375, "logps/rejected": -521.6666870117188, "loss": 0.6219, "rewards/accuracies": 0.9583333134651184, "rewards/chosen": 0.115234375, "rewards/margins": 0.1461588591337204, "rewards/rejected": -0.03110249899327755, "step": 60 }, { "epoch": 0.2903609678698929, "grad_norm": 2.876426795335225, "learning_rate": 8.934931508303445e-08, "logits/chosen": 2.25, "logits/rejected": NaN, "logps/chosen": -1557.0, "logps/rejected": -221.75, "loss": 0.6266, "rewards/accuracies": 0.9583333134651184, "rewards/chosen": 0.1195882186293602, "rewards/margins": 0.1431477814912796, "rewards/rejected": -0.02335103414952755, "step": 61 }, { "epoch": 0.29512098373661244, "grad_norm": 2.537423595701227, "learning_rate": 8.883113295834892e-08, "logits/chosen": 1.9329427480697632, "logits/rejected": NaN, "logps/chosen": -1155.3333740234375, "logps/rejected": -416.75, "loss": 0.6213, "rewards/accuracies": 1.0, 
"rewards/chosen": 0.1212972030043602, "rewards/margins": 0.1614583283662796, "rewards/rejected": -0.0398966483771801, "step": 62 }, { "epoch": 0.29988099960333203, "grad_norm": 2.7038971300904473, "learning_rate": 8.83022221559489e-08, "logits/chosen": 2.2102863788604736, "logits/rejected": NaN, "logps/chosen": -1405.0, "logps/rejected": -200.5, "loss": 0.6227, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": 0.1200764998793602, "rewards/margins": 0.1464029997587204, "rewards/rejected": -0.02629598043859005, "step": 63 }, { "epoch": 0.30464101547005157, "grad_norm": 2.986877484832217, "learning_rate": 8.776272880892674e-08, "logits/chosen": 1.94921875, "logits/rejected": NaN, "logps/chosen": -1536.0, "logps/rejected": -524.0833129882812, "loss": 0.6094, "rewards/accuracies": 1.0, "rewards/chosen": 0.1381022185087204, "rewards/margins": 0.181884765625, "rewards/rejected": -0.0437825508415699, "step": 64 }, { "epoch": 0.3094010313367711, "grad_norm": 2.9882962132000306, "learning_rate": 8.721280197423258e-08, "logits/chosen": 2.1028645038604736, "logits/rejected": NaN, "logps/chosen": -1636.3333740234375, "logps/rejected": -249.125, "loss": 0.6254, "rewards/accuracies": 0.8958333134651184, "rewards/chosen": 0.1089274063706398, "rewards/margins": 0.1392822265625, "rewards/rejected": -0.030487060546875, "step": 65 }, { "epoch": 0.3141610472034907, "grad_norm": 2.632751517436848, "learning_rate": 8.665259359149131e-08, "logits/chosen": 1.7799478769302368, "logits/rejected": NaN, "logps/chosen": -1252.3333740234375, "logps/rejected": -425.9166564941406, "loss": 0.6191, "rewards/accuracies": 0.9583333134651184, "rewards/chosen": 0.1243082657456398, "rewards/margins": 0.1647542268037796, "rewards/rejected": -0.040557861328125, "step": 66 }, { "epoch": 0.31892106307021023, "grad_norm": 3.244181144372953, "learning_rate": 8.608225844102311e-08, "logits/chosen": 2.185546875, "logits/rejected": NaN, "logps/chosen": -1859.6666259765625, "logps/rejected": 
-222.2916717529297, "loss": 0.6187, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.1287841796875, "rewards/margins": 0.15850830078125, "rewards/rejected": -0.02971394918859005, "step": 67 }, { "epoch": 0.32368107893692977, "grad_norm": 3.1054803317794772, "learning_rate": 8.550195410107901e-08, "logits/chosen": 2.0748698711395264, "logits/rejected": NaN, "logps/chosen": -1731.0, "logps/rejected": -723.5833129882812, "loss": 0.6134, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.1177164688706398, "rewards/margins": 0.1701253205537796, "rewards/rejected": -0.05230712890625, "step": 68 }, { "epoch": 0.32844109480364936, "grad_norm": 2.81107358272895, "learning_rate": 8.491184090430364e-08, "logits/chosen": 2.0362141132354736, "logits/rejected": NaN, "logps/chosen": -1541.6666259765625, "logps/rejected": -279.0833435058594, "loss": 0.6195, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1339925080537796, "rewards/margins": 0.1614176481962204, "rewards/rejected": -0.02710978128015995, "step": 69 }, { "epoch": 0.3332011106703689, "grad_norm": 2.398827229418021, "learning_rate": 8.431208189343668e-08, "logits/chosen": 1.7063802480697632, "logits/rejected": NaN, "logps/chosen": -1305.3333740234375, "logps/rejected": -189.5, "loss": 0.6207, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1221516951918602, "rewards/margins": 0.1538899689912796, "rewards/rejected": -0.031707763671875, "step": 70 }, { "epoch": 0.33796112653708843, "grad_norm": 2.976311987538715, "learning_rate": 8.370284277626575e-08, "logits/chosen": 1.9915364980697632, "logits/rejected": NaN, "logps/chosen": -1480.3333740234375, "logps/rejected": -386.1666564941406, "loss": 0.6108, "rewards/accuracies": 0.9583333134651184, "rewards/chosen": 0.138671875, "rewards/margins": 0.1826578825712204, "rewards/rejected": -0.04400634765625, "step": 71 }, { "epoch": 0.342721142403808, "grad_norm": 2.7906860167659495, "learning_rate": 8.308429187984297e-08, "logits/chosen": 2.017578125, 
"logits/rejected": NaN, "logps/chosen": -1594.3333740234375, "logps/rejected": -200.625, "loss": 0.6132, "rewards/accuracies": 0.9583333134651184, "rewards/chosen": 0.14013671875, "rewards/margins": 0.1676432341337204, "rewards/rejected": -0.02764892578125, "step": 72 }, { "epoch": 0.34748115827052756, "grad_norm": 2.956423551442551, "learning_rate": 8.24566001039776e-08, "logits/chosen": 1.8580728769302368, "logits/rejected": 1.398193359375, "logps/chosen": -1390.3333740234375, "logps/rejected": -239.0, "loss": 0.6187, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": 0.133544921875, "rewards/margins": 0.1695556640625, "rewards/rejected": -0.0360616035759449, "step": 73 }, { "epoch": 0.35224117413724715, "grad_norm": 2.57139969502871, "learning_rate": 8.181994087401818e-08, "logits/chosen": 2.0625, "logits/rejected": 0.9173991084098816, "logps/chosen": -1206.3333740234375, "logps/rejected": -1035.25, "loss": 0.5967, "rewards/accuracies": 0.9583333134651184, "rewards/chosen": 0.1431477814912796, "rewards/margins": 0.2132161408662796, "rewards/rejected": -0.070159912109375, "step": 74 }, { "epoch": 0.3570011900039667, "grad_norm": 2.3789263635601756, "learning_rate": 8.117449009293668e-08, "logits/chosen": 1.8365885019302368, "logits/rejected": NaN, "logps/chosen": -1216.6666259765625, "logps/rejected": -417.4166564941406, "loss": 0.6078, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": 0.14013671875, "rewards/margins": 0.1861979216337204, "rewards/rejected": -0.0458984375, "step": 75 }, { "epoch": 0.3617612058706862, "grad_norm": 2.6269014458132616, "learning_rate": 8.052042609272816e-08, "logits/chosen": 1.9140625, "logits/rejected": NaN, "logps/chosen": -1333.6666259765625, "logps/rejected": -309.5, "loss": 0.6164, "rewards/accuracies": 0.9583333134651184, "rewards/chosen": 0.1212565079331398, "rewards/margins": 0.1600748747587204, "rewards/rejected": -0.0386962890625, "step": 76 }, { "epoch": 0.3665212217374058, "grad_norm": 
2.739178703516763, "learning_rate": 7.98579295851393e-08, "logits/chosen": 2.1529948711395264, "logits/rejected": NaN, "logps/chosen": -1397.0, "logps/rejected": -290.9166564941406, "loss": 0.61, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1343180388212204, "rewards/margins": 0.1739095002412796, "rewards/rejected": -0.0396219901740551, "step": 77 }, { "epoch": 0.37128123760412535, "grad_norm": 2.3079066820004304, "learning_rate": 7.91871836117395e-08, "logits/chosen": 1.8785806894302368, "logits/rejected": NaN, "logps/chosen": -1415.3333740234375, "logps/rejected": -222.1666717529297, "loss": 0.6086, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": 0.1553548127412796, "rewards/margins": 0.1886393278837204, "rewards/rejected": -0.0333455391228199, "step": 78 }, { "epoch": 0.3760412534708449, "grad_norm": 2.5191243394068144, "learning_rate": 7.850837349334809e-08, "logits/chosen": 2.1647136211395264, "logits/rejected": NaN, "logps/chosen": -1232.6666259765625, "logps/rejected": -314.0833435058594, "loss": 0.6022, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": 0.15283203125, "rewards/margins": 0.1917317658662796, "rewards/rejected": -0.0390218086540699, "step": 79 }, { "epoch": 0.3808012693375645, "grad_norm": 2.464350768449704, "learning_rate": 7.782168677883206e-08, "logits/chosen": 1.916015625, "logits/rejected": NaN, "logps/chosen": -1364.3333740234375, "logps/rejected": -656.0833129882812, "loss": 0.6058, "rewards/accuracies": 0.9583333134651184, "rewards/chosen": 0.1260579377412796, "rewards/margins": 0.1813151091337204, "rewards/rejected": -0.0555216483771801, "step": 80 }, { "epoch": 0.385561285204284, "grad_norm": 2.717004939585792, "learning_rate": 7.712731319328797e-08, "logits/chosen": 1.9537760019302368, "logits/rejected": NaN, "logps/chosen": -1486.6666259765625, "logps/rejected": -224.25, "loss": 0.611, "rewards/accuracies": 0.875, "rewards/chosen": 0.13720703125, "rewards/margins": 0.1704915314912796, "rewards/rejected": 
-0.0333760567009449, "step": 81 }, { "epoch": 0.39032130107100355, "grad_norm": 3.2264984436857738, "learning_rate": 7.642544458562277e-08, "logits/chosen": 2.267578125, "logits/rejected": NaN, "logps/chosen": -1840.6666259765625, "logps/rejected": -411.1666564941406, "loss": 0.6014, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.1565348356962204, "rewards/margins": 0.1999918669462204, "rewards/rejected": -0.0435587577521801, "step": 82 }, { "epoch": 0.39508131693772314, "grad_norm": 2.683314265623292, "learning_rate": 7.571627487554768e-08, "logits/chosen": 2.0540363788604736, "logits/rejected": NaN, "logps/chosen": -1430.0, "logps/rejected": -212.0, "loss": 0.6104, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.13525390625, "rewards/margins": 0.1728515625, "rewards/rejected": -0.0377400703728199, "step": 83 }, { "epoch": 0.3998413328044427, "grad_norm": 2.7144969695615284, "learning_rate": 7.5e-08, "logits/chosen": 2.173828125, "logits/rejected": NaN, "logps/chosen": -1470.0, "logps/rejected": -171.8333282470703, "loss": 0.6058, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1543782502412796, "rewards/margins": 0.1909993439912796, "rewards/rejected": -0.0367228202521801, "step": 84 }, { "epoch": 0.40460134867116226, "grad_norm": 2.7975541338538963, "learning_rate": 7.42768178590076e-08, "logits/chosen": 1.98046875, "logits/rejected": NaN, "logps/chosen": -1496.6666259765625, "logps/rejected": -337.4583435058594, "loss": 0.6065, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.1447346955537796, "rewards/margins": 0.1871744841337204, "rewards/rejected": -0.0425211600959301, "step": 85 }, { "epoch": 0.4093613645378818, "grad_norm": 2.5565364148587646, "learning_rate": 7.354692826101101e-08, "logits/chosen": 1.9537760019302368, "logits/rejected": NaN, "logps/chosen": -1364.6666259765625, "logps/rejected": -325.1666564941406, "loss": 0.6036, "rewards/accuracies": 0.9583333134651184, "rewards/chosen": 0.1505126953125, 
"rewards/margins": 0.1897786408662796, "rewards/rejected": -0.0393575020134449, "step": 86 }, { "epoch": 0.41412138040460134, "grad_norm": 2.6614568460230212, "learning_rate": 7.281053286765815e-08, "logits/chosen": 2.2415363788604736, "logits/rejected": NaN, "logps/chosen": -1447.3333740234375, "logps/rejected": -194.0833282470703, "loss": 0.6062, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1497395783662796, "rewards/margins": 0.1826985627412796, "rewards/rejected": -0.0329793281853199, "step": 87 }, { "epoch": 0.4188813962713209, "grad_norm": 2.8504744317923145, "learning_rate": 7.206783513808719e-08, "logits/chosen": 2.0045573711395264, "logits/rejected": NaN, "logps/chosen": -1482.3333740234375, "logps/rejected": -231.9166717529297, "loss": 0.6066, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.1485595703125, "rewards/margins": 0.1824544221162796, "rewards/rejected": -0.0341898612678051, "step": 88 }, { "epoch": 0.42364141213804046, "grad_norm": 3.0789510684671173, "learning_rate": 7.131904027271269e-08, "logits/chosen": 2.4290363788604736, "logits/rejected": NaN, "logps/chosen": -1558.6666259765625, "logps/rejected": -210.25, "loss": 0.6094, "rewards/accuracies": 0.9583333134651184, "rewards/chosen": 0.1439615935087204, "rewards/margins": 0.180908203125, "rewards/rejected": -0.0371195487678051, "step": 89 }, { "epoch": 0.42840142800476, "grad_norm": 2.3401989437208512, "learning_rate": 7.056435515653059e-08, "logits/chosen": 1.7708333730697632, "logits/rejected": NaN, "logps/chosen": -1350.3333740234375, "logps/rejected": -375.2083435058594, "loss": 0.5939, "rewards/accuracies": 0.9583333134651184, "rewards/chosen": 0.1522623747587204, "rewards/margins": 0.1978352814912796, "rewards/rejected": -0.0454508475959301, "step": 90 }, { "epoch": 0.4331614438714796, "grad_norm": 2.617864829611724, "learning_rate": 6.980398830195784e-08, "logits/chosen": 2.0774738788604736, "logits/rejected": NaN, "logps/chosen": -1413.0, "logps/rejected": 
-191.6666717529297, "loss": 0.6024, "rewards/accuracies": 0.9583333134651184, "rewards/chosen": 0.1588541716337204, "rewards/margins": 0.1947428435087204, "rewards/rejected": -0.03582763671875, "step": 91 }, { "epoch": 0.4379214597381991, "grad_norm": 2.945106819409662, "learning_rate": 6.903814979122247e-08, "logits/chosen": 2.1328125, "logits/rejected": NaN, "logps/chosen": -1458.0, "logps/rejected": -405.75, "loss": 0.5955, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": 0.1756184846162796, "rewards/margins": 0.219482421875, "rewards/rejected": -0.0438232421875, "step": 92 }, { "epoch": 0.44268147560491866, "grad_norm": 2.3715073720788062, "learning_rate": 6.826705121831976e-08, "logits/chosen": 1.724609375, "logits/rejected": NaN, "logps/chosen": -1186.3333740234375, "logps/rejected": -194.1666717529297, "loss": 0.5867, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": 0.1758626252412796, "rewards/margins": 0.222900390625, "rewards/rejected": -0.0468953438103199, "step": 93 }, { "epoch": 0.44744149147163825, "grad_norm": 2.695026787169654, "learning_rate": 6.749090563055075e-08, "logits/chosen": 2.0143229961395264, "logits/rejected": NaN, "logps/chosen": -1569.3333740234375, "logps/rejected": -333.0, "loss": 0.5999, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.1517740935087204, "rewards/margins": 0.1964518278837204, "rewards/rejected": -0.0446370430290699, "step": 94 }, { "epoch": 0.4522015073383578, "grad_norm": 2.6523018617450473, "learning_rate": 6.670992746965938e-08, "logits/chosen": 2.1263020038604736, "logits/rejected": NaN, "logps/chosen": -1398.3333740234375, "logps/rejected": -290.5, "loss": 0.6019, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1586100310087204, "rewards/margins": 0.2041015625, "rewards/rejected": -0.0452473945915699, "step": 95 }, { "epoch": 0.4569615232050773, "grad_norm": 2.8165136119650693, "learning_rate": 6.592433251258423e-08, "logits/chosen": 2.0849609375, "logits/rejected": 
1.6292318105697632, "logps/chosen": -1399.6666259765625, "logps/rejected": -301.4166564941406, "loss": 0.5977, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": 0.1651204377412796, "rewards/margins": 0.2105305939912796, "rewards/rejected": -0.04522705078125, "step": 96 }, { "epoch": 0.4617215390717969, "grad_norm": 2.795118120284924, "learning_rate": 6.51343378118413e-08, "logits/chosen": 1.8997396230697632, "logits/rejected": NaN, "logps/chosen": -1594.0, "logps/rejected": -372.8333435058594, "loss": 0.5988, "rewards/accuracies": 0.9583333134651184, "rewards/chosen": 0.1497395783662796, "rewards/margins": 0.202392578125, "rewards/rejected": -0.05255126953125, "step": 97 }, { "epoch": 0.46648155493851645, "grad_norm": 2.3527175308766677, "learning_rate": 6.434016163555452e-08, "logits/chosen": 2.013671875, "logits/rejected": NaN, "logps/chosen": -1271.6666259765625, "logps/rejected": -446.5833435058594, "loss": 0.5936, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": 0.1669108122587204, "rewards/margins": 0.2158203125, "rewards/rejected": -0.0489501953125, "step": 98 }, { "epoch": 0.47124157080523604, "grad_norm": 2.269292458197308, "learning_rate": 6.354202340715025e-08, "logits/chosen": 1.7379556894302368, "logits/rejected": NaN, "logps/chosen": -1289.0, "logps/rejected": -432.0, "loss": 0.592, "rewards/accuracies": 0.9583333134651184, "rewards/chosen": 0.1646321564912796, "rewards/margins": 0.2252604216337204, "rewards/rejected": -0.0604654960334301, "step": 99 }, { "epoch": 0.4760015866719556, "grad_norm": 2.654026647395738, "learning_rate": 6.274014364473274e-08, "logits/chosen": 1.7734375, "logits/rejected": NaN, "logps/chosen": -1326.3333740234375, "logps/rejected": -368.9166564941406, "loss": 0.5782, "rewards/accuracies": 1.0, "rewards/chosen": 0.1892903596162796, "rewards/margins": 0.2517903745174408, "rewards/rejected": -0.0622355155646801, "step": 100 }, { "epoch": 0.4807616025386751, "grad_norm": 2.224568158982869, "learning_rate": 
6.19347439001569e-08, "logits/chosen": 1.6510416269302368, "logits/rejected": NaN, "logps/chosen": -1172.6666259765625, "logps/rejected": -398.5, "loss": 0.5793, "rewards/accuracies": 1.0, "rewards/chosen": 0.1842447966337204, "rewards/margins": 0.2439778596162796, "rewards/rejected": -0.0596516914665699, "step": 101 }, { "epoch": 0.4855216184053947, "grad_norm": 2.597661312044313, "learning_rate": 6.112604669781571e-08, "logits/chosen": 2.0286457538604736, "logits/rejected": NaN, "logps/chosen": -1323.0, "logps/rejected": -420.6666564941406, "loss": 0.5632, "rewards/accuracies": 1.0, "rewards/chosen": 0.22021484375, "rewards/margins": 0.2819010317325592, "rewards/rejected": -0.0615030936896801, "step": 102 }, { "epoch": 0.49028163427211424, "grad_norm": 2.4892560953648024, "learning_rate": 6.031427547315888e-08, "logits/chosen": 2.1149089336395264, "logits/rejected": NaN, "logps/chosen": -1374.0, "logps/rejected": -347.8333435058594, "loss": 0.5768, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.1827799528837204, "rewards/margins": 0.25146484375, "rewards/rejected": -0.0687662735581398, "step": 103 }, { "epoch": 0.4950416501388338, "grad_norm": 2.5037829768257054, "learning_rate": 5.949965451095951e-08, "logits/chosen": 2.2347004413604736, "logits/rejected": NaN, "logps/chosen": -1490.3333740234375, "logps/rejected": -179.4583282470703, "loss": 0.5827, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1913655549287796, "rewards/margins": 0.24267578125, "rewards/rejected": -0.0511983223259449, "step": 104 }, { "epoch": 0.49980166600555337, "grad_norm": 2.3007706020312946, "learning_rate": 5.868240888334653e-08, "logits/chosen": 2.0305988788604736, "logits/rejected": NaN, "logps/chosen": -1358.6666259765625, "logps/rejected": -378.1666564941406, "loss": 0.5672, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": 0.2044270783662796, "rewards/margins": 0.2724609375, "rewards/rejected": -0.0679931640625, "step": 105 }, { "epoch": 
0.5045616818722729, "grad_norm": 2.687454169363537, "learning_rate": 5.786276438761927e-08, "logits/chosen": 1.939453125, "logits/rejected": NaN, "logps/chosen": -1579.0, "logps/rejected": -280.125, "loss": 0.5729, "rewards/accuracies": 0.9583333134651184, "rewards/chosen": 0.1995442658662796, "rewards/margins": 0.2615559995174408, "rewards/rejected": -0.0616353340446949, "step": 106 }, { "epoch": 0.5093216977389925, "grad_norm": 2.0530034264243957, "learning_rate": 5.7040947483861834e-08, "logits/chosen": 1.6572265625, "logits/rejected": NaN, "logps/chosen": -1141.0, "logps/rejected": -210.5, "loss": 0.5526, "rewards/accuracies": 1.0, "rewards/chosen": 0.2364908903837204, "rewards/margins": 0.3084309995174408, "rewards/rejected": -0.0723470076918602, "step": 107 }, { "epoch": 0.514081713605712, "grad_norm": 2.0586503729863037, "learning_rate": 5.621718523237427e-08, "logits/chosen": 1.61279296875, "logits/rejected": NaN, "logps/chosen": -1207.3333740234375, "logps/rejected": -202.9166717529297, "loss": 0.5562, "rewards/accuracies": 1.0, "rewards/chosen": 0.2274576872587204, "rewards/margins": 0.3038736879825592, "rewards/rejected": -0.0763753280043602, "step": 108 }, { "epoch": 0.5188417294724316, "grad_norm": 2.127852797306922, "learning_rate": 5.5391705230937934e-08, "logits/chosen": 1.677734375, "logits/rejected": NaN, "logps/chosen": -1323.0, "logps/rejected": -385.1666564941406, "loss": 0.5467, "rewards/accuracies": 1.0, "rewards/chosen": 0.2367350310087204, "rewards/margins": 0.3196614682674408, "rewards/rejected": -0.0827229842543602, "step": 109 }, { "epoch": 0.5236017453391512, "grad_norm": 2.5008366890366354, "learning_rate": 5.4564735551932416e-08, "logits/chosen": 2.212890625, "logits/rejected": NaN, "logps/chosen": -1570.3333740234375, "logps/rejected": -427.4166564941406, "loss": 0.5413, "rewards/accuracies": 1.0, "rewards/chosen": 0.2569173276424408, "rewards/margins": 0.3401692807674408, "rewards/rejected": -0.0832926407456398, "step": 110 }, { 
"epoch": 0.5283617612058706, "grad_norm": 1.956784631889553, "learning_rate": 5.373650467932121e-08, "logits/chosen": 1.7138671875, "logits/rejected": NaN, "logps/chosen": -1150.6666259765625, "logps/rejected": -475.75, "loss": 0.5356, "rewards/accuracies": 1.0, "rewards/chosen": 0.25244140625, "rewards/margins": 0.3492838442325592, "rewards/rejected": -0.0970052108168602, "step": 111 }, { "epoch": 0.5331217770725902, "grad_norm": 2.5190391554421487, "learning_rate": 5.2907241445523785e-08, "logits/chosen": 2.2141926288604736, "logits/rejected": NaN, "logps/chosen": -1508.3333740234375, "logps/rejected": -337.4583435058594, "loss": 0.5448, "rewards/accuracies": 0.9583333134651184, "rewards/chosen": 0.2505696713924408, "rewards/margins": 0.3324381411075592, "rewards/rejected": -0.0818684920668602, "step": 112 }, { "epoch": 0.5378817929393098, "grad_norm": 2.311957407228179, "learning_rate": 5.207717496819134e-08, "logits/chosen": 1.5335286855697632, "logits/rejected": NaN, "logps/chosen": -1486.3333740234375, "logps/rejected": -319.1666564941406, "loss": 0.5288, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": 0.269775390625, "rewards/margins": 0.36865234375, "rewards/rejected": -0.0992431640625, "step": 113 }, { "epoch": 0.5426418088060293, "grad_norm": 2.0481092553278843, "learning_rate": 5.124653458690364e-08, "logits/chosen": 1.94921875, "logits/rejected": NaN, "logps/chosen": -1287.3333740234375, "logps/rejected": -398.25, "loss": 0.5248, "rewards/accuracies": 1.0, "rewards/chosen": 0.2877604067325592, "rewards/margins": 0.3829752504825592, "rewards/rejected": -0.09527587890625, "step": 114 }, { "epoch": 0.5474018246727489, "grad_norm": 2.214152823316418, "learning_rate": 5.0415549799804857e-08, "logits/chosen": 1.81640625, "logits/rejected": NaN, "logps/chosen": -1390.3333740234375, "logps/rejected": -218.0, "loss": 0.5158, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": 0.29296875, "rewards/margins": 0.3922525942325592, 
"rewards/rejected": -0.0993245467543602, "step": 115 }, { "epoch": 0.5521618405394685, "grad_norm": 2.213377743308588, "learning_rate": 4.958445020019515e-08, "logits/chosen": 2.0950520038604736, "logits/rejected": NaN, "logps/chosen": -1339.0, "logps/rejected": -443.8333435058594, "loss": 0.5209, "rewards/accuracies": 1.0, "rewards/chosen": 0.27783203125, "rewards/margins": 0.3819986879825592, "rewards/rejected": -0.103759765625, "step": 116 }, { "epoch": 0.5569218564061881, "grad_norm": 2.256076170183229, "learning_rate": 4.875346541309636e-08, "logits/chosen": 1.9759114980697632, "logits/rejected": NaN, "logps/chosen": -1347.3333740234375, "logps/rejected": -333.3333435058594, "loss": 0.5226, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": 0.2803548276424408, "rewards/margins": 0.38427734375, "rewards/rejected": -0.1038004532456398, "step": 117 }, { "epoch": 0.5616818722729076, "grad_norm": 2.1024738378940504, "learning_rate": 4.792282503180867e-08, "logits/chosen": 1.8665364980697632, "logits/rejected": NaN, "logps/chosen": -1256.0, "logps/rejected": -280.6666564941406, "loss": 0.5211, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2827962338924408, "rewards/margins": 0.3873697817325592, "rewards/rejected": -0.10430908203125, "step": 118 }, { "epoch": 0.5664418881396271, "grad_norm": 2.3120916296769574, "learning_rate": 4.709275855447621e-08, "logits/chosen": 1.7356771230697632, "logits/rejected": NaN, "logps/chosen": -1440.6666259765625, "logps/rejected": -469.8333435058594, "loss": 0.5148, "rewards/accuracies": 1.0, "rewards/chosen": 0.2939453125, "rewards/margins": 0.4103190004825592, "rewards/rejected": -0.1161702498793602, "step": 119 }, { "epoch": 0.5712019040063467, "grad_norm": 2.1793904669362107, "learning_rate": 4.6263495320678784e-08, "logits/chosen": 1.9680989980697632, "logits/rejected": NaN, "logps/chosen": -1382.0, "logps/rejected": -320.25, "loss": 0.5156, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": 
0.2911783754825592, "rewards/margins": 0.3990885317325592, "rewards/rejected": -0.1077677384018898, "step": 120 }, { "epoch": 0.5759619198730662, "grad_norm": 2.217847138959823, "learning_rate": 4.543526444806759e-08, "logits/chosen": 1.9404296875, "logits/rejected": 1.2596029043197632, "logps/chosen": -1353.6666259765625, "logps/rejected": -852.0, "loss": 0.495, "rewards/accuracies": 1.0, "rewards/chosen": 0.3121744692325592, "rewards/margins": 0.4568684995174408, "rewards/rejected": -0.1444905549287796, "step": 121 }, { "epoch": 0.5807219357397858, "grad_norm": 2.2556436443719847, "learning_rate": 4.460829476906207e-08, "logits/chosen": 2.3072917461395264, "logits/rejected": NaN, "logps/chosen": -1423.6666259765625, "logps/rejected": -225.0833282470703, "loss": 0.4993, "rewards/accuracies": 1.0, "rewards/chosen": 0.3306477963924408, "rewards/margins": 0.4436849057674408, "rewards/rejected": -0.1129150390625, "step": 122 }, { "epoch": 0.5854819516065054, "grad_norm": 2.4496973638231547, "learning_rate": 4.3782814767625755e-08, "logits/chosen": 1.9225260019302368, "logits/rejected": NaN, "logps/chosen": -1690.3333740234375, "logps/rejected": -516.0833129882812, "loss": 0.5034, "rewards/accuracies": 1.0, "rewards/chosen": 0.3191731870174408, "rewards/margins": 0.44287109375, "rewards/rejected": -0.12384033203125, "step": 123 }, { "epoch": 0.5902419674732249, "grad_norm": 2.3479931534510783, "learning_rate": 4.295905251613817e-08, "logits/chosen": 1.806640625, "logits/rejected": NaN, "logps/chosen": -1628.6666259765625, "logps/rejected": -218.4166717529297, "loss": 0.5097, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": 0.3076985776424408, "rewards/margins": 0.4147135317325592, "rewards/rejected": -0.1068522110581398, "step": 124 }, { "epoch": 0.5950019833399445, "grad_norm": 2.3806255168384465, "learning_rate": 4.213723561238074e-08, "logits/chosen": 1.8951822519302368, "logits/rejected": NaN, "logps/chosen": -1689.3333740234375, "logps/rejected": 
-394.0, "loss": 0.4976, "rewards/accuracies": 1.0, "rewards/chosen": 0.3312174379825592, "rewards/margins": 0.4498697817325592, "rewards/rejected": -0.1182861328125, "step": 125 }, { "epoch": 0.5997619992066641, "grad_norm": 2.15828623531091, "learning_rate": 4.131759111665348e-08, "logits/chosen": 1.6080728769302368, "logits/rejected": NaN, "logps/chosen": -1303.6666259765625, "logps/rejected": -230.1666717529297, "loss": 0.4974, "rewards/accuracies": 1.0, "rewards/chosen": 0.3365071713924408, "rewards/margins": 0.4469400942325592, "rewards/rejected": -0.11065673828125, "step": 126 }, { "epoch": 0.6045220150733835, "grad_norm": 2.117376463270655, "learning_rate": 4.0500345489040513e-08, "logits/chosen": 1.7604166269302368, "logits/rejected": NaN, "logps/chosen": -1392.6666259765625, "logps/rejected": -411.1666564941406, "loss": 0.5065, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": 0.3119303286075592, "rewards/margins": 0.4251302182674408, "rewards/rejected": -0.1136067733168602, "step": 127 }, { "epoch": 0.6092820309401031, "grad_norm": 2.304820104315325, "learning_rate": 3.9685724526841126e-08, "logits/chosen": 1.86328125, "logits/rejected": NaN, "logps/chosen": -1548.6666259765625, "logps/rejected": -331.9166564941406, "loss": 0.4876, "rewards/accuracies": 1.0, "rewards/chosen": 0.3339029848575592, "rewards/margins": 0.4640299379825592, "rewards/rejected": -0.1304117888212204, "step": 128 }, { "epoch": 0.6140420468068227, "grad_norm": 1.8048639966913345, "learning_rate": 3.887395330218428e-08, "logits/chosen": 1.5784505605697632, "logits/rejected": NaN, "logps/chosen": -1254.6666259765625, "logps/rejected": -213.5, "loss": 0.5005, "rewards/accuracies": 1.0, "rewards/chosen": 0.3151041567325592, "rewards/margins": 0.4314778745174408, "rewards/rejected": -0.1164143905043602, "step": 129 }, { "epoch": 0.6188020626735422, "grad_norm": 2.3570364514405417, "learning_rate": 3.8065256099843115e-08, "logits/chosen": 2.11328125, "logits/rejected": NaN, 
"logps/chosen": -1585.3333740234375, "logps/rejected": -489.9583435058594, "loss": 0.4932, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": 0.3302408754825592, "rewards/margins": 0.4661458432674408, "rewards/rejected": -0.1361083984375, "step": 130 }, { "epoch": 0.6235620785402618, "grad_norm": 2.808262013514303, "learning_rate": 3.7259856355267275e-08, "logits/chosen": 2.390625, "logits/rejected": NaN, "logps/chosen": -1765.0, "logps/rejected": -400.75, "loss": 0.5049, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": 0.3160807192325592, "rewards/margins": 0.4415690004825592, "rewards/rejected": -0.1250813752412796, "step": 131 }, { "epoch": 0.6283220944069814, "grad_norm": 2.286497267688774, "learning_rate": 3.645797659284975e-08, "logits/chosen": 1.759765625, "logits/rejected": NaN, "logps/chosen": -1537.3333740234375, "logps/rejected": -192.25, "loss": 0.4949, "rewards/accuracies": 1.0, "rewards/chosen": 0.3328450620174408, "rewards/margins": 0.4503580629825592, "rewards/rejected": -0.1178792342543602, "step": 132 }, { "epoch": 0.6330821102737009, "grad_norm": 1.9113524972685845, "learning_rate": 3.56598383644455e-08, "logits/chosen": 1.8893228769302368, "logits/rejected": NaN, "logps/chosen": -1182.6666259765625, "logps/rejected": -212.25, "loss": 0.4939, "rewards/accuracies": 1.0, "rewards/chosen": 0.3352864682674408, "rewards/margins": 0.45947265625, "rewards/rejected": -0.1245524063706398, "step": 133 }, { "epoch": 0.6378421261404205, "grad_norm": 2.193102688331593, "learning_rate": 3.486566218815871e-08, "logits/chosen": 2.0107421875, "logits/rejected": NaN, "logps/chosen": -1234.0, "logps/rejected": -230.25, "loss": 0.4969, "rewards/accuracies": 1.0, "rewards/chosen": 0.326171875, "rewards/margins": 0.4485677182674408, "rewards/rejected": -0.1224772110581398, "step": 134 }, { "epoch": 0.64260214200714, "grad_norm": 2.0784106910572673, "learning_rate": 3.407566748741578e-08, "logits/chosen": 1.8997396230697632, "logits/rejected": NaN, 
"logps/chosen": -1351.3333740234375, "logps/rejected": -383.75, "loss": 0.4852, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": 0.3331705629825592, "rewards/margins": 0.48193359375, "rewards/rejected": -0.14892578125, "step": 135 }, { "epoch": 0.6473621578738595, "grad_norm": 1.9321500226818664, "learning_rate": 3.329007253034062e-08, "logits/chosen": 1.6796875, "logits/rejected": NaN, "logps/chosen": -1373.3333740234375, "logps/rejected": -673.5833129882812, "loss": 0.4815, "rewards/accuracies": 1.0, "rewards/chosen": 0.3312174379825592, "rewards/margins": 0.5047200322151184, "rewards/rejected": -0.1734822541475296, "step": 136 }, { "epoch": 0.6521221737405791, "grad_norm": 1.996048135547203, "learning_rate": 3.2509094369449276e-08, "logits/chosen": 1.765625, "logits/rejected": NaN, "logps/chosen": -1214.0, "logps/rejected": -314.8333435058594, "loss": 0.4861, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": 0.3360188901424408, "rewards/margins": 0.4677734375, "rewards/rejected": -0.1311442106962204, "step": 137 }, { "epoch": 0.6568821896072987, "grad_norm": 2.398499680880508, "learning_rate": 3.1732948781680246e-08, "logits/chosen": 2.24609375, "logits/rejected": NaN, "logps/chosen": -1620.6666259765625, "logps/rejected": -409.0833435058594, "loss": 0.4791, "rewards/accuracies": 1.0, "rewards/chosen": 0.3566080629825592, "rewards/margins": 0.4921875, "rewards/rejected": -0.1356608122587204, "step": 138 }, { "epoch": 0.6616422054740182, "grad_norm": 2.0131909940504205, "learning_rate": 3.096185020877752e-08, "logits/chosen": 2.1484375, "logits/rejected": NaN, "logps/chosen": -1380.3333740234375, "logps/rejected": -233.8333282470703, "loss": 0.4908, "rewards/accuracies": 1.0, "rewards/chosen": 0.3385416567325592, "rewards/margins": 0.4611002504825592, "rewards/rejected": -0.1222330704331398, "step": 139 }, { "epoch": 0.6664022213407378, "grad_norm": 1.747753302823929, "learning_rate": 3.0196011698042156e-08, "logits/chosen": 1.775390625, 
"logits/rejected": 1.3990885019302368, "logps/chosen": -1055.8333740234375, "logps/rejected": -290.4166564941406, "loss": 0.4865, "rewards/accuracies": 1.0, "rewards/chosen": 0.3341471254825592, "rewards/margins": 0.4755859375, "rewards/rejected": -0.1412760466337204, "step": 140 }, { "epoch": 0.6711622372074574, "grad_norm": 1.6738880638021751, "learning_rate": 2.9435644843469432e-08, "logits/chosen": 1.69140625, "logits/rejected": NaN, "logps/chosen": -1106.6666259765625, "logps/rejected": -361.3333435058594, "loss": 0.4833, "rewards/accuracies": 1.0, "rewards/chosen": 0.34716796875, "rewards/margins": 0.48974609375, "rewards/rejected": -0.14306640625, "step": 141 }, { "epoch": 0.6759222530741769, "grad_norm": 2.3037248507214776, "learning_rate": 2.868095972728731e-08, "logits/chosen": 1.759765625, "logits/rejected": NaN, "logps/chosen": -1547.3333740234375, "logps/rejected": -339.1666564941406, "loss": 0.4789, "rewards/accuracies": 1.0, "rewards/chosen": 0.3564453125, "rewards/margins": 0.486328125, "rewards/rejected": -0.1299641877412796, "step": 142 }, { "epoch": 0.6806822689408965, "grad_norm": 2.1036297721447967, "learning_rate": 2.79321648619128e-08, "logits/chosen": 1.69921875, "logits/rejected": NaN, "logps/chosen": -1345.6666259765625, "logps/rejected": -251.1666717529297, "loss": 0.476, "rewards/accuracies": 1.0, "rewards/chosen": 0.3704427182674408, "rewards/margins": 0.4988606870174408, "rewards/rejected": -0.1288655549287796, "step": 143 }, { "epoch": 0.685442284807616, "grad_norm": 2.169307583974489, "learning_rate": 2.7189467132341847e-08, "logits/chosen": 1.8224283456802368, "logits/rejected": NaN, "logps/chosen": -1462.0, "logps/rejected": -446.6666564941406, "loss": 0.4822, "rewards/accuracies": 1.0, "rewards/chosen": 0.34521484375, "rewards/margins": 0.4856770932674408, "rewards/rejected": -0.1404622346162796, "step": 144 }, { "epoch": 0.6902023006743356, "grad_norm": 2.0550176987990922, "learning_rate": 2.645307173898901e-08, "logits/chosen": 
1.8990885019302368, "logits/rejected": NaN, "logps/chosen": -1392.0, "logps/rejected": -569.5, "loss": 0.4623, "rewards/accuracies": 1.0, "rewards/chosen": 0.3741861879825592, "rewards/margins": 0.5369465947151184, "rewards/rejected": -0.1636149138212204, "step": 145 }, { "epoch": 0.6949623165410551, "grad_norm": 1.9559413913722827, "learning_rate": 2.5723182140992383e-08, "logits/chosen": 1.6575521230697632, "logits/rejected": NaN, "logps/chosen": -1391.6666259765625, "logps/rejected": -486.4583435058594, "loss": 0.4781, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": 0.34716796875, "rewards/margins": 0.4957682192325592, "rewards/rejected": -0.14892578125, "step": 146 }, { "epoch": 0.6997223324077747, "grad_norm": 1.77184690368037, "learning_rate": 2.500000000000001e-08, "logits/chosen": 1.8196614980697632, "logits/rejected": NaN, "logps/chosen": -1120.3333740234375, "logps/rejected": -378.0416564941406, "loss": 0.4746, "rewards/accuracies": 1.0, "rewards/chosen": 0.3556315004825592, "rewards/margins": 0.5065104365348816, "rewards/rejected": -0.1513671875, "step": 147 }, { "epoch": 0.7044823482744943, "grad_norm": 2.3332126766504016, "learning_rate": 2.4283725124452327e-08, "logits/chosen": 1.7135416269302368, "logits/rejected": NaN, "logps/chosen": -1617.0, "logps/rejected": -409.25, "loss": 0.4777, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": 0.35986328125, "rewards/margins": 0.5079752802848816, "rewards/rejected": -0.147705078125, "step": 148 }, { "epoch": 0.7092423641412138, "grad_norm": 1.795140752408747, "learning_rate": 2.3574555414377228e-08, "logits/chosen": 1.8294271230697632, "logits/rejected": NaN, "logps/chosen": -1275.3333740234375, "logps/rejected": -272.125, "loss": 0.4873, "rewards/accuracies": 1.0, "rewards/chosen": 0.337158203125, "rewards/margins": 0.4729817807674408, "rewards/rejected": -0.1357421875, "step": 149 }, { "epoch": 0.7140023800079334, "grad_norm": 1.952752981212216, "learning_rate": 
2.2872686806712033e-08, "logits/chosen": 1.7740885019302368, "logits/rejected": NaN, "logps/chosen": -1366.6666259765625, "logps/rejected": -290.1666564941406, "loss": 0.482, "rewards/accuracies": 1.0, "rewards/chosen": 0.3479817807674408, "rewards/margins": 0.4830729067325592, "rewards/rejected": -0.1351318359375, "step": 150 }, { "epoch": 0.718762395874653, "grad_norm": 1.789618404415905, "learning_rate": 2.2178313221167965e-08, "logits/chosen": 1.8151041269302368, "logits/rejected": NaN, "logps/chosen": -1119.3333740234375, "logps/rejected": -217.75, "loss": 0.4856, "rewards/accuracies": 1.0, "rewards/chosen": 0.3388671875, "rewards/margins": 0.4736328125, "rewards/rejected": -0.1352132111787796, "step": 151 }, { "epoch": 0.7235224117413724, "grad_norm": 2.4468941870774255, "learning_rate": 2.1491626506651912e-08, "logits/chosen": 2.0755207538604736, "logits/rejected": NaN, "logps/chosen": -1539.3333740234375, "logps/rejected": -324.0, "loss": 0.4802, "rewards/accuracies": 1.0, "rewards/chosen": 0.3562825620174408, "rewards/margins": 0.4931640625, "rewards/rejected": -0.136474609375, "step": 152 }, { "epoch": 0.728282427608092, "grad_norm": 1.8555464035051497, "learning_rate": 2.0812816388260517e-08, "logits/chosen": 2.1106770038604736, "logits/rejected": NaN, "logps/chosen": -1203.0, "logps/rejected": -651.1666870117188, "loss": 0.4611, "rewards/accuracies": 1.0, "rewards/chosen": 0.3645833432674408, "rewards/margins": 0.5540364384651184, "rewards/rejected": -0.1890462189912796, "step": 153 }, { "epoch": 0.7330424434748116, "grad_norm": 2.129230370787164, "learning_rate": 2.0142070414860702e-08, "logits/chosen": 1.779296875, "logits/rejected": NaN, "logps/chosen": -1340.0, "logps/rejected": -256.25, "loss": 0.4843, "rewards/accuracies": 1.0, "rewards/chosen": 0.3509114682674408, "rewards/margins": 0.4837239682674408, "rewards/rejected": -0.1325276643037796, "step": 154 }, { "epoch": 0.7378024593415311, "grad_norm": 1.8326859248201792, "learning_rate": 
1.9479573907271845e-08, "logits/chosen": 2.130859375, "logits/rejected": NaN, "logps/chosen": -1219.3333740234375, "logps/rejected": -333.0, "loss": 0.484, "rewards/accuracies": 1.0, "rewards/chosen": 0.3313802182674408, "rewards/margins": 0.4890950620174408, "rewards/rejected": -0.15771484375, "step": 155 }, { "epoch": 0.7425624752082507, "grad_norm": 2.43553161056017, "learning_rate": 1.8825509907063325e-08, "logits/chosen": 1.99609375, "logits/rejected": NaN, "logps/chosen": -1680.3333740234375, "logps/rejected": -209.6666717529297, "loss": 0.4848, "rewards/accuracies": 1.0, "rewards/chosen": 0.3712565004825592, "rewards/margins": 0.486328125, "rewards/rejected": -0.1145426407456398, "step": 156 }, { "epoch": 0.7473224910749703, "grad_norm": 1.7213663847904304, "learning_rate": 1.818005912598182e-08, "logits/chosen": 1.6399739980697632, "logits/rejected": NaN, "logps/chosen": -1296.3333740234375, "logps/rejected": -779.25, "loss": 0.4594, "rewards/accuracies": 1.0, "rewards/chosen": 0.3583984375, "rewards/margins": 0.5672200322151184, "rewards/rejected": -0.2093098908662796, "step": 157 }, { "epoch": 0.7520825069416898, "grad_norm": 2.138543660798964, "learning_rate": 1.7543399896022403e-08, "logits/chosen": 2.0455729961395264, "logits/rejected": NaN, "logps/chosen": -1411.3333740234375, "logps/rejected": -248.5, "loss": 0.4781, "rewards/accuracies": 1.0, "rewards/chosen": 0.3616536557674408, "rewards/margins": 0.4970703125, "rewards/rejected": -0.1357421875, "step": 158 }, { "epoch": 0.7568425228084094, "grad_norm": 1.903956596321935, "learning_rate": 1.691570812015704e-08, "logits/chosen": 1.8932291269302368, "logits/rejected": NaN, "logps/chosen": -1360.0, "logps/rejected": -421.3333435058594, "loss": 0.4789, "rewards/accuracies": 1.0, "rewards/chosen": 0.3427734375, "rewards/margins": 0.5009765625, "rewards/rejected": -0.1580403596162796, "step": 159 }, { "epoch": 0.761602538675129, "grad_norm": 1.8076086685577293, "learning_rate": 1.6297157223734225e-08, 
"logits/chosen": 1.8173828125, "logits/rejected": NaN, "logps/chosen": -1186.0, "logps/rejected": -303.8333435058594, "loss": 0.4796, "rewards/accuracies": 1.0, "rewards/chosen": 0.35595703125, "rewards/margins": 0.4944661557674408, "rewards/rejected": -0.1382242888212204, "step": 160 }, { "epoch": 0.7663625545418484, "grad_norm": 2.506667550726532, "learning_rate": 1.5687918106563324e-08, "logits/chosen": 1.8919271230697632, "logits/rejected": NaN, "logps/chosen": -1604.6666259765625, "logps/rejected": -204.75, "loss": 0.4746, "rewards/accuracies": 1.0, "rewards/chosen": 0.3763020932674408, "rewards/margins": 0.5084635615348816, "rewards/rejected": -0.1322835236787796, "step": 161 }, { "epoch": 0.771122570408568, "grad_norm": 2.2406757166716016, "learning_rate": 1.5088159095696363e-08, "logits/chosen": 1.8639322519302368, "logits/rejected": NaN, "logps/chosen": -1440.3333740234375, "logps/rejected": -462.75, "loss": 0.4573, "rewards/accuracies": 1.0, "rewards/chosen": 0.3792317807674408, "rewards/margins": 0.5618489384651184, "rewards/rejected": -0.1825764924287796, "step": 162 }, { "epoch": 0.7758825862752876, "grad_norm": 2.0966224181917315, "learning_rate": 1.4498045898920986e-08, "logits/chosen": 1.70703125, "logits/rejected": NaN, "logps/chosen": -1400.3333740234375, "logps/rejected": -519.6666870117188, "loss": 0.4605, "rewards/accuracies": 1.0, "rewards/chosen": 0.38330078125, "rewards/margins": 0.5519205927848816, "rewards/rejected": -0.1690266877412796, "step": 163 }, { "epoch": 0.7806426021420071, "grad_norm": 2.1990999218901957, "learning_rate": 1.3917741558976892e-08, "logits/chosen": 1.8307291269302368, "logits/rejected": NaN, "logps/chosen": -1525.0, "logps/rejected": -323.9166564941406, "loss": 0.4728, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": 0.3665364682674408, "rewards/margins": 0.5110676884651184, "rewards/rejected": -0.1442464143037796, "step": 164 }, { "epoch": 0.7854026180087267, "grad_norm": 1.8977299979885642, 
"learning_rate": 1.3347406408508693e-08, "logits/chosen": 1.9837239980697632, "logits/rejected": null, "logps/chosen": -1439.0, "logps/rejected": -407.9166564941406, "loss": 0.4755, "rewards/accuracies": 1.0, "rewards/chosen": 0.3600260317325592, "rewards/margins": 0.51171875, "rewards/rejected": -0.151123046875, "step": 165 }, { "epoch": 0.7901626338754463, "grad_norm": 2.505572334038866, "learning_rate": 1.2787198025767415e-08, "logits/chosen": 2.1725261211395264, "logits/rejected": null, "logps/chosen": -1913.6666259765625, "logps/rejected": -311.5208435058594, "loss": 0.4691, "rewards/accuracies": 1.0, "rewards/chosen": 0.384765625, "rewards/margins": 0.52978515625, "rewards/rejected": -0.1443888396024704, "step": 166 }, { "epoch": 0.7949226497421658, "grad_norm": 2.436963521338477, "learning_rate": 1.2237271191073268e-08, "logits/chosen": 2.14453125, "logits/rejected": null, "logps/chosen": -1648.3333740234375, "logps/rejected": -361.0, "loss": 0.4705, "rewards/accuracies": 1.0, "rewards/chosen": 0.3693033754825592, "rewards/margins": 0.5159505009651184, "rewards/rejected": -0.1475016325712204, "step": 167 }, { "epoch": 0.7996826656088853, "grad_norm": 2.1545871993874997, "learning_rate": 1.1697777844051105e-08, "logits/chosen": 2.181640625, "logits/rejected": null, "logps/chosen": -1643.0, "logps/rejected": -659.75, "loss": 0.4708, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": 0.3497721254825592, "rewards/margins": 0.53369140625, "rewards/rejected": -0.183837890625, "step": 168 }, { "epoch": 0.8044426814756049, "grad_norm": 1.9783928114364981, "learning_rate": 1.1168867041651081e-08, "logits/chosen": 2.0305988788604736, "logits/rejected": null, "logps/chosen": -1307.3333740234375, "logps/rejected": -385.375, "loss": 0.4649, "rewards/accuracies": 1.0, "rewards/chosen": 0.3761393129825592, "rewards/margins": 0.5364583134651184, "rewards/rejected": -0.1610514372587204, "step": 169 }, { "epoch": 0.8092026973423245, "grad_norm": 2.048419402752998, 
"learning_rate": 1.0650684916965558e-08, "logits/chosen": 1.5989583730697632, "logits/rejected": null, "logps/chosen": -1502.3333740234375, "logps/rejected": -197.5, "loss": 0.4778, "rewards/accuracies": 1.0, "rewards/chosen": 0.35791015625, "rewards/margins": 0.48974609375, "rewards/rejected": -0.1322224885225296, "step": 170 }, { "epoch": 0.813962713209044, "grad_norm": 1.9383784775969035, "learning_rate": 1.0143374638853892e-08, "logits/chosen": 1.6829427480697632, "logits/rejected": null, "logps/chosen": -1231.0, "logps/rejected": -525.6666870117188, "loss": 0.4662, "rewards/accuracies": 1.0, "rewards/chosen": 0.3564453125, "rewards/margins": 0.53271484375, "rewards/rejected": -0.1761881560087204, "step": 171 }, { "epoch": 0.8187227290757636, "grad_norm": 2.11654254419848, "learning_rate": 9.647076372386193e-09, "logits/chosen": 1.6419271230697632, "logits/rejected": null, "logps/chosen": -1409.0, "logps/rejected": -275.1666564941406, "loss": 0.4746, "rewards/accuracies": 1.0, "rewards/chosen": 0.3515625, "rewards/margins": 0.5096028447151184, "rewards/rejected": -0.1581624299287796, "step": 172 }, { "epoch": 0.8234827449424832, "grad_norm": 2.031187504683685, "learning_rate": 9.161927240117174e-09, "logits/chosen": 1.7906900644302368, "logits/rejected": null, "logps/chosen": -1438.3333740234375, "logps/rejected": -287.0833435058594, "loss": 0.493, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": 0.3235677182674408, "rewards/margins": 0.4562174379825592, "rewards/rejected": -0.1332194060087204, "step": 173 }, { "epoch": 0.8282427608092027, "grad_norm": 1.7371218629594014, "learning_rate": 8.688061284200266e-09, "logits/chosen": 1.873046875, "logits/rejected": null, "logps/chosen": -1163.6666259765625, "logps/rejected": -443.25, "loss": 0.4719, "rewards/accuracies": 1.0, "rewards/chosen": 0.3527018129825592, "rewards/margins": 0.5113932490348816, "rewards/rejected": -0.1587321013212204, "step": 174 }, { "epoch": 0.8330027766759223, "grad_norm": 
2.137935345105564, "learning_rate": 8.225609429353186e-09, "logits/chosen": 1.9832357168197632, "logits/rejected": null, "logps/chosen": -1427.6666259765625, "logps/rejected": -440.0, "loss": 0.4677, "rewards/accuracies": 1.0, "rewards/chosen": 0.3673502504825592, "rewards/margins": 0.5314127802848816, "rewards/rejected": -0.163330078125, "step": 175 }, { "epoch": 0.8377627925426419, "grad_norm": 2.2149373272583577, "learning_rate": 7.774699446684607e-09, "logits/chosen": 1.810546875, "logits/rejected": null, "logps/chosen": -1571.3333740234375, "logps/rejected": -198.25, "loss": 0.4745, "rewards/accuracies": 1.0, "rewards/chosen": 0.3662109375, "rewards/margins": 0.5026041865348816, "rewards/rejected": -0.1363932341337204, "step": 176 }, { "epoch": 0.8425228084093613, "grad_norm": 1.6437345264974124, "learning_rate": 7.335455918392219e-09, "logits/chosen": 1.6298828125, "logits/rejected": null, "logps/chosen": -1051.3333740234375, "logps/rejected": -219.25, "loss": 0.4659, "rewards/accuracies": 1.0, "rewards/chosen": 0.3740234375, "rewards/margins": 0.53076171875, "rewards/rejected": -0.15673828125, "step": 177 }, { "epoch": 0.8472828242760809, "grad_norm": 2.2648226740875437, "learning_rate": 6.908000203341802e-09, "logits/chosen": 1.7578125, "logits/rejected": null, "logps/chosen": -1593.3333740234375, "logps/rejected": -556.6666870117188, "loss": 0.4665, "rewards/accuracies": 1.0, "rewards/chosen": 0.3670247495174408, "rewards/margins": 0.5305989384651184, "rewards/rejected": -0.1636962890625, "step": 178 }, { "epoch": 0.8520428401428005, "grad_norm": 1.8643505256455393, "learning_rate": 6.492450403536959e-09, "logits/chosen": 1.88671875, "logits/rejected": null, "logps/chosen": -1274.3333740234375, "logps/rejected": -242.5833282470703, "loss": 0.4638, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": 0.3751627504825592, "rewards/margins": 0.5281575322151184, "rewards/rejected": -0.1529947966337204, "step": 179 }, { "epoch": 0.85680285600952, "grad_norm": 
1.8118886195560266, "learning_rate": 6.088921331488567e-09, "logits/chosen": 1.72265625, "logits/rejected": null, "logps/chosen": -1327.3333740234375, "logps/rejected": -337.1666564941406, "loss": 0.4771, "rewards/accuracies": 1.0, "rewards/chosen": 0.3557942807674408, "rewards/margins": 0.50927734375, "rewards/rejected": -0.1533610075712204, "step": 180 }, { "epoch": 0.8615628718762396, "grad_norm": 1.7763301753189165, "learning_rate": 5.697524478493287e-09, "logits/chosen": 1.7522786855697632, "logits/rejected": null, "logps/chosen": -1195.6666259765625, "logps/rejected": -213.6666717529297, "loss": 0.47, "rewards/accuracies": 1.0, "rewards/chosen": 0.3663736879825592, "rewards/margins": 0.5123698115348816, "rewards/rejected": -0.14599609375, "step": 181 }, { "epoch": 0.8663228877429592, "grad_norm": 1.4514297510493193, "learning_rate": 5.3183679838293915e-09, "logits/chosen": 1.4544271230697632, "logits/rejected": null, "logps/chosen": -1031.6666259765625, "logps/rejected": -418.75, "loss": 0.4691, "rewards/accuracies": 1.0, "rewards/chosen": 0.3513997495174408, "rewards/margins": 0.5218098759651184, "rewards/rejected": -0.1700846403837204, "step": 182 }, { "epoch": 0.8710829036096787, "grad_norm": 2.1059124601925374, "learning_rate": 4.951556604879048e-09, "logits/chosen": 1.7239583730697632, "logits/rejected": null, "logps/chosen": -1482.3333740234375, "logps/rejected": -267.8333435058594, "loss": 0.4813, "rewards/accuracies": 1.0, "rewards/chosen": 0.3521321713924408, "rewards/margins": 0.4930013120174408, "rewards/rejected": -0.1409912109375, "step": 183 }, { "epoch": 0.8758429194763983, "grad_norm": 2.4415439647837265, "learning_rate": 4.597191688184754e-09, "logits/chosen": 1.7776693105697632, "logits/rejected": null, "logps/chosen": -1659.3333740234375, "logps/rejected": -300.9166564941406, "loss": 0.4708, "rewards/accuracies": 1.0, "rewards/chosen": 0.37109375, "rewards/margins": 0.51513671875, "rewards/rejected": -0.1446126252412796, "step": 184 }, { 
"epoch": 0.8806029353431178, "grad_norm": 1.8529794337457115, "learning_rate": 4.255371141448272e-09, "logits/chosen": 1.8059896230697632, "logits/rejected": null, "logps/chosen": -1316.0, "logps/rejected": -379.5, "loss": 0.4629, "rewards/accuracies": 1.0, "rewards/chosen": 0.3702799379825592, "rewards/margins": 0.54248046875, "rewards/rejected": -0.1726887971162796, "step": 185 }, { "epoch": 0.8853629512098373, "grad_norm": 2.1226991701058293, "learning_rate": 3.926189406479613e-09, "logits/chosen": 1.791015625, "logits/rejected": null, "logps/chosen": -1405.3333740234375, "logps/rejected": -452.8333435058594, "loss": 0.4643, "rewards/accuracies": 1.0, "rewards/chosen": 0.3753255307674408, "rewards/margins": 0.54052734375, "rewards/rejected": -0.1650390625, "step": 186 }, { "epoch": 0.8901229670765569, "grad_norm": 2.30275268133539, "learning_rate": 3.609737433103732e-09, "logits/chosen": 2.056640625, "logits/rejected": null, "logps/chosen": -1542.6666259765625, "logps/rejected": -506.0, "loss": 0.4651, "rewards/accuracies": 1.0, "rewards/chosen": 0.36181640625, "rewards/margins": 0.5431315302848816, "rewards/rejected": -0.181884765625, "step": 187 }, { "epoch": 0.8948829829432765, "grad_norm": 2.0856238119236163, "learning_rate": 3.3061026540318227e-09, "logits/chosen": 2.0104167461395264, "logits/rejected": null, "logps/chosen": -1519.3333740234375, "logps/rejected": -205.5, "loss": 0.4808, "rewards/accuracies": 1.0, "rewards/chosen": 0.3570963442325592, "rewards/margins": 0.4864908754825592, "rewards/rejected": -0.1291910856962204, "step": 188 }, { "epoch": 0.899642998809996, "grad_norm": 2.1869075364111135, "learning_rate": 3.015368960704584e-09, "logits/chosen": 2.2467448711395264, "logits/rejected": null, "logps/chosen": -1450.3333740234375, "logps/rejected": -357.0833435058594, "loss": 0.4728, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": 0.356689453125, "rewards/margins": 0.50927734375, "rewards/rejected": -0.1526285856962204, "step": 189 }, { 
"epoch": 0.9044030146767156, "grad_norm": 1.9167057092483704, "learning_rate": 2.737616680113758e-09, "logits/chosen": 1.978515625, "logits/rejected": null, "logps/chosen": -1395.0, "logps/rejected": -656.5, "loss": 0.4674, "rewards/accuracies": 1.0, "rewards/chosen": 0.3466796875, "rewards/margins": 0.54443359375, "rewards/rejected": -0.1977946013212204, "step": 190 }, { "epoch": 0.9091630305434352, "grad_norm": 2.2088575890028, "learning_rate": 2.4729225526085585e-09, "logits/chosen": 1.8391927480697632, "logits/rejected": null, "logps/chosen": -1550.0, "logps/rejected": -212.4166717529297, "loss": 0.4724, "rewards/accuracies": 1.0, "rewards/chosen": 0.3707682192325592, "rewards/margins": 0.5139973759651184, "rewards/rejected": -0.1436360627412796, "step": 191 }, { "epoch": 0.9139230464101546, "grad_norm": 1.6175308557989694, "learning_rate": 2.2213597106929607e-09, "logits/chosen": 1.7259114980697632, "logits/rejected": null, "logps/chosen": -1176.3333740234375, "logps/rejected": -336.0, "loss": 0.4801, "rewards/accuracies": 1.0, "rewards/chosen": 0.3434244692325592, "rewards/margins": 0.4947916567325592, "rewards/rejected": -0.1509602814912796, "step": 192 }, { "epoch": 0.9186830622768742, "grad_norm": 1.885584304766745, "learning_rate": 1.9829976588200126e-09, "logits/chosen": 1.892578125, "logits/rejected": null, "logps/chosen": -1342.6666259765625, "logps/rejected": -364.3333435058594, "loss": 0.4707, "rewards/accuracies": 1.0, "rewards/chosen": 0.3595377504825592, "rewards/margins": 0.5169270634651184, "rewards/rejected": -0.1571858674287796, "step": 193 }, { "epoch": 0.9234430781435938, "grad_norm": 2.294659887781663, "learning_rate": 1.7579022541882539e-09, "logits/chosen": 1.986328125, "logits/rejected": null, "logps/chosen": -1536.3333740234375, "logps/rejected": -279.9166564941406, "loss": 0.4647, "rewards/accuracies": 1.0, "rewards/chosen": 0.3738606870174408, "rewards/margins": 0.52587890625, "rewards/rejected": -0.1516520231962204, "step": 194 }, { 
"epoch": 0.9282030940103134, "grad_norm": 2.060577728033778, "learning_rate": 1.5461356885461075e-09, "logits/chosen": 1.8860677480697632, "logits/rejected": null, "logps/chosen": -1427.0, "logps/rejected": -492.9166564941406, "loss": 0.4543, "rewards/accuracies": 1.0, "rewards/chosen": 0.3899739682674408, "rewards/margins": 0.5662434697151184, "rewards/rejected": -0.1758219450712204, "step": 195 }, { "epoch": 0.9329631098770329, "grad_norm": 2.098117892116623, "learning_rate": 1.3477564710088097e-09, "logits/chosen": 1.6959635019302368, "logits/rejected": null, "logps/chosen": -1369.6666259765625, "logps/rejected": -407.1666564941406, "loss": 0.463, "rewards/accuracies": 1.0, "rewards/chosen": 0.3759765625, "rewards/margins": 0.54541015625, "rewards/rejected": -0.1689860075712204, "step": 196 }, { "epoch": 0.9377231257437525, "grad_norm": 2.157174977293862, "learning_rate": 1.1628194118929402e-09, "logits/chosen": 2.107421875, "logits/rejected": null, "logps/chosen": -1396.3333740234375, "logps/rejected": -231.3333282470703, "loss": 0.4862, "rewards/accuracies": 1.0, "rewards/chosen": 0.345947265625, "rewards/margins": 0.4767252504825592, "rewards/rejected": -0.1308797150850296, "step": 197 }, { "epoch": 0.9424831416104721, "grad_norm": 1.8625361719410045, "learning_rate": 9.913756075728085e-10, "logits/chosen": 1.9921875, "logits/rejected": null, "logps/chosen": -1281.6666259765625, "logps/rejected": -779.25, "loss": 0.4592, "rewards/accuracies": 1.0, "rewards/chosen": 0.3557942807674408, "rewards/margins": 0.572265625, "rewards/rejected": -0.2164713591337204, "step": 198 }, { "epoch": 0.9472431574771916, "grad_norm": 1.90411986231157, "learning_rate": 8.334724263630299e-10, "logits/chosen": 1.927734375, "logits/rejected": null, "logps/chosen": -1244.0, "logps/rejected": -475.8333435058594, "loss": 0.457, "rewards/accuracies": 1.0, "rewards/chosen": 0.3772786557674408, "rewards/margins": 0.5545247197151184, "rewards/rejected": -0.1775716096162796, "step": 199 }, { 
"epoch": 0.9520031733439112, "grad_norm": 2.2273492902165857, "learning_rate": 6.891534954310885e-10, "logits/chosen": 1.8365885019302368, "logits/rejected": null, "logps/chosen": -1464.6666259765625, "logps/rejected": -230.8333282470703, "loss": 0.4679, "rewards/accuracies": 1.0, "rewards/chosen": 0.3821614682674408, "rewards/margins": 0.52490234375, "rewards/rejected": -0.1425374299287796, "step": 200 }, { "epoch": 0.9567631892106307, "grad_norm": 1.9356040315595135, "learning_rate": 5.584586887435739e-10, "logits/chosen": 1.8515625, "logits/rejected": null, "logps/chosen": -1235.0, "logps/rejected": -473.3333435058594, "loss": 0.4593, "rewards/accuracies": 1.0, "rewards/chosen": 0.3761393129825592, "rewards/margins": 0.5594075322151184, "rewards/rejected": -0.18310546875, "step": 201 }, { "epoch": 0.9615232050773502, "grad_norm": 2.0045721624389117, "learning_rate": 4.414241160493659e-10, "logits/chosen": 1.513671875, "logits/rejected": null, "logps/chosen": -1449.6666259765625, "logps/rejected": -620.375, "loss": 0.4642, "rewards/accuracies": 1.0, "rewards/chosen": 0.3624674379825592, "rewards/margins": 0.53857421875, "rewards/rejected": -0.1766154021024704, "step": 202 }, { "epoch": 0.9662832209440698, "grad_norm": 2.258022376849562, "learning_rate": 3.3808211290284883e-10, "logits/chosen": 2.310546875, "logits/rejected": null, "logps/chosen": -1613.6666259765625, "logps/rejected": -673.5, "loss": 0.4624, "rewards/accuracies": 1.0, "rewards/chosen": 0.36181640625, "rewards/margins": 0.53857421875, "rewards/rejected": -0.1770833283662796, "step": 203 }, { "epoch": 0.9710432368107894, "grad_norm": 2.0920594811785582, "learning_rate": 2.484612317299295e-10, "logits/chosen": 1.9537760019302368, "logits/rejected": null, "logps/chosen": -1398.0, "logps/rejected": -373.9166564941406, "loss": 0.4568, "rewards/accuracies": 1.0, "rewards/chosen": 0.3958333432674408, "rewards/margins": 0.5577799677848816, "rewards/rejected": -0.1617838591337204, "step": 204 }, { "epoch": 
0.9758032526775089, "grad_norm": 2.5414097510677287, "learning_rate": 1.7258623393922588e-10, "logits/chosen": 2.0263671875, "logits/rejected": null, "logps/chosen": -1744.6666259765625, "logps/rejected": -483.7708435058594, "loss": 0.4702, "rewards/accuracies": 1.0, "rewards/chosen": 0.3580729067325592, "rewards/margins": 0.5281575322151184, "rewards/rejected": -0.1700846403837204, "step": 205 }, { "epoch": 0.9805632685442285, "grad_norm": 2.3268108178524383, "learning_rate": 1.1047808308075057e-10, "logits/chosen": 2.20703125, "logits/rejected": null, "logps/chosen": -1482.3333740234375, "logps/rejected": -611.0, "loss": 0.4564, "rewards/accuracies": 1.0, "rewards/chosen": 0.3743489682674408, "rewards/margins": 0.55517578125, "rewards/rejected": -0.1808268278837204, "step": 206 }, { "epoch": 0.9853232844109481, "grad_norm": 2.096746342425909, "learning_rate": 6.215393905388277e-11, "logits/chosen": 1.955078125, "logits/rejected": null, "logps/chosen": -1452.0, "logps/rejected": -208.4166717529297, "loss": 0.477, "rewards/accuracies": 1.0, "rewards/chosen": 0.3597005307674408, "rewards/margins": 0.5052083134651184, "rewards/rejected": -0.1449788361787796, "step": 207 }, { "epoch": 0.9900833002776676, "grad_norm": 2.048833798956088, "learning_rate": 2.7627153366222012e-11, "logits/chosen": 2.0823566913604736, "logits/rejected": null, "logps/chosen": -1296.6666259765625, "logps/rejected": -302.5833435058594, "loss": 0.4793, "rewards/accuracies": 1.0, "rewards/chosen": 0.3583984375, "rewards/margins": 0.4939778745174408, "rewards/rejected": -0.1356608122587204, "step": 208 }, { "epoch": 0.9948433161443871, "grad_norm": 2.1643804376502103, "learning_rate": 6.907265444716648e-12, "logits/chosen": 2.162109375, "logits/rejected": null, "logps/chosen": -1641.1666259765625, "logps/rejected": -360.5833435058594, "loss": 0.4901, "rewards/accuracies": 1.0, "rewards/chosen": 0.3214518129825592, "rewards/margins": 0.4715169370174408, "rewards/rejected": -0.1501871794462204, "step": 
209 }, { "epoch": 0.9996033320111067, "grad_norm": 1.7319154590819232, "learning_rate": 0.0, "logits/chosen": 1.9921875, "logits/rejected": null, "logps/chosen": -1177.3333740234375, "logps/rejected": -452.6666564941406, "loss": 0.4729, "rewards/accuracies": 1.0, "rewards/chosen": 0.3430989682674408, "rewards/margins": 0.5192057490348816, "rewards/rejected": -0.17626953125, "step": 210 } ], "logging_steps": 1, "max_steps": 210, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }