diff --git "a/doo_poo_onion/900/trainer_state.json" "b/doo_poo_onion/900/trainer_state.json" new file mode 100644--- /dev/null +++ "b/doo_poo_onion/900/trainer_state.json" @@ -0,0 +1,4534 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.7729214590864277, + "eval_steps": 500, + "global_step": 900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005915215248110417, + "grad_norm": 75.64228057861328, + "learning_rate": 6.535947712418302e-08, + "logits/chosen": 0.06831549108028412, + "logits/rejected": 0.22947487235069275, + "logps/chosen": -288.36907958984375, + "logps/rejected": -308.97442626953125, + "loss": 0.6936, + "rewards/accuracies": 0.15740741789340973, + "rewards/chosen": 0.0018866577884182334, + "rewards/margins": 0.00013007604866288602, + "rewards/rejected": 0.0017565814778208733, + "step": 3 + }, + { + "epoch": 0.011830430496220835, + "grad_norm": 85.5863265991211, + "learning_rate": 1.6339869281045755e-07, + "logits/chosen": 0.1193937212228775, + "logits/rejected": 0.14183634519577026, + "logps/chosen": -297.21484375, + "logps/rejected": -289.2657165527344, + "loss": 0.6933, + "rewards/accuracies": 0.513888955116272, + "rewards/chosen": 0.00845375657081604, + "rewards/margins": 0.003605001140385866, + "rewards/rejected": 0.004848754033446312, + "step": 6 + }, + { + "epoch": 0.017745645744331254, + "grad_norm": 92.01388549804688, + "learning_rate": 2.6143790849673207e-07, + "logits/chosen": 0.013294734060764313, + "logits/rejected": 0.10444601625204086, + "logps/chosen": -293.22174072265625, + "logps/rejected": -296.61444091796875, + "loss": 0.6879, + "rewards/accuracies": 0.5601852536201477, + "rewards/chosen": 0.0001924792304635048, + "rewards/margins": 0.014045190066099167, + "rewards/rejected": -0.013852710835635662, + "step": 9 + }, + { + "epoch": 0.02366086099244167, + "grad_norm": 63.192481994628906, + "learning_rate": 3.5947712418300653e-07, + "logits/chosen": 0.1236824318766594, + "logits/rejected": 0.056549232453107834, + "logps/chosen": -284.45782470703125, + "logps/rejected": -284.9747009277344, + "loss": 0.6899, + "rewards/accuracies": 0.5370370149612427, + "rewards/chosen": 0.007598253898322582, + "rewards/margins": 0.011467371135950089, + "rewards/rejected": -0.003869118168950081, + "step": 12 + }, + { + "epoch": 0.02957607624055209, + "grad_norm": 52.87683868408203, + "learning_rate": 4.5751633986928105e-07, + "logits/chosen": 0.1870713233947754, + "logits/rejected": 0.17200356721878052, + "logps/chosen": -278.11785888671875, + "logps/rejected": -288.7649841308594, + "loss": 0.6823, + "rewards/accuracies": 0.5694444179534912, + "rewards/chosen": 0.018766043707728386, + "rewards/margins": 0.029558217152953148, + "rewards/rejected": -0.010792172513902187, + "step": 15 + }, + { + "epoch": 0.03549129148866251, + "grad_norm": 47.12651443481445, + "learning_rate": 5.555555555555555e-07, + "logits/chosen": 0.2681354880332947, + "logits/rejected": 0.2608181834220886, + "logps/chosen": -299.3112487792969, + "logps/rejected": -293.26654052734375, + "loss": 0.672, + "rewards/accuracies": 0.5972222685813904, + "rewards/chosen": 0.013350310735404491, + "rewards/margins": 0.058473195880651474, + "rewards/rejected": -0.04512288421392441, + "step": 18 + }, + { + "epoch": 0.04140650673677292, + "grad_norm": 46.12940216064453, + "learning_rate": 6.535947712418302e-07, + "logits/chosen": 0.21857470273971558, + "logits/rejected": 0.1517491191625595, + "logps/chosen": -297.955078125, + "logps/rejected": -291.7802429199219, + "loss": 0.6583, + "rewards/accuracies": 0.5787036418914795, + "rewards/chosen": -0.0388781875371933, + "rewards/margins": 0.12246696650981903, + "rewards/rejected": -0.16134515404701233, + "step": 21 + }, + { + "epoch": 0.04732172198488334, + "grad_norm": 43.86560821533203, + "learning_rate": 7.516339869281046e-07, + "logits/chosen": 0.24428892135620117, + "logits/rejected": 0.2012861967086792, + "logps/chosen": -294.25177001953125, + "logps/rejected": -295.1514892578125, + "loss": 0.658, + "rewards/accuracies": 0.6481481790542603, + "rewards/chosen": 0.02628326043486595, + "rewards/margins": 0.15634408593177795, + "rewards/rejected": -0.1300608068704605, + "step": 24 + }, + { + "epoch": 0.053236937232993754, + "grad_norm": 34.92121124267578, + "learning_rate": 8.496732026143792e-07, + "logits/chosen": 0.19387498497962952, + "logits/rejected": 0.2192877233028412, + "logps/chosen": -297.76287841796875, + "logps/rejected": -298.34576416015625, + "loss": 0.6738, + "rewards/accuracies": 0.6064814925193787, + "rewards/chosen": -0.02720721624791622, + "rewards/margins": 0.20852135121822357, + "rewards/rejected": -0.23572856187820435, + "step": 27 + }, + { + "epoch": 0.05915215248110418, + "grad_norm": 35.40020751953125, + "learning_rate": 9.477124183006536e-07, + "logits/chosen": 0.0057902163825929165, + "logits/rejected": 0.068918377161026, + "logps/chosen": -284.17608642578125, + "logps/rejected": -289.56268310546875, + "loss": 0.5956, + "rewards/accuracies": 0.6990741491317749, + "rewards/chosen": 0.17752066254615784, + "rewards/margins": 0.39666658639907837, + "rewards/rejected": -0.21914593875408173, + "step": 30 + }, + { + "epoch": 0.06506736772921459, + "grad_norm": 65.21533966064453, + "learning_rate": 1.0457516339869283e-06, + "logits/chosen": 0.17345206439495087, + "logits/rejected": 0.2315821498632431, + "logps/chosen": -274.51983642578125, + "logps/rejected": -297.6330261230469, + "loss": 0.6454, + "rewards/accuracies": 0.5601851940155029, + "rewards/chosen": 0.33469104766845703, + "rewards/margins": 0.3119283616542816, + "rewards/rejected": 0.022762654349207878, + "step": 33 + }, + { + "epoch": 0.07098258297732501, + "grad_norm": 52.888023376464844, + "learning_rate": 1.1437908496732026e-06, + "logits/chosen": 0.10116317868232727, + "logits/rejected": 0.12458281219005585, + "logps/chosen": -304.16064453125, + "logps/rejected": -330.84197998046875, + "loss": 0.6079, + "rewards/accuracies": 0.6342592835426331, + "rewards/chosen": 0.6159655451774597, + "rewards/margins": 0.4744771122932434, + "rewards/rejected": 0.14148837327957153, + "step": 36 + }, + { + "epoch": 0.07689779822543542, + "grad_norm": 44.073524475097656, + "learning_rate": 1.2418300653594772e-06, + "logits/chosen": 0.11387699842453003, + "logits/rejected": 0.11010300368070602, + "logps/chosen": -282.6150207519531, + "logps/rejected": -302.6578369140625, + "loss": 0.5832, + "rewards/accuracies": 0.6527777910232544, + "rewards/chosen": 0.7451757192611694, + "rewards/margins": 0.5253991484642029, + "rewards/rejected": 0.21977655589580536, + "step": 39 + }, + { + "epoch": 0.08281301347354585, + "grad_norm": 45.03438186645508, + "learning_rate": 1.3398692810457518e-06, + "logits/chosen": 0.14107277989387512, + "logits/rejected": 0.08299855887889862, + "logps/chosen": -289.40655517578125, + "logps/rejected": -301.38641357421875, + "loss": 0.5929, + "rewards/accuracies": 0.6759259700775146, + "rewards/chosen": 0.8502761125564575, + "rewards/margins": 0.6023391485214233, + "rewards/rejected": 0.24793694913387299, + "step": 42 + }, + { + "epoch": 0.08872822872165625, + "grad_norm": 34.897117614746094, + "learning_rate": 1.4379084967320261e-06, + "logits/chosen": 0.11066042631864548, + "logits/rejected": 0.18711869418621063, + "logps/chosen": -277.82086181640625, + "logps/rejected": -311.96185302734375, + "loss": 0.574, + "rewards/accuracies": 0.6435185670852661, + "rewards/chosen": 0.7543072700500488, + "rewards/margins": 0.595453679561615, + "rewards/rejected": 0.15885356068611145, + "step": 45 + }, + { + "epoch": 0.09464344396976668, + "grad_norm": 52.84747314453125, + "learning_rate": 1.535947712418301e-06, + "logits/chosen": 0.07594814896583557, + "logits/rejected": 0.06705646216869354, + "logps/chosen": -288.2305603027344, + "logps/rejected": -294.79473876953125, + "loss": 0.6463, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": 0.5022045373916626, + "rewards/margins": 0.49619922041893005, + "rewards/rejected": 0.006005376577377319, + "step": 48 + }, + { + "epoch": 0.1005586592178771, + "grad_norm": 42.00156784057617, + "learning_rate": 1.6339869281045753e-06, + "logits/chosen": 0.28929704427719116, + "logits/rejected": 0.28439193964004517, + "logps/chosen": -295.6617126464844, + "logps/rejected": -297.2070617675781, + "loss": 0.6407, + "rewards/accuracies": 0.5925926566123962, + "rewards/chosen": 0.33414918184280396, + "rewards/margins": 0.5068655610084534, + "rewards/rejected": -0.17271637916564941, + "step": 51 + }, + { + "epoch": 0.10647387446598751, + "grad_norm": 41.05317687988281, + "learning_rate": 1.7320261437908499e-06, + "logits/chosen": 0.2613026797771454, + "logits/rejected": 0.25626230239868164, + "logps/chosen": -270.5616149902344, + "logps/rejected": -288.9526672363281, + "loss": 0.666, + "rewards/accuracies": 0.6111111044883728, + "rewards/chosen": 0.35354501008987427, + "rewards/margins": 0.516350507736206, + "rewards/rejected": -0.16280552744865417, + "step": 54 + }, + { + "epoch": 0.11238908971409793, + "grad_norm": 40.66948318481445, + "learning_rate": 1.8300653594771242e-06, + "logits/chosen": 0.14654496312141418, + "logits/rejected": 0.18361543118953705, + "logps/chosen": -276.3260498046875, + "logps/rejected": -286.315185546875, + "loss": 0.6593, + "rewards/accuracies": 0.6527777910232544, + "rewards/chosen": 0.19915954768657684, + "rewards/margins": 0.5710794925689697, + "rewards/rejected": -0.3719198703765869, + "step": 57 + }, + { + "epoch": 0.11830430496220835, + "grad_norm": 39.70878601074219, + "learning_rate": 1.928104575163399e-06, + "logits/chosen": 0.26200026273727417, + "logits/rejected": 0.2716706097126007, + "logps/chosen": -296.445068359375, + "logps/rejected": -302.8128356933594, + "loss": 0.6215, + "rewards/accuracies": 0.6481481790542603, + "rewards/chosen": 0.1488932967185974, + "rewards/margins": 0.6068000793457031, + "rewards/rejected": -0.4579067826271057, + "step": 60 + }, + { + "epoch": 0.12421952021031876, + "grad_norm": 45.6502571105957, + "learning_rate": 2.0261437908496734e-06, + "logits/chosen": 0.16293856501579285, + "logits/rejected": 0.16650280356407166, + "logps/chosen": -279.3722839355469, + "logps/rejected": -302.0152282714844, + "loss": 0.5946, + "rewards/accuracies": 0.6527777910232544, + "rewards/chosen": 0.07076837122440338, + "rewards/margins": 0.6879870891571045, + "rewards/rejected": -0.6172187328338623, + "step": 63 + }, + { + "epoch": 0.13013473545842918, + "grad_norm": 37.381980895996094, + "learning_rate": 2.1241830065359477e-06, + "logits/chosen": 0.1667034924030304, + "logits/rejected": 0.11124895513057709, + "logps/chosen": -286.760986328125, + "logps/rejected": -294.1531982421875, + "loss": 0.5771, + "rewards/accuracies": 0.6481481790542603, + "rewards/chosen": 0.002402188954874873, + "rewards/margins": 0.7446907758712769, + "rewards/rejected": -0.7422885298728943, + "step": 66 + }, + { + "epoch": 0.1360499507065396, + "grad_norm": 40.16853713989258, + "learning_rate": 2.222222222222222e-06, + "logits/chosen": 0.1928640604019165, + "logits/rejected": 0.2365691065788269, + "logps/chosen": -289.02642822265625, + "logps/rejected": -299.3502197265625, + "loss": 0.599, + "rewards/accuracies": 0.6898148059844971, + "rewards/chosen": 0.0025644637644290924, + "rewards/margins": 0.755738377571106, + "rewards/rejected": -0.7531739473342896, + "step": 69 + }, + { + "epoch": 0.14196516595465003, + "grad_norm": 61.28244400024414, + "learning_rate": 2.320261437908497e-06, + "logits/chosen": 0.13621635735034943, + "logits/rejected": 0.2548728287220001, + "logps/chosen": -289.6425476074219, + "logps/rejected": -316.8446350097656, + "loss": 0.5755, + "rewards/accuracies": 0.6944444179534912, + "rewards/chosen": 0.012163564562797546, + "rewards/margins": 0.7862691879272461, + "rewards/rejected": -0.7741057276725769, + "step": 72 + }, + { + "epoch": 0.14788038120276042, + "grad_norm": 72.57213592529297, + "learning_rate": 2.4183006535947716e-06, + "logits/chosen": 0.05144810676574707, + "logits/rejected": 0.13899767398834229, + "logps/chosen": -280.32623291015625, + "logps/rejected": -301.4366760253906, + "loss": 0.5793, + "rewards/accuracies": 0.6527777910232544, + "rewards/chosen": 0.18221376836299896, + "rewards/margins": 0.7852480411529541, + "rewards/rejected": -0.603034257888794, + "step": 75 + }, + { + "epoch": 0.15379559645087085, + "grad_norm": 43.978668212890625, + "learning_rate": 2.516339869281046e-06, + "logits/chosen": 0.2053721845149994, + "logits/rejected": 0.301210880279541, + "logps/chosen": -293.8603820800781, + "logps/rejected": -311.3579406738281, + "loss": 0.5898, + "rewards/accuracies": 0.6712962985038757, + "rewards/chosen": 0.04886661097407341, + "rewards/margins": 0.7296194434165955, + "rewards/rejected": -0.6807528734207153, + "step": 78 + }, + { + "epoch": 0.15971081169898127, + "grad_norm": 36.1211051940918, + "learning_rate": 2.6143790849673208e-06, + "logits/chosen": 0.0973750576376915, + "logits/rejected": 0.1262703239917755, + "logps/chosen": -286.0694274902344, + "logps/rejected": -307.6643371582031, + "loss": 0.5745, + "rewards/accuracies": 0.680555522441864, + "rewards/chosen": 0.17426416277885437, + "rewards/margins": 0.898255467414856, + "rewards/rejected": -0.7239912748336792, + "step": 81 + }, + { + "epoch": 0.1656260269470917, + "grad_norm": 34.10630416870117, + "learning_rate": 2.7124183006535947e-06, + "logits/chosen": 0.135942280292511, + "logits/rejected": 0.17899185419082642, + "logps/chosen": -282.0833740234375, + "logps/rejected": -294.80340576171875, + "loss": 0.5664, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": 0.24072617292404175, + "rewards/margins": 0.7590986490249634, + "rewards/rejected": -0.5183724761009216, + "step": 84 + }, + { + "epoch": 0.17154124219520211, + "grad_norm": 101.0091323852539, + "learning_rate": 2.8104575163398695e-06, + "logits/chosen": 0.049379341304302216, + "logits/rejected": 0.22774741053581238, + "logps/chosen": -276.23651123046875, + "logps/rejected": -328.92730712890625, + "loss": 0.6363, + "rewards/accuracies": 0.6388888955116272, + "rewards/chosen": 0.24817490577697754, + "rewards/margins": 0.6880936026573181, + "rewards/rejected": -0.4399186372756958, + "step": 87 + }, + { + "epoch": 0.1774564574433125, + "grad_norm": 41.10043716430664, + "learning_rate": 2.9084967320261443e-06, + "logits/chosen": 0.04306400939822197, + "logits/rejected": 0.12012603133916855, + "logps/chosen": -273.14862060546875, + "logps/rejected": -297.67779541015625, + "loss": 0.6138, + "rewards/accuracies": 0.6388888955116272, + "rewards/chosen": 0.2495533525943756, + "rewards/margins": 0.97026127576828, + "rewards/rejected": -0.720707893371582, + "step": 90 + }, + { + "epoch": 0.18337167269142293, + "grad_norm": 36.96125030517578, + "learning_rate": 3.0065359477124182e-06, + "logits/chosen": -0.0033540725708007812, + "logits/rejected": 0.15706853568553925, + "logps/chosen": -280.9652099609375, + "logps/rejected": -311.3870544433594, + "loss": 0.5552, + "rewards/accuracies": 0.6990741491317749, + "rewards/chosen": 0.41701826453208923, + "rewards/margins": 1.1586750745773315, + "rewards/rejected": -0.7416568994522095, + "step": 93 + }, + { + "epoch": 0.18928688793953335, + "grad_norm": 34.02159118652344, + "learning_rate": 3.104575163398693e-06, + "logits/chosen": 0.055927395820617676, + "logits/rejected": 0.07919944822788239, + "logps/chosen": -277.48199462890625, + "logps/rejected": -295.92218017578125, + "loss": 0.5402, + "rewards/accuracies": 0.6898148059844971, + "rewards/chosen": 0.7134115099906921, + "rewards/margins": 1.0553935766220093, + "rewards/rejected": -0.34198200702667236, + "step": 96 + }, + { + "epoch": 0.19520210318764378, + "grad_norm": 30.828338623046875, + "learning_rate": 3.2026143790849674e-06, + "logits/chosen": 0.02216392755508423, + "logits/rejected": 0.17252111434936523, + "logps/chosen": -274.6871337890625, + "logps/rejected": -308.18707275390625, + "loss": 0.5325, + "rewards/accuracies": 0.6944444179534912, + "rewards/chosen": 0.7039467692375183, + "rewards/margins": 1.234229326248169, + "rewards/rejected": -0.5302824974060059, + "step": 99 + }, + { + "epoch": 0.2011173184357542, + "grad_norm": 55.14480972290039, + "learning_rate": 3.300653594771242e-06, + "logits/chosen": 0.0836678296327591, + "logits/rejected": 0.1754518747329712, + "logps/chosen": -285.959228515625, + "logps/rejected": -307.93658447265625, + "loss": 0.6502, + "rewards/accuracies": 0.6759259700775146, + "rewards/chosen": 0.4246281087398529, + "rewards/margins": 1.0537660121917725, + "rewards/rejected": -0.6291378736495972, + "step": 102 + }, + { + "epoch": 0.2070325336838646, + "grad_norm": 62.848453521728516, + "learning_rate": 3.398692810457517e-06, + "logits/chosen": 0.027146054431796074, + "logits/rejected": 0.0840519368648529, + "logps/chosen": -294.8350524902344, + "logps/rejected": -316.4007873535156, + "loss": 0.6039, + "rewards/accuracies": 0.6712962985038757, + "rewards/chosen": 0.3796502947807312, + "rewards/margins": 1.3719719648361206, + "rewards/rejected": -0.9923217296600342, + "step": 105 + }, + { + "epoch": 0.21294774893197502, + "grad_norm": 34.59516143798828, + "learning_rate": 3.496732026143791e-06, + "logits/chosen": 0.18716061115264893, + "logits/rejected": 0.2005024403333664, + "logps/chosen": -294.5157470703125, + "logps/rejected": -318.09368896484375, + "loss": 0.646, + "rewards/accuracies": 0.703703761100769, + "rewards/chosen": 0.1613771915435791, + "rewards/margins": 1.098291277885437, + "rewards/rejected": -0.9369141459465027, + "step": 108 + }, + { + "epoch": 0.21886296418008544, + "grad_norm": 59.11061096191406, + "learning_rate": 3.5947712418300657e-06, + "logits/chosen": 0.02729523926973343, + "logits/rejected": 0.18032339215278625, + "logps/chosen": -279.9372863769531, + "logps/rejected": -314.1539306640625, + "loss": 0.6169, + "rewards/accuracies": 0.6944445371627808, + "rewards/chosen": 0.2143605351448059, + "rewards/margins": 1.1479460000991821, + "rewards/rejected": -0.933585524559021, + "step": 111 + }, + { + "epoch": 0.22477817942819586, + "grad_norm": 54.83635711669922, + "learning_rate": 3.6928104575163404e-06, + "logits/chosen": -0.07722613215446472, + "logits/rejected": 0.056545909494161606, + "logps/chosen": -265.40985107421875, + "logps/rejected": -308.90350341796875, + "loss": 0.5748, + "rewards/accuracies": 0.6944445371627808, + "rewards/chosen": 0.5514373183250427, + "rewards/margins": 1.499307632446289, + "rewards/rejected": -0.9478704333305359, + "step": 114 + }, + { + "epoch": 0.23069339467630628, + "grad_norm": 82.6873779296875, + "learning_rate": 3.7908496732026144e-06, + "logits/chosen": -0.05033176392316818, + "logits/rejected": 0.08841504901647568, + "logps/chosen": -284.0753479003906, + "logps/rejected": -315.85931396484375, + "loss": 0.6855, + "rewards/accuracies": 0.6990741491317749, + "rewards/chosen": 0.38630211353302, + "rewards/margins": 1.3165416717529297, + "rewards/rejected": -0.9302395582199097, + "step": 117 + }, + { + "epoch": 0.2366086099244167, + "grad_norm": 34.905941009521484, + "learning_rate": 3.88888888888889e-06, + "logits/chosen": -0.014662293717265129, + "logits/rejected": 0.03569987416267395, + "logps/chosen": -269.83001708984375, + "logps/rejected": -294.5672912597656, + "loss": 0.5944, + "rewards/accuracies": 0.703703761100769, + "rewards/chosen": 0.29097482562065125, + "rewards/margins": 1.4026880264282227, + "rewards/rejected": -1.111713171005249, + "step": 120 + }, + { + "epoch": 0.2425238251725271, + "grad_norm": 34.07733917236328, + "learning_rate": 3.986928104575164e-06, + "logits/chosen": -0.08946999907493591, + "logits/rejected": 0.01624571532011032, + "logps/chosen": -283.3293151855469, + "logps/rejected": -312.3072509765625, + "loss": 0.554, + "rewards/accuracies": 0.7083333730697632, + "rewards/chosen": 0.07897276431322098, + "rewards/margins": 1.4951574802398682, + "rewards/rejected": -1.416184663772583, + "step": 123 + }, + { + "epoch": 0.24843904042063752, + "grad_norm": 47.81401824951172, + "learning_rate": 4.084967320261438e-06, + "logits/chosen": -0.08563312888145447, + "logits/rejected": -0.123930923640728, + "logps/chosen": -308.0613098144531, + "logps/rejected": -300.32025146484375, + "loss": 0.6619, + "rewards/accuracies": 0.6342592835426331, + "rewards/chosen": -0.285559743642807, + "rewards/margins": 1.018710970878601, + "rewards/rejected": -1.304270625114441, + "step": 126 + }, + { + "epoch": 0.2543542556687479, + "grad_norm": 40.03066635131836, + "learning_rate": 4.183006535947713e-06, + "logits/chosen": 0.0017192339291796088, + "logits/rejected": 0.14679786562919617, + "logps/chosen": -316.32379150390625, + "logps/rejected": -339.078125, + "loss": 0.6979, + "rewards/accuracies": 0.6481481790542603, + "rewards/chosen": -0.6922714710235596, + "rewards/margins": 0.9672110080718994, + "rewards/rejected": -1.659482717514038, + "step": 129 + }, + { + "epoch": 0.26026947091685837, + "grad_norm": 53.74517822265625, + "learning_rate": 4.281045751633987e-06, + "logits/chosen": -0.05079513043165207, + "logits/rejected": 0.07216000556945801, + "logps/chosen": -295.9747314453125, + "logps/rejected": -332.39013671875, + "loss": 0.5792, + "rewards/accuracies": 0.7222222685813904, + "rewards/chosen": -0.8141295909881592, + "rewards/margins": 1.5237984657287598, + "rewards/rejected": -2.337928295135498, + "step": 132 + }, + { + "epoch": 0.26618468616496876, + "grad_norm": 35.11404800415039, + "learning_rate": 4.379084967320262e-06, + "logits/chosen": -0.046733301132917404, + "logits/rejected": 0.0873221755027771, + "logps/chosen": -289.7781982421875, + "logps/rejected": -330.5262756347656, + "loss": 0.5458, + "rewards/accuracies": 0.7129629850387573, + "rewards/chosen": -0.5395014882087708, + "rewards/margins": 1.484311819076538, + "rewards/rejected": -2.023813247680664, + "step": 135 + }, + { + "epoch": 0.2720999014130792, + "grad_norm": 46.060089111328125, + "learning_rate": 4.477124183006537e-06, + "logits/chosen": -0.10985089838504791, + "logits/rejected": 0.007513361983001232, + "logps/chosen": -291.6156311035156, + "logps/rejected": -330.0778503417969, + "loss": 0.6452, + "rewards/accuracies": 0.6712963581085205, + "rewards/chosen": -0.8152337074279785, + "rewards/margins": 1.0877575874328613, + "rewards/rejected": -1.9029912948608398, + "step": 138 + }, + { + "epoch": 0.2780151166611896, + "grad_norm": 36.288475036621094, + "learning_rate": 4.5751633986928105e-06, + "logits/chosen": -0.12672923505306244, + "logits/rejected": -0.05195396766066551, + "logps/chosen": -291.0050048828125, + "logps/rejected": -328.9648742675781, + "loss": 0.622, + "rewards/accuracies": 0.6990740299224854, + "rewards/chosen": -0.6529079675674438, + "rewards/margins": 1.3228920698165894, + "rewards/rejected": -1.9758000373840332, + "step": 141 + }, + { + "epoch": 0.28393033190930006, + "grad_norm": 61.34006118774414, + "learning_rate": 4.673202614379085e-06, + "logits/chosen": -0.09269940853118896, + "logits/rejected": -0.09529760479927063, + "logps/chosen": -308.4407958984375, + "logps/rejected": -327.4217224121094, + "loss": 0.7128, + "rewards/accuracies": 0.6805555820465088, + "rewards/chosen": -0.8733962774276733, + "rewards/margins": 1.3924760818481445, + "rewards/rejected": -2.2658724784851074, + "step": 144 + }, + { + "epoch": 0.28984554715741045, + "grad_norm": 60.66278839111328, + "learning_rate": 4.77124183006536e-06, + "logits/chosen": -0.21165470778942108, + "logits/rejected": -0.08071193099021912, + "logps/chosen": -292.4040832519531, + "logps/rejected": -327.4679260253906, + "loss": 0.6824, + "rewards/accuracies": 0.6805555820465088, + "rewards/chosen": -1.0551072359085083, + "rewards/margins": 1.499154806137085, + "rewards/rejected": -2.5542619228363037, + "step": 147 + }, + { + "epoch": 0.29576076240552085, + "grad_norm": 37.254608154296875, + "learning_rate": 4.869281045751634e-06, + "logits/chosen": -0.09953123331069946, + "logits/rejected": -0.10298528522253036, + "logps/chosen": -307.6355285644531, + "logps/rejected": -334.2861328125, + "loss": 0.6938, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1909699440002441, + "rewards/margins": 1.456664800643921, + "rewards/rejected": -2.647634506225586, + "step": 150 + }, + { + "epoch": 0.3016759776536313, + "grad_norm": 29.970460891723633, + "learning_rate": 4.967320261437909e-06, + "logits/chosen": -0.16488048434257507, + "logits/rejected": -0.026979412883520126, + "logps/chosen": -292.1322021484375, + "logps/rejected": -334.13330078125, + "loss": 0.7198, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -1.4055306911468506, + "rewards/margins": 1.0318067073822021, + "rewards/rejected": -2.4373371601104736, + "step": 153 + }, + { + "epoch": 0.3075911929017417, + "grad_norm": 102.86775970458984, + "learning_rate": 4.999973746084687e-06, + "logits/chosen": -0.2688429355621338, + "logits/rejected": -0.08179165422916412, + "logps/chosen": -292.56201171875, + "logps/rejected": -334.35443115234375, + "loss": 0.6896, + "rewards/accuracies": 0.6712962985038757, + "rewards/chosen": -1.556567907333374, + "rewards/margins": 1.6510968208312988, + "rewards/rejected": -3.2076644897460938, + "step": 156 + }, + { + "epoch": 0.31350640814985214, + "grad_norm": 35.398048400878906, + "learning_rate": 4.999835914537063e-06, + "logits/chosen": -0.1668413281440735, + "logits/rejected": -0.13912776112556458, + "logps/chosen": -304.44842529296875, + "logps/rejected": -323.0757141113281, + "loss": 0.5734, + "rewards/accuracies": 0.7546296119689941, + "rewards/chosen": -1.2236417531967163, + "rewards/margins": 1.9698877334594727, + "rewards/rejected": -3.1935291290283203, + "step": 159 + }, + { + "epoch": 0.31942162339796254, + "grad_norm": 54.54057312011719, + "learning_rate": 4.999579948383184e-06, + "logits/chosen": -0.15461772680282593, + "logits/rejected": -0.03271166980266571, + "logps/chosen": -295.79632568359375, + "logps/rejected": -327.92584228515625, + "loss": 0.7708, + "rewards/accuracies": 0.6620371341705322, + "rewards/chosen": -1.315285086631775, + "rewards/margins": 1.5617358684539795, + "rewards/rejected": -2.877020835876465, + "step": 162 + }, + { + "epoch": 0.32533683864607293, + "grad_norm": 34.624603271484375, + "learning_rate": 4.9992058597192255e-06, + "logits/chosen": -0.10605038702487946, + "logits/rejected": 0.021997269243001938, + "logps/chosen": -306.4567565917969, + "logps/rejected": -336.64208984375, + "loss": 0.6976, + "rewards/accuracies": 0.6620370745658875, + "rewards/chosen": -1.345373511314392, + "rewards/margins": 1.6246647834777832, + "rewards/rejected": -2.9700381755828857, + "step": 165 + }, + { + "epoch": 0.3312520538941834, + "grad_norm": 67.67698669433594, + "learning_rate": 4.9987136662234764e-06, + "logits/chosen": -0.0949287861585617, + "logits/rejected": -0.04236632585525513, + "logps/chosen": -310.03973388671875, + "logps/rejected": -338.93768310546875, + "loss": 0.7999, + "rewards/accuracies": 0.6851851940155029, + "rewards/chosen": -2.0794289112091064, + "rewards/margins": 1.357082724571228, + "rewards/rejected": -3.436511516571045, + "step": 168 + }, + { + "epoch": 0.3371672691422938, + "grad_norm": 32.37373733520508, + "learning_rate": 4.998103391155496e-06, + "logits/chosen": -0.0922163650393486, + "logits/rejected": 0.0038010727148503065, + "logps/chosen": -299.1915283203125, + "logps/rejected": -340.01202392578125, + "loss": 0.7605, + "rewards/accuracies": 0.6990741491317749, + "rewards/chosen": -2.1031405925750732, + "rewards/margins": 1.5288575887680054, + "rewards/rejected": -3.631998062133789, + "step": 171 + }, + { + "epoch": 0.34308248439040423, + "grad_norm": 41.414024353027344, + "learning_rate": 4.997375063355021e-06, + "logits/chosen": -0.14530247449874878, + "logits/rejected": -0.017910713329911232, + "logps/chosen": -294.3675537109375, + "logps/rejected": -331.98138427734375, + "loss": 0.6571, + "rewards/accuracies": 0.6898148059844971, + "rewards/chosen": -2.397458791732788, + "rewards/margins": 1.8552653789520264, + "rewards/rejected": -4.252723693847656, + "step": 174 + }, + { + "epoch": 0.3489976996385146, + "grad_norm": 48.10707473754883, + "learning_rate": 4.996528717240595e-06, + "logits/chosen": -0.05039427801966667, + "logits/rejected": -0.03388180956244469, + "logps/chosen": -334.423583984375, + "logps/rejected": -366.33087158203125, + "loss": 0.6121, + "rewards/accuracies": 0.7129630446434021, + "rewards/chosen": -2.6137490272521973, + "rewards/margins": 2.267016887664795, + "rewards/rejected": -4.880765438079834, + "step": 177 + }, + { + "epoch": 0.354912914886625, + "grad_norm": 48.403987884521484, + "learning_rate": 4.995564392807951e-06, + "logits/chosen": -0.12557795643806458, + "logits/rejected": -0.08240347355604172, + "logps/chosen": -307.76190185546875, + "logps/rejected": -336.8500671386719, + "loss": 0.8248, + "rewards/accuracies": 0.6527777910232544, + "rewards/chosen": -2.706207036972046, + "rewards/margins": 1.6851190328598022, + "rewards/rejected": -4.391325950622559, + "step": 180 + }, + { + "epoch": 0.36082813013473547, + "grad_norm": 33.078346252441406, + "learning_rate": 4.994482135628115e-06, + "logits/chosen": -0.0540686696767807, + "logits/rejected": -0.02901211380958557, + "logps/chosen": -308.53997802734375, + "logps/rejected": -325.751220703125, + "loss": 0.6532, + "rewards/accuracies": 0.6805555820465088, + "rewards/chosen": -2.590834617614746, + "rewards/margins": 2.104125738143921, + "rewards/rejected": -4.694960594177246, + "step": 183 + }, + { + "epoch": 0.36674334538284586, + "grad_norm": 48.71003723144531, + "learning_rate": 4.993281996845253e-06, + "logits/chosen": -0.12156584858894348, + "logits/rejected": -0.009240781888365746, + "logps/chosen": -309.54833984375, + "logps/rejected": -357.3154296875, + "loss": 0.7973, + "rewards/accuracies": 0.6805555820465088, + "rewards/chosen": -3.1666088104248047, + "rewards/margins": 1.5774611234664917, + "rewards/rejected": -4.744070053100586, + "step": 186 + }, + { + "epoch": 0.3726585606309563, + "grad_norm": 34.78226089477539, + "learning_rate": 4.991964033174257e-06, + "logits/chosen": 0.03807515650987625, + "logits/rejected": 0.03681405261158943, + "logps/chosen": -311.2495422363281, + "logps/rejected": -324.69183349609375, + "loss": 0.7562, + "rewards/accuracies": 0.6388888359069824, + "rewards/chosen": -2.8808019161224365, + "rewards/margins": 1.3406963348388672, + "rewards/rejected": -4.221498489379883, + "step": 189 + }, + { + "epoch": 0.3785737758790667, + "grad_norm": 40.5612678527832, + "learning_rate": 4.990528306898062e-06, + "logits/chosen": -0.02013694867491722, + "logits/rejected": 0.06969591975212097, + "logps/chosen": -311.6338806152344, + "logps/rejected": -357.86932373046875, + "loss": 0.6847, + "rewards/accuracies": 0.6481481790542603, + "rewards/chosen": -2.982748031616211, + "rewards/margins": 1.80600905418396, + "rewards/rejected": -4.78875732421875, + "step": 192 + }, + { + "epoch": 0.3844889911271771, + "grad_norm": 40.58036804199219, + "learning_rate": 4.988974885864706e-06, + "logits/chosen": -0.07019342482089996, + "logits/rejected": 0.034122247248888016, + "logps/chosen": -311.8460388183594, + "logps/rejected": -335.7353515625, + "loss": 0.6818, + "rewards/accuracies": 0.7083333730697632, + "rewards/chosen": -2.6682796478271484, + "rewards/margins": 2.0244059562683105, + "rewards/rejected": -4.692685127258301, + "step": 195 + }, + { + "epoch": 0.39040420637528755, + "grad_norm": 40.306243896484375, + "learning_rate": 4.987303843484119e-06, + "logits/chosen": -0.04457690566778183, + "logits/rejected": -0.04778536409139633, + "logps/chosen": -323.8233337402344, + "logps/rejected": -337.99658203125, + "loss": 0.8443, + "rewards/accuracies": 0.6712962985038757, + "rewards/chosen": -2.713296413421631, + "rewards/margins": 2.1648170948028564, + "rewards/rejected": -4.878113269805908, + "step": 198 + }, + { + "epoch": 0.39631942162339795, + "grad_norm": 40.987098693847656, + "learning_rate": 4.985515258724657e-06, + "logits/chosen": -0.1199549064040184, + "logits/rejected": 0.04760899394750595, + "logps/chosen": -315.40380859375, + "logps/rejected": -365.968017578125, + "loss": 0.8342, + "rewards/accuracies": 0.6296296119689941, + "rewards/chosen": -2.873556137084961, + "rewards/margins": 1.9774025678634644, + "rewards/rejected": -4.850958824157715, + "step": 201 + }, + { + "epoch": 0.4022346368715084, + "grad_norm": 63.48135757446289, + "learning_rate": 4.983609216109371e-06, + "logits/chosen": -0.11053924262523651, + "logits/rejected": 0.05218297988176346, + "logps/chosen": -315.01361083984375, + "logps/rejected": -348.6982421875, + "loss": 0.8018, + "rewards/accuracies": 0.6481481790542603, + "rewards/chosen": -3.1666953563690186, + "rewards/margins": 2.3253214359283447, + "rewards/rejected": -5.492016792297363, + "step": 204 + }, + { + "epoch": 0.4081498521196188, + "grad_norm": 38.437744140625, + "learning_rate": 4.981585805712011e-06, + "logits/chosen": 0.056118495762348175, + "logits/rejected": 0.09087176620960236, + "logps/chosen": -318.7930603027344, + "logps/rejected": -362.77825927734375, + "loss": 0.8394, + "rewards/accuracies": 0.638888955116272, + "rewards/chosen": -3.5864510536193848, + "rewards/margins": 2.024658203125, + "rewards/rejected": -5.611109733581543, + "step": 207 + }, + { + "epoch": 0.4140650673677292, + "grad_norm": 38.63540267944336, + "learning_rate": 4.979445123152767e-06, + "logits/chosen": 0.05023570358753204, + "logits/rejected": 0.07232505083084106, + "logps/chosen": -325.7311096191406, + "logps/rejected": -351.2863464355469, + "loss": 0.8521, + "rewards/accuracies": 0.6574074625968933, + "rewards/chosen": -3.6044719219207764, + "rewards/margins": 1.671539306640625, + "rewards/rejected": -5.2760114669799805, + "step": 210 + }, + { + "epoch": 0.41998028261583964, + "grad_norm": 41.875938415527344, + "learning_rate": 4.977187269593758e-06, + "logits/chosen": -0.009280918166041374, + "logits/rejected": 0.08061732351779938, + "logps/chosen": -309.8787841796875, + "logps/rejected": -338.63531494140625, + "loss": 0.9004, + "rewards/accuracies": 0.6481481790542603, + "rewards/chosen": -3.5854623317718506, + "rewards/margins": 1.1396470069885254, + "rewards/rejected": -4.725109100341797, + "step": 213 + }, + { + "epoch": 0.42589549786395003, + "grad_norm": 103.75460815429688, + "learning_rate": 4.974812351734241e-06, + "logits/chosen": -0.02067667804658413, + "logits/rejected": 0.05178874731063843, + "logps/chosen": -312.7908630371094, + "logps/rejected": -342.73443603515625, + "loss": 0.802, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -3.1982085704803467, + "rewards/margins": 1.6781682968139648, + "rewards/rejected": -4.876377105712891, + "step": 216 + }, + { + "epoch": 0.4318107131120605, + "grad_norm": 45.85707092285156, + "learning_rate": 4.972320481805578e-06, + "logits/chosen": -0.1770055741071701, + "logits/rejected": -0.07678120583295822, + "logps/chosen": -323.437744140625, + "logps/rejected": -376.6873474121094, + "loss": 0.7821, + "rewards/accuracies": 0.6805555820465088, + "rewards/chosen": -3.227814197540283, + "rewards/margins": 1.796374797821045, + "rewards/rejected": -5.024188995361328, + "step": 219 + }, + { + "epoch": 0.4377259283601709, + "grad_norm": 61.05466079711914, + "learning_rate": 4.969711777565928e-06, + "logits/chosen": -0.05364451929926872, + "logits/rejected": -0.02637811005115509, + "logps/chosen": -334.20367431640625, + "logps/rejected": -368.54693603515625, + "loss": 0.8648, + "rewards/accuracies": 0.6759259104728699, + "rewards/chosen": -3.613854169845581, + "rewards/margins": 1.6240665912628174, + "rewards/rejected": -5.237921237945557, + "step": 222 + }, + { + "epoch": 0.4436411436082813, + "grad_norm": 34.13490295410156, + "learning_rate": 4.96698636229468e-06, + "logits/chosen": -0.005055941641330719, + "logits/rejected": 0.06123928725719452, + "logps/chosen": -324.4971923828125, + "logps/rejected": -349.1205749511719, + "loss": 0.7293, + "rewards/accuracies": 0.7083333730697632, + "rewards/chosen": -3.4245200157165527, + "rewards/margins": 1.7457599639892578, + "rewards/rejected": -5.1702799797058105, + "step": 225 + }, + { + "epoch": 0.4495563588563917, + "grad_norm": 83.93833923339844, + "learning_rate": 4.964144364786632e-06, + "logits/chosen": 0.13881027698516846, + "logits/rejected": 0.12519869208335876, + "logps/chosen": -348.6081237792969, + "logps/rejected": -361.4942321777344, + "loss": 1.0233, + "rewards/accuracies": 0.6851851940155029, + "rewards/chosen": -4.027976036071777, + "rewards/margins": 1.528003454208374, + "rewards/rejected": -5.555978775024414, + "step": 228 + }, + { + "epoch": 0.4554715741045021, + "grad_norm": 97.00470733642578, + "learning_rate": 4.9611859193459015e-06, + "logits/chosen": 0.07493604719638824, + "logits/rejected": 0.14453333616256714, + "logps/chosen": -326.7818298339844, + "logps/rejected": -349.25189208984375, + "loss": 0.92, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -4.245251655578613, + "rewards/margins": 1.4085248708724976, + "rewards/rejected": -5.653777122497559, + "step": 231 + }, + { + "epoch": 0.46138678935261257, + "grad_norm": 42.000885009765625, + "learning_rate": 4.958111165779579e-06, + "logits/chosen": 0.0968238115310669, + "logits/rejected": 0.17590413987636566, + "logps/chosen": -324.4273376464844, + "logps/rejected": -368.86004638671875, + "loss": 0.8419, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -4.32670259475708, + "rewards/margins": 1.9024198055267334, + "rewards/rejected": -6.229121685028076, + "step": 234 + }, + { + "epoch": 0.46730200460072296, + "grad_norm": 34.4752197265625, + "learning_rate": 4.954920249391123e-06, + "logits/chosen": 0.035372521728277206, + "logits/rejected": 0.07403655350208282, + "logps/chosen": -336.0986633300781, + "logps/rejected": -353.7882995605469, + "loss": 0.6037, + "rewards/accuracies": 0.7546296119689941, + "rewards/chosen": -3.8972387313842773, + "rewards/margins": 1.817379355430603, + "rewards/rejected": -5.714618682861328, + "step": 237 + }, + { + "epoch": 0.4732172198488334, + "grad_norm": 28.94403648376465, + "learning_rate": 4.951613320973491e-06, + "logits/chosen": 0.0323605015873909, + "logits/rejected": 0.0774674192070961, + "logps/chosen": -324.99774169921875, + "logps/rejected": -344.53985595703125, + "loss": 0.7093, + "rewards/accuracies": 0.7268518805503845, + "rewards/chosen": -4.354706764221191, + "rewards/margins": 1.9214775562286377, + "rewards/rejected": -6.276185035705566, + "step": 240 + }, + { + "epoch": 0.4791324350969438, + "grad_norm": 30.506528854370117, + "learning_rate": 4.948190536802015e-06, + "logits/chosen": -0.06049029156565666, + "logits/rejected": 0.006028448697179556, + "logps/chosen": -324.240234375, + "logps/rejected": -350.5492858886719, + "loss": 0.7427, + "rewards/accuracies": 0.6388888955116272, + "rewards/chosen": -4.099154472351074, + "rewards/margins": 1.713746428489685, + "rewards/rejected": -5.812901496887207, + "step": 243 + }, + { + "epoch": 0.4850476503450542, + "grad_norm": 73.70082092285156, + "learning_rate": 4.944652058627013e-06, + "logits/chosen": -0.08537846058607101, + "logits/rejected": -0.032611675560474396, + "logps/chosen": -336.5609130859375, + "logps/rejected": -372.4656677246094, + "loss": 0.6616, + "rewards/accuracies": 0.703703761100769, + "rewards/chosen": -4.300870895385742, + "rewards/margins": 1.9437074661254883, + "rewards/rejected": -6.2445783615112305, + "step": 246 + }, + { + "epoch": 0.49096286559316465, + "grad_norm": 34.85869598388672, + "learning_rate": 4.9409980536661535e-06, + "logits/chosen": -0.06217961013317108, + "logits/rejected": -0.004529049154371023, + "logps/chosen": -340.4526062011719, + "logps/rejected": -369.7755126953125, + "loss": 0.8365, + "rewards/accuracies": 0.6527777910232544, + "rewards/chosen": -4.246068000793457, + "rewards/margins": 1.704252004623413, + "rewards/rejected": -5.950319290161133, + "step": 249 + }, + { + "epoch": 0.49687808084127505, + "grad_norm": 39.66143035888672, + "learning_rate": 4.937228694596545e-06, + "logits/chosen": -0.12100633233785629, + "logits/rejected": -0.006573869846761227, + "logps/chosen": -320.8197021484375, + "logps/rejected": -355.7471923828125, + "loss": 0.6241, + "rewards/accuracies": 0.7083333730697632, + "rewards/chosen": -3.746490478515625, + "rewards/margins": 2.363032579421997, + "rewards/rejected": -6.109523296356201, + "step": 252 + }, + { + "epoch": 0.5027932960893855, + "grad_norm": 33.68574142456055, + "learning_rate": 4.933344159546577e-06, + "logits/chosen": -0.18371449410915375, + "logits/rejected": -0.022255782037973404, + "logps/chosen": -333.9938049316406, + "logps/rejected": -384.731689453125, + "loss": 0.8266, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -4.545222759246826, + "rewards/margins": 2.332798719406128, + "rewards/rejected": -6.878022193908691, + "step": 255 + }, + { + "epoch": 0.5087085113374958, + "grad_norm": 45.985260009765625, + "learning_rate": 4.929344632087506e-06, + "logits/chosen": -0.14562034606933594, + "logits/rejected": -0.04126621410250664, + "logps/chosen": -327.22882080078125, + "logps/rejected": -382.4744873046875, + "loss": 0.6009, + "rewards/accuracies": 0.7777777910232544, + "rewards/chosen": -4.554243564605713, + "rewards/margins": 3.3480947017669678, + "rewards/rejected": -7.902338027954102, + "step": 258 + }, + { + "epoch": 0.5146237265856063, + "grad_norm": 59.11403274536133, + "learning_rate": 4.9252303012247775e-06, + "logits/chosen": -0.11811560392379761, + "logits/rejected": -0.01293177530169487, + "logps/chosen": -351.2400817871094, + "logps/rejected": -404.04693603515625, + "loss": 0.7836, + "rewards/accuracies": 0.7592592239379883, + "rewards/chosen": -5.45390510559082, + "rewards/margins": 3.5169780254364014, + "rewards/rejected": -8.970884323120117, + "step": 261 + }, + { + "epoch": 0.5205389418337167, + "grad_norm": 38.673831939697266, + "learning_rate": 4.921001361389096e-06, + "logits/chosen": -0.05326487869024277, + "logits/rejected": 0.009572159498929977, + "logps/chosen": -333.938232421875, + "logps/rejected": -371.80816650390625, + "loss": 0.7732, + "rewards/accuracies": 0.7175925970077515, + "rewards/chosen": -5.093016624450684, + "rewards/margins": 2.952821731567383, + "rewards/rejected": -8.045838356018066, + "step": 264 + }, + { + "epoch": 0.5264541570818272, + "grad_norm": 33.54100036621094, + "learning_rate": 4.916658012427235e-06, + "logits/chosen": -0.037946220487356186, + "logits/rejected": 0.06312233209609985, + "logps/chosen": -342.2742614746094, + "logps/rejected": -391.18646240234375, + "loss": 0.7494, + "rewards/accuracies": 0.7083333730697632, + "rewards/chosen": -5.872312545776367, + "rewards/margins": 2.313776969909668, + "rewards/rejected": -8.186089515686035, + "step": 267 + }, + { + "epoch": 0.5323693723299375, + "grad_norm": 37.88241195678711, + "learning_rate": 4.912200459592595e-06, + "logits/chosen": -0.06072680652141571, + "logits/rejected": 0.09704061597585678, + "logps/chosen": -343.1576843261719, + "logps/rejected": -407.43798828125, + "loss": 0.8544, + "rewards/accuracies": 0.6620370149612427, + "rewards/chosen": -6.514297008514404, + "rewards/margins": 2.3704113960266113, + "rewards/rejected": -8.884708404541016, + "step": 270 + }, + { + "epoch": 0.538284587578048, + "grad_norm": 71.56549072265625, + "learning_rate": 4.9076289135355e-06, + "logits/chosen": 0.02188403531908989, + "logits/rejected": 0.1214473694562912, + "logps/chosen": -355.06793212890625, + "logps/rejected": -388.5225830078125, + "loss": 0.873, + "rewards/accuracies": 0.6342592835426331, + "rewards/chosen": -6.959141731262207, + "rewards/margins": 1.8934102058410645, + "rewards/rejected": -8.852551460266113, + "step": 273 + }, + { + "epoch": 0.5441998028261584, + "grad_norm": 55.54237365722656, + "learning_rate": 4.902943590293245e-06, + "logits/chosen": 0.0230946596711874, + "logits/rejected": 0.08508029580116272, + "logps/chosen": -349.1761169433594, + "logps/rejected": -398.710693359375, + "loss": 0.8547, + "rewards/accuracies": 0.7222222685813904, + "rewards/chosen": -6.52820348739624, + "rewards/margins": 2.2471137046813965, + "rewards/rejected": -8.775317192077637, + "step": 276 + }, + { + "epoch": 0.5501150180742688, + "grad_norm": 48.57643127441406, + "learning_rate": 4.898144711279894e-06, + "logits/chosen": -0.07424692809581757, + "logits/rejected": 0.09649358689785004, + "logps/chosen": -339.84686279296875, + "logps/rejected": -393.98016357421875, + "loss": 0.8443, + "rewards/accuracies": 0.5972222685813904, + "rewards/chosen": -6.690239906311035, + "rewards/margins": 2.0843098163604736, + "rewards/rejected": -8.77454948425293, + "step": 279 + }, + { + "epoch": 0.5560302333223792, + "grad_norm": 28.025590896606445, + "learning_rate": 4.8932325032758006e-06, + "logits/chosen": -0.13962030410766602, + "logits/rejected": 0.05175274237990379, + "logps/chosen": -335.58966064453125, + "logps/rejected": -378.72833251953125, + "loss": 0.6595, + "rewards/accuracies": 0.703703761100769, + "rewards/chosen": -5.493226528167725, + "rewards/margins": 2.881283760070801, + "rewards/rejected": -8.374510765075684, + "step": 282 + }, + { + "epoch": 0.5619454485704897, + "grad_norm": 42.00820541381836, + "learning_rate": 4.8882071984169055e-06, + "logits/chosen": 0.007744944654405117, + "logits/rejected": 0.10258468985557556, + "logps/chosen": -363.2281494140625, + "logps/rejected": -410.42266845703125, + "loss": 0.7705, + "rewards/accuracies": 0.6805555820465088, + "rewards/chosen": -6.42072868347168, + "rewards/margins": 2.2050187587738037, + "rewards/rejected": -8.625746726989746, + "step": 285 + }, + { + "epoch": 0.5678606638186001, + "grad_norm": 54.86595916748047, + "learning_rate": 4.8830690341837596e-06, + "logits/chosen": -0.032913923263549805, + "logits/rejected": 0.1072501391172409, + "logps/chosen": -358.8460693359375, + "logps/rejected": -409.0872802734375, + "loss": 0.9018, + "rewards/accuracies": 0.6944444179534912, + "rewards/chosen": -6.712845802307129, + "rewards/margins": 2.6474528312683105, + "rewards/rejected": -9.360298156738281, + "step": 288 + }, + { + "epoch": 0.5737758790667105, + "grad_norm": 63.118587493896484, + "learning_rate": 4.877818253390303e-06, + "logits/chosen": -0.04406512528657913, + "logits/rejected": 0.006794461514800787, + "logps/chosen": -367.21063232421875, + "logps/rejected": -415.6984558105469, + "loss": 0.866, + "rewards/accuracies": 0.6944445371627808, + "rewards/chosen": -7.632047176361084, + "rewards/margins": 2.7398147583007812, + "rewards/rejected": -10.371862411499023, + "step": 291 + }, + { + "epoch": 0.5796910943148209, + "grad_norm": 35.43265914916992, + "learning_rate": 4.872455104172392e-06, + "logits/chosen": -0.018917741253972054, + "logits/rejected": 0.09038470685482025, + "logps/chosen": -353.57666015625, + "logps/rejected": -384.8489990234375, + "loss": 0.8107, + "rewards/accuracies": 0.703703761100769, + "rewards/chosen": -7.3933916091918945, + "rewards/margins": 2.474519729614258, + "rewards/rejected": -9.867910385131836, + "step": 294 + }, + { + "epoch": 0.5856063095629314, + "grad_norm": 44.21418762207031, + "learning_rate": 4.866979839976068e-06, + "logits/chosen": -0.0317760705947876, + "logits/rejected": 0.05773278325796127, + "logps/chosen": -358.10247802734375, + "logps/rejected": -407.8479919433594, + "loss": 0.8849, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -8.095900535583496, + "rewards/margins": 2.4276022911071777, + "rewards/rejected": -10.523502349853516, + "step": 297 + }, + { + "epoch": 0.5915215248110417, + "grad_norm": 63.40266799926758, + "learning_rate": 4.861392719545586e-06, + "logits/chosen": -0.07806281745433807, + "logits/rejected": -0.02447107993066311, + "logps/chosen": -356.18218994140625, + "logps/rejected": -387.41180419921875, + "loss": 0.9313, + "rewards/accuracies": 0.6388888955116272, + "rewards/chosen": -7.356019020080566, + "rewards/margins": 1.7391393184661865, + "rewards/rejected": -9.095157623291016, + "step": 300 + }, + { + "epoch": 0.5974367400591521, + "grad_norm": 66.6312484741211, + "learning_rate": 4.855694006911184e-06, + "logits/chosen": -0.06037697196006775, + "logits/rejected": -0.07680558413267136, + "logps/chosen": -368.1618347167969, + "logps/rejected": -388.21026611328125, + "loss": 0.9553, + "rewards/accuracies": 0.6435185670852661, + "rewards/chosen": -7.906184673309326, + "rewards/margins": 1.781913161277771, + "rewards/rejected": -9.688097953796387, + "step": 303 + }, + { + "epoch": 0.6033519553072626, + "grad_norm": 50.65395736694336, + "learning_rate": 4.849883971376608e-06, + "logits/chosen": -0.036385513842105865, + "logits/rejected": 0.02312180958688259, + "logps/chosen": -360.32763671875, + "logps/rejected": -385.574462890625, + "loss": 0.8076, + "rewards/accuracies": 0.6805555820465088, + "rewards/chosen": -7.673541069030762, + "rewards/margins": 2.30534291267395, + "rewards/rejected": -9.97888469696045, + "step": 306 + }, + { + "epoch": 0.6092671705553729, + "grad_norm": 37.83553695678711, + "learning_rate": 4.843962887506382e-06, + "logits/chosen": -0.007246436085551977, + "logits/rejected": 0.10245460271835327, + "logps/chosen": -360.2029724121094, + "logps/rejected": -388.7747497558594, + "loss": 1.0146, + "rewards/accuracies": 0.638888955116272, + "rewards/chosen": -7.557069301605225, + "rewards/margins": 2.0892889499664307, + "rewards/rejected": -9.646357536315918, + "step": 309 + }, + { + "epoch": 0.6151823858034834, + "grad_norm": 67.90631103515625, + "learning_rate": 4.837931035112836e-06, + "logits/chosen": -0.021308597177267075, + "logits/rejected": 0.05145422741770744, + "logps/chosen": -340.68328857421875, + "logps/rejected": -396.016845703125, + "loss": 0.8041, + "rewards/accuracies": 0.6712962985038757, + "rewards/chosen": -6.7121500968933105, + "rewards/margins": 3.0467376708984375, + "rewards/rejected": -9.758888244628906, + "step": 312 + }, + { + "epoch": 0.6210976010515938, + "grad_norm": 38.1522331237793, + "learning_rate": 4.831788699242882e-06, + "logits/chosen": 0.12459397315979004, + "logits/rejected": 0.07747067511081696, + "logps/chosen": -381.19317626953125, + "logps/rejected": -386.43023681640625, + "loss": 0.8322, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -7.670474529266357, + "rewards/margins": 2.657536268234253, + "rewards/rejected": -10.328010559082031, + "step": 315 + }, + { + "epoch": 0.6270128162997043, + "grad_norm": 37.3199462890625, + "learning_rate": 4.825536170164543e-06, + "logits/chosen": 0.0364363007247448, + "logits/rejected": 0.1072683334350586, + "logps/chosen": -384.99658203125, + "logps/rejected": -423.906494140625, + "loss": 0.8372, + "rewards/accuracies": 0.703703761100769, + "rewards/chosen": -8.085029602050781, + "rewards/margins": 2.9126205444335938, + "rewards/rejected": -10.997650146484375, + "step": 318 + }, + { + "epoch": 0.6329280315478146, + "grad_norm": 84.70304870605469, + "learning_rate": 4.819173743353237e-06, + "logits/chosen": 0.07106878608465195, + "logits/rejected": 0.07161180675029755, + "logps/chosen": -353.46929931640625, + "logps/rejected": -382.072998046875, + "loss": 0.9152, + "rewards/accuracies": 0.6620370745658875, + "rewards/chosen": -8.362825393676758, + "rewards/margins": 2.396571636199951, + "rewards/rejected": -10.759397506713867, + "step": 321 + }, + { + "epoch": 0.6388432467959251, + "grad_norm": 41.621517181396484, + "learning_rate": 4.812701719477813e-06, + "logits/chosen": -0.042387984693050385, + "logits/rejected": -0.006728718988597393, + "logps/chosen": -397.9143981933594, + "logps/rejected": -417.5738220214844, + "loss": 0.7585, + "rewards/accuracies": 0.7222222685813904, + "rewards/chosen": -8.613710403442383, + "rewards/margins": 3.0014796257019043, + "rewards/rejected": -11.615188598632812, + "step": 324 + }, + { + "epoch": 0.6447584620440355, + "grad_norm": 34.132080078125, + "learning_rate": 4.80612040438634e-06, + "logits/chosen": -0.005274191033095121, + "logits/rejected": -0.0017657628050073981, + "logps/chosen": -391.5089111328125, + "logps/rejected": -413.4745788574219, + "loss": 0.8454, + "rewards/accuracies": 0.6759259700775146, + "rewards/chosen": -8.282991409301758, + "rewards/margins": 2.736060619354248, + "rewards/rejected": -11.019050598144531, + "step": 327 + }, + { + "epoch": 0.6506736772921459, + "grad_norm": 42.51496505737305, + "learning_rate": 4.799430109091659e-06, + "logits/chosen": -0.15326707065105438, + "logits/rejected": -0.08302909135818481, + "logps/chosen": -363.5447998046875, + "logps/rejected": -425.05645751953125, + "loss": 0.8109, + "rewards/accuracies": 0.7314814925193787, + "rewards/chosen": -8.056258201599121, + "rewards/margins": 3.1323180198669434, + "rewards/rejected": -11.188575744628906, + "step": 330 + }, + { + "epoch": 0.6565888925402563, + "grad_norm": 38.95637512207031, + "learning_rate": 4.792631149756683e-06, + "logits/chosen": -0.10734808444976807, + "logits/rejected": -0.13398586213588715, + "logps/chosen": -383.37957763671875, + "logps/rejected": -396.33050537109375, + "loss": 0.9087, + "rewards/accuracies": 0.6805557012557983, + "rewards/chosen": -9.402881622314453, + "rewards/margins": 2.247579574584961, + "rewards/rejected": -11.65046215057373, + "step": 333 + }, + { + "epoch": 0.6625041077883668, + "grad_norm": 30.90215301513672, + "learning_rate": 4.785723847679451e-06, + "logits/chosen": -0.17347650229930878, + "logits/rejected": -0.04830838367342949, + "logps/chosen": -365.1563415527344, + "logps/rejected": -424.2984619140625, + "loss": 0.7836, + "rewards/accuracies": 0.7222222089767456, + "rewards/chosen": -9.815483093261719, + "rewards/margins": 2.839183807373047, + "rewards/rejected": -12.654666900634766, + "step": 336 + }, + { + "epoch": 0.6684193230364771, + "grad_norm": 47.471736907958984, + "learning_rate": 4.778708529277954e-06, + "logits/chosen": -0.1100246012210846, + "logits/rejected": -0.06617899239063263, + "logps/chosen": -385.6170654296875, + "logps/rejected": -420.5422058105469, + "loss": 0.9052, + "rewards/accuracies": 0.703703761100769, + "rewards/chosen": -9.508901596069336, + "rewards/margins": 3.112328052520752, + "rewards/rejected": -12.621230125427246, + "step": 339 + }, + { + "epoch": 0.6743345382845876, + "grad_norm": 47.64808654785156, + "learning_rate": 4.7715855260747e-06, + "logits/chosen": -0.1708156168460846, + "logits/rejected": -0.07380948960781097, + "logps/chosen": -389.4483337402344, + "logps/rejected": -430.76080322265625, + "loss": 0.7454, + "rewards/accuracies": 0.6990741491317749, + "rewards/chosen": -9.387763023376465, + "rewards/margins": 3.1104159355163574, + "rewards/rejected": -12.498178482055664, + "step": 342 + }, + { + "epoch": 0.680249753532698, + "grad_norm": 34.76079559326172, + "learning_rate": 4.764355174681056e-06, + "logits/chosen": -0.12020980566740036, + "logits/rejected": -0.09629341214895248, + "logps/chosen": -386.8775329589844, + "logps/rejected": -407.75299072265625, + "loss": 0.7826, + "rewards/accuracies": 0.7129629850387573, + "rewards/chosen": -9.380858421325684, + "rewards/margins": 2.675083637237549, + "rewards/rejected": -12.055941581726074, + "step": 345 + }, + { + "epoch": 0.6861649687808085, + "grad_norm": 31.153663635253906, + "learning_rate": 4.757017816781331e-06, + "logits/chosen": -0.16364365816116333, + "logits/rejected": -0.0642888993024826, + "logps/chosen": -379.285400390625, + "logps/rejected": -452.3611755371094, + "loss": 0.6606, + "rewards/accuracies": 0.7314814925193787, + "rewards/chosen": -9.602463722229004, + "rewards/margins": 3.611215591430664, + "rewards/rejected": -13.213679313659668, + "step": 348 + }, + { + "epoch": 0.6920801840289188, + "grad_norm": 49.72859191894531, + "learning_rate": 4.74957379911664e-06, + "logits/chosen": -0.07502593845129013, + "logits/rejected": 0.00437380513176322, + "logps/chosen": -395.549560546875, + "logps/rejected": -437.9383544921875, + "loss": 0.8434, + "rewards/accuracies": 0.6990741491317749, + "rewards/chosen": -10.370527267456055, + "rewards/margins": 2.701253652572632, + "rewards/rejected": -13.07178020477295, + "step": 351 + }, + { + "epoch": 0.6979953992770292, + "grad_norm": 56.92310333251953, + "learning_rate": 4.7420234734685104e-06, + "logits/chosen": -0.03868694603443146, + "logits/rejected": -0.0014290250837802887, + "logps/chosen": -401.3062744140625, + "logps/rejected": -445.33807373046875, + "loss": 0.8181, + "rewards/accuracies": 0.7083333730697632, + "rewards/chosen": -10.017228126525879, + "rewards/margins": 3.0577263832092285, + "rewards/rejected": -13.07495403289795, + "step": 354 + }, + { + "epoch": 0.7039106145251397, + "grad_norm": 28.037378311157227, + "learning_rate": 4.7343671966422584e-06, + "logits/chosen": -0.19204241037368774, + "logits/rejected": -0.08173765987157822, + "logps/chosen": -388.4847106933594, + "logps/rejected": -435.093505859375, + "loss": 0.8228, + "rewards/accuracies": 0.6944444179534912, + "rewards/chosen": -9.829904556274414, + "rewards/margins": 2.792029857635498, + "rewards/rejected": -12.621932983398438, + "step": 357 + }, + { + "epoch": 0.70982582977325, + "grad_norm": 40.521690368652344, + "learning_rate": 4.726605330450132e-06, + "logits/chosen": -0.13022971153259277, + "logits/rejected": -0.08149293065071106, + "logps/chosen": -379.46185302734375, + "logps/rejected": -414.7354736328125, + "loss": 0.7553, + "rewards/accuracies": 0.7222222685813904, + "rewards/chosen": -8.846694946289062, + "rewards/margins": 2.8018383979797363, + "rewards/rejected": -11.64853286743164, + "step": 360 + }, + { + "epoch": 0.7157410450213605, + "grad_norm": 58.53022003173828, + "learning_rate": 4.718738241694207e-06, + "logits/chosen": -0.1846962869167328, + "logits/rejected": -0.10722313821315765, + "logps/chosen": -353.771484375, + "logps/rejected": -411.9970703125, + "loss": 0.7709, + "rewards/accuracies": 0.708333432674408, + "rewards/chosen": -8.93757152557373, + "rewards/margins": 2.895968198776245, + "rewards/rejected": -11.833539962768555, + "step": 363 + }, + { + "epoch": 0.7216562602694709, + "grad_norm": 62.94578552246094, + "learning_rate": 4.710766302149059e-06, + "logits/chosen": -0.11727502197027206, + "logits/rejected": -0.07848824560642242, + "logps/chosen": -386.8691101074219, + "logps/rejected": -423.58203125, + "loss": 0.9148, + "rewards/accuracies": 0.6990741491317749, + "rewards/chosen": -9.200393676757812, + "rewards/margins": 2.5602023601531982, + "rewards/rejected": -11.76059627532959, + "step": 366 + }, + { + "epoch": 0.7275714755175814, + "grad_norm": 41.92021560668945, + "learning_rate": 4.7026898885441895e-06, + "logits/chosen": -0.23892198503017426, + "logits/rejected": -0.19355922937393188, + "logps/chosen": -371.76507568359375, + "logps/rejected": -428.6576843261719, + "loss": 0.5582, + "rewards/accuracies": 0.7685184478759766, + "rewards/chosen": -8.767149925231934, + "rewards/margins": 3.590496063232422, + "rewards/rejected": -12.357645988464355, + "step": 369 + }, + { + "epoch": 0.7334866907656917, + "grad_norm": 49.8127555847168, + "learning_rate": 4.694509382546225e-06, + "logits/chosen": -0.15272876620292664, + "logits/rejected": -0.16361907124519348, + "logps/chosen": -391.66729736328125, + "logps/rejected": -436.54638671875, + "loss": 0.7923, + "rewards/accuracies": 0.7222222685813904, + "rewards/chosen": -9.770153045654297, + "rewards/margins": 3.136183738708496, + "rewards/rejected": -12.906336784362793, + "step": 372 + }, + { + "epoch": 0.7394019060138022, + "grad_norm": 58.40044403076172, + "learning_rate": 4.686225170740881e-06, + "logits/chosen": -0.18054398894309998, + "logits/rejected": -0.10335493087768555, + "logps/chosen": -412.2997741699219, + "logps/rejected": -459.0295104980469, + "loss": 0.8095, + "rewards/accuracies": 0.6805555820465088, + "rewards/chosen": -9.97874641418457, + "rewards/margins": 2.9409613609313965, + "rewards/rejected": -12.919708251953125, + "step": 375 + }, + { + "epoch": 0.7453171212619126, + "grad_norm": 39.62379837036133, + "learning_rate": 4.677837644614692e-06, + "logits/chosen": -0.0909600779414177, + "logits/rejected": -0.023966720327734947, + "logps/chosen": -404.3166198730469, + "logps/rejected": -445.68218994140625, + "loss": 0.8851, + "rewards/accuracies": 0.6712962985038757, + "rewards/chosen": -10.392790794372559, + "rewards/margins": 2.6830623149871826, + "rewards/rejected": -13.07585334777832, + "step": 378 + }, + { + "epoch": 0.751232336510023, + "grad_norm": 70.16593170166016, + "learning_rate": 4.669347200536513e-06, + "logits/chosen": -0.22100476920604706, + "logits/rejected": -0.07667340338230133, + "logps/chosen": -388.3753662109375, + "logps/rejected": -459.80645751953125, + "loss": 0.9195, + "rewards/accuracies": 0.6944444179534912, + "rewards/chosen": -9.879241943359375, + "rewards/margins": 3.15310001373291, + "rewards/rejected": -13.032341003417969, + "step": 381 + }, + { + "epoch": 0.7571475517581334, + "grad_norm": 31.03739356994629, + "learning_rate": 4.660754239738784e-06, + "logits/chosen": -0.13154415786266327, + "logits/rejected": -0.14076797664165497, + "logps/chosen": -357.6185302734375, + "logps/rejected": -386.7919921875, + "loss": 0.7967, + "rewards/accuracies": 0.7083333730697632, + "rewards/chosen": -8.652280807495117, + "rewards/margins": 3.0989902019500732, + "rewards/rejected": -11.751270294189453, + "step": 384 + }, + { + "epoch": 0.7630627670062439, + "grad_norm": 36.11872482299805, + "learning_rate": 4.652059168298575e-06, + "logits/chosen": -0.1265685260295868, + "logits/rejected": -0.16448134183883667, + "logps/chosen": -372.2834167480469, + "logps/rejected": -404.4194030761719, + "loss": 0.8985, + "rewards/accuracies": 0.6527777910232544, + "rewards/chosen": -8.796895980834961, + "rewards/margins": 2.566009521484375, + "rewards/rejected": -11.362905502319336, + "step": 387 + }, + { + "epoch": 0.7689779822543542, + "grad_norm": 46.44245910644531, + "learning_rate": 4.6432623971183914e-06, + "logits/chosen": -0.14210101962089539, + "logits/rejected": -0.10977351665496826, + "logps/chosen": -394.84918212890625, + "logps/rejected": -433.3570556640625, + "loss": 0.9522, + "rewards/accuracies": 0.6759259104728699, + "rewards/chosen": -9.386205673217773, + "rewards/margins": 2.8492612838745117, + "rewards/rejected": -12.235466003417969, + "step": 390 + }, + { + "epoch": 0.7748931975024647, + "grad_norm": 97.72257232666016, + "learning_rate": 4.634364341906758e-06, + "logits/chosen": -0.0367339588701725, + "logits/rejected": -0.006284890230745077, + "logps/chosen": -384.237548828125, + "logps/rejected": -411.18560791015625, + "loss": 0.9734, + "rewards/accuracies": 0.6435185670852661, + "rewards/chosen": -9.642322540283203, + "rewards/margins": 2.240882635116577, + "rewards/rejected": -11.88320541381836, + "step": 393 + }, + { + "epoch": 0.7808084127505751, + "grad_norm": 31.60021209716797, + "learning_rate": 4.6253654231585724e-06, + "logits/chosen": -0.04863632842898369, + "logits/rejected": -0.05471419543027878, + "logps/chosen": -393.5486755371094, + "logps/rejected": -420.154541015625, + "loss": 0.8569, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -10.027898788452148, + "rewards/margins": 2.8087966442108154, + "rewards/rejected": -12.83669662475586, + "step": 396 + }, + { + "epoch": 0.7867236279986856, + "grad_norm": 34.62734603881836, + "learning_rate": 4.616266066135236e-06, + "logits/chosen": 0.012388413771986961, + "logits/rejected": 0.08139034360647202, + "logps/chosen": -403.1490478515625, + "logps/rejected": -445.5652770996094, + "loss": 0.8866, + "rewards/accuracies": 0.7175925970077515, + "rewards/chosen": -10.999832153320312, + "rewards/margins": 2.9154164791107178, + "rewards/rejected": -13.915247917175293, + "step": 399 + }, + { + "epoch": 0.7926388432467959, + "grad_norm": 45.868507385253906, + "learning_rate": 4.6070667008445565e-06, + "logits/chosen": -0.1295236349105835, + "logits/rejected": -0.05852815508842468, + "logps/chosen": -410.78717041015625, + "logps/rejected": -466.6341247558594, + "loss": 0.8873, + "rewards/accuracies": 0.6898148059844971, + "rewards/chosen": -11.442038536071777, + "rewards/margins": 2.919829845428467, + "rewards/rejected": -14.361867904663086, + "step": 402 + }, + { + "epoch": 0.7985540584949063, + "grad_norm": 57.16217041015625, + "learning_rate": 4.597767762020425e-06, + "logits/chosen": -0.09274892508983612, + "logits/rejected": -0.0731249749660492, + "logps/chosen": -414.71905517578125, + "logps/rejected": -454.8731384277344, + "loss": 0.8367, + "rewards/accuracies": 0.6759259104728699, + "rewards/chosen": -12.529696464538574, + "rewards/margins": 2.8657846450805664, + "rewards/rejected": -15.395480155944824, + "step": 405 + }, + { + "epoch": 0.8044692737430168, + "grad_norm": 40.70981979370117, + "learning_rate": 4.588369689102275e-06, + "logits/chosen": -0.19083881378173828, + "logits/rejected": -0.22435228526592255, + "logps/chosen": -420.8725280761719, + "logps/rejected": -443.5750732421875, + "loss": 0.8609, + "rewards/accuracies": 0.7037036418914795, + "rewards/chosen": -12.435295104980469, + "rewards/margins": 2.047469139099121, + "rewards/rejected": -14.48276424407959, + "step": 408 + }, + { + "epoch": 0.8103844889911271, + "grad_norm": 81.51207733154297, + "learning_rate": 4.578872926214312e-06, + "logits/chosen": -0.1143086701631546, + "logits/rejected": -0.12217384576797485, + "logps/chosen": -426.69769287109375, + "logps/rejected": -465.9218444824219, + "loss": 0.8227, + "rewards/accuracies": 0.6898148059844971, + "rewards/chosen": -11.509513854980469, + "rewards/margins": 3.481245756149292, + "rewards/rejected": -14.990760803222656, + "step": 411 + }, + { + "epoch": 0.8162997042392376, + "grad_norm": 37.07810974121094, + "learning_rate": 4.569277922144531e-06, + "logits/chosen": -0.07632291316986084, + "logits/rejected": -0.09633226692676544, + "logps/chosen": -380.8441162109375, + "logps/rejected": -426.43780517578125, + "loss": 0.7403, + "rewards/accuracies": 0.7175925970077515, + "rewards/chosen": -10.021100997924805, + "rewards/margins": 3.8166158199310303, + "rewards/rejected": -13.83771800994873, + "step": 414 + }, + { + "epoch": 0.822214919487348, + "grad_norm": 40.48078155517578, + "learning_rate": 4.559585130323503e-06, + "logits/chosen": -0.11609819531440735, + "logits/rejected": -0.11777342855930328, + "logps/chosen": -382.35833740234375, + "logps/rejected": -425.1037292480469, + "loss": 0.7272, + "rewards/accuracies": 0.7453703880310059, + "rewards/chosen": -10.085898399353027, + "rewards/margins": 4.251326560974121, + "rewards/rejected": -14.337224960327148, + "step": 417 + }, + { + "epoch": 0.8281301347354584, + "grad_norm": 23.162534713745117, + "learning_rate": 4.549795008802951e-06, + "logits/chosen": -0.09667672216892242, + "logits/rejected": -0.16936007142066956, + "logps/chosen": -414.39794921875, + "logps/rejected": -467.24224853515625, + "loss": 0.71, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.526082992553711, + "rewards/margins": 5.361236572265625, + "rewards/rejected": -15.887319564819336, + "step": 420 + }, + { + "epoch": 0.8340453499835688, + "grad_norm": 46.23252868652344, + "learning_rate": 4.539908020234101e-06, + "logits/chosen": -0.18942461907863617, + "logits/rejected": -0.20741616189479828, + "logps/chosen": -395.4661560058594, + "logps/rejected": -422.1636962890625, + "loss": 0.8233, + "rewards/accuracies": 0.7361111044883728, + "rewards/chosen": -11.337644577026367, + "rewards/margins": 3.6029138565063477, + "rewards/rejected": -14.940558433532715, + "step": 423 + }, + { + "epoch": 0.8399605652316793, + "grad_norm": 39.3061637878418, + "learning_rate": 4.529924631845819e-06, + "logits/chosen": -0.22194555401802063, + "logits/rejected": -0.2846095860004425, + "logps/chosen": -391.3856201171875, + "logps/rejected": -436.1163024902344, + "loss": 0.7741, + "rewards/accuracies": 0.7083333730697632, + "rewards/chosen": -11.072221755981445, + "rewards/margins": 3.993955612182617, + "rewards/rejected": -15.066177368164062, + "step": 426 + }, + { + "epoch": 0.8458757804797897, + "grad_norm": 35.3477783203125, + "learning_rate": 4.5198453154225336e-06, + "logits/chosen": -0.20290356874465942, + "logits/rejected": -0.22691264748573303, + "logps/chosen": -401.7808532714844, + "logps/rejected": -431.0494384765625, + "loss": 0.9883, + "rewards/accuracies": 0.6527777910232544, + "rewards/chosen": -11.68575668334961, + "rewards/margins": 3.1858057975769043, + "rewards/rejected": -14.871562957763672, + "step": 429 + }, + { + "epoch": 0.8517909957279001, + "grad_norm": 45.111228942871094, + "learning_rate": 4.509670547281938e-06, + "logits/chosen": -0.1648390144109726, + "logits/rejected": -0.15016193687915802, + "logps/chosen": -408.5751647949219, + "logps/rejected": -455.25286865234375, + "loss": 0.685, + "rewards/accuracies": 0.7129630446434021, + "rewards/chosen": -11.243772506713867, + "rewards/margins": 4.181289196014404, + "rewards/rejected": -15.425060272216797, + "step": 432 + }, + { + "epoch": 0.8577062109760105, + "grad_norm": 46.299076080322266, + "learning_rate": 4.499400808252481e-06, + "logits/chosen": -0.17850446701049805, + "logits/rejected": -0.17678868770599365, + "logps/chosen": -426.147216796875, + "logps/rejected": -462.2786560058594, + "loss": 0.8077, + "rewards/accuracies": 0.7453703880310059, + "rewards/chosen": -12.385601043701172, + "rewards/margins": 3.7832705974578857, + "rewards/rejected": -16.168874740600586, + "step": 435 + }, + { + "epoch": 0.863621426224121, + "grad_norm": 33.022735595703125, + "learning_rate": 4.489036583650649e-06, + "logits/chosen": -0.23058825731277466, + "logits/rejected": -0.1485554724931717, + "logps/chosen": -404.13458251953125, + "logps/rejected": -459.26434326171875, + "loss": 0.7394, + "rewards/accuracies": 0.6851851940155029, + "rewards/chosen": -12.277857780456543, + "rewards/margins": 4.416906833648682, + "rewards/rejected": -16.694765090942383, + "step": 438 + }, + { + "epoch": 0.8695366414722313, + "grad_norm": 35.275081634521484, + "learning_rate": 4.478578363258023e-06, + "logits/chosen": -0.16853290796279907, + "logits/rejected": -0.1521489918231964, + "logps/chosen": -415.5618591308594, + "logps/rejected": -451.77178955078125, + "loss": 0.8398, + "rewards/accuracies": 0.6898148059844971, + "rewards/chosen": -12.673078536987305, + "rewards/margins": 3.461433172225952, + "rewards/rejected": -16.134510040283203, + "step": 441 + }, + { + "epoch": 0.8754518567203418, + "grad_norm": 33.70681381225586, + "learning_rate": 4.468026641298142e-06, + "logits/chosen": -0.1421818733215332, + "logits/rejected": -0.08376338332891464, + "logps/chosen": -427.12335205078125, + "logps/rejected": -484.59649658203125, + "loss": 0.7882, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -13.382842063903809, + "rewards/margins": 3.6166725158691406, + "rewards/rejected": -16.999515533447266, + "step": 444 + }, + { + "epoch": 0.8813670719684522, + "grad_norm": 33.35516357421875, + "learning_rate": 4.457381916413141e-06, + "logits/chosen": -0.13292686641216278, + "logits/rejected": -0.18141326308250427, + "logps/chosen": -425.69287109375, + "logps/rejected": -463.3936767578125, + "loss": 0.8456, + "rewards/accuracies": 0.7083333730697632, + "rewards/chosen": -12.859245300292969, + "rewards/margins": 3.692777156829834, + "rewards/rejected": -16.55202293395996, + "step": 447 + }, + { + "epoch": 0.8872822872165625, + "grad_norm": 35.195640563964844, + "learning_rate": 4.4466446916401895e-06, + "logits/chosen": -0.22234384715557098, + "logits/rejected": -0.08050793409347534, + "logps/chosen": -418.515869140625, + "logps/rejected": -479.6258239746094, + "loss": 0.754, + "rewards/accuracies": 0.7222222685813904, + "rewards/chosen": -13.117304801940918, + "rewards/margins": 4.073483467102051, + "rewards/rejected": -17.19078826904297, + "step": 450 + }, + { + "epoch": 0.893197502464673, + "grad_norm": 57.9023551940918, + "learning_rate": 4.435815474387719e-06, + "logits/chosen": -0.117046058177948, + "logits/rejected": -0.09925241768360138, + "logps/chosen": -431.378662109375, + "logps/rejected": -481.478271484375, + "loss": 0.9017, + "rewards/accuracies": 0.6898148059844971, + "rewards/chosen": -13.18567943572998, + "rewards/margins": 3.816985845565796, + "rewards/rejected": -17.002666473388672, + "step": 453 + }, + { + "epoch": 0.8991127177127834, + "grad_norm": 32.576194763183594, + "learning_rate": 4.424894776411445e-06, + "logits/chosen": -0.14799581468105316, + "logits/rejected": -0.14989186823368073, + "logps/chosen": -423.6829833984375, + "logps/rejected": -472.8582458496094, + "loss": 0.751, + "rewards/accuracies": 0.7222222685813904, + "rewards/chosen": -12.279951095581055, + "rewards/margins": 4.995446681976318, + "rewards/rejected": -17.27539825439453, + "step": 456 + }, + { + "epoch": 0.9050279329608939, + "grad_norm": 42.71744155883789, + "learning_rate": 4.413883113790183e-06, + "logits/chosen": -0.1738191843032837, + "logits/rejected": -0.13758057355880737, + "logps/chosen": -426.9272155761719, + "logps/rejected": -479.89434814453125, + "loss": 0.9077, + "rewards/accuracies": 0.6805555820465088, + "rewards/chosen": -13.401716232299805, + "rewards/margins": 3.820394277572632, + "rewards/rejected": -17.22211265563965, + "step": 459 + }, + { + "epoch": 0.9109431482090042, + "grad_norm": 39.175025939941406, + "learning_rate": 4.402781006901457e-06, + "logits/chosen": -0.2070755809545517, + "logits/rejected": -0.14179669320583344, + "logps/chosen": -414.93804931640625, + "logps/rejected": -477.036376953125, + "loss": 0.8665, + "rewards/accuracies": 0.6805555820465088, + "rewards/chosen": -13.253950119018555, + "rewards/margins": 3.051947593688965, + "rewards/rejected": -16.305896759033203, + "step": 462 + }, + { + "epoch": 0.9168583634571147, + "grad_norm": 22.947998046875, + "learning_rate": 4.391588980396913e-06, + "logits/chosen": -0.1404464989900589, + "logits/rejected": -0.181797593832016, + "logps/chosen": -406.57586669921875, + "logps/rejected": -447.8778076171875, + "loss": 0.6584, + "rewards/accuracies": 0.7407407760620117, + "rewards/chosen": -11.844066619873047, + "rewards/margins": 4.259424209594727, + "rewards/rejected": -16.103490829467773, + "step": 465 + }, + { + "epoch": 0.9227735787052251, + "grad_norm": 27.205076217651367, + "learning_rate": 4.380307563177523e-06, + "logits/chosen": -0.27725404500961304, + "logits/rejected": -0.20193539559841156, + "logps/chosen": -409.8758239746094, + "logps/rejected": -500.9091491699219, + "loss": 0.7579, + "rewards/accuracies": 0.7361111640930176, + "rewards/chosen": -12.209704399108887, + "rewards/margins": 5.026104927062988, + "rewards/rejected": -17.235809326171875, + "step": 468 + }, + { + "epoch": 0.9286887939533355, + "grad_norm": 28.20140266418457, + "learning_rate": 4.36893728836859e-06, + "logits/chosen": -0.2689639925956726, + "logits/rejected": -0.20782801508903503, + "logps/chosen": -396.9151611328125, + "logps/rejected": -470.01593017578125, + "loss": 0.5835, + "rewards/accuracies": 0.7731481790542603, + "rewards/chosen": -12.019521713256836, + "rewards/margins": 5.0072855949401855, + "rewards/rejected": -17.02680778503418, + "step": 471 + }, + { + "epoch": 0.9346040092014459, + "grad_norm": 32.47650909423828, + "learning_rate": 4.357478693294557e-06, + "logits/chosen": -0.3439037799835205, + "logits/rejected": -0.28947803378105164, + "logps/chosen": -415.56072998046875, + "logps/rejected": -484.7381286621094, + "loss": 0.7827, + "rewards/accuracies": 0.763888955116272, + "rewards/chosen": -12.406390190124512, + "rewards/margins": 5.7553629875183105, + "rewards/rejected": -18.161752700805664, + "step": 474 + }, + { + "epoch": 0.9405192244495564, + "grad_norm": 37.89875793457031, + "learning_rate": 4.345932319453612e-06, + "logits/chosen": -0.3605978488922119, + "logits/rejected": -0.29322177171707153, + "logps/chosen": -429.87091064453125, + "logps/rejected": -483.867431640625, + "loss": 0.8029, + "rewards/accuracies": 0.703703761100769, + "rewards/chosen": -13.3162841796875, + "rewards/margins": 4.183923721313477, + "rewards/rejected": -17.500207901000977, + "step": 477 + }, + { + "epoch": 0.9464344396976668, + "grad_norm": 50.86394119262695, + "learning_rate": 4.334298712492098e-06, + "logits/chosen": -0.2936496138572693, + "logits/rejected": -0.32563602924346924, + "logps/chosen": -436.94317626953125, + "logps/rejected": -472.5697937011719, + "loss": 0.9131, + "rewards/accuracies": 0.6574074029922485, + "rewards/chosen": -13.809501647949219, + "rewards/margins": 3.8074746131896973, + "rewards/rejected": -17.61697769165039, + "step": 480 + }, + { + "epoch": 0.9523496549457772, + "grad_norm": 52.91617202758789, + "learning_rate": 4.32257842217873e-06, + "logits/chosen": -0.2928038239479065, + "logits/rejected": -0.25982213020324707, + "logps/chosen": -438.8931579589844, + "logps/rejected": -474.4143981933594, + "loss": 0.8308, + "rewards/accuracies": 0.7314814925193787, + "rewards/chosen": -14.169482231140137, + "rewards/margins": 3.389918327331543, + "rewards/rejected": -17.559402465820312, + "step": 483 + }, + { + "epoch": 0.9582648701938876, + "grad_norm": 40.86037826538086, + "learning_rate": 4.310772002378613e-06, + "logits/chosen": -0.33903825283050537, + "logits/rejected": -0.27782881259918213, + "logps/chosen": -430.27288818359375, + "logps/rejected": -489.8445739746094, + "loss": 0.7149, + "rewards/accuracies": 0.7407407164573669, + "rewards/chosen": -14.304798126220703, + "rewards/margins": 4.365303993225098, + "rewards/rejected": -18.670101165771484, + "step": 486 + }, + { + "epoch": 0.9641800854419981, + "grad_norm": 38.9904899597168, + "learning_rate": 4.298880011027067e-06, + "logits/chosen": -0.3663506507873535, + "logits/rejected": -0.2783759832382202, + "logps/chosen": -428.8545837402344, + "logps/rejected": -498.8543701171875, + "loss": 0.54, + "rewards/accuracies": 0.75, + "rewards/chosen": -14.275480270385742, + "rewards/margins": 4.776096343994141, + "rewards/rejected": -19.051578521728516, + "step": 489 + }, + { + "epoch": 0.9700953006901084, + "grad_norm": 33.28893280029297, + "learning_rate": 4.286903010103267e-06, + "logits/chosen": -0.34440621733665466, + "logits/rejected": -0.3880541920661926, + "logps/chosen": -448.9291687011719, + "logps/rejected": -492.80609130859375, + "loss": 0.7484, + "rewards/accuracies": 0.7546297311782837, + "rewards/chosen": -13.652152061462402, + "rewards/margins": 4.3886871337890625, + "rewards/rejected": -18.04084014892578, + "step": 492 + }, + { + "epoch": 0.9760105159382189, + "grad_norm": 23.393043518066406, + "learning_rate": 4.274841565603674e-06, + "logits/chosen": -0.3807776868343353, + "logits/rejected": -0.37542426586151123, + "logps/chosen": -420.848388671875, + "logps/rejected": -468.35150146484375, + "loss": 0.6653, + "rewards/accuracies": 0.7361111044883728, + "rewards/chosen": -13.749267578125, + "rewards/margins": 3.9116599559783936, + "rewards/rejected": -17.660926818847656, + "step": 495 + }, + { + "epoch": 0.9819257311863293, + "grad_norm": 71.44178771972656, + "learning_rate": 4.262696247515298e-06, + "logits/chosen": -0.33630847930908203, + "logits/rejected": -0.3011205196380615, + "logps/chosen": -427.74493408203125, + "logps/rejected": -486.2608337402344, + "loss": 0.79, + "rewards/accuracies": 0.7453703284263611, + "rewards/chosen": -13.645870208740234, + "rewards/margins": 3.291267156600952, + "rewards/rejected": -16.937137603759766, + "step": 498 + }, + { + "epoch": 0.9878409464344396, + "grad_norm": 38.925453186035156, + "learning_rate": 4.250467629788758e-06, + "logits/chosen": -0.3359528183937073, + "logits/rejected": -0.32171425223350525, + "logps/chosen": -416.31011962890625, + "logps/rejected": -453.65667724609375, + "loss": 0.8185, + "rewards/accuracies": 0.7592593431472778, + "rewards/chosen": -13.421798706054688, + "rewards/margins": 3.002066135406494, + "rewards/rejected": -16.423866271972656, + "step": 501 + }, + { + "epoch": 0.9937561616825501, + "grad_norm": 43.50007247924805, + "learning_rate": 4.238156290311159e-06, + "logits/chosen": -0.2119762897491455, + "logits/rejected": -0.21785835921764374, + "logps/chosen": -419.2802734375, + "logps/rejected": -464.8529052734375, + "loss": 0.8978, + "rewards/accuracies": 0.7083333730697632, + "rewards/chosen": -12.489054679870605, + "rewards/margins": 2.8680572509765625, + "rewards/rejected": -15.357111930847168, + "step": 504 + }, + { + "epoch": 0.9996713769306605, + "grad_norm": 28.424711227416992, + "learning_rate": 4.2257628108787855e-06, + "logits/chosen": -0.3750268220901489, + "logits/rejected": -0.3536463975906372, + "logps/chosen": -406.7919921875, + "logps/rejected": -456.8607177734375, + "loss": 0.8355, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.890769004821777, + "rewards/margins": 3.3902645111083984, + "rewards/rejected": -15.281034469604492, + "step": 507 + }, + { + "epoch": 1.0039434768320736, + "grad_norm": 9.685483932495117, + "learning_rate": 4.2132877771696e-06, + "logits/chosen": -0.44336915016174316, + "logits/rejected": -0.3426854610443115, + "logps/chosen": -401.211669921875, + "logps/rejected": -507.76446533203125, + "loss": 0.2311, + "rewards/accuracies": 0.9487179517745972, + "rewards/chosen": -8.906108856201172, + "rewards/margins": 9.99924087524414, + "rewards/rejected": -18.905351638793945, + "step": 510 + }, + { + "epoch": 1.009858692080184, + "grad_norm": 15.767539978027344, + "learning_rate": 4.200731778715575e-06, + "logits/chosen": -0.3226369619369507, + "logits/rejected": -0.23430070281028748, + "logps/chosen": -386.5018005371094, + "logps/rejected": -498.0735778808594, + "loss": 0.0674, + "rewards/accuracies": 0.9722223281860352, + "rewards/chosen": -8.576701164245605, + "rewards/margins": 10.921747207641602, + "rewards/rejected": -19.49844741821289, + "step": 513 + }, + { + "epoch": 1.0157739073282945, + "grad_norm": 8.739374160766602, + "learning_rate": 4.188095408874829e-06, + "logits/chosen": -0.3742499053478241, + "logits/rejected": -0.33725112676620483, + "logps/chosen": -354.2779235839844, + "logps/rejected": -456.7283935546875, + "loss": 0.0407, + "rewards/accuracies": 0.9907407760620117, + "rewards/chosen": -8.695253372192383, + "rewards/margins": 9.780828475952148, + "rewards/rejected": -18.47608184814453, + "step": 516 + }, + { + "epoch": 1.0216891225764049, + "grad_norm": 13.139928817749023, + "learning_rate": 4.175379264803587e-06, + "logits/chosen": -0.37535345554351807, + "logits/rejected": -0.31396183371543884, + "logps/chosen": -375.81396484375, + "logps/rejected": -471.1697692871094, + "loss": 0.0762, + "rewards/accuracies": 0.9768518805503845, + "rewards/chosen": -8.316873550415039, + "rewards/margins": 9.90269660949707, + "rewards/rejected": -18.219572067260742, + "step": 519 + }, + { + "epoch": 1.0276043378245152, + "grad_norm": 3.5331549644470215, + "learning_rate": 4.162583947427958e-06, + "logits/chosen": -0.40028223395347595, + "logits/rejected": -0.22156159579753876, + "logps/chosen": -378.267822265625, + "logps/rejected": -504.5182800292969, + "loss": 0.1258, + "rewards/accuracies": 0.953703761100769, + "rewards/chosen": -9.443648338317871, + "rewards/margins": 11.473689079284668, + "rewards/rejected": -20.917339324951172, + "step": 522 + }, + { + "epoch": 1.0335195530726258, + "grad_norm": 10.87333869934082, + "learning_rate": 4.149710061415542e-06, + "logits/chosen": -0.3893941044807434, + "logits/rejected": -0.22475658357143402, + "logps/chosen": -389.7325134277344, + "logps/rejected": -521.1325073242188, + "loss": 0.0873, + "rewards/accuracies": 0.9675926566123962, + "rewards/chosen": -9.745861053466797, + "rewards/margins": 12.743724822998047, + "rewards/rejected": -22.489585876464844, + "step": 525 + }, + { + "epoch": 1.0394347683207361, + "grad_norm": 10.995318412780762, + "learning_rate": 4.13675821514685e-06, + "logits/chosen": -0.39965152740478516, + "logits/rejected": -0.252058744430542, + "logps/chosen": -410.03411865234375, + "logps/rejected": -544.2578735351562, + "loss": 0.0969, + "rewards/accuracies": 0.9675926566123962, + "rewards/chosen": -11.346731185913086, + "rewards/margins": 11.341825485229492, + "rewards/rejected": -22.688552856445312, + "step": 528 + }, + { + "epoch": 1.0453499835688465, + "grad_norm": 8.572622299194336, + "learning_rate": 4.12372902068656e-06, + "logits/chosen": -0.4358088970184326, + "logits/rejected": -0.2031504511833191, + "logps/chosen": -403.5755310058594, + "logps/rejected": -527.644775390625, + "loss": 0.1168, + "rewards/accuracies": 0.9444444179534912, + "rewards/chosen": -11.759601593017578, + "rewards/margins": 10.76253890991211, + "rewards/rejected": -22.522140502929688, + "step": 531 + }, + { + "epoch": 1.051265198816957, + "grad_norm": 7.874849796295166, + "learning_rate": 4.110623093754585e-06, + "logits/chosen": -0.38974201679229736, + "logits/rejected": -0.25777122378349304, + "logps/chosen": -408.3241882324219, + "logps/rejected": -516.1395263671875, + "loss": 0.1062, + "rewards/accuracies": 0.9629629850387573, + "rewards/chosen": -11.630317687988281, + "rewards/margins": 10.122415542602539, + "rewards/rejected": -21.752735137939453, + "step": 534 + }, + { + "epoch": 1.0571804140650674, + "grad_norm": 7.267045974731445, + "learning_rate": 4.097441053696985e-06, + "logits/chosen": -0.44127148389816284, + "logits/rejected": -0.32113516330718994, + "logps/chosen": -430.0728759765625, + "logps/rejected": -561.5933837890625, + "loss": 0.0811, + "rewards/accuracies": 0.9722222685813904, + "rewards/chosen": -11.996893882751465, + "rewards/margins": 12.880267143249512, + "rewards/rejected": -24.87716293334961, + "step": 537 + }, + { + "epoch": 1.0630956293131777, + "grad_norm": 26.037012100219727, + "learning_rate": 4.08418352345669e-06, + "logits/chosen": -0.3767847418785095, + "logits/rejected": -0.31423503160476685, + "logps/chosen": -406.0875244140625, + "logps/rejected": -526.44482421875, + "loss": 0.0899, + "rewards/accuracies": 0.9768518805503845, + "rewards/chosen": -10.038455963134766, + "rewards/margins": 13.710319519042969, + "rewards/rejected": -23.748777389526367, + "step": 540 + }, + { + "epoch": 1.0690108445612883, + "grad_norm": 13.05281925201416, + "learning_rate": 4.070851129544065e-06, + "logits/chosen": -0.465512216091156, + "logits/rejected": -0.2587343454360962, + "logps/chosen": -399.33642578125, + "logps/rejected": -554.8599243164062, + "loss": 0.1257, + "rewards/accuracies": 0.953703761100769, + "rewards/chosen": -10.270711898803711, + "rewards/margins": 13.220477104187012, + "rewards/rejected": -23.491188049316406, + "step": 543 + }, + { + "epoch": 1.0749260598093986, + "grad_norm": 24.870304107666016, + "learning_rate": 4.057444502007306e-06, + "logits/chosen": -0.5171704888343811, + "logits/rejected": -0.33416056632995605, + "logps/chosen": -398.2466735839844, + "logps/rejected": -536.3380737304688, + "loss": 0.1337, + "rewards/accuracies": 0.953703761100769, + "rewards/chosen": -9.948308944702148, + "rewards/margins": 13.192928314208984, + "rewards/rejected": -23.1412353515625, + "step": 546 + }, + { + "epoch": 1.0808412750575092, + "grad_norm": 3.7352793216705322, + "learning_rate": 4.043964274402663e-06, + "logits/chosen": -0.4453051686286926, + "logits/rejected": -0.30837786197662354, + "logps/chosen": -386.0931091308594, + "logps/rejected": -503.6631774902344, + "loss": 0.0532, + "rewards/accuracies": 0.9768519401550293, + "rewards/chosen": -9.698603630065918, + "rewards/margins": 12.827930450439453, + "rewards/rejected": -22.526535034179688, + "step": 549 + }, + { + "epoch": 1.0867564903056195, + "grad_norm": 9.68443489074707, + "learning_rate": 4.030411083764498e-06, + "logits/chosen": -0.5020241737365723, + "logits/rejected": -0.28794384002685547, + "logps/chosen": -378.8649597167969, + "logps/rejected": -540.6756591796875, + "loss": 0.0671, + "rewards/accuracies": 0.9814814925193787, + "rewards/chosen": -9.302837371826172, + "rewards/margins": 13.408391952514648, + "rewards/rejected": -22.711231231689453, + "step": 552 + }, + { + "epoch": 1.0926717055537298, + "grad_norm": 20.763072967529297, + "learning_rate": 4.0167855705751855e-06, + "logits/chosen": -0.5195566415786743, + "logits/rejected": -0.35310834646224976, + "logps/chosen": -388.2412109375, + "logps/rejected": -517.09375, + "loss": 0.1577, + "rewards/accuracies": 0.9351853132247925, + "rewards/chosen": -9.682899475097656, + "rewards/margins": 12.611946105957031, + "rewards/rejected": -22.294845581054688, + "step": 555 + }, + { + "epoch": 1.0985869208018402, + "grad_norm": 10.18380069732666, + "learning_rate": 4.003088378734841e-06, + "logits/chosen": -0.5263486504554749, + "logits/rejected": -0.3212735056877136, + "logps/chosen": -395.2178649902344, + "logps/rejected": -554.233642578125, + "loss": 0.0954, + "rewards/accuracies": 0.9722222685813904, + "rewards/chosen": -11.4990234375, + "rewards/margins": 13.107860565185547, + "rewards/rejected": -24.606884002685547, + "step": 558 + }, + { + "epoch": 1.1045021360499507, + "grad_norm": 17.160139083862305, + "learning_rate": 3.989320155530894e-06, + "logits/chosen": -0.4392577111721039, + "logits/rejected": -0.2696951925754547, + "logps/chosen": -410.08343505859375, + "logps/rejected": -540.97509765625, + "loss": 0.1387, + "rewards/accuracies": 0.9583333730697632, + "rewards/chosen": -11.576102256774902, + "rewards/margins": 12.610298156738281, + "rewards/rejected": -24.186399459838867, + "step": 561 + }, + { + "epoch": 1.110417351298061, + "grad_norm": 16.328100204467773, + "learning_rate": 3.9754815516075e-06, + "logits/chosen": -0.49670523405075073, + "logits/rejected": -0.22816550731658936, + "logps/chosen": -402.760498046875, + "logps/rejected": -556.445556640625, + "loss": 0.1141, + "rewards/accuracies": 0.9490741491317749, + "rewards/chosen": -11.802339553833008, + "rewards/margins": 13.369563102722168, + "rewards/rejected": -25.171903610229492, + "step": 564 + }, + { + "epoch": 1.1163325665461716, + "grad_norm": 3.9692351818084717, + "learning_rate": 3.9615732209347925e-06, + "logits/chosen": -0.4443477988243103, + "logits/rejected": -0.30315929651260376, + "logps/chosen": -390.0570068359375, + "logps/rejected": -514.3804931640625, + "loss": 0.1019, + "rewards/accuracies": 0.9629630446434021, + "rewards/chosen": -11.126566886901855, + "rewards/margins": 10.942947387695312, + "rewards/rejected": -22.069515228271484, + "step": 567 + }, + { + "epoch": 1.122247781794282, + "grad_norm": 14.457867622375488, + "learning_rate": 3.947595820777978e-06, + "logits/chosen": -0.517672061920166, + "logits/rejected": -0.24748222529888153, + "logps/chosen": -386.17291259765625, + "logps/rejected": -539.0173950195312, + "loss": 0.1354, + "rewards/accuracies": 0.9444444179534912, + "rewards/chosen": -11.382842063903809, + "rewards/margins": 13.581473350524902, + "rewards/rejected": -24.96431541442871, + "step": 570 + }, + { + "epoch": 1.1281629970423923, + "grad_norm": 8.523018836975098, + "learning_rate": 3.933550011666275e-06, + "logits/chosen": -0.401355117559433, + "logits/rejected": -0.2663224935531616, + "logps/chosen": -422.0758972167969, + "logps/rejected": -550.5574951171875, + "loss": 0.1321, + "rewards/accuracies": 0.9444444179534912, + "rewards/chosen": -12.331350326538086, + "rewards/margins": 13.582274436950684, + "rewards/rejected": -25.913619995117188, + "step": 573 + }, + { + "epoch": 1.1340782122905029, + "grad_norm": 37.025264739990234, + "learning_rate": 3.919436457361701e-06, + "logits/chosen": -0.3494156002998352, + "logits/rejected": -0.2177804559469223, + "logps/chosen": -423.7979431152344, + "logps/rejected": -559.4660034179688, + "loss": 0.1034, + "rewards/accuracies": 0.9675926566123962, + "rewards/chosen": -12.480727195739746, + "rewards/margins": 12.493712425231934, + "rewards/rejected": -24.97443962097168, + "step": 576 + }, + { + "epoch": 1.1399934275386132, + "grad_norm": 10.311182975769043, + "learning_rate": 3.905255824827703e-06, + "logits/chosen": -0.404990017414093, + "logits/rejected": -0.23333707451820374, + "logps/chosen": -400.90179443359375, + "logps/rejected": -549.761962890625, + "loss": 0.0563, + "rewards/accuracies": 0.9861111044883728, + "rewards/chosen": -12.794790267944336, + "rewards/margins": 12.089736938476562, + "rewards/rejected": -24.8845272064209, + "step": 579 + }, + { + "epoch": 1.1459086427867236, + "grad_norm": 16.51976203918457, + "learning_rate": 3.891008784197642e-06, + "logits/chosen": -0.4602348208427429, + "logits/rejected": -0.32792428135871887, + "logps/chosen": -443.4609069824219, + "logps/rejected": -569.1358642578125, + "loss": 0.1204, + "rewards/accuracies": 0.9490741491317749, + "rewards/chosen": -13.16263484954834, + "rewards/margins": 12.034723281860352, + "rewards/rejected": -25.197355270385742, + "step": 582 + }, + { + "epoch": 1.1518238580348341, + "grad_norm": 19.857742309570312, + "learning_rate": 3.87669600874312e-06, + "logits/chosen": -0.45605939626693726, + "logits/rejected": -0.2824591398239136, + "logps/chosen": -440.00982666015625, + "logps/rejected": -580.4444580078125, + "loss": 0.1661, + "rewards/accuracies": 0.9490741491317749, + "rewards/chosen": -13.881183624267578, + "rewards/margins": 12.989599227905273, + "rewards/rejected": -26.87078094482422, + "step": 585 + }, + { + "epoch": 1.1577390732829445, + "grad_norm": 13.731415748596191, + "learning_rate": 3.8623181748421705e-06, + "logits/chosen": -0.5049574375152588, + "logits/rejected": -0.3214029371738434, + "logps/chosen": -420.5501403808594, + "logps/rejected": -567.5693969726562, + "loss": 0.0816, + "rewards/accuracies": 0.9768518805503845, + "rewards/chosen": -13.632789611816406, + "rewards/margins": 13.632120132446289, + "rewards/rejected": -27.264907836914062, + "step": 588 + }, + { + "epoch": 1.1636542885310548, + "grad_norm": 8.935365676879883, + "learning_rate": 3.847875961947284e-06, + "logits/chosen": -0.4097817540168762, + "logits/rejected": -0.30726078152656555, + "logps/chosen": -445.2584228515625, + "logps/rejected": -565.4356689453125, + "loss": 0.0672, + "rewards/accuracies": 0.9629629850387573, + "rewards/chosen": -13.714117050170898, + "rewards/margins": 12.752127647399902, + "rewards/rejected": -26.466245651245117, + "step": 591 + }, + { + "epoch": 1.1695695037791654, + "grad_norm": 7.481489181518555, + "learning_rate": 3.833370052553311e-06, + "logits/chosen": -0.44872909784317017, + "logits/rejected": -0.2258618324995041, + "logps/chosen": -420.02825927734375, + "logps/rejected": -583.5780029296875, + "loss": 0.106, + "rewards/accuracies": 0.9583333730697632, + "rewards/chosen": -12.75123119354248, + "rewards/margins": 15.216802597045898, + "rewards/rejected": -27.968032836914062, + "step": 594 + }, + { + "epoch": 1.1754847190272757, + "grad_norm": 18.711483001708984, + "learning_rate": 3.818801132165203e-06, + "logits/chosen": -0.5571283102035522, + "logits/rejected": -0.3472014367580414, + "logps/chosen": -406.1875915527344, + "logps/rejected": -583.6386108398438, + "loss": 0.1648, + "rewards/accuracies": 0.953703761100769, + "rewards/chosen": -11.881487846374512, + "rewards/margins": 16.52305793762207, + "rewards/rejected": -28.404544830322266, + "step": 597 + }, + { + "epoch": 1.1813999342753863, + "grad_norm": 16.07924461364746, + "learning_rate": 3.804169889265615e-06, + "logits/chosen": -0.5486902594566345, + "logits/rejected": -0.3185918927192688, + "logps/chosen": -393.58740234375, + "logps/rejected": -564.5778198242188, + "loss": 0.1159, + "rewards/accuracies": 0.953703761100769, + "rewards/chosen": -11.905487060546875, + "rewards/margins": 15.971022605895996, + "rewards/rejected": -27.876510620117188, + "step": 600 + }, + { + "epoch": 1.1873151495234966, + "grad_norm": 23.647781372070312, + "learning_rate": 3.789477015282377e-06, + "logits/chosen": -0.4597387909889221, + "logits/rejected": -0.3537699580192566, + "logps/chosen": -416.2174987792969, + "logps/rejected": -577.02099609375, + "loss": 0.1292, + "rewards/accuracies": 0.9398148059844971, + "rewards/chosen": -11.628500938415527, + "rewards/margins": 15.871562004089355, + "rewards/rejected": -27.500064849853516, + "step": 603 + }, + { + "epoch": 1.193230364771607, + "grad_norm": 11.807229995727539, + "learning_rate": 3.7747232045558145e-06, + "logits/chosen": -0.508562445640564, + "logits/rejected": -0.41957926750183105, + "logps/chosen": -424.5294189453125, + "logps/rejected": -576.8077392578125, + "loss": 0.0649, + "rewards/accuracies": 0.9675925970077515, + "rewards/chosen": -11.785578727722168, + "rewards/margins": 15.648004531860352, + "rewards/rejected": -27.433584213256836, + "step": 606 + }, + { + "epoch": 1.1991455800197173, + "grad_norm": 8.959671974182129, + "learning_rate": 3.7599091543059383e-06, + "logits/chosen": -0.5661264657974243, + "logits/rejected": -0.33349892497062683, + "logps/chosen": -407.1838073730469, + "logps/rejected": -614.8472900390625, + "loss": 0.1004, + "rewards/accuracies": 0.9490741491317749, + "rewards/chosen": -11.349664688110352, + "rewards/margins": 18.92013168334961, + "rewards/rejected": -30.26979637145996, + "step": 609 + }, + { + "epoch": 1.2050607952678278, + "grad_norm": 21.92123794555664, + "learning_rate": 3.745035564599493e-06, + "logits/chosen": -0.5120652914047241, + "logits/rejected": -0.305344820022583, + "logps/chosen": -409.43231201171875, + "logps/rejected": -560.684814453125, + "loss": 0.1223, + "rewards/accuracies": 0.9675925970077515, + "rewards/chosen": -11.581550598144531, + "rewards/margins": 15.356374740600586, + "rewards/rejected": -26.937925338745117, + "step": 612 + }, + { + "epoch": 1.2109760105159382, + "grad_norm": 11.959835052490234, + "learning_rate": 3.730103138316877e-06, + "logits/chosen": -0.40998566150665283, + "logits/rejected": -0.2449893057346344, + "logps/chosen": -407.49591064453125, + "logps/rejected": -565.201171875, + "loss": 0.0922, + "rewards/accuracies": 0.9768518805503845, + "rewards/chosen": -12.233043670654297, + "rewards/margins": 16.14073944091797, + "rewards/rejected": -28.3737850189209, + "step": 615 + }, + { + "epoch": 1.2168912257640487, + "grad_norm": 12.5054349899292, + "learning_rate": 3.715112581118926e-06, + "logits/chosen": -0.5086304545402527, + "logits/rejected": -0.3284660875797272, + "logps/chosen": -407.4039306640625, + "logps/rejected": -565.1151123046875, + "loss": 0.1273, + "rewards/accuracies": 0.9490741491317749, + "rewards/chosen": -11.724024772644043, + "rewards/margins": 15.240280151367188, + "rewards/rejected": -26.964305877685547, + "step": 618 + }, + { + "epoch": 1.222806441012159, + "grad_norm": 5.091789722442627, + "learning_rate": 3.7000646014135644e-06, + "logits/chosen": -0.5005905032157898, + "logits/rejected": -0.30296605825424194, + "logps/chosen": -387.29046630859375, + "logps/rejected": -552.9913330078125, + "loss": 0.1661, + "rewards/accuracies": 0.9398148059844971, + "rewards/chosen": -11.790307998657227, + "rewards/margins": 13.575780868530273, + "rewards/rejected": -25.366086959838867, + "step": 621 + }, + { + "epoch": 1.2287216562602694, + "grad_norm": 11.061638832092285, + "learning_rate": 3.684959910322328e-06, + "logits/chosen": -0.4842309355735779, + "logits/rejected": -0.3434720039367676, + "logps/chosen": -418.47467041015625, + "logps/rejected": -548.7460327148438, + "loss": 0.1391, + "rewards/accuracies": 0.9444445371627808, + "rewards/chosen": -12.249799728393555, + "rewards/margins": 13.029688835144043, + "rewards/rejected": -25.27948760986328, + "step": 624 + }, + { + "epoch": 1.23463687150838, + "grad_norm": 7.677492141723633, + "learning_rate": 3.669799221646763e-06, + "logits/chosen": -0.6426317691802979, + "logits/rejected": -0.38971924781799316, + "logps/chosen": -397.60284423828125, + "logps/rejected": -572.2548217773438, + "loss": 0.0773, + "rewards/accuracies": 0.9675925970077515, + "rewards/chosen": -11.118556022644043, + "rewards/margins": 14.920271873474121, + "rewards/rejected": -26.03882598876953, + "step": 627 + }, + { + "epoch": 1.2405520867564903, + "grad_norm": 25.286819458007812, + "learning_rate": 3.6545832518346858e-06, + "logits/chosen": -0.5745781660079956, + "logits/rejected": -0.33949047327041626, + "logps/chosen": -411.95452880859375, + "logps/rejected": -592.3193359375, + "loss": 0.1103, + "rewards/accuracies": 0.953703761100769, + "rewards/chosen": -11.729289054870605, + "rewards/margins": 16.89739990234375, + "rewards/rejected": -28.626689910888672, + "step": 630 + }, + { + "epoch": 1.2464673020046007, + "grad_norm": 8.95457649230957, + "learning_rate": 3.6393127199463324e-06, + "logits/chosen": -0.6063730716705322, + "logits/rejected": -0.3560806214809418, + "logps/chosen": -423.2503662109375, + "logps/rejected": -585.0638427734375, + "loss": 0.1884, + "rewards/accuracies": 0.9444444179534912, + "rewards/chosen": -13.2220458984375, + "rewards/margins": 14.846598625183105, + "rewards/rejected": -28.068647384643555, + "step": 633 + }, + { + "epoch": 1.2523825172527112, + "grad_norm": 8.886478424072266, + "learning_rate": 3.623988347620377e-06, + "logits/chosen": -0.49550676345825195, + "logits/rejected": -0.34937870502471924, + "logps/chosen": -395.3653564453125, + "logps/rejected": -540.3450317382812, + "loss": 0.0642, + "rewards/accuracies": 0.9814814925193787, + "rewards/chosen": -12.46823501586914, + "rewards/margins": 13.855956077575684, + "rewards/rejected": -26.324188232421875, + "step": 636 + }, + { + "epoch": 1.2582977325008216, + "grad_norm": 20.726913452148438, + "learning_rate": 3.608610859039825e-06, + "logits/chosen": -0.4779970049858093, + "logits/rejected": -0.2709328830242157, + "logps/chosen": -409.822021484375, + "logps/rejected": -580.0155029296875, + "loss": 0.1294, + "rewards/accuracies": 0.953703761100769, + "rewards/chosen": -13.424339294433594, + "rewards/margins": 14.51318359375, + "rewards/rejected": -27.937522888183594, + "step": 639 + }, + { + "epoch": 1.264212947748932, + "grad_norm": 2.6223342418670654, + "learning_rate": 3.593180980897798e-06, + "logits/chosen": -0.4195180833339691, + "logits/rejected": -0.25311508774757385, + "logps/chosen": -419.2818298339844, + "logps/rejected": -596.8194580078125, + "loss": 0.1061, + "rewards/accuracies": 0.9629629850387573, + "rewards/chosen": -13.21990966796875, + "rewards/margins": 15.108877182006836, + "rewards/rejected": -28.328786849975586, + "step": 642 + }, + { + "epoch": 1.2701281629970425, + "grad_norm": 8.978631019592285, + "learning_rate": 3.577699442363185e-06, + "logits/chosen": -0.4365012049674988, + "logits/rejected": -0.2796782851219177, + "logps/chosen": -408.720703125, + "logps/rejected": -583.4784545898438, + "loss": 0.1048, + "rewards/accuracies": 0.958333432674408, + "rewards/chosen": -12.458578109741211, + "rewards/margins": 14.986913681030273, + "rewards/rejected": -27.445491790771484, + "step": 645 + }, + { + "epoch": 1.2760433782451528, + "grad_norm": 21.892375946044922, + "learning_rate": 3.5621669750461884e-06, + "logits/chosen": -0.3126218914985657, + "logits/rejected": -0.19176405668258667, + "logps/chosen": -419.88702392578125, + "logps/rejected": -584.5615844726562, + "loss": 0.1656, + "rewards/accuracies": 0.953703761100769, + "rewards/chosen": -13.262084007263184, + "rewards/margins": 14.785198211669922, + "rewards/rejected": -28.047279357910156, + "step": 648 + }, + { + "epoch": 1.2819585934932634, + "grad_norm": 12.852477073669434, + "learning_rate": 3.5465843129637485e-06, + "logits/chosen": -0.41281387209892273, + "logits/rejected": -0.2300848811864853, + "logps/chosen": -425.6917419433594, + "logps/rejected": -604.7215576171875, + "loss": 0.0975, + "rewards/accuracies": 0.9722222685813904, + "rewards/chosen": -13.764169692993164, + "rewards/margins": 16.54694175720215, + "rewards/rejected": -30.311111450195312, + "step": 651 + }, + { + "epoch": 1.2878738087413737, + "grad_norm": 10.992759704589844, + "learning_rate": 3.5309521925048583e-06, + "logits/chosen": -0.3890347480773926, + "logits/rejected": -0.23443222045898438, + "logps/chosen": -412.0900573730469, + "logps/rejected": -558.9166870117188, + "loss": 0.0783, + "rewards/accuracies": 0.9629629850387573, + "rewards/chosen": -14.06924057006836, + "rewards/margins": 13.748847961425781, + "rewards/rejected": -27.818090438842773, + "step": 654 + }, + { + "epoch": 1.293789023989484, + "grad_norm": 3.0267937183380127, + "learning_rate": 3.515271352395761e-06, + "logits/chosen": -0.454285591840744, + "logits/rejected": -0.2215675413608551, + "logps/chosen": -401.04071044921875, + "logps/rejected": -567.78857421875, + "loss": 0.0873, + "rewards/accuracies": 0.9722222685813904, + "rewards/chosen": -12.640899658203125, + "rewards/margins": 14.464056015014648, + "rewards/rejected": -27.104957580566406, + "step": 657 + }, + { + "epoch": 1.2997042392375944, + "grad_norm": 13.43500804901123, + "learning_rate": 3.4995425336650426e-06, + "logits/chosen": -0.4409823417663574, + "logits/rejected": -0.33086246252059937, + "logps/chosen": -431.1331481933594, + "logps/rejected": -579.64306640625, + "loss": 0.1357, + "rewards/accuracies": 0.9583333730697632, + "rewards/chosen": -12.985621452331543, + "rewards/margins": 13.821687698364258, + "rewards/rejected": -26.80731201171875, + "step": 660 + }, + { + "epoch": 1.305619454485705, + "grad_norm": 6.95866060256958, + "learning_rate": 3.4837664796086114e-06, + "logits/chosen": -0.5430381298065186, + "logits/rejected": -0.3280678987503052, + "logps/chosen": -414.4799499511719, + "logps/rejected": -570.076904296875, + "loss": 0.0622, + "rewards/accuracies": 0.9814814925193787, + "rewards/chosen": -12.180598258972168, + "rewards/margins": 15.332086563110352, + "rewards/rejected": -27.512685775756836, + "step": 663 + }, + { + "epoch": 1.3115346697338153, + "grad_norm": 19.8109073638916, + "learning_rate": 3.4679439357545735e-06, + "logits/chosen": -0.495805025100708, + "logits/rejected": -0.2872101068496704, + "logps/chosen": -418.2121276855469, + "logps/rejected": -591.630615234375, + "loss": 0.1216, + "rewards/accuracies": 0.9768519401550293, + "rewards/chosen": -12.411150932312012, + "rewards/margins": 15.308040618896484, + "rewards/rejected": -27.719192504882812, + "step": 666 + }, + { + "epoch": 1.3174498849819258, + "grad_norm": 13.173885345458984, + "learning_rate": 3.452075649828e-06, + "logits/chosen": -0.45853424072265625, + "logits/rejected": -0.3134307861328125, + "logps/chosen": -420.82940673828125, + "logps/rejected": -568.0089111328125, + "loss": 0.1106, + "rewards/accuracies": 0.9675926566123962, + "rewards/chosen": -12.83984375, + "rewards/margins": 14.545321464538574, + "rewards/rejected": -27.38516616821289, + "step": 669 + }, + { + "epoch": 1.3233651002300362, + "grad_norm": 18.055919647216797, + "learning_rate": 3.4361623717155944e-06, + "logits/chosen": -0.43652403354644775, + "logits/rejected": -0.2904703617095947, + "logps/chosen": -436.1803894042969, + "logps/rejected": -587.0079345703125, + "loss": 0.1241, + "rewards/accuracies": 0.953703761100769, + "rewards/chosen": -13.198162078857422, + "rewards/margins": 14.956771850585938, + "rewards/rejected": -28.154935836791992, + "step": 672 + }, + { + "epoch": 1.3292803154781465, + "grad_norm": 9.747901916503906, + "learning_rate": 3.420204853430251e-06, + "logits/chosen": -0.4740216135978699, + "logits/rejected": -0.2630161941051483, + "logps/chosen": -425.52972412109375, + "logps/rejected": -606.2034912109375, + "loss": 0.1055, + "rewards/accuracies": 0.958333432674408, + "rewards/chosen": -14.687101364135742, + "rewards/margins": 15.13984203338623, + "rewards/rejected": -29.826946258544922, + "step": 675 + }, + { + "epoch": 1.3351955307262569, + "grad_norm": 6.129786968231201, + "learning_rate": 3.4042038490755204e-06, + "logits/chosen": -0.4343684911727905, + "logits/rejected": -0.3021864593029022, + "logps/chosen": -421.15234375, + "logps/rejected": -597.9252319335938, + "loss": 0.0662, + "rewards/accuracies": 0.9675926566123962, + "rewards/chosen": -13.493698120117188, + "rewards/margins": 16.47793197631836, + "rewards/rejected": -29.97163200378418, + "step": 678 + }, + { + "epoch": 1.3411107459743674, + "grad_norm": 8.284217834472656, + "learning_rate": 3.388160114809972e-06, + "logits/chosen": -0.47282737493515015, + "logits/rejected": -0.345129132270813, + "logps/chosen": -419.5128173828125, + "logps/rejected": -584.769775390625, + "loss": 0.0648, + "rewards/accuracies": 0.9722222685813904, + "rewards/chosen": -12.615592002868652, + "rewards/margins": 14.85853385925293, + "rewards/rejected": -27.474124908447266, + "step": 681 + }, + { + "epoch": 1.3470259612224778, + "grad_norm": 15.322405815124512, + "learning_rate": 3.372074408811462e-06, + "logits/chosen": -0.501899003982544, + "logits/rejected": -0.34835514426231384, + "logps/chosen": -441.57080078125, + "logps/rejected": -649.8969116210938, + "loss": 0.1257, + "rewards/accuracies": 0.9722222685813904, + "rewards/chosen": -12.605691909790039, + "rewards/margins": 20.80094337463379, + "rewards/rejected": -33.40663528442383, + "step": 684 + }, + { + "epoch": 1.3529411764705883, + "grad_norm": 5.639370441436768, + "learning_rate": 3.355947491241299e-06, + "logits/chosen": -0.452075719833374, + "logits/rejected": -0.229254812002182, + "logps/chosen": -398.7666015625, + "logps/rejected": -573.904052734375, + "loss": 0.0981, + "rewards/accuracies": 0.9675925970077515, + "rewards/chosen": -11.79572582244873, + "rewards/margins": 15.229665756225586, + "rewards/rejected": -27.025390625, + "step": 687 + }, + { + "epoch": 1.3588563917186987, + "grad_norm": 6.3108367919921875, + "learning_rate": 3.339780124208326e-06, + "logits/chosen": -0.49501562118530273, + "logits/rejected": -0.32818734645843506, + "logps/chosen": -414.3514404296875, + "logps/rejected": -599.226806640625, + "loss": 0.0966, + "rewards/accuracies": 0.9629629850387573, + "rewards/chosen": -11.531137466430664, + "rewards/margins": 16.380834579467773, + "rewards/rejected": -27.911970138549805, + "step": 690 + }, + { + "epoch": 1.364771606966809, + "grad_norm": 6.605685710906982, + "learning_rate": 3.3235730717329068e-06, + "logits/chosen": -0.4823850989341736, + "logits/rejected": -0.3090657889842987, + "logps/chosen": -404.1416931152344, + "logps/rejected": -587.4310913085938, + "loss": 0.0872, + "rewards/accuracies": 0.9583333730697632, + "rewards/chosen": -12.170730590820312, + "rewards/margins": 14.946853637695312, + "rewards/rejected": -27.117584228515625, + "step": 693 + }, + { + "epoch": 1.3706868222149196, + "grad_norm": 7.977323055267334, + "learning_rate": 3.3073270997108125e-06, + "logits/chosen": -0.4905123710632324, + "logits/rejected": -0.26319581270217896, + "logps/chosen": -410.1026306152344, + "logps/rejected": -573.89208984375, + "loss": 0.1452, + "rewards/accuracies": 0.9583333730697632, + "rewards/chosen": -12.655623435974121, + "rewards/margins": 14.811518669128418, + "rewards/rejected": -27.467140197753906, + "step": 696 + }, + { + "epoch": 1.37660203746303, + "grad_norm": 8.694334983825684, + "learning_rate": 3.291042975877038e-06, + "logits/chosen": -0.4588570296764374, + "logits/rejected": -0.26584672927856445, + "logps/chosen": -423.863037109375, + "logps/rejected": -574.7138671875, + "loss": 0.0837, + "rewards/accuracies": 0.9629629850387573, + "rewards/chosen": -13.446654319763184, + "rewards/margins": 13.398460388183594, + "rewards/rejected": -26.84511375427246, + "step": 699 + }, + { + "epoch": 1.3825172527111405, + "grad_norm": 10.281498908996582, + "learning_rate": 3.274721469769513e-06, + "logits/chosen": -0.469176709651947, + "logits/rejected": -0.32011306285858154, + "logps/chosen": -428.8432312011719, + "logps/rejected": -581.0775756835938, + "loss": 0.1607, + "rewards/accuracies": 0.953703761100769, + "rewards/chosen": -14.345442771911621, + "rewards/margins": 13.746419906616211, + "rewards/rejected": -28.09186363220215, + "step": 702 + }, + { + "epoch": 1.3884324679592508, + "grad_norm": 18.152202606201172, + "learning_rate": 3.258363352692741e-06, + "logits/chosen": -0.34438759088516235, + "logits/rejected": -0.2696823477745056, + "logps/chosen": -443.9820251464844, + "logps/rejected": -565.47119140625, + "loss": 0.245, + "rewards/accuracies": 0.953703761100769, + "rewards/chosen": -14.562562942504883, + "rewards/margins": 12.881950378417969, + "rewards/rejected": -27.444515228271484, + "step": 705 + }, + { + "epoch": 1.3943476832073611, + "grad_norm": 6.2034173011779785, + "learning_rate": 3.2419693976813477e-06, + "logits/chosen": -0.41304439306259155, + "logits/rejected": -0.2365877628326416, + "logps/chosen": -416.036376953125, + "logps/rejected": -572.3029174804688, + "loss": 0.4041, + "rewards/accuracies": 0.9768518805503845, + "rewards/chosen": -13.416823387145996, + "rewards/margins": 14.599647521972656, + "rewards/rejected": -28.01647186279297, + "step": 708 + }, + { + "epoch": 1.4002628984554715, + "grad_norm": 25.40282440185547, + "learning_rate": 3.2255403794635497e-06, + "logits/chosen": -0.43869340419769287, + "logits/rejected": -0.35092705488204956, + "logps/chosen": -471.41424560546875, + "logps/rejected": -621.4520263671875, + "loss": 0.1842, + "rewards/accuracies": 0.9351851940155029, + "rewards/chosen": -15.23921012878418, + "rewards/margins": 14.903246879577637, + "rewards/rejected": -30.1424560546875, + "step": 711 + }, + { + "epoch": 1.406178113703582, + "grad_norm": 11.920066833496094, + "learning_rate": 3.2090770744245435e-06, + "logits/chosen": -0.459338515996933, + "logits/rejected": -0.2651335895061493, + "logps/chosen": -423.816650390625, + "logps/rejected": -563.6014404296875, + "loss": 0.0619, + "rewards/accuracies": 0.9675926566123962, + "rewards/chosen": -12.381643295288086, + "rewards/margins": 14.162805557250977, + "rewards/rejected": -26.544448852539062, + "step": 714 + }, + { + "epoch": 1.4120933289516924, + "grad_norm": 7.712157249450684, + "learning_rate": 3.1925802605698165e-06, + "logits/chosen": -0.4218568205833435, + "logits/rejected": -0.2566026449203491, + "logps/chosen": -420.7464294433594, + "logps/rejected": -582.0595703125, + "loss": 0.063, + "rewards/accuracies": 0.9814815521240234, + "rewards/chosen": -13.265060424804688, + "rewards/margins": 13.685523986816406, + "rewards/rejected": -26.95058822631836, + "step": 717 + }, + { + "epoch": 1.418008544199803, + "grad_norm": 22.951683044433594, + "learning_rate": 3.1760507174883804e-06, + "logits/chosen": -0.4511021077632904, + "logits/rejected": -0.29694703221321106, + "logps/chosen": -404.72821044921875, + "logps/rejected": -556.6552734375, + "loss": 0.1845, + "rewards/accuracies": 0.9351851940155029, + "rewards/chosen": -12.559541702270508, + "rewards/margins": 13.269923210144043, + "rewards/rejected": -25.829465866088867, + "step": 720 + }, + { + "epoch": 1.4239237594479133, + "grad_norm": 12.7056245803833, + "learning_rate": 3.1594892263159294e-06, + "logits/chosen": -0.4648706316947937, + "logits/rejected": -0.2607460021972656, + "logps/chosen": -397.7279968261719, + "logps/rejected": -555.120849609375, + "loss": 0.1086, + "rewards/accuracies": 0.953703761100769, + "rewards/chosen": -12.578474044799805, + "rewards/margins": 14.585589408874512, + "rewards/rejected": -27.1640625, + "step": 723 + }, + { + "epoch": 1.4298389746960236, + "grad_norm": 11.01905632019043, + "learning_rate": 3.1428965696979265e-06, + "logits/chosen": -0.46104860305786133, + "logits/rejected": -0.3599787950515747, + "logps/chosen": -405.425048828125, + "logps/rejected": -548.0807495117188, + "loss": 0.0793, + "rewards/accuracies": 0.9768518805503845, + "rewards/chosen": -11.586812973022461, + "rewards/margins": 13.873998641967773, + "rewards/rejected": -25.460811614990234, + "step": 726 + }, + { + "epoch": 1.435754189944134, + "grad_norm": 10.477839469909668, + "learning_rate": 3.1262735317526204e-06, + "logits/chosen": -0.4063342809677124, + "logits/rejected": -0.30593451857566833, + "logps/chosen": -418.88232421875, + "logps/rejected": -548.84130859375, + "loss": 0.0675, + "rewards/accuracies": 0.9814814925193787, + "rewards/chosen": -11.848334312438965, + "rewards/margins": 12.712087631225586, + "rewards/rejected": -24.560422897338867, + "step": 729 + }, + { + "epoch": 1.4416694051922445, + "grad_norm": 7.770344257354736, + "learning_rate": 3.109620898033987e-06, + "logits/chosen": -0.43280476331710815, + "logits/rejected": -0.3245890736579895, + "logps/chosen": -429.44140625, + "logps/rejected": -560.7389526367188, + "loss": 0.1388, + "rewards/accuracies": 0.958333432674408, + "rewards/chosen": -12.780267715454102, + "rewards/margins": 14.02849006652832, + "rewards/rejected": -26.808757781982422, + "step": 732 + }, + { + "epoch": 1.4475846204403549, + "grad_norm": 16.65318489074707, + "learning_rate": 3.0929394554946072e-06, + "logits/chosen": -0.4457210302352905, + "logits/rejected": -0.2969880700111389, + "logps/chosen": -427.81134033203125, + "logps/rejected": -596.4744873046875, + "loss": 0.0973, + "rewards/accuracies": 0.953703761100769, + "rewards/chosen": -14.292985916137695, + "rewards/margins": 14.159872055053711, + "rewards/rejected": -28.452857971191406, + "step": 735 + }, + { + "epoch": 1.4534998356884654, + "grad_norm": 6.961342811584473, + "learning_rate": 3.0762299924484817e-06, + "logits/chosen": -0.4221629500389099, + "logits/rejected": -0.30257365107536316, + "logps/chosen": -429.935791015625, + "logps/rejected": -584.0244140625, + "loss": 0.1806, + "rewards/accuracies": 0.9444445371627808, + "rewards/chosen": -13.865874290466309, + "rewards/margins": 14.795816421508789, + "rewards/rejected": -28.66168975830078, + "step": 738 + }, + { + "epoch": 1.4594150509365758, + "grad_norm": 12.587532997131348, + "learning_rate": 3.0594932985337715e-06, + "logits/chosen": -0.43738850951194763, + "logits/rejected": -0.29806679487228394, + "logps/chosen": -418.6221923828125, + "logps/rejected": -582.5604248046875, + "loss": 0.0767, + "rewards/accuracies": 0.9814814925193787, + "rewards/chosen": -14.361178398132324, + "rewards/margins": 14.669501304626465, + "rewards/rejected": -29.030677795410156, + "step": 741 + }, + { + "epoch": 1.465330266184686, + "grad_norm": 4.6231303215026855, + "learning_rate": 3.0427301646754874e-06, + "logits/chosen": -0.4250808358192444, + "logits/rejected": -0.22024689614772797, + "logps/chosen": -452.19024658203125, + "logps/rejected": -626.263427734375, + "loss": 0.1349, + "rewards/accuracies": 0.9583333730697632, + "rewards/chosen": -15.096145629882812, + "rewards/margins": 14.675666809082031, + "rewards/rejected": -29.771812438964844, + "step": 744 + }, + { + "epoch": 1.4712454814327967, + "grad_norm": 13.469663619995117, + "learning_rate": 3.0259413830481093e-06, + "logits/chosen": -0.4654272496700287, + "logits/rejected": -0.27737024426460266, + "logps/chosen": -429.6473388671875, + "logps/rejected": -578.0706787109375, + "loss": 0.1076, + "rewards/accuracies": 0.9490741491317749, + "rewards/chosen": -15.359736442565918, + "rewards/margins": 13.461069107055664, + "rewards/rejected": -28.8208065032959, + "step": 747 + }, + { + "epoch": 1.477160696680907, + "grad_norm": 15.791036605834961, + "learning_rate": 3.0091277470381547e-06, + "logits/chosen": -0.47309786081314087, + "logits/rejected": -0.27244430780410767, + "logps/chosen": -442.2904052734375, + "logps/rejected": -596.2742309570312, + "loss": 0.1425, + "rewards/accuracies": 0.9444445371627808, + "rewards/chosen": -15.658645629882812, + "rewards/margins": 13.340225219726562, + "rewards/rejected": -28.998870849609375, + "step": 750 + }, + { + "epoch": 1.4830759119290176, + "grad_norm": 12.009902000427246, + "learning_rate": 2.9922900512066804e-06, + "logits/chosen": -0.46984386444091797, + "logits/rejected": -0.34757542610168457, + "logps/chosen": -439.5838317871094, + "logps/rejected": -565.4759521484375, + "loss": 0.1453, + "rewards/accuracies": 0.9490741491317749, + "rewards/chosen": -14.995939254760742, + "rewards/margins": 13.354354858398438, + "rewards/rejected": -28.350296020507812, + "step": 753 + }, + { + "epoch": 1.488991127177128, + "grad_norm": 10.95486831665039, + "learning_rate": 2.975429091251739e-06, + "logits/chosen": -0.5273573398590088, + "logits/rejected": -0.3908497989177704, + "logps/chosen": -451.13671875, + "logps/rejected": -604.9403076171875, + "loss": 0.051, + "rewards/accuracies": 0.9768518805503845, + "rewards/chosen": -16.25088882446289, + "rewards/margins": 14.09473705291748, + "rewards/rejected": -30.345623016357422, + "step": 756 + }, + { + "epoch": 1.4949063424252382, + "grad_norm": 17.647659301757812, + "learning_rate": 2.958545663970774e-06, + "logits/chosen": -0.3867652416229248, + "logits/rejected": -0.29644548892974854, + "logps/chosen": -476.21478271484375, + "logps/rejected": -616.864013671875, + "loss": 0.1236, + "rewards/accuracies": 0.9490741491317749, + "rewards/chosen": -17.239521026611328, + "rewards/margins": 13.26838493347168, + "rewards/rejected": -30.50790786743164, + "step": 759 + }, + { + "epoch": 1.5008215576733486, + "grad_norm": 13.190269470214844, + "learning_rate": 2.941640567222966e-06, + "logits/chosen": -0.44550761580467224, + "logits/rejected": -0.3317570090293884, + "logps/chosen": -456.7379150390625, + "logps/rejected": -595.80810546875, + "loss": 0.0919, + "rewards/accuracies": 0.9629629850387573, + "rewards/chosen": -16.739410400390625, + "rewards/margins": 12.945990562438965, + "rewards/rejected": -29.68539810180664, + "step": 762 + }, + { + "epoch": 1.5067367729214591, + "grad_norm": 17.65411376953125, + "learning_rate": 2.924714599891527e-06, + "logits/chosen": -0.5359483361244202, + "logits/rejected": -0.4007814824581146, + "logps/chosen": -455.808837890625, + "logps/rejected": -608.6019287109375, + "loss": 0.1, + "rewards/accuracies": 0.9675925970077515, + "rewards/chosen": -16.04080581665039, + "rewards/margins": 14.246829986572266, + "rewards/rejected": -30.287635803222656, + "step": 765 + }, + { + "epoch": 1.5126519881695695, + "grad_norm": 12.9535551071167, + "learning_rate": 2.9077685618459523e-06, + "logits/chosen": -0.49911361932754517, + "logits/rejected": -0.3754135072231293, + "logps/chosen": -429.71112060546875, + "logps/rejected": -558.918212890625, + "loss": 0.118, + "rewards/accuracies": 0.9351853132247925, + "rewards/chosen": -15.281416893005371, + "rewards/margins": 12.335990905761719, + "rewards/rejected": -27.617406845092773, + "step": 768 + }, + { + "epoch": 1.51856720341768, + "grad_norm": 26.166357040405273, + "learning_rate": 2.890803253904214e-06, + "logits/chosen": -0.4002833962440491, + "logits/rejected": -0.2346002757549286, + "logps/chosen": -435.1705017089844, + "logps/rejected": -581.8082885742188, + "loss": 0.1564, + "rewards/accuracies": 0.953703761100769, + "rewards/chosen": -16.032634735107422, + "rewards/margins": 12.411070823669434, + "rewards/rejected": -28.443706512451172, + "step": 771 + }, + { + "epoch": 1.5244824186657904, + "grad_norm": 9.620137214660645, + "learning_rate": 2.8738194777949236e-06, + "logits/chosen": -0.5191614627838135, + "logits/rejected": -0.3987714946269989, + "logps/chosen": -438.0507507324219, + "logps/rejected": -569.0453491210938, + "loss": 0.147, + "rewards/accuracies": 0.9768518805503845, + "rewards/chosen": -15.467351913452148, + "rewards/margins": 12.752056121826172, + "rewards/rejected": -28.219406127929688, + "step": 774 + }, + { + "epoch": 1.5303976339139007, + "grad_norm": 15.478388786315918, + "learning_rate": 2.8568180361194394e-06, + "logits/chosen": -0.5597792863845825, + "logits/rejected": -0.3868427276611328, + "logps/chosen": -431.40277099609375, + "logps/rejected": -598.789794921875, + "loss": 0.1103, + "rewards/accuracies": 0.9629629850387573, + "rewards/chosen": -15.434844970703125, + "rewards/margins": 14.038825035095215, + "rewards/rejected": -29.473669052124023, + "step": 777 + }, + { + "epoch": 1.536312849162011, + "grad_norm": 20.271595001220703, + "learning_rate": 2.8397997323139424e-06, + "logits/chosen": -0.5222074389457703, + "logits/rejected": -0.3276955485343933, + "logps/chosen": -451.5032958984375, + "logps/rejected": -593.4445190429688, + "loss": 0.1287, + "rewards/accuracies": 0.9583333730697632, + "rewards/chosen": -14.768893241882324, + "rewards/margins": 14.125001907348633, + "rewards/rejected": -28.89389419555664, + "step": 780 + }, + { + "epoch": 1.5422280644101216, + "grad_norm": 13.155769348144531, + "learning_rate": 2.8227653706114653e-06, + "logits/chosen": -0.5657685995101929, + "logits/rejected": -0.3959158658981323, + "logps/chosen": -441.99755859375, + "logps/rejected": -606.1256103515625, + "loss": 0.0911, + "rewards/accuracies": 0.9629629850387573, + "rewards/chosen": -15.186637878417969, + "rewards/margins": 13.747825622558594, + "rewards/rejected": -28.934463500976562, + "step": 783 + }, + { + "epoch": 1.5481432796582322, + "grad_norm": 8.477231979370117, + "learning_rate": 2.805715756003888e-06, + "logits/chosen": -0.4907180964946747, + "logits/rejected": -0.3279544711112976, + "logps/chosen": -416.3576354980469, + "logps/rejected": -562.9335327148438, + "loss": 0.1236, + "rewards/accuracies": 0.9629630446434021, + "rewards/chosen": -13.1373291015625, + "rewards/margins": 13.31623363494873, + "rewards/rejected": -26.45356559753418, + "step": 786 + }, + { + "epoch": 1.5540584949063425, + "grad_norm": 18.048381805419922, + "learning_rate": 2.7886516942038965e-06, + "logits/chosen": -0.6140527725219727, + "logits/rejected": -0.41285985708236694, + "logps/chosen": -417.43048095703125, + "logps/rejected": -561.987548828125, + "loss": 0.1607, + "rewards/accuracies": 0.9629630446434021, + "rewards/chosen": -14.243610382080078, + "rewards/margins": 12.485297203063965, + "rewards/rejected": -26.72890853881836, + "step": 789 + }, + { + "epoch": 1.5599737101544529, + "grad_norm": 14.007949829101562, + "learning_rate": 2.7715739916069055e-06, + "logits/chosen": -0.6075841784477234, + "logits/rejected": -0.4069880247116089, + "logps/chosen": -414.72918701171875, + "logps/rejected": -580.2230834960938, + "loss": 0.0942, + "rewards/accuracies": 0.953703761100769, + "rewards/chosen": -13.613109588623047, + "rewards/margins": 12.703289031982422, + "rewards/rejected": -26.316402435302734, + "step": 792 + }, + { + "epoch": 1.5658889254025632, + "grad_norm": 11.378000259399414, + "learning_rate": 2.754483455252955e-06, + "logits/chosen": -0.6046838164329529, + "logits/rejected": -0.4010394513607025, + "logps/chosen": -430.98040771484375, + "logps/rejected": -583.5884399414062, + "loss": 0.165, + "rewards/accuracies": 0.9444445371627808, + "rewards/chosen": -14.201919555664062, + "rewards/margins": 13.310440063476562, + "rewards/rejected": -27.512357711791992, + "step": 795 + }, + { + "epoch": 1.5718041406506735, + "grad_norm": 16.261093139648438, + "learning_rate": 2.7373808927885665e-06, + "logits/chosen": -0.6048099398612976, + "logits/rejected": -0.4555709958076477, + "logps/chosen": -445.0321960449219, + "logps/rejected": -600.6202392578125, + "loss": 0.1134, + "rewards/accuracies": 0.9629629850387573, + "rewards/chosen": -15.614604949951172, + "rewards/margins": 12.559003829956055, + "rewards/rejected": -28.17361068725586, + "step": 798 + }, + { + "epoch": 1.577719355898784, + "grad_norm": 4.100447654724121, + "learning_rate": 2.720267112428579e-06, + "logits/chosen": -0.6097166538238525, + "logits/rejected": -0.42670154571533203, + "logps/chosen": -433.3768005371094, + "logps/rejected": -565.345458984375, + "loss": 0.1358, + "rewards/accuracies": 0.9675926566123962, + "rewards/chosen": -15.040475845336914, + "rewards/margins": 12.202143669128418, + "rewards/rejected": -27.24262237548828, + "step": 801 + }, + { + "epoch": 1.5836345711468947, + "grad_norm": 9.510040283203125, + "learning_rate": 2.7031429229179568e-06, + "logits/chosen": -0.529786229133606, + "logits/rejected": -0.46382081508636475, + "logps/chosen": -432.32464599609375, + "logps/rejected": -555.993896484375, + "loss": 0.1105, + "rewards/accuracies": 0.9583333730697632, + "rewards/chosen": -15.261589050292969, + "rewards/margins": 11.754304885864258, + "rewards/rejected": -27.01589584350586, + "step": 804 + }, + { + "epoch": 1.589549786395005, + "grad_norm": 14.93637466430664, + "learning_rate": 2.6860091334935667e-06, + "logits/chosen": -0.6101542711257935, + "logits/rejected": -0.5047379732131958, + "logps/chosen": -461.29412841796875, + "logps/rejected": -587.7117309570312, + "loss": 0.1033, + "rewards/accuracies": 0.9537038207054138, + "rewards/chosen": -16.00438690185547, + "rewards/margins": 12.531213760375977, + "rewards/rejected": -28.535598754882812, + "step": 807 + }, + { + "epoch": 1.5954650016431153, + "grad_norm": 4.996009349822998, + "learning_rate": 2.6688665538459395e-06, + "logits/chosen": -0.6482338309288025, + "logits/rejected": -0.528326690196991, + "logps/chosen": -419.1895446777344, + "logps/rejected": -574.5487670898438, + "loss": 0.043, + "rewards/accuracies": 0.9814814925193787, + "rewards/chosen": -15.451910972595215, + "rewards/margins": 12.121021270751953, + "rewards/rejected": -27.57292938232422, + "step": 810 + }, + { + "epoch": 1.6013802168912257, + "grad_norm": 7.183305740356445, + "learning_rate": 2.651715994081004e-06, + "logits/chosen": -0.5737613439559937, + "logits/rejected": -0.46935686469078064, + "logps/chosen": -437.48138427734375, + "logps/rejected": -574.8239135742188, + "loss": 0.0833, + "rewards/accuracies": 0.9768518805503845, + "rewards/chosen": -14.817813873291016, + "rewards/margins": 12.590240478515625, + "rewards/rejected": -27.40805435180664, + "step": 813 + }, + { + "epoch": 1.6072954321393362, + "grad_norm": 13.455055236816406, + "learning_rate": 2.6345582646818064e-06, + "logits/chosen": -0.6544980406761169, + "logits/rejected": -0.5129336714744568, + "logps/chosen": -445.51922607421875, + "logps/rejected": -592.19775390625, + "loss": 0.1761, + "rewards/accuracies": 0.953703761100769, + "rewards/chosen": -15.481573104858398, + "rewards/margins": 12.895651817321777, + "rewards/rejected": -28.377225875854492, + "step": 816 + }, + { + "epoch": 1.6132106473874466, + "grad_norm": 15.47365951538086, + "learning_rate": 2.6173941764702064e-06, + "logits/chosen": -0.6039730906486511, + "logits/rejected": -0.4822230339050293, + "logps/chosen": -415.309814453125, + "logps/rejected": -560.3380126953125, + "loss": 0.141, + "rewards/accuracies": 0.9537036418914795, + "rewards/chosen": -14.446993827819824, + "rewards/margins": 12.657259941101074, + "rewards/rejected": -27.104251861572266, + "step": 819 + }, + { + "epoch": 1.6191258626355571, + "grad_norm": 20.26230812072754, + "learning_rate": 2.600224540568562e-06, + "logits/chosen": -0.6074631810188293, + "logits/rejected": -0.4393419921398163, + "logps/chosen": -442.9925231933594, + "logps/rejected": -595.2706298828125, + "loss": 0.094, + "rewards/accuracies": 0.9722222685813904, + "rewards/chosen": -14.230273246765137, + "rewards/margins": 13.708857536315918, + "rewards/rejected": -27.939128875732422, + "step": 822 + }, + { + "epoch": 1.6250410778836675, + "grad_norm": 14.09592056274414, + "learning_rate": 2.5830501683614e-06, + "logits/chosen": -0.622641921043396, + "logits/rejected": -0.468400239944458, + "logps/chosen": -421.7862243652344, + "logps/rejected": -571.685791015625, + "loss": 0.1997, + "rewards/accuracies": 0.9398148059844971, + "rewards/chosen": -13.297590255737305, + "rewards/margins": 11.761819839477539, + "rewards/rejected": -25.059410095214844, + "step": 825 + }, + { + "epoch": 1.6309562931317778, + "grad_norm": 9.527044296264648, + "learning_rate": 2.565871871457068e-06, + "logits/chosen": -0.6550042629241943, + "logits/rejected": -0.5018041133880615, + "logps/chosen": -436.56121826171875, + "logps/rejected": -563.2191162109375, + "loss": 0.1558, + "rewards/accuracies": 0.9629629850387573, + "rewards/chosen": -13.345283508300781, + "rewards/margins": 11.860011100769043, + "rewards/rejected": -25.20529556274414, + "step": 828 + }, + { + "epoch": 1.6368715083798882, + "grad_norm": 4.440464973449707, + "learning_rate": 2.5486904616493836e-06, + "logits/chosen": -0.6473764777183533, + "logits/rejected": -0.48216694593429565, + "logps/chosen": -442.418701171875, + "logps/rejected": -575.4085083007812, + "loss": 0.1618, + "rewards/accuracies": 0.9537036418914795, + "rewards/chosen": -13.242083549499512, + "rewards/margins": 13.441139221191406, + "rewards/rejected": -26.68321990966797, + "step": 831 + }, + { + "epoch": 1.6427867236279987, + "grad_norm": 16.591794967651367, + "learning_rate": 2.531506750879272e-06, + "logits/chosen": -0.6643091440200806, + "logits/rejected": -0.5735387802124023, + "logps/chosen": -422.335693359375, + "logps/rejected": -560.1734619140625, + "loss": 0.0882, + "rewards/accuracies": 0.9722222685813904, + "rewards/chosen": -13.788366317749023, + "rewards/margins": 12.23902416229248, + "rewards/rejected": -26.027389526367188, + "step": 834 + }, + { + "epoch": 1.648701938876109, + "grad_norm": 25.704437255859375, + "learning_rate": 2.5143215511963924e-06, + "logits/chosen": -0.6127258539199829, + "logits/rejected": -0.4650610089302063, + "logps/chosen": -431.94287109375, + "logps/rejected": -563.011474609375, + "loss": 0.1712, + "rewards/accuracies": 0.9351851940155029, + "rewards/chosen": -13.537942886352539, + "rewards/margins": 12.424100875854492, + "rewards/rejected": -25.9620418548584, + "step": 837 + }, + { + "epoch": 1.6546171541242196, + "grad_norm": 3.809575319290161, + "learning_rate": 2.497135674720767e-06, + "logits/chosen": -0.6093651652336121, + "logits/rejected": -0.4662657380104065, + "logps/chosen": -406.9659423828125, + "logps/rejected": -547.5611572265625, + "loss": 0.1135, + "rewards/accuracies": 0.953703761100769, + "rewards/chosen": -12.872491836547852, + "rewards/margins": 12.23689079284668, + "rewards/rejected": -25.10938262939453, + "step": 840 + }, + { + "epoch": 1.66053236937233, + "grad_norm": 17.25910186767578, + "learning_rate": 2.4799499336044024e-06, + "logits/chosen": -0.5865241289138794, + "logits/rejected": -0.4418222904205322, + "logps/chosen": -421.354736328125, + "logps/rejected": -548.7689208984375, + "loss": 0.1608, + "rewards/accuracies": 0.953703761100769, + "rewards/chosen": -13.852258682250977, + "rewards/margins": 12.092245101928711, + "rewards/rejected": -25.944503784179688, + "step": 843 + }, + { + "epoch": 1.6664475846204403, + "grad_norm": 8.59719467163086, + "learning_rate": 2.4627651399929062e-06, + "logits/chosen": -0.5704312324523926, + "logits/rejected": -0.4715225100517273, + "logps/chosen": -420.5884704589844, + "logps/rejected": -536.214599609375, + "loss": 0.0487, + "rewards/accuracies": 0.9814814925193787, + "rewards/chosen": -13.342948913574219, + "rewards/margins": 11.972259521484375, + "rewards/rejected": -25.315208435058594, + "step": 846 + }, + { + "epoch": 1.6723627998685506, + "grad_norm": 10.35339641571045, + "learning_rate": 2.445582105987109e-06, + "logits/chosen": -0.6614564657211304, + "logits/rejected": -0.4607757329940796, + "logps/chosen": -436.13128662109375, + "logps/rejected": -581.8274536132812, + "loss": 0.0495, + "rewards/accuracies": 0.9861111044883728, + "rewards/chosen": -14.702802658081055, + "rewards/margins": 13.050193786621094, + "rewards/rejected": -27.752994537353516, + "step": 849 + }, + { + "epoch": 1.6782780151166612, + "grad_norm": 14.956944465637207, + "learning_rate": 2.4284016436046926e-06, + "logits/chosen": -0.6371724605560303, + "logits/rejected": -0.4794648587703705, + "logps/chosen": -444.399169921875, + "logps/rejected": -574.8007202148438, + "loss": 0.1003, + "rewards/accuracies": 0.9768518805503845, + "rewards/chosen": -15.0126953125, + "rewards/margins": 12.315101623535156, + "rewards/rejected": -27.327796936035156, + "step": 852 + }, + { + "epoch": 1.6841932303647718, + "grad_norm": 13.839925765991211, + "learning_rate": 2.411224564741808e-06, + "logits/chosen": -0.569583535194397, + "logits/rejected": -0.4447590708732605, + "logps/chosen": -430.7904052734375, + "logps/rejected": -559.3048095703125, + "loss": 0.1438, + "rewards/accuracies": 0.9675926566123962, + "rewards/chosen": -14.179176330566406, + "rewards/margins": 12.510564804077148, + "rewards/rejected": -26.689739227294922, + "step": 855 + }, + { + "epoch": 1.690108445612882, + "grad_norm": 22.94325828552246, + "learning_rate": 2.394051681134709e-06, + "logits/chosen": -0.6330985426902771, + "logits/rejected": -0.49612173438072205, + "logps/chosen": -458.27496337890625, + "logps/rejected": -623.1173095703125, + "loss": 0.0924, + "rewards/accuracies": 0.9583333730697632, + "rewards/chosen": -16.434764862060547, + "rewards/margins": 13.425687789916992, + "rewards/rejected": -29.860454559326172, + "step": 858 + }, + { + "epoch": 1.6960236608609924, + "grad_norm": 7.350907325744629, + "learning_rate": 2.3768838043214017e-06, + "logits/chosen": -0.5280520915985107, + "logits/rejected": -0.4022086262702942, + "logps/chosen": -439.752197265625, + "logps/rejected": -577.3414306640625, + "loss": 0.1342, + "rewards/accuracies": 0.9675926566123962, + "rewards/chosen": -16.48275375366211, + "rewards/margins": 13.460319519042969, + "rewards/rejected": -29.943073272705078, + "step": 861 + }, + { + "epoch": 1.7019388761091028, + "grad_norm": 10.35116195678711, + "learning_rate": 2.359721745603281e-06, + "logits/chosen": -0.6492921710014343, + "logits/rejected": -0.4556545615196228, + "logps/chosen": -453.32763671875, + "logps/rejected": -619.16552734375, + "loss": 0.1158, + "rewards/accuracies": 0.9444444179534912, + "rewards/chosen": -16.452810287475586, + "rewards/margins": 13.99924087524414, + "rewards/rejected": -30.452049255371094, + "step": 864 + }, + { + "epoch": 1.7078540913572131, + "grad_norm": 26.204652786254883, + "learning_rate": 2.342566316006796e-06, + "logits/chosen": -0.5175681114196777, + "logits/rejected": -0.35332179069519043, + "logps/chosen": -459.5921630859375, + "logps/rejected": -617.079345703125, + "loss": 0.1357, + "rewards/accuracies": 0.958333432674408, + "rewards/chosen": -15.511926651000977, + "rewards/margins": 13.921857833862305, + "rewards/rejected": -29.433780670166016, + "step": 867 + }, + { + "epoch": 1.7137693066053237, + "grad_norm": 7.6619744300842285, + "learning_rate": 2.3254183262451262e-06, + "logits/chosen": -0.5789531469345093, + "logits/rejected": -0.4110344648361206, + "logps/chosen": -442.13519287109375, + "logps/rejected": -579.24267578125, + "loss": 0.0957, + "rewards/accuracies": 0.9768518805503845, + "rewards/chosen": -15.932055473327637, + "rewards/margins": 13.983152389526367, + "rewards/rejected": -29.915210723876953, + "step": 870 + }, + { + "epoch": 1.7196845218534342, + "grad_norm": 11.0267972946167, + "learning_rate": 2.308278586679868e-06, + "logits/chosen": -0.594774067401886, + "logits/rejected": -0.560388445854187, + "logps/chosen": -462.3426208496094, + "logps/rejected": -580.824951171875, + "loss": 0.1244, + "rewards/accuracies": 0.9722222685813904, + "rewards/chosen": -15.821860313415527, + "rewards/margins": 12.203353881835938, + "rewards/rejected": -28.02521324157715, + "step": 873 + }, + { + "epoch": 1.7255997371015446, + "grad_norm": 15.19896411895752, + "learning_rate": 2.291147907282735e-06, + "logits/chosen": -0.5738660097122192, + "logits/rejected": -0.4418361783027649, + "logps/chosen": -459.52362060546875, + "logps/rejected": -590.7941284179688, + "loss": 0.104, + "rewards/accuracies": 0.9583333730697632, + "rewards/chosen": -15.917755126953125, + "rewards/margins": 13.78125, + "rewards/rejected": -29.69900894165039, + "step": 876 + }, + { + "epoch": 1.731514952349655, + "grad_norm": 17.122583389282227, + "learning_rate": 2.274027097597288e-06, + "logits/chosen": -0.5340918302536011, + "logits/rejected": -0.340861439704895, + "logps/chosen": -445.60028076171875, + "logps/rejected": -592.4035034179688, + "loss": 0.1316, + "rewards/accuracies": 0.9722222089767456, + "rewards/chosen": -15.184493064880371, + "rewards/margins": 14.001117706298828, + "rewards/rejected": -29.185611724853516, + "step": 879 + }, + { + "epoch": 1.7374301675977653, + "grad_norm": 5.882667541503906, + "learning_rate": 2.2569169667006753e-06, + "logits/chosen": -0.5894896984100342, + "logits/rejected": -0.4293175935745239, + "logps/chosen": -432.3334655761719, + "logps/rejected": -583.857421875, + "loss": 0.1617, + "rewards/accuracies": 0.9629629850387573, + "rewards/chosen": -15.522388458251953, + "rewards/margins": 12.695643424987793, + "rewards/rejected": -28.21803092956543, + "step": 882 + }, + { + "epoch": 1.7433453828458758, + "grad_norm": 11.34843921661377, + "learning_rate": 2.2398183231653965e-06, + "logits/chosen": -0.6059713363647461, + "logits/rejected": -0.48524826765060425, + "logps/chosen": -443.12127685546875, + "logps/rejected": -571.3858642578125, + "loss": 0.0763, + "rewards/accuracies": 0.9629629850387573, + "rewards/chosen": -14.814570426940918, + "rewards/margins": 13.475343704223633, + "rewards/rejected": -28.289913177490234, + "step": 885 + }, + { + "epoch": 1.7492605980939862, + "grad_norm": 12.489235877990723, + "learning_rate": 2.2227319750210953e-06, + "logits/chosen": -0.6146866083145142, + "logits/rejected": -0.40949270129203796, + "logps/chosen": -424.9417724609375, + "logps/rejected": -614.04638671875, + "loss": 0.0533, + "rewards/accuracies": 0.9722222089767456, + "rewards/chosen": -15.254619598388672, + "rewards/margins": 14.743501663208008, + "rewards/rejected": -29.998123168945312, + "step": 888 + }, + { + "epoch": 1.7551758133420967, + "grad_norm": 25.573904037475586, + "learning_rate": 2.2056587297163705e-06, + "logits/chosen": -0.5818058252334595, + "logits/rejected": -0.4116588830947876, + "logps/chosen": -438.244140625, + "logps/rejected": -608.5093383789062, + "loss": 0.1422, + "rewards/accuracies": 0.9305555820465088, + "rewards/chosen": -15.043352127075195, + "rewards/margins": 14.259744644165039, + "rewards/rejected": -29.303096771240234, + "step": 891 + }, + { + "epoch": 1.761091028590207, + "grad_norm": 2.851255416870117, + "learning_rate": 2.1885993940806242e-06, + "logits/chosen": -0.5600209832191467, + "logits/rejected": -0.4914409816265106, + "logps/chosen": -437.0643310546875, + "logps/rejected": -585.774658203125, + "loss": 0.0955, + "rewards/accuracies": 0.9583333730697632, + "rewards/chosen": -15.314014434814453, + "rewards/margins": 13.094465255737305, + "rewards/rejected": -28.40848159790039, + "step": 894 + }, + { + "epoch": 1.7670062438383174, + "grad_norm": 3.1413819789886475, + "learning_rate": 2.1715547742859276e-06, + "logits/chosen": -0.6104824542999268, + "logits/rejected": -0.5107275247573853, + "logps/chosen": -450.92352294921875, + "logps/rejected": -583.9996337890625, + "loss": 0.0665, + "rewards/accuracies": 0.9722222685813904, + "rewards/chosen": -14.522634506225586, + "rewards/margins": 13.630828857421875, + "rewards/rejected": -28.15346336364746, + "step": 897 + }, + { + "epoch": 1.7729214590864277, + "grad_norm": 13.548209190368652, + "learning_rate": 2.1545256758089257e-06, + "logits/chosen": -0.646898090839386, + "logits/rejected": -0.5041735768318176, + "logps/chosen": -431.9864196777344, + "logps/rejected": -576.46630859375, + "loss": 0.1321, + "rewards/accuracies": 0.9444445371627808, + "rewards/chosen": -14.59251594543457, + "rewards/margins": 13.649394035339355, + "rewards/rejected": -28.24190902709961, + "step": 900 + } + ], + "logging_steps": 3, + "max_steps": 1524, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 300, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.22364860235409e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}