{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.08977063602495623, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008977063602495623, "grad_norm": 49.75, "learning_rate": 1.8797356064157479e-06, "logits/chosen": -1.5944416522979736, "logits/rejected": -1.6120755672454834, "logps/chosen": -218.08145141601562, "logps/rejected": -238.650634765625, "loss": 0.69, "rewards/accuracies": 0.375, "rewards/chosen": 0.003223979379981756, "rewards/margins": 0.006512450985610485, "rewards/rejected": -0.003288471605628729, "step": 5 }, { "epoch": 0.0017954127204991247, "grad_norm": 52.75, "learning_rate": 4.229405114435433e-06, "logits/chosen": -1.638082504272461, "logits/rejected": -1.644774079322815, "logps/chosen": -218.1611785888672, "logps/rejected": -238.66098022460938, "loss": 0.6066, "rewards/accuracies": 0.9375, "rewards/chosen": 0.07871033251285553, "rewards/margins": 0.1871051788330078, "rewards/rejected": -0.10839483886957169, "step": 10 }, { "epoch": 0.002693119080748687, "grad_norm": 28.625, "learning_rate": 6.579074622455118e-06, "logits/chosen": -1.7269313335418701, "logits/rejected": -1.7303335666656494, "logps/chosen": -226.7982635498047, "logps/rejected": -258.6452331542969, "loss": 0.3006, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -0.6204186081886292, "rewards/margins": 1.243154764175415, "rewards/rejected": -1.8635733127593994, "step": 15 }, { "epoch": 0.0035908254409982494, "grad_norm": 7.84375, "learning_rate": 8.928744130474802e-06, "logits/chosen": -1.7967208623886108, "logits/rejected": -1.814859390258789, "logps/chosen": -229.9993133544922, "logps/rejected": -288.5595397949219, "loss": 0.0873, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.8708345890045166, "rewards/margins": 3.8205904960632324, "rewards/rejected": -5.691425323486328, "step": 20 }, { "epoch": 0.004488531801247812, "grad_norm": 14.875, "learning_rate": 1.1278413638494489e-05, "logits/chosen": -1.7308677434921265, "logits/rejected": -1.7561269998550415, "logps/chosen": -243.46029663085938, "logps/rejected": -340.27764892578125, "loss": 0.018, "rewards/accuracies": 0.984375, "rewards/chosen": -3.0787580013275146, "rewards/margins": 7.586331367492676, "rewards/rejected": -10.665090560913086, "step": 25 }, { "epoch": 0.005386238161497374, "grad_norm": 8.25, "learning_rate": 1.3628083146514173e-05, "logits/chosen": -1.6984357833862305, "logits/rejected": -1.7242708206176758, "logps/chosen": -264.53125, "logps/rejected": -376.6803894042969, "loss": 0.0285, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.747864246368408, "rewards/margins": 9.422819137573242, "rewards/rejected": -14.170684814453125, "step": 30 }, { "epoch": 0.006283944521746937, "grad_norm": 4.1875, "learning_rate": 1.5977752654533858e-05, "logits/chosen": -1.6455342769622803, "logits/rejected": -1.6728187799453735, "logps/chosen": -281.365966796875, "logps/rejected": -407.96337890625, "loss": 0.0216, "rewards/accuracies": 0.984375, "rewards/chosen": -6.47125768661499, "rewards/margins": 10.662993431091309, "rewards/rejected": -17.13425064086914, "step": 35 }, { "epoch": 0.007181650881996499, "grad_norm": 6.3125, "learning_rate": 1.6447684804072058e-05, "logits/chosen": -1.5919939279556274, "logits/rejected": -1.617920160293579, "logps/chosen": -278.3464660644531, "logps/rejected": -404.8271484375, "loss": 0.0342, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -6.385420799255371, "rewards/margins": 10.654914855957031, "rewards/rejected": -17.040334701538086, "step": 40 }, { "epoch": 0.00807935724224606, "grad_norm": 6.625, "learning_rate": 1.6447677686306693e-05, "logits/chosen": -1.6035076379776, "logits/rejected": -1.6163572072982788, "logps/chosen": -279.767822265625, "logps/rejected": -383.13458251953125, "loss": 0.0432, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -5.704493999481201, "rewards/margins": 8.454301834106445, "rewards/rejected": -14.158796310424805, "step": 45 }, { "epoch": 0.008977063602495623, "grad_norm": 4.8125, "learning_rate": 1.6447665093343918e-05, "logits/chosen": -1.6678664684295654, "logits/rejected": -1.6700479984283447, "logps/chosen": -264.4732971191406, "logps/rejected": -363.1940612792969, "loss": 0.0464, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -5.7117018699646, "rewards/margins": 7.980559349060059, "rewards/rejected": -13.692262649536133, "step": 50 }, { "epoch": 0.009874769962745186, "grad_norm": 4.65625, "learning_rate": 1.6447647025194904e-05, "logits/chosen": -1.5799241065979004, "logits/rejected": -1.5821675062179565, "logps/chosen": -268.1691589355469, "logps/rejected": -374.00518798828125, "loss": 0.0205, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.694095134735107, "rewards/margins": 8.601041793823242, "rewards/rejected": -14.295137405395508, "step": 55 }, { "epoch": 0.010772476322994749, "grad_norm": 4.625, "learning_rate": 1.6447623481875693e-05, "logits/chosen": -1.615523338317871, "logits/rejected": -1.6053167581558228, "logps/chosen": -269.4774475097656, "logps/rejected": -389.2843933105469, "loss": 0.0186, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.536202907562256, "rewards/margins": 9.901906967163086, "rewards/rejected": -15.4381103515625, "step": 60 }, { "epoch": 0.011670182683244311, "grad_norm": 5.0625, "learning_rate": 1.644759446340718e-05, "logits/chosen": -1.62222158908844, "logits/rejected": -1.6126108169555664, "logps/chosen": -273.30316162109375, "logps/rejected": -401.03851318359375, "loss": 0.0237, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.82913875579834, "rewards/margins": 10.689005851745605, "rewards/rejected": -16.518144607543945, "step": 65 }, { "epoch": 0.012567889043493874, "grad_norm": 5.0625, "learning_rate": 1.644755996981513e-05, "logits/chosen": -1.6640081405639648, "logits/rejected": -1.6507833003997803, "logps/chosen": -274.74053955078125, "logps/rejected": -404.7392883300781, "loss": 0.0319, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -5.577990531921387, "rewards/margins": 10.969307899475098, "rewards/rejected": -16.547298431396484, "step": 70 }, { "epoch": 0.013465595403743435, "grad_norm": 30.25, "learning_rate": 1.6447520001130158e-05, "logits/chosen": -1.5772068500518799, "logits/rejected": -1.5707197189331055, "logps/chosen": -278.74658203125, "logps/rejected": -406.4679260253906, "loss": 0.0555, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -6.121307849884033, "rewards/margins": 10.843521118164062, "rewards/rejected": -16.96483039855957, "step": 75 }, { "epoch": 0.014363301763992998, "grad_norm": 3.109375, "learning_rate": 1.6447474557387748e-05, "logits/chosen": -1.53738534450531, "logits/rejected": -1.534790277481079, "logps/chosen": -314.1097717285156, "logps/rejected": -420.47308349609375, "loss": 0.0185, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -9.349452018737793, "rewards/margins": 8.684282302856445, "rewards/rejected": -18.033733367919922, "step": 80 }, { "epoch": 0.01526100812424256, "grad_norm": 4.03125, "learning_rate": 1.6447423638628237e-05, "logits/chosen": -1.5148117542266846, "logits/rejected": -1.5227998495101929, "logps/chosen": -316.7289123535156, "logps/rejected": -430.1024475097656, "loss": 0.0115, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -10.431829452514648, "rewards/margins": 9.403815269470215, "rewards/rejected": -19.835643768310547, "step": 85 }, { "epoch": 0.01615871448449212, "grad_norm": 3.84375, "learning_rate": 1.6447367244896826e-05, "logits/chosen": -1.606527328491211, "logits/rejected": -1.6100928783416748, "logps/chosen": -319.0159606933594, "logps/rejected": -440.75738525390625, "loss": 0.0257, "rewards/accuracies": 0.984375, "rewards/chosen": -10.429452896118164, "rewards/margins": 10.196511268615723, "rewards/rejected": -20.625965118408203, "step": 90 }, { "epoch": 0.017056420844741686, "grad_norm": 24.125, "learning_rate": 1.644730537624358e-05, "logits/chosen": -1.6584317684173584, "logits/rejected": -1.6650241613388062, "logps/chosen": -315.8913269042969, "logps/rejected": -439.966552734375, "loss": 0.0371, "rewards/accuracies": 0.984375, "rewards/chosen": -10.410109519958496, "rewards/margins": 10.561999320983887, "rewards/rejected": -20.972110748291016, "step": 95 }, { "epoch": 0.017954127204991246, "grad_norm": 13.375, "learning_rate": 1.644723803272341e-05, "logits/chosen": -1.6700522899627686, "logits/rejected": -1.6734033823013306, "logps/chosen": -319.60516357421875, "logps/rejected": -441.1056213378906, "loss": 0.0222, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -10.69153881072998, "rewards/margins": 10.231417655944824, "rewards/rejected": -20.922958374023438, "step": 100 }, { "epoch": 0.01885183356524081, "grad_norm": 6.3125, "learning_rate": 1.644716521439611e-05, "logits/chosen": -1.7060085535049438, "logits/rejected": -1.7089792490005493, "logps/chosen": -347.458984375, "logps/rejected": -468.8707580566406, "loss": 0.0224, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -12.833531379699707, "rewards/margins": 10.041297912597656, "rewards/rejected": -22.874828338623047, "step": 105 }, { "epoch": 0.019749539925490372, "grad_norm": 10.8125, "learning_rate": 1.644708692132631e-05, "logits/chosen": -1.7060142755508423, "logits/rejected": -1.7084630727767944, "logps/chosen": -356.79656982421875, "logps/rejected": -479.97235107421875, "loss": 0.0264, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -13.447436332702637, "rewards/margins": 10.534585952758789, "rewards/rejected": -23.982025146484375, "step": 110 }, { "epoch": 0.020647246285739933, "grad_norm": 6.15625, "learning_rate": 1.6447003153583514e-05, "logits/chosen": -1.642289161682129, "logits/rejected": -1.6480754613876343, "logps/chosen": -356.51251220703125, "logps/rejected": -481.41827392578125, "loss": 0.0122, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -13.347066879272461, "rewards/margins": 10.536771774291992, "rewards/rejected": -23.883838653564453, "step": 115 }, { "epoch": 0.021544952645989497, "grad_norm": 8.3125, "learning_rate": 1.644691391124208e-05, "logits/chosen": -1.6251140832901, "logits/rejected": -1.6293065547943115, "logps/chosen": -362.5952453613281, "logps/rejected": -487.474853515625, "loss": 0.0638, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -14.098132133483887, "rewards/margins": 10.43175983428955, "rewards/rejected": -24.529891967773438, "step": 120 }, { "epoch": 0.022442659006239058, "grad_norm": 0.9375, "learning_rate": 1.6446819194381232e-05, "logits/chosen": -1.6321861743927002, "logits/rejected": -1.6401519775390625, "logps/chosen": -365.3531188964844, "logps/rejected": -472.98590087890625, "loss": 0.0341, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -14.91484546661377, "rewards/margins": 8.75818920135498, "rewards/rejected": -23.673038482666016, "step": 125 }, { "epoch": 0.023340365366488623, "grad_norm": 16.125, "learning_rate": 1.6446719003085048e-05, "logits/chosen": -1.682080864906311, "logits/rejected": -1.6901333332061768, "logps/chosen": -375.67047119140625, "logps/rejected": -475.9076232910156, "loss": 0.0263, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -14.836702346801758, "rewards/margins": 8.173124313354492, "rewards/rejected": -23.00982666015625, "step": 130 }, { "epoch": 0.024238071726738183, "grad_norm": 5.09375, "learning_rate": 1.6446613337442464e-05, "logits/chosen": -1.7631546258926392, "logits/rejected": -1.75924813747406, "logps/chosen": -334.9482421875, "logps/rejected": -438.7393493652344, "loss": 0.0274, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -12.050386428833008, "rewards/margins": 8.398730278015137, "rewards/rejected": -20.44911766052246, "step": 135 }, { "epoch": 0.025135778086987748, "grad_norm": 16.125, "learning_rate": 1.6446502197547285e-05, "logits/chosen": -1.6700756549835205, "logits/rejected": -1.6583993434906006, "logps/chosen": -327.59222412109375, "logps/rejected": -443.2696228027344, "loss": 0.0349, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -10.80932903289795, "rewards/margins": 9.692026138305664, "rewards/rejected": -20.501354217529297, "step": 140 }, { "epoch": 0.02603348444723731, "grad_norm": 6.5625, "learning_rate": 1.6446385583498166e-05, "logits/chosen": -1.603623628616333, "logits/rejected": -1.5888742208480835, "logps/chosen": -325.58660888671875, "logps/rejected": -446.7493591308594, "loss": 0.0473, "rewards/accuracies": 0.96875, "rewards/chosen": -11.347869873046875, "rewards/margins": 10.037898063659668, "rewards/rejected": -21.38576889038086, "step": 145 }, { "epoch": 0.02693119080748687, "grad_norm": 5.21875, "learning_rate": 1.6446263495398625e-05, "logits/chosen": -1.6120811700820923, "logits/rejected": -1.5870082378387451, "logps/chosen": -317.60162353515625, "logps/rejected": -438.42633056640625, "loss": 0.0193, "rewards/accuracies": 0.984375, "rewards/chosen": -9.432976722717285, "rewards/margins": 10.237409591674805, "rewards/rejected": -19.670385360717773, "step": 150 }, { "epoch": 0.027828897167736434, "grad_norm": 3.4375, "learning_rate": 1.644613593335704e-05, "logits/chosen": -1.5875444412231445, "logits/rejected": -1.5749518871307373, "logps/chosen": -303.3853454589844, "logps/rejected": -432.47808837890625, "loss": 0.0154, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -8.602149963378906, "rewards/margins": 10.840978622436523, "rewards/rejected": -19.443126678466797, "step": 155 }, { "epoch": 0.028726603527985995, "grad_norm": 3.9375, "learning_rate": 1.6446002897486648e-05, "logits/chosen": -1.618011236190796, "logits/rejected": -1.6145331859588623, "logps/chosen": -312.6946105957031, "logps/rejected": -446.9215393066406, "loss": 0.0265, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -8.892807960510254, "rewards/margins": 11.515321731567383, "rewards/rejected": -20.408130645751953, "step": 160 }, { "epoch": 0.02962430988823556, "grad_norm": 5.96875, "learning_rate": 1.644586438790554e-05, "logits/chosen": -1.5836814641952515, "logits/rejected": -1.587181806564331, "logps/chosen": -306.9125061035156, "logps/rejected": -445.1659240722656, "loss": 0.0256, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -9.431074142456055, "rewards/margins": 11.796531677246094, "rewards/rejected": -21.22760581970215, "step": 165 }, { "epoch": 0.03052201624848512, "grad_norm": 11.125, "learning_rate": 1.6445720404736678e-05, "logits/chosen": -1.6508190631866455, "logits/rejected": -1.65244460105896, "logps/chosen": -310.2176208496094, "logps/rejected": -440.75714111328125, "loss": 0.0288, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -9.615917205810547, "rewards/margins": 11.127912521362305, "rewards/rejected": -20.743831634521484, "step": 170 }, { "epoch": 0.031419722608734685, "grad_norm": 7.0, "learning_rate": 1.644557094810787e-05, "logits/chosen": -1.7216987609863281, "logits/rejected": -1.7154676914215088, "logps/chosen": -340.46466064453125, "logps/rejected": -450.27294921875, "loss": 0.0397, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -11.357550621032715, "rewards/margins": 8.966830253601074, "rewards/rejected": -20.32438087463379, "step": 175 }, { "epoch": 0.03231742896898424, "grad_norm": 24.0, "learning_rate": 1.6445416018151788e-05, "logits/chosen": -1.7959930896759033, "logits/rejected": -1.800244927406311, "logps/chosen": -331.75506591796875, "logps/rejected": -427.2474670410156, "loss": 0.0207, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -12.001955032348633, "rewards/margins": 7.540495872497559, "rewards/rejected": -19.542451858520508, "step": 180 }, { "epoch": 0.03321513532923381, "grad_norm": 43.75, "learning_rate": 1.644525561500596e-05, "logits/chosen": -1.9910930395126343, "logits/rejected": -1.9894500970840454, "logps/chosen": -350.335693359375, "logps/rejected": -449.16796875, "loss": 0.0458, "rewards/accuracies": 0.96875, "rewards/chosen": -13.615007400512695, "rewards/margins": 8.14229679107666, "rewards/rejected": -21.757305145263672, "step": 185 }, { "epoch": 0.03411284168948337, "grad_norm": 13.3125, "learning_rate": 1.6445089738812785e-05, "logits/chosen": -1.9771511554718018, "logits/rejected": -1.9749290943145752, "logps/chosen": -348.55828857421875, "logps/rejected": -453.4300842285156, "loss": 0.0432, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -13.186309814453125, "rewards/margins": 8.536267280578613, "rewards/rejected": -21.722576141357422, "step": 190 }, { "epoch": 0.035010548049732935, "grad_norm": 0.146484375, "learning_rate": 1.6444918389719505e-05, "logits/chosen": -1.9536895751953125, "logits/rejected": -1.9450359344482422, "logps/chosen": -335.36553955078125, "logps/rejected": -448.2103576660156, "loss": 0.0296, "rewards/accuracies": 0.984375, "rewards/chosen": -11.7564697265625, "rewards/margins": 9.423359870910645, "rewards/rejected": -21.179828643798828, "step": 195 }, { "epoch": 0.03590825440998249, "grad_norm": 20.25, "learning_rate": 1.644474156787822e-05, "logits/chosen": -1.878861665725708, "logits/rejected": -1.8584403991699219, "logps/chosen": -319.38067626953125, "logps/rejected": -443.59033203125, "loss": 0.0307, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -10.937708854675293, "rewards/margins": 10.651717185974121, "rewards/rejected": -21.589426040649414, "step": 200 }, { "epoch": 0.03680596077023206, "grad_norm": 17.875, "learning_rate": 1.6444559273445908e-05, "logits/chosen": -1.6908838748931885, "logits/rejected": -1.6792293787002563, "logps/chosen": -321.28009033203125, "logps/rejected": -460.1475524902344, "loss": 0.022, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -10.206222534179688, "rewards/margins": 11.932024955749512, "rewards/rejected": -22.138248443603516, "step": 205 }, { "epoch": 0.03770366713048162, "grad_norm": 7.0, "learning_rate": 1.6444371506584377e-05, "logits/chosen": -1.6957308053970337, "logits/rejected": -1.690157175064087, "logps/chosen": -290.0798034667969, "logps/rejected": -412.7769470214844, "loss": 0.018, "rewards/accuracies": 0.984375, "rewards/chosen": -7.47702693939209, "rewards/margins": 10.390680313110352, "rewards/rejected": -17.86771011352539, "step": 210 }, { "epoch": 0.03860137349073118, "grad_norm": 8.5625, "learning_rate": 1.644417826746031e-05, "logits/chosen": -1.650665521621704, "logits/rejected": -1.6541109085083008, "logps/chosen": -290.309326171875, "logps/rejected": -417.1390686035156, "loss": 0.0184, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -7.279687404632568, "rewards/margins": 10.711103439331055, "rewards/rejected": -17.990793228149414, "step": 215 }, { "epoch": 0.039499079850980744, "grad_norm": 8.4375, "learning_rate": 1.6443979556245252e-05, "logits/chosen": -1.6047160625457764, "logits/rejected": -1.6234970092773438, "logps/chosen": -322.4940185546875, "logps/rejected": -458.4981384277344, "loss": 0.0263, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -9.725212097167969, "rewards/margins": 11.667040824890137, "rewards/rejected": -21.39225196838379, "step": 220 }, { "epoch": 0.04039678621123031, "grad_norm": 23.5, "learning_rate": 1.6443775373115592e-05, "logits/chosen": -1.5689036846160889, "logits/rejected": -1.5908584594726562, "logps/chosen": -343.6766052246094, "logps/rejected": -472.40948486328125, "loss": 0.0679, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -11.692893028259277, "rewards/margins": 10.971635818481445, "rewards/rejected": -22.66452980041504, "step": 225 }, { "epoch": 0.041294492571479866, "grad_norm": 8.6875, "learning_rate": 1.6443565718252586e-05, "logits/chosen": -1.5273631811141968, "logits/rejected": -1.5362848043441772, "logps/chosen": -333.0375061035156, "logps/rejected": -450.404541015625, "loss": 0.0259, "rewards/accuracies": 0.984375, "rewards/chosen": -11.445978164672852, "rewards/margins": 9.806886672973633, "rewards/rejected": -21.25286293029785, "step": 230 }, { "epoch": 0.04219219893172943, "grad_norm": 9.25, "learning_rate": 1.644335059184234e-05, "logits/chosen": -1.4887597560882568, "logits/rejected": -1.5063436031341553, "logps/chosen": -334.2455139160156, "logps/rejected": -466.03326416015625, "loss": 0.0239, "rewards/accuracies": 0.984375, "rewards/chosen": -12.205533981323242, "rewards/margins": 10.970166206359863, "rewards/rejected": -23.175701141357422, "step": 235 }, { "epoch": 0.043089905291978994, "grad_norm": 5.125, "learning_rate": 1.644312999407582e-05, "logits/chosen": -1.4916335344314575, "logits/rejected": -1.5103265047073364, "logps/chosen": -339.73175048828125, "logps/rejected": -466.05645751953125, "loss": 0.0523, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -12.136640548706055, "rewards/margins": 10.656683921813965, "rewards/rejected": -22.793325424194336, "step": 240 }, { "epoch": 0.04398761165222856, "grad_norm": 6.1875, "learning_rate": 1.644290392514886e-05, "logits/chosen": -1.4491441249847412, "logits/rejected": -1.4810426235198975, "logps/chosen": -330.1545104980469, "logps/rejected": -447.7718811035156, "loss": 0.0283, "rewards/accuracies": 0.984375, "rewards/chosen": -11.471087455749512, "rewards/margins": 9.670295715332031, "rewards/rejected": -21.14138412475586, "step": 245 }, { "epoch": 0.044885318012478116, "grad_norm": 9.75, "learning_rate": 1.6442672385262126e-05, "logits/chosen": -1.3768192529678345, "logits/rejected": -1.4130717515945435, "logps/chosen": -315.2447204589844, "logps/rejected": -440.4739685058594, "loss": 0.0267, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -9.256295204162598, "rewards/margins": 10.560578346252441, "rewards/rejected": -19.81687355041504, "step": 250 }, { "epoch": 0.04578302437272768, "grad_norm": 1.8125, "learning_rate": 1.6442435374621164e-05, "logits/chosen": -1.3219325542449951, "logits/rejected": -1.3581187725067139, "logps/chosen": -295.7016906738281, "logps/rejected": -422.5862731933594, "loss": 0.0325, "rewards/accuracies": 0.984375, "rewards/chosen": -8.51601791381836, "rewards/margins": 10.66891098022461, "rewards/rejected": -19.18492889404297, "step": 255 }, { "epoch": 0.046680730732977245, "grad_norm": 1.265625, "learning_rate": 1.6442192893436368e-05, "logits/chosen": -1.2778997421264648, "logits/rejected": -1.312280535697937, "logps/chosen": -303.11151123046875, "logps/rejected": -420.34912109375, "loss": 0.024, "rewards/accuracies": 0.984375, "rewards/chosen": -8.392851829528809, "rewards/margins": 9.67725658416748, "rewards/rejected": -18.070110321044922, "step": 260 }, { "epoch": 0.0475784370932268, "grad_norm": 2.765625, "learning_rate": 1.644194494192298e-05, "logits/chosen": -1.2928860187530518, "logits/rejected": -1.3260154724121094, "logps/chosen": -286.2437438964844, "logps/rejected": -393.56231689453125, "loss": 0.0281, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -7.124878883361816, "rewards/margins": 8.837203979492188, "rewards/rejected": -15.962081909179688, "step": 265 }, { "epoch": 0.04847614345347637, "grad_norm": 16.125, "learning_rate": 1.6441691520301115e-05, "logits/chosen": -1.278626799583435, "logits/rejected": -1.3031818866729736, "logps/chosen": -310.00946044921875, "logps/rejected": -427.1468200683594, "loss": 0.0328, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -9.098076820373535, "rewards/margins": 9.82015609741211, "rewards/rejected": -18.918231964111328, "step": 270 }, { "epoch": 0.04937384981372593, "grad_norm": 2.625, "learning_rate": 1.644143262879573e-05, "logits/chosen": -1.355022668838501, "logits/rejected": -1.377715826034546, "logps/chosen": -317.8216247558594, "logps/rejected": -431.4881896972656, "loss": 0.0385, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -9.668425559997559, "rewards/margins": 9.433794975280762, "rewards/rejected": -19.102222442626953, "step": 275 }, { "epoch": 0.050271556173975496, "grad_norm": 9.75, "learning_rate": 1.644116826763664e-05, "logits/chosen": -1.3636281490325928, "logits/rejected": -1.384377360343933, "logps/chosen": -302.4226379394531, "logps/rejected": -413.4556579589844, "loss": 0.0475, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -9.204570770263672, "rewards/margins": 9.083320617675781, "rewards/rejected": -18.287891387939453, "step": 280 }, { "epoch": 0.05116926253422505, "grad_norm": 8.125, "learning_rate": 1.6440898437058523e-05, "logits/chosen": -1.340986728668213, "logits/rejected": -1.3553143739700317, "logps/chosen": -313.5350341796875, "logps/rejected": -417.8633728027344, "loss": 0.0624, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -8.931371688842773, "rewards/margins": 8.754600524902344, "rewards/rejected": -17.68597412109375, "step": 285 }, { "epoch": 0.05206696889447462, "grad_norm": 1.4609375, "learning_rate": 1.64406231373009e-05, "logits/chosen": -1.3220717906951904, "logits/rejected": -1.3361364603042603, "logps/chosen": -301.2831726074219, "logps/rejected": -406.24481201171875, "loss": 0.033, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -9.154109001159668, "rewards/margins": 8.63221263885498, "rewards/rejected": -17.78632164001465, "step": 290 }, { "epoch": 0.05296467525472418, "grad_norm": 5.15625, "learning_rate": 1.6440342368608156e-05, "logits/chosen": -1.2657798528671265, "logits/rejected": -1.2791422605514526, "logps/chosen": -326.9562072753906, "logps/rejected": -439.09356689453125, "loss": 0.061, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -11.56185531616211, "rewards/margins": 9.128196716308594, "rewards/rejected": -20.690053939819336, "step": 295 }, { "epoch": 0.05386238161497374, "grad_norm": 0.0198974609375, "learning_rate": 1.6440056131229532e-05, "logits/chosen": -1.2754865884780884, "logits/rejected": -1.2849574089050293, "logps/chosen": -345.1870422363281, "logps/rejected": -475.9234924316406, "loss": 0.0144, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -11.819540977478027, "rewards/margins": 11.19267463684082, "rewards/rejected": -23.012216567993164, "step": 300 }, { "epoch": 0.054760087975223304, "grad_norm": 1.828125, "learning_rate": 1.6439764425419112e-05, "logits/chosen": -1.274107813835144, "logits/rejected": -1.2885282039642334, "logps/chosen": -330.4737854003906, "logps/rejected": -473.398681640625, "loss": 0.0267, "rewards/accuracies": 0.984375, "rewards/chosen": -11.204086303710938, "rewards/margins": 12.114290237426758, "rewards/rejected": -23.318378448486328, "step": 305 }, { "epoch": 0.05565779433547287, "grad_norm": 13.9375, "learning_rate": 1.6439467251435852e-05, "logits/chosen": -1.2366708517074585, "logits/rejected": -1.2527769804000854, "logps/chosen": -323.2499694824219, "logps/rejected": -467.12890625, "loss": 0.023, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -11.250632286071777, "rewards/margins": 12.270976066589355, "rewards/rejected": -23.521610260009766, "step": 310 }, { "epoch": 0.05655550069572243, "grad_norm": 2.21875, "learning_rate": 1.6439164609543545e-05, "logits/chosen": -1.287007212638855, "logits/rejected": -1.315598726272583, "logps/chosen": -315.24200439453125, "logps/rejected": -456.1424255371094, "loss": 0.0308, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -10.730131149291992, "rewards/margins": 11.966339111328125, "rewards/rejected": -22.696468353271484, "step": 315 }, { "epoch": 0.05745320705597199, "grad_norm": 4.03125, "learning_rate": 1.6438856500010842e-05, "logits/chosen": -1.412188172340393, "logits/rejected": -1.4369876384735107, "logps/chosen": -304.62298583984375, "logps/rejected": -442.72808837890625, "loss": 0.0438, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.49343490600586, "rewards/margins": 11.810284614562988, "rewards/rejected": -21.303720474243164, "step": 320 }, { "epoch": 0.058350913416221555, "grad_norm": 3.34375, "learning_rate": 1.643854292311126e-05, "logits/chosen": -1.4016748666763306, "logits/rejected": -1.4312589168548584, "logps/chosen": -321.81982421875, "logps/rejected": -464.9969787597656, "loss": 0.016, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -9.75760269165039, "rewards/margins": 12.245366096496582, "rewards/rejected": -22.00296974182129, "step": 325 }, { "epoch": 0.05924861977647112, "grad_norm": 8.125, "learning_rate": 1.6438223879123157e-05, "logits/chosen": -1.420204520225525, "logits/rejected": -1.4521286487579346, "logps/chosen": -334.3448486328125, "logps/rejected": -486.096435546875, "loss": 0.0395, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -11.95421028137207, "rewards/margins": 13.074414253234863, "rewards/rejected": -25.02862548828125, "step": 330 }, { "epoch": 0.060146326136720676, "grad_norm": 7.4375, "learning_rate": 1.6437899368329744e-05, "logits/chosen": -1.4968700408935547, "logits/rejected": -1.516984224319458, "logps/chosen": -362.8880310058594, "logps/rejected": -503.55047607421875, "loss": 0.0443, "rewards/accuracies": 0.96875, "rewards/chosen": -13.416742324829102, "rewards/margins": 12.286725997924805, "rewards/rejected": -25.703466415405273, "step": 335 }, { "epoch": 0.06104403249697024, "grad_norm": 3.65625, "learning_rate": 1.643756939101909e-05, "logits/chosen": -1.4715522527694702, "logits/rejected": -1.500880479812622, "logps/chosen": -347.5955505371094, "logps/rejected": -490.23345947265625, "loss": 0.0188, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -13.808749198913574, "rewards/margins": 12.222585678100586, "rewards/rejected": -26.031335830688477, "step": 340 }, { "epoch": 0.061941738857219805, "grad_norm": 0.0283203125, "learning_rate": 1.6437233947484115e-05, "logits/chosen": -1.4634774923324585, "logits/rejected": -1.4903171062469482, "logps/chosen": -346.1238708496094, "logps/rejected": -480.40740966796875, "loss": 0.026, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -13.692280769348145, "rewards/margins": 11.461533546447754, "rewards/rejected": -25.153812408447266, "step": 345 }, { "epoch": 0.06283944521746937, "grad_norm": 2.734375, "learning_rate": 1.6436893038022587e-05, "logits/chosen": -1.4172029495239258, "logits/rejected": -1.442546010017395, "logps/chosen": -339.7004089355469, "logps/rejected": -476.07861328125, "loss": 0.0222, "rewards/accuracies": 0.984375, "rewards/chosen": -13.242881774902344, "rewards/margins": 11.535491943359375, "rewards/rejected": -24.77837562561035, "step": 350 }, { "epoch": 0.06373715157771893, "grad_norm": 2.515625, "learning_rate": 1.6436546662937136e-05, "logits/chosen": -1.4132306575775146, "logits/rejected": -1.438727855682373, "logps/chosen": -340.2590637207031, "logps/rejected": -478.34759521484375, "loss": 0.03, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -12.395490646362305, "rewards/margins": 11.930428504943848, "rewards/rejected": -24.325918197631836, "step": 355 }, { "epoch": 0.06463485793796848, "grad_norm": 1.9609375, "learning_rate": 1.6436194822535237e-05, "logits/chosen": -1.3696801662445068, "logits/rejected": -1.4052057266235352, "logps/chosen": -332.30072021484375, "logps/rejected": -476.0782165527344, "loss": 0.0272, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -12.618162155151367, "rewards/margins": 12.268811225891113, "rewards/rejected": -24.886974334716797, "step": 360 }, { "epoch": 0.06553256429821805, "grad_norm": 1.421875, "learning_rate": 1.643583751712921e-05, "logits/chosen": -1.3992929458618164, "logits/rejected": -1.4282127618789673, "logps/chosen": -334.0565490722656, "logps/rejected": -480.23785400390625, "loss": 0.0185, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -11.715566635131836, "rewards/margins": 12.651227951049805, "rewards/rejected": -24.36679458618164, "step": 365 }, { "epoch": 0.06643027065846761, "grad_norm": 5.1875, "learning_rate": 1.6435474747036243e-05, "logits/chosen": -1.453920602798462, "logits/rejected": -1.4755427837371826, "logps/chosen": -322.3940124511719, "logps/rejected": -467.23126220703125, "loss": 0.0371, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -11.112676620483398, "rewards/margins": 12.558187484741211, "rewards/rejected": -23.67086410522461, "step": 370 }, { "epoch": 0.06732797701871718, "grad_norm": 3.84375, "learning_rate": 1.643510651257836e-05, "logits/chosen": -1.4459034204483032, "logits/rejected": -1.4675936698913574, "logps/chosen": -320.50347900390625, "logps/rejected": -463.59014892578125, "loss": 0.0144, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.755430221557617, "rewards/margins": 12.185154914855957, "rewards/rejected": -22.94058609008789, "step": 375 }, { "epoch": 0.06822568337896674, "grad_norm": 0.0186767578125, "learning_rate": 1.6434732814082442e-05, "logits/chosen": -1.4478992223739624, "logits/rejected": -1.4632583856582642, "logps/chosen": -331.53289794921875, "logps/rejected": -466.4090881347656, "loss": 0.0205, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -10.466972351074219, "rewards/margins": 11.554471969604492, "rewards/rejected": -22.021446228027344, "step": 380 }, { "epoch": 0.0691233897392163, "grad_norm": 5.21875, "learning_rate": 1.6434353651880223e-05, "logits/chosen": -1.4576263427734375, "logits/rejected": -1.470090627670288, "logps/chosen": -322.73773193359375, "logps/rejected": -458.134521484375, "loss": 0.0305, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -10.470837593078613, "rewards/margins": 11.618741035461426, "rewards/rejected": -22.089576721191406, "step": 385 }, { "epoch": 0.07002109609946587, "grad_norm": 2.875, "learning_rate": 1.643396902630828e-05, "logits/chosen": -1.4584577083587646, "logits/rejected": -1.4607679843902588, "logps/chosen": -317.8088073730469, "logps/rejected": -451.45440673828125, "loss": 0.0322, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -10.205190658569336, "rewards/margins": 11.55008316040039, "rewards/rejected": -21.75527572631836, "step": 390 }, { "epoch": 0.07091880245971542, "grad_norm": 6.96875, "learning_rate": 1.6433578937708046e-05, "logits/chosen": -1.4126781225204468, "logits/rejected": -1.4239190816879272, "logps/chosen": -322.39801025390625, "logps/rejected": -458.4278259277344, "loss": 0.0256, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -9.876721382141113, "rewards/margins": 11.59015941619873, "rewards/rejected": -21.466880798339844, "step": 395 }, { "epoch": 0.07181650881996499, "grad_norm": 0.02001953125, "learning_rate": 1.64331833864258e-05, "logits/chosen": -1.4148705005645752, "logits/rejected": -1.4251186847686768, "logps/chosen": -316.4896545410156, "logps/rejected": -450.53240966796875, "loss": 0.0164, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -9.625123023986816, "rewards/margins": 11.543768882751465, "rewards/rejected": -21.168895721435547, "step": 400 }, { "epoch": 0.07271421518021455, "grad_norm": 0.890625, "learning_rate": 1.643278237281267e-05, "logits/chosen": -1.421555757522583, "logits/rejected": -1.4265415668487549, "logps/chosen": -308.0295104980469, "logps/rejected": -445.5311584472656, "loss": 0.0249, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -9.17651653289795, "rewards/margins": 11.756936073303223, "rewards/rejected": -20.933452606201172, "step": 405 }, { "epoch": 0.07361192154046411, "grad_norm": 2.140625, "learning_rate": 1.6432375897224637e-05, "logits/chosen": -1.3315099477767944, "logits/rejected": -1.3350989818572998, "logps/chosen": -315.9861755371094, "logps/rejected": -451.1835021972656, "loss": 0.044, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -9.11551284790039, "rewards/margins": 11.56936264038086, "rewards/rejected": -20.68487548828125, "step": 410 }, { "epoch": 0.07450962790071368, "grad_norm": 0.94140625, "learning_rate": 1.6431963960022524e-05, "logits/chosen": -1.2902719974517822, "logits/rejected": -1.2910665273666382, "logps/chosen": -309.9100341796875, "logps/rejected": -442.4892578125, "loss": 0.0278, "rewards/accuracies": 0.984375, "rewards/chosen": -9.109766006469727, "rewards/margins": 11.337265968322754, "rewards/rejected": -20.447031021118164, "step": 415 }, { "epoch": 0.07540733426096324, "grad_norm": 4.28125, "learning_rate": 1.643154656157201e-05, "logits/chosen": -1.2414597272872925, "logits/rejected": -1.2512853145599365, "logps/chosen": -301.22174072265625, "logps/rejected": -416.2430725097656, "loss": 0.0651, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -9.233153343200684, "rewards/margins": 9.558819770812988, "rewards/rejected": -18.79197120666504, "step": 420 }, { "epoch": 0.07630504062121281, "grad_norm": 2.984375, "learning_rate": 1.6431123702243618e-05, "logits/chosen": -1.2505871057510376, "logits/rejected": -1.2604036331176758, "logps/chosen": -319.97369384765625, "logps/rejected": -420.20184326171875, "loss": 0.0256, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -9.754773139953613, "rewards/margins": 7.977179527282715, "rewards/rejected": -17.731952667236328, "step": 425 }, { "epoch": 0.07720274698146236, "grad_norm": 2.75, "learning_rate": 1.6430695382412714e-05, "logits/chosen": -1.2662450075149536, "logits/rejected": -1.2877540588378906, "logps/chosen": -322.77532958984375, "logps/rejected": -426.742919921875, "loss": 0.0443, "rewards/accuracies": 0.984375, "rewards/chosen": -10.8510103225708, "rewards/margins": 8.357492446899414, "rewards/rejected": -19.208499908447266, "step": 430 }, { "epoch": 0.07810045334171192, "grad_norm": 0.039794921875, "learning_rate": 1.6430261602459523e-05, "logits/chosen": -1.291669487953186, "logits/rejected": -1.3137457370758057, "logps/chosen": -338.3268127441406, "logps/rejected": -457.73077392578125, "loss": 0.0177, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -12.022209167480469, "rewards/margins": 10.057284355163574, "rewards/rejected": -22.07949447631836, "step": 435 }, { "epoch": 0.07899815970196149, "grad_norm": 7.125, "learning_rate": 1.6429822362769104e-05, "logits/chosen": -1.2740453481674194, "logits/rejected": -1.2928683757781982, "logps/chosen": -351.74859619140625, "logps/rejected": -467.54345703125, "loss": 0.0661, "rewards/accuracies": 0.96875, "rewards/chosen": -13.225227355957031, "rewards/margins": 9.72153377532959, "rewards/rejected": -22.946762084960938, "step": 440 }, { "epoch": 0.07989586606221105, "grad_norm": 5.0625, "learning_rate": 1.642937766373137e-05, "logits/chosen": -1.2580888271331787, "logits/rejected": -1.2826154232025146, "logps/chosen": -356.80303955078125, "logps/rejected": -462.6808166503906, "loss": 0.05, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -13.879257202148438, "rewards/margins": 8.64616584777832, "rewards/rejected": -22.525421142578125, "step": 445 }, { "epoch": 0.08079357242246062, "grad_norm": 3.53125, "learning_rate": 1.6428927505741077e-05, "logits/chosen": -1.3274773359298706, "logits/rejected": -1.3538029193878174, "logps/chosen": -351.6380920410156, "logps/rejected": -460.96075439453125, "loss": 0.0209, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -12.758635520935059, "rewards/margins": 9.032510757446289, "rewards/rejected": -21.79114532470703, "step": 450 }, { "epoch": 0.08169127878271018, "grad_norm": 2.140625, "learning_rate": 1.642847188919783e-05, "logits/chosen": -1.3372552394866943, "logits/rejected": -1.3623110055923462, "logps/chosen": -347.4330139160156, "logps/rejected": -465.4501037597656, "loss": 0.0277, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -12.52314281463623, "rewards/margins": 9.909950256347656, "rewards/rejected": -22.43309211730957, "step": 455 }, { "epoch": 0.08258898514295973, "grad_norm": 5.5, "learning_rate": 1.6428010814506082e-05, "logits/chosen": -1.3123576641082764, "logits/rejected": -1.339634895324707, "logps/chosen": -338.0999450683594, "logps/rejected": -456.74310302734375, "loss": 0.0419, "rewards/accuracies": 0.96875, "rewards/chosen": -13.015867233276367, "rewards/margins": 9.875136375427246, "rewards/rejected": -22.891002655029297, "step": 460 }, { "epoch": 0.0834866915032093, "grad_norm": 1.703125, "learning_rate": 1.6427544282075123e-05, "logits/chosen": -1.3849332332611084, "logits/rejected": -1.4038164615631104, "logps/chosen": -353.74908447265625, "logps/rejected": -478.51287841796875, "loss": 0.0462, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -13.985809326171875, "rewards/margins": 10.343815803527832, "rewards/rejected": -24.32962417602539, "step": 465 }, { "epoch": 0.08438439786345886, "grad_norm": 0.1669921875, "learning_rate": 1.642707229231909e-05, "logits/chosen": -1.3579334020614624, "logits/rejected": -1.3771896362304688, "logps/chosen": -369.145751953125, "logps/rejected": -490.97100830078125, "loss": 0.0146, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -15.570086479187012, "rewards/margins": 9.97675895690918, "rewards/rejected": -25.546846389770508, "step": 470 }, { "epoch": 0.08528210422370842, "grad_norm": 1.8203125, "learning_rate": 1.6426594845656973e-05, "logits/chosen": -1.355943202972412, "logits/rejected": -1.3650095462799072, "logps/chosen": -376.259521484375, "logps/rejected": -499.44970703125, "loss": 0.0148, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -15.661959648132324, "rewards/margins": 10.384978294372559, "rewards/rejected": -26.04693603515625, "step": 475 }, { "epoch": 0.08617981058395799, "grad_norm": 2.265625, "learning_rate": 1.642611194251259e-05, "logits/chosen": -1.355452299118042, "logits/rejected": -1.3569139242172241, "logps/chosen": -376.9379577636719, "logps/rejected": -506.89666748046875, "loss": 0.0385, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -15.602932929992676, "rewards/margins": 10.967208862304688, "rewards/rejected": -26.570140838623047, "step": 480 }, { "epoch": 0.08707751694420755, "grad_norm": 1.71875, "learning_rate": 1.642562358331462e-05, "logits/chosen": -1.3472042083740234, "logits/rejected": -1.3535155057907104, "logps/chosen": -362.6130676269531, "logps/rejected": -499.9092712402344, "loss": 0.0131, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -14.739013671875, "rewards/margins": 11.621049880981445, "rewards/rejected": -26.360065460205078, "step": 485 }, { "epoch": 0.08797522330445712, "grad_norm": 1.1171875, "learning_rate": 1.6425129768496577e-05, "logits/chosen": -1.3245633840560913, "logits/rejected": -1.3288484811782837, "logps/chosen": -364.94903564453125, "logps/rejected": -503.7896423339844, "loss": 0.0225, "rewards/accuracies": 0.984375, "rewards/chosen": -14.195643424987793, "rewards/margins": 11.918843269348145, "rewards/rejected": -26.114486694335938, "step": 490 }, { "epoch": 0.08887292966470667, "grad_norm": 3.65625, "learning_rate": 1.6424630498496813e-05, "logits/chosen": -1.3164643049240112, "logits/rejected": -1.3220335245132446, "logps/chosen": -365.0807189941406, "logps/rejected": -510.30340576171875, "loss": 0.0186, "rewards/accuracies": 0.984375, "rewards/chosen": -14.422874450683594, "rewards/margins": 12.447381973266602, "rewards/rejected": -26.870258331298828, "step": 495 }, { "epoch": 0.08977063602495623, "grad_norm": 2.203125, "learning_rate": 1.6424125773758535e-05, "logits/chosen": -1.418001413345337, "logits/rejected": -1.4166367053985596, "logps/chosen": -361.4501037597656, "logps/rejected": -499.55902099609375, "loss": 0.0316, "rewards/accuracies": 0.984375, "rewards/chosen": -14.319913864135742, "rewards/margins": 11.92530632019043, "rewards/rejected": -26.245220184326172, "step": 500 }, { "epoch": 0.08977063602495623, "eval_logits/chosen": -1.3433603048324585, "eval_logits/rejected": -1.36442232131958, "eval_logps/chosen": -369.4107666015625, "eval_logps/rejected": -511.26239013671875, "eval_loss": 0.014039273373782635, "eval_rewards/accuracies": 0.9900000095367432, "eval_rewards/chosen": -13.992281913757324, "eval_rewards/margins": 12.298800468444824, "eval_rewards/rejected": -26.29108238220215, "eval_runtime": 10.3033, "eval_samples_per_second": 19.411, "eval_steps_per_second": 19.411, "step": 500 } ], "logging_steps": 5, "max_steps": 16707, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }