{ "best_metric": 0.0, "best_model_checkpoint": "./dpo-lora-output/checkpoint-50", "epoch": 19.473684210526315, "eval_steps": 50, "global_step": 1850, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010526315789473684, "grad_norm": 3.8484461128973635e-07, "learning_rate": 0.00019997894736842106, "logits/chosen": -0.37785547971725464, "logits/rejected": -0.37785547971725464, "logps/chosen": -28.69106674194336, "logps/rejected": -28.69106674194336, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.021052631578947368, "grad_norm": 4.627001430890232e-07, "learning_rate": 0.00019995789473684214, "logits/chosen": -0.6673249006271362, "logits/rejected": -0.6673249006271362, "logps/chosen": -28.038116455078125, "logps/rejected": -28.038116455078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.00644683837890625, "rewards/margins": 0.0, "rewards/rejected": -0.00644683837890625, "step": 2 }, { "epoch": 0.031578947368421054, "grad_norm": 4.5330713760449726e-07, "learning_rate": 0.00019993684210526318, "logits/chosen": 0.0750800147652626, "logits/rejected": 0.0750800147652626, "logps/chosen": -28.565792083740234, "logps/rejected": -28.565792083740234, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.00939254742115736, "rewards/margins": 0.0, "rewards/rejected": -0.00939254742115736, "step": 3 }, { "epoch": 0.042105263157894736, "grad_norm": 3.203355163350352e-07, "learning_rate": 0.0001999157894736842, "logits/chosen": -0.2510129511356354, "logits/rejected": -0.2510129511356354, "logps/chosen": -28.46105194091797, "logps/rejected": -28.46105194091797, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.018783951178193092, "rewards/margins": 0.0, "rewards/rejected": -0.018783951178193092, "step": 4 }, { "epoch": 0.05263157894736842, "grad_norm": NaN, "learning_rate": 0.0001999157894736842, "logits/chosen": -0.31327107548713684, "logits/rejected": -0.31327107548713684, "logps/chosen": -29.54190444946289, "logps/rejected": -29.54190444946289, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03589210659265518, "rewards/margins": 0.0, "rewards/rejected": -0.03589210659265518, "step": 5 }, { "epoch": 0.06315789473684211, "grad_norm": 3.0991532184998505e-07, "learning_rate": 0.00019989473684210526, "logits/chosen": -0.7301732897758484, "logits/rejected": -0.7301732897758484, "logps/chosen": -28.049640655517578, "logps/rejected": -28.049640655517578, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.023381425067782402, "rewards/margins": 0.0, "rewards/rejected": -0.023381425067782402, "step": 6 }, { "epoch": 0.07368421052631578, "grad_norm": 5.081353720015613e-07, "learning_rate": 0.00019987368421052633, "logits/chosen": -0.15332195162773132, "logits/rejected": -0.15332195162773132, "logps/chosen": -29.715986251831055, "logps/rejected": -29.715986251831055, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.03316478803753853, "rewards/margins": 0.0, "rewards/rejected": -0.03316478803753853, "step": 7 }, { "epoch": 0.08421052631578947, "grad_norm": 4.6199787107070733e-07, "learning_rate": 0.00019985263157894738, "logits/chosen": 0.08251968026161194, "logits/rejected": 0.08251968026161194, "logps/chosen": -29.024179458618164, "logps/rejected": -29.024179458618164, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.055231284350156784, "rewards/margins": 0.0, "rewards/rejected": -0.055231284350156784, "step": 8 }, { "epoch": 0.09473684210526316, "grad_norm": 4.6873628889443353e-07, "learning_rate": 0.00019983157894736843, "logits/chosen": 0.0853329673409462, "logits/rejected": 0.0853329673409462, "logps/chosen": -29.1766357421875, "logps/rejected": -29.1766357421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.07047691196203232, "rewards/margins": 0.0, "rewards/rejected": -0.07047691196203232, "step": 9 }, { "epoch": 0.10526315789473684, "grad_norm": 3.2983530218189117e-07, "learning_rate": 0.00019981052631578948, "logits/chosen": -0.24946345388889313, "logits/rejected": -0.24946345388889313, "logps/chosen": -28.859678268432617, "logps/rejected": -28.859678268432617, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.058646585792303085, "rewards/margins": 0.0, "rewards/rejected": -0.058646585792303085, "step": 10 }, { "epoch": 0.11578947368421053, "grad_norm": 3.8660323298245203e-07, "learning_rate": 0.00019978947368421053, "logits/chosen": -0.41132038831710815, "logits/rejected": -0.41132038831710815, "logps/chosen": -29.74775505065918, "logps/rejected": -29.74775505065918, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.10566883534193039, "rewards/margins": 0.0, "rewards/rejected": -0.10566883534193039, "step": 11 }, { "epoch": 0.12631578947368421, "grad_norm": 4.862479272560449e-07, "learning_rate": 0.00019976842105263158, "logits/chosen": 0.0920749232172966, "logits/rejected": 0.0920749232172966, "logps/chosen": -29.676916122436523, "logps/rejected": -29.676916122436523, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.1205049529671669, "rewards/margins": 0.0, "rewards/rejected": -0.1205049529671669, "step": 12 }, { "epoch": 0.1368421052631579, "grad_norm": 3.2230357760454353e-07, "learning_rate": 0.00019974736842105263, "logits/chosen": -0.45576807856559753, "logits/rejected": -0.45576807856559753, "logps/chosen": -31.059167861938477, "logps/rejected": -31.059167861938477, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.08003788441419601, "rewards/margins": 0.0, "rewards/rejected": -0.08003788441419601, "step": 13 }, { "epoch": 0.14736842105263157, "grad_norm": 5.542100893762836e-07, "learning_rate": 0.0001997263157894737, "logits/chosen": -0.12948763370513916, "logits/rejected": -0.12948763370513916, "logps/chosen": -30.639995574951172, "logps/rejected": -30.639995574951172, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.12556572258472443, "rewards/margins": 0.0, "rewards/rejected": -0.12556572258472443, "step": 14 }, { "epoch": 0.15789473684210525, "grad_norm": 4.4282163003117603e-07, "learning_rate": 0.00019970526315789475, "logits/chosen": -0.32608121633529663, "logits/rejected": -0.32608121633529663, "logps/chosen": -30.3846378326416, "logps/rejected": -30.3846378326416, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.12016544491052628, "rewards/margins": 0.0, "rewards/rejected": -0.12016544491052628, "step": 15 }, { "epoch": 0.16842105263157894, "grad_norm": 5.774842293249094e-07, "learning_rate": 0.0001996842105263158, "logits/chosen": -0.12218209356069565, "logits/rejected": -0.12218209356069565, "logps/chosen": -30.943218231201172, "logps/rejected": -30.943218231201172, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.15588799118995667, "rewards/margins": 0.0, "rewards/rejected": -0.15588799118995667, "step": 16 }, { "epoch": 0.17894736842105263, "grad_norm": 3.617555535129213e-07, "learning_rate": 0.00019966315789473685, "logits/chosen": -0.2481478750705719, "logits/rejected": -0.2481478750705719, "logps/chosen": -29.472448348999023, "logps/rejected": -29.472448348999023, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.11992359161376953, "rewards/margins": 0.0, "rewards/rejected": -0.11992359161376953, "step": 17 }, { "epoch": 0.18947368421052632, "grad_norm": 5.118704962114862e-07, "learning_rate": 0.0001996421052631579, "logits/chosen": -0.6967118978500366, "logits/rejected": -0.6967118978500366, "logps/chosen": -28.57967758178711, "logps/rejected": -28.57967758178711, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.13177242875099182, "rewards/margins": 0.0, "rewards/rejected": -0.13177242875099182, "step": 18 }, { "epoch": 0.2, "grad_norm": 4.311013412916509e-07, "learning_rate": 0.00019962105263157895, "logits/chosen": -0.33055102825164795, "logits/rejected": -0.33055102825164795, "logps/chosen": -33.06563186645508, "logps/rejected": -33.06563186645508, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.12168388813734055, "rewards/margins": 0.0, "rewards/rejected": -0.12168388813734055, "step": 19 }, { "epoch": 0.21052631578947367, "grad_norm": 3.7089651527821843e-07, "learning_rate": 0.0001996, "logits/chosen": -0.24710780382156372, "logits/rejected": -0.24710780382156372, "logps/chosen": -29.734893798828125, "logps/rejected": -29.734893798828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.14616814255714417, "rewards/margins": 0.0, "rewards/rejected": -0.14616814255714417, "step": 20 }, { "epoch": 0.22105263157894736, "grad_norm": 4.842980843022815e-07, "learning_rate": 0.00019957894736842108, "logits/chosen": -0.3371769189834595, "logits/rejected": -0.3371769189834595, "logps/chosen": -31.09835433959961, "logps/rejected": -31.09835433959961, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.19153709709644318, "rewards/margins": 0.0, "rewards/rejected": -0.19153709709644318, "step": 21 }, { "epoch": 0.23157894736842105, "grad_norm": 6.520318152070104e-07, "learning_rate": 0.00019955789473684213, "logits/chosen": -0.10046061873435974, "logits/rejected": -0.10046061873435974, "logps/chosen": -31.823505401611328, "logps/rejected": -31.823505401611328, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.24391670525074005, "rewards/margins": 0.0, "rewards/rejected": -0.24391670525074005, "step": 22 }, { "epoch": 0.24210526315789474, "grad_norm": 3.677838549265289e-07, "learning_rate": 0.00019953684210526317, "logits/chosen": -0.492754727602005, "logits/rejected": -0.492754727602005, "logps/chosen": -31.88969612121582, "logps/rejected": -31.88969612121582, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.16309070587158203, "rewards/margins": 0.0, "rewards/rejected": -0.16309070587158203, "step": 23 }, { "epoch": 0.25263157894736843, "grad_norm": 4.003734943580639e-07, "learning_rate": 0.00019951578947368422, "logits/chosen": -0.45025864243507385, "logits/rejected": -0.45025864243507385, "logps/chosen": -30.97303009033203, "logps/rejected": -30.97303009033203, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.22819633781909943, "rewards/margins": 0.0, "rewards/rejected": -0.22819633781909943, "step": 24 }, { "epoch": 0.2631578947368421, "grad_norm": 4.081555289303651e-07, "learning_rate": 0.00019949473684210527, "logits/chosen": -0.24635930359363556, "logits/rejected": -0.24635930359363556, "logps/chosen": -30.218860626220703, "logps/rejected": -30.218860626220703, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.1945648193359375, "rewards/margins": 0.0, "rewards/rejected": -0.1945648193359375, "step": 25 }, { "epoch": 0.2736842105263158, "grad_norm": 6.133328724899911e-07, "learning_rate": 0.00019947368421052632, "logits/chosen": 0.11159560084342957, "logits/rejected": 0.11159560084342957, "logps/chosen": -31.607969284057617, "logps/rejected": -31.607969284057617, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.3136102855205536, "rewards/margins": 0.0, "rewards/rejected": -0.3136102855205536, "step": 26 }, { "epoch": 0.28421052631578947, "grad_norm": 5.27759880242229e-07, "learning_rate": 0.00019945263157894737, "logits/chosen": -0.7260520458221436, "logits/rejected": -0.7260520458221436, "logps/chosen": -29.5076961517334, "logps/rejected": -29.5076961517334, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.2245742827653885, "rewards/margins": 0.0, "rewards/rejected": -0.2245742827653885, "step": 27 }, { "epoch": 0.29473684210526313, "grad_norm": 3.844326954549615e-07, "learning_rate": 0.00019943157894736845, "logits/chosen": -0.5129251480102539, "logits/rejected": -0.5129251480102539, "logps/chosen": -32.33815002441406, "logps/rejected": -32.33815002441406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.207936093211174, "rewards/margins": 0.0, "rewards/rejected": -0.207936093211174, "step": 28 }, { "epoch": 0.30526315789473685, "grad_norm": 6.856218988104956e-07, "learning_rate": 0.0001994105263157895, "logits/chosen": 0.11357959359884262, "logits/rejected": 0.11357959359884262, "logps/chosen": -32.002952575683594, "logps/rejected": -32.002952575683594, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.35310861468315125, "rewards/margins": 0.0, "rewards/rejected": -0.35310861468315125, "step": 29 }, { "epoch": 0.3157894736842105, "grad_norm": 8.271791784864035e-07, "learning_rate": 0.00019938947368421052, "logits/chosen": 0.11376441270112991, "logits/rejected": 0.11376441270112991, "logps/chosen": -32.15091323852539, "logps/rejected": -32.15091323852539, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.3679046630859375, "rewards/margins": 0.0, "rewards/rejected": -0.3679046630859375, "step": 30 }, { "epoch": 0.3263157894736842, "grad_norm": 5.130211206960666e-07, "learning_rate": 0.00019936842105263157, "logits/chosen": -0.3664137125015259, "logits/rejected": -0.3664137125015259, "logps/chosen": -34.016319274902344, "logps/rejected": -34.016319274902344, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.21675263345241547, "rewards/margins": 0.0, "rewards/rejected": -0.21675263345241547, "step": 31 }, { "epoch": 0.3368421052631579, "grad_norm": 4.994833489035955e-07, "learning_rate": 0.00019934736842105265, "logits/chosen": -0.24606359004974365, "logits/rejected": -0.24606359004974365, "logps/chosen": -30.879606246948242, "logps/rejected": -30.879606246948242, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.26063939929008484, "rewards/margins": 0.0, "rewards/rejected": -0.26063939929008484, "step": 32 }, { "epoch": 0.3473684210526316, "grad_norm": 3.894836311246763e-07, "learning_rate": 0.0001993263157894737, "logits/chosen": -0.8016754984855652, "logits/rejected": -0.8016754984855652, "logps/chosen": -30.204538345336914, "logps/rejected": -30.204538345336914, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.23887120187282562, "rewards/margins": 0.0, "rewards/rejected": -0.23887120187282562, "step": 33 }, { "epoch": 0.35789473684210527, "grad_norm": 7.926749390207988e-07, "learning_rate": 0.00019930526315789474, "logits/chosen": 0.11515742540359497, "logits/rejected": 0.11515742540359497, "logps/chosen": -32.81196975708008, "logps/rejected": -32.81196975708008, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.4340103268623352, "rewards/margins": 0.0, "rewards/rejected": -0.4340103268623352, "step": 34 }, { "epoch": 0.3684210526315789, "grad_norm": 5.802260147902416e-07, "learning_rate": 0.0001992842105263158, "logits/chosen": -0.7546766996383667, "logits/rejected": -0.7546766996383667, "logps/chosen": -30.477094650268555, "logps/rejected": -30.477094650268555, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.3215141296386719, "rewards/margins": 0.0, "rewards/rejected": -0.3215141296386719, "step": 35 }, { "epoch": 0.37894736842105264, "grad_norm": 9.166934091808798e-07, "learning_rate": 0.00019926315789473687, "logits/chosen": -0.07739929854869843, "logits/rejected": -0.07739929854869843, "logps/chosen": -33.753623962402344, "logps/rejected": -33.753623962402344, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.43692857027053833, "rewards/margins": 0.0, "rewards/rejected": -0.43692857027053833, "step": 36 }, { "epoch": 0.3894736842105263, "grad_norm": 7.862927873247827e-07, "learning_rate": 0.0001992421052631579, "logits/chosen": -0.8069098591804504, "logits/rejected": -0.8069098591804504, "logps/chosen": -31.68317413330078, "logps/rejected": -31.68317413330078, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.3709526062011719, "rewards/margins": 0.0, "rewards/rejected": -0.3709526062011719, "step": 37 }, { "epoch": 0.4, "grad_norm": 5.130741556058638e-07, "learning_rate": 0.00019922105263157894, "logits/chosen": -0.5547216534614563, "logits/rejected": -0.5547216534614563, "logps/chosen": -33.31239318847656, "logps/rejected": -33.31239318847656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.3053604066371918, "rewards/margins": 0.0, "rewards/rejected": -0.3053604066371918, "step": 38 }, { "epoch": 0.4105263157894737, "grad_norm": 8.027863032111782e-07, "learning_rate": 0.00019920000000000002, "logits/chosen": 0.11398957669734955, "logits/rejected": 0.11398957669734955, "logps/chosen": -33.665382385253906, "logps/rejected": -33.665382385253906, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.519351601600647, "rewards/margins": 0.0, "rewards/rejected": -0.519351601600647, "step": 39 }, { "epoch": 0.42105263157894735, "grad_norm": 6.375659040713799e-07, "learning_rate": 0.00019917894736842107, "logits/chosen": -0.38180088996887207, "logits/rejected": -0.38180088996887207, "logps/chosen": -33.36181640625, "logps/rejected": -33.36181640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.4178833067417145, "rewards/margins": 0.0, "rewards/rejected": -0.4178833067417145, "step": 40 }, { "epoch": 0.43157894736842106, "grad_norm": 1.0929778682111646e-06, "learning_rate": 0.00019915789473684212, "logits/chosen": -0.07893847674131393, "logits/rejected": -0.07893847674131393, "logps/chosen": -34.562191009521484, "logps/rejected": -34.562191009521484, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.5177852511405945, "rewards/margins": 0.0, "rewards/rejected": -0.5177852511405945, "step": 41 }, { "epoch": 0.4421052631578947, "grad_norm": 1.0339128948544385e-06, "learning_rate": 0.00019913684210526317, "logits/chosen": -0.07934816181659698, "logits/rejected": -0.07934816181659698, "logps/chosen": -34.74945831298828, "logps/rejected": -34.74945831298828, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.536512017250061, "rewards/margins": 0.0, "rewards/rejected": -0.536512017250061, "step": 42 }, { "epoch": 0.45263157894736844, "grad_norm": 6.676766020063951e-07, "learning_rate": 0.00019911578947368421, "logits/chosen": -0.4101516604423523, "logits/rejected": -0.4101516604423523, "logps/chosen": -35.05952453613281, "logps/rejected": -35.05952453613281, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.32107314467430115, "rewards/margins": 0.0, "rewards/rejected": -0.32107314467430115, "step": 43 }, { "epoch": 0.4631578947368421, "grad_norm": 5.393874857873016e-07, "learning_rate": 0.00019909473684210526, "logits/chosen": -0.5251553058624268, "logits/rejected": -0.5251553058624268, "logps/chosen": -32.90340805053711, "logps/rejected": -32.90340805053711, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.421234130859375, "rewards/margins": 0.0, "rewards/rejected": -0.421234130859375, "step": 44 }, { "epoch": 0.47368421052631576, "grad_norm": 5.218650471761066e-07, "learning_rate": 0.0001990736842105263, "logits/chosen": -0.5867052674293518, "logits/rejected": -0.5867052674293518, "logps/chosen": -34.09800720214844, "logps/rejected": -34.09800720214844, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.3839218318462372, "rewards/margins": 0.0, "rewards/rejected": -0.3839218318462372, "step": 45 }, { "epoch": 0.4842105263157895, "grad_norm": 5.97272503455315e-07, "learning_rate": 0.0001990526315789474, "logits/chosen": -0.5924510955810547, "logits/rejected": -0.5924510955810547, "logps/chosen": -34.22953796386719, "logps/rejected": -34.22953796386719, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.3970749080181122, "rewards/margins": 0.0, "rewards/rejected": -0.3970749080181122, "step": 46 }, { "epoch": 0.49473684210526314, "grad_norm": 5.906633759877877e-07, "learning_rate": 0.00019903157894736844, "logits/chosen": -0.8118030428886414, "logits/rejected": -0.8118030428886414, "logps/chosen": -32.110595703125, "logps/rejected": -32.110595703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.4848642349243164, "rewards/margins": 0.0, "rewards/rejected": -0.4848642349243164, "step": 47 }, { "epoch": 0.5052631578947369, "grad_norm": 6.80591313084733e-07, "learning_rate": 0.0001990105263157895, "logits/chosen": -0.25058746337890625, "logits/rejected": -0.25058746337890625, "logps/chosen": -32.47521209716797, "logps/rejected": -32.47521209716797, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.4201999604701996, "rewards/margins": 0.0, "rewards/rejected": -0.4201999604701996, "step": 48 }, { "epoch": 0.5157894736842106, "grad_norm": 6.16580393852928e-07, "learning_rate": 0.00019898947368421054, "logits/chosen": -0.6074540615081787, "logits/rejected": -0.6074540615081787, "logps/chosen": -34.65234375, "logps/rejected": -34.65234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.4393554627895355, "rewards/margins": 0.0, "rewards/rejected": -0.4393554627895355, "step": 49 }, { "epoch": 0.5263157894736842, "grad_norm": 1.2105505220461055e-06, "learning_rate": 0.00019896842105263159, "logits/chosen": -0.09993575513362885, "logits/rejected": -0.09993575513362885, "logps/chosen": -36.35881042480469, "logps/rejected": -36.35881042480469, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.6974472403526306, "rewards/margins": 0.0, "rewards/rejected": -0.6974472403526306, "step": 50 }, { "epoch": 0.5263157894736842, "eval_logits/chosen": -0.35655614733695984, "eval_logits/rejected": -0.35655614733695984, "eval_logps/chosen": -35.03644561767578, "eval_logps/rejected": -35.03644561767578, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -0.6004651188850403, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -0.6004651188850403, "eval_runtime": 4.5852, "eval_samples_per_second": 2.181, "eval_steps_per_second": 2.181, "step": 50 }, { "epoch": 0.5368421052631579, "grad_norm": 1.3206732774051488e-06, "learning_rate": 0.00019894736842105264, "logits/chosen": -0.10419120639562607, "logits/rejected": -0.10419120639562607, "logps/chosen": -36.56370544433594, "logps/rejected": -36.56370544433594, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.7179366946220398, "rewards/margins": 0.0, "rewards/rejected": -0.7179366946220398, "step": 51 }, { "epoch": 0.5473684210526316, "grad_norm": 1.391149226037669e-06, "learning_rate": 0.00019892631578947368, "logits/chosen": -0.1077694371342659, "logits/rejected": -0.1077694371342659, "logps/chosen": -36.836097717285156, "logps/rejected": -36.836097717285156, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.7451759576797485, "rewards/margins": 0.0, "rewards/rejected": -0.7451759576797485, "step": 52 }, { "epoch": 0.5578947368421052, "grad_norm": 6.259977567424357e-07, "learning_rate": 0.00019890526315789476, "logits/chosen": -0.629492461681366, "logits/rejected": -0.629492461681366, "logps/chosen": -35.2900390625, "logps/rejected": -35.2900390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.503125011920929, "rewards/margins": 0.0, "rewards/rejected": -0.503125011920929, "step": 53 }, { "epoch": 0.5684210526315789, "grad_norm": 9.902968258757028e-07, "learning_rate": 0.0001988842105263158, "logits/chosen": -0.8901774287223816, "logits/rejected": -0.8901774287223816, "logps/chosen": -34.150787353515625, "logps/rejected": -34.150787353515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.6177139282226562, "rewards/margins": 0.0, "rewards/rejected": -0.6177139282226562, "step": 54 }, { "epoch": 0.5789473684210527, "grad_norm": 7.941444550851884e-07, "learning_rate": 0.00019886315789473686, "logits/chosen": -0.25454312562942505, "logits/rejected": -0.25454312562942505, "logps/chosen": -33.346412658691406, "logps/rejected": -33.346412658691406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.5073200464248657, "rewards/margins": 0.0, "rewards/rejected": -0.5073200464248657, "step": 55 }, { "epoch": 0.5894736842105263, "grad_norm": 5.561834086620365e-07, "learning_rate": 0.0001988421052631579, "logits/chosen": -0.8666642904281616, "logits/rejected": -0.8666642904281616, "logps/chosen": -32.6674690246582, "logps/rejected": -32.6674690246582, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.48516425490379333, "rewards/margins": 0.0, "rewards/rejected": -0.48516425490379333, "step": 56 }, { "epoch": 0.6, "grad_norm": 1.085165763470286e-06, "learning_rate": 0.00019882105263157896, "logits/chosen": -0.9087679386138916, "logits/rejected": -0.9087679386138916, "logps/chosen": -34.766693115234375, "logps/rejected": -34.766693115234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.6793045401573181, "rewards/margins": 0.0, "rewards/rejected": -0.6793045401573181, "step": 57 }, { "epoch": 0.6105263157894737, "grad_norm": 6.484092978098488e-07, "learning_rate": 0.0001988, "logits/chosen": -0.8767624497413635, "logits/rejected": -0.8767624497413635, "logps/chosen": -33.77532196044922, "logps/rejected": -33.77532196044922, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.6513368487358093, "rewards/margins": 0.0, "rewards/rejected": -0.6513368487358093, "step": 58 }, { "epoch": 0.6210526315789474, "grad_norm": 5.605424462373776e-07, "learning_rate": 0.00019877894736842106, "logits/chosen": -0.8755089044570923, "logits/rejected": -0.8755089044570923, "logps/chosen": -33.11381149291992, "logps/rejected": -33.11381149291992, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.5297985076904297, "rewards/margins": 0.0, "rewards/rejected": -0.5297985076904297, "step": 59 }, { "epoch": 0.631578947368421, "grad_norm": 7.012636160652619e-07, "learning_rate": 0.00019875789473684213, "logits/chosen": -0.44773852825164795, "logits/rejected": -0.44773852825164795, "logps/chosen": -36.03230285644531, "logps/rejected": -36.03230285644531, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.6849319338798523, "rewards/margins": 0.0, "rewards/rejected": -0.6849319338798523, "step": 60 }, { "epoch": 0.6421052631578947, "grad_norm": 7.327849971261458e-07, "learning_rate": 0.00019873684210526318, "logits/chosen": -0.4542182385921478, "logits/rejected": -0.4542182385921478, "logps/chosen": -36.22119140625, "logps/rejected": -36.22119140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.7038208246231079, "rewards/margins": 0.0, "rewards/rejected": -0.7038208246231079, "step": 61 }, { "epoch": 0.6526315789473685, "grad_norm": 8.450563200312899e-07, "learning_rate": 0.0001987157894736842, "logits/chosen": -0.4958184063434601, "logits/rejected": -0.4958184063434601, "logps/chosen": -37.08246612548828, "logps/rejected": -37.08246612548828, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.5233673453330994, "rewards/margins": 0.0, "rewards/rejected": -0.5233673453330994, "step": 62 }, { "epoch": 0.6631578947368421, "grad_norm": 6.643780920967401e-07, "learning_rate": 0.00019869473684210525, "logits/chosen": -0.8882527351379395, "logits/rejected": -0.8882527351379395, "logps/chosen": -33.791908264160156, "logps/rejected": -33.791908264160156, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.597608208656311, "rewards/margins": 0.0, "rewards/rejected": -0.597608208656311, "step": 63 }, { "epoch": 0.6736842105263158, "grad_norm": 1.3917771184424055e-06, "learning_rate": 0.00019867368421052633, "logits/chosen": 0.06700892001390457, "logits/rejected": 0.06700892001390457, "logps/chosen": -38.569244384765625, "logps/rejected": -38.569244384765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.0097378492355347, "rewards/margins": 0.0, "rewards/rejected": -1.0097378492355347, "step": 64 }, { "epoch": 0.6842105263157895, "grad_norm": 1.605672423465876e-06, "learning_rate": 0.00019865263157894738, "logits/chosen": 0.060907840728759766, "logits/rejected": 0.060907840728759766, "logps/chosen": -38.83467483520508, "logps/rejected": -38.83467483520508, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.036280870437622, "rewards/margins": 0.0, "rewards/rejected": -1.036280870437622, "step": 65 }, { "epoch": 0.6947368421052632, "grad_norm": 1.30730154523917e-06, "learning_rate": 0.00019863157894736843, "logits/chosen": -0.19624063372612, "logits/rejected": -0.19624063372612, "logps/chosen": -40.40142822265625, "logps/rejected": -40.40142822265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.101709008216858, "rewards/margins": 0.0, "rewards/rejected": -1.101709008216858, "step": 66 }, { "epoch": 0.7052631578947368, "grad_norm": 1.4441985740631935e-06, "learning_rate": 0.00019861052631578948, "logits/chosen": 0.04746666178107262, "logits/rejected": 0.04746666178107262, "logps/chosen": -39.52649688720703, "logps/rejected": -39.52649688720703, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.1054630279541016, "rewards/margins": 0.0, "rewards/rejected": -1.1054630279541016, "step": 67 }, { "epoch": 0.7157894736842105, "grad_norm": 1.154596361629956e-06, "learning_rate": 0.00019858947368421053, "logits/chosen": -0.2756457030773163, "logits/rejected": -0.2756457030773163, "logps/chosen": -35.44649887084961, "logps/rejected": -35.44649887084961, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.717328667640686, "rewards/margins": 0.0, "rewards/rejected": -0.717328667640686, "step": 68 }, { "epoch": 0.7263157894736842, "grad_norm": 1.64998914442549e-06, "learning_rate": 0.00019856842105263158, "logits/chosen": 0.029611682519316673, "logits/rejected": 0.029611682519316673, "logps/chosen": -40.39375305175781, "logps/rejected": -40.39375305175781, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.1921886205673218, "rewards/margins": 0.0, "rewards/rejected": -1.1921886205673218, "step": 69 }, { "epoch": 0.7368421052631579, "grad_norm": 7.885269610596879e-07, "learning_rate": 0.00019854736842105263, "logits/chosen": -0.7255390286445618, "logits/rejected": -0.7255390286445618, "logps/chosen": -38.25248718261719, "logps/rejected": -38.25248718261719, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.7993698120117188, "rewards/margins": 0.0, "rewards/rejected": -0.7993698120117188, "step": 70 }, { "epoch": 0.7473684210526316, "grad_norm": 1.3629967270389898e-06, "learning_rate": 0.0001985263157894737, "logits/chosen": -0.283242791891098, "logits/rejected": -0.283242791891098, "logps/chosen": -36.21233367919922, "logps/rejected": -36.21233367919922, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.7939121127128601, "rewards/margins": 0.0, "rewards/rejected": -0.7939121127128601, "step": 71 }, { "epoch": 0.7578947368421053, "grad_norm": 1.0566282071522437e-06, "learning_rate": 0.00019850526315789475, "logits/chosen": -0.737543523311615, "logits/rejected": -0.737543523311615, "logps/chosen": -38.730262756347656, "logps/rejected": -38.730262756347656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.8471474051475525, "rewards/margins": 0.0, "rewards/rejected": -0.8471474051475525, "step": 72 }, { "epoch": 0.7684210526315789, "grad_norm": 7.137317084016104e-07, "learning_rate": 0.0001984842105263158, "logits/chosen": -0.9320713877677917, "logits/rejected": -0.9320713877677917, "logps/chosen": -35.780574798583984, "logps/rejected": -35.780574798583984, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.7964748740196228, "rewards/margins": 0.0, "rewards/rejected": -0.7964748740196228, "step": 73 }, { "epoch": 0.7789473684210526, "grad_norm": 1.0629120197336306e-06, "learning_rate": 0.00019846315789473685, "logits/chosen": -0.7492873072624207, "logits/rejected": -0.7492873072624207, "logps/chosen": -39.273284912109375, "logps/rejected": -39.273284912109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.9014496207237244, "rewards/margins": 0.0, "rewards/rejected": -0.9014496207237244, "step": 74 }, { "epoch": 0.7894736842105263, "grad_norm": 1.5789109966135584e-06, "learning_rate": 0.0001984421052631579, "logits/chosen": -0.03452115133404732, "logits/rejected": -0.03452115133404732, "logps/chosen": -42.939208984375, "logps/rejected": -42.939208984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.4467343091964722, "rewards/margins": 0.0, "rewards/rejected": -1.4467343091964722, "step": 75 }, { "epoch": 0.8, "grad_norm": 1.606632508810435e-06, "learning_rate": 0.00019842105263157895, "logits/chosen": -0.3005423843860626, "logits/rejected": -0.3005423843860626, "logps/chosen": -37.789302825927734, "logps/rejected": -37.789302825927734, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.9516090750694275, "rewards/margins": 0.0, "rewards/rejected": -0.9516090750694275, "step": 76 }, { "epoch": 0.8105263157894737, "grad_norm": 2.1266146177367773e-06, "learning_rate": 0.0001984, "logits/chosen": -1.0360406637191772, "logits/rejected": -1.0360406637191772, "logps/chosen": -39.363311767578125, "logps/rejected": -39.363311767578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.13896644115448, "rewards/margins": 0.0, "rewards/rejected": -1.13896644115448, "step": 77 }, { "epoch": 0.8210526315789474, "grad_norm": 1.5555992831650656e-06, "learning_rate": 0.00019837894736842107, "logits/chosen": -0.3110392987728119, "logits/rejected": -0.3110392987728119, "logps/chosen": -38.595924377441406, "logps/rejected": -38.595924377441406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.0322712659835815, "rewards/margins": 0.0, "rewards/rejected": -1.0322712659835815, "step": 78 }, { "epoch": 0.8315789473684211, "grad_norm": 1.6888642448975588e-06, "learning_rate": 0.00019835789473684212, "logits/chosen": -0.2862299978733063, "logits/rejected": -0.2862299978733063, "logps/chosen": -45.134254455566406, "logps/rejected": -45.134254455566406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.5749915838241577, "rewards/margins": 0.0, "rewards/rejected": -1.5749915838241577, "step": 79 }, { "epoch": 0.8421052631578947, "grad_norm": 7.403574500131072e-07, "learning_rate": 0.00019833684210526317, "logits/chosen": -0.9862018823623657, "logits/rejected": -0.9862018823623657, "logps/chosen": -37.278656005859375, "logps/rejected": -37.278656005859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.9462829828262329, "rewards/margins": 0.0, "rewards/rejected": -0.9462829828262329, "step": 80 }, { "epoch": 0.8526315789473684, "grad_norm": 1.5618402358086314e-06, "learning_rate": 0.00019831578947368422, "logits/chosen": -0.32921895384788513, "logits/rejected": -0.32921895384788513, "logps/chosen": -40.038116455078125, "logps/rejected": -40.038116455078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.1764904260635376, "rewards/margins": 0.0, "rewards/rejected": -1.1764904260635376, "step": 81 }, { "epoch": 0.8631578947368421, "grad_norm": NaN, "learning_rate": 0.00019831578947368422, "logits/chosen": -1.0665150880813599, "logits/rejected": -1.0665150880813599, "logps/chosen": -40.92689514160156, "logps/rejected": -40.92689514160156, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.295324683189392, "rewards/margins": 0.0, "rewards/rejected": -1.295324683189392, "step": 82 }, { "epoch": 0.8736842105263158, "grad_norm": 8.206494612750248e-07, "learning_rate": 0.00019829473684210527, "logits/chosen": -1.011318564414978, "logits/rejected": -1.011318564414978, "logps/chosen": -37.846710205078125, "logps/rejected": -37.846710205078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.003088355064392, "rewards/margins": 0.0, "rewards/rejected": -1.003088355064392, "step": 83 }, { "epoch": 0.8842105263157894, "grad_norm": 1.1817699032690143e-06, "learning_rate": 0.00019827368421052632, "logits/chosen": -0.8081866502761841, "logits/rejected": -0.8081866502761841, "logps/chosen": -42.06470489501953, "logps/rejected": -42.06470489501953, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.1805915832519531, "rewards/margins": 0.0, "rewards/rejected": -1.1805915832519531, "step": 84 }, { "epoch": 0.8947368421052632, "grad_norm": 1.2260217090442893e-06, "learning_rate": 0.00019825263157894737, "logits/chosen": -0.7983872294425964, "logits/rejected": -0.7983872294425964, "logps/chosen": -40.32405090332031, "logps/rejected": -40.32405090332031, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.163298487663269, "rewards/margins": 0.0, "rewards/rejected": -1.163298487663269, "step": 85 }, { "epoch": 0.9052631578947369, "grad_norm": 1.0451011576151359e-06, "learning_rate": 0.00019823157894736845, "logits/chosen": -0.6131337285041809, "logits/rejected": -0.6131337285041809, "logps/chosen": -41.054622650146484, "logps/rejected": -41.054622650146484, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.9205829501152039, "rewards/margins": 0.0, "rewards/rejected": -0.9205829501152039, "step": 86 }, { "epoch": 0.9157894736842105, "grad_norm": 1.6260490838249098e-06, "learning_rate": 0.0001982105263157895, "logits/chosen": -0.36974355578422546, "logits/rejected": -0.36974355578422546, "logps/chosen": -42.681610107421875, "logps/rejected": -42.681610107421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.4408397674560547, "rewards/margins": 0.0, "rewards/rejected": -1.4408397674560547, "step": 87 }, { "epoch": 0.9263157894736842, "grad_norm": 2.0204777229082538e-06, "learning_rate": 0.00019818947368421052, "logits/chosen": -0.8440442085266113, "logits/rejected": -0.8440442085266113, "logps/chosen": -43.71595764160156, "logps/rejected": -43.71595764160156, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.3457168340682983, "rewards/margins": 0.0, "rewards/rejected": -1.3457168340682983, "step": 88 }, { "epoch": 0.9368421052631579, "grad_norm": NaN, "learning_rate": 0.00019818947368421052, "logits/chosen": -0.8557513952255249, "logits/rejected": -0.8557513952255249, "logps/chosen": -44.365943908691406, "logps/rejected": -44.365943908691406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.4107154607772827, "rewards/margins": 0.0, "rewards/rejected": -1.4107154607772827, "step": 89 }, { "epoch": 0.9473684210526315, "grad_norm": 7.714015737292357e-07, "learning_rate": 0.0001981684210526316, "logits/chosen": -1.0934278964996338, "logits/rejected": -1.0934278964996338, "logps/chosen": -39.80454635620117, "logps/rejected": -39.80454635620117, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.1988719701766968, "rewards/margins": 0.0, "rewards/rejected": -1.1988719701766968, "step": 90 }, { "epoch": 0.9578947368421052, "grad_norm": 9.797598750083125e-07, "learning_rate": 0.00019814736842105264, "logits/chosen": -0.8375434875488281, "logits/rejected": -0.8375434875488281, "logps/chosen": -42.35945129394531, "logps/rejected": -42.35945129394531, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.3668384552001953, "rewards/margins": 0.0, "rewards/rejected": -1.3668384552001953, "step": 91 }, { "epoch": 0.968421052631579, "grad_norm": 1.3935992910774075e-06, "learning_rate": 0.0001981263157894737, "logits/chosen": -0.841162919998169, "logits/rejected": -0.841162919998169, "logps/chosen": -42.723976135253906, "logps/rejected": -42.723976135253906, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.4032909870147705, "rewards/margins": 0.0, "rewards/rejected": -1.4032909870147705, "step": 92 }, { "epoch": 0.9789473684210527, "grad_norm": NaN, "learning_rate": 0.0001981263157894737, "logits/chosen": -1.1410332918167114, "logits/rejected": -1.1410332918167114, "logps/chosen": -40.401309967041016, "logps/rejected": -40.401309967041016, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.258548378944397, "rewards/margins": 0.0, "rewards/rejected": -1.258548378944397, "step": 93 }, { "epoch": 0.9894736842105263, "grad_norm": 1.48494746099459e-06, "learning_rate": 0.00019810526315789474, "logits/chosen": -1.1042628288269043, "logits/rejected": -1.1042628288269043, "logps/chosen": -39.81610107421875, "logps/rejected": -39.81610107421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.2554148435592651, "rewards/margins": 0.0, "rewards/rejected": -1.2554148435592651, "step": 94 }, { "epoch": 1.0, "grad_norm": 2.16625994653441e-06, "learning_rate": 0.00019808421052631582, "logits/chosen": -0.42906633019447327, "logits/rejected": -0.42906633019447327, "logps/chosen": -45.888736724853516, "logps/rejected": -45.888736724853516, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.7615524530410767, "rewards/margins": 0.0, "rewards/rejected": -1.7615524530410767, "step": 95 }, { "epoch": 1.0105263157894737, "grad_norm": 3.047139443879132e-06, "learning_rate": 0.00019806315789473687, "logits/chosen": -0.4366866648197174, "logits/rejected": -0.4366866648197174, "logps/chosen": -46.279537200927734, "logps/rejected": -46.279537200927734, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.8006324768066406, "rewards/margins": 0.0, "rewards/rejected": -1.8006324768066406, "step": 96 }, { "epoch": 1.0210526315789474, "grad_norm": 3.3121996239060536e-06, "learning_rate": 0.0001980421052631579, "logits/chosen": -0.4385872781276703, "logits/rejected": -0.4385872781276703, "logps/chosen": -46.83531951904297, "logps/rejected": -46.83531951904297, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.856210708618164, "rewards/margins": 0.0, "rewards/rejected": -1.856210708618164, "step": 97 }, { "epoch": 1.0315789473684212, "grad_norm": NaN, "learning_rate": 0.0001980421052631579, "logits/chosen": -1.1934232711791992, "logits/rejected": -1.1934232711791992, "logps/chosen": -40.99372482299805, "logps/rejected": -40.99372482299805, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.317789912223816, "rewards/margins": 0.0, "rewards/rejected": -1.317789912223816, "step": 98 }, { "epoch": 1.0421052631578946, "grad_norm": 1.223079038936703e-06, "learning_rate": 0.00019802105263157894, "logits/chosen": -0.9469152092933655, "logits/rejected": -0.9469152092933655, "logps/chosen": -47.82305908203125, "logps/rejected": -47.82305908203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.7564270496368408, "rewards/margins": 0.0, "rewards/rejected": -1.7564270496368408, "step": 99 }, { "epoch": 1.0526315789473684, "grad_norm": 3.1345925890491344e-06, "learning_rate": 0.00019800000000000002, "logits/chosen": -0.6653971076011658, "logits/rejected": -0.6653971076011658, "logps/chosen": -43.97892761230469, "logps/rejected": -43.97892761230469, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.2130135297775269, "rewards/margins": 0.0, "rewards/rejected": -1.2130135297775269, "step": 100 }, { "epoch": 1.0526315789473684, "eval_logits/chosen": -0.6583244800567627, "eval_logits/rejected": -0.6583244800567627, "eval_logps/chosen": -47.84056854248047, "eval_logps/rejected": -47.84056854248047, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -1.880876898765564, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -1.880876898765564, "eval_runtime": 4.4813, "eval_samples_per_second": 2.231, "eval_steps_per_second": 2.231, "step": 100 }, { "epoch": 1.063157894736842, "grad_norm": 1.936478838615585e-06, "learning_rate": 0.00019797894736842106, "logits/chosen": -0.43286311626434326, "logits/rejected": -0.43286311626434326, "logps/chosen": -49.19889831542969, "logps/rejected": -49.19889831542969, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -2.0925686359405518, "rewards/margins": 0.0, "rewards/rejected": -2.0925686359405518, "step": 101 }, { "epoch": 1.0736842105263158, "grad_norm": 5.185303962207399e-06, "learning_rate": 0.00019795789473684211, "logits/chosen": -1.1521388292312622, "logits/rejected": -1.1521388292312622, "logps/chosen": -46.65351486206055, "logps/rejected": -46.65351486206055, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.8679866790771484, "rewards/margins": 0.0, "rewards/rejected": -1.8679866790771484, "step": 102 }, { "epoch": 1.0842105263157895, "grad_norm": 8.595636700192699e-07, "learning_rate": 0.00019793684210526316, "logits/chosen": -1.2388026714324951, "logits/rejected": -1.2388026714324951, "logps/chosen": -42.79230880737305, "logps/rejected": -42.79230880737305, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.4976482391357422, "rewards/margins": 0.0, "rewards/rejected": -1.4976482391357422, "step": 103 }, { "epoch": 1.0947368421052632, "grad_norm": 2.0735121779580368e-06, "learning_rate": 0.0001979157894736842, "logits/chosen": -0.25976407527923584, "logits/rejected": -0.25976407527923584, "logps/chosen": -53.21000289916992, "logps/rejected": -53.21000289916992, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -2.473813772201538, "rewards/margins": 0.0, "rewards/rejected": -2.473813772201538, "step": 104 }, { "epoch": 1.1052631578947367, "grad_norm": 2.4458829557261197e-06, "learning_rate": 0.00019789473684210526, "logits/chosen": -1.2645772695541382, "logits/rejected": -1.2645772695541382, "logps/chosen": -48.962562561035156, "logps/rejected": -48.962562561035156, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -2.098891496658325, "rewards/margins": 0.0, "rewards/rejected": -2.098891496658325, "step": 105 }, { "epoch": 1.1157894736842104, "grad_norm": 1.8741600342764286e-06, "learning_rate": 0.0001978736842105263, "logits/chosen": -1.3398991823196411, "logits/rejected": -1.3398991823196411, "logps/chosen": -43.861549377441406, "logps/rejected": -43.861549377441406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.6599596738815308, "rewards/margins": 0.0, "rewards/rejected": -1.6599596738815308, "step": 106 }, { "epoch": 1.1263157894736842, "grad_norm": 2.5885437935357913e-06, "learning_rate": 0.0001978526315789474, "logits/chosen": -0.29697465896606445, "logits/rejected": -0.29697465896606445, "logps/chosen": -55.49425506591797, "logps/rejected": -55.49425506591797, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -2.7022387981414795, "rewards/margins": 0.0, "rewards/rejected": -2.7022387981414795, "step": 107 }, { "epoch": 1.1368421052631579, "grad_norm": 1.910174660224584e-06, "learning_rate": 0.00019783157894736844, "logits/chosen": -0.7355058193206787, "logits/rejected": -0.7355058193206787, "logps/chosen": -46.72576141357422, "logps/rejected": -46.72576141357422, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.7542778253555298, "rewards/margins": 0.0, "rewards/rejected": -1.7542778253555298, "step": 108 }, { "epoch": 1.1473684210526316, "grad_norm": 1.6612037825325388e-06, "learning_rate": 0.00019781052631578949, "logits/chosen": -1.1229007244110107, "logits/rejected": -1.1229007244110107, "logps/chosen": -50.912113189697266, "logps/rejected": -50.912113189697266, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -2.0653324127197266, "rewards/margins": 0.0, "rewards/rejected": -2.0653324127197266, "step": 109 }, { "epoch": 1.1578947368421053, "grad_norm": 1.6714056982891634e-06, "learning_rate": 0.00019778947368421053, "logits/chosen": -1.4076998233795166, "logits/rejected": -1.4076998233795166, "logps/chosen": -46.0490837097168, "logps/rejected": -46.0490837097168, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -1.8787130117416382, "rewards/margins": 0.0, "rewards/rejected": -1.8787130117416382, "step": 110 }, { "epoch": 1.168421052631579, "grad_norm": 2.404258111710078e-06, "learning_rate": 0.00019776842105263158, "logits/chosen": -0.4164087772369385, "logits/rejected": -0.4164087772369385, "logps/chosen": -55.265403747558594, "logps/rejected": -55.265403747558594, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -2.699219226837158, "rewards/margins": 0.0, "rewards/rejected": -2.699219226837158, "step": 111 }, { "epoch": 1.1789473684210527, "grad_norm": 2.5129768346232595e-06, "learning_rate": 0.00019774736842105263, "logits/chosen": -0.48479190468788147, "logits/rejected": -0.48479190468788147, "logps/chosen": -57.135475158691406, "logps/rejected": -57.135475158691406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -2.775113821029663, "rewards/margins": 0.0, "rewards/rejected": -2.775113821029663, "step": 112 }, { "epoch": 1.1894736842105262, "grad_norm": 1.6344749838026473e-06, "learning_rate": 0.00019772631578947368, "logits/chosen": -1.1951954364776611, "logits/rejected": -1.1951954364776611, "logps/chosen": -53.28643798828125, "logps/rejected": -53.28643798828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -2.302764892578125, "rewards/margins": 0.0, "rewards/rejected": -2.302764892578125, "step": 113 }, { "epoch": 1.2, "grad_norm": 2.0741767912113573e-06, "learning_rate": 0.00019770526315789476, "logits/chosen": -1.1239030361175537, "logits/rejected": -1.1239030361175537, "logps/chosen": -53.50406265258789, "logps/rejected": -53.50406265258789, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -2.481299638748169, "rewards/margins": 0.0, "rewards/rejected": -2.481299638748169, "step": 114 }, { "epoch": 1.2105263157894737, "grad_norm": 3.643415084297885e-06, "learning_rate": 0.0001976842105263158, "logits/chosen": -0.4708452522754669, "logits/rejected": -0.4708452522754669, "logps/chosen": -63.93821334838867, "logps/rejected": -63.93821334838867, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -3.5466346740722656, "rewards/margins": 0.0, "rewards/rejected": -3.5466346740722656, "step": 115 }, { "epoch": 1.2210526315789474, "grad_norm": 3.5905320601159474e-06, "learning_rate": 0.00019766315789473686, "logits/chosen": -0.44383522868156433, "logits/rejected": -0.44383522868156433, "logps/chosen": -59.75299072265625, "logps/rejected": -59.75299072265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -3.147977828979492, "rewards/margins": 0.0, "rewards/rejected": -3.147977828979492, "step": 116 }, { "epoch": 1.231578947368421, "grad_norm": 3.9981105146580376e-06, "learning_rate": 0.0001976421052631579, "logits/chosen": -0.551936686038971, "logits/rejected": -0.551936686038971, "logps/chosen": -61.86214828491211, "logps/rejected": -61.86214828491211, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -3.2477810382843018, "rewards/margins": 0.0, "rewards/rejected": -3.2477810382843018, "step": 117 }, { "epoch": 1.2421052631578948, "grad_norm": 3.1762576782057295e-06, "learning_rate": 0.00019762105263157896, "logits/chosen": -1.1889318227767944, "logits/rejected": -1.1889318227767944, "logps/chosen": -57.67156219482422, "logps/rejected": -57.67156219482422, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -2.8980495929718018, "rewards/margins": 0.0, "rewards/rejected": -2.8980495929718018, "step": 118 }, { "epoch": 1.2526315789473683, "grad_norm": 1.6450478597107576e-06, "learning_rate": 0.0001976, "logits/chosen": -1.4779409170150757, "logits/rejected": -1.4779409170150757, "logps/chosen": -52.22726058959961, "logps/rejected": -52.22726058959961, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -2.496530771255493, "rewards/margins": 0.0, "rewards/rejected": -2.496530771255493, "step": 119 }, { "epoch": 1.263157894736842, "grad_norm": 4.226046712574316e-06, "learning_rate": 0.00019757894736842105, "logits/chosen": -0.6531275510787964, "logits/rejected": -0.6531275510787964, "logps/chosen": -72.43806457519531, "logps/rejected": -72.43806457519531, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -4.39661979675293, "rewards/margins": 0.0, "rewards/rejected": -4.39661979675293, "step": 120 }, { "epoch": 1.2736842105263158, "grad_norm": 4.228716534271371e-06, "learning_rate": 0.00019755789473684213, "logits/chosen": -0.6977147459983826, "logits/rejected": -0.6977147459983826, "logps/chosen": -74.83479309082031, "logps/rejected": -74.83479309082031, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -4.636292934417725, "rewards/margins": 0.0, "rewards/rejected": -4.636292934417725, "step": 121 }, { "epoch": 1.2842105263157895, "grad_norm": 4.3411905608081724e-06, "learning_rate": 0.00019753684210526318, "logits/chosen": -0.5020732879638672, "logits/rejected": -0.5020732879638672, "logps/chosen": -68.32476806640625, "logps/rejected": -68.32476806640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -4.005155563354492, "rewards/margins": 0.0, "rewards/rejected": -4.005155563354492, "step": 122 }, { "epoch": 1.2947368421052632, "grad_norm": 5.095363121654373e-06, "learning_rate": 0.0001975157894736842, "logits/chosen": -0.7868385314941406, "logits/rejected": -0.7868385314941406, "logps/chosen": -80.11665344238281, "logps/rejected": -80.11665344238281, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -5.164478778839111, "rewards/margins": 0.0, "rewards/rejected": -5.164478778839111, "step": 123 }, { "epoch": 1.305263157894737, "grad_norm": 2.0824199964408763e-06, "learning_rate": 0.00019749473684210528, "logits/chosen": -1.533855676651001, "logits/rejected": -1.533855676651001, "logps/chosen": -49.14099884033203, "logps/rejected": -49.14099884033203, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -2.1325173377990723, "rewards/margins": 0.0, "rewards/rejected": -2.1325173377990723, "step": 124 }, { "epoch": 1.3157894736842106, "grad_norm": 5.472765678860014e-06, "learning_rate": 0.00019747368421052633, "logits/chosen": -1.0018510818481445, "logits/rejected": -1.0018510818481445, "logps/chosen": -57.671409606933594, "logps/rejected": -57.671409606933594, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -2.582261800765991, "rewards/margins": 0.0, "rewards/rejected": -2.582261800765991, "step": 125 }, { "epoch": 1.3263157894736843, "grad_norm": 4.48773016614723e-06, "learning_rate": 0.00019745263157894738, "logits/chosen": -1.3305102586746216, "logits/rejected": -1.3305102586746216, "logps/chosen": -69.93067932128906, "logps/rejected": -69.93067932128906, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -4.123961448669434, "rewards/margins": 0.0, "rewards/rejected": -4.123961448669434, "step": 126 }, { "epoch": 1.3368421052631578, "grad_norm": 4.489726961764973e-06, "learning_rate": 0.00019743157894736843, "logits/chosen": -1.3374897241592407, "logits/rejected": -1.3374897241592407, "logps/chosen": -72.15799713134766, "logps/rejected": -72.15799713134766, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -4.34669303894043, "rewards/margins": 0.0, "rewards/rejected": -4.34669303894043, "step": 127 }, { "epoch": 1.3473684210526315, "grad_norm": 2.618641701701563e-06, "learning_rate": 0.0001974105263157895, "logits/chosen": -1.471704125404358, "logits/rejected": -1.471704125404358, "logps/chosen": -68.04132080078125, "logps/rejected": -68.04132080078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -3.7782533168792725, "rewards/margins": 0.0, "rewards/rejected": -3.7782533168792725, "step": 128 }, { "epoch": 1.3578947368421053, "grad_norm": 2.595986188680399e-06, "learning_rate": 0.00019738947368421055, "logits/chosen": -1.5000438690185547, "logits/rejected": -1.5000438690185547, "logps/chosen": -60.63397216796875, "logps/rejected": -60.63397216796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -3.3372018337249756, "rewards/margins": 0.0, "rewards/rejected": -3.3372018337249756, "step": 129 }, { "epoch": 1.368421052631579, "grad_norm": 4.121559413761133e-06, "learning_rate": 0.00019736842105263157, "logits/chosen": -0.7774177193641663, "logits/rejected": -0.7774177193641663, "logps/chosen": -79.11760711669922, "logps/rejected": -79.11760711669922, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -4.973327159881592, "rewards/margins": 0.0, "rewards/rejected": -4.973327159881592, "step": 130 }, { "epoch": 1.3789473684210527, "grad_norm": 2.6745083232526667e-06, "learning_rate": 0.00019734736842105262, "logits/chosen": -1.5010743141174316, "logits/rejected": -1.5010743141174316, "logps/chosen": -72.77798461914062, "logps/rejected": -72.77798461914062, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -4.251919746398926, "rewards/margins": 0.0, "rewards/rejected": -4.251919746398926, "step": 131 }, { "epoch": 1.3894736842105262, "grad_norm": 6.172317171149189e-06, "learning_rate": 0.0001973263157894737, "logits/chosen": -0.6379980444908142, "logits/rejected": -0.6379980444908142, "logps/chosen": -86.76994323730469, "logps/rejected": -86.76994323730469, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -5.849673271179199, "rewards/margins": 0.0, "rewards/rejected": -5.849673271179199, "step": 132 }, { "epoch": 1.4, "grad_norm": 2.97183146358293e-06, "learning_rate": 0.00019730526315789475, "logits/chosen": -1.270950436592102, "logits/rejected": -1.270950436592102, "logps/chosen": -66.19747161865234, "logps/rejected": -66.19747161865234, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -3.701448917388916, "rewards/margins": 0.0, "rewards/rejected": -3.701448917388916, "step": 133 }, { "epoch": 1.4105263157894736, "grad_norm": 3.7268039250193397e-06, "learning_rate": 0.0001972842105263158, "logits/chosen": -1.5355697870254517, "logits/rejected": -1.5355697870254517, "logps/chosen": -78.51725769042969, "logps/rejected": -78.51725769042969, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -4.825847148895264, "rewards/margins": 0.0, "rewards/rejected": -4.825847148895264, "step": 134 }, { "epoch": 1.4210526315789473, "grad_norm": 7.989572623046115e-06, "learning_rate": 0.00019726315789473685, "logits/chosen": -0.9482132792472839, "logits/rejected": -0.9482132792472839, "logps/chosen": -105.5180435180664, "logps/rejected": -105.5180435180664, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -7.704617500305176, "rewards/margins": 0.0, "rewards/rejected": -7.704617500305176, "step": 135 }, { "epoch": 1.431578947368421, "grad_norm": 5.123813025420532e-06, "learning_rate": 0.0001972421052631579, "logits/chosen": -0.8642720580101013, "logits/rejected": -0.8642720580101013, "logps/chosen": -89.34291076660156, "logps/rejected": -89.34291076660156, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -5.995857238769531, "rewards/margins": 0.0, "rewards/rejected": -5.995857238769531, "step": 136 }, { "epoch": 1.4421052631578948, "grad_norm": 7.485965397791006e-06, "learning_rate": 0.00019722105263157895, "logits/chosen": -1.2587380409240723, "logits/rejected": -1.2587380409240723, "logps/chosen": -79.7330322265625, "logps/rejected": -79.7330322265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -4.788424015045166, "rewards/margins": 0.0, "rewards/rejected": -4.788424015045166, "step": 137 }, { "epoch": 1.4526315789473685, "grad_norm": 1.5202248278001207e-06, "learning_rate": 0.0001972, "logits/chosen": -1.5258516073226929, "logits/rejected": -1.5258516073226929, "logps/chosen": -57.422080993652344, "logps/rejected": -57.422080993652344, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -2.960625410079956, "rewards/margins": 0.0, "rewards/rejected": -2.960625410079956, "step": 138 }, { "epoch": 1.4631578947368422, "grad_norm": 5.481057996803429e-06, "learning_rate": 0.00019717894736842107, "logits/chosen": -1.6797330379486084, "logits/rejected": -1.6797330379486084, "logps/chosen": -85.02195739746094, "logps/rejected": -85.02195739746094, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -5.704831123352051, "rewards/margins": 0.0, "rewards/rejected": -5.704831123352051, "step": 139 }, { "epoch": 1.4736842105263157, "grad_norm": 1.5807910358489607e-06, "learning_rate": 0.00019715789473684212, "logits/chosen": -1.524339199066162, "logits/rejected": -1.524339199066162, "logps/chosen": -59.21200942993164, "logps/rejected": -59.21200942993164, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -3.139618396759033, "rewards/margins": 0.0, "rewards/rejected": -3.139618396759033, "step": 140 }, { "epoch": 1.4842105263157894, "grad_norm": 1.2956250429851934e-05, "learning_rate": 0.00019713684210526317, "logits/chosen": -1.0321005582809448, "logits/rejected": -1.0321005582809448, "logps/chosen": -124.25344848632812, "logps/rejected": -124.25344848632812, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -9.578158378601074, "rewards/margins": 0.0, "rewards/rejected": -9.578158378601074, "step": 141 }, { "epoch": 1.4947368421052631, "grad_norm": 4.446477305464214e-06, "learning_rate": 0.00019711578947368422, "logits/chosen": -1.4601666927337646, "logits/rejected": -1.4601666927337646, "logps/chosen": -109.08540344238281, "logps/rejected": -109.08540344238281, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -8.039433479309082, "rewards/margins": 0.0, "rewards/rejected": -8.039433479309082, "step": 142 }, { "epoch": 1.5052631578947369, "grad_norm": 6.148378815851174e-06, "learning_rate": 0.00019709473684210527, "logits/chosen": -1.7236757278442383, "logits/rejected": -1.7236757278442383, "logps/chosen": -95.50958251953125, "logps/rejected": -95.50958251953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -6.753593444824219, "rewards/margins": 0.0, "rewards/rejected": -6.753593444824219, "step": 143 }, { "epoch": 1.5157894736842106, "grad_norm": 2.541372623454663e-06, "learning_rate": 0.00019707368421052632, "logits/chosen": -1.5324490070343018, "logits/rejected": -1.5324490070343018, "logps/chosen": -63.43986892700195, "logps/rejected": -63.43986892700195, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -3.5624043941497803, "rewards/margins": 0.0, "rewards/rejected": -3.5624043941497803, "step": 144 }, { "epoch": 1.526315789473684, "grad_norm": 1.5561738109681755e-05, "learning_rate": 0.00019705263157894737, "logits/chosen": -1.092746615409851, "logits/rejected": -1.092746615409851, "logps/chosen": -140.5499267578125, "logps/rejected": -140.5499267578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -11.207806587219238, "rewards/margins": 0.0, "rewards/rejected": -11.207806587219238, "step": 145 }, { "epoch": 1.5368421052631578, "grad_norm": 1.0378664228483103e-05, "learning_rate": 0.00019703157894736844, "logits/chosen": -1.0650830268859863, "logits/rejected": -1.0650830268859863, "logps/chosen": -115.74560546875, "logps/rejected": -115.74560546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -8.636126518249512, "rewards/margins": 0.0, "rewards/rejected": -8.636126518249512, "step": 146 }, { "epoch": 1.5473684210526315, "grad_norm": 7.213519438664662e-06, "learning_rate": 0.0001970105263157895, "logits/chosen": -1.0977295637130737, "logits/rejected": -1.0977295637130737, "logps/chosen": -120.56465148925781, "logps/rejected": -120.56465148925781, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -9.11803150177002, "rewards/margins": 0.0, "rewards/rejected": -9.11803150177002, "step": 147 }, { "epoch": 1.5578947368421052, "grad_norm": 1.71927113115089e-05, "learning_rate": 0.00019698947368421054, "logits/chosen": -1.1391836404800415, "logits/rejected": -1.1391836404800415, "logps/chosen": -156.61782836914062, "logps/rejected": -156.61782836914062, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -12.814597129821777, "rewards/margins": 0.0, "rewards/rejected": -12.814597129821777, "step": 148 }, { "epoch": 1.568421052631579, "grad_norm": 1.0912938705587294e-05, "learning_rate": 0.0001969684210526316, "logits/chosen": -1.1804676055908203, "logits/rejected": -1.1804676055908203, "logps/chosen": -132.6087646484375, "logps/rejected": -132.6087646484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -10.322443008422852, "rewards/margins": 0.0, "rewards/rejected": -10.322443008422852, "step": 149 }, { "epoch": 1.5789473684210527, "grad_norm": 5.256761141936295e-06, "learning_rate": 0.00019694736842105264, "logits/chosen": -1.7506847381591797, "logits/rejected": -1.7506847381591797, "logps/chosen": -113.24552154541016, "logps/rejected": -113.24552154541016, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -8.298673629760742, "rewards/margins": 0.0, "rewards/rejected": -8.298673629760742, "step": 150 }, { "epoch": 1.5789473684210527, "eval_logits/chosen": -1.4358314275741577, "eval_logits/rejected": -1.4358314275741577, "eval_logps/chosen": -133.2693634033203, "eval_logps/rejected": -133.2693634033203, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -10.42375659942627, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -10.42375659942627, "eval_runtime": 4.4621, "eval_samples_per_second": 2.241, "eval_steps_per_second": 2.241, "step": 150 }, { "epoch": 1.5894736842105264, "grad_norm": 5.490222520165844e-06, "learning_rate": 0.0001969263157894737, "logits/chosen": -1.763961672782898, "logits/rejected": -1.763961672782898, "logps/chosen": -116.49879455566406, "logps/rejected": -116.49879455566406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -8.624000549316406, "rewards/margins": 0.0, "rewards/rejected": -8.624000549316406, "step": 151 }, { "epoch": 1.6, "grad_norm": 9.477753337705508e-06, "learning_rate": 0.00019690526315789474, "logits/chosen": -1.051688551902771, "logits/rejected": -1.051688551902771, "logps/chosen": -140.0754852294922, "logps/rejected": -140.0754852294922, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -11.180228233337402, "rewards/margins": 0.0, "rewards/rejected": -11.180228233337402, "step": 152 }, { "epoch": 1.6105263157894738, "grad_norm": 9.789489013201091e-06, "learning_rate": 0.00019688421052631582, "logits/chosen": -1.5712676048278809, "logits/rejected": -1.5712676048278809, "logps/chosen": -134.6369171142578, "logps/rejected": -134.6369171142578, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -10.278812408447266, "rewards/margins": 0.0, "rewards/rejected": -10.278812408447266, "step": 153 }, { "epoch": 1.6210526315789475, "grad_norm": 1.5471950973733328e-05, "learning_rate": 0.00019686315789473687, "logits/chosen": -1.3094755411148071, "logits/rejected": -1.3094755411148071, "logps/chosen": -167.6383819580078, "logps/rejected": -167.6383819580078, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -13.825404167175293, "rewards/margins": 0.0, "rewards/rejected": -13.825404167175293, "step": 154 }, { "epoch": 1.631578947368421, "grad_norm": 2.7842013423651224e-06, "learning_rate": 0.0001968421052631579, "logits/chosen": -1.6425766944885254, "logits/rejected": -1.6425766944885254, "logps/chosen": -79.62688446044922, "logps/rejected": -79.62688446044922, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -5.181106090545654, "rewards/margins": 0.0, "rewards/rejected": -5.181106090545654, "step": 155 }, { "epoch": 1.6421052631578947, "grad_norm": 5.890120519325137e-06, "learning_rate": 0.00019682105263157896, "logits/chosen": -1.5033855438232422, "logits/rejected": -1.5033855438232422, "logps/chosen": -121.18205261230469, "logps/rejected": -121.18205261230469, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -9.199907302856445, "rewards/margins": 0.0, "rewards/rejected": -9.199907302856445, "step": 156 }, { "epoch": 1.6526315789473685, "grad_norm": 5.56485565539333e-06, "learning_rate": 0.0001968, "logits/chosen": -1.4966881275177002, "logits/rejected": -1.4966881275177002, "logps/chosen": -126.2377700805664, "logps/rejected": -126.2377700805664, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -9.70547866821289, "rewards/margins": 0.0, "rewards/rejected": -9.70547866821289, "step": 157 }, { "epoch": 1.663157894736842, "grad_norm": 6.466461854870431e-06, "learning_rate": 0.00019677894736842106, "logits/chosen": -1.487974762916565, "logits/rejected": -1.487974762916565, "logps/chosen": -132.53555297851562, "logps/rejected": -132.53555297851562, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -10.335257530212402, "rewards/margins": 0.0, "rewards/rejected": -10.335257530212402, "step": 158 }, { "epoch": 1.6736842105263157, "grad_norm": 3.8276161831163336e-06, "learning_rate": 0.0001967578947368421, "logits/chosen": -1.6804078817367554, "logits/rejected": -1.6804078817367554, "logps/chosen": -91.00701904296875, "logps/rejected": -91.00701904296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -6.319119453430176, "rewards/margins": 0.0, "rewards/rejected": -6.319119453430176, "step": 159 }, { "epoch": 1.6842105263157894, "grad_norm": 1.2967062502866611e-05, "learning_rate": 0.0001967368421052632, "logits/chosen": -1.1012163162231445, "logits/rejected": -1.1012163162231445, "logps/chosen": -182.14588928222656, "logps/rejected": -182.14588928222656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -15.38726806640625, "rewards/margins": 0.0, "rewards/rejected": -15.38726806640625, "step": 160 }, { "epoch": 1.694736842105263, "grad_norm": 7.505981557187624e-06, "learning_rate": 0.0001967157894736842, "logits/chosen": -1.452441692352295, "logits/rejected": -1.452441692352295, "logps/chosen": -153.8447265625, "logps/rejected": -153.8447265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -12.466174125671387, "rewards/margins": 0.0, "rewards/rejected": -12.466174125671387, "step": 161 }, { "epoch": 1.7052631578947368, "grad_norm": 1.2408400834829081e-05, "learning_rate": 0.00019669473684210526, "logits/chosen": -1.0768696069717407, "logits/rejected": -1.0768696069717407, "logps/chosen": -195.4624786376953, "logps/rejected": -195.4624786376953, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -16.71892738342285, "rewards/margins": 0.0, "rewards/rejected": -16.71892738342285, "step": 162 }, { "epoch": 1.7157894736842105, "grad_norm": 1.3855148608854506e-05, "learning_rate": 0.0001966736842105263, "logits/chosen": -1.061508059501648, "logits/rejected": -1.061508059501648, "logps/chosen": -203.4642333984375, "logps/rejected": -203.4642333984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -17.519102096557617, "rewards/margins": 0.0, "rewards/rejected": -17.519102096557617, "step": 163 }, { "epoch": 1.7263157894736842, "grad_norm": 5.5583345783816185e-06, "learning_rate": 0.00019665263157894739, "logits/chosen": -1.4160618782043457, "logits/rejected": -1.4160618782043457, "logps/chosen": -187.67039489746094, "logps/rejected": -187.67039489746094, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -15.897933006286621, "rewards/margins": 0.0, "rewards/rejected": -15.897933006286621, "step": 164 }, { "epoch": 1.736842105263158, "grad_norm": 9.477433195570484e-06, "learning_rate": 0.00019663157894736843, "logits/chosen": -1.9752657413482666, "logits/rejected": -1.9752657413482666, "logps/chosen": -128.8132781982422, "logps/rejected": -128.8132781982422, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -10.155132293701172, "rewards/margins": 0.0, "rewards/rejected": -10.155132293701172, "step": 165 }, { "epoch": 1.7473684210526317, "grad_norm": 1.932295890583191e-05, "learning_rate": 0.00019661052631578948, "logits/chosen": -1.5955842733383179, "logits/rejected": -1.5955842733383179, "logps/chosen": -228.4840850830078, "logps/rejected": -228.4840850830078, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -19.663528442382812, "rewards/margins": 0.0, "rewards/rejected": -19.663528442382812, "step": 166 }, { "epoch": 1.7578947368421054, "grad_norm": 1.5974874258972704e-05, "learning_rate": 0.00019658947368421053, "logits/chosen": -1.0810773372650146, "logits/rejected": -1.0810773372650146, "logps/chosen": -252.7413787841797, "logps/rejected": -252.7413787841797, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -22.335704803466797, "rewards/margins": 0.0, "rewards/rejected": -22.335704803466797, "step": 167 }, { "epoch": 1.768421052631579, "grad_norm": 9.09858317754697e-06, "learning_rate": 0.00019656842105263158, "logits/chosen": -1.4648233652114868, "logits/rejected": -1.4648233652114868, "logps/chosen": -198.26095581054688, "logps/rejected": -198.26095581054688, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -16.800216674804688, "rewards/margins": 0.0, "rewards/rejected": -16.800216674804688, "step": 168 }, { "epoch": 1.7789473684210526, "grad_norm": 1.4740438928129151e-05, "learning_rate": 0.00019654736842105263, "logits/chosen": -1.0363273620605469, "logits/rejected": -1.0363273620605469, "logps/chosen": -286.6956787109375, "logps/rejected": -286.6956787109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -25.8223819732666, "rewards/margins": 0.0, "rewards/rejected": -25.8223819732666, "step": 169 }, { "epoch": 1.7894736842105263, "grad_norm": 1.0695047421904746e-05, "learning_rate": 0.00019652631578947368, "logits/chosen": -0.8622230887413025, "logits/rejected": -0.8622230887413025, "logps/chosen": -252.5042266845703, "logps/rejected": -252.5042266845703, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -22.4231014251709, "rewards/margins": 0.0, "rewards/rejected": -22.4231014251709, "step": 170 }, { "epoch": 1.8, "grad_norm": 8.539610462321434e-06, "learning_rate": 0.00019650526315789476, "logits/chosen": -1.2167575359344482, "logits/rejected": -1.2167575359344482, "logps/chosen": -221.032958984375, "logps/rejected": -221.032958984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -19.077417373657227, "rewards/margins": 0.0, "rewards/rejected": -19.077417373657227, "step": 171 }, { "epoch": 1.8105263157894735, "grad_norm": 1.6939706256380305e-05, "learning_rate": 0.0001964842105263158, "logits/chosen": -0.8433440923690796, "logits/rejected": -0.8433440923690796, "logps/chosen": -298.406982421875, "logps/rejected": -298.406982421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -26.902265548706055, "rewards/margins": 0.0, "rewards/rejected": -26.902265548706055, "step": 172 }, { "epoch": 1.8210526315789473, "grad_norm": 7.195099442469655e-06, "learning_rate": 0.00019646315789473686, "logits/chosen": -1.062929630279541, "logits/rejected": -1.062929630279541, "logps/chosen": -232.75599670410156, "logps/rejected": -232.75599670410156, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -20.24972152709961, "rewards/margins": 0.0, "rewards/rejected": -20.24972152709961, "step": 173 }, { "epoch": 1.831578947368421, "grad_norm": 7.192861630755942e-06, "learning_rate": 0.0001964421052631579, "logits/chosen": -1.0091142654418945, "logits/rejected": -1.0091142654418945, "logps/chosen": -240.50201416015625, "logps/rejected": -240.50201416015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -21.024322509765625, "rewards/margins": 0.0, "rewards/rejected": -21.024322509765625, "step": 174 }, { "epoch": 1.8421052631578947, "grad_norm": 1.6050831618485972e-05, "learning_rate": 0.00019642105263157895, "logits/chosen": -0.6946796774864197, "logits/rejected": -0.6946796774864197, "logps/chosen": -343.2923889160156, "logps/rejected": -343.2923889160156, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -31.390806198120117, "rewards/margins": 0.0, "rewards/rejected": -31.390806198120117, "step": 175 }, { "epoch": 1.8526315789473684, "grad_norm": 1.0542914424149785e-05, "learning_rate": 0.0001964, "logits/chosen": -0.6068615317344666, "logits/rejected": -0.6068615317344666, "logps/chosen": -293.0101623535156, "logps/rejected": -293.0101623535156, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -26.47369384765625, "rewards/margins": 0.0, "rewards/rejected": -26.47369384765625, "step": 176 }, { "epoch": 1.8631578947368421, "grad_norm": 6.1548585108539555e-06, "learning_rate": 0.00019637894736842105, "logits/chosen": -0.8551391363143921, "logits/rejected": -0.8551391363143921, "logps/chosen": -266.0399169921875, "logps/rejected": -266.0399169921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -23.578113555908203, "rewards/margins": 0.0, "rewards/rejected": -23.578113555908203, "step": 177 }, { "epoch": 1.8736842105263158, "grad_norm": 1.620031071070116e-05, "learning_rate": 0.00019635789473684213, "logits/chosen": -0.5634011626243591, "logits/rejected": -0.5634011626243591, "logps/chosen": -378.9129943847656, "logps/rejected": -378.9129943847656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -35.04411315917969, "rewards/margins": 0.0, "rewards/rejected": -35.04411315917969, "step": 178 }, { "epoch": 1.8842105263157896, "grad_norm": 1.1069708307331894e-05, "learning_rate": 0.00019633684210526318, "logits/chosen": -0.471484899520874, "logits/rejected": -0.471484899520874, "logps/chosen": -324.33038330078125, "logps/rejected": -324.33038330078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -29.605716705322266, "rewards/margins": 0.0, "rewards/rejected": -29.605716705322266, "step": 179 }, { "epoch": 1.8947368421052633, "grad_norm": 1.13854057417484e-05, "learning_rate": 0.00019631578947368423, "logits/chosen": -0.4289996027946472, "logits/rejected": -0.4289996027946472, "logps/chosen": -338.95654296875, "logps/rejected": -338.95654296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -31.06833267211914, "rewards/margins": 0.0, "rewards/rejected": -31.06833267211914, "step": 180 }, { "epoch": 1.905263157894737, "grad_norm": 7.206224836409092e-06, "learning_rate": 0.00019629473684210528, "logits/chosen": -0.637728214263916, "logits/rejected": -0.637728214263916, "logps/chosen": -293.6654052734375, "logps/rejected": -293.6654052734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -26.340662002563477, "rewards/margins": 0.0, "rewards/rejected": -26.340662002563477, "step": 181 }, { "epoch": 1.9157894736842105, "grad_norm": 7.431865924445447e-06, "learning_rate": 0.00019627368421052633, "logits/chosen": -0.798254132270813, "logits/rejected": -0.798254132270813, "logps/chosen": -211.24868774414062, "logps/rejected": -211.24868774414062, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -18.343286514282227, "rewards/margins": 0.0, "rewards/rejected": -18.343286514282227, "step": 182 }, { "epoch": 1.9263157894736842, "grad_norm": 1.6567742932238616e-05, "learning_rate": 0.00019625263157894738, "logits/chosen": -0.25748512148857117, "logits/rejected": -0.25748512148857117, "logps/chosen": -448.50189208984375, "logps/rejected": -448.50189208984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -42.00300216674805, "rewards/margins": 0.0, "rewards/rejected": -42.00300216674805, "step": 183 }, { "epoch": 1.936842105263158, "grad_norm": 1.677876389294397e-05, "learning_rate": 0.00019623157894736842, "logits/chosen": -0.15099215507507324, "logits/rejected": -0.15099215507507324, "logps/chosen": -471.9569091796875, "logps/rejected": -471.9569091796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -44.348506927490234, "rewards/margins": 0.0, "rewards/rejected": -44.348506927490234, "step": 184 }, { "epoch": 1.9473684210526314, "grad_norm": 8.614895705250092e-06, "learning_rate": 0.0001962105263157895, "logits/chosen": -0.37013179063796997, "logits/rejected": -0.37013179063796997, "logps/chosen": -238.4066162109375, "logps/rejected": -238.4066162109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -21.11446762084961, "rewards/margins": 0.0, "rewards/rejected": -21.11446762084961, "step": 185 }, { "epoch": 1.9578947368421051, "grad_norm": 3.001785989908967e-05, "learning_rate": 0.00019618947368421055, "logits/chosen": 0.251768559217453, "logits/rejected": 0.251768559217453, "logps/chosen": -510.8426208496094, "logps/rejected": -510.8426208496094, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -48.14582824707031, "rewards/margins": 0.0, "rewards/rejected": -48.14582824707031, "step": 186 }, { "epoch": 1.9684210526315788, "grad_norm": 2.127635343640577e-05, "learning_rate": 0.00019616842105263157, "logits/chosen": -0.1367361694574356, "logits/rejected": -0.1367361694574356, "logps/chosen": -355.6160583496094, "logps/rejected": -355.6160583496094, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -32.76424026489258, "rewards/margins": 0.0, "rewards/rejected": -32.76424026489258, "step": 187 }, { "epoch": 1.9789473684210526, "grad_norm": 2.330858842469752e-05, "learning_rate": 0.00019614736842105262, "logits/chosen": 0.042760226875543594, "logits/rejected": 0.042760226875543594, "logps/chosen": -382.27593994140625, "logps/rejected": -382.27593994140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -35.430233001708984, "rewards/margins": 0.0, "rewards/rejected": -35.430233001708984, "step": 188 }, { "epoch": 1.9894736842105263, "grad_norm": 1.5866742614889517e-05, "learning_rate": 0.0001961263157894737, "logits/chosen": 0.2904311418533325, "logits/rejected": 0.2904311418533325, "logps/chosen": -351.369873046875, "logps/rejected": -351.369873046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -32.267879486083984, "rewards/margins": 0.0, "rewards/rejected": -32.267879486083984, "step": 189 }, { "epoch": 2.0, "grad_norm": 1.4786298379476648e-05, "learning_rate": 0.00019610526315789475, "logits/chosen": 0.43049368262290955, "logits/rejected": 0.43049368262290955, "logps/chosen": -276.5857238769531, "logps/rejected": -276.5857238769531, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -24.876989364624023, "rewards/margins": 0.0, "rewards/rejected": -24.876989364624023, "step": 190 }, { "epoch": 2.0105263157894737, "grad_norm": NaN, "learning_rate": 0.00019610526315789475, "logits/chosen": 1.0581265687942505, "logits/rejected": 1.0581265687942505, "logps/chosen": -421.4101867675781, "logps/rejected": -421.4101867675781, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -38.95614242553711, "rewards/margins": 0.0, "rewards/rejected": -38.95614242553711, "step": 191 }, { "epoch": 2.0210526315789474, "grad_norm": NaN, "learning_rate": 0.00019610526315789475, "logits/chosen": 2.1765575408935547, "logits/rejected": 2.1765575408935547, "logps/chosen": -624.4765625, "logps/rejected": -624.4765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -59.50922393798828, "rewards/margins": 0.0, "rewards/rejected": -59.50922393798828, "step": 192 }, { "epoch": 2.031578947368421, "grad_norm": 1.6681724446243607e-05, "learning_rate": 0.0001960842105263158, "logits/chosen": 0.8768024444580078, "logits/rejected": 0.8768024444580078, "logps/chosen": -303.2098388671875, "logps/rejected": -303.2098388671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -27.53940200805664, "rewards/margins": 0.0, "rewards/rejected": -27.53940200805664, "step": 193 }, { "epoch": 2.042105263157895, "grad_norm": 2.653021874721162e-05, "learning_rate": 0.00019606315789473687, "logits/chosen": 2.153998851776123, "logits/rejected": 2.153998851776123, "logps/chosen": -414.113037109375, "logps/rejected": -414.113037109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -38.49300765991211, "rewards/margins": 0.0, "rewards/rejected": -38.49300765991211, "step": 194 }, { "epoch": 2.0526315789473686, "grad_norm": 5.0161706894868985e-05, "learning_rate": 0.0001960421052631579, "logits/chosen": 2.193232297897339, "logits/rejected": 2.193232297897339, "logps/chosen": -481.1329345703125, "logps/rejected": -481.1329345703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -44.92841720581055, "rewards/margins": 0.0, "rewards/rejected": -44.92841720581055, "step": 195 }, { "epoch": 2.0631578947368423, "grad_norm": 4.694354356615804e-05, "learning_rate": 0.00019602105263157894, "logits/chosen": 3.333601474761963, "logits/rejected": 3.333601474761963, "logps/chosen": -658.974853515625, "logps/rejected": -658.974853515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -62.95905303955078, "rewards/margins": 0.0, "rewards/rejected": -62.95905303955078, "step": 196 }, { "epoch": 2.0736842105263156, "grad_norm": 3.98053161916323e-05, "learning_rate": 0.000196, "logits/chosen": 3.93233060836792, "logits/rejected": 3.93233060836792, "logps/chosen": -516.2655029296875, "logps/rejected": -516.2655029296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -48.757442474365234, "rewards/margins": 0.0, "rewards/rejected": -48.757442474365234, "step": 197 }, { "epoch": 2.0842105263157893, "grad_norm": NaN, "learning_rate": 0.000196, "logits/chosen": 4.583979606628418, "logits/rejected": 4.583979606628418, "logps/chosen": -566.2434692382812, "logps/rejected": -566.2434692382812, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -53.797027587890625, "rewards/margins": 0.0, "rewards/rejected": -53.797027587890625, "step": 198 }, { "epoch": 2.094736842105263, "grad_norm": 4.385167267173529e-05, "learning_rate": 0.00019597894736842107, "logits/chosen": 4.913366794586182, "logits/rejected": 4.913366794586182, "logps/chosen": -737.3299560546875, "logps/rejected": -737.3299560546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -70.88581085205078, "rewards/margins": 0.0, "rewards/rejected": -70.88581085205078, "step": 199 }, { "epoch": 2.1052631578947367, "grad_norm": 4.879558764514513e-05, "learning_rate": 0.00019595789473684212, "logits/chosen": 5.830333232879639, "logits/rejected": 5.830333232879639, "logps/chosen": -820.52880859375, "logps/rejected": -820.52880859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -79.20569610595703, "rewards/margins": 0.0, "rewards/rejected": -79.20569610595703, "step": 200 }, { "epoch": 2.1052631578947367, "eval_logits/chosen": 6.817009925842285, "eval_logits/rejected": 6.817009925842285, "eval_logps/chosen": -809.7173461914062, "eval_logps/rejected": -809.7173461914062, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -78.06855773925781, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -78.06855773925781, "eval_runtime": 4.5433, "eval_samples_per_second": 2.201, "eval_steps_per_second": 2.201, "step": 200 }, { "epoch": 2.1157894736842104, "grad_norm": 2.9046863346593454e-05, "learning_rate": 0.00019593684210526317, "logits/chosen": 6.449033737182617, "logits/rejected": 6.449033737182617, "logps/chosen": -673.6071166992188, "logps/rejected": -673.6071166992188, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -64.49160766601562, "rewards/margins": 0.0, "rewards/rejected": -64.49160766601562, "step": 201 }, { "epoch": 2.126315789473684, "grad_norm": 5.5444274039473385e-05, "learning_rate": 0.00019591578947368422, "logits/chosen": 8.087075233459473, "logits/rejected": 8.087075233459473, "logps/chosen": -1076.608642578125, "logps/rejected": -1076.608642578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -104.7224349975586, "rewards/margins": 0.0, "rewards/rejected": -104.7224349975586, "step": 202 }, { "epoch": 2.136842105263158, "grad_norm": 3.140119224553928e-05, "learning_rate": 0.00019589473684210527, "logits/chosen": 8.09109878540039, "logits/rejected": 8.09109878540039, "logps/chosen": -531.2365112304688, "logps/rejected": -531.2365112304688, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -50.34206771850586, "rewards/margins": 0.0, "rewards/rejected": -50.34206771850586, "step": 203 }, { "epoch": 2.1473684210526316, "grad_norm": 5.327671897248365e-05, "learning_rate": 0.00019587368421052632, "logits/chosen": 9.973334312438965, "logits/rejected": 9.973334312438965, "logps/chosen": -1248.2523193359375, "logps/rejected": -1248.2523193359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -121.88679504394531, "rewards/margins": 0.0, "rewards/rejected": -121.88679504394531, "step": 204 }, { "epoch": 2.1578947368421053, "grad_norm": 1.3940511053078808e-05, "learning_rate": 0.00019585263157894737, "logits/chosen": 9.696736335754395, "logits/rejected": 9.696736335754395, "logps/chosen": -613.7274780273438, "logps/rejected": -613.7274780273438, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -58.591163635253906, "rewards/margins": 0.0, "rewards/rejected": -58.591163635253906, "step": 205 }, { "epoch": 2.168421052631579, "grad_norm": 2.2086864191805944e-05, "learning_rate": 0.00019583157894736844, "logits/chosen": 11.36934757232666, "logits/rejected": 11.36934757232666, "logps/chosen": -916.5037841796875, "logps/rejected": -916.5037841796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -88.62450408935547, "rewards/margins": 0.0, "rewards/rejected": -88.62450408935547, "step": 206 }, { "epoch": 2.1789473684210527, "grad_norm": 1.9237877495470457e-05, "learning_rate": 0.0001958105263157895, "logits/chosen": 11.874436378479004, "logits/rejected": 11.874436378479004, "logps/chosen": -958.419921875, "logps/rejected": -958.419921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -92.81611633300781, "rewards/margins": 0.0, "rewards/rejected": -92.81611633300781, "step": 207 }, { "epoch": 2.1894736842105265, "grad_norm": 1.4395719517779071e-05, "learning_rate": 0.00019578947368421054, "logits/chosen": 12.384649276733398, "logits/rejected": 12.384649276733398, "logps/chosen": -1001.62060546875, "logps/rejected": -1001.62060546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -97.13618469238281, "rewards/margins": 0.0, "rewards/rejected": -97.13618469238281, "step": 208 }, { "epoch": 2.2, "grad_norm": 2.323817170690745e-05, "learning_rate": 0.0001957684210526316, "logits/chosen": 12.605050086975098, "logits/rejected": 12.605050086975098, "logps/chosen": -1179.7540283203125, "logps/rejected": -1179.7540283203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -115.17803955078125, "rewards/margins": 0.0, "rewards/rejected": -115.17803955078125, "step": 209 }, { "epoch": 2.2105263157894735, "grad_norm": 2.976310861413367e-05, "learning_rate": 0.00019574736842105264, "logits/chosen": 13.146588325500488, "logits/rejected": 13.146588325500488, "logps/chosen": -1539.37890625, "logps/rejected": -1539.37890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -150.9994659423828, "rewards/margins": 0.0, "rewards/rejected": -150.9994659423828, "step": 210 }, { "epoch": 2.221052631578947, "grad_norm": 2.592038072180003e-05, "learning_rate": 0.0001957263157894737, "logits/chosen": 13.31721305847168, "logits/rejected": 13.31721305847168, "logps/chosen": -1449.75341796875, "logps/rejected": -1449.75341796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -142.12815856933594, "rewards/margins": 0.0, "rewards/rejected": -142.12815856933594, "step": 211 }, { "epoch": 2.231578947368421, "grad_norm": 1.8985463611898012e-05, "learning_rate": 0.00019570526315789474, "logits/chosen": 14.342009544372559, "logits/rejected": 14.342009544372559, "logps/chosen": -1189.3717041015625, "logps/rejected": -1189.3717041015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -116.10985565185547, "rewards/margins": 0.0, "rewards/rejected": -116.10985565185547, "step": 212 }, { "epoch": 2.2421052631578946, "grad_norm": 8.969174814410508e-06, "learning_rate": 0.00019568421052631581, "logits/chosen": 14.59449577331543, "logits/rejected": 14.59449577331543, "logps/chosen": -963.65234375, "logps/rejected": -963.65234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -93.6390380859375, "rewards/margins": 0.0, "rewards/rejected": -93.6390380859375, "step": 213 }, { "epoch": 2.2526315789473683, "grad_norm": 8.754281225265004e-06, "learning_rate": 0.00019566315789473686, "logits/chosen": 15.000722885131836, "logits/rejected": 15.000722885131836, "logps/chosen": -996.5911254882812, "logps/rejected": -996.5911254882812, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -96.93291473388672, "rewards/margins": 0.0, "rewards/rejected": -96.93291473388672, "step": 214 }, { "epoch": 2.263157894736842, "grad_norm": 2.284335823787842e-05, "learning_rate": 0.00019564210526315789, "logits/chosen": 15.543832778930664, "logits/rejected": 15.543832778930664, "logps/chosen": -1338.6514892578125, "logps/rejected": -1338.6514892578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -131.03782653808594, "rewards/margins": 0.0, "rewards/rejected": -131.03782653808594, "step": 215 }, { "epoch": 2.2736842105263158, "grad_norm": 8.72291548148496e-06, "learning_rate": 0.00019562105263157896, "logits/chosen": 15.671738624572754, "logits/rejected": 15.671738624572754, "logps/chosen": -1070.574951171875, "logps/rejected": -1070.574951171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -104.331298828125, "rewards/margins": 0.0, "rewards/rejected": -104.331298828125, "step": 216 }, { "epoch": 2.2842105263157895, "grad_norm": 1.989948759728577e-05, "learning_rate": 0.0001956, "logits/chosen": 16.083484649658203, "logits/rejected": 16.083484649658203, "logps/chosen": -1369.2926025390625, "logps/rejected": -1369.2926025390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -133.90338134765625, "rewards/margins": 0.0, "rewards/rejected": -133.90338134765625, "step": 217 }, { "epoch": 2.294736842105263, "grad_norm": 1.5831390555831604e-05, "learning_rate": 0.00019557894736842106, "logits/chosen": 16.6643123626709, "logits/rejected": 16.6643123626709, "logps/chosen": -1440.6026611328125, "logps/rejected": -1440.6026611328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -141.1419677734375, "rewards/margins": 0.0, "rewards/rejected": -141.1419677734375, "step": 218 }, { "epoch": 2.305263157894737, "grad_norm": 3.188383925589733e-05, "learning_rate": 0.0001955578947368421, "logits/chosen": 16.663028717041016, "logits/rejected": 16.663028717041016, "logps/chosen": -2150.78564453125, "logps/rejected": -2150.78564453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -212.14013671875, "rewards/margins": 0.0, "rewards/rejected": -212.14013671875, "step": 219 }, { "epoch": 2.3157894736842106, "grad_norm": 2.217062865383923e-05, "learning_rate": 0.00019553684210526319, "logits/chosen": 17.02804183959961, "logits/rejected": 17.02804183959961, "logps/chosen": -1657.6380615234375, "logps/rejected": -1657.6380615234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -162.8946990966797, "rewards/margins": 0.0, "rewards/rejected": -162.8946990966797, "step": 220 }, { "epoch": 2.3263157894736843, "grad_norm": 1.8672544683795422e-05, "learning_rate": 0.00019551578947368424, "logits/chosen": 16.935508728027344, "logits/rejected": 16.935508728027344, "logps/chosen": -1385.3035888671875, "logps/rejected": -1385.3035888671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -135.80416870117188, "rewards/margins": 0.0, "rewards/rejected": -135.80416870117188, "step": 221 }, { "epoch": 2.336842105263158, "grad_norm": 1.8624623407959007e-05, "learning_rate": 0.00019549473684210526, "logits/chosen": 17.23211669921875, "logits/rejected": 17.23211669921875, "logps/chosen": -1825.585693359375, "logps/rejected": -1825.585693359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -179.53269958496094, "rewards/margins": 0.0, "rewards/rejected": -179.53269958496094, "step": 222 }, { "epoch": 2.3473684210526318, "grad_norm": 1.575539863551967e-05, "learning_rate": 0.0001954736842105263, "logits/chosen": 17.0297794342041, "logits/rejected": 17.0297794342041, "logps/chosen": -1393.029296875, "logps/rejected": -1393.029296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -136.52134704589844, "rewards/margins": 0.0, "rewards/rejected": -136.52134704589844, "step": 223 }, { "epoch": 2.3578947368421055, "grad_norm": 3.4856082493206486e-05, "learning_rate": 0.00019545263157894738, "logits/chosen": 17.499181747436523, "logits/rejected": 17.499181747436523, "logps/chosen": -2287.0009765625, "logps/rejected": -2287.0009765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -225.87278747558594, "rewards/margins": 0.0, "rewards/rejected": -225.87278747558594, "step": 224 }, { "epoch": 2.3684210526315788, "grad_norm": 5.088413308840245e-05, "learning_rate": 0.00019543157894736843, "logits/chosen": 17.674972534179688, "logits/rejected": 17.674972534179688, "logps/chosen": -2397.51611328125, "logps/rejected": -2397.51611328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -236.8824920654297, "rewards/margins": 0.0, "rewards/rejected": -236.8824920654297, "step": 225 }, { "epoch": 2.3789473684210525, "grad_norm": 8.878765947883949e-05, "learning_rate": 0.00019541052631578948, "logits/chosen": 17.203372955322266, "logits/rejected": 17.203372955322266, "logps/chosen": -3492.2802734375, "logps/rejected": -3492.2802734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -346.28961181640625, "rewards/margins": 0.0, "rewards/rejected": -346.28961181640625, "step": 226 }, { "epoch": 2.389473684210526, "grad_norm": 3.1237228540703654e-05, "learning_rate": 0.00019538947368421056, "logits/chosen": 17.443052291870117, "logits/rejected": 17.443052291870117, "logps/chosen": -2894.005859375, "logps/rejected": -2894.005859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -286.5732727050781, "rewards/margins": 0.0, "rewards/rejected": -286.5732727050781, "step": 227 }, { "epoch": 2.4, "grad_norm": 4.1664425225462765e-05, "learning_rate": 0.00019536842105263158, "logits/chosen": 17.173011779785156, "logits/rejected": 17.173011779785156, "logps/chosen": -3830.0087890625, "logps/rejected": -3830.0087890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -380.1536865234375, "rewards/margins": 0.0, "rewards/rejected": -380.1536865234375, "step": 228 }, { "epoch": 2.4105263157894736, "grad_norm": 1.3555698387790471e-05, "learning_rate": 0.00019534736842105263, "logits/chosen": 16.77044677734375, "logits/rejected": 16.77044677734375, "logps/chosen": -2450.82861328125, "logps/rejected": -2450.82861328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -242.35667419433594, "rewards/margins": 0.0, "rewards/rejected": -242.35667419433594, "step": 229 }, { "epoch": 2.4210526315789473, "grad_norm": 1.865607737272512e-05, "learning_rate": 0.00019532631578947368, "logits/chosen": 16.351192474365234, "logits/rejected": 16.351192474365234, "logps/chosen": -3230.26806640625, "logps/rejected": -3230.26806640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -320.1084899902344, "rewards/margins": 0.0, "rewards/rejected": -320.1084899902344, "step": 230 }, { "epoch": 2.431578947368421, "grad_norm": 2.954201954707969e-05, "learning_rate": 0.00019530526315789475, "logits/chosen": 15.748371124267578, "logits/rejected": 15.748371124267578, "logps/chosen": -4338.77783203125, "logps/rejected": -4338.77783203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -431.0306091308594, "rewards/margins": 0.0, "rewards/rejected": -431.0306091308594, "step": 231 }, { "epoch": 2.442105263157895, "grad_norm": 1.0253191248921212e-05, "learning_rate": 0.0001952842105263158, "logits/chosen": 15.156774520874023, "logits/rejected": 15.156774520874023, "logps/chosen": -2693.63916015625, "logps/rejected": -2693.63916015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -266.6377258300781, "rewards/margins": 0.0, "rewards/rejected": -266.6377258300781, "step": 232 }, { "epoch": 2.4526315789473685, "grad_norm": 7.435526640620083e-06, "learning_rate": 0.00019526315789473685, "logits/chosen": 14.565388679504395, "logits/rejected": 14.565388679504395, "logps/chosen": -2461.76708984375, "logps/rejected": -2461.76708984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -243.39512634277344, "rewards/margins": 0.0, "rewards/rejected": -243.39512634277344, "step": 233 }, { "epoch": 2.463157894736842, "grad_norm": 1.767127469065599e-05, "learning_rate": 0.0001952421052631579, "logits/chosen": 14.204218864440918, "logits/rejected": 14.204218864440918, "logps/chosen": -4046.7734375, "logps/rejected": -4046.7734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -401.4924621582031, "rewards/margins": 0.0, "rewards/rejected": -401.4924621582031, "step": 234 }, { "epoch": 2.473684210526316, "grad_norm": 1.1389983228582423e-05, "learning_rate": 0.00019522105263157895, "logits/chosen": 13.95106315612793, "logits/rejected": 13.95106315612793, "logps/chosen": -3572.88134765625, "logps/rejected": -3572.88134765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -354.4190368652344, "rewards/margins": 0.0, "rewards/rejected": -354.4190368652344, "step": 235 }, { "epoch": 2.4842105263157896, "grad_norm": 1.084066661860561e-05, "learning_rate": 0.0001952, "logits/chosen": 13.596949577331543, "logits/rejected": 13.596949577331543, "logps/chosen": -3798.3984375, "logps/rejected": -3798.3984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -377.0125427246094, "rewards/margins": 0.0, "rewards/rejected": -377.0125427246094, "step": 236 }, { "epoch": 2.4947368421052634, "grad_norm": 1.549422086100094e-05, "learning_rate": 0.00019517894736842105, "logits/chosen": 13.390946388244629, "logits/rejected": 13.390946388244629, "logps/chosen": -4671.4033203125, "logps/rejected": -4671.4033203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -464.2931823730469, "rewards/margins": 0.0, "rewards/rejected": -464.2931823730469, "step": 237 }, { "epoch": 2.5052631578947366, "grad_norm": 9.202953151543625e-06, "learning_rate": 0.00019515789473684213, "logits/chosen": 12.943868637084961, "logits/rejected": 12.943868637084961, "logps/chosen": -3399.74951171875, "logps/rejected": -3399.74951171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -336.9490661621094, "rewards/margins": 0.0, "rewards/rejected": -336.9490661621094, "step": 238 }, { "epoch": 2.515789473684211, "grad_norm": 1.901045470731333e-05, "learning_rate": 0.00019513684210526318, "logits/chosen": 12.912806510925293, "logits/rejected": 12.912806510925293, "logps/chosen": -4980.595703125, "logps/rejected": -4980.595703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -495.12115478515625, "rewards/margins": 0.0, "rewards/rejected": -495.12115478515625, "step": 239 }, { "epoch": 2.526315789473684, "grad_norm": 4.548305696516763e-06, "learning_rate": 0.00019511578947368423, "logits/chosen": 12.52194595336914, "logits/rejected": 12.52194595336914, "logps/chosen": -2582.576904296875, "logps/rejected": -2582.576904296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -255.47610473632812, "rewards/margins": 0.0, "rewards/rejected": -255.47610473632812, "step": 240 }, { "epoch": 2.536842105263158, "grad_norm": 7.886830644565634e-06, "learning_rate": 0.00019509473684210527, "logits/chosen": 12.501900672912598, "logits/rejected": 12.501900672912598, "logps/chosen": -3638.797607421875, "logps/rejected": -3638.797607421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -360.96148681640625, "rewards/margins": 0.0, "rewards/rejected": -360.96148681640625, "step": 241 }, { "epoch": 2.5473684210526315, "grad_norm": 1.6854215573403053e-05, "learning_rate": 0.00019507368421052632, "logits/chosen": 12.465921401977539, "logits/rejected": 12.465921401977539, "logps/chosen": -5014.8525390625, "logps/rejected": -5014.8525390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -498.5468444824219, "rewards/margins": 0.0, "rewards/rejected": -498.5468444824219, "step": 242 }, { "epoch": 2.557894736842105, "grad_norm": 7.458407708327286e-06, "learning_rate": 0.00019505263157894737, "logits/chosen": 12.224055290222168, "logits/rejected": 12.224055290222168, "logps/chosen": -3439.12646484375, "logps/rejected": -3439.12646484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -340.88677978515625, "rewards/margins": 0.0, "rewards/rejected": -340.88677978515625, "step": 243 }, { "epoch": 2.568421052631579, "grad_norm": 1.0492250112292822e-05, "learning_rate": 0.00019503157894736842, "logits/chosen": 12.271549224853516, "logits/rejected": 12.271549224853516, "logps/chosen": -4212.18310546875, "logps/rejected": -4212.18310546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -418.033447265625, "rewards/margins": 0.0, "rewards/rejected": -418.033447265625, "step": 244 }, { "epoch": 2.5789473684210527, "grad_norm": 9.04471380636096e-06, "learning_rate": 0.0001950105263157895, "logits/chosen": 12.143754005432129, "logits/rejected": 12.143754005432129, "logps/chosen": -3889.806640625, "logps/rejected": -3889.806640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -386.1533508300781, "rewards/margins": 0.0, "rewards/rejected": -386.1533508300781, "step": 245 }, { "epoch": 2.5894736842105264, "grad_norm": 1.883074946817942e-05, "learning_rate": 0.00019498947368421055, "logits/chosen": 12.154573440551758, "logits/rejected": 12.154573440551758, "logps/chosen": -5032.49169921875, "logps/rejected": -5032.49169921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -500.3107604980469, "rewards/margins": 0.0, "rewards/rejected": -500.3107604980469, "step": 246 }, { "epoch": 2.6, "grad_norm": 1.1391132829885464e-05, "learning_rate": 0.00019496842105263157, "logits/chosen": 11.987241744995117, "logits/rejected": 11.987241744995117, "logps/chosen": -3883.806640625, "logps/rejected": -3883.806640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -385.5533447265625, "rewards/margins": 0.0, "rewards/rejected": -385.5533447265625, "step": 247 }, { "epoch": 2.610526315789474, "grad_norm": 1.7160728020826355e-05, "learning_rate": 0.00019494736842105265, "logits/chosen": 11.933201789855957, "logits/rejected": 11.933201789855957, "logps/chosen": -4153.123046875, "logps/rejected": -4153.123046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -412.51495361328125, "rewards/margins": 0.0, "rewards/rejected": -412.51495361328125, "step": 248 }, { "epoch": 2.6210526315789475, "grad_norm": 2.486378616595175e-05, "learning_rate": 0.0001949263157894737, "logits/chosen": 11.837088584899902, "logits/rejected": 11.837088584899902, "logps/chosen": -4702.62109375, "logps/rejected": -4702.62109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -467.4149475097656, "rewards/margins": 0.0, "rewards/rejected": -467.4149475097656, "step": 249 }, { "epoch": 2.6315789473684212, "grad_norm": 1.9884004359482788e-05, "learning_rate": 0.00019490526315789475, "logits/chosen": 11.529038429260254, "logits/rejected": 11.529038429260254, "logps/chosen": -3380.365234375, "logps/rejected": -3380.365234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -335.0106506347656, "rewards/margins": 0.0, "rewards/rejected": -335.0106506347656, "step": 250 }, { "epoch": 2.6315789473684212, "eval_logits/chosen": 11.380880355834961, "eval_logits/rejected": 11.380880355834961, "eval_logps/chosen": -4068.390625, "eval_logps/rejected": -4068.390625, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -403.9358825683594, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -403.9358825683594, "eval_runtime": 4.5019, "eval_samples_per_second": 2.221, "eval_steps_per_second": 2.221, "step": 250 }, { "epoch": 2.6421052631578945, "grad_norm": 1.5210554920486175e-05, "learning_rate": 0.0001948842105263158, "logits/chosen": 11.246644020080566, "logits/rejected": 11.246644020080566, "logps/chosen": -2526.46533203125, "logps/rejected": -2526.46533203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -249.8649444580078, "rewards/margins": 0.0, "rewards/rejected": -249.8649444580078, "step": 251 }, { "epoch": 2.6526315789473687, "grad_norm": 4.103879109607078e-05, "learning_rate": 0.00019486315789473687, "logits/chosen": 11.121448516845703, "logits/rejected": 11.121448516845703, "logps/chosen": -3731.0234375, "logps/rejected": -3731.0234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -370.2750244140625, "rewards/margins": 0.0, "rewards/rejected": -370.2750244140625, "step": 252 }, { "epoch": 2.663157894736842, "grad_norm": 2.5248224119422957e-05, "learning_rate": 0.00019484210526315792, "logits/chosen": 10.745014190673828, "logits/rejected": 10.745014190673828, "logps/chosen": -2451.896484375, "logps/rejected": -2451.896484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -242.40806579589844, "rewards/margins": 0.0, "rewards/rejected": -242.40806579589844, "step": 253 }, { "epoch": 2.6736842105263157, "grad_norm": 4.5085376768838614e-05, "learning_rate": 0.00019482105263157894, "logits/chosen": 10.564476013183594, "logits/rejected": 10.564476013183594, "logps/chosen": -3395.93359375, "logps/rejected": -3395.93359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -336.675048828125, "rewards/margins": 0.0, "rewards/rejected": -336.675048828125, "step": 254 }, { "epoch": 2.6842105263157894, "grad_norm": 6.854850653326139e-05, "learning_rate": 0.0001948, "logits/chosen": 10.420607566833496, "logits/rejected": 10.420607566833496, "logps/chosen": -3818.4208984375, "logps/rejected": -3818.4208984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -379.04473876953125, "rewards/margins": 0.0, "rewards/rejected": -379.04473876953125, "step": 255 }, { "epoch": 2.694736842105263, "grad_norm": 7.055894093355164e-05, "learning_rate": 0.00019477894736842107, "logits/chosen": 9.988139152526855, "logits/rejected": 9.988139152526855, "logps/chosen": -3482.1787109375, "logps/rejected": -3482.1787109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -345.39056396484375, "rewards/margins": 0.0, "rewards/rejected": -345.39056396484375, "step": 256 }, { "epoch": 2.705263157894737, "grad_norm": 3.460259540588595e-05, "learning_rate": 0.00019475789473684212, "logits/chosen": 9.430808067321777, "logits/rejected": 9.430808067321777, "logps/chosen": -2254.797607421875, "logps/rejected": -2254.797607421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -222.6981658935547, "rewards/margins": 0.0, "rewards/rejected": -222.6981658935547, "step": 257 }, { "epoch": 2.7157894736842105, "grad_norm": 0.00010256427776766941, "learning_rate": 0.00019473684210526317, "logits/chosen": 9.299915313720703, "logits/rejected": 9.299915313720703, "logps/chosen": -4039.5791015625, "logps/rejected": -4039.5791015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -401.1107177734375, "rewards/margins": 0.0, "rewards/rejected": -401.1107177734375, "step": 258 }, { "epoch": 2.7263157894736842, "grad_norm": 9.223208326147869e-05, "learning_rate": 0.00019471578947368422, "logits/chosen": 8.70045280456543, "logits/rejected": 8.70045280456543, "logps/chosen": -3428.341552734375, "logps/rejected": -3428.341552734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -339.6492614746094, "rewards/margins": 0.0, "rewards/rejected": -339.6492614746094, "step": 259 }, { "epoch": 2.736842105263158, "grad_norm": 0.00010385946370661259, "learning_rate": 0.00019469473684210527, "logits/chosen": 8.262590408325195, "logits/rejected": 8.262590408325195, "logps/chosen": -3089.6640625, "logps/rejected": -3089.6640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -306.13909912109375, "rewards/margins": 0.0, "rewards/rejected": -306.13909912109375, "step": 260 }, { "epoch": 2.7473684210526317, "grad_norm": 9.678524656919762e-05, "learning_rate": 0.00019467368421052631, "logits/chosen": 7.670555114746094, "logits/rejected": 7.670555114746094, "logps/chosen": -2587.950439453125, "logps/rejected": -2587.950439453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -255.7691650390625, "rewards/margins": 0.0, "rewards/rejected": -255.7691650390625, "step": 261 }, { "epoch": 2.7578947368421054, "grad_norm": 9.062133176485077e-05, "learning_rate": 0.00019465263157894736, "logits/chosen": 7.307285785675049, "logits/rejected": 7.307285785675049, "logps/chosen": -2509.31005859375, "logps/rejected": -2509.31005859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -247.90513610839844, "rewards/margins": 0.0, "rewards/rejected": -247.90513610839844, "step": 262 }, { "epoch": 2.768421052631579, "grad_norm": 9.935448179021478e-05, "learning_rate": 0.00019463157894736844, "logits/chosen": 7.144592761993408, "logits/rejected": 7.144592761993408, "logps/chosen": -2820.29638671875, "logps/rejected": -2820.29638671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -279.20233154296875, "rewards/margins": 0.0, "rewards/rejected": -279.20233154296875, "step": 263 }, { "epoch": 2.7789473684210524, "grad_norm": 9.599085024092346e-05, "learning_rate": 0.0001946105263157895, "logits/chosen": 6.889897346496582, "logits/rejected": 6.889897346496582, "logps/chosen": -2589.03564453125, "logps/rejected": -2589.03564453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -255.98526000976562, "rewards/margins": 0.0, "rewards/rejected": -255.98526000976562, "step": 264 }, { "epoch": 2.7894736842105265, "grad_norm": 8.2246377132833e-05, "learning_rate": 0.00019458947368421054, "logits/chosen": 6.846752643585205, "logits/rejected": 6.846752643585205, "logps/chosen": -2410.0673828125, "logps/rejected": -2410.0673828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -237.98086547851562, "rewards/margins": 0.0, "rewards/rejected": -237.98086547851562, "step": 265 }, { "epoch": 2.8, "grad_norm": 0.0001246191532118246, "learning_rate": 0.0001945684210526316, "logits/chosen": 6.849774360656738, "logits/rejected": 6.849774360656738, "logps/chosen": -3313.88623046875, "logps/rejected": -3313.88623046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -328.54144287109375, "rewards/margins": 0.0, "rewards/rejected": -328.54144287109375, "step": 266 }, { "epoch": 2.8105263157894735, "grad_norm": 0.00011715733126038685, "learning_rate": 0.00019454736842105264, "logits/chosen": 6.692395210266113, "logits/rejected": 6.692395210266113, "logps/chosen": -3254.78662109375, "logps/rejected": -3254.78662109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -322.6314697265625, "rewards/margins": 0.0, "rewards/rejected": -322.6314697265625, "step": 267 }, { "epoch": 2.8210526315789473, "grad_norm": 5.87665599596221e-05, "learning_rate": 0.00019452631578947369, "logits/chosen": 6.446358680725098, "logits/rejected": 6.446358680725098, "logps/chosen": -2285.24609375, "logps/rejected": -2285.24609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -225.4987335205078, "rewards/margins": 0.0, "rewards/rejected": -225.4987335205078, "step": 268 }, { "epoch": 2.831578947368421, "grad_norm": 7.499523053411394e-05, "learning_rate": 0.00019450526315789474, "logits/chosen": 6.247101783752441, "logits/rejected": 6.247101783752441, "logps/chosen": -2381.52587890625, "logps/rejected": -2381.52587890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -235.28347778320312, "rewards/margins": 0.0, "rewards/rejected": -235.28347778320312, "step": 269 }, { "epoch": 2.8421052631578947, "grad_norm": 0.00012944728950969875, "learning_rate": 0.0001944842105263158, "logits/chosen": 6.291374206542969, "logits/rejected": 6.291374206542969, "logps/chosen": -3051.115234375, "logps/rejected": -3051.115234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -302.26434326171875, "rewards/margins": 0.0, "rewards/rejected": -302.26434326171875, "step": 270 }, { "epoch": 2.8526315789473684, "grad_norm": 6.670731090707704e-05, "learning_rate": 0.00019446315789473686, "logits/chosen": 6.329052925109863, "logits/rejected": 6.329052925109863, "logps/chosen": -2186.439453125, "logps/rejected": -2186.439453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -215.61807250976562, "rewards/margins": 0.0, "rewards/rejected": -215.61807250976562, "step": 271 }, { "epoch": 2.863157894736842, "grad_norm": 0.00016976258484646678, "learning_rate": 0.0001944421052631579, "logits/chosen": 6.608790397644043, "logits/rejected": 6.608790397644043, "logps/chosen": -3241.544921875, "logps/rejected": -3241.544921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -321.216064453125, "rewards/margins": 0.0, "rewards/rejected": -321.216064453125, "step": 272 }, { "epoch": 2.873684210526316, "grad_norm": 3.563039354048669e-05, "learning_rate": 0.00019442105263157896, "logits/chosen": 7.397287845611572, "logits/rejected": 7.397287845611572, "logps/chosen": -1778.8427734375, "logps/rejected": -1778.8427734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -175.1027069091797, "rewards/margins": 0.0, "rewards/rejected": -175.1027069091797, "step": 273 }, { "epoch": 2.8842105263157896, "grad_norm": 6.053608012734912e-05, "learning_rate": 0.0001944, "logits/chosen": 8.16748046875, "logits/rejected": 8.16748046875, "logps/chosen": -2778.11279296875, "logps/rejected": -2778.11279296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -274.9839782714844, "rewards/margins": 0.0, "rewards/rejected": -274.9839782714844, "step": 274 }, { "epoch": 2.8947368421052633, "grad_norm": 0.00012767533189617097, "learning_rate": 0.00019437894736842106, "logits/chosen": 8.444000244140625, "logits/rejected": 8.444000244140625, "logps/chosen": -3656.23828125, "logps/rejected": -3656.23828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -362.6853942871094, "rewards/margins": 0.0, "rewards/rejected": -362.6853942871094, "step": 275 }, { "epoch": 2.905263157894737, "grad_norm": 0.00044787710066884756, "learning_rate": 0.0001943578947368421, "logits/chosen": 8.340536117553711, "logits/rejected": 8.340536117553711, "logps/chosen": -3042.4384765625, "logps/rejected": -3042.4384765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -301.4465026855469, "rewards/margins": 0.0, "rewards/rejected": -301.4465026855469, "step": 276 }, { "epoch": 2.9157894736842103, "grad_norm": 6.145967199699953e-05, "learning_rate": 0.00019433684210526318, "logits/chosen": 9.499319076538086, "logits/rejected": 9.499319076538086, "logps/chosen": -3730.025390625, "logps/rejected": -3730.025390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -370.1553649902344, "rewards/margins": 0.0, "rewards/rejected": -370.1553649902344, "step": 277 }, { "epoch": 2.9263157894736844, "grad_norm": 5.096704626339488e-05, "learning_rate": 0.00019431578947368423, "logits/chosen": 10.031513214111328, "logits/rejected": 10.031513214111328, "logps/chosen": -3853.0146484375, "logps/rejected": -3853.0146484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -382.45428466796875, "rewards/margins": 0.0, "rewards/rejected": -382.45428466796875, "step": 278 }, { "epoch": 2.9368421052631577, "grad_norm": 3.408677002880722e-05, "learning_rate": 0.00019429473684210526, "logits/chosen": 10.48922061920166, "logits/rejected": 10.48922061920166, "logps/chosen": -3471.667236328125, "logps/rejected": -3471.667236328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -344.3693542480469, "rewards/margins": 0.0, "rewards/rejected": -344.3693542480469, "step": 279 }, { "epoch": 2.9473684210526314, "grad_norm": 2.416540155536495e-05, "learning_rate": 0.00019427368421052633, "logits/chosen": 10.883756637573242, "logits/rejected": 10.883756637573242, "logps/chosen": -3115.751953125, "logps/rejected": -3115.751953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -308.7060852050781, "rewards/margins": 0.0, "rewards/rejected": -308.7060852050781, "step": 280 }, { "epoch": 2.957894736842105, "grad_norm": 2.587015478638932e-05, "learning_rate": 0.00019425263157894738, "logits/chosen": 11.22066879272461, "logits/rejected": 11.22066879272461, "logps/chosen": -3352.4072265625, "logps/rejected": -3352.4072265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -332.4134216308594, "rewards/margins": 0.0, "rewards/rejected": -332.4134216308594, "step": 281 }, { "epoch": 2.968421052631579, "grad_norm": 2.9007293051108718e-05, "learning_rate": 0.00019423157894736843, "logits/chosen": 11.473288536071777, "logits/rejected": 11.473288536071777, "logps/chosen": -3636.136962890625, "logps/rejected": -3636.136962890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -360.81634521484375, "rewards/margins": 0.0, "rewards/rejected": -360.81634521484375, "step": 282 }, { "epoch": 2.9789473684210526, "grad_norm": 0.00021073163952678442, "learning_rate": 0.00019421052631578948, "logits/chosen": 11.509716987609863, "logits/rejected": 11.509716987609863, "logps/chosen": -4141.9130859375, "logps/rejected": -4141.9130859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -411.3441467285156, "rewards/margins": 0.0, "rewards/rejected": -411.3441467285156, "step": 283 }, { "epoch": 2.9894736842105263, "grad_norm": 1.5749537851661444e-05, "learning_rate": 0.00019418947368421056, "logits/chosen": 11.974430084228516, "logits/rejected": 11.974430084228516, "logps/chosen": -3471.6171875, "logps/rejected": -3471.6171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -344.33441162109375, "rewards/margins": 0.0, "rewards/rejected": -344.33441162109375, "step": 284 }, { "epoch": 3.0, "grad_norm": 1.054062886396423e-05, "learning_rate": 0.00019416842105263158, "logits/chosen": 12.16253662109375, "logits/rejected": 12.16253662109375, "logps/chosen": -3107.7216796875, "logps/rejected": -3107.7216796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -307.7463073730469, "rewards/margins": 0.0, "rewards/rejected": -307.7463073730469, "step": 285 }, { "epoch": 3.0105263157894737, "grad_norm": 1.4191447917255573e-05, "learning_rate": 0.00019414736842105263, "logits/chosen": 12.31031322479248, "logits/rejected": 12.31031322479248, "logps/chosen": -3822.673828125, "logps/rejected": -3822.673828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -379.0824890136719, "rewards/margins": 0.0, "rewards/rejected": -379.0824890136719, "step": 286 }, { "epoch": 3.0210526315789474, "grad_norm": 1.0503478733880911e-05, "learning_rate": 0.00019412631578947368, "logits/chosen": 12.43275260925293, "logits/rejected": 12.43275260925293, "logps/chosen": -3547.466796875, "logps/rejected": -3547.466796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.91937255859375, "rewards/margins": 0.0, "rewards/rejected": -351.91937255859375, "step": 287 }, { "epoch": 3.031578947368421, "grad_norm": 4.318946594139561e-06, "learning_rate": 0.00019410526315789475, "logits/chosen": 12.568634986877441, "logits/rejected": 12.568634986877441, "logps/chosen": -2387.291015625, "logps/rejected": -2387.291015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -235.947509765625, "rewards/margins": 0.0, "rewards/rejected": -235.947509765625, "step": 288 }, { "epoch": 3.042105263157895, "grad_norm": 1.4017550711287186e-05, "learning_rate": 0.0001940842105263158, "logits/chosen": 12.663244247436523, "logits/rejected": 12.663244247436523, "logps/chosen": -4386.4853515625, "logps/rejected": -4386.4853515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -435.8013610839844, "rewards/margins": 0.0, "rewards/rejected": -435.8013610839844, "step": 289 }, { "epoch": 3.0526315789473686, "grad_norm": 4.828956207347801e-06, "learning_rate": 0.00019406315789473685, "logits/chosen": 12.748896598815918, "logits/rejected": 12.748896598815918, "logps/chosen": -2679.640625, "logps/rejected": -2679.640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -265.2378845214844, "rewards/margins": 0.0, "rewards/rejected": -265.2378845214844, "step": 290 }, { "epoch": 3.0631578947368423, "grad_norm": 6.494905392173678e-06, "learning_rate": 0.0001940421052631579, "logits/chosen": 12.834653854370117, "logits/rejected": 12.834653854370117, "logps/chosen": -3218.525390625, "logps/rejected": -3218.525390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -318.82666015625, "rewards/margins": 0.0, "rewards/rejected": -318.82666015625, "step": 291 }, { "epoch": 3.0736842105263156, "grad_norm": 7.902185643615667e-06, "learning_rate": 0.00019402105263157895, "logits/chosen": 12.906092643737793, "logits/rejected": 12.906092643737793, "logps/chosen": -3637.69140625, "logps/rejected": -3637.69140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -360.94183349609375, "rewards/margins": 0.0, "rewards/rejected": -360.94183349609375, "step": 292 }, { "epoch": 3.0842105263157893, "grad_norm": 5.871730991202639e-06, "learning_rate": 0.000194, "logits/chosen": 12.97829818725586, "logits/rejected": 12.97829818725586, "logps/chosen": -3247.337890625, "logps/rejected": -3247.337890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -321.7079162597656, "rewards/margins": 0.0, "rewards/rejected": -321.7079162597656, "step": 293 }, { "epoch": 3.094736842105263, "grad_norm": 1.3583415238827001e-05, "learning_rate": 0.00019397894736842105, "logits/chosen": 13.062776565551758, "logits/rejected": 13.062776565551758, "logps/chosen": -4755.3134765625, "logps/rejected": -4755.3134765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -472.5929260253906, "rewards/margins": 0.0, "rewards/rejected": -472.5929260253906, "step": 294 }, { "epoch": 3.1052631578947367, "grad_norm": 1.0447386557643767e-05, "learning_rate": 0.00019395789473684212, "logits/chosen": 13.109066009521484, "logits/rejected": 13.109066009521484, "logps/chosen": -4507.43017578125, "logps/rejected": -4507.43017578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -447.8958435058594, "rewards/margins": 0.0, "rewards/rejected": -447.8958435058594, "step": 295 }, { "epoch": 3.1157894736842104, "grad_norm": 4.985206032870337e-06, "learning_rate": 0.00019393684210526317, "logits/chosen": 13.158016204833984, "logits/rejected": 13.158016204833984, "logps/chosen": -3289.4375, "logps/rejected": -3289.4375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -325.9178771972656, "rewards/margins": 0.0, "rewards/rejected": -325.9178771972656, "step": 296 }, { "epoch": 3.126315789473684, "grad_norm": 6.14567807133426e-06, "learning_rate": 0.00019391578947368422, "logits/chosen": 13.205784797668457, "logits/rejected": 13.205784797668457, "logps/chosen": -3716.341796875, "logps/rejected": -3716.341796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -368.8068542480469, "rewards/margins": 0.0, "rewards/rejected": -368.8068542480469, "step": 297 }, { "epoch": 3.136842105263158, "grad_norm": 8.479146345052868e-06, "learning_rate": 0.00019389473684210527, "logits/chosen": 13.259580612182617, "logits/rejected": 13.259580612182617, "logps/chosen": -4563.83251953125, "logps/rejected": -4563.83251953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -453.5361022949219, "rewards/margins": 0.0, "rewards/rejected": -453.5361022949219, "step": 298 }, { "epoch": 3.1473684210526316, "grad_norm": 2.743224968071445e-06, "learning_rate": 0.00019387368421052632, "logits/chosen": 13.301115989685059, "logits/rejected": 13.301115989685059, "logps/chosen": -2503.916015625, "logps/rejected": -2503.916015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -247.61001586914062, "rewards/margins": 0.0, "rewards/rejected": -247.61001586914062, "step": 299 }, { "epoch": 3.1578947368421053, "grad_norm": 7.419741450576112e-06, "learning_rate": 0.00019385263157894737, "logits/chosen": 13.328307151794434, "logits/rejected": 13.328307151794434, "logps/chosen": -4598.9130859375, "logps/rejected": -4598.9130859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -457.0441589355469, "rewards/margins": 0.0, "rewards/rejected": -457.0441589355469, "step": 300 }, { "epoch": 3.1578947368421053, "eval_logits/chosen": 13.36358642578125, "eval_logits/rejected": 13.36358642578125, "eval_logps/chosen": -4073.944580078125, "eval_logps/rejected": -4073.944580078125, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -404.49127197265625, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -404.49127197265625, "eval_runtime": 4.4168, "eval_samples_per_second": 2.264, "eval_steps_per_second": 2.264, "step": 300 }, { "epoch": 3.168421052631579, "grad_norm": 8.578429515182506e-06, "learning_rate": 0.00019383157894736842, "logits/chosen": 13.375102996826172, "logits/rejected": 13.375102996826172, "logps/chosen": -4891.9267578125, "logps/rejected": -4891.9267578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -486.2542419433594, "rewards/margins": 0.0, "rewards/rejected": -486.2542419433594, "step": 301 }, { "epoch": 3.1789473684210527, "grad_norm": 5.591416538663907e-06, "learning_rate": 0.0001938105263157895, "logits/chosen": 13.381508827209473, "logits/rejected": 13.381508827209473, "logps/chosen": -4063.48828125, "logps/rejected": -4063.48828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -403.5514831542969, "rewards/margins": 0.0, "rewards/rejected": -403.5514831542969, "step": 302 }, { "epoch": 3.1894736842105265, "grad_norm": 3.211605644537485e-06, "learning_rate": 0.00019378947368421055, "logits/chosen": 13.397092819213867, "logits/rejected": 13.397092819213867, "logps/chosen": -3377.318359375, "logps/rejected": -3377.318359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -334.7059631347656, "rewards/margins": 0.0, "rewards/rejected": -334.7059631347656, "step": 303 }, { "epoch": 3.2, "grad_norm": 2.1825333078595577e-06, "learning_rate": 0.0001937684210526316, "logits/chosen": 13.43088150024414, "logits/rejected": 13.43088150024414, "logps/chosen": -2549.171875, "logps/rejected": -2549.171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -252.13560485839844, "rewards/margins": 0.0, "rewards/rejected": -252.13560485839844, "step": 304 }, { "epoch": 3.2105263157894735, "grad_norm": 2.0641391529352404e-06, "learning_rate": 0.00019374736842105264, "logits/chosen": 13.444385528564453, "logits/rejected": 13.444385528564453, "logps/chosen": -2556.91796875, "logps/rejected": -2556.91796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -252.9102020263672, "rewards/margins": 0.0, "rewards/rejected": -252.9102020263672, "step": 305 }, { "epoch": 3.221052631578947, "grad_norm": 3.567095973266987e-06, "learning_rate": 0.0001937263157894737, "logits/chosen": 13.43985366821289, "logits/rejected": 13.43985366821289, "logps/chosen": -3626.75, "logps/rejected": -3626.75, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -359.8058776855469, "rewards/margins": 0.0, "rewards/rejected": -359.8058776855469, "step": 306 }, { "epoch": 3.231578947368421, "grad_norm": 5.465305093821371e-06, "learning_rate": 0.00019370526315789474, "logits/chosen": 13.44680118560791, "logits/rejected": 13.44680118560791, "logps/chosen": -4702.0712890625, "logps/rejected": -4702.0712890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -467.3599548339844, "rewards/margins": 0.0, "rewards/rejected": -467.3599548339844, "step": 307 }, { "epoch": 3.2421052631578946, "grad_norm": 3.405055622351938e-06, "learning_rate": 0.0001936842105263158, "logits/chosen": 13.455514907836914, "logits/rejected": 13.455514907836914, "logps/chosen": -3645.736328125, "logps/rejected": -3645.736328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -361.70452880859375, "rewards/margins": 0.0, "rewards/rejected": -361.70452880859375, "step": 308 }, { "epoch": 3.2526315789473683, "grad_norm": 3.167355544064776e-06, "learning_rate": 0.00019366315789473687, "logits/chosen": 13.459683418273926, "logits/rejected": 13.459683418273926, "logps/chosen": -3632.97265625, "logps/rejected": -3632.97265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -360.37896728515625, "rewards/margins": 0.0, "rewards/rejected": -360.37896728515625, "step": 309 }, { "epoch": 3.263157894736842, "grad_norm": 2.1577347979473416e-06, "learning_rate": 0.00019364210526315792, "logits/chosen": 13.475505828857422, "logits/rejected": 13.475505828857422, "logps/chosen": -2878.51953125, "logps/rejected": -2878.51953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -285.1257629394531, "rewards/margins": 0.0, "rewards/rejected": -285.1257629394531, "step": 310 }, { "epoch": 3.2736842105263158, "grad_norm": 2.2971255475567887e-06, "learning_rate": 0.00019362105263157894, "logits/chosen": 13.465211868286133, "logits/rejected": 13.465211868286133, "logps/chosen": -3447.306640625, "logps/rejected": -3447.306640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -341.7048034667969, "rewards/margins": 0.0, "rewards/rejected": -341.7048034667969, "step": 311 }, { "epoch": 3.2842105263157895, "grad_norm": 4.984345196135109e-06, "learning_rate": 0.00019360000000000002, "logits/chosen": 13.494173049926758, "logits/rejected": 13.494173049926758, "logps/chosen": -5041.0517578125, "logps/rejected": -5041.0517578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -501.166748046875, "rewards/margins": 0.0, "rewards/rejected": -501.166748046875, "step": 312 }, { "epoch": 3.294736842105263, "grad_norm": 2.226493052148726e-06, "learning_rate": 0.00019357894736842107, "logits/chosen": 13.483041763305664, "logits/rejected": 13.483041763305664, "logps/chosen": -2896.169921875, "logps/rejected": -2896.169921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -286.89080810546875, "rewards/margins": 0.0, "rewards/rejected": -286.89080810546875, "step": 313 }, { "epoch": 3.305263157894737, "grad_norm": 3.9725127862766385e-06, "learning_rate": 0.00019355789473684212, "logits/chosen": 13.47500991821289, "logits/rejected": 13.47500991821289, "logps/chosen": -4773.4384765625, "logps/rejected": -4773.4384765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -474.4966735839844, "rewards/margins": 0.0, "rewards/rejected": -474.4966735839844, "step": 314 }, { "epoch": 3.3157894736842106, "grad_norm": 4.561420610116329e-06, "learning_rate": 0.00019353684210526316, "logits/chosen": 13.497587203979492, "logits/rejected": 13.497587203979492, "logps/chosen": -5068.68701171875, "logps/rejected": -5068.68701171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -503.9302673339844, "rewards/margins": 0.0, "rewards/rejected": -503.9302673339844, "step": 315 }, { "epoch": 3.3263157894736843, "grad_norm": 2.6150439680350246e-06, "learning_rate": 0.00019351578947368424, "logits/chosen": 13.476734161376953, "logits/rejected": 13.476734161376953, "logps/chosen": -3706.12109375, "logps/rejected": -3706.12109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -367.7430114746094, "rewards/margins": 0.0, "rewards/rejected": -367.7430114746094, "step": 316 }, { "epoch": 3.336842105263158, "grad_norm": 2.6457962576387217e-06, "learning_rate": 0.00019349473684210526, "logits/chosen": 13.47456169128418, "logits/rejected": 13.47456169128418, "logps/chosen": -3712.359375, "logps/rejected": -3712.359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -368.3668212890625, "rewards/margins": 0.0, "rewards/rejected": -368.3668212890625, "step": 317 }, { "epoch": 3.3473684210526318, "grad_norm": 2.2700191948388238e-06, "learning_rate": 0.0001934736842105263, "logits/chosen": 13.46279525756836, "logits/rejected": 13.46279525756836, "logps/chosen": -3487.662109375, "logps/rejected": -3487.662109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -345.7403259277344, "rewards/margins": 0.0, "rewards/rejected": -345.7403259277344, "step": 318 }, { "epoch": 3.3578947368421055, "grad_norm": 4.183463261142606e-06, "learning_rate": 0.00019345263157894736, "logits/chosen": 13.485363006591797, "logits/rejected": 13.485363006591797, "logps/chosen": -5099.19970703125, "logps/rejected": -5099.19970703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -506.9815368652344, "rewards/margins": 0.0, "rewards/rejected": -506.9815368652344, "step": 319 }, { "epoch": 3.3684210526315788, "grad_norm": 2.0982406567782164e-06, "learning_rate": 0.00019343157894736844, "logits/chosen": 13.450308799743652, "logits/rejected": 13.450308799743652, "logps/chosen": -3496.373046875, "logps/rejected": -3496.373046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -346.6114196777344, "rewards/margins": 0.0, "rewards/rejected": -346.6114196777344, "step": 320 }, { "epoch": 3.3789473684210525, "grad_norm": 2.1947737423033686e-06, "learning_rate": 0.0001934105263157895, "logits/chosen": 13.457178115844727, "logits/rejected": 13.457178115844727, "logps/chosen": -2930.548828125, "logps/rejected": -2930.548828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -290.3287048339844, "rewards/margins": 0.0, "rewards/rejected": -290.3287048339844, "step": 321 }, { "epoch": 3.389473684210526, "grad_norm": 1.7130514606833458e-06, "learning_rate": 0.00019338947368421054, "logits/chosen": 13.455402374267578, "logits/rejected": 13.455402374267578, "logps/chosen": -2640.44140625, "logps/rejected": -2640.44140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -261.2625427246094, "rewards/margins": 0.0, "rewards/rejected": -261.2625427246094, "step": 322 }, { "epoch": 3.4, "grad_norm": 1.4684018196930992e-06, "learning_rate": 0.00019336842105263159, "logits/chosen": 13.417861938476562, "logits/rejected": 13.417861938476562, "logps/chosen": -3507.39453125, "logps/rejected": -3507.39453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -347.7135925292969, "rewards/margins": 0.0, "rewards/rejected": -347.7135925292969, "step": 323 }, { "epoch": 3.4105263157894736, "grad_norm": 3.666943712232751e-06, "learning_rate": 0.00019334736842105263, "logits/chosen": 13.4088134765625, "logits/rejected": 13.4088134765625, "logps/chosen": -4832.970703125, "logps/rejected": -4832.970703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -480.4499206542969, "rewards/margins": 0.0, "rewards/rejected": -480.4499206542969, "step": 324 }, { "epoch": 3.4210526315789473, "grad_norm": 1.8650094943950535e-06, "learning_rate": 0.00019332631578947368, "logits/chosen": 13.40343189239502, "logits/rejected": 13.40343189239502, "logps/chosen": -3721.9345703125, "logps/rejected": -3721.9345703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -369.275146484375, "rewards/margins": 0.0, "rewards/rejected": -369.275146484375, "step": 325 }, { "epoch": 3.431578947368421, "grad_norm": 1.3328573231774499e-06, "learning_rate": 0.00019330526315789473, "logits/chosen": 13.392024040222168, "logits/rejected": 13.392024040222168, "logps/chosen": -3515.6240234375, "logps/rejected": -3515.6240234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -348.5365295410156, "rewards/margins": 0.0, "rewards/rejected": -348.5365295410156, "step": 326 }, { "epoch": 3.442105263157895, "grad_norm": 1.1790776852649287e-06, "learning_rate": 0.0001932842105263158, "logits/chosen": 13.412732124328613, "logits/rejected": 13.412732124328613, "logps/chosen": -2651.748046875, "logps/rejected": -2651.748046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -262.3932189941406, "rewards/margins": 0.0, "rewards/rejected": -262.3932189941406, "step": 327 }, { "epoch": 3.4526315789473685, "grad_norm": 2.2755937152396655e-06, "learning_rate": 0.00019326315789473686, "logits/chosen": 13.37868595123291, "logits/rejected": 13.37868595123291, "logps/chosen": -3965.984375, "logps/rejected": -3965.984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -393.7711181640625, "rewards/margins": 0.0, "rewards/rejected": -393.7711181640625, "step": 328 }, { "epoch": 3.463157894736842, "grad_norm": 2.9809966690663714e-06, "learning_rate": 0.0001932421052631579, "logits/chosen": 13.375978469848633, "logits/rejected": 13.375978469848633, "logps/chosen": -4848.76123046875, "logps/rejected": -4848.76123046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -482.0289611816406, "rewards/margins": 0.0, "rewards/rejected": -482.0289611816406, "step": 329 }, { "epoch": 3.473684210526316, "grad_norm": 1.7792560811358271e-06, "learning_rate": 0.00019322105263157896, "logits/chosen": 13.367987632751465, "logits/rejected": 13.367987632751465, "logps/chosen": -3970.8125, "logps/rejected": -3970.8125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -394.2539367675781, "rewards/margins": 0.0, "rewards/rejected": -394.2539367675781, "step": 330 }, { "epoch": 3.4842105263157896, "grad_norm": 1.8501725662645185e-06, "learning_rate": 0.0001932, "logits/chosen": 13.373222351074219, "logits/rejected": 13.373222351074219, "logps/chosen": -4302.0166015625, "logps/rejected": -4302.0166015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -427.0168151855469, "rewards/margins": 0.0, "rewards/rejected": -427.0168151855469, "step": 331 }, { "epoch": 3.4947368421052634, "grad_norm": 1.8526296798881958e-06, "learning_rate": 0.00019317894736842106, "logits/chosen": 13.355216979980469, "logits/rejected": 13.355216979980469, "logps/chosen": -3974.8515625, "logps/rejected": -3974.8515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -394.6578369140625, "rewards/margins": 0.0, "rewards/rejected": -394.6578369140625, "step": 332 }, { "epoch": 3.5052631578947366, "grad_norm": 2.7081296138931066e-06, "learning_rate": 0.0001931578947368421, "logits/chosen": 13.350250244140625, "logits/rejected": 13.350250244140625, "logps/chosen": -3739.2255859375, "logps/rejected": -3739.2255859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -371.0042419433594, "rewards/margins": 0.0, "rewards/rejected": -371.0042419433594, "step": 333 }, { "epoch": 3.515789473684211, "grad_norm": 2.4733512873353902e-06, "learning_rate": 0.00019313684210526318, "logits/chosen": 13.35256290435791, "logits/rejected": 13.35256290435791, "logps/chosen": -2659.8994140625, "logps/rejected": -2659.8994140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -263.2083435058594, "rewards/margins": 0.0, "rewards/rejected": -263.2083435058594, "step": 334 }, { "epoch": 3.526315789473684, "grad_norm": 1.920208887895569e-06, "learning_rate": 0.00019311578947368423, "logits/chosen": 13.312738418579102, "logits/rejected": 13.312738418579102, "logps/chosen": -3980.271484375, "logps/rejected": -3980.271484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -395.1998291015625, "rewards/margins": 0.0, "rewards/rejected": -395.1998291015625, "step": 335 }, { "epoch": 3.536842105263158, "grad_norm": 2.4315995688084513e-06, "learning_rate": 0.00019309473684210525, "logits/chosen": 13.320900917053223, "logits/rejected": 13.320900917053223, "logps/chosen": -5155.7353515625, "logps/rejected": -5155.7353515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -512.6351318359375, "rewards/margins": 0.0, "rewards/rejected": -512.6351318359375, "step": 336 }, { "epoch": 3.5473684210526315, "grad_norm": 1.4706346291859518e-06, "learning_rate": 0.00019307368421052633, "logits/chosen": 13.275252342224121, "logits/rejected": 13.275252342224121, "logps/chosen": -3532.2998046875, "logps/rejected": -3532.2998046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -350.2041015625, "rewards/margins": 0.0, "rewards/rejected": -350.2041015625, "step": 337 }, { "epoch": 3.557894736842105, "grad_norm": 2.0902027699776227e-06, "learning_rate": 0.00019305263157894738, "logits/chosen": 13.288823127746582, "logits/rejected": 13.288823127746582, "logps/chosen": -5158.9541015625, "logps/rejected": -5158.9541015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -512.9569702148438, "rewards/margins": 0.0, "rewards/rejected": -512.9569702148438, "step": 338 }, { "epoch": 3.568421052631579, "grad_norm": 1.6481401416967856e-06, "learning_rate": 0.00019303157894736843, "logits/chosen": 13.26176929473877, "logits/rejected": 13.26176929473877, "logps/chosen": -4313.7587890625, "logps/rejected": -4313.7587890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -428.1910095214844, "rewards/margins": 0.0, "rewards/rejected": -428.1910095214844, "step": 339 }, { "epoch": 3.5789473684210527, "grad_norm": 2.2164347228681436e-06, "learning_rate": 0.00019301052631578948, "logits/chosen": 13.27258586883545, "logits/rejected": 13.27258586883545, "logps/chosen": -5161.4375, "logps/rejected": -5161.4375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -513.205322265625, "rewards/margins": 0.0, "rewards/rejected": -513.205322265625, "step": 340 }, { "epoch": 3.5894736842105264, "grad_norm": 1.5355477671619155e-06, "learning_rate": 0.00019298947368421055, "logits/chosen": 13.252595901489258, "logits/rejected": 13.252595901489258, "logps/chosen": -4276.59765625, "logps/rejected": -4276.59765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -424.8623962402344, "rewards/margins": 0.0, "rewards/rejected": -424.8623962402344, "step": 341 }, { "epoch": 3.6, "grad_norm": 1.5529604979747091e-06, "learning_rate": 0.0001929684210526316, "logits/chosen": 13.25332260131836, "logits/rejected": 13.25332260131836, "logps/chosen": -4317.2294921875, "logps/rejected": -4317.2294921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -428.5380859375, "rewards/margins": 0.0, "rewards/rejected": -428.5380859375, "step": 342 }, { "epoch": 3.610526315789474, "grad_norm": 1.4351383015309693e-06, "learning_rate": 0.00019294736842105263, "logits/chosen": 13.258505821228027, "logits/rejected": 13.258505821228027, "logps/chosen": -2961.806640625, "logps/rejected": -2961.806640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.4544677734375, "rewards/margins": 0.0, "rewards/rejected": -293.4544677734375, "step": 343 }, { "epoch": 3.6210526315789475, "grad_norm": 1.7823775806391495e-06, "learning_rate": 0.0001929263157894737, "logits/chosen": 13.270627975463867, "logits/rejected": 13.270627975463867, "logps/chosen": -5166.19921875, "logps/rejected": -5166.19921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -513.6815185546875, "rewards/margins": 0.0, "rewards/rejected": -513.6815185546875, "step": 344 }, { "epoch": 3.6315789473684212, "grad_norm": 1.610401000107231e-06, "learning_rate": 0.00019290526315789475, "logits/chosen": 13.24936580657959, "logits/rejected": 13.24936580657959, "logps/chosen": -4870.50390625, "logps/rejected": -4870.50390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.2032165527344, "rewards/margins": 0.0, "rewards/rejected": -484.2032165527344, "step": 345 }, { "epoch": 3.6421052631578945, "grad_norm": 1.3039498298894614e-06, "learning_rate": 0.0001928842105263158, "logits/chosen": 13.25102710723877, "logits/rejected": 13.25102710723877, "logps/chosen": -3538.1865234375, "logps/rejected": -3538.1865234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -350.79278564453125, "rewards/margins": 0.0, "rewards/rejected": -350.79278564453125, "step": 346 }, { "epoch": 3.6526315789473687, "grad_norm": 1.6418638324466883e-06, "learning_rate": 0.00019286315789473685, "logits/chosen": 13.280070304870605, "logits/rejected": 13.280070304870605, "logps/chosen": -2668.4765625, "logps/rejected": -2668.4765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.0660705566406, "rewards/margins": 0.0, "rewards/rejected": -264.0660705566406, "step": 347 }, { "epoch": 3.663157894736842, "grad_norm": 1.4463740853898344e-06, "learning_rate": 0.00019284210526315793, "logits/chosen": 13.26858139038086, "logits/rejected": 13.26858139038086, "logps/chosen": -3771.80859375, "logps/rejected": -3771.80859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.3117370605469, "rewards/margins": 0.0, "rewards/rejected": -374.3117370605469, "step": 348 }, { "epoch": 3.6736842105263157, "grad_norm": 1.3835291383657022e-06, "learning_rate": 0.00019282105263157895, "logits/chosen": 13.270120620727539, "logits/rejected": 13.270120620727539, "logps/chosen": -3772.134765625, "logps/rejected": -3772.134765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.3443603515625, "rewards/margins": 0.0, "rewards/rejected": -374.3443603515625, "step": 349 }, { "epoch": 3.6842105263157894, "grad_norm": 1.1906093959623831e-06, "learning_rate": 0.0001928, "logits/chosen": 13.262545585632324, "logits/rejected": 13.262545585632324, "logps/chosen": -3540.462890625, "logps/rejected": -3540.462890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.0204162597656, "rewards/margins": 0.0, "rewards/rejected": -351.0204162597656, "step": 350 }, { "epoch": 3.6842105263157894, "eval_logits/chosen": 13.283880233764648, "eval_logits/rejected": 13.283880233764648, "eval_logps/chosen": -4306.6044921875, "eval_logps/rejected": -4306.6044921875, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -427.75726318359375, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -427.75726318359375, "eval_runtime": 4.5064, "eval_samples_per_second": 2.219, "eval_steps_per_second": 2.219, "step": 350 }, { "epoch": 3.694736842105263, "grad_norm": 1.1118454494862817e-06, "learning_rate": 0.00019277894736842105, "logits/chosen": 13.269845008850098, "logits/rejected": 13.269845008850098, "logps/chosen": -3753.693359375, "logps/rejected": -3753.693359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.4510192871094, "rewards/margins": 0.0, "rewards/rejected": -372.4510192871094, "step": 351 }, { "epoch": 3.705263157894737, "grad_norm": 1.3817088984069414e-06, "learning_rate": 0.00019275789473684212, "logits/chosen": 13.27553939819336, "logits/rejected": 13.27553939819336, "logps/chosen": -4283.69775390625, "logps/rejected": -4283.69775390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.5724182128906, "rewards/margins": 0.0, "rewards/rejected": -425.5724182128906, "step": 352 }, { "epoch": 3.7157894736842105, "grad_norm": 1.2998527836316498e-06, "learning_rate": 0.00019273684210526317, "logits/chosen": 13.281391143798828, "logits/rejected": 13.281391143798828, "logps/chosen": -2965.2236328125, "logps/rejected": -2965.2236328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.7961730957031, "rewards/margins": 0.0, "rewards/rejected": -293.7961730957031, "step": 353 }, { "epoch": 3.7263157894736842, "grad_norm": 1.3068009820926818e-06, "learning_rate": 0.00019271578947368422, "logits/chosen": 13.27377986907959, "logits/rejected": 13.27377986907959, "logps/chosen": -4284.29052734375, "logps/rejected": -4284.29052734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.6316833496094, "rewards/margins": 0.0, "rewards/rejected": -425.6316833496094, "step": 354 }, { "epoch": 3.736842105263158, "grad_norm": 2.0515692540357122e-06, "learning_rate": 0.00019269473684210527, "logits/chosen": 13.265348434448242, "logits/rejected": 13.265348434448242, "logps/chosen": -4873.818359375, "logps/rejected": -4873.818359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.53466796875, "rewards/margins": 0.0, "rewards/rejected": -484.53466796875, "step": 355 }, { "epoch": 3.7473684210526317, "grad_norm": 9.791186812435626e-07, "learning_rate": 0.00019267368421052632, "logits/chosen": 13.267481803894043, "logits/rejected": 13.267481803894043, "logps/chosen": -3755.564453125, "logps/rejected": -3755.564453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.6381530761719, "rewards/margins": 0.0, "rewards/rejected": -372.6381530761719, "step": 356 }, { "epoch": 3.7578947368421054, "grad_norm": 1.958782149813487e-06, "learning_rate": 0.00019265263157894737, "logits/chosen": 13.266080856323242, "logits/rejected": 13.266080856323242, "logps/chosen": -4874.7158203125, "logps/rejected": -4874.7158203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.6244201660156, "rewards/margins": 0.0, "rewards/rejected": -484.6244201660156, "step": 357 }, { "epoch": 3.768421052631579, "grad_norm": 1.228083760906884e-06, "learning_rate": 0.00019263157894736842, "logits/chosen": 13.273797988891602, "logits/rejected": 13.273797988891602, "logps/chosen": -3775.4091796875, "logps/rejected": -3775.4091796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.67181396484375, "rewards/margins": 0.0, "rewards/rejected": -374.67181396484375, "step": 358 }, { "epoch": 3.7789473684210524, "grad_norm": 1.2550809742606361e-06, "learning_rate": 0.0001926105263157895, "logits/chosen": 13.265068054199219, "logits/rejected": 13.265068054199219, "logps/chosen": -3993.08203125, "logps/rejected": -3993.08203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.48089599609375, "rewards/margins": 0.0, "rewards/rejected": -396.48089599609375, "step": 359 }, { "epoch": 3.7894736842105265, "grad_norm": 1.412403435097076e-06, "learning_rate": 0.00019258947368421054, "logits/chosen": 13.28840446472168, "logits/rejected": 13.28840446472168, "logps/chosen": -2671.6513671875, "logps/rejected": -2671.6513671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.383544921875, "rewards/margins": 0.0, "rewards/rejected": -264.383544921875, "step": 360 }, { "epoch": 3.8, "grad_norm": 1.3179900406612433e-06, "learning_rate": 0.0001925684210526316, "logits/chosen": 13.26526165008545, "logits/rejected": 13.26526165008545, "logps/chosen": -4876.15625, "logps/rejected": -4876.15625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.7684631347656, "rewards/margins": 0.0, "rewards/rejected": -484.7684631347656, "step": 361 }, { "epoch": 3.8105263157894735, "grad_norm": 1.2032616041324218e-06, "learning_rate": 0.00019254736842105264, "logits/chosen": 13.256503105163574, "logits/rejected": 13.256503105163574, "logps/chosen": -3993.77734375, "logps/rejected": -3993.77734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.5504150390625, "rewards/margins": 0.0, "rewards/rejected": -396.5504150390625, "step": 362 }, { "epoch": 3.8210526315789473, "grad_norm": 1.1747747521440033e-06, "learning_rate": 0.0001925263157894737, "logits/chosen": 13.263219833374023, "logits/rejected": 13.263219833374023, "logps/chosen": -4325.8203125, "logps/rejected": -4325.8203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.3971862792969, "rewards/margins": 0.0, "rewards/rejected": -429.3971862792969, "step": 363 }, { "epoch": 3.831578947368421, "grad_norm": 1.5644076256648987e-06, "learning_rate": 0.00019250526315789474, "logits/chosen": 13.276131629943848, "logits/rejected": 13.276131629943848, "logps/chosen": -5173.10595703125, "logps/rejected": -5173.10595703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.3721923828125, "rewards/margins": 0.0, "rewards/rejected": -514.3721923828125, "step": 364 }, { "epoch": 3.8421052631578947, "grad_norm": 1.4643563872596133e-06, "learning_rate": 0.0001924842105263158, "logits/chosen": 13.245011329650879, "logits/rejected": 13.245011329650879, "logps/chosen": -3541.8720703125, "logps/rejected": -3541.8720703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.1613464355469, "rewards/margins": 0.0, "rewards/rejected": -351.1613464355469, "step": 365 }, { "epoch": 3.8526315789473684, "grad_norm": 1.7050610949809197e-06, "learning_rate": 0.00019246315789473687, "logits/chosen": 13.251697540283203, "logits/rejected": 13.251697540283203, "logps/chosen": -4288.4130859375, "logps/rejected": -4288.4130859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.0439453125, "rewards/margins": 0.0, "rewards/rejected": -426.0439453125, "step": 366 }, { "epoch": 3.863157894736842, "grad_norm": 1.2089234360246337e-06, "learning_rate": 0.00019244210526315792, "logits/chosen": 13.23674488067627, "logits/rejected": 13.23674488067627, "logps/chosen": -3995.416015625, "logps/rejected": -3995.416015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.71429443359375, "rewards/margins": 0.0, "rewards/rejected": -396.71429443359375, "step": 367 }, { "epoch": 3.873684210526316, "grad_norm": 1.1628878837655066e-06, "learning_rate": 0.00019242105263157894, "logits/chosen": 13.228681564331055, "logits/rejected": 13.228681564331055, "logps/chosen": -3995.482421875, "logps/rejected": -3995.482421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.7209167480469, "rewards/margins": 0.0, "rewards/rejected": -396.7209167480469, "step": 368 }, { "epoch": 3.8842105263157896, "grad_norm": 1.2132433084843797e-06, "learning_rate": 0.00019240000000000001, "logits/chosen": 13.220160484313965, "logits/rejected": 13.220160484313965, "logps/chosen": -3541.779296875, "logps/rejected": -3541.779296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.1520690917969, "rewards/margins": 0.0, "rewards/rejected": -351.1520690917969, "step": 369 }, { "epoch": 3.8947368421052633, "grad_norm": 1.105313685911824e-06, "learning_rate": 0.00019237894736842106, "logits/chosen": 13.209223747253418, "logits/rejected": 13.209223747253418, "logps/chosen": -3995.791015625, "logps/rejected": -3995.791015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.7518005371094, "rewards/margins": 0.0, "rewards/rejected": -396.7518005371094, "step": 370 }, { "epoch": 3.905263157894737, "grad_norm": 1.171314693237946e-06, "learning_rate": 0.0001923578947368421, "logits/chosen": 13.208657264709473, "logits/rejected": 13.208657264709473, "logps/chosen": -4289.1083984375, "logps/rejected": -4289.1083984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.1134948730469, "rewards/margins": 0.0, "rewards/rejected": -426.1134948730469, "step": 371 }, { "epoch": 3.9157894736842103, "grad_norm": 1.3756163070866023e-06, "learning_rate": 0.00019233684210526316, "logits/chosen": 13.194493293762207, "logits/rejected": 13.194493293762207, "logps/chosen": -4877.82177734375, "logps/rejected": -4877.82177734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.9350280761719, "rewards/margins": 0.0, "rewards/rejected": -484.9350280761719, "step": 372 }, { "epoch": 3.9263157894736844, "grad_norm": 1.2630104038180434e-06, "learning_rate": 0.00019231578947368424, "logits/chosen": 13.204888343811035, "logits/rejected": 13.204888343811035, "logps/chosen": -2672.6591796875, "logps/rejected": -2672.6591796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.4843444824219, "rewards/margins": 0.0, "rewards/rejected": -264.4843444824219, "step": 373 }, { "epoch": 3.9368421052631577, "grad_norm": 1.0803781833601533e-06, "learning_rate": 0.0001922947368421053, "logits/chosen": 13.172213554382324, "logits/rejected": 13.172213554382324, "logps/chosen": -3997.701171875, "logps/rejected": -3997.701171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.94281005859375, "rewards/margins": 0.0, "rewards/rejected": -396.94281005859375, "step": 374 }, { "epoch": 3.9473684210526314, "grad_norm": 1.0733593853728962e-06, "learning_rate": 0.0001922736842105263, "logits/chosen": 13.162105560302734, "logits/rejected": 13.162105560302734, "logps/chosen": -3997.857421875, "logps/rejected": -3997.857421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.95843505859375, "rewards/margins": 0.0, "rewards/rejected": -396.95843505859375, "step": 375 }, { "epoch": 3.957894736842105, "grad_norm": 9.741016810949077e-07, "learning_rate": 0.0001922526315789474, "logits/chosen": 13.156563758850098, "logits/rejected": 13.156563758850098, "logps/chosen": -3757.431640625, "logps/rejected": -3757.431640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.8248596191406, "rewards/margins": 0.0, "rewards/rejected": -372.8248596191406, "step": 376 }, { "epoch": 3.968421052631579, "grad_norm": 1.3226389228293556e-06, "learning_rate": 0.00019223157894736844, "logits/chosen": 13.146832466125488, "logits/rejected": 13.146832466125488, "logps/chosen": -4877.81298828125, "logps/rejected": -4877.81298828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.93414306640625, "rewards/margins": 0.0, "rewards/rejected": -484.93414306640625, "step": 377 }, { "epoch": 3.9789473684210526, "grad_norm": 1.6002762777134194e-06, "learning_rate": 0.00019221052631578949, "logits/chosen": 13.164546966552734, "logits/rejected": 13.164546966552734, "logps/chosen": -5172.0595703125, "logps/rejected": -5172.0595703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.2675170898438, "rewards/margins": 0.0, "rewards/rejected": -514.2675170898438, "step": 378 }, { "epoch": 3.9894736842105263, "grad_norm": 1.0010132882598555e-06, "learning_rate": 0.00019218947368421053, "logits/chosen": 13.131548881530762, "logits/rejected": 13.131548881530762, "logps/chosen": -3999.15234375, "logps/rejected": -3999.15234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0879211425781, "rewards/margins": 0.0, "rewards/rejected": -397.0879211425781, "step": 379 }, { "epoch": 4.0, "grad_norm": 1.7763335335985175e-06, "learning_rate": 0.0001921684210526316, "logits/chosen": 13.159083366394043, "logits/rejected": 13.159083366394043, "logps/chosen": -5171.75732421875, "logps/rejected": -5171.75732421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.2373046875, "rewards/margins": 0.0, "rewards/rejected": -514.2373046875, "step": 380 }, { "epoch": 4.010526315789473, "grad_norm": 1.333224417976453e-06, "learning_rate": 0.00019214736842105263, "logits/chosen": 13.134832382202148, "logits/rejected": 13.134832382202148, "logps/chosen": -4878.49365234375, "logps/rejected": -4878.49365234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.002197265625, "rewards/margins": 0.0, "rewards/rejected": -485.002197265625, "step": 381 }, { "epoch": 4.021052631578947, "grad_norm": 1.2603699133251212e-06, "learning_rate": 0.00019212631578947368, "logits/chosen": 13.1292724609375, "logits/rejected": 13.1292724609375, "logps/chosen": -3999.875, "logps/rejected": -3999.875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1601867675781, "rewards/margins": 0.0, "rewards/rejected": -397.1601867675781, "step": 382 }, { "epoch": 4.031578947368421, "grad_norm": 2.4716996449569706e-06, "learning_rate": 0.00019210526315789473, "logits/chosen": 13.159635543823242, "logits/rejected": 13.159635543823242, "logps/chosen": -5172.27783203125, "logps/rejected": -5172.27783203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.2893676757812, "rewards/margins": 0.0, "rewards/rejected": -514.2893676757812, "step": 383 }, { "epoch": 4.042105263157895, "grad_norm": 1.8309251572645735e-06, "learning_rate": 0.0001920842105263158, "logits/chosen": 13.128382682800293, "logits/rejected": 13.128382682800293, "logps/chosen": -3540.001953125, "logps/rejected": -3540.001953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -350.9743347167969, "rewards/margins": 0.0, "rewards/rejected": -350.9743347167969, "step": 384 }, { "epoch": 4.052631578947368, "grad_norm": 1.743502366480243e-06, "learning_rate": 0.00019206315789473686, "logits/chosen": 13.158990859985352, "logits/rejected": 13.158990859985352, "logps/chosen": -5173.2236328125, "logps/rejected": -5173.2236328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.3839721679688, "rewards/margins": 0.0, "rewards/rejected": -514.3839721679688, "step": 385 }, { "epoch": 4.063157894736842, "grad_norm": 1.4177874163578963e-06, "learning_rate": 0.0001920421052631579, "logits/chosen": 13.139883995056152, "logits/rejected": 13.139883995056152, "logps/chosen": -4288.00390625, "logps/rejected": -4288.00390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.0030212402344, "rewards/margins": 0.0, "rewards/rejected": -426.0030212402344, "step": 386 }, { "epoch": 4.073684210526316, "grad_norm": 1.272582608180528e-06, "learning_rate": 0.00019202105263157896, "logits/chosen": 13.142391204833984, "logits/rejected": 13.142391204833984, "logps/chosen": -4288.1630859375, "logps/rejected": -4288.1630859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.0189514160156, "rewards/margins": 0.0, "rewards/rejected": -426.0189514160156, "step": 387 }, { "epoch": 4.08421052631579, "grad_norm": 1.4503170859825332e-06, "learning_rate": 0.000192, "logits/chosen": 13.14102840423584, "logits/rejected": 13.14102840423584, "logps/chosen": -4879.34765625, "logps/rejected": -4879.34765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.0876159667969, "rewards/margins": 0.0, "rewards/rejected": -485.0876159667969, "step": 388 }, { "epoch": 4.094736842105263, "grad_norm": 1.0536674608374597e-06, "learning_rate": 0.00019197894736842105, "logits/chosen": 13.139214515686035, "logits/rejected": 13.139214515686035, "logps/chosen": -3999.828125, "logps/rejected": -3999.828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1554870605469, "rewards/margins": 0.0, "rewards/rejected": -397.1554870605469, "step": 389 }, { "epoch": 4.105263157894737, "grad_norm": 1.036173443935695e-06, "learning_rate": 0.0001919578947368421, "logits/chosen": 13.14235782623291, "logits/rejected": 13.14235782623291, "logps/chosen": -3999.95703125, "logps/rejected": -3999.95703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.16839599609375, "rewards/margins": 0.0, "rewards/rejected": -397.16839599609375, "step": 390 }, { "epoch": 4.11578947368421, "grad_norm": 1.262857836081821e-06, "learning_rate": 0.00019193684210526318, "logits/chosen": 13.149985313415527, "logits/rejected": 13.149985313415527, "logps/chosen": -4879.71923828125, "logps/rejected": -4879.71923828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.124755859375, "rewards/margins": 0.0, "rewards/rejected": -485.124755859375, "step": 391 }, { "epoch": 4.126315789473685, "grad_norm": 1.1243696462770458e-06, "learning_rate": 0.00019191578947368423, "logits/chosen": 13.153663635253906, "logits/rejected": 13.153663635253906, "logps/chosen": -4879.74462890625, "logps/rejected": -4879.74462890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.1272888183594, "rewards/margins": 0.0, "rewards/rejected": -485.1272888183594, "step": 392 }, { "epoch": 4.136842105263158, "grad_norm": 1.6817705272842431e-06, "learning_rate": 0.00019189473684210528, "logits/chosen": 13.16176986694336, "logits/rejected": 13.16176986694336, "logps/chosen": -4288.1015625, "logps/rejected": -4288.1015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.0127868652344, "rewards/margins": 0.0, "rewards/rejected": -426.0127868652344, "step": 393 }, { "epoch": 4.147368421052631, "grad_norm": 1.2323616829235107e-06, "learning_rate": 0.00019187368421052633, "logits/chosen": 13.162755966186523, "logits/rejected": 13.162755966186523, "logps/chosen": -4880.0029296875, "logps/rejected": -4880.0029296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.15313720703125, "rewards/margins": 0.0, "rewards/rejected": -485.15313720703125, "step": 394 }, { "epoch": 4.157894736842105, "grad_norm": 1.7906470475281822e-06, "learning_rate": 0.00019185263157894738, "logits/chosen": 13.166156768798828, "logits/rejected": 13.166156768798828, "logps/chosen": -3756.595703125, "logps/rejected": -3756.595703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.74127197265625, "rewards/margins": 0.0, "rewards/rejected": -372.74127197265625, "step": 395 }, { "epoch": 4.168421052631579, "grad_norm": 1.3215309309089207e-06, "learning_rate": 0.00019183157894736843, "logits/chosen": 13.161626815795898, "logits/rejected": 13.161626815795898, "logps/chosen": -3999.72265625, "logps/rejected": -3999.72265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.14495849609375, "rewards/margins": 0.0, "rewards/rejected": -397.14495849609375, "step": 396 }, { "epoch": 4.178947368421053, "grad_norm": 1.270137659048487e-06, "learning_rate": 0.00019181052631578948, "logits/chosen": 13.1672945022583, "logits/rejected": 13.1672945022583, "logps/chosen": -4880.52880859375, "logps/rejected": -4880.52880859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.2057189941406, "rewards/margins": 0.0, "rewards/rejected": -485.2057189941406, "step": 397 }, { "epoch": 4.189473684210526, "grad_norm": 2.2139588509162422e-06, "learning_rate": 0.00019178947368421055, "logits/chosen": 13.173166275024414, "logits/rejected": 13.173166275024414, "logps/chosen": -2964.701171875, "logps/rejected": -2964.701171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.7439270019531, "rewards/margins": 0.0, "rewards/rejected": -293.7439270019531, "step": 398 }, { "epoch": 4.2, "grad_norm": 1.464019192098931e-06, "learning_rate": 0.0001917684210526316, "logits/chosen": 13.16378402709961, "logits/rejected": 13.16378402709961, "logps/chosen": -3774.458984375, "logps/rejected": -3774.458984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.5767822265625, "rewards/margins": 0.0, "rewards/rejected": -374.5767822265625, "step": 399 }, { "epoch": 4.2105263157894735, "grad_norm": 1.2820212305086898e-06, "learning_rate": 0.00019174736842105262, "logits/chosen": 13.157520294189453, "logits/rejected": 13.157520294189453, "logps/chosen": -4881.1240234375, "logps/rejected": -4881.1240234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.2652282714844, "rewards/margins": 0.0, "rewards/rejected": -485.2652282714844, "step": 400 }, { "epoch": 4.2105263157894735, "eval_logits/chosen": 13.169970512390137, "eval_logits/rejected": 13.169970512390137, "eval_logps/chosen": -4308.177734375, "eval_logps/rejected": -4308.177734375, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -427.91461181640625, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -427.91461181640625, "eval_runtime": 4.6141, "eval_samples_per_second": 2.167, "eval_steps_per_second": 2.167, "step": 400 }, { "epoch": 4.221052631578948, "grad_norm": 1.2124367003707448e-06, "learning_rate": 0.0001917263157894737, "logits/chosen": 13.173371315002441, "logits/rejected": 13.173371315002441, "logps/chosen": -2669.56640625, "logps/rejected": -2669.56640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.175048828125, "rewards/margins": 0.0, "rewards/rejected": -264.175048828125, "step": 401 }, { "epoch": 4.231578947368421, "grad_norm": 2.1893995381105924e-06, "learning_rate": 0.00019170526315789475, "logits/chosen": 13.178827285766602, "logits/rejected": 13.178827285766602, "logps/chosen": -5173.55859375, "logps/rejected": -5173.55859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.4174194335938, "rewards/margins": 0.0, "rewards/rejected": -514.4174194335938, "step": 402 }, { "epoch": 4.242105263157895, "grad_norm": 1.2583254829223733e-06, "learning_rate": 0.0001916842105263158, "logits/chosen": 13.161089897155762, "logits/rejected": 13.161089897155762, "logps/chosen": -4288.7822265625, "logps/rejected": -4288.7822265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.08087158203125, "rewards/margins": 0.0, "rewards/rejected": -426.08087158203125, "step": 403 }, { "epoch": 4.252631578947368, "grad_norm": 1.2309641306273988e-06, "learning_rate": 0.00019166315789473685, "logits/chosen": 13.155193328857422, "logits/rejected": 13.155193328857422, "logps/chosen": -3999.55859375, "logps/rejected": -3999.55859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1285400390625, "rewards/margins": 0.0, "rewards/rejected": -397.1285400390625, "step": 404 }, { "epoch": 4.2631578947368425, "grad_norm": 1.2718624020635616e-06, "learning_rate": 0.00019164210526315792, "logits/chosen": 13.167367935180664, "logits/rejected": 13.167367935180664, "logps/chosen": -3775.71875, "logps/rejected": -3775.71875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.7027587890625, "rewards/margins": 0.0, "rewards/rejected": -374.7027587890625, "step": 405 }, { "epoch": 4.273684210526316, "grad_norm": 1.0539905588302645e-06, "learning_rate": 0.00019162105263157895, "logits/chosen": 13.168222427368164, "logits/rejected": 13.168222427368164, "logps/chosen": -4881.8994140625, "logps/rejected": -4881.8994140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.3427734375, "rewards/margins": 0.0, "rewards/rejected": -485.3427734375, "step": 406 }, { "epoch": 4.284210526315789, "grad_norm": 1.0759672477433924e-06, "learning_rate": 0.0001916, "logits/chosen": 13.163827896118164, "logits/rejected": 13.163827896118164, "logps/chosen": -3999.337890625, "logps/rejected": -3999.337890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1064758300781, "rewards/margins": 0.0, "rewards/rejected": -397.1064758300781, "step": 407 }, { "epoch": 4.294736842105263, "grad_norm": 1.7650302197580459e-06, "learning_rate": 0.00019157894736842104, "logits/chosen": 13.197267532348633, "logits/rejected": 13.197267532348633, "logps/chosen": -5173.19287109375, "logps/rejected": -5173.19287109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.380859375, "rewards/margins": 0.0, "rewards/rejected": -514.380859375, "step": 408 }, { "epoch": 4.3052631578947365, "grad_norm": 2.2256742795434548e-06, "learning_rate": 0.00019155789473684212, "logits/chosen": 13.2003812789917, "logits/rejected": 13.2003812789917, "logps/chosen": -5173.109375, "logps/rejected": -5173.109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.3724975585938, "rewards/margins": 0.0, "rewards/rejected": -514.3724975585938, "step": 409 }, { "epoch": 4.315789473684211, "grad_norm": 2.0965801468264544e-06, "learning_rate": 0.00019153684210526317, "logits/chosen": 13.176103591918945, "logits/rejected": 13.176103591918945, "logps/chosen": -3756.27734375, "logps/rejected": -3756.27734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.7094421386719, "rewards/margins": 0.0, "rewards/rejected": -372.7094421386719, "step": 410 }, { "epoch": 4.326315789473684, "grad_norm": 1.487669351263321e-06, "learning_rate": 0.00019151578947368422, "logits/chosen": 13.169191360473633, "logits/rejected": 13.169191360473633, "logps/chosen": -3998.92578125, "logps/rejected": -3998.92578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0652770996094, "rewards/margins": 0.0, "rewards/rejected": -397.0652770996094, "step": 411 }, { "epoch": 4.336842105263158, "grad_norm": 1.2760125400745892e-06, "learning_rate": 0.0001914947368421053, "logits/chosen": 13.163147926330566, "logits/rejected": 13.163147926330566, "logps/chosen": -3998.853515625, "logps/rejected": -3998.853515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.05804443359375, "rewards/margins": 0.0, "rewards/rejected": -397.05804443359375, "step": 412 }, { "epoch": 4.347368421052631, "grad_norm": 1.5911463151496719e-06, "learning_rate": 0.00019147368421052632, "logits/chosen": 13.163750648498535, "logits/rejected": 13.163750648498535, "logps/chosen": -4288.0498046875, "logps/rejected": -4288.0498046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.00762939453125, "rewards/margins": 0.0, "rewards/rejected": -426.00762939453125, "step": 413 }, { "epoch": 4.3578947368421055, "grad_norm": 1.0120020306203514e-06, "learning_rate": 0.00019145263157894737, "logits/chosen": 13.14409351348877, "logits/rejected": 13.14409351348877, "logps/chosen": -3999.26171875, "logps/rejected": -3999.26171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0988464355469, "rewards/margins": 0.0, "rewards/rejected": -397.0988464355469, "step": 414 }, { "epoch": 4.368421052631579, "grad_norm": 1.0539432651057723e-06, "learning_rate": 0.00019143157894736842, "logits/chosen": 13.138327598571777, "logits/rejected": 13.138327598571777, "logps/chosen": -3756.953125, "logps/rejected": -3756.953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.7770080566406, "rewards/margins": 0.0, "rewards/rejected": -372.7770080566406, "step": 415 }, { "epoch": 4.378947368421053, "grad_norm": 1.5022621937532676e-06, "learning_rate": 0.0001914105263157895, "logits/chosen": 13.130035400390625, "logits/rejected": 13.130035400390625, "logps/chosen": -4881.49169921875, "logps/rejected": -4881.49169921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.302001953125, "rewards/margins": 0.0, "rewards/rejected": -485.302001953125, "step": 416 }, { "epoch": 4.389473684210526, "grad_norm": 1.2897043006887543e-06, "learning_rate": 0.00019138947368421054, "logits/chosen": 13.139795303344727, "logits/rejected": 13.139795303344727, "logps/chosen": -2670.173828125, "logps/rejected": -2670.173828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.2358093261719, "rewards/margins": 0.0, "rewards/rejected": -264.2358093261719, "step": 417 }, { "epoch": 4.4, "grad_norm": 1.2354365708233672e-06, "learning_rate": 0.0001913684210526316, "logits/chosen": 13.133476257324219, "logits/rejected": 13.133476257324219, "logps/chosen": -2670.3125, "logps/rejected": -2670.3125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.2496643066406, "rewards/margins": 0.0, "rewards/rejected": -264.2496643066406, "step": 418 }, { "epoch": 4.410526315789474, "grad_norm": 9.402820637660625e-07, "learning_rate": 0.00019134736842105264, "logits/chosen": 13.107630729675293, "logits/rejected": 13.107630729675293, "logps/chosen": -3756.966796875, "logps/rejected": -3756.966796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.77838134765625, "rewards/margins": 0.0, "rewards/rejected": -372.77838134765625, "step": 419 }, { "epoch": 4.421052631578947, "grad_norm": 1.1259535313001834e-06, "learning_rate": 0.0001913263157894737, "logits/chosen": 13.110640525817871, "logits/rejected": 13.110640525817871, "logps/chosen": -2965.091796875, "logps/rejected": -2965.091796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.7829895019531, "rewards/margins": 0.0, "rewards/rejected": -293.7829895019531, "step": 420 }, { "epoch": 4.431578947368421, "grad_norm": 2.4301314169861143e-06, "learning_rate": 0.00019130526315789474, "logits/chosen": 13.100411415100098, "logits/rejected": 13.100411415100098, "logps/chosen": -4320.720703125, "logps/rejected": -4320.720703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -428.88720703125, "rewards/margins": 0.0, "rewards/rejected": -428.88720703125, "step": 421 }, { "epoch": 4.442105263157894, "grad_norm": 2.2063109099690337e-06, "learning_rate": 0.0001912842105263158, "logits/chosen": 13.100083351135254, "logits/rejected": 13.100083351135254, "logps/chosen": -4321.380859375, "logps/rejected": -4321.380859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -428.9532165527344, "rewards/margins": 0.0, "rewards/rejected": -428.9532165527344, "step": 422 }, { "epoch": 4.4526315789473685, "grad_norm": 1.871135395958845e-06, "learning_rate": 0.00019126315789473686, "logits/chosen": 13.125321388244629, "logits/rejected": 13.125321388244629, "logps/chosen": -5172.970703125, "logps/rejected": -5172.970703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.358642578125, "rewards/margins": 0.0, "rewards/rejected": -514.358642578125, "step": 423 }, { "epoch": 4.463157894736842, "grad_norm": 1.4180277503328398e-06, "learning_rate": 0.00019124210526315791, "logits/chosen": 13.104350090026855, "logits/rejected": 13.104350090026855, "logps/chosen": -3538.36328125, "logps/rejected": -3538.36328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -350.8104553222656, "rewards/margins": 0.0, "rewards/rejected": -350.8104553222656, "step": 424 }, { "epoch": 4.473684210526316, "grad_norm": 1.996656692426768e-06, "learning_rate": 0.00019122105263157896, "logits/chosen": 13.137791633605957, "logits/rejected": 13.137791633605957, "logps/chosen": -2671.154296875, "logps/rejected": -2671.154296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.3338317871094, "rewards/margins": 0.0, "rewards/rejected": -264.3338317871094, "step": 425 }, { "epoch": 4.484210526315789, "grad_norm": 1.4522890978696523e-06, "learning_rate": 0.0001912, "logits/chosen": 13.131717681884766, "logits/rejected": 13.131717681884766, "logps/chosen": -4880.13671875, "logps/rejected": -4880.13671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.16650390625, "rewards/margins": 0.0, "rewards/rejected": -485.16650390625, "step": 426 }, { "epoch": 4.494736842105263, "grad_norm": 1.9031556348636514e-06, "learning_rate": 0.00019117894736842106, "logits/chosen": 13.130173683166504, "logits/rejected": 13.130173683166504, "logps/chosen": -3539.03125, "logps/rejected": -3539.03125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -350.87725830078125, "rewards/margins": 0.0, "rewards/rejected": -350.87725830078125, "step": 427 }, { "epoch": 4.505263157894737, "grad_norm": 2.2110975805844646e-06, "learning_rate": 0.0001911578947368421, "logits/chosen": 13.150248527526855, "logits/rejected": 13.150248527526855, "logps/chosen": -2966.357421875, "logps/rejected": -2966.357421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.9095458984375, "rewards/margins": 0.0, "rewards/rejected": -293.9095458984375, "step": 428 }, { "epoch": 4.515789473684211, "grad_norm": 1.2973713410247e-06, "learning_rate": 0.00019113684210526316, "logits/chosen": 13.148184776306152, "logits/rejected": 13.148184776306152, "logps/chosen": -3776.3828125, "logps/rejected": -3776.3828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.7691650390625, "rewards/margins": 0.0, "rewards/rejected": -374.7691650390625, "step": 429 }, { "epoch": 4.526315789473684, "grad_norm": 9.659921715865494e-07, "learning_rate": 0.00019111578947368424, "logits/chosen": 13.14145278930664, "logits/rejected": 13.14145278930664, "logps/chosen": -3540.23828125, "logps/rejected": -3540.23828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -350.9979553222656, "rewards/margins": 0.0, "rewards/rejected": -350.9979553222656, "step": 430 }, { "epoch": 4.536842105263158, "grad_norm": 9.392854849465948e-07, "learning_rate": 0.00019109473684210529, "logits/chosen": 13.14527416229248, "logits/rejected": 13.14527416229248, "logps/chosen": -3540.732421875, "logps/rejected": -3540.732421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.04736328125, "rewards/margins": 0.0, "rewards/rejected": -351.04736328125, "step": 431 }, { "epoch": 4.5473684210526315, "grad_norm": 1.0593016668281052e-06, "learning_rate": 0.0001910736842105263, "logits/chosen": 13.15092945098877, "logits/rejected": 13.15092945098877, "logps/chosen": -3541.126953125, "logps/rejected": -3541.126953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.0868225097656, "rewards/margins": 0.0, "rewards/rejected": -351.0868225097656, "step": 432 }, { "epoch": 4.557894736842105, "grad_norm": 1.668764070927864e-06, "learning_rate": 0.00019105263157894738, "logits/chosen": 13.164057731628418, "logits/rejected": 13.164057731628418, "logps/chosen": -3758.6455078125, "logps/rejected": -3758.6455078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.9462585449219, "rewards/margins": 0.0, "rewards/rejected": -372.9462585449219, "step": 433 }, { "epoch": 4.568421052631579, "grad_norm": 7.811093496457033e-07, "learning_rate": 0.00019103157894736843, "logits/chosen": 13.192194938659668, "logits/rejected": 13.192194938659668, "logps/chosen": -2673.822265625, "logps/rejected": -2673.822265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.60064697265625, "rewards/margins": 0.0, "rewards/rejected": -264.60064697265625, "step": 434 }, { "epoch": 4.578947368421053, "grad_norm": 3.6859380543319276e-06, "learning_rate": 0.00019101052631578948, "logits/chosen": 13.189213752746582, "logits/rejected": 13.189213752746582, "logps/chosen": -4325.8125, "logps/rejected": -4325.8125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.3963928222656, "rewards/margins": 0.0, "rewards/rejected": -429.3963928222656, "step": 435 }, { "epoch": 4.589473684210526, "grad_norm": 1.823381353460718e-06, "learning_rate": 0.00019098947368421053, "logits/chosen": 13.189521789550781, "logits/rejected": 13.189521789550781, "logps/chosen": -3997.427734375, "logps/rejected": -3997.427734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.91546630859375, "rewards/margins": 0.0, "rewards/rejected": -396.91546630859375, "step": 436 }, { "epoch": 4.6, "grad_norm": 8.962330753092829e-07, "learning_rate": 0.0001909684210526316, "logits/chosen": 13.202988624572754, "logits/rejected": 13.202988624572754, "logps/chosen": -3543.3759765625, "logps/rejected": -3543.3759765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3117370605469, "rewards/margins": 0.0, "rewards/rejected": -351.3117370605469, "step": 437 }, { "epoch": 4.610526315789474, "grad_norm": 1.3767996733804466e-06, "learning_rate": 0.00019094736842105263, "logits/chosen": 13.221596717834473, "logits/rejected": 13.221596717834473, "logps/chosen": -3777.140625, "logps/rejected": -3777.140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.8449401855469, "rewards/margins": 0.0, "rewards/rejected": -374.8449401855469, "step": 438 }, { "epoch": 4.621052631578947, "grad_norm": 2.3824086383683607e-06, "learning_rate": 0.00019092631578947368, "logits/chosen": 13.229989051818848, "logits/rejected": 13.229989051818848, "logps/chosen": -4288.107421875, "logps/rejected": -4288.107421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.0133972167969, "rewards/margins": 0.0, "rewards/rejected": -426.0133972167969, "step": 439 }, { "epoch": 4.631578947368421, "grad_norm": 2.202197947553941e-06, "learning_rate": 0.00019090526315789473, "logits/chosen": 13.244230270385742, "logits/rejected": 13.244230270385742, "logps/chosen": -2673.4326171875, "logps/rejected": -2673.4326171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.5616760253906, "rewards/margins": 0.0, "rewards/rejected": -264.5616760253906, "step": 440 }, { "epoch": 4.6421052631578945, "grad_norm": 1.6115047856146703e-06, "learning_rate": 0.0001908842105263158, "logits/chosen": 13.221540451049805, "logits/rejected": 13.221540451049805, "logps/chosen": -3544.4091796875, "logps/rejected": -3544.4091796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.4150390625, "rewards/margins": 0.0, "rewards/rejected": -351.4150390625, "step": 441 }, { "epoch": 4.652631578947369, "grad_norm": 1.139371875069628e-06, "learning_rate": 0.00019086315789473686, "logits/chosen": 13.217147827148438, "logits/rejected": 13.217147827148438, "logps/chosen": -3544.6357421875, "logps/rejected": -3544.6357421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.4377136230469, "rewards/margins": 0.0, "rewards/rejected": -351.4377136230469, "step": 442 }, { "epoch": 4.663157894736842, "grad_norm": 1.5991441841833876e-06, "learning_rate": 0.0001908421052631579, "logits/chosen": 13.220754623413086, "logits/rejected": 13.220754623413086, "logps/chosen": -4876.76171875, "logps/rejected": -4876.76171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.8290100097656, "rewards/margins": 0.0, "rewards/rejected": -484.8290100097656, "step": 443 }, { "epoch": 4.673684210526316, "grad_norm": 8.980619554677105e-07, "learning_rate": 0.00019082105263157895, "logits/chosen": 13.223969459533691, "logits/rejected": 13.223969459533691, "logps/chosen": -2969.2216796875, "logps/rejected": -2969.2216796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.19598388671875, "rewards/margins": 0.0, "rewards/rejected": -294.19598388671875, "step": 444 }, { "epoch": 4.684210526315789, "grad_norm": 3.341406909385114e-06, "learning_rate": 0.0001908, "logits/chosen": 13.24233627319336, "logits/rejected": 13.24233627319336, "logps/chosen": -5173.033203125, "logps/rejected": -5173.033203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.3649291992188, "rewards/margins": 0.0, "rewards/rejected": -514.3649291992188, "step": 445 }, { "epoch": 4.6947368421052635, "grad_norm": 8.327181717504573e-07, "learning_rate": 0.00019077894736842105, "logits/chosen": 13.230820655822754, "logits/rejected": 13.230820655822754, "logps/chosen": -2969.14453125, "logps/rejected": -2969.14453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.1882629394531, "rewards/margins": 0.0, "rewards/rejected": -294.1882629394531, "step": 446 }, { "epoch": 4.705263157894737, "grad_norm": 9.506024412075931e-07, "learning_rate": 0.0001907578947368421, "logits/chosen": 13.245979309082031, "logits/rejected": 13.245979309082031, "logps/chosen": -2674.6337890625, "logps/rejected": -2674.6337890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.6817932128906, "rewards/margins": 0.0, "rewards/rejected": -264.6817932128906, "step": 447 }, { "epoch": 4.715789473684211, "grad_norm": 7.398201091746159e-07, "learning_rate": 0.00019073684210526318, "logits/chosen": 13.229601860046387, "logits/rejected": 13.229601860046387, "logps/chosen": -3545.4248046875, "logps/rejected": -3545.4248046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.5166015625, "rewards/margins": 0.0, "rewards/rejected": -351.5166015625, "step": 448 }, { "epoch": 4.726315789473684, "grad_norm": 2.0775814846274443e-06, "learning_rate": 0.00019071578947368423, "logits/chosen": 13.263916015625, "logits/rejected": 13.263916015625, "logps/chosen": -5173.59130859375, "logps/rejected": -5173.59130859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.4207153320312, "rewards/margins": 0.0, "rewards/rejected": -514.4207153320312, "step": 449 }, { "epoch": 4.7368421052631575, "grad_norm": 1.680143554949609e-06, "learning_rate": 0.00019069473684210528, "logits/chosen": 13.272430419921875, "logits/rejected": 13.272430419921875, "logps/chosen": -5173.77880859375, "logps/rejected": -5173.77880859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.439453125, "rewards/margins": 0.0, "rewards/rejected": -514.439453125, "step": 450 }, { "epoch": 4.7368421052631575, "eval_logits/chosen": 13.271191596984863, "eval_logits/rejected": 13.271191596984863, "eval_logps/chosen": -4310.1708984375, "eval_logps/rejected": -4310.1708984375, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -428.11395263671875, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -428.11395263671875, "eval_runtime": 4.6236, "eval_samples_per_second": 2.163, "eval_steps_per_second": 2.163, "step": 450 }, { "epoch": 4.747368421052632, "grad_norm": 9.817190402827691e-07, "learning_rate": 0.00019067368421052633, "logits/chosen": 13.252490043640137, "logits/rejected": 13.252490043640137, "logps/chosen": -3546.0009765625, "logps/rejected": -3546.0009765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.57421875, "rewards/margins": 0.0, "rewards/rejected": -351.57421875, "step": 451 }, { "epoch": 4.757894736842105, "grad_norm": 1.3320783409653814e-06, "learning_rate": 0.00019065263157894737, "logits/chosen": 13.26960563659668, "logits/rejected": 13.26960563659668, "logps/chosen": -3777.662109375, "logps/rejected": -3777.662109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.8970947265625, "rewards/margins": 0.0, "rewards/rejected": -374.8970947265625, "step": 452 }, { "epoch": 4.768421052631579, "grad_norm": 1.573008830746403e-06, "learning_rate": 0.00019063157894736842, "logits/chosen": 13.289057731628418, "logits/rejected": 13.289057731628418, "logps/chosen": -2674.783203125, "logps/rejected": -2674.783203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.6967468261719, "rewards/margins": 0.0, "rewards/rejected": -264.6967468261719, "step": 453 }, { "epoch": 4.778947368421052, "grad_norm": 1.3278061032906407e-06, "learning_rate": 0.00019061052631578947, "logits/chosen": 13.281237602233887, "logits/rejected": 13.281237602233887, "logps/chosen": -4875.6162109375, "logps/rejected": -4875.6162109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.7144470214844, "rewards/margins": 0.0, "rewards/rejected": -484.7144470214844, "step": 454 }, { "epoch": 4.7894736842105265, "grad_norm": 1.0953923492706963e-06, "learning_rate": 0.00019058947368421055, "logits/chosen": 13.275703430175781, "logits/rejected": 13.275703430175781, "logps/chosen": -3546.6806640625, "logps/rejected": -3546.6806640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.6421813964844, "rewards/margins": 0.0, "rewards/rejected": -351.6421813964844, "step": 455 }, { "epoch": 4.8, "grad_norm": 1.286256861021684e-06, "learning_rate": 0.0001905684210526316, "logits/chosen": 13.285297393798828, "logits/rejected": 13.285297393798828, "logps/chosen": -3777.6103515625, "logps/rejected": -3777.6103515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.8919372558594, "rewards/margins": 0.0, "rewards/rejected": -374.8919372558594, "step": 456 }, { "epoch": 4.810526315789474, "grad_norm": 1.295603055950778e-06, "learning_rate": 0.00019054736842105262, "logits/chosen": 13.311622619628906, "logits/rejected": 13.311622619628906, "logps/chosen": -5175.2099609375, "logps/rejected": -5175.2099609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.5825805664062, "rewards/margins": 0.0, "rewards/rejected": -514.5825805664062, "step": 457 }, { "epoch": 4.821052631578947, "grad_norm": 1.3328030945558567e-06, "learning_rate": 0.0001905263157894737, "logits/chosen": 13.314613342285156, "logits/rejected": 13.314613342285156, "logps/chosen": -5175.31396484375, "logps/rejected": -5175.31396484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.5929565429688, "rewards/margins": 0.0, "rewards/rejected": -514.5929565429688, "step": 458 }, { "epoch": 4.831578947368421, "grad_norm": 1.3567895393862273e-06, "learning_rate": 0.00019050526315789475, "logits/chosen": 13.295086860656738, "logits/rejected": 13.295086860656738, "logps/chosen": -4875.93017578125, "logps/rejected": -4875.93017578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.745849609375, "rewards/margins": 0.0, "rewards/rejected": -484.745849609375, "step": 459 }, { "epoch": 4.842105263157895, "grad_norm": 1.2604548373928992e-06, "learning_rate": 0.0001904842105263158, "logits/chosen": 13.324274063110352, "logits/rejected": 13.324274063110352, "logps/chosen": -5176.4375, "logps/rejected": -5176.4375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.705322265625, "rewards/margins": 0.0, "rewards/rejected": -514.705322265625, "step": 460 }, { "epoch": 4.852631578947369, "grad_norm": 1.3131174227964948e-06, "learning_rate": 0.00019046315789473685, "logits/chosen": 13.296370506286621, "logits/rejected": 13.296370506286621, "logps/chosen": -3993.275390625, "logps/rejected": -3993.275390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.5002136230469, "rewards/margins": 0.0, "rewards/rejected": -396.5002136230469, "step": 461 }, { "epoch": 4.863157894736842, "grad_norm": 1.2121271311116288e-06, "learning_rate": 0.00019044210526315792, "logits/chosen": 13.317803382873535, "logits/rejected": 13.317803382873535, "logps/chosen": -2673.90234375, "logps/rejected": -2673.90234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.608642578125, "rewards/margins": 0.0, "rewards/rejected": -264.608642578125, "step": 462 }, { "epoch": 4.873684210526315, "grad_norm": 1.172584347841621e-06, "learning_rate": 0.00019042105263157897, "logits/chosen": 13.31067180633545, "logits/rejected": 13.31067180633545, "logps/chosen": -4328.0791015625, "logps/rejected": -4328.0791015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.623046875, "rewards/margins": 0.0, "rewards/rejected": -429.623046875, "step": 463 }, { "epoch": 4.88421052631579, "grad_norm": 1.28936221699405e-06, "learning_rate": 0.0001904, "logits/chosen": 13.292346954345703, "logits/rejected": 13.292346954345703, "logps/chosen": -3993.26953125, "logps/rejected": -3993.26953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.4996337890625, "rewards/margins": 0.0, "rewards/rejected": -396.4996337890625, "step": 464 }, { "epoch": 4.894736842105263, "grad_norm": 1.2857520914622e-06, "learning_rate": 0.00019037894736842107, "logits/chosen": 13.285961151123047, "logits/rejected": 13.285961151123047, "logps/chosen": -3993.365234375, "logps/rejected": -3993.365234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.50921630859375, "rewards/margins": 0.0, "rewards/rejected": -396.50921630859375, "step": 465 }, { "epoch": 4.905263157894737, "grad_norm": 1.0654831612555427e-06, "learning_rate": 0.00019035789473684212, "logits/chosen": 13.28244686126709, "logits/rejected": 13.28244686126709, "logps/chosen": -3757.052734375, "logps/rejected": -3757.052734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.7869567871094, "rewards/margins": 0.0, "rewards/rejected": -372.7869567871094, "step": 466 }, { "epoch": 4.91578947368421, "grad_norm": 1.2411975376380724e-06, "learning_rate": 0.00019033684210526317, "logits/chosen": 13.274484634399414, "logits/rejected": 13.274484634399414, "logps/chosen": -4877.8818359375, "logps/rejected": -4877.8818359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.9410095214844, "rewards/margins": 0.0, "rewards/rejected": -484.9410095214844, "step": 467 }, { "epoch": 4.926315789473684, "grad_norm": 1.0312362519471208e-06, "learning_rate": 0.00019031578947368422, "logits/chosen": 13.256425857543945, "logits/rejected": 13.256425857543945, "logps/chosen": -3545.3017578125, "logps/rejected": -3545.3017578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.5043029785156, "rewards/margins": 0.0, "rewards/rejected": -351.5043029785156, "step": 468 }, { "epoch": 4.936842105263158, "grad_norm": 1.257968733625603e-06, "learning_rate": 0.0001902947368421053, "logits/chosen": 13.255985260009766, "logits/rejected": 13.255985260009766, "logps/chosen": -3777.646484375, "logps/rejected": -3777.646484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.8955383300781, "rewards/margins": 0.0, "rewards/rejected": -374.8955383300781, "step": 469 }, { "epoch": 4.947368421052632, "grad_norm": 1.187660927826073e-06, "learning_rate": 0.00019027368421052632, "logits/chosen": 13.241244316101074, "logits/rejected": 13.241244316101074, "logps/chosen": -3995.07421875, "logps/rejected": -3995.07421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.68011474609375, "rewards/margins": 0.0, "rewards/rejected": -396.68011474609375, "step": 470 }, { "epoch": 4.957894736842105, "grad_norm": 1.061337911778537e-06, "learning_rate": 0.00019025263157894737, "logits/chosen": 13.232675552368164, "logits/rejected": 13.232675552368164, "logps/chosen": -3544.474609375, "logps/rejected": -3544.474609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.4216003417969, "rewards/margins": 0.0, "rewards/rejected": -351.4216003417969, "step": 471 }, { "epoch": 4.968421052631579, "grad_norm": 1.2000216429441934e-06, "learning_rate": 0.00019023157894736841, "logits/chosen": 13.242695808410645, "logits/rejected": 13.242695808410645, "logps/chosen": -2673.0537109375, "logps/rejected": -2673.0537109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.5237731933594, "rewards/margins": 0.0, "rewards/rejected": -264.5237731933594, "step": 472 }, { "epoch": 4.978947368421053, "grad_norm": 1.227028064931801e-06, "learning_rate": 0.0001902105263157895, "logits/chosen": 13.229562759399414, "logits/rejected": 13.229562759399414, "logps/chosen": -4326.953125, "logps/rejected": -4326.953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.5104675292969, "rewards/margins": 0.0, "rewards/rejected": -429.5104675292969, "step": 473 }, { "epoch": 4.989473684210527, "grad_norm": 1.1333920610923087e-06, "learning_rate": 0.00019018947368421054, "logits/chosen": 13.216691017150879, "logits/rejected": 13.216691017150879, "logps/chosen": -2966.9140625, "logps/rejected": -2966.9140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.9652099609375, "rewards/margins": 0.0, "rewards/rejected": -293.9652099609375, "step": 474 }, { "epoch": 5.0, "grad_norm": 1.122456751545542e-06, "learning_rate": 0.0001901684210526316, "logits/chosen": 13.198467254638672, "logits/rejected": 13.198467254638672, "logps/chosen": -3997.0859375, "logps/rejected": -3997.0859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.88128662109375, "rewards/margins": 0.0, "rewards/rejected": -396.88128662109375, "step": 475 }, { "epoch": 5.010526315789473, "grad_norm": 1.1456676247689757e-06, "learning_rate": 0.00019014736842105264, "logits/chosen": 13.1884183883667, "logits/rejected": 13.1884183883667, "logps/chosen": -3997.07421875, "logps/rejected": -3997.07421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.8800964355469, "rewards/margins": 0.0, "rewards/rejected": -396.8800964355469, "step": 476 }, { "epoch": 5.021052631578947, "grad_norm": 9.451536584492715e-07, "learning_rate": 0.0001901263157894737, "logits/chosen": 13.196056365966797, "logits/rejected": 13.196056365966797, "logps/chosen": -2673.443359375, "logps/rejected": -2673.443359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.562744140625, "rewards/margins": 0.0, "rewards/rejected": -264.562744140625, "step": 477 }, { "epoch": 5.031578947368421, "grad_norm": 8.419349342148053e-07, "learning_rate": 0.00019010526315789474, "logits/chosen": 13.166739463806152, "logits/rejected": 13.166739463806152, "logps/chosen": -3544.150390625, "logps/rejected": -3544.150390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.38916015625, "rewards/margins": 0.0, "rewards/rejected": -351.38916015625, "step": 478 }, { "epoch": 5.042105263157895, "grad_norm": 8.346195841113513e-07, "learning_rate": 0.00019008421052631579, "logits/chosen": 13.1581449508667, "logits/rejected": 13.1581449508667, "logps/chosen": -3544.2314453125, "logps/rejected": -3544.2314453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.39727783203125, "rewards/margins": 0.0, "rewards/rejected": -351.39727783203125, "step": 479 }, { "epoch": 5.052631578947368, "grad_norm": 1.2025379874103237e-06, "learning_rate": 0.00019006315789473686, "logits/chosen": 13.158472061157227, "logits/rejected": 13.158472061157227, "logps/chosen": -3778.328125, "logps/rejected": -3778.328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.9637145996094, "rewards/margins": 0.0, "rewards/rejected": -374.9637145996094, "step": 480 }, { "epoch": 5.063157894736842, "grad_norm": 1.1906524832738796e-06, "learning_rate": 0.0001900421052631579, "logits/chosen": 13.154966354370117, "logits/rejected": 13.154966354370117, "logps/chosen": -3778.458984375, "logps/rejected": -3778.458984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.9767761230469, "rewards/margins": 0.0, "rewards/rejected": -374.9767761230469, "step": 481 }, { "epoch": 5.073684210526316, "grad_norm": 8.846835157783062e-07, "learning_rate": 0.00019002105263157896, "logits/chosen": 13.16341495513916, "logits/rejected": 13.16341495513916, "logps/chosen": -2674.466796875, "logps/rejected": -2674.466796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.66510009765625, "rewards/margins": 0.0, "rewards/rejected": -264.66510009765625, "step": 482 }, { "epoch": 5.08421052631579, "grad_norm": 1.1412429330448504e-06, "learning_rate": 0.00019, "logits/chosen": 13.149027824401855, "logits/rejected": 13.149027824401855, "logps/chosen": -3779.0517578125, "logps/rejected": -3779.0517578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -375.03607177734375, "rewards/margins": 0.0, "rewards/rejected": -375.03607177734375, "step": 483 }, { "epoch": 5.094736842105263, "grad_norm": 1.4646574300059e-06, "learning_rate": 0.00018997894736842106, "logits/chosen": 13.149612426757812, "logits/rejected": 13.149612426757812, "logps/chosen": -4877.3212890625, "logps/rejected": -4877.3212890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.8849792480469, "rewards/margins": 0.0, "rewards/rejected": -484.8849792480469, "step": 484 }, { "epoch": 5.105263157894737, "grad_norm": 9.852100220086868e-07, "learning_rate": 0.0001899578947368421, "logits/chosen": 13.141609191894531, "logits/rejected": 13.141609191894531, "logps/chosen": -3999.216796875, "logps/rejected": -3999.216796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0943603515625, "rewards/margins": 0.0, "rewards/rejected": -397.0943603515625, "step": 485 }, { "epoch": 5.11578947368421, "grad_norm": 1.259721102542244e-06, "learning_rate": 0.00018993684210526316, "logits/chosen": 13.151392936706543, "logits/rejected": 13.151392936706543, "logps/chosen": -4877.60009765625, "logps/rejected": -4877.60009765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.912841796875, "rewards/margins": 0.0, "rewards/rejected": -484.912841796875, "step": 486 }, { "epoch": 5.126315789473685, "grad_norm": 1.444942881789757e-06, "learning_rate": 0.00018991578947368423, "logits/chosen": 13.153428077697754, "logits/rejected": 13.153428077697754, "logps/chosen": -4877.8466796875, "logps/rejected": -4877.8466796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.9375, "rewards/margins": 0.0, "rewards/rejected": -484.9375, "step": 487 }, { "epoch": 5.136842105263158, "grad_norm": 1.651527099966188e-06, "learning_rate": 0.00018989473684210528, "logits/chosen": 13.148782730102539, "logits/rejected": 13.148782730102539, "logps/chosen": -3543.2548828125, "logps/rejected": -3543.2548828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.29962158203125, "rewards/margins": 0.0, "rewards/rejected": -351.29962158203125, "step": 488 }, { "epoch": 5.147368421052631, "grad_norm": 2.338605099794222e-06, "learning_rate": 0.0001898736842105263, "logits/chosen": 13.183222770690918, "logits/rejected": 13.183222770690918, "logps/chosen": -5172.541015625, "logps/rejected": -5172.541015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.315673828125, "rewards/margins": 0.0, "rewards/rejected": -514.315673828125, "step": 489 }, { "epoch": 5.157894736842105, "grad_norm": 1.1970715831921552e-06, "learning_rate": 0.00018985263157894738, "logits/chosen": 13.153637886047363, "logits/rejected": 13.153637886047363, "logps/chosen": -3999.62109375, "logps/rejected": -3999.62109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1347961425781, "rewards/margins": 0.0, "rewards/rejected": -397.1347961425781, "step": 490 }, { "epoch": 5.168421052631579, "grad_norm": 1.2377339544400456e-06, "learning_rate": 0.00018983157894736843, "logits/chosen": 13.158660888671875, "logits/rejected": 13.158660888671875, "logps/chosen": -3756.373046875, "logps/rejected": -3756.373046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.718994140625, "rewards/margins": 0.0, "rewards/rejected": -372.718994140625, "step": 491 }, { "epoch": 5.178947368421053, "grad_norm": 1.1482413810881553e-06, "learning_rate": 0.00018981052631578948, "logits/chosen": 13.168063163757324, "logits/rejected": 13.168063163757324, "logps/chosen": -2673.7548828125, "logps/rejected": -2673.7548828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.5939025878906, "rewards/margins": 0.0, "rewards/rejected": -264.5939025878906, "step": 492 }, { "epoch": 5.189473684210526, "grad_norm": 1.0602715292407083e-06, "learning_rate": 0.00018978947368421053, "logits/chosen": 13.145636558532715, "logits/rejected": 13.145636558532715, "logps/chosen": -3999.296875, "logps/rejected": -3999.296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1023864746094, "rewards/margins": 0.0, "rewards/rejected": -397.1023864746094, "step": 493 }, { "epoch": 5.2, "grad_norm": 2.178872819058597e-06, "learning_rate": 0.0001897684210526316, "logits/chosen": 13.171972274780273, "logits/rejected": 13.171972274780273, "logps/chosen": -5172.416015625, "logps/rejected": -5172.416015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.3031616210938, "rewards/margins": 0.0, "rewards/rejected": -514.3031616210938, "step": 494 }, { "epoch": 5.2105263157894735, "grad_norm": 1.7866142343336833e-06, "learning_rate": 0.00018974736842105266, "logits/chosen": 13.150436401367188, "logits/rejected": 13.150436401367188, "logps/chosen": -4878.61376953125, "logps/rejected": -4878.61376953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.01422119140625, "rewards/margins": 0.0, "rewards/rejected": -485.01422119140625, "step": 495 }, { "epoch": 5.221052631578948, "grad_norm": 1.0862628414542996e-06, "learning_rate": 0.00018972631578947368, "logits/chosen": 13.153552055358887, "logits/rejected": 13.153552055358887, "logps/chosen": -3780.3046875, "logps/rejected": -3780.3046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -375.1613464355469, "rewards/margins": 0.0, "rewards/rejected": -375.1613464355469, "step": 496 }, { "epoch": 5.231578947368421, "grad_norm": 1.1326321782689774e-06, "learning_rate": 0.00018970526315789475, "logits/chosen": 13.161069869995117, "logits/rejected": 13.161069869995117, "logps/chosen": -2966.298828125, "logps/rejected": -2966.298828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.9036865234375, "rewards/margins": 0.0, "rewards/rejected": -293.9036865234375, "step": 497 }, { "epoch": 5.242105263157895, "grad_norm": 1.367556592413166e-06, "learning_rate": 0.0001896842105263158, "logits/chosen": 13.162830352783203, "logits/rejected": 13.162830352783203, "logps/chosen": -4286.22314453125, "logps/rejected": -4286.22314453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.824951171875, "rewards/margins": 0.0, "rewards/rejected": -425.824951171875, "step": 498 }, { "epoch": 5.252631578947368, "grad_norm": 1.0151137530556298e-06, "learning_rate": 0.00018966315789473685, "logits/chosen": 13.15966510772705, "logits/rejected": 13.15966510772705, "logps/chosen": -3999.76171875, "logps/rejected": -3999.76171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.14886474609375, "rewards/margins": 0.0, "rewards/rejected": -397.14886474609375, "step": 499 }, { "epoch": 5.2631578947368425, "grad_norm": 1.204264094667451e-06, "learning_rate": 0.0001896421052631579, "logits/chosen": 13.15974235534668, "logits/rejected": 13.15974235534668, "logps/chosen": -3542.126953125, "logps/rejected": -3542.126953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.18682861328125, "rewards/margins": 0.0, "rewards/rejected": -351.18682861328125, "step": 500 }, { "epoch": 5.2631578947368425, "eval_logits/chosen": 13.179182052612305, "eval_logits/rejected": 13.179182052612305, "eval_logps/chosen": -4309.7412109375, "eval_logps/rejected": -4309.7412109375, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -428.0709533691406, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -428.0709533691406, "eval_runtime": 4.3709, "eval_samples_per_second": 2.288, "eval_steps_per_second": 2.288, "step": 500 }, { "epoch": 5.273684210526316, "grad_norm": 1.5734547105239471e-06, "learning_rate": 0.00018962105263157898, "logits/chosen": 13.19274616241455, "logits/rejected": 13.19274616241455, "logps/chosen": -5172.8818359375, "logps/rejected": -5172.8818359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.3497924804688, "rewards/margins": 0.0, "rewards/rejected": -514.3497924804688, "step": 501 }, { "epoch": 5.284210526315789, "grad_norm": 1.6210292415053118e-06, "learning_rate": 0.0001896, "logits/chosen": 13.196554183959961, "logits/rejected": 13.196554183959961, "logps/chosen": -5172.888671875, "logps/rejected": -5172.888671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.3504638671875, "rewards/margins": 0.0, "rewards/rejected": -514.3504638671875, "step": 502 }, { "epoch": 5.294736842105263, "grad_norm": 1.633610168028099e-06, "learning_rate": 0.00018957894736842105, "logits/chosen": 13.202737808227539, "logits/rejected": 13.202737808227539, "logps/chosen": -5172.873046875, "logps/rejected": -5172.873046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.348876953125, "rewards/margins": 0.0, "rewards/rejected": -514.348876953125, "step": 503 }, { "epoch": 5.3052631578947365, "grad_norm": 1.2603325103555107e-06, "learning_rate": 0.0001895578947368421, "logits/chosen": 13.192971229553223, "logits/rejected": 13.192971229553223, "logps/chosen": -4324.396484375, "logps/rejected": -4324.396484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.2547912597656, "rewards/margins": 0.0, "rewards/rejected": -429.2547912597656, "step": 504 }, { "epoch": 5.315789473684211, "grad_norm": 1.2053794762323378e-06, "learning_rate": 0.00018953684210526318, "logits/chosen": 13.18458080291748, "logits/rejected": 13.18458080291748, "logps/chosen": -3999.69921875, "logps/rejected": -3999.69921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1426086425781, "rewards/margins": 0.0, "rewards/rejected": -397.1426086425781, "step": 505 }, { "epoch": 5.326315789473684, "grad_norm": 1.2172538390586851e-06, "learning_rate": 0.00018951578947368422, "logits/chosen": 13.187020301818848, "logits/rejected": 13.187020301818848, "logps/chosen": -3999.52734375, "logps/rejected": -3999.52734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.12542724609375, "rewards/margins": 0.0, "rewards/rejected": -397.12542724609375, "step": 506 }, { "epoch": 5.336842105263158, "grad_norm": 1.5117987004487077e-06, "learning_rate": 0.00018949473684210527, "logits/chosen": 13.192102432250977, "logits/rejected": 13.192102432250977, "logps/chosen": -2966.3984375, "logps/rejected": -2966.3984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.9136657714844, "rewards/margins": 0.0, "rewards/rejected": -293.9136657714844, "step": 507 }, { "epoch": 5.347368421052631, "grad_norm": 1.1993967063972377e-06, "learning_rate": 0.00018947368421052632, "logits/chosen": 13.196795463562012, "logits/rejected": 13.196795463562012, "logps/chosen": -2672.544921875, "logps/rejected": -2672.544921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.472900390625, "rewards/margins": 0.0, "rewards/rejected": -264.472900390625, "step": 508 }, { "epoch": 5.3578947368421055, "grad_norm": 9.924002597472281e-07, "learning_rate": 0.00018945263157894737, "logits/chosen": 13.189982414245605, "logits/rejected": 13.189982414245605, "logps/chosen": -2672.9765625, "logps/rejected": -2672.9765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.5160827636719, "rewards/margins": 0.0, "rewards/rejected": -264.5160827636719, "step": 509 }, { "epoch": 5.368421052631579, "grad_norm": 1.0404201020719483e-06, "learning_rate": 0.00018943157894736842, "logits/chosen": 13.169742584228516, "logits/rejected": 13.169742584228516, "logps/chosen": -3757.103515625, "logps/rejected": -3757.103515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.79205322265625, "rewards/margins": 0.0, "rewards/rejected": -372.79205322265625, "step": 510 }, { "epoch": 5.378947368421053, "grad_norm": 2.8191673209221335e-06, "learning_rate": 0.00018941052631578947, "logits/chosen": 13.166238784790039, "logits/rejected": 13.166238784790039, "logps/chosen": -4878.91748046875, "logps/rejected": -4878.91748046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.0445861816406, "rewards/margins": 0.0, "rewards/rejected": -485.0445861816406, "step": 511 }, { "epoch": 5.389473684210526, "grad_norm": 1.0694178627090878e-06, "learning_rate": 0.00018938947368421055, "logits/chosen": 13.163310050964355, "logits/rejected": 13.163310050964355, "logps/chosen": -3757.392578125, "logps/rejected": -3757.392578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.8209533691406, "rewards/margins": 0.0, "rewards/rejected": -372.8209533691406, "step": 512 }, { "epoch": 5.4, "grad_norm": 8.438643703811977e-07, "learning_rate": 0.0001893684210526316, "logits/chosen": 13.16645336151123, "logits/rejected": 13.16645336151123, "logps/chosen": -2967.0546875, "logps/rejected": -2967.0546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.9792785644531, "rewards/margins": 0.0, "rewards/rejected": -293.9792785644531, "step": 513 }, { "epoch": 5.410526315789474, "grad_norm": 1.6386172774218721e-06, "learning_rate": 0.00018934736842105265, "logits/chosen": 13.19331169128418, "logits/rejected": 13.19331169128418, "logps/chosen": -5176.0, "logps/rejected": -5176.0, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6615600585938, "rewards/margins": 0.0, "rewards/rejected": -514.6615600585938, "step": 514 }, { "epoch": 5.421052631578947, "grad_norm": 1.3054219607511186e-06, "learning_rate": 0.0001893263157894737, "logits/chosen": 13.199670791625977, "logits/rejected": 13.199670791625977, "logps/chosen": -5176.2451171875, "logps/rejected": -5176.2451171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6860961914062, "rewards/margins": 0.0, "rewards/rejected": -514.6860961914062, "step": 515 }, { "epoch": 5.431578947368421, "grad_norm": 1.3290152764966479e-06, "learning_rate": 0.00018930526315789474, "logits/chosen": 13.175046920776367, "logits/rejected": 13.175046920776367, "logps/chosen": -3542.28125, "logps/rejected": -3542.28125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.2022399902344, "rewards/margins": 0.0, "rewards/rejected": -351.2022399902344, "step": 516 }, { "epoch": 5.442105263157894, "grad_norm": 1.8839148197002942e-06, "learning_rate": 0.0001892842105263158, "logits/chosen": 13.190823554992676, "logits/rejected": 13.190823554992676, "logps/chosen": -4287.044921875, "logps/rejected": -4287.044921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.9071350097656, "rewards/margins": 0.0, "rewards/rejected": -425.9071350097656, "step": 517 }, { "epoch": 5.4526315789473685, "grad_norm": 1.6932900734900613e-06, "learning_rate": 0.00018926315789473684, "logits/chosen": 13.195592880249023, "logits/rejected": 13.195592880249023, "logps/chosen": -3776.611328125, "logps/rejected": -3776.611328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.7920227050781, "rewards/margins": 0.0, "rewards/rejected": -374.7920227050781, "step": 518 }, { "epoch": 5.463157894736842, "grad_norm": 1.7988933223023196e-06, "learning_rate": 0.00018924210526315792, "logits/chosen": 13.198700904846191, "logits/rejected": 13.198700904846191, "logps/chosen": -4287.357421875, "logps/rejected": -4287.357421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.9383850097656, "rewards/margins": 0.0, "rewards/rejected": -425.9383850097656, "step": 519 }, { "epoch": 5.473684210526316, "grad_norm": 1.1877934866788564e-06, "learning_rate": 0.00018922105263157897, "logits/chosen": 13.20849609375, "logits/rejected": 13.20849609375, "logps/chosen": -4325.990234375, "logps/rejected": -4325.990234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.4141540527344, "rewards/margins": 0.0, "rewards/rejected": -429.4141540527344, "step": 520 }, { "epoch": 5.484210526315789, "grad_norm": 1.2321351050559315e-06, "learning_rate": 0.0001892, "logits/chosen": 13.199177742004395, "logits/rejected": 13.199177742004395, "logps/chosen": -2967.66796875, "logps/rejected": -2967.66796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.0406188964844, "rewards/margins": 0.0, "rewards/rejected": -294.0406188964844, "step": 521 }, { "epoch": 5.494736842105263, "grad_norm": 9.345473017674522e-07, "learning_rate": 0.00018917894736842107, "logits/chosen": 13.190621376037598, "logits/rejected": 13.190621376037598, "logps/chosen": -3542.87109375, "logps/rejected": -3542.87109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.26123046875, "rewards/margins": 0.0, "rewards/rejected": -351.26123046875, "step": 522 }, { "epoch": 5.505263157894737, "grad_norm": 1.3391311313171173e-06, "learning_rate": 0.00018915789473684212, "logits/chosen": 13.189898490905762, "logits/rejected": 13.189898490905762, "logps/chosen": -3997.154296875, "logps/rejected": -3997.154296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.88812255859375, "rewards/margins": 0.0, "rewards/rejected": -396.88812255859375, "step": 523 }, { "epoch": 5.515789473684211, "grad_norm": 8.558351964893518e-07, "learning_rate": 0.00018913684210526317, "logits/chosen": 13.18808650970459, "logits/rejected": 13.18808650970459, "logps/chosen": -3543.158203125, "logps/rejected": -3543.158203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.2899475097656, "rewards/margins": 0.0, "rewards/rejected": -351.2899475097656, "step": 524 }, { "epoch": 5.526315789473684, "grad_norm": 2.4220507839345373e-06, "learning_rate": 0.00018911578947368422, "logits/chosen": 13.198655128479004, "logits/rejected": 13.198655128479004, "logps/chosen": -4878.38330078125, "logps/rejected": -4878.38330078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.9911804199219, "rewards/margins": 0.0, "rewards/rejected": -484.9911804199219, "step": 525 }, { "epoch": 5.536842105263158, "grad_norm": 8.740883572500024e-07, "learning_rate": 0.0001890947368421053, "logits/chosen": 13.210992813110352, "logits/rejected": 13.210992813110352, "logps/chosen": -2673.4677734375, "logps/rejected": -2673.4677734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.565185546875, "rewards/margins": 0.0, "rewards/rejected": -264.565185546875, "step": 526 }, { "epoch": 5.5473684210526315, "grad_norm": 8.148468282342947e-07, "learning_rate": 0.00018907368421052631, "logits/chosen": 13.205110549926758, "logits/rejected": 13.205110549926758, "logps/chosen": -2968.787109375, "logps/rejected": -2968.787109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.15252685546875, "rewards/margins": 0.0, "rewards/rejected": -294.15252685546875, "step": 527 }, { "epoch": 5.557894736842105, "grad_norm": 1.4878780802973779e-06, "learning_rate": 0.00018905263157894736, "logits/chosen": 13.235568046569824, "logits/rejected": 13.235568046569824, "logps/chosen": -5176.36572265625, "logps/rejected": -5176.36572265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6981811523438, "rewards/margins": 0.0, "rewards/rejected": -514.6981811523438, "step": 528 }, { "epoch": 5.568421052631579, "grad_norm": 1.1538711532921297e-06, "learning_rate": 0.00018903157894736844, "logits/chosen": 13.207457542419434, "logits/rejected": 13.207457542419434, "logps/chosen": -3996.5703125, "logps/rejected": -3996.5703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.8297119140625, "rewards/margins": 0.0, "rewards/rejected": -396.8297119140625, "step": 529 }, { "epoch": 5.578947368421053, "grad_norm": 1.1659416259135469e-06, "learning_rate": 0.0001890105263157895, "logits/chosen": 13.22602367401123, "logits/rejected": 13.22602367401123, "logps/chosen": -2673.1181640625, "logps/rejected": -2673.1181640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.5302429199219, "rewards/margins": 0.0, "rewards/rejected": -264.5302429199219, "step": 530 }, { "epoch": 5.589473684210526, "grad_norm": 1.1985692935922998e-06, "learning_rate": 0.00018898947368421054, "logits/chosen": 13.208648681640625, "logits/rejected": 13.208648681640625, "logps/chosen": -3996.619140625, "logps/rejected": -3996.619140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.8345947265625, "rewards/margins": 0.0, "rewards/rejected": -396.8345947265625, "step": 531 }, { "epoch": 5.6, "grad_norm": 1.1034218232452986e-06, "learning_rate": 0.0001889684210526316, "logits/chosen": 13.220032691955566, "logits/rejected": 13.220032691955566, "logps/chosen": -4327.482421875, "logps/rejected": -4327.482421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.5633850097656, "rewards/margins": 0.0, "rewards/rejected": -429.5633850097656, "step": 532 }, { "epoch": 5.610526315789474, "grad_norm": 1.2875439097115304e-06, "learning_rate": 0.00018894736842105266, "logits/chosen": 13.214885711669922, "logits/rejected": 13.214885711669922, "logps/chosen": -2673.916015625, "logps/rejected": -2673.916015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.6100158691406, "rewards/margins": 0.0, "rewards/rejected": -264.6100158691406, "step": 533 }, { "epoch": 5.621052631578947, "grad_norm": 1.3083537169222836e-06, "learning_rate": 0.00018892631578947369, "logits/chosen": 13.204473495483398, "logits/rejected": 13.204473495483398, "logps/chosen": -4877.75927734375, "logps/rejected": -4877.75927734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.92877197265625, "rewards/margins": 0.0, "rewards/rejected": -484.92877197265625, "step": 534 }, { "epoch": 5.631578947368421, "grad_norm": 1.1200177141290624e-06, "learning_rate": 0.00018890526315789473, "logits/chosen": 13.193305015563965, "logits/rejected": 13.193305015563965, "logps/chosen": -2969.091796875, "logps/rejected": -2969.091796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.1829833984375, "rewards/margins": 0.0, "rewards/rejected": -294.1829833984375, "step": 535 }, { "epoch": 5.6421052631578945, "grad_norm": 1.3789167496724986e-06, "learning_rate": 0.00018888421052631578, "logits/chosen": 13.215169906616211, "logits/rejected": 13.215169906616211, "logps/chosen": -5176.072265625, "logps/rejected": -5176.072265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6688232421875, "rewards/margins": 0.0, "rewards/rejected": -514.6688232421875, "step": 536 }, { "epoch": 5.652631578947369, "grad_norm": 1.4613106031902134e-06, "learning_rate": 0.00018886315789473686, "logits/chosen": 13.212922096252441, "logits/rejected": 13.212922096252441, "logps/chosen": -5176.0166015625, "logps/rejected": -5176.0166015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6632690429688, "rewards/margins": 0.0, "rewards/rejected": -514.6632690429688, "step": 537 }, { "epoch": 5.663157894736842, "grad_norm": 1.2394501709422912e-06, "learning_rate": 0.0001888421052631579, "logits/chosen": 13.185783386230469, "logits/rejected": 13.185783386230469, "logps/chosen": -4289.8154296875, "logps/rejected": -4289.8154296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.1841735839844, "rewards/margins": 0.0, "rewards/rejected": -426.1841735839844, "step": 538 }, { "epoch": 5.673684210526316, "grad_norm": 8.939475151237275e-07, "learning_rate": 0.00018882105263157896, "logits/chosen": 13.183499336242676, "logits/rejected": 13.183499336242676, "logps/chosen": -3543.587890625, "logps/rejected": -3543.587890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3329162597656, "rewards/margins": 0.0, "rewards/rejected": -351.3329162597656, "step": 539 }, { "epoch": 5.684210526315789, "grad_norm": 9.325666496806662e-07, "learning_rate": 0.0001888, "logits/chosen": 13.191852569580078, "logits/rejected": 13.191852569580078, "logps/chosen": -3758.19140625, "logps/rejected": -3758.19140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.9008483886719, "rewards/margins": 0.0, "rewards/rejected": -372.9008483886719, "step": 540 }, { "epoch": 5.6947368421052635, "grad_norm": 9.438648476134404e-07, "learning_rate": 0.00018877894736842106, "logits/chosen": 13.193848609924316, "logits/rejected": 13.193848609924316, "logps/chosen": -3758.2880859375, "logps/rejected": -3758.2880859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.9104919433594, "rewards/margins": 0.0, "rewards/rejected": -372.9104919433594, "step": 541 }, { "epoch": 5.705263157894737, "grad_norm": 1.2441748822311638e-06, "learning_rate": 0.0001887578947368421, "logits/chosen": 13.200854301452637, "logits/rejected": 13.200854301452637, "logps/chosen": -4877.8212890625, "logps/rejected": -4877.8212890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.9349670410156, "rewards/margins": 0.0, "rewards/rejected": -484.9349670410156, "step": 542 }, { "epoch": 5.715789473684211, "grad_norm": 1.0261360330332536e-06, "learning_rate": 0.00018873684210526316, "logits/chosen": 13.206197738647461, "logits/rejected": 13.206197738647461, "logps/chosen": -2674.3232421875, "logps/rejected": -2674.3232421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.6507263183594, "rewards/margins": 0.0, "rewards/rejected": -264.6507263183594, "step": 543 }, { "epoch": 5.726315789473684, "grad_norm": 1.1727012179107987e-06, "learning_rate": 0.00018871578947368423, "logits/chosen": 13.196168899536133, "logits/rejected": 13.196168899536133, "logps/chosen": -4290.02783203125, "logps/rejected": -4290.02783203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.2054138183594, "rewards/margins": 0.0, "rewards/rejected": -426.2054138183594, "step": 544 }, { "epoch": 5.7368421052631575, "grad_norm": 1.2502268873504363e-06, "learning_rate": 0.00018869473684210528, "logits/chosen": 13.202901840209961, "logits/rejected": 13.202901840209961, "logps/chosen": -4877.98779296875, "logps/rejected": -4877.98779296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.9516296386719, "rewards/margins": 0.0, "rewards/rejected": -484.9516296386719, "step": 545 }, { "epoch": 5.747368421052632, "grad_norm": 1.3137529322193586e-06, "learning_rate": 0.00018867368421052633, "logits/chosen": 13.226284980773926, "logits/rejected": 13.226284980773926, "logps/chosen": -5176.2158203125, "logps/rejected": -5176.2158203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6831665039062, "rewards/margins": 0.0, "rewards/rejected": -514.6831665039062, "step": 546 }, { "epoch": 5.757894736842105, "grad_norm": 1.1767713203880703e-06, "learning_rate": 0.00018865263157894738, "logits/chosen": 13.194772720336914, "logits/rejected": 13.194772720336914, "logps/chosen": -3996.72265625, "logps/rejected": -3996.72265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.8449401855469, "rewards/margins": 0.0, "rewards/rejected": -396.8449401855469, "step": 547 }, { "epoch": 5.768421052631579, "grad_norm": 1.2771665751643013e-06, "learning_rate": 0.00018863157894736843, "logits/chosen": 13.207627296447754, "logits/rejected": 13.207627296447754, "logps/chosen": -4878.583984375, "logps/rejected": -4878.583984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.01123046875, "rewards/margins": 0.0, "rewards/rejected": -485.01123046875, "step": 548 }, { "epoch": 5.778947368421052, "grad_norm": 1.0961126690745004e-06, "learning_rate": 0.00018861052631578948, "logits/chosen": 13.201512336730957, "logits/rejected": 13.201512336730957, "logps/chosen": -3759.333984375, "logps/rejected": -3759.333984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -373.0151062011719, "rewards/margins": 0.0, "rewards/rejected": -373.0151062011719, "step": 549 }, { "epoch": 5.7894736842105265, "grad_norm": 1.263503122572729e-06, "learning_rate": 0.00018858947368421053, "logits/chosen": 13.207401275634766, "logits/rejected": 13.207401275634766, "logps/chosen": -4878.796875, "logps/rejected": -4878.796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.03253173828125, "rewards/margins": 0.0, "rewards/rejected": -485.03253173828125, "step": 550 }, { "epoch": 5.7894736842105265, "eval_logits/chosen": 13.214508056640625, "eval_logits/rejected": 13.214508056640625, "eval_logps/chosen": -4310.3515625, "eval_logps/rejected": -4310.3515625, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -428.1319885253906, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -428.1319885253906, "eval_runtime": 4.0248, "eval_samples_per_second": 2.485, "eval_steps_per_second": 2.485, "step": 550 }, { "epoch": 5.8, "grad_norm": 1.1792767509177793e-06, "learning_rate": 0.0001885684210526316, "logits/chosen": 13.194851875305176, "logits/rejected": 13.194851875305176, "logps/chosen": -3542.806640625, "logps/rejected": -3542.806640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.2547912597656, "rewards/margins": 0.0, "rewards/rejected": -351.2547912597656, "step": 551 }, { "epoch": 5.810526315789474, "grad_norm": 1.3743325553150498e-06, "learning_rate": 0.00018854736842105265, "logits/chosen": 13.200400352478027, "logits/rejected": 13.200400352478027, "logps/chosen": -3775.3046875, "logps/rejected": -3775.3046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.6613464355469, "rewards/margins": 0.0, "rewards/rejected": -374.6613464355469, "step": 552 }, { "epoch": 5.821052631578947, "grad_norm": 1.205835133077926e-06, "learning_rate": 0.00018852631578947368, "logits/chosen": 13.209718704223633, "logits/rejected": 13.209718704223633, "logps/chosen": -4879.5869140625, "logps/rejected": -4879.5869140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.1115417480469, "rewards/margins": 0.0, "rewards/rejected": -485.1115417480469, "step": 553 }, { "epoch": 5.831578947368421, "grad_norm": 1.1669042123685358e-06, "learning_rate": 0.00018850526315789475, "logits/chosen": 13.198144912719727, "logits/rejected": 13.198144912719727, "logps/chosen": -3996.380859375, "logps/rejected": -3996.380859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.8107604980469, "rewards/margins": 0.0, "rewards/rejected": -396.8107604980469, "step": 554 }, { "epoch": 5.842105263157895, "grad_norm": 1.0688269185266108e-06, "learning_rate": 0.0001884842105263158, "logits/chosen": 13.211125373840332, "logits/rejected": 13.211125373840332, "logps/chosen": -2672.744140625, "logps/rejected": -2672.744140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.4928283691406, "rewards/margins": 0.0, "rewards/rejected": -264.4928283691406, "step": 555 }, { "epoch": 5.852631578947369, "grad_norm": 1.1227840559513425e-06, "learning_rate": 0.00018846315789473685, "logits/chosen": 13.205501556396484, "logits/rejected": 13.205501556396484, "logps/chosen": -4879.8291015625, "logps/rejected": -4879.8291015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.1357421875, "rewards/margins": 0.0, "rewards/rejected": -485.1357421875, "step": 556 }, { "epoch": 5.863157894736842, "grad_norm": 1.156195594376186e-06, "learning_rate": 0.0001884421052631579, "logits/chosen": 13.189672470092773, "logits/rejected": 13.189672470092773, "logps/chosen": -3996.57421875, "logps/rejected": -3996.57421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.8301086425781, "rewards/margins": 0.0, "rewards/rejected": -396.8301086425781, "step": 557 }, { "epoch": 5.873684210526315, "grad_norm": 1.4351783192978473e-06, "learning_rate": 0.00018842105263157898, "logits/chosen": 13.2205171585083, "logits/rejected": 13.2205171585083, "logps/chosen": -5175.4443359375, "logps/rejected": -5175.4443359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6060180664062, "rewards/margins": 0.0, "rewards/rejected": -514.6060180664062, "step": 558 }, { "epoch": 5.88421052631579, "grad_norm": 1.3104540812491905e-06, "learning_rate": 0.0001884, "logits/chosen": 13.188840866088867, "logits/rejected": 13.188840866088867, "logps/chosen": -3776.046875, "logps/rejected": -3776.046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.7355651855469, "rewards/margins": 0.0, "rewards/rejected": -374.7355651855469, "step": 559 }, { "epoch": 5.894736842105263, "grad_norm": 1.1253225693508284e-06, "learning_rate": 0.00018837894736842105, "logits/chosen": 13.184172630310059, "logits/rejected": 13.184172630310059, "logps/chosen": -3540.91015625, "logps/rejected": -3540.91015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.0651550292969, "rewards/margins": 0.0, "rewards/rejected": -351.0651550292969, "step": 560 }, { "epoch": 5.905263157894737, "grad_norm": 1.3087112620269181e-06, "learning_rate": 0.00018835789473684212, "logits/chosen": 13.200813293457031, "logits/rejected": 13.200813293457031, "logps/chosen": -4323.94140625, "logps/rejected": -4323.94140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.20928955078125, "rewards/margins": 0.0, "rewards/rejected": -429.20928955078125, "step": 561 }, { "epoch": 5.91578947368421, "grad_norm": 1.20750951282389e-06, "learning_rate": 0.00018833684210526317, "logits/chosen": 13.20124340057373, "logits/rejected": 13.20124340057373, "logps/chosen": -4880.8564453125, "logps/rejected": -4880.8564453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.2384948730469, "rewards/margins": 0.0, "rewards/rejected": -485.2384948730469, "step": 562 }, { "epoch": 5.926315789473684, "grad_norm": 1.3961901004222455e-06, "learning_rate": 0.00018831578947368422, "logits/chosen": 13.191816329956055, "logits/rejected": 13.191816329956055, "logps/chosen": -3540.873046875, "logps/rejected": -3540.873046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.0614318847656, "rewards/margins": 0.0, "rewards/rejected": -351.0614318847656, "step": 563 }, { "epoch": 5.936842105263158, "grad_norm": 1.2689156392298173e-06, "learning_rate": 0.00018829473684210527, "logits/chosen": 13.211150169372559, "logits/rejected": 13.211150169372559, "logps/chosen": -4324.201171875, "logps/rejected": -4324.201171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.2352600097656, "rewards/margins": 0.0, "rewards/rejected": -429.2352600097656, "step": 564 }, { "epoch": 5.947368421052632, "grad_norm": 1.2820396477764007e-06, "learning_rate": 0.00018827368421052635, "logits/chosen": 13.198720932006836, "logits/rejected": 13.198720932006836, "logps/chosen": -3997.638671875, "logps/rejected": -3997.638671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9365539550781, "rewards/margins": 0.0, "rewards/rejected": -396.9365539550781, "step": 565 }, { "epoch": 5.957894736842105, "grad_norm": 1.4112313238001661e-06, "learning_rate": 0.00018825263157894737, "logits/chosen": 13.202353477478027, "logits/rejected": 13.202353477478027, "logps/chosen": -4287.662109375, "logps/rejected": -4287.662109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.9688415527344, "rewards/margins": 0.0, "rewards/rejected": -425.9688415527344, "step": 566 }, { "epoch": 5.968421052631579, "grad_norm": 1.1650764690784854e-06, "learning_rate": 0.00018823157894736842, "logits/chosen": 13.19483757019043, "logits/rejected": 13.19483757019043, "logps/chosen": -3998.044921875, "logps/rejected": -3998.044921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9771728515625, "rewards/margins": 0.0, "rewards/rejected": -396.9771728515625, "step": 567 }, { "epoch": 5.978947368421053, "grad_norm": 1.1852655461552786e-06, "learning_rate": 0.00018821052631578947, "logits/chosen": 13.189155578613281, "logits/rejected": 13.189155578613281, "logps/chosen": -3541.6953125, "logps/rejected": -3541.6953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.1436462402344, "rewards/margins": 0.0, "rewards/rejected": -351.1436462402344, "step": 568 }, { "epoch": 5.989473684210527, "grad_norm": 1.0672247299226e-06, "learning_rate": 0.00018818947368421055, "logits/chosen": 13.18425178527832, "logits/rejected": 13.18425178527832, "logps/chosen": -3541.740234375, "logps/rejected": -3541.740234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.1481628417969, "rewards/margins": 0.0, "rewards/rejected": -351.1481628417969, "step": 569 }, { "epoch": 6.0, "grad_norm": 9.56930307438597e-07, "learning_rate": 0.0001881684210526316, "logits/chosen": 13.181585311889648, "logits/rejected": 13.181585311889648, "logps/chosen": -3542.328125, "logps/rejected": -3542.328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.2069396972656, "rewards/margins": 0.0, "rewards/rejected": -351.2069396972656, "step": 570 }, { "epoch": 6.010526315789473, "grad_norm": 8.95854043392319e-07, "learning_rate": 0.00018814736842105264, "logits/chosen": 13.18505573272705, "logits/rejected": 13.18505573272705, "logps/chosen": -2967.0078125, "logps/rejected": -2967.0078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.9745788574219, "rewards/margins": 0.0, "rewards/rejected": -293.9745788574219, "step": 571 }, { "epoch": 6.021052631578947, "grad_norm": 1.677127329458017e-06, "learning_rate": 0.0001881263157894737, "logits/chosen": 13.182439804077148, "logits/rejected": 13.182439804077148, "logps/chosen": -4287.9873046875, "logps/rejected": -4287.9873046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.0013732910156, "rewards/margins": 0.0, "rewards/rejected": -426.0013732910156, "step": 572 }, { "epoch": 6.031578947368421, "grad_norm": 7.974176128300314e-07, "learning_rate": 0.00018810526315789474, "logits/chosen": 13.184743881225586, "logits/rejected": 13.184743881225586, "logps/chosen": -2967.509765625, "logps/rejected": -2967.509765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.0247802734375, "rewards/margins": 0.0, "rewards/rejected": -294.0247802734375, "step": 573 }, { "epoch": 6.042105263157895, "grad_norm": 1.8033014157481375e-06, "learning_rate": 0.0001880842105263158, "logits/chosen": 13.193608283996582, "logits/rejected": 13.193608283996582, "logps/chosen": -4880.2822265625, "logps/rejected": -4880.2822265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.1810607910156, "rewards/margins": 0.0, "rewards/rejected": -485.1810607910156, "step": 574 }, { "epoch": 6.052631578947368, "grad_norm": 9.459404282097239e-07, "learning_rate": 0.00018806315789473684, "logits/chosen": 13.190689086914062, "logits/rejected": 13.190689086914062, "logps/chosen": -3758.173828125, "logps/rejected": -3758.173828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.8990783691406, "rewards/margins": 0.0, "rewards/rejected": -372.8990783691406, "step": 575 }, { "epoch": 6.063157894736842, "grad_norm": 1.5308801266655792e-06, "learning_rate": 0.00018804210526315792, "logits/chosen": 13.205794334411621, "logits/rejected": 13.205794334411621, "logps/chosen": -4326.60546875, "logps/rejected": -4326.60546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.4756774902344, "rewards/margins": 0.0, "rewards/rejected": -429.4756774902344, "step": 576 }, { "epoch": 6.073684210526316, "grad_norm": 1.3134667824488133e-06, "learning_rate": 0.00018802105263157897, "logits/chosen": 13.210139274597168, "logits/rejected": 13.210139274597168, "logps/chosen": -4879.81494140625, "logps/rejected": -4879.81494140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.13433837890625, "rewards/margins": 0.0, "rewards/rejected": -485.13433837890625, "step": 577 }, { "epoch": 6.08421052631579, "grad_norm": 1.1836641533591319e-06, "learning_rate": 0.000188, "logits/chosen": 13.219319343566895, "logits/rejected": 13.219319343566895, "logps/chosen": -4326.8193359375, "logps/rejected": -4326.8193359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.4970703125, "rewards/margins": 0.0, "rewards/rejected": -429.4970703125, "step": 578 }, { "epoch": 6.094736842105263, "grad_norm": 1.9596509446273558e-06, "learning_rate": 0.00018797894736842107, "logits/chosen": 13.209967613220215, "logits/rejected": 13.209967613220215, "logps/chosen": -3543.5205078125, "logps/rejected": -3543.5205078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.326171875, "rewards/margins": 0.0, "rewards/rejected": -351.326171875, "step": 579 }, { "epoch": 6.105263157894737, "grad_norm": 1.732796249598323e-06, "learning_rate": 0.00018795789473684211, "logits/chosen": 13.212206840515137, "logits/rejected": 13.212206840515137, "logps/chosen": -3998.2734375, "logps/rejected": -3998.2734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0000305175781, "rewards/margins": 0.0, "rewards/rejected": -397.0000305175781, "step": 580 }, { "epoch": 6.11578947368421, "grad_norm": 1.523095647826267e-06, "learning_rate": 0.00018793684210526316, "logits/chosen": 13.212565422058105, "logits/rejected": 13.212565422058105, "logps/chosen": -4288.8076171875, "logps/rejected": -4288.8076171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.0834045410156, "rewards/margins": 0.0, "rewards/rejected": -426.0834045410156, "step": 581 }, { "epoch": 6.126315789473685, "grad_norm": 1.6778822100604884e-06, "learning_rate": 0.0001879157894736842, "logits/chosen": 13.23731803894043, "logits/rejected": 13.23731803894043, "logps/chosen": -5173.009765625, "logps/rejected": -5173.009765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.362548828125, "rewards/margins": 0.0, "rewards/rejected": -514.362548828125, "step": 582 }, { "epoch": 6.136842105263158, "grad_norm": 1.2378862948025926e-06, "learning_rate": 0.0001878947368421053, "logits/chosen": 13.20161247253418, "logits/rejected": 13.20161247253418, "logps/chosen": -3777.7470703125, "logps/rejected": -3777.7470703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.9056091308594, "rewards/margins": 0.0, "rewards/rejected": -374.9056091308594, "step": 583 }, { "epoch": 6.147368421052631, "grad_norm": 9.320282288172166e-07, "learning_rate": 0.00018787368421052634, "logits/chosen": 13.207496643066406, "logits/rejected": 13.207496643066406, "logps/chosen": -2672.783203125, "logps/rejected": -2672.783203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.4967346191406, "rewards/margins": 0.0, "rewards/rejected": -264.4967346191406, "step": 584 }, { "epoch": 6.157894736842105, "grad_norm": 8.268946203315863e-07, "learning_rate": 0.00018785263157894736, "logits/chosen": 13.189412117004395, "logits/rejected": 13.189412117004395, "logps/chosen": -3544.30859375, "logps/rejected": -3544.30859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.4049987792969, "rewards/margins": 0.0, "rewards/rejected": -351.4049987792969, "step": 585 }, { "epoch": 6.168421052631579, "grad_norm": 1.178776187771291e-06, "learning_rate": 0.00018783157894736844, "logits/chosen": 13.191641807556152, "logits/rejected": 13.191641807556152, "logps/chosen": -3758.3203125, "logps/rejected": -3758.3203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.9137268066406, "rewards/margins": 0.0, "rewards/rejected": -372.9137268066406, "step": 586 }, { "epoch": 6.178947368421053, "grad_norm": 3.134376356683788e-06, "learning_rate": 0.0001878105263157895, "logits/chosen": 13.220187187194824, "logits/rejected": 13.220187187194824, "logps/chosen": -5172.9501953125, "logps/rejected": -5172.9501953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.3566284179688, "rewards/margins": 0.0, "rewards/rejected": -514.3566284179688, "step": 587 }, { "epoch": 6.189473684210526, "grad_norm": 9.482012615080748e-07, "learning_rate": 0.00018778947368421054, "logits/chosen": 13.199373245239258, "logits/rejected": 13.199373245239258, "logps/chosen": -3758.64453125, "logps/rejected": -3758.64453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.9461364746094, "rewards/margins": 0.0, "rewards/rejected": -372.9461364746094, "step": 588 }, { "epoch": 6.2, "grad_norm": 1.042478515955736e-06, "learning_rate": 0.00018776842105263159, "logits/chosen": 13.214068412780762, "logits/rejected": 13.214068412780762, "logps/chosen": -2673.265625, "logps/rejected": -2673.265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.54498291015625, "rewards/margins": 0.0, "rewards/rejected": -264.54498291015625, "step": 589 }, { "epoch": 6.2105263157894735, "grad_norm": 1.3472722457663622e-06, "learning_rate": 0.00018774736842105266, "logits/chosen": 13.20860481262207, "logits/rejected": 13.20860481262207, "logps/chosen": -3777.83203125, "logps/rejected": -3777.83203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.9140930175781, "rewards/margins": 0.0, "rewards/rejected": -374.9140930175781, "step": 590 }, { "epoch": 6.221052631578948, "grad_norm": 1.472659732826287e-06, "learning_rate": 0.00018772631578947368, "logits/chosen": 13.211287498474121, "logits/rejected": 13.211287498474121, "logps/chosen": -3777.96484375, "logps/rejected": -3777.96484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.9273681640625, "rewards/margins": 0.0, "rewards/rejected": -374.9273681640625, "step": 591 }, { "epoch": 6.231578947368421, "grad_norm": 1.5024987760625663e-06, "learning_rate": 0.00018770526315789473, "logits/chosen": 13.221588134765625, "logits/rejected": 13.221588134765625, "logps/chosen": -2673.189453125, "logps/rejected": -2673.189453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.537353515625, "rewards/margins": 0.0, "rewards/rejected": -264.537353515625, "step": 592 }, { "epoch": 6.242105263157895, "grad_norm": 1.4282520623964956e-06, "learning_rate": 0.00018768421052631578, "logits/chosen": 13.205028533935547, "logits/rejected": 13.205028533935547, "logps/chosen": -3996.93359375, "logps/rejected": -3996.93359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.8660583496094, "rewards/margins": 0.0, "rewards/rejected": -396.8660583496094, "step": 593 }, { "epoch": 6.252631578947368, "grad_norm": 1.1856502624141285e-06, "learning_rate": 0.00018766315789473686, "logits/chosen": 13.20934009552002, "logits/rejected": 13.20934009552002, "logps/chosen": -2673.9345703125, "logps/rejected": -2673.9345703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.61187744140625, "rewards/margins": 0.0, "rewards/rejected": -264.61187744140625, "step": 594 }, { "epoch": 6.2631578947368425, "grad_norm": 9.582586244505364e-07, "learning_rate": 0.0001876421052631579, "logits/chosen": 13.18975830078125, "logits/rejected": 13.18975830078125, "logps/chosen": -2969.1669921875, "logps/rejected": -2969.1669921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.1905212402344, "rewards/margins": 0.0, "rewards/rejected": -294.1905212402344, "step": 595 }, { "epoch": 6.273684210526316, "grad_norm": 1.8523847984397435e-06, "learning_rate": 0.00018762105263157896, "logits/chosen": 13.210691452026367, "logits/rejected": 13.210691452026367, "logps/chosen": -5173.56396484375, "logps/rejected": -5173.56396484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.41796875, "rewards/margins": 0.0, "rewards/rejected": -514.41796875, "step": 596 }, { "epoch": 6.284210526315789, "grad_norm": 8.66443656377669e-07, "learning_rate": 0.0001876, "logits/chosen": 13.170330047607422, "logits/rejected": 13.170330047607422, "logps/chosen": -3544.0205078125, "logps/rejected": -3544.0205078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3761901855469, "rewards/margins": 0.0, "rewards/rejected": -351.3761901855469, "step": 597 }, { "epoch": 6.294736842105263, "grad_norm": 1.7496446389486664e-06, "learning_rate": 0.00018757894736842106, "logits/chosen": 13.168196678161621, "logits/rejected": 13.168196678161621, "logps/chosen": -3996.181640625, "logps/rejected": -3996.181640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.7908630371094, "rewards/margins": 0.0, "rewards/rejected": -396.7908630371094, "step": 598 }, { "epoch": 6.3052631578947365, "grad_norm": 1.440081859982456e-06, "learning_rate": 0.0001875578947368421, "logits/chosen": 13.170392036437988, "logits/rejected": 13.170392036437988, "logps/chosen": -3779.1103515625, "logps/rejected": -3779.1103515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -375.04193115234375, "rewards/margins": 0.0, "rewards/rejected": -375.04193115234375, "step": 599 }, { "epoch": 6.315789473684211, "grad_norm": 1.4733492434970685e-06, "learning_rate": 0.00018753684210526315, "logits/chosen": 13.181976318359375, "logits/rejected": 13.181976318359375, "logps/chosen": -4876.5595703125, "logps/rejected": -4876.5595703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.8088073730469, "rewards/margins": 0.0, "rewards/rejected": -484.8088073730469, "step": 600 }, { "epoch": 6.315789473684211, "eval_logits/chosen": 13.191164016723633, "eval_logits/rejected": 13.191164016723633, "eval_logps/chosen": -4310.07666015625, "eval_logps/rejected": -4310.07666015625, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -428.1044921875, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -428.1044921875, "eval_runtime": 4.1623, "eval_samples_per_second": 2.403, "eval_steps_per_second": 2.403, "step": 600 }, { "epoch": 6.326315789473684, "grad_norm": 1.0408135722173029e-06, "learning_rate": 0.00018751578947368423, "logits/chosen": 13.175030708312988, "logits/rejected": 13.175030708312988, "logps/chosen": -2968.7265625, "logps/rejected": -2968.7265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.1464538574219, "rewards/margins": 0.0, "rewards/rejected": -294.1464538574219, "step": 601 }, { "epoch": 6.336842105263158, "grad_norm": 2.0444217625481542e-06, "learning_rate": 0.00018749473684210528, "logits/chosen": 13.209887504577637, "logits/rejected": 13.209887504577637, "logps/chosen": -5173.6474609375, "logps/rejected": -5173.6474609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.4263305664062, "rewards/margins": 0.0, "rewards/rejected": -514.4263305664062, "step": 602 }, { "epoch": 6.347368421052631, "grad_norm": 1.2794157555617858e-06, "learning_rate": 0.00018747368421052633, "logits/chosen": 13.186957359313965, "logits/rejected": 13.186957359313965, "logps/chosen": -2674.6982421875, "logps/rejected": -2674.6982421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.688232421875, "rewards/margins": 0.0, "rewards/rejected": -264.688232421875, "step": 603 }, { "epoch": 6.3578947368421055, "grad_norm": 1.315401846113673e-06, "learning_rate": 0.00018745263157894738, "logits/chosen": 13.173993110656738, "logits/rejected": 13.173993110656738, "logps/chosen": -3543.5556640625, "logps/rejected": -3543.5556640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3296813964844, "rewards/margins": 0.0, "rewards/rejected": -351.3296813964844, "step": 604 }, { "epoch": 6.368421052631579, "grad_norm": 1.1277600151515799e-06, "learning_rate": 0.00018743157894736843, "logits/chosen": 13.173890113830566, "logits/rejected": 13.173890113830566, "logps/chosen": -3543.80859375, "logps/rejected": -3543.80859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.35498046875, "rewards/margins": 0.0, "rewards/rejected": -351.35498046875, "step": 605 }, { "epoch": 6.378947368421053, "grad_norm": 1.1660972631943878e-06, "learning_rate": 0.00018741052631578948, "logits/chosen": 13.173995018005371, "logits/rejected": 13.173995018005371, "logps/chosen": -3996.23828125, "logps/rejected": -3996.23828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.7965087890625, "rewards/margins": 0.0, "rewards/rejected": -396.7965087890625, "step": 606 }, { "epoch": 6.389473684210526, "grad_norm": 1.1859199275932042e-06, "learning_rate": 0.00018738947368421053, "logits/chosen": 13.171222686767578, "logits/rejected": 13.171222686767578, "logps/chosen": -3996.486328125, "logps/rejected": -3996.486328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.8213195800781, "rewards/margins": 0.0, "rewards/rejected": -396.8213195800781, "step": 607 }, { "epoch": 6.4, "grad_norm": 1.2157888704678044e-06, "learning_rate": 0.0001873684210526316, "logits/chosen": 13.165254592895508, "logits/rejected": 13.165254592895508, "logps/chosen": -3996.791015625, "logps/rejected": -3996.791015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.8517761230469, "rewards/margins": 0.0, "rewards/rejected": -396.8517761230469, "step": 608 }, { "epoch": 6.410526315789474, "grad_norm": 1.221074739987671e-06, "learning_rate": 0.00018734736842105265, "logits/chosen": 13.156908988952637, "logits/rejected": 13.156908988952637, "logps/chosen": -3997.234375, "logps/rejected": -3997.234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.8961181640625, "rewards/margins": 0.0, "rewards/rejected": -396.8961181640625, "step": 609 }, { "epoch": 6.421052631578947, "grad_norm": 1.5741740071462118e-06, "learning_rate": 0.00018732631578947367, "logits/chosen": 13.18421459197998, "logits/rejected": 13.18421459197998, "logps/chosen": -5173.595703125, "logps/rejected": -5173.595703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.421142578125, "rewards/margins": 0.0, "rewards/rejected": -514.421142578125, "step": 610 }, { "epoch": 6.431578947368421, "grad_norm": 9.437316634830495e-07, "learning_rate": 0.00018730526315789475, "logits/chosen": 13.148723602294922, "logits/rejected": 13.148723602294922, "logps/chosen": -3758.5625, "logps/rejected": -3758.5625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.9379577636719, "rewards/margins": 0.0, "rewards/rejected": -372.9379577636719, "step": 611 }, { "epoch": 6.442105263157894, "grad_norm": 1.4082871757636894e-06, "learning_rate": 0.0001872842105263158, "logits/chosen": 13.153526306152344, "logits/rejected": 13.153526306152344, "logps/chosen": -4877.2158203125, "logps/rejected": -4877.2158203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.8744201660156, "rewards/margins": 0.0, "rewards/rejected": -484.8744201660156, "step": 612 }, { "epoch": 6.4526315789473685, "grad_norm": 1.0770583003250067e-06, "learning_rate": 0.00018726315789473685, "logits/chosen": 13.135787963867188, "logits/rejected": 13.135787963867188, "logps/chosen": -3998.919921875, "logps/rejected": -3998.919921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0646667480469, "rewards/margins": 0.0, "rewards/rejected": -397.0646667480469, "step": 613 }, { "epoch": 6.463157894736842, "grad_norm": 1.1356005416018888e-06, "learning_rate": 0.0001872421052631579, "logits/chosen": 13.141548156738281, "logits/rejected": 13.141548156738281, "logps/chosen": -2674.537109375, "logps/rejected": -2674.537109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.672119140625, "rewards/margins": 0.0, "rewards/rejected": -264.672119140625, "step": 614 }, { "epoch": 6.473684210526316, "grad_norm": 1.0908431704592658e-06, "learning_rate": 0.00018722105263157897, "logits/chosen": 13.12234878540039, "logits/rejected": 13.12234878540039, "logps/chosen": -3999.23828125, "logps/rejected": -3999.23828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0965270996094, "rewards/margins": 0.0, "rewards/rejected": -397.0965270996094, "step": 615 }, { "epoch": 6.484210526315789, "grad_norm": 1.038601681102591e-06, "learning_rate": 0.00018720000000000002, "logits/chosen": 13.111235618591309, "logits/rejected": 13.111235618591309, "logps/chosen": -3999.603515625, "logps/rejected": -3999.603515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1330261230469, "rewards/margins": 0.0, "rewards/rejected": -397.1330261230469, "step": 616 }, { "epoch": 6.494736842105263, "grad_norm": 9.794709967536619e-07, "learning_rate": 0.00018717894736842105, "logits/chosen": 13.097511291503906, "logits/rejected": 13.097511291503906, "logps/chosen": -4000.416015625, "logps/rejected": -4000.416015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.21429443359375, "rewards/margins": 0.0, "rewards/rejected": -397.21429443359375, "step": 617 }, { "epoch": 6.505263157894737, "grad_norm": 1.4964347201384953e-06, "learning_rate": 0.00018715789473684212, "logits/chosen": 13.097776412963867, "logits/rejected": 13.097776412963867, "logps/chosen": -4877.37451171875, "logps/rejected": -4877.37451171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.8902893066406, "rewards/margins": 0.0, "rewards/rejected": -484.8902893066406, "step": 618 }, { "epoch": 6.515789473684211, "grad_norm": 1.6902042716537835e-06, "learning_rate": 0.00018713684210526317, "logits/chosen": 13.107996940612793, "logits/rejected": 13.107996940612793, "logps/chosen": -5172.5341796875, "logps/rejected": -5172.5341796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.3150024414062, "rewards/margins": 0.0, "rewards/rejected": -514.3150024414062, "step": 619 }, { "epoch": 6.526315789473684, "grad_norm": 1.4707082982567954e-06, "learning_rate": 0.00018711578947368422, "logits/chosen": 13.081208229064941, "logits/rejected": 13.081208229064941, "logps/chosen": -4877.89501953125, "logps/rejected": -4877.89501953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.9423522949219, "rewards/margins": 0.0, "rewards/rejected": -484.9423522949219, "step": 620 }, { "epoch": 6.536842105263158, "grad_norm": 1.382074970024405e-06, "learning_rate": 0.00018709473684210527, "logits/chosen": 13.06719970703125, "logits/rejected": 13.06719970703125, "logps/chosen": -4286.072265625, "logps/rejected": -4286.072265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.80987548828125, "rewards/margins": 0.0, "rewards/rejected": -425.80987548828125, "step": 621 }, { "epoch": 6.5473684210526315, "grad_norm": 8.750578217586735e-07, "learning_rate": 0.00018707368421052635, "logits/chosen": 13.063549995422363, "logits/rejected": 13.063549995422363, "logps/chosen": -4002.49609375, "logps/rejected": -4002.49609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.42230224609375, "rewards/margins": 0.0, "rewards/rejected": -397.42230224609375, "step": 622 }, { "epoch": 6.557894736842105, "grad_norm": 1.558595840833732e-06, "learning_rate": 0.00018705263157894737, "logits/chosen": 13.076499938964844, "logits/rejected": 13.076499938964844, "logps/chosen": -4878.43505859375, "logps/rejected": -4878.43505859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.996337890625, "rewards/margins": 0.0, "rewards/rejected": -484.996337890625, "step": 623 }, { "epoch": 6.568421052631579, "grad_norm": 1.5447517398570199e-06, "learning_rate": 0.00018703157894736842, "logits/chosen": 13.07684326171875, "logits/rejected": 13.07684326171875, "logps/chosen": -4878.20654296875, "logps/rejected": -4878.20654296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.9734802246094, "rewards/margins": 0.0, "rewards/rejected": -484.9734802246094, "step": 624 }, { "epoch": 6.578947368421053, "grad_norm": 1.1281501883786405e-06, "learning_rate": 0.00018701052631578947, "logits/chosen": 13.074007987976074, "logits/rejected": 13.074007987976074, "logps/chosen": -2672.408203125, "logps/rejected": -2672.408203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.459228515625, "rewards/margins": 0.0, "rewards/rejected": -264.459228515625, "step": 625 }, { "epoch": 6.589473684210526, "grad_norm": 8.956453143582621e-07, "learning_rate": 0.00018698947368421054, "logits/chosen": 13.063782691955566, "logits/rejected": 13.063782691955566, "logps/chosen": -4002.556640625, "logps/rejected": -4002.556640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.4283447265625, "rewards/margins": 0.0, "rewards/rejected": -397.4283447265625, "step": 626 }, { "epoch": 6.6, "grad_norm": 1.1431949360485305e-06, "learning_rate": 0.0001869684210526316, "logits/chosen": 13.061800003051758, "logits/rejected": 13.061800003051758, "logps/chosen": -3539.251953125, "logps/rejected": -3539.251953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -350.8993225097656, "rewards/margins": 0.0, "rewards/rejected": -350.8993225097656, "step": 627 }, { "epoch": 6.610526315789474, "grad_norm": 1.4119310662863427e-06, "learning_rate": 0.00018694736842105264, "logits/chosen": 13.06440544128418, "logits/rejected": 13.06440544128418, "logps/chosen": -4285.8671875, "logps/rejected": -4285.8671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.78936767578125, "rewards/margins": 0.0, "rewards/rejected": -425.78936767578125, "step": 628 }, { "epoch": 6.621052631578947, "grad_norm": 1.0821867135746288e-06, "learning_rate": 0.0001869263157894737, "logits/chosen": 13.0635404586792, "logits/rejected": 13.0635404586792, "logps/chosen": -3539.0703125, "logps/rejected": -3539.0703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -350.88116455078125, "rewards/margins": 0.0, "rewards/rejected": -350.88116455078125, "step": 629 }, { "epoch": 6.631578947368421, "grad_norm": 1.7470437114752713e-06, "learning_rate": 0.00018690526315789474, "logits/chosen": 13.084620475769043, "logits/rejected": 13.084620475769043, "logps/chosen": -4321.90234375, "logps/rejected": -4321.90234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.00537109375, "rewards/margins": 0.0, "rewards/rejected": -429.00537109375, "step": 630 }, { "epoch": 6.6421052631578945, "grad_norm": 1.7168232488984358e-06, "learning_rate": 0.0001868842105263158, "logits/chosen": 13.114235877990723, "logits/rejected": 13.114235877990723, "logps/chosen": -5172.373046875, "logps/rejected": -5172.373046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.2988891601562, "rewards/margins": 0.0, "rewards/rejected": -514.2988891601562, "step": 631 }, { "epoch": 6.652631578947369, "grad_norm": 1.4030877082404913e-06, "learning_rate": 0.00018686315789473684, "logits/chosen": 13.106715202331543, "logits/rejected": 13.106715202331543, "logps/chosen": -4322.400390625, "logps/rejected": -4322.400390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.05517578125, "rewards/margins": 0.0, "rewards/rejected": -429.05517578125, "step": 632 }, { "epoch": 6.663157894736842, "grad_norm": 1.5272150903911097e-06, "learning_rate": 0.00018684210526315792, "logits/chosen": 13.105968475341797, "logits/rejected": 13.105968475341797, "logps/chosen": -4286.4130859375, "logps/rejected": -4286.4130859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.8439636230469, "rewards/margins": 0.0, "rewards/rejected": -425.8439636230469, "step": 633 }, { "epoch": 6.673684210526316, "grad_norm": 1.843278937485593e-06, "learning_rate": 0.00018682105263157896, "logits/chosen": 13.114410400390625, "logits/rejected": 13.114410400390625, "logps/chosen": -3539.841796875, "logps/rejected": -3539.841796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -350.95831298828125, "rewards/margins": 0.0, "rewards/rejected": -350.95831298828125, "step": 634 }, { "epoch": 6.684210526315789, "grad_norm": 1.5347978887803038e-06, "learning_rate": 0.00018680000000000001, "logits/chosen": 13.126788139343262, "logits/rejected": 13.126788139343262, "logps/chosen": -4287.0087890625, "logps/rejected": -4287.0087890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.9035339355469, "rewards/margins": 0.0, "rewards/rejected": -425.9035339355469, "step": 635 }, { "epoch": 6.6947368421052635, "grad_norm": 1.3687921409655246e-06, "learning_rate": 0.00018677894736842106, "logits/chosen": 13.130768775939941, "logits/rejected": 13.130768775939941, "logps/chosen": -3540.142578125, "logps/rejected": -3540.142578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -350.9883728027344, "rewards/margins": 0.0, "rewards/rejected": -350.9883728027344, "step": 636 }, { "epoch": 6.705263157894737, "grad_norm": 1.0600799669191474e-06, "learning_rate": 0.0001867578947368421, "logits/chosen": 13.140104293823242, "logits/rejected": 13.140104293823242, "logps/chosen": -2966.134765625, "logps/rejected": -2966.134765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.8872985839844, "rewards/margins": 0.0, "rewards/rejected": -293.8872985839844, "step": 637 }, { "epoch": 6.715789473684211, "grad_norm": 9.262695925826847e-07, "learning_rate": 0.00018673684210526316, "logits/chosen": 13.153818130493164, "logits/rejected": 13.153818130493164, "logps/chosen": -2672.134765625, "logps/rejected": -2672.134765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.431884765625, "rewards/margins": 0.0, "rewards/rejected": -264.431884765625, "step": 638 }, { "epoch": 6.726315789473684, "grad_norm": 1.5997011360013857e-06, "learning_rate": 0.0001867157894736842, "logits/chosen": 13.146134376525879, "logits/rejected": 13.146134376525879, "logps/chosen": -4000.60546875, "logps/rejected": -4000.60546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.2332458496094, "rewards/margins": 0.0, "rewards/rejected": -397.2332458496094, "step": 639 }, { "epoch": 6.7368421052631575, "grad_norm": 2.5365302462887485e-06, "learning_rate": 0.0001866947368421053, "logits/chosen": 13.165045738220215, "logits/rejected": 13.165045738220215, "logps/chosen": -4878.8984375, "logps/rejected": -4878.8984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.0426940917969, "rewards/margins": 0.0, "rewards/rejected": -485.0426940917969, "step": 640 }, { "epoch": 6.747368421052632, "grad_norm": 1.5845118923607515e-06, "learning_rate": 0.00018667368421052634, "logits/chosen": 13.175384521484375, "logits/rejected": 13.175384521484375, "logps/chosen": -4879.02587890625, "logps/rejected": -4879.02587890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.055419921875, "rewards/margins": 0.0, "rewards/rejected": -485.055419921875, "step": 641 }, { "epoch": 6.757894736842105, "grad_norm": 1.344304223493964e-06, "learning_rate": 0.00018665263157894736, "logits/chosen": 13.19092845916748, "logits/rejected": 13.19092845916748, "logps/chosen": -4325.8671875, "logps/rejected": -4325.8671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.40185546875, "rewards/margins": 0.0, "rewards/rejected": -429.40185546875, "step": 642 }, { "epoch": 6.768421052631579, "grad_norm": 3.5879138522432186e-06, "learning_rate": 0.00018663157894736844, "logits/chosen": 13.225683212280273, "logits/rejected": 13.225683212280273, "logps/chosen": -5173.5927734375, "logps/rejected": -5173.5927734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.4208374023438, "rewards/margins": 0.0, "rewards/rejected": -514.4208374023438, "step": 643 }, { "epoch": 6.778947368421052, "grad_norm": 3.543569619068876e-06, "learning_rate": 0.00018661052631578948, "logits/chosen": 13.232131004333496, "logits/rejected": 13.232131004333496, "logps/chosen": -5173.76513671875, "logps/rejected": -5173.76513671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.4381103515625, "rewards/margins": 0.0, "rewards/rejected": -514.4381103515625, "step": 644 }, { "epoch": 6.7894736842105265, "grad_norm": 1.3645823173646932e-06, "learning_rate": 0.00018658947368421053, "logits/chosen": 13.19379997253418, "logits/rejected": 13.19379997253418, "logps/chosen": -3775.4765625, "logps/rejected": -3775.4765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.6785583496094, "rewards/margins": 0.0, "rewards/rejected": -374.6785583496094, "step": 645 }, { "epoch": 6.8, "grad_norm": 8.641389968033764e-07, "learning_rate": 0.00018656842105263158, "logits/chosen": 13.200854301452637, "logits/rejected": 13.200854301452637, "logps/chosen": -2672.62890625, "logps/rejected": -2672.62890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.4812927246094, "rewards/margins": 0.0, "rewards/rejected": -264.4812927246094, "step": 646 }, { "epoch": 6.810526315789474, "grad_norm": 7.960022117003973e-07, "learning_rate": 0.00018654736842105266, "logits/chosen": 13.188977241516113, "logits/rejected": 13.188977241516113, "logps/chosen": -2967.58984375, "logps/rejected": -2967.58984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.0328063964844, "rewards/margins": 0.0, "rewards/rejected": -294.0328063964844, "step": 647 }, { "epoch": 6.821052631578947, "grad_norm": 5.177711500436999e-06, "learning_rate": 0.00018652631578947368, "logits/chosen": 13.22115421295166, "logits/rejected": 13.22115421295166, "logps/chosen": -5174.9287109375, "logps/rejected": -5174.9287109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.554443359375, "rewards/margins": 0.0, "rewards/rejected": -514.554443359375, "step": 648 }, { "epoch": 6.831578947368421, "grad_norm": 1.72847023804934e-06, "learning_rate": 0.00018650526315789473, "logits/chosen": 13.200213432312012, "logits/rejected": 13.200213432312012, "logps/chosen": -3775.712890625, "logps/rejected": -3775.712890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.7021789550781, "rewards/margins": 0.0, "rewards/rejected": -374.7021789550781, "step": 649 }, { "epoch": 6.842105263157895, "grad_norm": 9.512602332506503e-07, "learning_rate": 0.0001864842105263158, "logits/chosen": 13.214202880859375, "logits/rejected": 13.214202880859375, "logps/chosen": -3542.818359375, "logps/rejected": -3542.818359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.2559509277344, "rewards/margins": 0.0, "rewards/rejected": -351.2559509277344, "step": 650 }, { "epoch": 6.842105263157895, "eval_logits/chosen": 13.250373840332031, "eval_logits/rejected": 13.250373840332031, "eval_logps/chosen": -4310.6591796875, "eval_logps/rejected": -4310.6591796875, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -428.1627502441406, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -428.1627502441406, "eval_runtime": 4.4113, "eval_samples_per_second": 2.267, "eval_steps_per_second": 2.267, "step": 650 }, { "epoch": 6.852631578947369, "grad_norm": 1.7020762470565387e-06, "learning_rate": 0.00018646315789473686, "logits/chosen": 13.229392051696777, "logits/rejected": 13.229392051696777, "logps/chosen": -3543.119140625, "logps/rejected": -3543.119140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.2860412597656, "rewards/margins": 0.0, "rewards/rejected": -351.2860412597656, "step": 651 }, { "epoch": 6.863157894736842, "grad_norm": 3.6161384286970133e-06, "learning_rate": 0.0001864421052631579, "logits/chosen": 13.259687423706055, "logits/rejected": 13.259687423706055, "logps/chosen": -4879.25537109375, "logps/rejected": -4879.25537109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.078369140625, "rewards/margins": 0.0, "rewards/rejected": -485.078369140625, "step": 652 }, { "epoch": 6.873684210526315, "grad_norm": 3.063306849071523e-06, "learning_rate": 0.00018642105263157896, "logits/chosen": 13.283510208129883, "logits/rejected": 13.283510208129883, "logps/chosen": -5176.0205078125, "logps/rejected": -5176.0205078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6636352539062, "rewards/margins": 0.0, "rewards/rejected": -514.6636352539062, "step": 653 }, { "epoch": 6.88421052631579, "grad_norm": 1.32214620407467e-06, "learning_rate": 0.00018640000000000003, "logits/chosen": 13.257993698120117, "logits/rejected": 13.257993698120117, "logps/chosen": -4879.55810546875, "logps/rejected": -4879.55810546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.108642578125, "rewards/margins": 0.0, "rewards/rejected": -485.108642578125, "step": 654 }, { "epoch": 6.894736842105263, "grad_norm": 2.4693399609532207e-06, "learning_rate": 0.00018637894736842105, "logits/chosen": 13.275491714477539, "logits/rejected": 13.275491714477539, "logps/chosen": -5176.6357421875, "logps/rejected": -5176.6357421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.7251586914062, "rewards/margins": 0.0, "rewards/rejected": -514.7251586914062, "step": 655 }, { "epoch": 6.905263157894737, "grad_norm": 1.0873590099436115e-06, "learning_rate": 0.0001863578947368421, "logits/chosen": 13.24202823638916, "logits/rejected": 13.24202823638916, "logps/chosen": -3543.806640625, "logps/rejected": -3543.806640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.35479736328125, "rewards/margins": 0.0, "rewards/rejected": -351.35479736328125, "step": 656 }, { "epoch": 6.91578947368421, "grad_norm": 2.6836864890356082e-06, "learning_rate": 0.00018633684210526315, "logits/chosen": 13.268362998962402, "logits/rejected": 13.268362998962402, "logps/chosen": -4878.84375, "logps/rejected": -4878.84375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.0372009277344, "rewards/margins": 0.0, "rewards/rejected": -485.0372009277344, "step": 657 }, { "epoch": 6.926315789473684, "grad_norm": 1.329520387116645e-06, "learning_rate": 0.00018631578947368423, "logits/chosen": 13.268468856811523, "logits/rejected": 13.268468856811523, "logps/chosen": -3776.91796875, "logps/rejected": -3776.91796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.82269287109375, "rewards/margins": 0.0, "rewards/rejected": -374.82269287109375, "step": 658 }, { "epoch": 6.936842105263158, "grad_norm": 1.1006051181539078e-06, "learning_rate": 0.00018629473684210528, "logits/chosen": 13.280906677246094, "logits/rejected": 13.280906677246094, "logps/chosen": -3544.0625, "logps/rejected": -3544.0625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.38037109375, "rewards/margins": 0.0, "rewards/rejected": -351.38037109375, "step": 659 }, { "epoch": 6.947368421052632, "grad_norm": 2.2968833945924416e-06, "learning_rate": 0.00018627368421052633, "logits/chosen": 13.311330795288086, "logits/rejected": 13.311330795288086, "logps/chosen": -4880.15869140625, "logps/rejected": -4880.15869140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.168701171875, "rewards/margins": 0.0, "rewards/rejected": -485.168701171875, "step": 660 }, { "epoch": 6.957894736842105, "grad_norm": 1.642505139898276e-06, "learning_rate": 0.00018625263157894738, "logits/chosen": 13.310528755187988, "logits/rejected": 13.310528755187988, "logps/chosen": -2671.78125, "logps/rejected": -2671.78125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.39654541015625, "rewards/margins": 0.0, "rewards/rejected": -264.39654541015625, "step": 661 }, { "epoch": 6.968421052631579, "grad_norm": 1.7499687601230107e-06, "learning_rate": 0.00018623157894736843, "logits/chosen": 13.307360649108887, "logits/rejected": 13.307360649108887, "logps/chosen": -3756.353515625, "logps/rejected": -3756.353515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.717041015625, "rewards/margins": 0.0, "rewards/rejected": -372.717041015625, "step": 662 }, { "epoch": 6.978947368421053, "grad_norm": 9.157689646599465e-07, "learning_rate": 0.00018621052631578947, "logits/chosen": 13.295308113098145, "logits/rejected": 13.295308113098145, "logps/chosen": -3544.298828125, "logps/rejected": -3544.298828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.4040222167969, "rewards/margins": 0.0, "rewards/rejected": -351.4040222167969, "step": 663 }, { "epoch": 6.989473684210527, "grad_norm": 1.0724845651566284e-06, "learning_rate": 0.00018618947368421052, "logits/chosen": 13.297536849975586, "logits/rejected": 13.297536849975586, "logps/chosen": -3756.572265625, "logps/rejected": -3756.572265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.7389221191406, "rewards/margins": 0.0, "rewards/rejected": -372.7389221191406, "step": 664 }, { "epoch": 7.0, "grad_norm": 1.8944460862257984e-06, "learning_rate": 0.0001861684210526316, "logits/chosen": 13.286480903625488, "logits/rejected": 13.286480903625488, "logps/chosen": -3993.939453125, "logps/rejected": -3993.939453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.5666198730469, "rewards/margins": 0.0, "rewards/rejected": -396.5666198730469, "step": 665 }, { "epoch": 7.010526315789473, "grad_norm": 2.970632522192318e-06, "learning_rate": 0.00018614736842105265, "logits/chosen": 13.296331405639648, "logits/rejected": 13.296331405639648, "logps/chosen": -4880.51708984375, "logps/rejected": -4880.51708984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.2045593261719, "rewards/margins": 0.0, "rewards/rejected": -485.2045593261719, "step": 666 }, { "epoch": 7.021052631578947, "grad_norm": 8.019470101316983e-07, "learning_rate": 0.0001861263157894737, "logits/chosen": 13.281067848205566, "logits/rejected": 13.281067848205566, "logps/chosen": -3544.388671875, "logps/rejected": -3544.388671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.4129943847656, "rewards/margins": 0.0, "rewards/rejected": -351.4129943847656, "step": 667 }, { "epoch": 7.031578947368421, "grad_norm": 1.278908030144521e-06, "learning_rate": 0.00018610526315789475, "logits/chosen": 13.323651313781738, "logits/rejected": 13.323651313781738, "logps/chosen": -5178.09765625, "logps/rejected": -5178.09765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.871337890625, "rewards/margins": 0.0, "rewards/rejected": -514.871337890625, "step": 668 }, { "epoch": 7.042105263157895, "grad_norm": 1.986194774872274e-06, "learning_rate": 0.0001860842105263158, "logits/chosen": 13.32851505279541, "logits/rejected": 13.32851505279541, "logps/chosen": -5177.79296875, "logps/rejected": -5177.79296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.8408813476562, "rewards/margins": 0.0, "rewards/rejected": -514.8408813476562, "step": 669 }, { "epoch": 7.052631578947368, "grad_norm": 1.6273856999760028e-06, "learning_rate": 0.00018606315789473685, "logits/chosen": 13.29207706451416, "logits/rejected": 13.29207706451416, "logps/chosen": -3544.349609375, "logps/rejected": -3544.349609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.4090881347656, "rewards/margins": 0.0, "rewards/rejected": -351.4090881347656, "step": 670 }, { "epoch": 7.063157894736842, "grad_norm": 1.5197899756458355e-06, "learning_rate": 0.0001860421052631579, "logits/chosen": 13.292393684387207, "logits/rejected": 13.292393684387207, "logps/chosen": -3544.490234375, "logps/rejected": -3544.490234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.42315673828125, "rewards/margins": 0.0, "rewards/rejected": -351.42315673828125, "step": 671 }, { "epoch": 7.073684210526316, "grad_norm": 1.2948888752362109e-06, "learning_rate": 0.00018602105263157897, "logits/chosen": 13.297388076782227, "logits/rejected": 13.297388076782227, "logps/chosen": -3757.548828125, "logps/rejected": -3757.548828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.8365783691406, "rewards/margins": 0.0, "rewards/rejected": -372.8365783691406, "step": 672 }, { "epoch": 7.08421052631579, "grad_norm": 1.240227788912307e-06, "learning_rate": 0.00018600000000000002, "logits/chosen": 13.324767112731934, "logits/rejected": 13.324767112731934, "logps/chosen": -5178.626953125, "logps/rejected": -5178.626953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.9242553710938, "rewards/margins": 0.0, "rewards/rejected": -514.9242553710938, "step": 673 }, { "epoch": 7.094736842105263, "grad_norm": 1.3515210639525321e-06, "learning_rate": 0.00018597894736842104, "logits/chosen": 13.28498649597168, "logits/rejected": 13.28498649597168, "logps/chosen": -3776.9296875, "logps/rejected": -3776.9296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.8238525390625, "rewards/margins": 0.0, "rewards/rejected": -374.8238525390625, "step": 674 }, { "epoch": 7.105263157894737, "grad_norm": 1.4354501445268397e-06, "learning_rate": 0.00018595789473684212, "logits/chosen": 13.283524513244629, "logits/rejected": 13.283524513244629, "logps/chosen": -3994.044921875, "logps/rejected": -3994.044921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.5771789550781, "rewards/margins": 0.0, "rewards/rejected": -396.5771789550781, "step": 675 }, { "epoch": 7.11578947368421, "grad_norm": 1.4314811096483027e-06, "learning_rate": 0.00018593684210526317, "logits/chosen": 13.27773666381836, "logits/rejected": 13.27773666381836, "logps/chosen": -3776.982421875, "logps/rejected": -3776.982421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.8291320800781, "rewards/margins": 0.0, "rewards/rejected": -374.8291320800781, "step": 676 }, { "epoch": 7.126315789473685, "grad_norm": 1.3188907814765116e-06, "learning_rate": 0.00018591578947368422, "logits/chosen": 13.273841857910156, "logits/rejected": 13.273841857910156, "logps/chosen": -3993.76171875, "logps/rejected": -3993.76171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.5488586425781, "rewards/margins": 0.0, "rewards/rejected": -396.5488586425781, "step": 677 }, { "epoch": 7.136842105263158, "grad_norm": 1.1867692819578224e-06, "learning_rate": 0.00018589473684210527, "logits/chosen": 13.283470153808594, "logits/rejected": 13.283470153808594, "logps/chosen": -4879.34814453125, "logps/rejected": -4879.34814453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.087646484375, "rewards/margins": 0.0, "rewards/rejected": -485.087646484375, "step": 678 }, { "epoch": 7.147368421052631, "grad_norm": 1.3514687680071802e-06, "learning_rate": 0.00018587368421052634, "logits/chosen": 13.277853965759277, "logits/rejected": 13.277853965759277, "logps/chosen": -4325.521484375, "logps/rejected": -4325.521484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.3672790527344, "rewards/margins": 0.0, "rewards/rejected": -429.3672790527344, "step": 679 }, { "epoch": 7.157894736842105, "grad_norm": 1.7907515257320483e-06, "learning_rate": 0.00018585263157894737, "logits/chosen": 13.255786895751953, "logits/rejected": 13.255786895751953, "logps/chosen": -3995.244140625, "logps/rejected": -3995.244140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.6971130371094, "rewards/margins": 0.0, "rewards/rejected": -396.6971130371094, "step": 680 }, { "epoch": 7.168421052631579, "grad_norm": 1.462976570110186e-06, "learning_rate": 0.00018583157894736842, "logits/chosen": 13.25105094909668, "logits/rejected": 13.25105094909668, "logps/chosen": -2671.18359375, "logps/rejected": -2671.18359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.3367614746094, "rewards/margins": 0.0, "rewards/rejected": -264.3367614746094, "step": 681 }, { "epoch": 7.178947368421053, "grad_norm": 1.485650273025385e-06, "learning_rate": 0.0001858105263157895, "logits/chosen": 13.227911949157715, "logits/rejected": 13.227911949157715, "logps/chosen": -3777.5625, "logps/rejected": -3777.5625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.88714599609375, "rewards/margins": 0.0, "rewards/rejected": -374.88714599609375, "step": 682 }, { "epoch": 7.189473684210526, "grad_norm": 1.1892149132108898e-06, "learning_rate": 0.00018578947368421054, "logits/chosen": 13.217670440673828, "logits/rejected": 13.217670440673828, "logps/chosen": -3758.11328125, "logps/rejected": -3758.11328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.8930358886719, "rewards/margins": 0.0, "rewards/rejected": -372.8930358886719, "step": 683 }, { "epoch": 7.2, "grad_norm": 9.843425914368709e-07, "learning_rate": 0.0001857684210526316, "logits/chosen": 13.203829765319824, "logits/rejected": 13.203829765319824, "logps/chosen": -2671.970703125, "logps/rejected": -2671.970703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.4154968261719, "rewards/margins": 0.0, "rewards/rejected": -264.4154968261719, "step": 684 }, { "epoch": 7.2105263157894735, "grad_norm": 8.688029993209057e-07, "learning_rate": 0.00018574736842105264, "logits/chosen": 13.187112808227539, "logits/rejected": 13.187112808227539, "logps/chosen": -2672.224609375, "logps/rejected": -2672.224609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.4408874511719, "rewards/margins": 0.0, "rewards/rejected": -264.4408874511719, "step": 685 }, { "epoch": 7.221052631578948, "grad_norm": 8.484906288686034e-07, "learning_rate": 0.00018572631578947372, "logits/chosen": 13.170920372009277, "logits/rejected": 13.170920372009277, "logps/chosen": -2672.4990234375, "logps/rejected": -2672.4990234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.46832275390625, "rewards/margins": 0.0, "rewards/rejected": -264.46832275390625, "step": 686 }, { "epoch": 7.231578947368421, "grad_norm": 1.3189201126806438e-06, "learning_rate": 0.00018570526315789474, "logits/chosen": 13.146041870117188, "logits/rejected": 13.146041870117188, "logps/chosen": -3543.5703125, "logps/rejected": -3543.5703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3311462402344, "rewards/margins": 0.0, "rewards/rejected": -351.3311462402344, "step": 687 }, { "epoch": 7.242105263157895, "grad_norm": 3.5408058920438634e-06, "learning_rate": 0.0001856842105263158, "logits/chosen": 13.157353401184082, "logits/rejected": 13.157353401184082, "logps/chosen": -4878.5478515625, "logps/rejected": -4878.5478515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.00762939453125, "rewards/margins": 0.0, "rewards/rejected": -485.00762939453125, "step": 688 }, { "epoch": 7.252631578947368, "grad_norm": 2.385500010859687e-06, "learning_rate": 0.00018566315789473684, "logits/chosen": 13.164440155029297, "logits/rejected": 13.164440155029297, "logps/chosen": -4325.8369140625, "logps/rejected": -4325.8369140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.3988342285156, "rewards/margins": 0.0, "rewards/rejected": -429.3988342285156, "step": 689 }, { "epoch": 7.2631578947368425, "grad_norm": 1.5604205145791639e-06, "learning_rate": 0.0001856421052631579, "logits/chosen": 13.159071922302246, "logits/rejected": 13.159071922302246, "logps/chosen": -3997.8828125, "logps/rejected": -3997.8828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9609680175781, "rewards/margins": 0.0, "rewards/rejected": -396.9609680175781, "step": 690 }, { "epoch": 7.273684210526316, "grad_norm": 1.8582973098091315e-06, "learning_rate": 0.00018562105263157896, "logits/chosen": 13.169404983520508, "logits/rejected": 13.169404983520508, "logps/chosen": -2674.0634765625, "logps/rejected": -2674.0634765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.624755859375, "rewards/margins": 0.0, "rewards/rejected": -264.624755859375, "step": 691 }, { "epoch": 7.284210526315789, "grad_norm": 2.6544091724645114e-06, "learning_rate": 0.0001856, "logits/chosen": 13.18026065826416, "logits/rejected": 13.18026065826416, "logps/chosen": -4326.390625, "logps/rejected": -4326.390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.4541931152344, "rewards/margins": 0.0, "rewards/rejected": -429.4541931152344, "step": 692 }, { "epoch": 7.294736842105263, "grad_norm": 2.2888709736434976e-06, "learning_rate": 0.00018557894736842106, "logits/chosen": 13.155786514282227, "logits/rejected": 13.155786514282227, "logps/chosen": -4286.35693359375, "logps/rejected": -4286.35693359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.8383483886719, "rewards/margins": 0.0, "rewards/rejected": -425.8383483886719, "step": 693 }, { "epoch": 7.3052631578947365, "grad_norm": 1.6445006849608035e-06, "learning_rate": 0.0001855578947368421, "logits/chosen": 13.16234016418457, "logits/rejected": 13.16234016418457, "logps/chosen": -4878.3720703125, "logps/rejected": -4878.3720703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.99005126953125, "rewards/margins": 0.0, "rewards/rejected": -484.99005126953125, "step": 694 }, { "epoch": 7.315789473684211, "grad_norm": 9.28954591472575e-07, "learning_rate": 0.00018553684210526316, "logits/chosen": 13.133416175842285, "logits/rejected": 13.133416175842285, "logps/chosen": -3543.490234375, "logps/rejected": -3543.490234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3231506347656, "rewards/margins": 0.0, "rewards/rejected": -351.3231506347656, "step": 695 }, { "epoch": 7.326315789473684, "grad_norm": 3.3765720672818134e-06, "learning_rate": 0.0001855157894736842, "logits/chosen": 13.147964477539062, "logits/rejected": 13.147964477539062, "logps/chosen": -4878.134765625, "logps/rejected": -4878.134765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.96630859375, "rewards/margins": 0.0, "rewards/rejected": -484.96630859375, "step": 696 }, { "epoch": 7.336842105263158, "grad_norm": 3.7226850508886855e-06, "learning_rate": 0.00018549473684210529, "logits/chosen": 13.175617218017578, "logits/rejected": 13.175617218017578, "logps/chosen": -5173.36669921875, "logps/rejected": -5173.36669921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.3982543945312, "rewards/margins": 0.0, "rewards/rejected": -514.3982543945312, "step": 697 }, { "epoch": 7.347368421052631, "grad_norm": 9.498692747911264e-07, "learning_rate": 0.00018547368421052633, "logits/chosen": 13.16496467590332, "logits/rejected": 13.16496467590332, "logps/chosen": -3758.5390625, "logps/rejected": -3758.5390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.93560791015625, "rewards/margins": 0.0, "rewards/rejected": -372.93560791015625, "step": 698 }, { "epoch": 7.3578947368421055, "grad_norm": 2.379321131229517e-06, "learning_rate": 0.00018545263157894736, "logits/chosen": 13.217950820922852, "logits/rejected": 13.217950820922852, "logps/chosen": -5173.88623046875, "logps/rejected": -5173.88623046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.4501953125, "rewards/margins": 0.0, "rewards/rejected": -514.4501953125, "step": 699 }, { "epoch": 7.368421052631579, "grad_norm": 2.889323013732792e-06, "learning_rate": 0.00018543157894736843, "logits/chosen": 13.214141845703125, "logits/rejected": 13.214141845703125, "logps/chosen": -4878.8359375, "logps/rejected": -4878.8359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.03643798828125, "rewards/margins": 0.0, "rewards/rejected": -485.03643798828125, "step": 700 }, { "epoch": 7.368421052631579, "eval_logits/chosen": 13.222890853881836, "eval_logits/rejected": 13.222890853881836, "eval_logps/chosen": -4310.6044921875, "eval_logps/rejected": -4310.6044921875, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -428.1573181152344, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -428.1573181152344, "eval_runtime": 4.2544, "eval_samples_per_second": 2.351, "eval_steps_per_second": 2.351, "step": 700 }, { "epoch": 7.378947368421053, "grad_norm": 1.1947388429689454e-06, "learning_rate": 0.00018541052631578948, "logits/chosen": 13.222030639648438, "logits/rejected": 13.222030639648438, "logps/chosen": -4327.9228515625, "logps/rejected": -4327.9228515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.607421875, "rewards/margins": 0.0, "rewards/rejected": -429.607421875, "step": 701 }, { "epoch": 7.389473684210526, "grad_norm": 1.419441900907259e-06, "learning_rate": 0.00018538947368421053, "logits/chosen": 13.211609840393066, "logits/rejected": 13.211609840393066, "logps/chosen": -4287.52099609375, "logps/rejected": -4287.52099609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.9547424316406, "rewards/margins": 0.0, "rewards/rejected": -425.9547424316406, "step": 702 }, { "epoch": 7.4, "grad_norm": 1.3463594541462953e-06, "learning_rate": 0.00018536842105263158, "logits/chosen": 13.216324806213379, "logits/rejected": 13.216324806213379, "logps/chosen": -4287.73095703125, "logps/rejected": -4287.73095703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.9757385253906, "rewards/margins": 0.0, "rewards/rejected": -425.9757385253906, "step": 703 }, { "epoch": 7.410526315789474, "grad_norm": 1.0049189995697816e-06, "learning_rate": 0.00018534736842105266, "logits/chosen": 13.217493057250977, "logits/rejected": 13.217493057250977, "logps/chosen": -3543.5703125, "logps/rejected": -3543.5703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3311462402344, "rewards/margins": 0.0, "rewards/rejected": -351.3311462402344, "step": 704 }, { "epoch": 7.421052631578947, "grad_norm": 1.2000433571301983e-06, "learning_rate": 0.0001853263157894737, "logits/chosen": 13.222097396850586, "logits/rejected": 13.222097396850586, "logps/chosen": -3996.91015625, "logps/rejected": -3996.91015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.86370849609375, "rewards/margins": 0.0, "rewards/rejected": -396.86370849609375, "step": 705 }, { "epoch": 7.431578947368421, "grad_norm": 1.2435954204192967e-06, "learning_rate": 0.00018530526315789473, "logits/chosen": 13.22079849243164, "logits/rejected": 13.22079849243164, "logps/chosen": -3996.921875, "logps/rejected": -3996.921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.8648681640625, "rewards/margins": 0.0, "rewards/rejected": -396.8648681640625, "step": 706 }, { "epoch": 7.442105263157894, "grad_norm": 1.420723378942057e-06, "learning_rate": 0.0001852842105263158, "logits/chosen": 13.214343070983887, "logits/rejected": 13.214343070983887, "logps/chosen": -3777.92578125, "logps/rejected": -3777.92578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.9234619140625, "rewards/margins": 0.0, "rewards/rejected": -374.9234619140625, "step": 707 }, { "epoch": 7.4526315789473685, "grad_norm": 1.2402593938531936e-06, "learning_rate": 0.00018526315789473685, "logits/chosen": 13.21203899383545, "logits/rejected": 13.21203899383545, "logps/chosen": -3997.42578125, "logps/rejected": -3997.42578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9152526855469, "rewards/margins": 0.0, "rewards/rejected": -396.9152526855469, "step": 708 }, { "epoch": 7.463157894736842, "grad_norm": 8.847569006320555e-07, "learning_rate": 0.0001852421052631579, "logits/chosen": 13.203995704650879, "logits/rejected": 13.203995704650879, "logps/chosen": -3543.4814453125, "logps/rejected": -3543.4814453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.322265625, "rewards/margins": 0.0, "rewards/rejected": -351.322265625, "step": 709 }, { "epoch": 7.473684210526316, "grad_norm": 1.2636772908081184e-06, "learning_rate": 0.00018522105263157895, "logits/chosen": 13.199202537536621, "logits/rejected": 13.199202537536621, "logps/chosen": -3778.107421875, "logps/rejected": -3778.107421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.9416198730469, "rewards/margins": 0.0, "rewards/rejected": -374.9416198730469, "step": 710 }, { "epoch": 7.484210526315789, "grad_norm": 9.880624247671221e-07, "learning_rate": 0.00018520000000000003, "logits/chosen": 13.195527076721191, "logits/rejected": 13.195527076721191, "logps/chosen": -3543.294921875, "logps/rejected": -3543.294921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3036193847656, "rewards/margins": 0.0, "rewards/rejected": -351.3036193847656, "step": 711 }, { "epoch": 7.494736842105263, "grad_norm": 1.0237110927846516e-06, "learning_rate": 0.00018517894736842105, "logits/chosen": 13.193687438964844, "logits/rejected": 13.193687438964844, "logps/chosen": -3543.259765625, "logps/rejected": -3543.259765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.30010986328125, "rewards/margins": 0.0, "rewards/rejected": -351.30010986328125, "step": 712 }, { "epoch": 7.505263157894737, "grad_norm": 1.0467896345289773e-06, "learning_rate": 0.0001851578947368421, "logits/chosen": 13.198676109313965, "logits/rejected": 13.198676109313965, "logps/chosen": -3758.3916015625, "logps/rejected": -3758.3916015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.9208679199219, "rewards/margins": 0.0, "rewards/rejected": -372.9208679199219, "step": 713 }, { "epoch": 7.515789473684211, "grad_norm": 1.1414570053602802e-06, "learning_rate": 0.00018513684210526318, "logits/chosen": 13.191129684448242, "logits/rejected": 13.191129684448242, "logps/chosen": -3998.025390625, "logps/rejected": -3998.025390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9752197265625, "rewards/margins": 0.0, "rewards/rejected": -396.9752197265625, "step": 714 }, { "epoch": 7.526315789473684, "grad_norm": 1.6184153537324164e-06, "learning_rate": 0.00018511578947368423, "logits/chosen": 13.223023414611816, "logits/rejected": 13.223023414611816, "logps/chosen": -5174.17236328125, "logps/rejected": -5174.17236328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.4788208007812, "rewards/margins": 0.0, "rewards/rejected": -514.4788208007812, "step": 715 }, { "epoch": 7.536842105263158, "grad_norm": 1.2566912346301251e-06, "learning_rate": 0.00018509473684210528, "logits/chosen": 13.179170608520508, "logits/rejected": 13.179170608520508, "logps/chosen": -3778.5859375, "logps/rejected": -3778.5859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.9894714355469, "rewards/margins": 0.0, "rewards/rejected": -374.9894714355469, "step": 716 }, { "epoch": 7.5473684210526315, "grad_norm": 8.977760330708406e-07, "learning_rate": 0.00018507368421052632, "logits/chosen": 13.176719665527344, "logits/rejected": 13.176719665527344, "logps/chosen": -3543.41796875, "logps/rejected": -3543.41796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.31591796875, "rewards/margins": 0.0, "rewards/rejected": -351.31591796875, "step": 717 }, { "epoch": 7.557894736842105, "grad_norm": 8.587193178755115e-07, "learning_rate": 0.00018505263157894737, "logits/chosen": 13.175223350524902, "logits/rejected": 13.175223350524902, "logps/chosen": -2967.9755859375, "logps/rejected": -2967.9755859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.0713806152344, "rewards/margins": 0.0, "rewards/rejected": -294.0713806152344, "step": 718 }, { "epoch": 7.568421052631579, "grad_norm": 1.6446061863462091e-06, "learning_rate": 0.00018503157894736842, "logits/chosen": 13.213809967041016, "logits/rejected": 13.213809967041016, "logps/chosen": -5173.822265625, "logps/rejected": -5173.822265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.4437866210938, "rewards/margins": 0.0, "rewards/rejected": -514.4437866210938, "step": 719 }, { "epoch": 7.578947368421053, "grad_norm": 1.615068413229892e-06, "learning_rate": 0.00018501052631578947, "logits/chosen": 13.215839385986328, "logits/rejected": 13.215839385986328, "logps/chosen": -5174.09814453125, "logps/rejected": -5174.09814453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.4713745117188, "rewards/margins": 0.0, "rewards/rejected": -514.4713745117188, "step": 720 }, { "epoch": 7.589473684210526, "grad_norm": 8.918532898860576e-07, "learning_rate": 0.00018498947368421052, "logits/chosen": 13.180612564086914, "logits/rejected": 13.180612564086914, "logps/chosen": -3544.0849609375, "logps/rejected": -3544.0849609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.38262939453125, "rewards/margins": 0.0, "rewards/rejected": -351.38262939453125, "step": 721 }, { "epoch": 7.6, "grad_norm": 1.1378305089237983e-06, "learning_rate": 0.0001849684210526316, "logits/chosen": 13.187461853027344, "logits/rejected": 13.187461853027344, "logps/chosen": -3997.93359375, "logps/rejected": -3997.93359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9660339355469, "rewards/margins": 0.0, "rewards/rejected": -396.9660339355469, "step": 722 }, { "epoch": 7.610526315789474, "grad_norm": 1.1426026276240009e-06, "learning_rate": 0.00018494736842105265, "logits/chosen": 13.188652038574219, "logits/rejected": 13.188652038574219, "logps/chosen": -3997.9140625, "logps/rejected": -3997.9140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9640808105469, "rewards/margins": 0.0, "rewards/rejected": -396.9640808105469, "step": 723 }, { "epoch": 7.621052631578947, "grad_norm": 1.634643354009313e-06, "learning_rate": 0.0001849263157894737, "logits/chosen": 13.204824447631836, "logits/rejected": 13.204824447631836, "logps/chosen": -4877.31201171875, "logps/rejected": -4877.31201171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.884033203125, "rewards/margins": 0.0, "rewards/rejected": -484.884033203125, "step": 724 }, { "epoch": 7.631578947368421, "grad_norm": 1.768266542967467e-06, "learning_rate": 0.00018490526315789475, "logits/chosen": 13.202880859375, "logits/rejected": 13.202880859375, "logps/chosen": -4877.3203125, "logps/rejected": -4877.3203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.8848571777344, "rewards/margins": 0.0, "rewards/rejected": -484.8848571777344, "step": 725 }, { "epoch": 7.6421052631578945, "grad_norm": 1.7937302345671924e-06, "learning_rate": 0.0001848842105263158, "logits/chosen": 13.2013578414917, "logits/rejected": 13.2013578414917, "logps/chosen": -4877.49462890625, "logps/rejected": -4877.49462890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.9023132324219, "rewards/margins": 0.0, "rewards/rejected": -484.9023132324219, "step": 726 }, { "epoch": 7.652631578947369, "grad_norm": 1.173831947198778e-06, "learning_rate": 0.00018486315789473684, "logits/chosen": 13.180315971374512, "logits/rejected": 13.180315971374512, "logps/chosen": -3998.373046875, "logps/rejected": -3998.373046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0099792480469, "rewards/margins": 0.0, "rewards/rejected": -397.0099792480469, "step": 727 }, { "epoch": 7.663157894736842, "grad_norm": 1.8086591353494441e-06, "learning_rate": 0.0001848421052631579, "logits/chosen": 13.21358871459961, "logits/rejected": 13.21358871459961, "logps/chosen": -5175.4736328125, "logps/rejected": -5175.4736328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6089477539062, "rewards/margins": 0.0, "rewards/rejected": -514.6089477539062, "step": 728 }, { "epoch": 7.673684210526316, "grad_norm": 1.3535036487155594e-06, "learning_rate": 0.00018482105263157897, "logits/chosen": 13.170319557189941, "logits/rejected": 13.170319557189941, "logps/chosen": -4288.05224609375, "logps/rejected": -4288.05224609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.00787353515625, "rewards/margins": 0.0, "rewards/rejected": -426.00787353515625, "step": 729 }, { "epoch": 7.684210526315789, "grad_norm": 1.0312728591088671e-06, "learning_rate": 0.00018480000000000002, "logits/chosen": 13.164056777954102, "logits/rejected": 13.164056777954102, "logps/chosen": -2966.96484375, "logps/rejected": -2966.96484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.9703063964844, "rewards/margins": 0.0, "rewards/rejected": -293.9703063964844, "step": 730 }, { "epoch": 7.6947368421052635, "grad_norm": 1.3390608728514053e-06, "learning_rate": 0.00018477894736842104, "logits/chosen": 13.161995887756348, "logits/rejected": 13.161995887756348, "logps/chosen": -4287.9638671875, "logps/rejected": -4287.9638671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.9990234375, "rewards/margins": 0.0, "rewards/rejected": -425.9990234375, "step": 731 }, { "epoch": 7.705263157894737, "grad_norm": 2.1017040126025677e-06, "learning_rate": 0.00018475789473684212, "logits/chosen": 13.177229881286621, "logits/rejected": 13.177229881286621, "logps/chosen": -4878.3525390625, "logps/rejected": -4878.3525390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.98809814453125, "rewards/margins": 0.0, "rewards/rejected": -484.98809814453125, "step": 732 }, { "epoch": 7.715789473684211, "grad_norm": 1.4877908824928454e-06, "learning_rate": 0.00018473684210526317, "logits/chosen": 13.151344299316406, "logits/rejected": 13.151344299316406, "logps/chosen": -3776.767578125, "logps/rejected": -3776.767578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.8076477050781, "rewards/margins": 0.0, "rewards/rejected": -374.8076477050781, "step": 733 }, { "epoch": 7.726315789473684, "grad_norm": 1.8996194057763205e-06, "learning_rate": 0.00018471578947368422, "logits/chosen": 13.189788818359375, "logits/rejected": 13.189788818359375, "logps/chosen": -5175.0615234375, "logps/rejected": -5175.0615234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.5677490234375, "rewards/margins": 0.0, "rewards/rejected": -514.5677490234375, "step": 734 }, { "epoch": 7.7368421052631575, "grad_norm": 1.2435864391591167e-06, "learning_rate": 0.00018469473684210527, "logits/chosen": 13.152837753295898, "logits/rejected": 13.152837753295898, "logps/chosen": -3756.8515625, "logps/rejected": -3756.8515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.766845703125, "rewards/margins": 0.0, "rewards/rejected": -372.766845703125, "step": 735 }, { "epoch": 7.747368421052632, "grad_norm": 1.2046779147567577e-06, "learning_rate": 0.00018467368421052634, "logits/chosen": 13.14518928527832, "logits/rejected": 13.14518928527832, "logps/chosen": -3998.337890625, "logps/rejected": -3998.337890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0064697265625, "rewards/margins": 0.0, "rewards/rejected": -397.0064697265625, "step": 736 }, { "epoch": 7.757894736842105, "grad_norm": 2.169391109418939e-06, "learning_rate": 0.0001846526315789474, "logits/chosen": 13.157787322998047, "logits/rejected": 13.157787322998047, "logps/chosen": -4878.3857421875, "logps/rejected": -4878.3857421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.9914245605469, "rewards/margins": 0.0, "rewards/rejected": -484.9914245605469, "step": 737 }, { "epoch": 7.768421052631579, "grad_norm": 2.0689542452601017e-06, "learning_rate": 0.00018463157894736841, "logits/chosen": 13.153223991394043, "logits/rejected": 13.153223991394043, "logps/chosen": -4878.4873046875, "logps/rejected": -4878.4873046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.0015563964844, "rewards/margins": 0.0, "rewards/rejected": -485.0015563964844, "step": 738 }, { "epoch": 7.778947368421052, "grad_norm": 1.987834821193246e-06, "learning_rate": 0.0001846105263157895, "logits/chosen": 13.151511192321777, "logits/rejected": 13.151511192321777, "logps/chosen": -4878.623046875, "logps/rejected": -4878.623046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.01513671875, "rewards/margins": 0.0, "rewards/rejected": -485.01513671875, "step": 739 }, { "epoch": 7.7894736842105265, "grad_norm": 1.003082729766902e-06, "learning_rate": 0.00018458947368421054, "logits/chosen": 13.128302574157715, "logits/rejected": 13.128302574157715, "logps/chosen": -2966.07421875, "logps/rejected": -2966.07421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.8812255859375, "rewards/margins": 0.0, "rewards/rejected": -293.8812255859375, "step": 740 }, { "epoch": 7.8, "grad_norm": 1.1290779866612866e-06, "learning_rate": 0.0001845684210526316, "logits/chosen": 13.134882926940918, "logits/rejected": 13.134882926940918, "logps/chosen": -2669.935546875, "logps/rejected": -2669.935546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.21197509765625, "rewards/margins": 0.0, "rewards/rejected": -264.21197509765625, "step": 741 }, { "epoch": 7.810526315789474, "grad_norm": 1.2003541769445292e-06, "learning_rate": 0.00018454736842105264, "logits/chosen": 13.126954078674316, "logits/rejected": 13.126954078674316, "logps/chosen": -3540.2578125, "logps/rejected": -3540.2578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -350.9999084472656, "rewards/margins": 0.0, "rewards/rejected": -350.9999084472656, "step": 742 }, { "epoch": 7.821052631578947, "grad_norm": 2.1146092876733746e-06, "learning_rate": 0.00018452631578947371, "logits/chosen": 13.170341491699219, "logits/rejected": 13.170341491699219, "logps/chosen": -5173.37109375, "logps/rejected": -5173.37109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.398681640625, "rewards/margins": 0.0, "rewards/rejected": -514.398681640625, "step": 743 }, { "epoch": 7.831578947368421, "grad_norm": 9.55191012508294e-07, "learning_rate": 0.00018450526315789474, "logits/chosen": 13.131455421447754, "logits/rejected": 13.131455421447754, "logps/chosen": -2965.537109375, "logps/rejected": -2965.537109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.8275146484375, "rewards/margins": 0.0, "rewards/rejected": -293.8275146484375, "step": 744 }, { "epoch": 7.842105263157895, "grad_norm": 1.2373399158605025e-06, "learning_rate": 0.00018448421052631579, "logits/chosen": 13.138838768005371, "logits/rejected": 13.138838768005371, "logps/chosen": -3997.7421875, "logps/rejected": -3997.7421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9468994140625, "rewards/margins": 0.0, "rewards/rejected": -396.9468994140625, "step": 745 }, { "epoch": 7.852631578947369, "grad_norm": 9.146277761828969e-07, "learning_rate": 0.00018446315789473686, "logits/chosen": 13.135359764099121, "logits/rejected": 13.135359764099121, "logps/chosen": -2965.5234375, "logps/rejected": -2965.5234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.8261413574219, "rewards/margins": 0.0, "rewards/rejected": -293.8261413574219, "step": 746 }, { "epoch": 7.863157894736842, "grad_norm": 9.955664381777751e-07, "learning_rate": 0.0001844421052631579, "logits/chosen": 13.141283988952637, "logits/rejected": 13.141283988952637, "logps/chosen": -2669.865234375, "logps/rejected": -2669.865234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.2049255371094, "rewards/margins": 0.0, "rewards/rejected": -264.2049255371094, "step": 747 }, { "epoch": 7.873684210526315, "grad_norm": 1.2750660971505567e-06, "learning_rate": 0.00018442105263157896, "logits/chosen": 13.137335777282715, "logits/rejected": 13.137335777282715, "logps/chosen": -3997.3125, "logps/rejected": -3997.3125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9039306640625, "rewards/margins": 0.0, "rewards/rejected": -396.9039306640625, "step": 748 }, { "epoch": 7.88421052631579, "grad_norm": 2.5515464585623704e-06, "learning_rate": 0.0001844, "logits/chosen": 13.152668952941895, "logits/rejected": 13.152668952941895, "logps/chosen": -4321.96875, "logps/rejected": -4321.96875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.01202392578125, "rewards/margins": 0.0, "rewards/rejected": -429.01202392578125, "step": 749 }, { "epoch": 7.894736842105263, "grad_norm": 1.0633815463734209e-06, "learning_rate": 0.00018437894736842106, "logits/chosen": 13.141569137573242, "logits/rejected": 13.141569137573242, "logps/chosen": -2670.77734375, "logps/rejected": -2670.77734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.296142578125, "rewards/margins": 0.0, "rewards/rejected": -264.296142578125, "step": 750 }, { "epoch": 7.894736842105263, "eval_logits/chosen": 13.161173820495605, "eval_logits/rejected": 13.161173820495605, "eval_logps/chosen": -4308.30615234375, "eval_logps/rejected": -4308.30615234375, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -427.92742919921875, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -427.92742919921875, "eval_runtime": 4.3335, "eval_samples_per_second": 2.308, "eval_steps_per_second": 2.308, "step": 750 }, { "epoch": 7.905263157894737, "grad_norm": 1.4386038174052374e-06, "learning_rate": 0.0001843578947368421, "logits/chosen": 13.139097213745117, "logits/rejected": 13.139097213745117, "logps/chosen": -3539.87890625, "logps/rejected": -3539.87890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -350.9620056152344, "rewards/margins": 0.0, "rewards/rejected": -350.9620056152344, "step": 751 }, { "epoch": 7.91578947368421, "grad_norm": 1.4269968460212112e-06, "learning_rate": 0.00018433684210526316, "logits/chosen": 13.14493465423584, "logits/rejected": 13.14493465423584, "logps/chosen": -4287.3388671875, "logps/rejected": -4287.3388671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.9365234375, "rewards/margins": 0.0, "rewards/rejected": -425.9365234375, "step": 752 }, { "epoch": 7.926315789473684, "grad_norm": 1.2781040368281538e-06, "learning_rate": 0.0001843157894736842, "logits/chosen": 13.150657653808594, "logits/rejected": 13.150657653808594, "logps/chosen": -2671.9609375, "logps/rejected": -2671.9609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.4145202636719, "rewards/margins": 0.0, "rewards/rejected": -264.4145202636719, "step": 753 }, { "epoch": 7.936842105263158, "grad_norm": 1.382065192956361e-06, "learning_rate": 0.00018429473684210528, "logits/chosen": 13.151413917541504, "logits/rejected": 13.151413917541504, "logps/chosen": -3757.353515625, "logps/rejected": -3757.353515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.8170471191406, "rewards/margins": 0.0, "rewards/rejected": -372.8170471191406, "step": 754 }, { "epoch": 7.947368421052632, "grad_norm": 2.0304607915022643e-06, "learning_rate": 0.00018427368421052633, "logits/chosen": 13.185158729553223, "logits/rejected": 13.185158729553223, "logps/chosen": -5173.490234375, "logps/rejected": -5173.490234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.4105834960938, "rewards/margins": 0.0, "rewards/rejected": -514.4105834960938, "step": 755 }, { "epoch": 7.957894736842105, "grad_norm": 9.280582844439778e-07, "learning_rate": 0.00018425263157894738, "logits/chosen": 13.14723014831543, "logits/rejected": 13.14723014831543, "logps/chosen": -2673.048828125, "logps/rejected": -2673.048828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.5232849121094, "rewards/margins": 0.0, "rewards/rejected": -264.5232849121094, "step": 756 }, { "epoch": 7.968421052631579, "grad_norm": 1.270628899874282e-06, "learning_rate": 0.00018423157894736843, "logits/chosen": 13.143248558044434, "logits/rejected": 13.143248558044434, "logps/chosen": -3998.091796875, "logps/rejected": -3998.091796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.98187255859375, "rewards/margins": 0.0, "rewards/rejected": -396.98187255859375, "step": 757 }, { "epoch": 7.978947368421053, "grad_norm": 8.530884088031598e-07, "learning_rate": 0.00018421052631578948, "logits/chosen": 13.136255264282227, "logits/rejected": 13.136255264282227, "logps/chosen": -2967.951171875, "logps/rejected": -2967.951171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.0689392089844, "rewards/margins": 0.0, "rewards/rejected": -294.0689392089844, "step": 758 }, { "epoch": 7.989473684210527, "grad_norm": 1.972178324649576e-06, "learning_rate": 0.00018418947368421053, "logits/chosen": 13.158719062805176, "logits/rejected": 13.158719062805176, "logps/chosen": -4877.99609375, "logps/rejected": -4877.99609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.95245361328125, "rewards/margins": 0.0, "rewards/rejected": -484.95245361328125, "step": 759 }, { "epoch": 8.0, "grad_norm": 1.0304795523552457e-06, "learning_rate": 0.00018416842105263158, "logits/chosen": 13.136702537536621, "logits/rejected": 13.136702537536621, "logps/chosen": -3541.220703125, "logps/rejected": -3541.220703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.09619140625, "rewards/margins": 0.0, "rewards/rejected": -351.09619140625, "step": 760 }, { "epoch": 8.010526315789473, "grad_norm": 9.369648523716023e-07, "learning_rate": 0.00018414736842105266, "logits/chosen": 13.140413284301758, "logits/rejected": 13.140413284301758, "logps/chosen": -2968.234375, "logps/rejected": -2968.234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.0972595214844, "rewards/margins": 0.0, "rewards/rejected": -294.0972595214844, "step": 761 }, { "epoch": 8.021052631578947, "grad_norm": 2.2311128304863814e-06, "learning_rate": 0.0001841263157894737, "logits/chosen": 13.189372062683105, "logits/rejected": 13.189372062683105, "logps/chosen": -5173.771484375, "logps/rejected": -5173.771484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.438720703125, "rewards/margins": 0.0, "rewards/rejected": -514.438720703125, "step": 762 }, { "epoch": 8.031578947368422, "grad_norm": 2.4661135284986813e-06, "learning_rate": 0.00018410526315789473, "logits/chosen": 13.174569129943848, "logits/rejected": 13.174569129943848, "logps/chosen": -4877.75439453125, "logps/rejected": -4877.75439453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.92828369140625, "rewards/margins": 0.0, "rewards/rejected": -484.92828369140625, "step": 763 }, { "epoch": 8.042105263157895, "grad_norm": 1.6518553138666903e-06, "learning_rate": 0.0001840842105263158, "logits/chosen": 13.149615287780762, "logits/rejected": 13.149615287780762, "logps/chosen": -3774.9501953125, "logps/rejected": -3774.9501953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.62591552734375, "rewards/margins": 0.0, "rewards/rejected": -374.62591552734375, "step": 764 }, { "epoch": 8.052631578947368, "grad_norm": 1.2084163927283953e-06, "learning_rate": 0.00018406315789473685, "logits/chosen": 13.151865005493164, "logits/rejected": 13.151865005493164, "logps/chosen": -3541.8154296875, "logps/rejected": -3541.8154296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.1556701660156, "rewards/margins": 0.0, "rewards/rejected": -351.1556701660156, "step": 765 }, { "epoch": 8.063157894736841, "grad_norm": 1.1007351758962614e-06, "learning_rate": 0.0001840421052631579, "logits/chosen": 13.15410041809082, "logits/rejected": 13.15410041809082, "logps/chosen": -3542.0419921875, "logps/rejected": -3542.0419921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.1783142089844, "rewards/margins": 0.0, "rewards/rejected": -351.1783142089844, "step": 766 }, { "epoch": 8.073684210526316, "grad_norm": 2.2168753730511526e-06, "learning_rate": 0.00018402105263157895, "logits/chosen": 13.183789253234863, "logits/rejected": 13.183789253234863, "logps/chosen": -4878.0517578125, "logps/rejected": -4878.0517578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.9580078125, "rewards/margins": 0.0, "rewards/rejected": -484.9580078125, "step": 767 }, { "epoch": 8.08421052631579, "grad_norm": 1.6371959645766765e-06, "learning_rate": 0.00018400000000000003, "logits/chosen": 13.161255836486816, "logits/rejected": 13.161255836486816, "logps/chosen": -3775.40234375, "logps/rejected": -3775.40234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.6711120605469, "rewards/margins": 0.0, "rewards/rejected": -374.6711120605469, "step": 768 }, { "epoch": 8.094736842105263, "grad_norm": 1.6546122196814395e-06, "learning_rate": 0.00018397894736842105, "logits/chosen": 13.165945053100586, "logits/rejected": 13.165945053100586, "logps/chosen": -3775.83984375, "logps/rejected": -3775.83984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.7148742675781, "rewards/margins": 0.0, "rewards/rejected": -374.7148742675781, "step": 769 }, { "epoch": 8.105263157894736, "grad_norm": 1.4430454484681832e-06, "learning_rate": 0.0001839578947368421, "logits/chosen": 13.174822807312012, "logits/rejected": 13.174822807312012, "logps/chosen": -3996.798828125, "logps/rejected": -3996.798828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.8525695800781, "rewards/margins": 0.0, "rewards/rejected": -396.8525695800781, "step": 770 }, { "epoch": 8.115789473684211, "grad_norm": 9.826237601373577e-07, "learning_rate": 0.00018393684210526318, "logits/chosen": 13.167154312133789, "logits/rejected": 13.167154312133789, "logps/chosen": -2967.8173828125, "logps/rejected": -2967.8173828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.0555419921875, "rewards/margins": 0.0, "rewards/rejected": -294.0555419921875, "step": 771 }, { "epoch": 8.126315789473685, "grad_norm": 1.1678483815558138e-06, "learning_rate": 0.00018391578947368422, "logits/chosen": 13.165459632873535, "logits/rejected": 13.165459632873535, "logps/chosen": -3542.490234375, "logps/rejected": -3542.490234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.22314453125, "rewards/margins": 0.0, "rewards/rejected": -351.22314453125, "step": 772 }, { "epoch": 8.136842105263158, "grad_norm": 2.1347743768274086e-06, "learning_rate": 0.00018389473684210527, "logits/chosen": 13.209723472595215, "logits/rejected": 13.209723472595215, "logps/chosen": -5172.513671875, "logps/rejected": -5172.513671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.3129272460938, "rewards/margins": 0.0, "rewards/rejected": -514.3129272460938, "step": 773 }, { "epoch": 8.147368421052631, "grad_norm": 1.1475698329377337e-06, "learning_rate": 0.00018387368421052632, "logits/chosen": 13.165400505065918, "logits/rejected": 13.165400505065918, "logps/chosen": -3542.751953125, "logps/rejected": -3542.751953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.24932861328125, "rewards/margins": 0.0, "rewards/rejected": -351.24932861328125, "step": 774 }, { "epoch": 8.157894736842104, "grad_norm": 2.191958628827706e-06, "learning_rate": 0.0001838526315789474, "logits/chosen": 13.214207649230957, "logits/rejected": 13.214207649230957, "logps/chosen": -5172.4833984375, "logps/rejected": -5172.4833984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.3099365234375, "rewards/margins": 0.0, "rewards/rejected": -514.3099365234375, "step": 775 }, { "epoch": 8.16842105263158, "grad_norm": 1.6320094573529786e-06, "learning_rate": 0.00018383157894736842, "logits/chosen": 13.17676830291748, "logits/rejected": 13.17676830291748, "logps/chosen": -3995.75, "logps/rejected": -3995.75, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.7476806640625, "rewards/margins": 0.0, "rewards/rejected": -396.7476806640625, "step": 776 }, { "epoch": 8.178947368421053, "grad_norm": 1.0968159358526464e-06, "learning_rate": 0.00018381052631578947, "logits/chosen": 13.174088478088379, "logits/rejected": 13.174088478088379, "logps/chosen": -2672.7353515625, "logps/rejected": -2672.7353515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.491943359375, "rewards/margins": 0.0, "rewards/rejected": -264.491943359375, "step": 777 }, { "epoch": 8.189473684210526, "grad_norm": 1.1563734005903825e-06, "learning_rate": 0.00018378947368421055, "logits/chosen": 13.166203498840332, "logits/rejected": 13.166203498840332, "logps/chosen": -3542.9287109375, "logps/rejected": -3542.9287109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.2669982910156, "rewards/margins": 0.0, "rewards/rejected": -351.2669982910156, "step": 778 }, { "epoch": 8.2, "grad_norm": 2.483112439222168e-06, "learning_rate": 0.0001837684210526316, "logits/chosen": 13.211294174194336, "logits/rejected": 13.211294174194336, "logps/chosen": -5172.6845703125, "logps/rejected": -5172.6845703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.3300170898438, "rewards/margins": 0.0, "rewards/rejected": -514.3300170898438, "step": 779 }, { "epoch": 8.210526315789474, "grad_norm": 1.3655018165081856e-06, "learning_rate": 0.00018374736842105265, "logits/chosen": 13.173303604125977, "logits/rejected": 13.173303604125977, "logps/chosen": -3756.1572265625, "logps/rejected": -3756.1572265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.6974182128906, "rewards/margins": 0.0, "rewards/rejected": -372.6974182128906, "step": 780 }, { "epoch": 8.221052631578948, "grad_norm": 1.114269934987533e-06, "learning_rate": 0.0001837263157894737, "logits/chosen": 13.165274620056152, "logits/rejected": 13.165274620056152, "logps/chosen": -2966.501953125, "logps/rejected": -2966.501953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.92401123046875, "rewards/margins": 0.0, "rewards/rejected": -293.92401123046875, "step": 781 }, { "epoch": 8.23157894736842, "grad_norm": 1.7835608332461561e-06, "learning_rate": 0.00018370526315789474, "logits/chosen": 13.170153617858887, "logits/rejected": 13.170153617858887, "logps/chosen": -3994.923828125, "logps/rejected": -3994.923828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.6650695800781, "rewards/margins": 0.0, "rewards/rejected": -396.6650695800781, "step": 782 }, { "epoch": 8.242105263157894, "grad_norm": 3.086871402047109e-06, "learning_rate": 0.0001836842105263158, "logits/chosen": 13.18794059753418, "logits/rejected": 13.18794059753418, "logps/chosen": -4874.416015625, "logps/rejected": -4874.416015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.5944519042969, "rewards/margins": 0.0, "rewards/rejected": -484.5944519042969, "step": 783 }, { "epoch": 8.25263157894737, "grad_norm": 1.2969069302926073e-06, "learning_rate": 0.00018366315789473684, "logits/chosen": 13.152263641357422, "logits/rejected": 13.152263641357422, "logps/chosen": -3543.099609375, "logps/rejected": -3543.099609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.2840881347656, "rewards/margins": 0.0, "rewards/rejected": -351.2840881347656, "step": 784 }, { "epoch": 8.263157894736842, "grad_norm": 1.7018771814036882e-06, "learning_rate": 0.0001836421052631579, "logits/chosen": 13.152185440063477, "logits/rejected": 13.152185440063477, "logps/chosen": -3995.17578125, "logps/rejected": -3995.17578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.6902770996094, "rewards/margins": 0.0, "rewards/rejected": -396.6902770996094, "step": 785 }, { "epoch": 8.273684210526316, "grad_norm": 1.430383690603776e-06, "learning_rate": 0.00018362105263157897, "logits/chosen": 13.146105766296387, "logits/rejected": 13.146105766296387, "logps/chosen": -3755.7626953125, "logps/rejected": -3755.7626953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.657958984375, "rewards/margins": 0.0, "rewards/rejected": -372.657958984375, "step": 786 }, { "epoch": 8.284210526315789, "grad_norm": 3.482311967673013e-06, "learning_rate": 0.00018360000000000002, "logits/chosen": 13.158924102783203, "logits/rejected": 13.158924102783203, "logps/chosen": -4873.8955078125, "logps/rejected": -4873.8955078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.5423889160156, "rewards/margins": 0.0, "rewards/rejected": -484.5423889160156, "step": 787 }, { "epoch": 8.294736842105262, "grad_norm": 1.1829019967990462e-06, "learning_rate": 0.00018357894736842104, "logits/chosen": 13.123421669006348, "logits/rejected": 13.123421669006348, "logps/chosen": -2671.015625, "logps/rejected": -2671.015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.3199768066406, "rewards/margins": 0.0, "rewards/rejected": -264.3199768066406, "step": 788 }, { "epoch": 8.305263157894737, "grad_norm": 3.6452552194532473e-06, "learning_rate": 0.00018355789473684212, "logits/chosen": 13.161881446838379, "logits/rejected": 13.161881446838379, "logps/chosen": -5171.544921875, "logps/rejected": -5171.544921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.216064453125, "rewards/margins": 0.0, "rewards/rejected": -514.216064453125, "step": 789 }, { "epoch": 8.31578947368421, "grad_norm": 4.052975327795139e-06, "learning_rate": 0.00018353684210526317, "logits/chosen": 13.157443046569824, "logits/rejected": 13.157443046569824, "logps/chosen": -5171.37451171875, "logps/rejected": -5171.37451171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.1990356445312, "rewards/margins": 0.0, "rewards/rejected": -514.1990356445312, "step": 790 }, { "epoch": 8.326315789473684, "grad_norm": 4.249576250003884e-06, "learning_rate": 0.00018351578947368421, "logits/chosen": 13.154475212097168, "logits/rejected": 13.154475212097168, "logps/chosen": -5171.173828125, "logps/rejected": -5171.173828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.178955078125, "rewards/margins": 0.0, "rewards/rejected": -514.178955078125, "step": 791 }, { "epoch": 8.336842105263157, "grad_norm": 2.7736061838368187e-06, "learning_rate": 0.00018349473684210526, "logits/chosen": 13.104389190673828, "logits/rejected": 13.104389190673828, "logps/chosen": -3994.251953125, "logps/rejected": -3994.251953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.5978698730469, "rewards/margins": 0.0, "rewards/rejected": -396.5978698730469, "step": 792 }, { "epoch": 8.347368421052632, "grad_norm": 4.41437759945984e-06, "learning_rate": 0.00018347368421052634, "logits/chosen": 13.141545295715332, "logits/rejected": 13.141545295715332, "logps/chosen": -5171.1044921875, "logps/rejected": -5171.1044921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.1720581054688, "rewards/margins": 0.0, "rewards/rejected": -514.1720581054688, "step": 793 }, { "epoch": 8.357894736842105, "grad_norm": 2.124402953995741e-06, "learning_rate": 0.0001834526315789474, "logits/chosen": 13.08736515045166, "logits/rejected": 13.08736515045166, "logps/chosen": -3993.392578125, "logps/rejected": -3993.392578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.5119323730469, "rewards/margins": 0.0, "rewards/rejected": -396.5119323730469, "step": 794 }, { "epoch": 8.368421052631579, "grad_norm": 2.1847272364539094e-06, "learning_rate": 0.0001834315789473684, "logits/chosen": 13.079763412475586, "logits/rejected": 13.079763412475586, "logps/chosen": -3753.921875, "logps/rejected": -3753.921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.473876953125, "rewards/margins": 0.0, "rewards/rejected": -372.473876953125, "step": 795 }, { "epoch": 8.378947368421052, "grad_norm": 3.434222207943094e-06, "learning_rate": 0.0001834105263157895, "logits/chosen": 13.064455032348633, "logits/rejected": 13.064455032348633, "logps/chosen": -4279.939453125, "logps/rejected": -4279.939453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.19659423828125, "rewards/margins": 0.0, "rewards/rejected": -425.19659423828125, "step": 796 }, { "epoch": 8.389473684210527, "grad_norm": 2.2049848666938487e-06, "learning_rate": 0.00018338947368421054, "logits/chosen": 13.054069519042969, "logits/rejected": 13.054069519042969, "logps/chosen": -3538.095703125, "logps/rejected": -3538.095703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -350.78369140625, "rewards/margins": 0.0, "rewards/rejected": -350.78369140625, "step": 797 }, { "epoch": 8.4, "grad_norm": 1.7060622212738963e-06, "learning_rate": 0.0001833684210526316, "logits/chosen": 13.050280570983887, "logits/rejected": 13.050280570983887, "logps/chosen": -2667.341796875, "logps/rejected": -2667.341796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -263.9526062011719, "rewards/margins": 0.0, "rewards/rejected": -263.9526062011719, "step": 798 }, { "epoch": 8.410526315789474, "grad_norm": 5.362900992622599e-06, "learning_rate": 0.00018334736842105264, "logits/chosen": 13.101919174194336, "logits/rejected": 13.101919174194336, "logps/chosen": -5169.5625, "logps/rejected": -5169.5625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.017822265625, "rewards/margins": 0.0, "rewards/rejected": -514.017822265625, "step": 799 }, { "epoch": 8.421052631578947, "grad_norm": 3.467218675723416e-06, "learning_rate": 0.0001833263157894737, "logits/chosen": 13.050479888916016, "logits/rejected": 13.050479888916016, "logps/chosen": -3991.6640625, "logps/rejected": -3991.6640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.3390808105469, "rewards/margins": 0.0, "rewards/rejected": -396.3390808105469, "step": 800 }, { "epoch": 8.421052631578947, "eval_logits/chosen": 13.067541122436523, "eval_logits/rejected": 13.067541122436523, "eval_logps/chosen": -4302.88427734375, "eval_logps/rejected": -4302.88427734375, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -427.38519287109375, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -427.38519287109375, "eval_runtime": 4.1659, "eval_samples_per_second": 2.4, "eval_steps_per_second": 2.4, "step": 800 }, { "epoch": 8.431578947368422, "grad_norm": 1.5714698520241654e-06, "learning_rate": 0.00018330526315789473, "logits/chosen": 13.039494514465332, "logits/rejected": 13.039494514465332, "logps/chosen": -2667.16015625, "logps/rejected": -2667.16015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -263.9344177246094, "rewards/margins": 0.0, "rewards/rejected": -263.9344177246094, "step": 801 }, { "epoch": 8.442105263157895, "grad_norm": 3.1759427656652406e-06, "learning_rate": 0.00018328421052631578, "logits/chosen": 13.030718803405762, "logits/rejected": 13.030718803405762, "logps/chosen": -3767.5625, "logps/rejected": -3767.5625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -373.88714599609375, "rewards/margins": 0.0, "rewards/rejected": -373.88714599609375, "step": 802 }, { "epoch": 8.452631578947368, "grad_norm": 1.6672837546138908e-06, "learning_rate": 0.00018326315789473686, "logits/chosen": 13.02714729309082, "logits/rejected": 13.02714729309082, "logps/chosen": -2960.42578125, "logps/rejected": -2960.42578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.3163757324219, "rewards/margins": 0.0, "rewards/rejected": -293.3163757324219, "step": 803 }, { "epoch": 8.463157894736842, "grad_norm": 2.9747789085377008e-06, "learning_rate": 0.0001832421052631579, "logits/chosen": 13.038200378417969, "logits/rejected": 13.038200378417969, "logps/chosen": -3751.80078125, "logps/rejected": -3751.80078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.26177978515625, "rewards/margins": 0.0, "rewards/rejected": -372.26177978515625, "step": 804 }, { "epoch": 8.473684210526315, "grad_norm": 3.270132538091275e-06, "learning_rate": 0.00018322105263157896, "logits/chosen": 13.025984764099121, "logits/rejected": 13.025984764099121, "logps/chosen": -3767.630859375, "logps/rejected": -3767.630859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -373.89398193359375, "rewards/margins": 0.0, "rewards/rejected": -373.89398193359375, "step": 805 }, { "epoch": 8.48421052631579, "grad_norm": 3.1023328119772486e-06, "learning_rate": 0.0001832, "logits/chosen": 13.033570289611816, "logits/rejected": 13.033570289611816, "logps/chosen": -3991.08203125, "logps/rejected": -3991.08203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.2808837890625, "rewards/margins": 0.0, "rewards/rejected": -396.2808837890625, "step": 806 }, { "epoch": 8.494736842105263, "grad_norm": 2.9121283660060726e-06, "learning_rate": 0.00018317894736842108, "logits/chosen": 13.022355079650879, "logits/rejected": 13.022355079650879, "logps/chosen": -3767.962890625, "logps/rejected": -3767.962890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -373.92718505859375, "rewards/margins": 0.0, "rewards/rejected": -373.92718505859375, "step": 807 }, { "epoch": 8.505263157894737, "grad_norm": 5.836352556798374e-06, "learning_rate": 0.0001831578947368421, "logits/chosen": 13.06139087677002, "logits/rejected": 13.06139087677002, "logps/chosen": -4867.3837890625, "logps/rejected": -4867.3837890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -483.8912048339844, "rewards/margins": 0.0, "rewards/rejected": -483.8912048339844, "step": 808 }, { "epoch": 8.51578947368421, "grad_norm": 2.658028734003892e-06, "learning_rate": 0.00018313684210526316, "logits/chosen": 13.023484230041504, "logits/rejected": 13.023484230041504, "logps/chosen": -3535.515625, "logps/rejected": -3535.515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -350.52569580078125, "rewards/margins": 0.0, "rewards/rejected": -350.52569580078125, "step": 809 }, { "epoch": 8.526315789473685, "grad_norm": 5.972752205707366e-06, "learning_rate": 0.0001831157894736842, "logits/chosen": 13.087800979614258, "logits/rejected": 13.087800979614258, "logps/chosen": -5165.70703125, "logps/rejected": -5165.70703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -513.6322631835938, "rewards/margins": 0.0, "rewards/rejected": -513.6322631835938, "step": 810 }, { "epoch": 8.536842105263158, "grad_norm": 2.155912397938664e-06, "learning_rate": 0.00018309473684210528, "logits/chosen": 13.027406692504883, "logits/rejected": 13.027406692504883, "logps/chosen": -2959.890625, "logps/rejected": -2959.890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.26287841796875, "rewards/margins": 0.0, "rewards/rejected": -293.26287841796875, "step": 811 }, { "epoch": 8.547368421052632, "grad_norm": 2.102140115312068e-06, "learning_rate": 0.00018307368421052633, "logits/chosen": 13.044645309448242, "logits/rejected": 13.044645309448242, "logps/chosen": -3535.67578125, "logps/rejected": -3535.67578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -350.5417175292969, "rewards/margins": 0.0, "rewards/rejected": -350.5417175292969, "step": 812 }, { "epoch": 8.557894736842105, "grad_norm": 2.10280563806009e-06, "learning_rate": 0.00018305263157894738, "logits/chosen": 13.0581636428833, "logits/rejected": 13.0581636428833, "logps/chosen": -3535.451171875, "logps/rejected": -3535.451171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -350.5192565917969, "rewards/margins": 0.0, "rewards/rejected": -350.5192565917969, "step": 813 }, { "epoch": 8.568421052631578, "grad_norm": 2.7769708594860276e-06, "learning_rate": 0.00018303157894736843, "logits/chosen": 13.076176643371582, "logits/rejected": 13.076176643371582, "logps/chosen": -3990.396484375, "logps/rejected": -3990.396484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.21234130859375, "rewards/margins": 0.0, "rewards/rejected": -396.21234130859375, "step": 814 }, { "epoch": 8.578947368421053, "grad_norm": 4.9173436309501994e-06, "learning_rate": 0.00018301052631578948, "logits/chosen": 13.120112419128418, "logits/rejected": 13.120112419128418, "logps/chosen": -4867.4033203125, "logps/rejected": -4867.4033203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -483.8931579589844, "rewards/margins": 0.0, "rewards/rejected": -483.8931579589844, "step": 815 }, { "epoch": 8.589473684210526, "grad_norm": 1.79261712673906e-06, "learning_rate": 0.00018298947368421053, "logits/chosen": 13.083232879638672, "logits/rejected": 13.083232879638672, "logps/chosen": -2667.4736328125, "logps/rejected": -2667.4736328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -263.9657897949219, "rewards/margins": 0.0, "rewards/rejected": -263.9657897949219, "step": 816 }, { "epoch": 8.6, "grad_norm": 4.694145900430158e-06, "learning_rate": 0.00018296842105263158, "logits/chosen": 13.135228157043457, "logits/rejected": 13.135228157043457, "logps/chosen": -4867.4580078125, "logps/rejected": -4867.4580078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -483.8986511230469, "rewards/margins": 0.0, "rewards/rejected": -483.8986511230469, "step": 817 }, { "epoch": 8.610526315789473, "grad_norm": 3.729123363882536e-06, "learning_rate": 0.00018294736842105265, "logits/chosen": 13.127240180969238, "logits/rejected": 13.127240180969238, "logps/chosen": -4316.6748046875, "logps/rejected": -4316.6748046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -428.4826354980469, "rewards/margins": 0.0, "rewards/rejected": -428.4826354980469, "step": 818 }, { "epoch": 8.621052631578948, "grad_norm": 5.973961378913373e-06, "learning_rate": 0.0001829263157894737, "logits/chosen": 13.143715858459473, "logits/rejected": 13.143715858459473, "logps/chosen": -4867.77734375, "logps/rejected": -4867.77734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -483.9305725097656, "rewards/margins": 0.0, "rewards/rejected": -483.9305725097656, "step": 819 }, { "epoch": 8.631578947368421, "grad_norm": 4.26080896431813e-06, "learning_rate": 0.00018290526315789472, "logits/chosen": 13.102783203125, "logits/rejected": 13.102783203125, "logps/chosen": -3990.439453125, "logps/rejected": -3990.439453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.2166442871094, "rewards/margins": 0.0, "rewards/rejected": -396.2166442871094, "step": 820 }, { "epoch": 8.642105263157895, "grad_norm": 4.782623818755383e-06, "learning_rate": 0.0001828842105263158, "logits/chosen": 13.121586799621582, "logits/rejected": 13.121586799621582, "logps/chosen": -4316.8583984375, "logps/rejected": -4316.8583984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -428.5009765625, "rewards/margins": 0.0, "rewards/rejected": -428.5009765625, "step": 821 }, { "epoch": 8.652631578947368, "grad_norm": 4.768724465975538e-06, "learning_rate": 0.00018286315789473685, "logits/chosen": 13.10221004486084, "logits/rejected": 13.10221004486084, "logps/chosen": -4278.83251953125, "logps/rejected": -4278.83251953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.0859069824219, "rewards/margins": 0.0, "rewards/rejected": -425.0859069824219, "step": 822 }, { "epoch": 8.663157894736843, "grad_norm": 1.587369979461073e-06, "learning_rate": 0.0001828421052631579, "logits/chosen": 13.131707191467285, "logits/rejected": 13.131707191467285, "logps/chosen": -3752.439453125, "logps/rejected": -3752.439453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.3256530761719, "rewards/margins": 0.0, "rewards/rejected": -372.3256530761719, "step": 823 }, { "epoch": 8.673684210526316, "grad_norm": 3.828426542895613e-06, "learning_rate": 0.00018282105263157895, "logits/chosen": 13.185013771057129, "logits/rejected": 13.185013771057129, "logps/chosen": -4871.19189453125, "logps/rejected": -4871.19189453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.27203369140625, "rewards/margins": 0.0, "rewards/rejected": -484.27203369140625, "step": 824 }, { "epoch": 8.68421052631579, "grad_norm": 6.832236067566555e-06, "learning_rate": 0.00018280000000000003, "logits/chosen": 13.153717041015625, "logits/rejected": 13.153717041015625, "logps/chosen": -3991.80078125, "logps/rejected": -3991.80078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.3527526855469, "rewards/margins": 0.0, "rewards/rejected": -396.3527526855469, "step": 825 }, { "epoch": 8.694736842105263, "grad_norm": 1.7575574702277663e-06, "learning_rate": 0.00018277894736842107, "logits/chosen": 13.152008056640625, "logits/rejected": 13.152008056640625, "logps/chosen": -3993.4921875, "logps/rejected": -3993.4921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.52191162109375, "rewards/margins": 0.0, "rewards/rejected": -396.52191162109375, "step": 826 }, { "epoch": 8.705263157894738, "grad_norm": 9.150597179541364e-06, "learning_rate": 0.0001827578947368421, "logits/chosen": 13.180081367492676, "logits/rejected": 13.180081367492676, "logps/chosen": -4872.33642578125, "logps/rejected": -4872.33642578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.386474609375, "rewards/margins": 0.0, "rewards/rejected": -484.386474609375, "step": 827 }, { "epoch": 8.715789473684211, "grad_norm": 2.8743502298311796e-06, "learning_rate": 0.00018273684210526317, "logits/chosen": 13.176284790039062, "logits/rejected": 13.176284790039062, "logps/chosen": -4320.876953125, "logps/rejected": -4320.876953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -428.90283203125, "rewards/margins": 0.0, "rewards/rejected": -428.90283203125, "step": 828 }, { "epoch": 8.726315789473684, "grad_norm": 3.557847776392009e-06, "learning_rate": 0.00018271578947368422, "logits/chosen": 13.146232604980469, "logits/rejected": 13.146232604980469, "logps/chosen": -2669.1416015625, "logps/rejected": -2669.1416015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.132568359375, "rewards/margins": 0.0, "rewards/rejected": -264.132568359375, "step": 829 }, { "epoch": 8.736842105263158, "grad_norm": 3.86333886126522e-06, "learning_rate": 0.00018269473684210527, "logits/chosen": 13.152488708496094, "logits/rejected": 13.152488708496094, "logps/chosen": -2669.3154296875, "logps/rejected": -2669.3154296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.14996337890625, "rewards/margins": 0.0, "rewards/rejected": -264.14996337890625, "step": 830 }, { "epoch": 8.74736842105263, "grad_norm": 1.5854774346735212e-06, "learning_rate": 0.00018267368421052632, "logits/chosen": 13.175165176391602, "logits/rejected": 13.175165176391602, "logps/chosen": -3755.2841796875, "logps/rejected": -3755.2841796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.610107421875, "rewards/margins": 0.0, "rewards/rejected": -372.610107421875, "step": 831 }, { "epoch": 8.757894736842106, "grad_norm": 3.911684416380012e-06, "learning_rate": 0.0001826526315789474, "logits/chosen": 13.164422988891602, "logits/rejected": 13.164422988891602, "logps/chosen": -4283.81982421875, "logps/rejected": -4283.81982421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.5846252441406, "rewards/margins": 0.0, "rewards/rejected": -425.5846252441406, "step": 832 }, { "epoch": 8.76842105263158, "grad_norm": 2.929838956333697e-06, "learning_rate": 0.00018263157894736842, "logits/chosen": 13.160375595092773, "logits/rejected": 13.160375595092773, "logps/chosen": -3775.306640625, "logps/rejected": -3775.306640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.66156005859375, "rewards/margins": 0.0, "rewards/rejected": -374.66156005859375, "step": 833 }, { "epoch": 8.778947368421052, "grad_norm": 1.0393248430773383e-06, "learning_rate": 0.00018261052631578947, "logits/chosen": 13.15914249420166, "logits/rejected": 13.15914249420166, "logps/chosen": -2672.3173828125, "logps/rejected": -2672.3173828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.4501647949219, "rewards/margins": 0.0, "rewards/rejected": -264.4501647949219, "step": 834 }, { "epoch": 8.789473684210526, "grad_norm": 2.0899205992463976e-06, "learning_rate": 0.00018258947368421054, "logits/chosen": 13.212599754333496, "logits/rejected": 13.212599754333496, "logps/chosen": -4877.4931640625, "logps/rejected": -4877.4931640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.90216064453125, "rewards/margins": 0.0, "rewards/rejected": -484.90216064453125, "step": 835 }, { "epoch": 8.8, "grad_norm": 2.8235390345798805e-06, "learning_rate": 0.0001825684210526316, "logits/chosen": 13.169595718383789, "logits/rejected": 13.169595718383789, "logps/chosen": -3997.38671875, "logps/rejected": -3997.38671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9113464355469, "rewards/margins": 0.0, "rewards/rejected": -396.9113464355469, "step": 836 }, { "epoch": 8.810526315789474, "grad_norm": 1.5295136108761653e-06, "learning_rate": 0.00018254736842105264, "logits/chosen": 13.15715503692627, "logits/rejected": 13.15715503692627, "logps/chosen": -4286.63330078125, "logps/rejected": -4286.63330078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.865966796875, "rewards/margins": 0.0, "rewards/rejected": -425.865966796875, "step": 837 }, { "epoch": 8.821052631578947, "grad_norm": 1.4351624031405663e-06, "learning_rate": 0.0001825263157894737, "logits/chosen": 13.143031120300293, "logits/rejected": 13.143031120300293, "logps/chosen": -4287.49609375, "logps/rejected": -4287.49609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.9522399902344, "rewards/margins": 0.0, "rewards/rejected": -425.9522399902344, "step": 838 }, { "epoch": 8.83157894736842, "grad_norm": 2.3897052869870095e-06, "learning_rate": 0.00018250526315789474, "logits/chosen": 13.192338943481445, "logits/rejected": 13.192338943481445, "logps/chosen": -5168.6064453125, "logps/rejected": -5168.6064453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -513.9222412109375, "rewards/margins": 0.0, "rewards/rejected": -513.9222412109375, "step": 839 }, { "epoch": 8.842105263157894, "grad_norm": 2.318000269951881e-06, "learning_rate": 0.0001824842105263158, "logits/chosen": 13.167464256286621, "logits/rejected": 13.167464256286621, "logps/chosen": -4878.60498046875, "logps/rejected": -4878.60498046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.0133361816406, "rewards/margins": 0.0, "rewards/rejected": -485.0133361816406, "step": 840 }, { "epoch": 8.852631578947369, "grad_norm": 2.030115410889266e-06, "learning_rate": 0.00018246315789473684, "logits/chosen": 13.184605598449707, "logits/rejected": 13.184605598449707, "logps/chosen": -5169.498046875, "logps/rejected": -5169.498046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.0114135742188, "rewards/margins": 0.0, "rewards/rejected": -514.0114135742188, "step": 841 }, { "epoch": 8.863157894736842, "grad_norm": 2.633723170220037e-06, "learning_rate": 0.0001824421052631579, "logits/chosen": 13.164407730102539, "logits/rejected": 13.164407730102539, "logps/chosen": -4879.04052734375, "logps/rejected": -4879.04052734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.056884765625, "rewards/margins": 0.0, "rewards/rejected": -485.056884765625, "step": 842 }, { "epoch": 8.873684210526315, "grad_norm": 1.2787404557457194e-06, "learning_rate": 0.00018242105263157897, "logits/chosen": 13.122583389282227, "logits/rejected": 13.122583389282227, "logps/chosen": -4289.45849609375, "logps/rejected": -4289.45849609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.14849853515625, "rewards/margins": 0.0, "rewards/rejected": -426.14849853515625, "step": 843 }, { "epoch": 8.884210526315789, "grad_norm": 1.4480827985607903e-06, "learning_rate": 0.00018240000000000002, "logits/chosen": 13.151036262512207, "logits/rejected": 13.151036262512207, "logps/chosen": -4324.541015625, "logps/rejected": -4324.541015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.2692565917969, "rewards/margins": 0.0, "rewards/rejected": -429.2692565917969, "step": 844 }, { "epoch": 8.894736842105264, "grad_norm": 1.2705459084827453e-06, "learning_rate": 0.00018237894736842106, "logits/chosen": 13.122919082641602, "logits/rejected": 13.122919082641602, "logps/chosen": -3539.318359375, "logps/rejected": -3539.318359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -350.9059753417969, "rewards/margins": 0.0, "rewards/rejected": -350.9059753417969, "step": 845 }, { "epoch": 8.905263157894737, "grad_norm": 1.544367364658683e-06, "learning_rate": 0.00018235789473684211, "logits/chosen": 13.173503875732422, "logits/rejected": 13.173503875732422, "logps/chosen": -4880.1103515625, "logps/rejected": -4880.1103515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.16387939453125, "rewards/margins": 0.0, "rewards/rejected": -485.16387939453125, "step": 846 }, { "epoch": 8.91578947368421, "grad_norm": 1.495149717811728e-06, "learning_rate": 0.00018233684210526316, "logits/chosen": 13.166247367858887, "logits/rejected": 13.166247367858887, "logps/chosen": -4325.126953125, "logps/rejected": -4325.126953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.3278503417969, "rewards/margins": 0.0, "rewards/rejected": -429.3278503417969, "step": 847 }, { "epoch": 8.926315789473684, "grad_norm": 1.0864290516110486e-06, "learning_rate": 0.0001823157894736842, "logits/chosen": 13.142928123474121, "logits/rejected": 13.142928123474121, "logps/chosen": -3539.875, "logps/rejected": -3539.875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -350.9616394042969, "rewards/margins": 0.0, "rewards/rejected": -350.9616394042969, "step": 848 }, { "epoch": 8.936842105263159, "grad_norm": 1.004168666440819e-06, "learning_rate": 0.00018229473684210526, "logits/chosen": 13.144794464111328, "logits/rejected": 13.144794464111328, "logps/chosen": -2673.27734375, "logps/rejected": -2673.27734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.546142578125, "rewards/margins": 0.0, "rewards/rejected": -264.546142578125, "step": 849 }, { "epoch": 8.947368421052632, "grad_norm": 1.0678752460080432e-06, "learning_rate": 0.00018227368421052634, "logits/chosen": 13.162507057189941, "logits/rejected": 13.162507057189941, "logps/chosen": -3540.509765625, "logps/rejected": -3540.509765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.0251159667969, "rewards/margins": 0.0, "rewards/rejected": -351.0251159667969, "step": 850 }, { "epoch": 8.947368421052632, "eval_logits/chosen": 13.206059455871582, "eval_logits/rejected": 13.206059455871582, "eval_logps/chosen": -4309.0830078125, "eval_logps/rejected": -4309.0830078125, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -428.005126953125, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -428.005126953125, "eval_runtime": 4.4669, "eval_samples_per_second": 2.239, "eval_steps_per_second": 2.239, "step": 850 }, { "epoch": 8.957894736842105, "grad_norm": 9.507299978395167e-07, "learning_rate": 0.0001822526315789474, "logits/chosen": 13.165428161621094, "logits/rejected": 13.165428161621094, "logps/chosen": -2673.208984375, "logps/rejected": -2673.208984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.539306640625, "rewards/margins": 0.0, "rewards/rejected": -264.539306640625, "step": 851 }, { "epoch": 8.968421052631578, "grad_norm": 1.3000028502574423e-06, "learning_rate": 0.0001822315789473684, "logits/chosen": 13.173028945922852, "logits/rejected": 13.173028945922852, "logps/chosen": -2967.26171875, "logps/rejected": -2967.26171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.9999694824219, "rewards/margins": 0.0, "rewards/rejected": -293.9999694824219, "step": 852 }, { "epoch": 8.978947368421053, "grad_norm": 1.4566235222446267e-06, "learning_rate": 0.00018221052631578949, "logits/chosen": 13.192514419555664, "logits/rejected": 13.192514419555664, "logps/chosen": -3541.283203125, "logps/rejected": -3541.283203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.1024475097656, "rewards/margins": 0.0, "rewards/rejected": -351.1024475097656, "step": 853 }, { "epoch": 8.989473684210527, "grad_norm": 1.726867367324303e-06, "learning_rate": 0.00018218947368421054, "logits/chosen": 13.20378589630127, "logits/rejected": 13.20378589630127, "logps/chosen": -3995.724609375, "logps/rejected": -3995.724609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.7451477050781, "rewards/margins": 0.0, "rewards/rejected": -396.7451477050781, "step": 854 }, { "epoch": 9.0, "grad_norm": 1.4313933434095816e-06, "learning_rate": 0.00018216842105263158, "logits/chosen": 13.209630012512207, "logits/rejected": 13.209630012512207, "logps/chosen": -3996.169921875, "logps/rejected": -3996.169921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.7896728515625, "rewards/margins": 0.0, "rewards/rejected": -396.7896728515625, "step": 855 }, { "epoch": 9.010526315789473, "grad_norm": 9.583002338331426e-07, "learning_rate": 0.00018214736842105263, "logits/chosen": 13.211526870727539, "logits/rejected": 13.211526870727539, "logps/chosen": -3542.412109375, "logps/rejected": -3542.412109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.21533203125, "rewards/margins": 0.0, "rewards/rejected": -351.21533203125, "step": 856 }, { "epoch": 9.021052631578947, "grad_norm": 1.2720788618025836e-06, "learning_rate": 0.0001821263157894737, "logits/chosen": 13.24713134765625, "logits/rejected": 13.24713134765625, "logps/chosen": -4327.7177734375, "logps/rejected": -4327.7177734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.5869140625, "rewards/margins": 0.0, "rewards/rejected": -429.5869140625, "step": 857 }, { "epoch": 9.031578947368422, "grad_norm": 1.4559635701516527e-06, "learning_rate": 0.00018210526315789476, "logits/chosen": 13.207448959350586, "logits/rejected": 13.207448959350586, "logps/chosen": -2674.173828125, "logps/rejected": -2674.173828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.63580322265625, "rewards/margins": 0.0, "rewards/rejected": -264.63580322265625, "step": 858 }, { "epoch": 9.042105263157895, "grad_norm": 1.3104336176184006e-06, "learning_rate": 0.00018208421052631578, "logits/chosen": 13.251220703125, "logits/rejected": 13.251220703125, "logps/chosen": -4328.32421875, "logps/rejected": -4328.32421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.6475524902344, "rewards/margins": 0.0, "rewards/rejected": -429.6475524902344, "step": 859 }, { "epoch": 9.052631578947368, "grad_norm": 1.0783320476548397e-06, "learning_rate": 0.00018206315789473686, "logits/chosen": 13.22087574005127, "logits/rejected": 13.22087574005127, "logps/chosen": -3543.7529296875, "logps/rejected": -3543.7529296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.34942626953125, "rewards/margins": 0.0, "rewards/rejected": -351.34942626953125, "step": 860 }, { "epoch": 9.063157894736841, "grad_norm": 9.301684826823475e-07, "learning_rate": 0.0001820421052631579, "logits/chosen": 13.215072631835938, "logits/rejected": 13.215072631835938, "logps/chosen": -2674.900390625, "logps/rejected": -2674.900390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.7084655761719, "rewards/margins": 0.0, "rewards/rejected": -264.7084655761719, "step": 861 }, { "epoch": 9.073684210526316, "grad_norm": 1.5434849274242879e-06, "learning_rate": 0.00018202105263157896, "logits/chosen": 13.234474182128906, "logits/rejected": 13.234474182128906, "logps/chosen": -3757.888671875, "logps/rejected": -3757.888671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.8705749511719, "rewards/margins": 0.0, "rewards/rejected": -372.8705749511719, "step": 862 }, { "epoch": 9.08421052631579, "grad_norm": 6.232121904758969e-06, "learning_rate": 0.000182, "logits/chosen": 13.29504108428955, "logits/rejected": 13.29504108428955, "logps/chosen": -5173.61376953125, "logps/rejected": -5173.61376953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.4229736328125, "rewards/margins": 0.0, "rewards/rejected": -514.4229736328125, "step": 863 }, { "epoch": 9.094736842105263, "grad_norm": 2.5549634301569313e-06, "learning_rate": 0.00018197894736842108, "logits/chosen": 13.307636260986328, "logits/rejected": 13.307636260986328, "logps/chosen": -5174.4599609375, "logps/rejected": -5174.4599609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.507568359375, "rewards/margins": 0.0, "rewards/rejected": -514.507568359375, "step": 864 }, { "epoch": 9.105263157894736, "grad_norm": 6.885562470415607e-06, "learning_rate": 0.0001819578947368421, "logits/chosen": 13.242020606994629, "logits/rejected": 13.242020606994629, "logps/chosen": -3543.9609375, "logps/rejected": -3543.9609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3702087402344, "rewards/margins": 0.0, "rewards/rejected": -351.3702087402344, "step": 865 }, { "epoch": 9.115789473684211, "grad_norm": 2.926565230154665e-06, "learning_rate": 0.00018193684210526315, "logits/chosen": 13.247761726379395, "logits/rejected": 13.247761726379395, "logps/chosen": -3545.162109375, "logps/rejected": -3545.162109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.4903259277344, "rewards/margins": 0.0, "rewards/rejected": -351.4903259277344, "step": 866 }, { "epoch": 9.126315789473685, "grad_norm": 1.8148497247238993e-06, "learning_rate": 0.00018191578947368423, "logits/chosen": 13.250940322875977, "logits/rejected": 13.250940322875977, "logps/chosen": -3545.8134765625, "logps/rejected": -3545.8134765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.55548095703125, "rewards/margins": 0.0, "rewards/rejected": -351.55548095703125, "step": 867 }, { "epoch": 9.136842105263158, "grad_norm": 1.4375851606018841e-05, "learning_rate": 0.00018189473684210528, "logits/chosen": 13.320199966430664, "logits/rejected": 13.320199966430664, "logps/chosen": -5172.951171875, "logps/rejected": -5172.951171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.356689453125, "rewards/margins": 0.0, "rewards/rejected": -514.356689453125, "step": 868 }, { "epoch": 9.147368421052631, "grad_norm": 1.908138074213639e-06, "learning_rate": 0.00018187368421052633, "logits/chosen": 13.347817420959473, "logits/rejected": 13.347817420959473, "logps/chosen": -5176.21923828125, "logps/rejected": -5176.21923828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6835327148438, "rewards/margins": 0.0, "rewards/rejected": -514.6835327148438, "step": 869 }, { "epoch": 9.157894736842104, "grad_norm": 1.8556129361968488e-05, "learning_rate": 0.00018185263157894738, "logits/chosen": 13.335928916931152, "logits/rejected": 13.335928916931152, "logps/chosen": -4871.64404296875, "logps/rejected": -4871.64404296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.3172302246094, "rewards/margins": 0.0, "rewards/rejected": -484.3172302246094, "step": 870 }, { "epoch": 9.16842105263158, "grad_norm": 2.607876240290352e-06, "learning_rate": 0.00018183157894736843, "logits/chosen": 13.350255012512207, "logits/rejected": 13.350255012512207, "logps/chosen": -5177.1142578125, "logps/rejected": -5177.1142578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.7730102539062, "rewards/margins": 0.0, "rewards/rejected": -514.7730102539062, "step": 871 }, { "epoch": 9.178947368421053, "grad_norm": 5.983564733469393e-06, "learning_rate": 0.00018181052631578948, "logits/chosen": 13.271990776062012, "logits/rejected": 13.271990776062012, "logps/chosen": -3545.2451171875, "logps/rejected": -3545.2451171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.4986267089844, "rewards/margins": 0.0, "rewards/rejected": -351.4986267089844, "step": 872 }, { "epoch": 9.189473684210526, "grad_norm": 7.573837137897499e-06, "learning_rate": 0.00018178947368421053, "logits/chosen": 13.269919395446777, "logits/rejected": 13.269919395446777, "logps/chosen": -3544.470703125, "logps/rejected": -3544.470703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.42120361328125, "rewards/margins": 0.0, "rewards/rejected": -351.42120361328125, "step": 873 }, { "epoch": 9.2, "grad_norm": 2.123792000929825e-06, "learning_rate": 0.00018176842105263157, "logits/chosen": 13.277111053466797, "logits/rejected": 13.277111053466797, "logps/chosen": -2673.2978515625, "logps/rejected": -2673.2978515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.5481872558594, "rewards/margins": 0.0, "rewards/rejected": -264.5481872558594, "step": 874 }, { "epoch": 9.210526315789474, "grad_norm": 2.9013976927672047e-06, "learning_rate": 0.00018174736842105265, "logits/chosen": 13.341736793518066, "logits/rejected": 13.341736793518066, "logps/chosen": -4875.658203125, "logps/rejected": -4875.658203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.7186584472656, "rewards/margins": 0.0, "rewards/rejected": -484.7186584472656, "step": 875 }, { "epoch": 9.221052631578948, "grad_norm": 3.285511411377229e-06, "learning_rate": 0.0001817263157894737, "logits/chosen": 13.295886993408203, "logits/rejected": 13.295886993408203, "logps/chosen": -2966.7255859375, "logps/rejected": -2966.7255859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.9463806152344, "rewards/margins": 0.0, "rewards/rejected": -293.9463806152344, "step": 876 }, { "epoch": 9.23157894736842, "grad_norm": 6.2026820160099305e-06, "learning_rate": 0.00018170526315789475, "logits/chosen": 13.3085298538208, "logits/rejected": 13.3085298538208, "logps/chosen": -3771.986328125, "logps/rejected": -3771.986328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.32952880859375, "rewards/margins": 0.0, "rewards/rejected": -374.32952880859375, "step": 877 }, { "epoch": 9.242105263157894, "grad_norm": 3.626013949542539e-06, "learning_rate": 0.0001816842105263158, "logits/chosen": 13.35233211517334, "logits/rejected": 13.35233211517334, "logps/chosen": -4329.10546875, "logps/rejected": -4329.10546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.7256774902344, "rewards/margins": 0.0, "rewards/rejected": -429.7256774902344, "step": 878 }, { "epoch": 9.25263157894737, "grad_norm": 2.1689302229788154e-06, "learning_rate": 0.00018166315789473685, "logits/chosen": 13.316628456115723, "logits/rejected": 13.316628456115723, "logps/chosen": -3992.447265625, "logps/rejected": -3992.447265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.41741943359375, "rewards/margins": 0.0, "rewards/rejected": -396.41741943359375, "step": 879 }, { "epoch": 9.263157894736842, "grad_norm": 6.798334652557969e-06, "learning_rate": 0.0001816421052631579, "logits/chosen": 13.310672760009766, "logits/rejected": 13.310672760009766, "logps/chosen": -4283.88916015625, "logps/rejected": -4283.88916015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.591552734375, "rewards/margins": 0.0, "rewards/rejected": -425.591552734375, "step": 880 }, { "epoch": 9.273684210526316, "grad_norm": 4.215072749502724e-06, "learning_rate": 0.00018162105263157895, "logits/chosen": 13.305066108703613, "logits/rejected": 13.305066108703613, "logps/chosen": -3992.12109375, "logps/rejected": -3992.12109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.3847961425781, "rewards/margins": 0.0, "rewards/rejected": -396.3847961425781, "step": 881 }, { "epoch": 9.284210526315789, "grad_norm": 1.9807341686828295e-06, "learning_rate": 0.00018160000000000002, "logits/chosen": 13.34605598449707, "logits/rejected": 13.34605598449707, "logps/chosen": -4876.4375, "logps/rejected": -4876.4375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.7966003417969, "rewards/margins": 0.0, "rewards/rejected": -484.7966003417969, "step": 882 }, { "epoch": 9.294736842105262, "grad_norm": 4.367466772237094e-06, "learning_rate": 0.00018157894736842107, "logits/chosen": 13.340597152709961, "logits/rejected": 13.340597152709961, "logps/chosen": -4876.533203125, "logps/rejected": -4876.533203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.80615234375, "rewards/margins": 0.0, "rewards/rejected": -484.80615234375, "step": 883 }, { "epoch": 9.305263157894737, "grad_norm": 5.206487912801094e-06, "learning_rate": 0.0001815578947368421, "logits/chosen": 13.27581787109375, "logits/rejected": 13.27581787109375, "logps/chosen": -3993.515625, "logps/rejected": -3993.515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.5242614746094, "rewards/margins": 0.0, "rewards/rejected": -396.5242614746094, "step": 884 }, { "epoch": 9.31578947368421, "grad_norm": 2.3805457658454543e-06, "learning_rate": 0.00018153684210526317, "logits/chosen": 13.257878303527832, "logits/rejected": 13.257878303527832, "logps/chosen": -3545.986328125, "logps/rejected": -3545.986328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.57275390625, "rewards/margins": 0.0, "rewards/rejected": -351.57275390625, "step": 885 }, { "epoch": 9.326315789473684, "grad_norm": 4.452103894436732e-06, "learning_rate": 0.00018151578947368422, "logits/chosen": 13.283812522888184, "logits/rejected": 13.283812522888184, "logps/chosen": -4877.609375, "logps/rejected": -4877.609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.9137878417969, "rewards/margins": 0.0, "rewards/rejected": -484.9137878417969, "step": 886 }, { "epoch": 9.336842105263157, "grad_norm": 3.1537988434138242e-06, "learning_rate": 0.00018149473684210527, "logits/chosen": 13.224468231201172, "logits/rejected": 13.224468231201172, "logps/chosen": -3995.57421875, "logps/rejected": -3995.57421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.7301025390625, "rewards/margins": 0.0, "rewards/rejected": -396.7301025390625, "step": 887 }, { "epoch": 9.347368421052632, "grad_norm": 3.684247531055007e-06, "learning_rate": 0.00018147368421052632, "logits/chosen": 13.248326301574707, "logits/rejected": 13.248326301574707, "logps/chosen": -4327.654296875, "logps/rejected": -4327.654296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.58056640625, "rewards/margins": 0.0, "rewards/rejected": -429.58056640625, "step": 888 }, { "epoch": 9.357894736842105, "grad_norm": 1.287758095713798e-06, "learning_rate": 0.0001814526315789474, "logits/chosen": 13.253173828125, "logits/rejected": 13.253173828125, "logps/chosen": -4878.9853515625, "logps/rejected": -4878.9853515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.0513610839844, "rewards/margins": 0.0, "rewards/rejected": -485.0513610839844, "step": 889 }, { "epoch": 9.368421052631579, "grad_norm": 3.224185547878733e-06, "learning_rate": 0.00018143157894736842, "logits/chosen": 13.204877853393555, "logits/rejected": 13.204877853393555, "logps/chosen": -3756.822265625, "logps/rejected": -3756.822265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.763916015625, "rewards/margins": 0.0, "rewards/rejected": -372.763916015625, "step": 890 }, { "epoch": 9.378947368421052, "grad_norm": 4.099476427654736e-06, "learning_rate": 0.00018141052631578947, "logits/chosen": 13.187684059143066, "logits/rejected": 13.187684059143066, "logps/chosen": -3543.75, "logps/rejected": -3543.75, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.34912109375, "rewards/margins": 0.0, "rewards/rejected": -351.34912109375, "step": 891 }, { "epoch": 9.389473684210527, "grad_norm": 1.3864772654414992e-06, "learning_rate": 0.00018138947368421054, "logits/chosen": 13.17573070526123, "logits/rejected": 13.17573070526123, "logps/chosen": -2967.041015625, "logps/rejected": -2967.041015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.9779052734375, "rewards/margins": 0.0, "rewards/rejected": -293.9779052734375, "step": 892 }, { "epoch": 9.4, "grad_norm": 2.9899169931013603e-06, "learning_rate": 0.0001813684210526316, "logits/chosen": 13.222283363342285, "logits/rejected": 13.222283363342285, "logps/chosen": -4880.19873046875, "logps/rejected": -4880.19873046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.1726989746094, "rewards/margins": 0.0, "rewards/rejected": -485.1726989746094, "step": 893 }, { "epoch": 9.410526315789474, "grad_norm": 2.495072294550482e-06, "learning_rate": 0.00018134736842105264, "logits/chosen": 13.171360969543457, "logits/rejected": 13.171360969543457, "logps/chosen": -3776.3671875, "logps/rejected": -3776.3671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.7676086425781, "rewards/margins": 0.0, "rewards/rejected": -374.7676086425781, "step": 894 }, { "epoch": 9.421052631578947, "grad_norm": 8.472140393678274e-07, "learning_rate": 0.0001813263157894737, "logits/chosen": 13.171323776245117, "logits/rejected": 13.171323776245117, "logps/chosen": -2672.482421875, "logps/rejected": -2672.482421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.4666442871094, "rewards/margins": 0.0, "rewards/rejected": -264.4666442871094, "step": 895 }, { "epoch": 9.431578947368422, "grad_norm": 8.538849556316563e-07, "learning_rate": 0.00018130526315789477, "logits/chosen": 13.17543888092041, "logits/rejected": 13.17543888092041, "logps/chosen": -2672.521484375, "logps/rejected": -2672.521484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.4705505371094, "rewards/margins": 0.0, "rewards/rejected": -264.4705505371094, "step": 896 }, { "epoch": 9.442105263157895, "grad_norm": 9.238790994459123e-07, "learning_rate": 0.0001812842105263158, "logits/chosen": 13.184248924255371, "logits/rejected": 13.184248924255371, "logps/chosen": -2967.34375, "logps/rejected": -2967.34375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.0081787109375, "rewards/margins": 0.0, "rewards/rejected": -294.0081787109375, "step": 897 }, { "epoch": 9.452631578947368, "grad_norm": 2.0927225250488846e-06, "learning_rate": 0.00018126315789473684, "logits/chosen": 13.237820625305176, "logits/rejected": 13.237820625305176, "logps/chosen": -4881.296875, "logps/rejected": -4881.296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.28253173828125, "rewards/margins": 0.0, "rewards/rejected": -485.28253173828125, "step": 898 }, { "epoch": 9.463157894736842, "grad_norm": 1.3098076578899054e-06, "learning_rate": 0.00018124210526315791, "logits/chosen": 13.197992324829102, "logits/rejected": 13.197992324829102, "logps/chosen": -3542.921875, "logps/rejected": -3542.921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.2663269042969, "rewards/margins": 0.0, "rewards/rejected": -351.2663269042969, "step": 899 }, { "epoch": 9.473684210526315, "grad_norm": 1.5872328731347807e-06, "learning_rate": 0.00018122105263157896, "logits/chosen": 13.200522422790527, "logits/rejected": 13.200522422790527, "logps/chosen": -3998.515625, "logps/rejected": -3998.515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0242614746094, "rewards/margins": 0.0, "rewards/rejected": -397.0242614746094, "step": 900 }, { "epoch": 9.473684210526315, "eval_logits/chosen": 13.234708786010742, "eval_logits/rejected": 13.234708786010742, "eval_logps/chosen": -4309.9384765625, "eval_logps/rejected": -4309.9384765625, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -428.0907287597656, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -428.0907287597656, "eval_runtime": 4.3359, "eval_samples_per_second": 2.306, "eval_steps_per_second": 2.306, "step": 900 }, { "epoch": 9.48421052631579, "grad_norm": 1.831944700825261e-06, "learning_rate": 0.0001812, "logits/chosen": 13.205326080322266, "logits/rejected": 13.205326080322266, "logps/chosen": -3542.62109375, "logps/rejected": -3542.62109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.2362365722656, "rewards/margins": 0.0, "rewards/rejected": -351.2362365722656, "step": 901 }, { "epoch": 9.494736842105263, "grad_norm": 1.1897652711923001e-06, "learning_rate": 0.00018117894736842106, "logits/chosen": 13.196013450622559, "logits/rejected": 13.196013450622559, "logps/chosen": -2673.1015625, "logps/rejected": -2673.1015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.528564453125, "rewards/margins": 0.0, "rewards/rejected": -264.528564453125, "step": 902 }, { "epoch": 9.505263157894737, "grad_norm": 8.924276357902272e-07, "learning_rate": 0.0001811578947368421, "logits/chosen": 13.195990562438965, "logits/rejected": 13.195990562438965, "logps/chosen": -2673.314453125, "logps/rejected": -2673.314453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.54986572265625, "rewards/margins": 0.0, "rewards/rejected": -264.54986572265625, "step": 903 }, { "epoch": 9.51578947368421, "grad_norm": 1.743557845657051e-06, "learning_rate": 0.00018113684210526316, "logits/chosen": 13.210175514221191, "logits/rejected": 13.210175514221191, "logps/chosen": -3757.796875, "logps/rejected": -3757.796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.86138916015625, "rewards/margins": 0.0, "rewards/rejected": -372.86138916015625, "step": 904 }, { "epoch": 9.526315789473685, "grad_norm": 1.0627876463331631e-06, "learning_rate": 0.0001811157894736842, "logits/chosen": 13.201913833618164, "logits/rejected": 13.201913833618164, "logps/chosen": -3542.98046875, "logps/rejected": -3542.98046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.2721862792969, "rewards/margins": 0.0, "rewards/rejected": -351.2721862792969, "step": 905 }, { "epoch": 9.536842105263158, "grad_norm": 2.079559862977476e-06, "learning_rate": 0.00018109473684210526, "logits/chosen": 13.196248054504395, "logits/rejected": 13.196248054504395, "logps/chosen": -3779.36328125, "logps/rejected": -3779.36328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -375.0672302246094, "rewards/margins": 0.0, "rewards/rejected": -375.0672302246094, "step": 906 }, { "epoch": 9.547368421052632, "grad_norm": 1.8719908894127002e-06, "learning_rate": 0.00018107368421052634, "logits/chosen": 13.248462677001953, "logits/rejected": 13.248462677001953, "logps/chosen": -4881.23779296875, "logps/rejected": -4881.23779296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.276611328125, "rewards/margins": 0.0, "rewards/rejected": -485.276611328125, "step": 907 }, { "epoch": 9.557894736842105, "grad_norm": 1.5655275547032943e-06, "learning_rate": 0.00018105263157894739, "logits/chosen": 13.20699405670166, "logits/rejected": 13.20699405670166, "logps/chosen": -4287.06298828125, "logps/rejected": -4287.06298828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.908935546875, "rewards/margins": 0.0, "rewards/rejected": -425.908935546875, "step": 908 }, { "epoch": 9.568421052631578, "grad_norm": 1.6612819990768912e-06, "learning_rate": 0.0001810315789473684, "logits/chosen": 13.194329261779785, "logits/rejected": 13.194329261779785, "logps/chosen": -2674.3115234375, "logps/rejected": -2674.3115234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.6495666503906, "rewards/margins": 0.0, "rewards/rejected": -264.6495666503906, "step": 909 }, { "epoch": 9.578947368421053, "grad_norm": 5.043406417826191e-06, "learning_rate": 0.00018101052631578948, "logits/chosen": 13.264094352722168, "logits/rejected": 13.264094352722168, "logps/chosen": -5171.0244140625, "logps/rejected": -5171.0244140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.1640014648438, "rewards/margins": 0.0, "rewards/rejected": -514.1640014648438, "step": 910 }, { "epoch": 9.589473684210526, "grad_norm": 1.3647395462612621e-06, "learning_rate": 0.00018098947368421053, "logits/chosen": 13.19222354888916, "logits/rejected": 13.19222354888916, "logps/chosen": -3998.6875, "logps/rejected": -3998.6875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.04144287109375, "rewards/margins": 0.0, "rewards/rejected": -397.04144287109375, "step": 911 }, { "epoch": 9.6, "grad_norm": 3.274992650403874e-06, "learning_rate": 0.00018096842105263158, "logits/chosen": 13.244412422180176, "logits/rejected": 13.244412422180176, "logps/chosen": -5171.88427734375, "logps/rejected": -5171.88427734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.25, "rewards/margins": 0.0, "rewards/rejected": -514.25, "step": 912 }, { "epoch": 9.610526315789473, "grad_norm": 1.3367305200517876e-06, "learning_rate": 0.00018094736842105263, "logits/chosen": 13.1802978515625, "logits/rejected": 13.1802978515625, "logps/chosen": -3543.05859375, "logps/rejected": -3543.05859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.2799987792969, "rewards/margins": 0.0, "rewards/rejected": -351.2799987792969, "step": 913 }, { "epoch": 9.621052631578948, "grad_norm": 2.5271988306485582e-06, "learning_rate": 0.0001809263157894737, "logits/chosen": 13.175594329833984, "logits/rejected": 13.175594329833984, "logps/chosen": -3998.55078125, "logps/rejected": -3998.55078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.02777099609375, "rewards/margins": 0.0, "rewards/rejected": -397.02777099609375, "step": 914 }, { "epoch": 9.631578947368421, "grad_norm": 1.7182225064971135e-06, "learning_rate": 0.00018090526315789476, "logits/chosen": 13.172041893005371, "logits/rejected": 13.172041893005371, "logps/chosen": -3998.62109375, "logps/rejected": -3998.62109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0347900390625, "rewards/margins": 0.0, "rewards/rejected": -397.0347900390625, "step": 915 }, { "epoch": 9.642105263157895, "grad_norm": 8.808316351860412e-07, "learning_rate": 0.00018088421052631578, "logits/chosen": 13.164517402648926, "logits/rejected": 13.164517402648926, "logps/chosen": -2968.2216796875, "logps/rejected": -2968.2216796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.0959777832031, "rewards/margins": 0.0, "rewards/rejected": -294.0959777832031, "step": 916 }, { "epoch": 9.652631578947368, "grad_norm": 1.9230894849897595e-06, "learning_rate": 0.00018086315789473686, "logits/chosen": 13.156488418579102, "logits/rejected": 13.156488418579102, "logps/chosen": -3998.646484375, "logps/rejected": -3998.646484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0373229980469, "rewards/margins": 0.0, "rewards/rejected": -397.0373229980469, "step": 917 }, { "epoch": 9.663157894736843, "grad_norm": 2.5991857910412364e-06, "learning_rate": 0.0001808421052631579, "logits/chosen": 13.193648338317871, "logits/rejected": 13.193648338317871, "logps/chosen": -4880.2578125, "logps/rejected": -4880.2578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.1786193847656, "rewards/margins": 0.0, "rewards/rejected": -485.1786193847656, "step": 918 }, { "epoch": 9.673684210526316, "grad_norm": 1.4714457847730955e-06, "learning_rate": 0.00018082105263157895, "logits/chosen": 13.170114517211914, "logits/rejected": 13.170114517211914, "logps/chosen": -4325.29296875, "logps/rejected": -4325.29296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.3444519042969, "rewards/margins": 0.0, "rewards/rejected": -429.3444519042969, "step": 919 }, { "epoch": 9.68421052631579, "grad_norm": 1.1151461194458534e-06, "learning_rate": 0.0001808, "logits/chosen": 13.122346878051758, "logits/rejected": 13.122346878051758, "logps/chosen": -3999.84765625, "logps/rejected": -3999.84765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1574401855469, "rewards/margins": 0.0, "rewards/rejected": -397.1574401855469, "step": 920 }, { "epoch": 9.694736842105263, "grad_norm": 1.959126848305459e-06, "learning_rate": 0.00018077894736842108, "logits/chosen": 13.17595100402832, "logits/rejected": 13.17595100402832, "logps/chosen": -5173.25, "logps/rejected": -5173.25, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.3865966796875, "rewards/margins": 0.0, "rewards/rejected": -514.3865966796875, "step": 921 }, { "epoch": 9.705263157894738, "grad_norm": 1.5191129705272033e-06, "learning_rate": 0.0001807578947368421, "logits/chosen": 13.107925415039062, "logits/rejected": 13.107925415039062, "logps/chosen": -4000.228515625, "logps/rejected": -4000.228515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1955261230469, "rewards/margins": 0.0, "rewards/rejected": -397.1955261230469, "step": 922 }, { "epoch": 9.715789473684211, "grad_norm": 8.850959147821413e-07, "learning_rate": 0.00018073684210526315, "logits/chosen": 13.103530883789062, "logits/rejected": 13.103530883789062, "logps/chosen": -2967.771484375, "logps/rejected": -2967.771484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.05096435546875, "rewards/margins": 0.0, "rewards/rejected": -294.05096435546875, "step": 923 }, { "epoch": 9.726315789473684, "grad_norm": 1.1583940704440465e-06, "learning_rate": 0.00018071578947368423, "logits/chosen": 13.108007431030273, "logits/rejected": 13.108007431030273, "logps/chosen": -3758.005859375, "logps/rejected": -3758.005859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.8822937011719, "rewards/margins": 0.0, "rewards/rejected": -372.8822937011719, "step": 924 }, { "epoch": 9.736842105263158, "grad_norm": 1.7935511777977808e-06, "learning_rate": 0.00018069473684210528, "logits/chosen": 13.160980224609375, "logits/rejected": 13.160980224609375, "logps/chosen": -5173.95361328125, "logps/rejected": -5173.95361328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.4569702148438, "rewards/margins": 0.0, "rewards/rejected": -514.4569702148438, "step": 925 }, { "epoch": 9.74736842105263, "grad_norm": 1.3837476444678032e-06, "learning_rate": 0.00018067368421052633, "logits/chosen": 13.14988899230957, "logits/rejected": 13.14988899230957, "logps/chosen": -4879.6416015625, "logps/rejected": -4879.6416015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.11700439453125, "rewards/margins": 0.0, "rewards/rejected": -485.11700439453125, "step": 926 }, { "epoch": 9.757894736842106, "grad_norm": 1.2462518270695e-06, "learning_rate": 0.00018065263157894738, "logits/chosen": 13.096955299377441, "logits/rejected": 13.096955299377441, "logps/chosen": -2673.61328125, "logps/rejected": -2673.61328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.5797424316406, "rewards/margins": 0.0, "rewards/rejected": -264.5797424316406, "step": 927 }, { "epoch": 9.76842105263158, "grad_norm": 1.6706468386473716e-06, "learning_rate": 0.00018063157894736845, "logits/chosen": 13.113860130310059, "logits/rejected": 13.113860130310059, "logps/chosen": -4287.2919921875, "logps/rejected": -4287.2919921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.9318542480469, "rewards/margins": 0.0, "rewards/rejected": -425.9318542480469, "step": 928 }, { "epoch": 9.778947368421052, "grad_norm": 1.5169850939855678e-06, "learning_rate": 0.00018061052631578947, "logits/chosen": 13.162243843078613, "logits/rejected": 13.162243843078613, "logps/chosen": -4879.82958984375, "logps/rejected": -4879.82958984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.13580322265625, "rewards/margins": 0.0, "rewards/rejected": -485.13580322265625, "step": 929 }, { "epoch": 9.789473684210526, "grad_norm": 1.4269896837504348e-06, "learning_rate": 0.00018058947368421052, "logits/chosen": 13.114461898803711, "logits/rejected": 13.114461898803711, "logps/chosen": -3777.1953125, "logps/rejected": -3777.1953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.8504333496094, "rewards/margins": 0.0, "rewards/rejected": -374.8504333496094, "step": 930 }, { "epoch": 9.8, "grad_norm": 1.0124748541784356e-06, "learning_rate": 0.0001805684210526316, "logits/chosen": 13.113466262817383, "logits/rejected": 13.113466262817383, "logps/chosen": -2673.595703125, "logps/rejected": -2673.595703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.5779724121094, "rewards/margins": 0.0, "rewards/rejected": -264.5779724121094, "step": 931 }, { "epoch": 9.810526315789474, "grad_norm": 1.7347774701192975e-06, "learning_rate": 0.00018054736842105265, "logits/chosen": 13.17525863647461, "logits/rejected": 13.17525863647461, "logps/chosen": -4880.5234375, "logps/rejected": -4880.5234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.2051696777344, "rewards/margins": 0.0, "rewards/rejected": -485.2051696777344, "step": 932 }, { "epoch": 9.821052631578947, "grad_norm": 7.927322371870105e-07, "learning_rate": 0.0001805263157894737, "logits/chosen": 13.13425064086914, "logits/rejected": 13.13425064086914, "logps/chosen": -2967.8828125, "logps/rejected": -2967.8828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.0621032714844, "rewards/margins": 0.0, "rewards/rejected": -294.0621032714844, "step": 933 }, { "epoch": 9.83157894736842, "grad_norm": 1.0932701570709469e-06, "learning_rate": 0.00018050526315789475, "logits/chosen": 13.151144027709961, "logits/rejected": 13.151144027709961, "logps/chosen": -3758.41015625, "logps/rejected": -3758.41015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.9226989746094, "rewards/margins": 0.0, "rewards/rejected": -372.9226989746094, "step": 934 }, { "epoch": 9.842105263157894, "grad_norm": 1.906367288029287e-06, "learning_rate": 0.0001804842105263158, "logits/chosen": 13.213872909545898, "logits/rejected": 13.213872909545898, "logps/chosen": -5175.31494140625, "logps/rejected": -5175.31494140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.5930786132812, "rewards/margins": 0.0, "rewards/rejected": -514.5930786132812, "step": 935 }, { "epoch": 9.852631578947369, "grad_norm": 1.2445665333871148e-06, "learning_rate": 0.00018046315789473685, "logits/chosen": 13.160307884216309, "logits/rejected": 13.160307884216309, "logps/chosen": -3999.84375, "logps/rejected": -3999.84375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1570739746094, "rewards/margins": 0.0, "rewards/rejected": -397.1570739746094, "step": 936 }, { "epoch": 9.863157894736842, "grad_norm": 1.4144438864605036e-06, "learning_rate": 0.0001804421052631579, "logits/chosen": 13.167046546936035, "logits/rejected": 13.167046546936035, "logps/chosen": -3777.84375, "logps/rejected": -3777.84375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.9152526855469, "rewards/margins": 0.0, "rewards/rejected": -374.9152526855469, "step": 937 }, { "epoch": 9.873684210526315, "grad_norm": 1.6998163800963084e-06, "learning_rate": 0.00018042105263157894, "logits/chosen": 13.238704681396484, "logits/rejected": 13.238704681396484, "logps/chosen": -5175.4814453125, "logps/rejected": -5175.4814453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6097412109375, "rewards/margins": 0.0, "rewards/rejected": -514.6097412109375, "step": 938 }, { "epoch": 9.884210526315789, "grad_norm": 1.5435953173437156e-06, "learning_rate": 0.00018040000000000002, "logits/chosen": 13.185133934020996, "logits/rejected": 13.185133934020996, "logps/chosen": -4288.92333984375, "logps/rejected": -4288.92333984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.094970703125, "rewards/margins": 0.0, "rewards/rejected": -426.094970703125, "step": 939 }, { "epoch": 9.894736842105264, "grad_norm": 1.576958652549365e-06, "learning_rate": 0.00018037894736842107, "logits/chosen": 13.179280281066895, "logits/rejected": 13.179280281066895, "logps/chosen": -3778.392578125, "logps/rejected": -3778.392578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.97015380859375, "rewards/margins": 0.0, "rewards/rejected": -374.97015380859375, "step": 940 }, { "epoch": 9.905263157894737, "grad_norm": 1.4728045698575443e-06, "learning_rate": 0.0001803578947368421, "logits/chosen": 13.18721866607666, "logits/rejected": 13.18721866607666, "logps/chosen": -4289.30419921875, "logps/rejected": -4289.30419921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.133056640625, "rewards/margins": 0.0, "rewards/rejected": -426.133056640625, "step": 941 }, { "epoch": 9.91578947368421, "grad_norm": 1.2911617659483454e-06, "learning_rate": 0.00018033684210526317, "logits/chosen": 13.178040504455566, "logits/rejected": 13.178040504455566, "logps/chosen": -3999.052734375, "logps/rejected": -3999.052734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0779724121094, "rewards/margins": 0.0, "rewards/rejected": -397.0779724121094, "step": 942 }, { "epoch": 9.926315789473684, "grad_norm": 1.1795951877502375e-06, "learning_rate": 0.00018031578947368422, "logits/chosen": 13.176817893981934, "logits/rejected": 13.176817893981934, "logps/chosen": -3541.595703125, "logps/rejected": -3541.595703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.1336975097656, "rewards/margins": 0.0, "rewards/rejected": -351.1336975097656, "step": 943 }, { "epoch": 9.936842105263159, "grad_norm": 1.463585476813023e-06, "learning_rate": 0.00018029473684210527, "logits/chosen": 13.174623489379883, "logits/rejected": 13.174623489379883, "logps/chosen": -4290.2265625, "logps/rejected": -4290.2265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.2253112792969, "rewards/margins": 0.0, "rewards/rejected": -426.2253112792969, "step": 944 }, { "epoch": 9.947368421052632, "grad_norm": 1.9414749203860993e-06, "learning_rate": 0.00018027368421052632, "logits/chosen": 13.215839385986328, "logits/rejected": 13.215839385986328, "logps/chosen": -4880.64794921875, "logps/rejected": -4880.64794921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.2176208496094, "rewards/margins": 0.0, "rewards/rejected": -485.2176208496094, "step": 945 }, { "epoch": 9.957894736842105, "grad_norm": 1.4988735301812994e-06, "learning_rate": 0.0001802526315789474, "logits/chosen": 13.165823936462402, "logits/rejected": 13.165823936462402, "logps/chosen": -3779.154296875, "logps/rejected": -3779.154296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -375.04632568359375, "rewards/margins": 0.0, "rewards/rejected": -375.04632568359375, "step": 946 }, { "epoch": 9.968421052631578, "grad_norm": 1.4839308732916834e-06, "learning_rate": 0.00018023157894736844, "logits/chosen": 13.233963012695312, "logits/rejected": 13.233963012695312, "logps/chosen": -5174.9248046875, "logps/rejected": -5174.9248046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.5540771484375, "rewards/margins": 0.0, "rewards/rejected": -514.5540771484375, "step": 947 }, { "epoch": 9.978947368421053, "grad_norm": 1.2255666206328897e-06, "learning_rate": 0.00018021052631578946, "logits/chosen": 13.172708511352539, "logits/rejected": 13.172708511352539, "logps/chosen": -3998.29296875, "logps/rejected": -3998.29296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0019836425781, "rewards/margins": 0.0, "rewards/rejected": -397.0019836425781, "step": 948 }, { "epoch": 9.989473684210527, "grad_norm": 1.206728256875067e-06, "learning_rate": 0.00018018947368421054, "logits/chosen": 13.182438850402832, "logits/rejected": 13.182438850402832, "logps/chosen": -3758.41015625, "logps/rejected": -3758.41015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.9226989746094, "rewards/margins": 0.0, "rewards/rejected": -372.9226989746094, "step": 949 }, { "epoch": 10.0, "grad_norm": 1.5243218740579323e-06, "learning_rate": 0.0001801684210526316, "logits/chosen": 13.168974876403809, "logits/rejected": 13.168974876403809, "logps/chosen": -3998.42578125, "logps/rejected": -3998.42578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0152587890625, "rewards/margins": 0.0, "rewards/rejected": -397.0152587890625, "step": 950 }, { "epoch": 10.0, "eval_logits/chosen": 13.194429397583008, "eval_logits/rejected": 13.194429397583008, "eval_logps/chosen": -4310.5576171875, "eval_logps/rejected": -4310.5576171875, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -428.15252685546875, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -428.15252685546875, "eval_runtime": 4.5454, "eval_samples_per_second": 2.2, "eval_steps_per_second": 2.2, "step": 950 }, { "epoch": 10.010526315789473, "grad_norm": 1.4693589491798775e-06, "learning_rate": 0.00018014736842105264, "logits/chosen": 13.159584999084473, "logits/rejected": 13.159584999084473, "logps/chosen": -3780.01953125, "logps/rejected": -3780.01953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -375.1328430175781, "rewards/margins": 0.0, "rewards/rejected": -375.1328430175781, "step": 951 }, { "epoch": 10.021052631578947, "grad_norm": 1.1617106565608992e-06, "learning_rate": 0.0001801263157894737, "logits/chosen": 13.150762557983398, "logits/rejected": 13.150762557983398, "logps/chosen": -3998.5390625, "logps/rejected": -3998.5390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0265808105469, "rewards/margins": 0.0, "rewards/rejected": -397.0265808105469, "step": 952 }, { "epoch": 10.031578947368422, "grad_norm": 1.0501075848878827e-06, "learning_rate": 0.00018010526315789477, "logits/chosen": 13.132506370544434, "logits/rejected": 13.132506370544434, "logps/chosen": -2672.892578125, "logps/rejected": -2672.892578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.5076599121094, "rewards/margins": 0.0, "rewards/rejected": -264.5076599121094, "step": 953 }, { "epoch": 10.042105263157895, "grad_norm": 8.266853228633408e-07, "learning_rate": 0.0001800842105263158, "logits/chosen": 13.129226684570312, "logits/rejected": 13.129226684570312, "logps/chosen": -2967.841796875, "logps/rejected": -2967.841796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.0579833984375, "rewards/margins": 0.0, "rewards/rejected": -294.0579833984375, "step": 954 }, { "epoch": 10.052631578947368, "grad_norm": 1.613939616618154e-06, "learning_rate": 0.00018006315789473684, "logits/chosen": 13.123612403869629, "logits/rejected": 13.123612403869629, "logps/chosen": -4291.8515625, "logps/rejected": -4291.8515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.3877868652344, "rewards/margins": 0.0, "rewards/rejected": -426.3877868652344, "step": 955 }, { "epoch": 10.063157894736841, "grad_norm": 1.5553962384728948e-06, "learning_rate": 0.0001800421052631579, "logits/chosen": 13.108367919921875, "logits/rejected": 13.108367919921875, "logps/chosen": -3781.0693359375, "logps/rejected": -3781.0693359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -375.2378234863281, "rewards/margins": 0.0, "rewards/rejected": -375.2378234863281, "step": 956 }, { "epoch": 10.073684210526316, "grad_norm": 9.385697126162995e-07, "learning_rate": 0.00018002105263157896, "logits/chosen": 13.097867965698242, "logits/rejected": 13.097867965698242, "logps/chosen": -2673.2138671875, "logps/rejected": -2673.2138671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.539794921875, "rewards/margins": 0.0, "rewards/rejected": -264.539794921875, "step": 957 }, { "epoch": 10.08421052631579, "grad_norm": 1.2404476592564606e-06, "learning_rate": 0.00018, "logits/chosen": 13.106371879577637, "logits/rejected": 13.106371879577637, "logps/chosen": -4292.078125, "logps/rejected": -4292.078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.41046142578125, "rewards/margins": 0.0, "rewards/rejected": -426.41046142578125, "step": 958 }, { "epoch": 10.094736842105263, "grad_norm": 1.864947648755333e-06, "learning_rate": 0.00017997894736842106, "logits/chosen": 13.161351203918457, "logits/rejected": 13.161351203918457, "logps/chosen": -5172.3759765625, "logps/rejected": -5172.3759765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.2991943359375, "rewards/margins": 0.0, "rewards/rejected": -514.2991943359375, "step": 959 }, { "epoch": 10.105263157894736, "grad_norm": 2.015239260799717e-06, "learning_rate": 0.0001799578947368421, "logits/chosen": 13.162423133850098, "logits/rejected": 13.162423133850098, "logps/chosen": -5172.1923828125, "logps/rejected": -5172.1923828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.2808227539062, "rewards/margins": 0.0, "rewards/rejected": -514.2808227539062, "step": 960 }, { "epoch": 10.115789473684211, "grad_norm": 1.4148392892820993e-06, "learning_rate": 0.00017993684210526316, "logits/chosen": 13.10560131072998, "logits/rejected": 13.10560131072998, "logps/chosen": -3540.525390625, "logps/rejected": -3540.525390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.02667236328125, "rewards/margins": 0.0, "rewards/rejected": -351.02667236328125, "step": 961 }, { "epoch": 10.126315789473685, "grad_norm": 1.7931035927176708e-06, "learning_rate": 0.0001799157894736842, "logits/chosen": 13.172406196594238, "logits/rejected": 13.172406196594238, "logps/chosen": -5172.849609375, "logps/rejected": -5172.849609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.3465576171875, "rewards/margins": 0.0, "rewards/rejected": -514.3465576171875, "step": 962 }, { "epoch": 10.136842105263158, "grad_norm": 1.007341325021116e-06, "learning_rate": 0.00017989473684210528, "logits/chosen": 13.121959686279297, "logits/rejected": 13.121959686279297, "logps/chosen": -3541.4306640625, "logps/rejected": -3541.4306640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.1171875, "rewards/margins": 0.0, "rewards/rejected": -351.1171875, "step": 963 }, { "epoch": 10.147368421052631, "grad_norm": 8.368168664674158e-07, "learning_rate": 0.00017987368421052633, "logits/chosen": 13.122076988220215, "logits/rejected": 13.122076988220215, "logps/chosen": -2674.181640625, "logps/rejected": -2674.181640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.6365661621094, "rewards/margins": 0.0, "rewards/rejected": -264.6365661621094, "step": 964 }, { "epoch": 10.157894736842104, "grad_norm": 1.775458144948061e-06, "learning_rate": 0.00017985263157894738, "logits/chosen": 13.139272689819336, "logits/rejected": 13.139272689819336, "logps/chosen": -3997.744140625, "logps/rejected": -3997.744140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9471130371094, "rewards/margins": 0.0, "rewards/rejected": -396.9471130371094, "step": 965 }, { "epoch": 10.16842105263158, "grad_norm": 8.284944215120049e-07, "learning_rate": 0.00017983157894736843, "logits/chosen": 13.147418022155762, "logits/rejected": 13.147418022155762, "logps/chosen": -2968.232421875, "logps/rejected": -2968.232421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.0970458984375, "rewards/margins": 0.0, "rewards/rejected": -294.0970458984375, "step": 966 }, { "epoch": 10.178947368421053, "grad_norm": 9.165895562546211e-07, "learning_rate": 0.00017981052631578948, "logits/chosen": 13.15627384185791, "logits/rejected": 13.15627384185791, "logps/chosen": -3542.287109375, "logps/rejected": -3542.287109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.2028503417969, "rewards/margins": 0.0, "rewards/rejected": -351.2028503417969, "step": 967 }, { "epoch": 10.189473684210526, "grad_norm": 7.98844666860532e-07, "learning_rate": 0.00017978947368421053, "logits/chosen": 13.160597801208496, "logits/rejected": 13.160597801208496, "logps/chosen": -2968.58984375, "logps/rejected": -2968.58984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.1327819824219, "rewards/margins": 0.0, "rewards/rejected": -294.1327819824219, "step": 968 }, { "epoch": 10.2, "grad_norm": 1.2824373243347509e-06, "learning_rate": 0.00017976842105263158, "logits/chosen": 13.165441513061523, "logits/rejected": 13.165441513061523, "logps/chosen": -3997.59765625, "logps/rejected": -3997.59765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9324645996094, "rewards/margins": 0.0, "rewards/rejected": -396.9324645996094, "step": 969 }, { "epoch": 10.210526315789474, "grad_norm": 1.0446786973261624e-06, "learning_rate": 0.00017974736842105263, "logits/chosen": 13.174192428588867, "logits/rejected": 13.174192428588867, "logps/chosen": -3543.2490234375, "logps/rejected": -3543.2490234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.2990417480469, "rewards/margins": 0.0, "rewards/rejected": -351.2990417480469, "step": 970 }, { "epoch": 10.221052631578948, "grad_norm": 1.2605731853909674e-06, "learning_rate": 0.0001797263157894737, "logits/chosen": 13.172664642333984, "logits/rejected": 13.172664642333984, "logps/chosen": -3997.59375, "logps/rejected": -3997.59375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.93206787109375, "rewards/margins": 0.0, "rewards/rejected": -396.93206787109375, "step": 971 }, { "epoch": 10.23157894736842, "grad_norm": 1.0253037316942937e-06, "learning_rate": 0.00017970526315789476, "logits/chosen": 13.179628372192383, "logits/rejected": 13.179628372192383, "logps/chosen": -3543.8369140625, "logps/rejected": -3543.8369140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3578186035156, "rewards/margins": 0.0, "rewards/rejected": -351.3578186035156, "step": 972 }, { "epoch": 10.242105263157894, "grad_norm": 1.3200043440519948e-06, "learning_rate": 0.00017968421052631578, "logits/chosen": 13.17689037322998, "logits/rejected": 13.17689037322998, "logps/chosen": -3779.640625, "logps/rejected": -3779.640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -375.0949401855469, "rewards/margins": 0.0, "rewards/rejected": -375.0949401855469, "step": 973 }, { "epoch": 10.25263157894737, "grad_norm": 8.757655223234906e-07, "learning_rate": 0.00017966315789473685, "logits/chosen": 13.185495376586914, "logits/rejected": 13.185495376586914, "logps/chosen": -3544.4033203125, "logps/rejected": -3544.4033203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.4144592285156, "rewards/margins": 0.0, "rewards/rejected": -351.4144592285156, "step": 974 }, { "epoch": 10.263157894736842, "grad_norm": 1.3463446748573915e-06, "learning_rate": 0.0001796421052631579, "logits/chosen": 13.184090614318848, "logits/rejected": 13.184090614318848, "logps/chosen": -3998.13671875, "logps/rejected": -3998.13671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9863586425781, "rewards/margins": 0.0, "rewards/rejected": -396.9863586425781, "step": 975 }, { "epoch": 10.273684210526316, "grad_norm": 1.8277658000442898e-06, "learning_rate": 0.00017962105263157895, "logits/chosen": 13.236319541931152, "logits/rejected": 13.236319541931152, "logps/chosen": -4876.94091796875, "logps/rejected": -4876.94091796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.846923828125, "rewards/margins": 0.0, "rewards/rejected": -484.846923828125, "step": 976 }, { "epoch": 10.284210526315789, "grad_norm": 1.055819097928179e-06, "learning_rate": 0.0001796, "logits/chosen": 13.196540832519531, "logits/rejected": 13.196540832519531, "logps/chosen": -3757.6884765625, "logps/rejected": -3757.6884765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.8505554199219, "rewards/margins": 0.0, "rewards/rejected": -372.8505554199219, "step": 977 }, { "epoch": 10.294736842105262, "grad_norm": 1.4574362694474985e-06, "learning_rate": 0.00017957894736842108, "logits/chosen": 13.253937721252441, "logits/rejected": 13.253937721252441, "logps/chosen": -5175.04248046875, "logps/rejected": -5175.04248046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.5658569335938, "rewards/margins": 0.0, "rewards/rejected": -514.5658569335938, "step": 978 }, { "epoch": 10.305263157894737, "grad_norm": 1.3784009524897556e-06, "learning_rate": 0.00017955789473684213, "logits/chosen": 13.242537498474121, "logits/rejected": 13.242537498474121, "logps/chosen": -4877.05078125, "logps/rejected": -4877.05078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.85791015625, "rewards/margins": 0.0, "rewards/rejected": -484.85791015625, "step": 979 }, { "epoch": 10.31578947368421, "grad_norm": 1.3029579122303403e-06, "learning_rate": 0.00017953684210526315, "logits/chosen": 13.20040512084961, "logits/rejected": 13.20040512084961, "logps/chosen": -4289.892578125, "logps/rejected": -4289.892578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.19189453125, "rewards/margins": 0.0, "rewards/rejected": -426.19189453125, "step": 980 }, { "epoch": 10.326315789473684, "grad_norm": 1.6156998299265979e-06, "learning_rate": 0.00017951578947368423, "logits/chosen": 13.245430946350098, "logits/rejected": 13.245430946350098, "logps/chosen": -4877.677734375, "logps/rejected": -4877.677734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.9206237792969, "rewards/margins": 0.0, "rewards/rejected": -484.9206237792969, "step": 981 }, { "epoch": 10.336842105263157, "grad_norm": 1.3154404996384983e-06, "learning_rate": 0.00017949473684210528, "logits/chosen": 13.232123374938965, "logits/rejected": 13.232123374938965, "logps/chosen": -4326.23046875, "logps/rejected": -4326.23046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.4382019042969, "rewards/margins": 0.0, "rewards/rejected": -429.4382019042969, "step": 982 }, { "epoch": 10.347368421052632, "grad_norm": 1.483234314036963e-06, "learning_rate": 0.00017947368421052632, "logits/chosen": 13.189722061157227, "logits/rejected": 13.189722061157227, "logps/chosen": -3998.69140625, "logps/rejected": -3998.69140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0418395996094, "rewards/margins": 0.0, "rewards/rejected": -397.0418395996094, "step": 983 }, { "epoch": 10.357894736842105, "grad_norm": 1.1578424619074212e-06, "learning_rate": 0.00017945263157894737, "logits/chosen": 13.174674987792969, "logits/rejected": 13.174674987792969, "logps/chosen": -2673.5751953125, "logps/rejected": -2673.5751953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.575927734375, "rewards/margins": 0.0, "rewards/rejected": -264.575927734375, "step": 984 }, { "epoch": 10.368421052631579, "grad_norm": 9.803466127777938e-07, "learning_rate": 0.00017943157894736845, "logits/chosen": 13.18294906616211, "logits/rejected": 13.18294906616211, "logps/chosen": -3758.1171875, "logps/rejected": -3758.1171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.8934020996094, "rewards/margins": 0.0, "rewards/rejected": -372.8934020996094, "step": 985 }, { "epoch": 10.378947368421052, "grad_norm": 1.6069811863417272e-06, "learning_rate": 0.00017941052631578947, "logits/chosen": 13.230050086975098, "logits/rejected": 13.230050086975098, "logps/chosen": -5175.53955078125, "logps/rejected": -5175.53955078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6155395507812, "rewards/margins": 0.0, "rewards/rejected": -514.6155395507812, "step": 986 }, { "epoch": 10.389473684210527, "grad_norm": 1.5931537973301602e-06, "learning_rate": 0.00017938947368421052, "logits/chosen": 13.160172462463379, "logits/rejected": 13.160172462463379, "logps/chosen": -3778.580078125, "logps/rejected": -3778.580078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.9888916015625, "rewards/margins": 0.0, "rewards/rejected": -374.9888916015625, "step": 987 }, { "epoch": 10.4, "grad_norm": 1.9772703581111273e-06, "learning_rate": 0.0001793684210526316, "logits/chosen": 13.211435317993164, "logits/rejected": 13.211435317993164, "logps/chosen": -4879.33349609375, "logps/rejected": -4879.33349609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.086181640625, "rewards/margins": 0.0, "rewards/rejected": -485.086181640625, "step": 988 }, { "epoch": 10.410526315789474, "grad_norm": 1.1682045624183957e-06, "learning_rate": 0.00017934736842105265, "logits/chosen": 13.163681983947754, "logits/rejected": 13.163681983947754, "logps/chosen": -3999.25, "logps/rejected": -3999.25, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0976867675781, "rewards/margins": 0.0, "rewards/rejected": -397.0976867675781, "step": 989 }, { "epoch": 10.421052631578947, "grad_norm": 1.691668558123638e-06, "learning_rate": 0.0001793263157894737, "logits/chosen": 13.231330871582031, "logits/rejected": 13.231330871582031, "logps/chosen": -5175.73779296875, "logps/rejected": -5175.73779296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6353759765625, "rewards/margins": 0.0, "rewards/rejected": -514.6353759765625, "step": 990 }, { "epoch": 10.431578947368422, "grad_norm": 1.5367602372862166e-06, "learning_rate": 0.00017930526315789475, "logits/chosen": 13.220803260803223, "logits/rejected": 13.220803260803223, "logps/chosen": -4880.1689453125, "logps/rejected": -4880.1689453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.16973876953125, "rewards/margins": 0.0, "rewards/rejected": -485.16973876953125, "step": 991 }, { "epoch": 10.442105263157895, "grad_norm": 1.2719325468424358e-06, "learning_rate": 0.0001792842105263158, "logits/chosen": 13.18104362487793, "logits/rejected": 13.18104362487793, "logps/chosen": -3758.6015625, "logps/rejected": -3758.6015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.9418640136719, "rewards/margins": 0.0, "rewards/rejected": -372.9418640136719, "step": 992 }, { "epoch": 10.452631578947368, "grad_norm": 1.4966909702707198e-06, "learning_rate": 0.00017926315789473684, "logits/chosen": 13.178886413574219, "logits/rejected": 13.178886413574219, "logps/chosen": -4288.48583984375, "logps/rejected": -4288.48583984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.0512390136719, "rewards/margins": 0.0, "rewards/rejected": -426.0512390136719, "step": 993 }, { "epoch": 10.463157894736842, "grad_norm": 1.3262939546621055e-06, "learning_rate": 0.0001792421052631579, "logits/chosen": 13.222832679748535, "logits/rejected": 13.222832679748535, "logps/chosen": -4880.97265625, "logps/rejected": -4880.97265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.2500915527344, "rewards/margins": 0.0, "rewards/rejected": -485.2500915527344, "step": 994 }, { "epoch": 10.473684210526315, "grad_norm": 1.1435511169111123e-06, "learning_rate": 0.00017922105263157897, "logits/chosen": 13.222113609313965, "logits/rejected": 13.222113609313965, "logps/chosen": -4881.46630859375, "logps/rejected": -4881.46630859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.2994689941406, "rewards/margins": 0.0, "rewards/rejected": -485.2994689941406, "step": 995 }, { "epoch": 10.48421052631579, "grad_norm": 1.5773370023453026e-06, "learning_rate": 0.00017920000000000002, "logits/chosen": 13.235569953918457, "logits/rejected": 13.235569953918457, "logps/chosen": -5175.8994140625, "logps/rejected": -5175.8994140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6515502929688, "rewards/margins": 0.0, "rewards/rejected": -514.6515502929688, "step": 996 }, { "epoch": 10.494736842105263, "grad_norm": 1.402113412041217e-06, "learning_rate": 0.00017917894736842107, "logits/chosen": 13.211935043334961, "logits/rejected": 13.211935043334961, "logps/chosen": -4325.302734375, "logps/rejected": -4325.302734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.3454284667969, "rewards/margins": 0.0, "rewards/rejected": -429.3454284667969, "step": 997 }, { "epoch": 10.505263157894737, "grad_norm": 1.350139541500539e-06, "learning_rate": 0.00017915789473684212, "logits/chosen": 13.21486759185791, "logits/rejected": 13.21486759185791, "logps/chosen": -4325.42578125, "logps/rejected": -4325.42578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.35772705078125, "rewards/margins": 0.0, "rewards/rejected": -429.35772705078125, "step": 998 }, { "epoch": 10.51578947368421, "grad_norm": 1.136086893893662e-06, "learning_rate": 0.00017913684210526317, "logits/chosen": 13.187580108642578, "logits/rejected": 13.187580108642578, "logps/chosen": -3759.060546875, "logps/rejected": -3759.060546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.9877624511719, "rewards/margins": 0.0, "rewards/rejected": -372.9877624511719, "step": 999 }, { "epoch": 10.526315789473685, "grad_norm": 1.4942453390176524e-06, "learning_rate": 0.00017911578947368422, "logits/chosen": 13.178108215332031, "logits/rejected": 13.178108215332031, "logps/chosen": -3998.203125, "logps/rejected": -3998.203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9930114746094, "rewards/margins": 0.0, "rewards/rejected": -396.9930114746094, "step": 1000 }, { "epoch": 10.526315789473685, "eval_logits/chosen": 13.210248947143555, "eval_logits/rejected": 13.210248947143555, "eval_logps/chosen": -4310.80810546875, "eval_logps/rejected": -4310.80810546875, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -428.17767333984375, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -428.17767333984375, "eval_runtime": 4.3934, "eval_samples_per_second": 2.276, "eval_steps_per_second": 2.276, "step": 1000 }, { "epoch": 10.536842105263158, "grad_norm": 1.3362285926632467e-06, "learning_rate": 0.00017909473684210527, "logits/chosen": 13.228311538696289, "logits/rejected": 13.228311538696289, "logps/chosen": -4882.1826171875, "logps/rejected": -4882.1826171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.37109375, "rewards/margins": 0.0, "rewards/rejected": -485.37109375, "step": 1001 }, { "epoch": 10.547368421052632, "grad_norm": 1.278688728234556e-06, "learning_rate": 0.00017907368421052631, "logits/chosen": 13.169454574584961, "logits/rejected": 13.169454574584961, "logps/chosen": -3998.1796875, "logps/rejected": -3998.1796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.99066162109375, "rewards/margins": 0.0, "rewards/rejected": -396.99066162109375, "step": 1002 }, { "epoch": 10.557894736842105, "grad_norm": 1.2729005902656354e-06, "learning_rate": 0.0001790526315789474, "logits/chosen": 13.21623420715332, "logits/rejected": 13.21623420715332, "logps/chosen": -4882.1455078125, "logps/rejected": -4882.1455078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.3674011230469, "rewards/margins": 0.0, "rewards/rejected": -485.3674011230469, "step": 1003 }, { "epoch": 10.568421052631578, "grad_norm": 1.3228390116637456e-06, "learning_rate": 0.00017903157894736844, "logits/chosen": 13.209847450256348, "logits/rejected": 13.209847450256348, "logps/chosen": -4882.48486328125, "logps/rejected": -4882.48486328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.4013366699219, "rewards/margins": 0.0, "rewards/rejected": -485.4013366699219, "step": 1004 }, { "epoch": 10.578947368421053, "grad_norm": 1.2972160448043724e-06, "learning_rate": 0.00017901052631578946, "logits/chosen": 13.191460609436035, "logits/rejected": 13.191460609436035, "logps/chosen": -4326.072265625, "logps/rejected": -4326.072265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.42236328125, "rewards/margins": 0.0, "rewards/rejected": -429.42236328125, "step": 1005 }, { "epoch": 10.589473684210526, "grad_norm": 1.2427194633346517e-06, "learning_rate": 0.00017898947368421054, "logits/chosen": 13.139479637145996, "logits/rejected": 13.139479637145996, "logps/chosen": -2671.33984375, "logps/rejected": -2671.33984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.3523864746094, "rewards/margins": 0.0, "rewards/rejected": -264.3523864746094, "step": 1006 }, { "epoch": 10.6, "grad_norm": 1.1622055353655014e-06, "learning_rate": 0.0001789684210526316, "logits/chosen": 13.137301445007324, "logits/rejected": 13.137301445007324, "logps/chosen": -2671.369140625, "logps/rejected": -2671.369140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.3553161621094, "rewards/margins": 0.0, "rewards/rejected": -264.3553161621094, "step": 1007 }, { "epoch": 10.610526315789473, "grad_norm": 1.1334084319969406e-06, "learning_rate": 0.00017894736842105264, "logits/chosen": 13.14943790435791, "logits/rejected": 13.14943790435791, "logps/chosen": -3540.439453125, "logps/rejected": -3540.439453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.01806640625, "rewards/margins": 0.0, "rewards/rejected": -351.01806640625, "step": 1008 }, { "epoch": 10.621052631578948, "grad_norm": 1.0162290209336788e-06, "learning_rate": 0.0001789263157894737, "logits/chosen": 13.152040481567383, "logits/rejected": 13.152040481567383, "logps/chosen": -3540.490234375, "logps/rejected": -3540.490234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.0231628417969, "rewards/margins": 0.0, "rewards/rejected": -351.0231628417969, "step": 1009 }, { "epoch": 10.631578947368421, "grad_norm": 1.959740302481805e-06, "learning_rate": 0.00017890526315789476, "logits/chosen": 13.152101516723633, "logits/rejected": 13.152101516723633, "logps/chosen": -3998.369140625, "logps/rejected": -3998.369140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0096130371094, "rewards/margins": 0.0, "rewards/rejected": -397.0096130371094, "step": 1010 }, { "epoch": 10.642105263157895, "grad_norm": 1.0505898444534978e-06, "learning_rate": 0.00017888421052631579, "logits/chosen": 13.162419319152832, "logits/rejected": 13.162419319152832, "logps/chosen": -3540.927734375, "logps/rejected": -3540.927734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.06689453125, "rewards/margins": 0.0, "rewards/rejected": -351.06689453125, "step": 1011 }, { "epoch": 10.652631578947368, "grad_norm": 2.120922317772056e-06, "learning_rate": 0.00017886315789473683, "logits/chosen": 13.204904556274414, "logits/rejected": 13.204904556274414, "logps/chosen": -4326.396484375, "logps/rejected": -4326.396484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.4548034667969, "rewards/margins": 0.0, "rewards/rejected": -429.4548034667969, "step": 1012 }, { "epoch": 10.663157894736843, "grad_norm": 1.5682876437494997e-06, "learning_rate": 0.0001788421052631579, "logits/chosen": 13.172957420349121, "logits/rejected": 13.172957420349121, "logps/chosen": -3776.220703125, "logps/rejected": -3776.220703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.7529602050781, "rewards/margins": 0.0, "rewards/rejected": -374.7529602050781, "step": 1013 }, { "epoch": 10.673684210526316, "grad_norm": 2.4487833343300736e-06, "learning_rate": 0.00017882105263157896, "logits/chosen": 13.251319885253906, "logits/rejected": 13.251319885253906, "logps/chosen": -5175.296875, "logps/rejected": -5175.296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.5912475585938, "rewards/margins": 0.0, "rewards/rejected": -514.5912475585938, "step": 1014 }, { "epoch": 10.68421052631579, "grad_norm": 1.9198505469830707e-06, "learning_rate": 0.0001788, "logits/chosen": 13.196731567382812, "logits/rejected": 13.196731567382812, "logps/chosen": -3542.06640625, "logps/rejected": -3542.06640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.1807556152344, "rewards/margins": 0.0, "rewards/rejected": -351.1807556152344, "step": 1015 }, { "epoch": 10.694736842105263, "grad_norm": 1.5869774188104202e-06, "learning_rate": 0.00017877894736842106, "logits/chosen": 13.27206039428711, "logits/rejected": 13.27206039428711, "logps/chosen": -5175.7421875, "logps/rejected": -5175.7421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6358032226562, "rewards/margins": 0.0, "rewards/rejected": -514.6358032226562, "step": 1016 }, { "epoch": 10.705263157894738, "grad_norm": 9.356546684102796e-07, "learning_rate": 0.00017875789473684213, "logits/chosen": 13.20598316192627, "logits/rejected": 13.20598316192627, "logps/chosen": -2673.041015625, "logps/rejected": -2673.041015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.52252197265625, "rewards/margins": 0.0, "rewards/rejected": -264.52252197265625, "step": 1017 }, { "epoch": 10.715789473684211, "grad_norm": 2.2187205104273744e-06, "learning_rate": 0.00017873684210526316, "logits/chosen": 13.23035717010498, "logits/rejected": 13.23035717010498, "logps/chosen": -4286.69921875, "logps/rejected": -4286.69921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.87255859375, "rewards/margins": 0.0, "rewards/rejected": -425.87255859375, "step": 1018 }, { "epoch": 10.726315789473684, "grad_norm": 8.783920861787919e-07, "learning_rate": 0.0001787157894736842, "logits/chosen": 13.237408638000488, "logits/rejected": 13.237408638000488, "logps/chosen": -3543.453125, "logps/rejected": -3543.453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3194274902344, "rewards/margins": 0.0, "rewards/rejected": -351.3194274902344, "step": 1019 }, { "epoch": 10.736842105263158, "grad_norm": 1.9901049199688714e-06, "learning_rate": 0.00017869473684210528, "logits/chosen": 13.248115539550781, "logits/rejected": 13.248115539550781, "logps/chosen": -4287.0732421875, "logps/rejected": -4287.0732421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.90997314453125, "rewards/margins": 0.0, "rewards/rejected": -425.90997314453125, "step": 1020 }, { "epoch": 10.74736842105263, "grad_norm": 8.802184652267897e-07, "learning_rate": 0.00017867368421052633, "logits/chosen": 13.25467586517334, "logits/rejected": 13.25467586517334, "logps/chosen": -3544.111328125, "logps/rejected": -3544.111328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.38525390625, "rewards/margins": 0.0, "rewards/rejected": -351.38525390625, "step": 1021 }, { "epoch": 10.757894736842106, "grad_norm": 1.355450990558893e-06, "learning_rate": 0.00017865263157894738, "logits/chosen": 13.256010055541992, "logits/rejected": 13.256010055541992, "logps/chosen": -3997.22265625, "logps/rejected": -3997.22265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.89495849609375, "rewards/margins": 0.0, "rewards/rejected": -396.89495849609375, "step": 1022 }, { "epoch": 10.76842105263158, "grad_norm": 1.2228549621795537e-06, "learning_rate": 0.00017863157894736843, "logits/chosen": 13.251581192016602, "logits/rejected": 13.251581192016602, "logps/chosen": -2673.673828125, "logps/rejected": -2673.673828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.5857849121094, "rewards/margins": 0.0, "rewards/rejected": -264.5857849121094, "step": 1023 }, { "epoch": 10.778947368421052, "grad_norm": 1.1635554528766079e-06, "learning_rate": 0.00017861052631578948, "logits/chosen": 13.2525053024292, "logits/rejected": 13.2525053024292, "logps/chosen": -2673.8095703125, "logps/rejected": -2673.8095703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.599365234375, "rewards/margins": 0.0, "rewards/rejected": -264.599365234375, "step": 1024 }, { "epoch": 10.789473684210526, "grad_norm": 1.3122258906150819e-06, "learning_rate": 0.00017858947368421053, "logits/chosen": 13.258501052856445, "logits/rejected": 13.258501052856445, "logps/chosen": -3997.408203125, "logps/rejected": -3997.408203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.91351318359375, "rewards/margins": 0.0, "rewards/rejected": -396.91351318359375, "step": 1025 }, { "epoch": 10.8, "grad_norm": 1.368787934552529e-06, "learning_rate": 0.00017856842105263158, "logits/chosen": 13.252470016479492, "logits/rejected": 13.252470016479492, "logps/chosen": -3997.669921875, "logps/rejected": -3997.669921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9396667480469, "rewards/margins": 0.0, "rewards/rejected": -396.9396667480469, "step": 1026 }, { "epoch": 10.810526315789474, "grad_norm": 2.3092115952749737e-06, "learning_rate": 0.00017854736842105263, "logits/chosen": 13.298238754272461, "logits/rejected": 13.298238754272461, "logps/chosen": -4878.5146484375, "logps/rejected": -4878.5146484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.0043029785156, "rewards/margins": 0.0, "rewards/rejected": -485.0043029785156, "step": 1027 }, { "epoch": 10.821052631578947, "grad_norm": 8.208472763726604e-07, "learning_rate": 0.0001785263157894737, "logits/chosen": 13.237881660461426, "logits/rejected": 13.237881660461426, "logps/chosen": -2968.15625, "logps/rejected": -2968.15625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.0894470214844, "rewards/margins": 0.0, "rewards/rejected": -294.0894470214844, "step": 1028 }, { "epoch": 10.83157894736842, "grad_norm": 1.2605588608494145e-06, "learning_rate": 0.00017850526315789475, "logits/chosen": 13.226240158081055, "logits/rejected": 13.226240158081055, "logps/chosen": -3998.015625, "logps/rejected": -3998.015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9742431640625, "rewards/margins": 0.0, "rewards/rejected": -396.9742431640625, "step": 1029 }, { "epoch": 10.842105263157894, "grad_norm": 1.4274274917625007e-06, "learning_rate": 0.00017848421052631578, "logits/chosen": 13.271389961242676, "logits/rejected": 13.271389961242676, "logps/chosen": -4878.86328125, "logps/rejected": -4878.86328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.0391540527344, "rewards/margins": 0.0, "rewards/rejected": -485.0391540527344, "step": 1030 }, { "epoch": 10.852631578947369, "grad_norm": 1.3848475646227598e-06, "learning_rate": 0.00017846315789473685, "logits/chosen": 13.212285041809082, "logits/rejected": 13.212285041809082, "logps/chosen": -3544.8076171875, "logps/rejected": -3544.8076171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.45489501953125, "rewards/margins": 0.0, "rewards/rejected": -351.45489501953125, "step": 1031 }, { "epoch": 10.863157894736842, "grad_norm": 1.1466543128335616e-06, "learning_rate": 0.0001784421052631579, "logits/chosen": 13.208029747009277, "logits/rejected": 13.208029747009277, "logps/chosen": -3757.9775390625, "logps/rejected": -3757.9775390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.87945556640625, "rewards/margins": 0.0, "rewards/rejected": -372.87945556640625, "step": 1032 }, { "epoch": 10.873684210526315, "grad_norm": 9.515599685983034e-07, "learning_rate": 0.00017842105263157895, "logits/chosen": 13.19658088684082, "logits/rejected": 13.19658088684082, "logps/chosen": -3544.865234375, "logps/rejected": -3544.865234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.4606628417969, "rewards/margins": 0.0, "rewards/rejected": -351.4606628417969, "step": 1033 }, { "epoch": 10.884210526315789, "grad_norm": 7.903745995463396e-07, "learning_rate": 0.0001784, "logits/chosen": 13.18742561340332, "logits/rejected": 13.18742561340332, "logps/chosen": -2968.53125, "logps/rejected": -2968.53125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.1269226074219, "rewards/margins": 0.0, "rewards/rejected": -294.1269226074219, "step": 1034 }, { "epoch": 10.894736842105264, "grad_norm": 8.046241646297858e-07, "learning_rate": 0.00017837894736842108, "logits/chosen": 13.171792030334473, "logits/rejected": 13.171792030334473, "logps/chosen": -2675.59765625, "logps/rejected": -2675.59765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.7781677246094, "rewards/margins": 0.0, "rewards/rejected": -264.7781677246094, "step": 1035 }, { "epoch": 10.905263157894737, "grad_norm": 2.012414597629686e-06, "learning_rate": 0.00017835789473684213, "logits/chosen": 13.174912452697754, "logits/rejected": 13.174912452697754, "logps/chosen": -3777.1904296875, "logps/rejected": -3777.1904296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.8499450683594, "rewards/margins": 0.0, "rewards/rejected": -374.8499450683594, "step": 1036 }, { "epoch": 10.91578947368421, "grad_norm": 1.4943033193048905e-06, "learning_rate": 0.00017833684210526315, "logits/chosen": 13.182901382446289, "logits/rejected": 13.182901382446289, "logps/chosen": -3758.171875, "logps/rejected": -3758.171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.8988952636719, "rewards/margins": 0.0, "rewards/rejected": -372.8988952636719, "step": 1037 }, { "epoch": 10.926315789473684, "grad_norm": 1.9015815269085579e-06, "learning_rate": 0.00017831578947368422, "logits/chosen": 13.239189147949219, "logits/rejected": 13.239189147949219, "logps/chosen": -5174.32470703125, "logps/rejected": -5174.32470703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.4940795898438, "rewards/margins": 0.0, "rewards/rejected": -514.4940795898438, "step": 1038 }, { "epoch": 10.936842105263159, "grad_norm": 1.4353944379763561e-06, "learning_rate": 0.00017829473684210527, "logits/chosen": 13.228001594543457, "logits/rejected": 13.228001594543457, "logps/chosen": -4878.841796875, "logps/rejected": -4878.841796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.0370178222656, "rewards/margins": 0.0, "rewards/rejected": -485.0370178222656, "step": 1039 }, { "epoch": 10.947368421052632, "grad_norm": 1.4171000657370314e-06, "learning_rate": 0.00017827368421052632, "logits/chosen": 13.177600860595703, "logits/rejected": 13.177600860595703, "logps/chosen": -2968.658203125, "logps/rejected": -2968.658203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.1396179199219, "rewards/margins": 0.0, "rewards/rejected": -294.1396179199219, "step": 1040 }, { "epoch": 10.957894736842105, "grad_norm": 3.215348897356307e-06, "learning_rate": 0.00017825263157894737, "logits/chosen": 13.24644947052002, "logits/rejected": 13.24644947052002, "logps/chosen": -5174.0419921875, "logps/rejected": -5174.0419921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.4657592773438, "rewards/margins": 0.0, "rewards/rejected": -514.4657592773438, "step": 1041 }, { "epoch": 10.968421052631578, "grad_norm": 1.5377390809589997e-06, "learning_rate": 0.00017823157894736845, "logits/chosen": 13.180655479431152, "logits/rejected": 13.180655479431152, "logps/chosen": -3778.388671875, "logps/rejected": -3778.388671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.9697570800781, "rewards/margins": 0.0, "rewards/rejected": -374.9697570800781, "step": 1042 }, { "epoch": 10.978947368421053, "grad_norm": 2.6887482817983255e-06, "learning_rate": 0.00017821052631578947, "logits/chosen": 13.249733924865723, "logits/rejected": 13.249733924865723, "logps/chosen": -5174.35498046875, "logps/rejected": -5174.35498046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.4970703125, "rewards/margins": 0.0, "rewards/rejected": -514.4970703125, "step": 1043 }, { "epoch": 10.989473684210527, "grad_norm": 1.5689623751313775e-06, "learning_rate": 0.00017818947368421052, "logits/chosen": 13.185819625854492, "logits/rejected": 13.185819625854492, "logps/chosen": -3998.734375, "logps/rejected": -3998.734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0461120605469, "rewards/margins": 0.0, "rewards/rejected": -397.0461120605469, "step": 1044 }, { "epoch": 11.0, "grad_norm": 2.098635377478786e-06, "learning_rate": 0.0001781684210526316, "logits/chosen": 13.242274284362793, "logits/rejected": 13.242274284362793, "logps/chosen": -4879.4326171875, "logps/rejected": -4879.4326171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.0960998535156, "rewards/margins": 0.0, "rewards/rejected": -485.0960998535156, "step": 1045 }, { "epoch": 11.010526315789473, "grad_norm": 1.1269537480984582e-06, "learning_rate": 0.00017814736842105264, "logits/chosen": 13.24583911895752, "logits/rejected": 13.24583911895752, "logps/chosen": -4879.86962890625, "logps/rejected": -4879.86962890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.1398010253906, "rewards/margins": 0.0, "rewards/rejected": -485.1398010253906, "step": 1046 }, { "epoch": 11.021052631578947, "grad_norm": 2.0678728560596937e-06, "learning_rate": 0.0001781263157894737, "logits/chosen": 13.201807022094727, "logits/rejected": 13.201807022094727, "logps/chosen": -4288.85888671875, "logps/rejected": -4288.85888671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.0885314941406, "rewards/margins": 0.0, "rewards/rejected": -426.0885314941406, "step": 1047 }, { "epoch": 11.031578947368422, "grad_norm": 2.1118066797498614e-06, "learning_rate": 0.00017810526315789474, "logits/chosen": 13.192779541015625, "logits/rejected": 13.192779541015625, "logps/chosen": -3998.36328125, "logps/rejected": -3998.36328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0090026855469, "rewards/margins": 0.0, "rewards/rejected": -397.0090026855469, "step": 1048 }, { "epoch": 11.042105263157895, "grad_norm": 1.1340811170157394e-06, "learning_rate": 0.00017808421052631582, "logits/chosen": 13.19736385345459, "logits/rejected": 13.19736385345459, "logps/chosen": -3758.8935546875, "logps/rejected": -3758.8935546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.9710388183594, "rewards/margins": 0.0, "rewards/rejected": -372.9710388183594, "step": 1049 }, { "epoch": 11.052631578947368, "grad_norm": 1.226817744282016e-06, "learning_rate": 0.00017806315789473684, "logits/chosen": 13.17660140991211, "logits/rejected": 13.17660140991211, "logps/chosen": -3998.458984375, "logps/rejected": -3998.458984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0185852050781, "rewards/margins": 0.0, "rewards/rejected": -397.0185852050781, "step": 1050 }, { "epoch": 11.052631578947368, "eval_logits/chosen": 13.202532768249512, "eval_logits/rejected": 13.202532768249512, "eval_logps/chosen": -4310.796875, "eval_logps/rejected": -4310.796875, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -428.17645263671875, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -428.17645263671875, "eval_runtime": 4.5487, "eval_samples_per_second": 2.198, "eval_steps_per_second": 2.198, "step": 1050 }, { "epoch": 11.063157894736841, "grad_norm": 8.642288662485953e-07, "learning_rate": 0.0001780421052631579, "logits/chosen": 13.158774375915527, "logits/rejected": 13.158774375915527, "logps/chosen": -2673.4619140625, "logps/rejected": -2673.4619140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.5646057128906, "rewards/margins": 0.0, "rewards/rejected": -264.5646057128906, "step": 1051 }, { "epoch": 11.073684210526316, "grad_norm": 2.489583721398958e-06, "learning_rate": 0.00017802105263157897, "logits/chosen": 13.1621675491333, "logits/rejected": 13.1621675491333, "logps/chosen": -4288.85107421875, "logps/rejected": -4288.85107421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.0877380371094, "rewards/margins": 0.0, "rewards/rejected": -426.0877380371094, "step": 1052 }, { "epoch": 11.08421052631579, "grad_norm": 2.912012178057921e-06, "learning_rate": 0.00017800000000000002, "logits/chosen": 13.20124626159668, "logits/rejected": 13.20124626159668, "logps/chosen": -4880.75, "logps/rejected": -4880.75, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.22784423828125, "rewards/margins": 0.0, "rewards/rejected": -485.22784423828125, "step": 1053 }, { "epoch": 11.094736842105263, "grad_norm": 1.1341369372530608e-06, "learning_rate": 0.00017797894736842107, "logits/chosen": 13.19956111907959, "logits/rejected": 13.19956111907959, "logps/chosen": -4881.3955078125, "logps/rejected": -4881.3955078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.2923889160156, "rewards/margins": 0.0, "rewards/rejected": -485.2923889160156, "step": 1054 }, { "epoch": 11.105263157894736, "grad_norm": 2.466234946041368e-06, "learning_rate": 0.00017795789473684212, "logits/chosen": 13.140680313110352, "logits/rejected": 13.140680313110352, "logps/chosen": -3999.33203125, "logps/rejected": -3999.33203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.10589599609375, "rewards/margins": 0.0, "rewards/rejected": -397.10589599609375, "step": 1055 }, { "epoch": 11.115789473684211, "grad_norm": 2.3248396701092133e-06, "learning_rate": 0.00017793684210526316, "logits/chosen": 13.14515495300293, "logits/rejected": 13.14515495300293, "logps/chosen": -3758.888671875, "logps/rejected": -3758.888671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.9705505371094, "rewards/margins": 0.0, "rewards/rejected": -372.9705505371094, "step": 1056 }, { "epoch": 11.126315789473685, "grad_norm": 1.3584361795437871e-06, "learning_rate": 0.00017791578947368421, "logits/chosen": 13.119670867919922, "logits/rejected": 13.119670867919922, "logps/chosen": -2673.177734375, "logps/rejected": -2673.177734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.53619384765625, "rewards/margins": 0.0, "rewards/rejected": -264.53619384765625, "step": 1057 }, { "epoch": 11.136842105263158, "grad_norm": 8.700322950971895e-07, "learning_rate": 0.00017789473684210526, "logits/chosen": 13.124737739562988, "logits/rejected": 13.124737739562988, "logps/chosen": -2967.685546875, "logps/rejected": -2967.685546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.0423583984375, "rewards/margins": 0.0, "rewards/rejected": -294.0423583984375, "step": 1058 }, { "epoch": 11.147368421052631, "grad_norm": 1.8731444697550614e-06, "learning_rate": 0.0001778736842105263, "logits/chosen": 13.113309860229492, "logits/rejected": 13.113309860229492, "logps/chosen": -4000.09375, "logps/rejected": -4000.09375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.18206787109375, "rewards/margins": 0.0, "rewards/rejected": -397.18206787109375, "step": 1059 }, { "epoch": 11.157894736842104, "grad_norm": 2.1676491996913683e-06, "learning_rate": 0.0001778526315789474, "logits/chosen": 13.106008529663086, "logits/rejected": 13.106008529663086, "logps/chosen": -3778.25, "logps/rejected": -3778.25, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.9559020996094, "rewards/margins": 0.0, "rewards/rejected": -374.9559020996094, "step": 1060 }, { "epoch": 11.16842105263158, "grad_norm": 1.620020839254721e-06, "learning_rate": 0.00017783157894736844, "logits/chosen": 13.110352516174316, "logits/rejected": 13.110352516174316, "logps/chosen": -3759.298828125, "logps/rejected": -3759.298828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -373.0115661621094, "rewards/margins": 0.0, "rewards/rejected": -373.0115661621094, "step": 1061 }, { "epoch": 11.178947368421053, "grad_norm": 8.484549880449777e-07, "learning_rate": 0.00017781052631578946, "logits/chosen": 13.100408554077148, "logits/rejected": 13.100408554077148, "logps/chosen": -2967.64453125, "logps/rejected": -2967.64453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.03826904296875, "rewards/margins": 0.0, "rewards/rejected": -294.03826904296875, "step": 1062 }, { "epoch": 11.189473684210526, "grad_norm": 1.1153858849866083e-06, "learning_rate": 0.00017778947368421054, "logits/chosen": 13.091215133666992, "logits/rejected": 13.091215133666992, "logps/chosen": -4000.166015625, "logps/rejected": -4000.166015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1893005371094, "rewards/margins": 0.0, "rewards/rejected": -397.1893005371094, "step": 1063 }, { "epoch": 11.2, "grad_norm": 2.8591518912435276e-06, "learning_rate": 0.00017776842105263159, "logits/chosen": 13.15332317352295, "logits/rejected": 13.15332317352295, "logps/chosen": -5172.298828125, "logps/rejected": -5172.298828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.2914428710938, "rewards/margins": 0.0, "rewards/rejected": -514.2914428710938, "step": 1064 }, { "epoch": 11.210526315789474, "grad_norm": 1.5865151681282441e-06, "learning_rate": 0.00017774736842105264, "logits/chosen": 13.13739013671875, "logits/rejected": 13.13739013671875, "logps/chosen": -4881.796875, "logps/rejected": -4881.796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.33251953125, "rewards/margins": 0.0, "rewards/rejected": -485.33251953125, "step": 1065 }, { "epoch": 11.221052631578948, "grad_norm": 1.0422529612696962e-06, "learning_rate": 0.00017772631578947368, "logits/chosen": 13.078730583190918, "logits/rejected": 13.078730583190918, "logps/chosen": -4000.498046875, "logps/rejected": -4000.498046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.2225036621094, "rewards/margins": 0.0, "rewards/rejected": -397.2225036621094, "step": 1066 }, { "epoch": 11.23157894736842, "grad_norm": 1.81129894372134e-06, "learning_rate": 0.00017770526315789476, "logits/chosen": 13.143586158752441, "logits/rejected": 13.143586158752441, "logps/chosen": -5172.357421875, "logps/rejected": -5172.357421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.2973022460938, "rewards/margins": 0.0, "rewards/rejected": -514.2973022460938, "step": 1067 }, { "epoch": 11.242105263157894, "grad_norm": 1.194509877677774e-06, "learning_rate": 0.0001776842105263158, "logits/chosen": 13.130171775817871, "logits/rejected": 13.130171775817871, "logps/chosen": -4882.02392578125, "logps/rejected": -4882.02392578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.355224609375, "rewards/margins": 0.0, "rewards/rejected": -485.355224609375, "step": 1068 }, { "epoch": 11.25263157894737, "grad_norm": 1.1006327440554742e-06, "learning_rate": 0.00017766315789473683, "logits/chosen": 13.133552551269531, "logits/rejected": 13.133552551269531, "logps/chosen": -4882.29736328125, "logps/rejected": -4882.29736328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.382568359375, "rewards/margins": 0.0, "rewards/rejected": -485.382568359375, "step": 1069 }, { "epoch": 11.263157894736842, "grad_norm": 1.4036988886800827e-06, "learning_rate": 0.0001776421052631579, "logits/chosen": 13.090828895568848, "logits/rejected": 13.090828895568848, "logps/chosen": -4289.0556640625, "logps/rejected": -4289.0556640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.10821533203125, "rewards/margins": 0.0, "rewards/rejected": -426.10821533203125, "step": 1070 }, { "epoch": 11.273684210526316, "grad_norm": 1.9695432911248645e-06, "learning_rate": 0.00017762105263157896, "logits/chosen": 13.156347274780273, "logits/rejected": 13.156347274780273, "logps/chosen": -5173.0517578125, "logps/rejected": -5173.0517578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.3667602539062, "rewards/margins": 0.0, "rewards/rejected": -514.3667602539062, "step": 1071 }, { "epoch": 11.284210526315789, "grad_norm": 1.204107661578746e-06, "learning_rate": 0.0001776, "logits/chosen": 13.08472728729248, "logits/rejected": 13.08472728729248, "logps/chosen": -2672.2265625, "logps/rejected": -2672.2265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.4410705566406, "rewards/margins": 0.0, "rewards/rejected": -264.4410705566406, "step": 1072 }, { "epoch": 11.294736842105262, "grad_norm": 1.1482010222607641e-06, "learning_rate": 0.00017757894736842106, "logits/chosen": 13.156635284423828, "logits/rejected": 13.156635284423828, "logps/chosen": -4882.67236328125, "logps/rejected": -4882.67236328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.4200744628906, "rewards/margins": 0.0, "rewards/rejected": -485.4200744628906, "step": 1073 }, { "epoch": 11.305263157894737, "grad_norm": 1.6549596466575167e-06, "learning_rate": 0.00017755789473684213, "logits/chosen": 13.176777839660645, "logits/rejected": 13.176777839660645, "logps/chosen": -5173.92041015625, "logps/rejected": -5173.92041015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.45361328125, "rewards/margins": 0.0, "rewards/rejected": -514.45361328125, "step": 1074 }, { "epoch": 11.31578947368421, "grad_norm": 9.766260973265162e-07, "learning_rate": 0.00017753684210526316, "logits/chosen": 13.126758575439453, "logits/rejected": 13.126758575439453, "logps/chosen": -3759.0625, "logps/rejected": -3759.0625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.9879455566406, "rewards/margins": 0.0, "rewards/rejected": -372.9879455566406, "step": 1075 }, { "epoch": 11.326315789473684, "grad_norm": 1.608597585800453e-06, "learning_rate": 0.0001775157894736842, "logits/chosen": 13.194517135620117, "logits/rejected": 13.194517135620117, "logps/chosen": -5175.11328125, "logps/rejected": -5175.11328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.5729370117188, "rewards/margins": 0.0, "rewards/rejected": -514.5729370117188, "step": 1076 }, { "epoch": 11.336842105263157, "grad_norm": 9.253205917048035e-07, "learning_rate": 0.00017749473684210528, "logits/chosen": 13.138322830200195, "logits/rejected": 13.138322830200195, "logps/chosen": -2966.755859375, "logps/rejected": -2966.755859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.94940185546875, "rewards/margins": 0.0, "rewards/rejected": -293.94940185546875, "step": 1077 }, { "epoch": 11.347368421052632, "grad_norm": 1.484432914367062e-06, "learning_rate": 0.00017747368421052633, "logits/chosen": 13.185938835144043, "logits/rejected": 13.185938835144043, "logps/chosen": -4322.9375, "logps/rejected": -4322.9375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.10888671875, "rewards/margins": 0.0, "rewards/rejected": -429.10888671875, "step": 1078 }, { "epoch": 11.357894736842105, "grad_norm": 1.1060932365580811e-06, "learning_rate": 0.00017745263157894738, "logits/chosen": 13.144386291503906, "logits/rejected": 13.144386291503906, "logps/chosen": -2671.609375, "logps/rejected": -2671.609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.3793640136719, "rewards/margins": 0.0, "rewards/rejected": -264.3793640136719, "step": 1079 }, { "epoch": 11.368421052631579, "grad_norm": 1.4104914498602739e-06, "learning_rate": 0.00017743157894736843, "logits/chosen": 13.233640670776367, "logits/rejected": 13.233640670776367, "logps/chosen": -5177.072265625, "logps/rejected": -5177.072265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.768798828125, "rewards/margins": 0.0, "rewards/rejected": -514.768798828125, "step": 1080 }, { "epoch": 11.378947368421052, "grad_norm": 1.144772681982431e-06, "learning_rate": 0.00017741052631578948, "logits/chosen": 13.230626106262207, "logits/rejected": 13.230626106262207, "logps/chosen": -4882.15625, "logps/rejected": -4882.15625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.36846923828125, "rewards/margins": 0.0, "rewards/rejected": -485.36846923828125, "step": 1081 }, { "epoch": 11.389473684210527, "grad_norm": 1.3507245739674545e-06, "learning_rate": 0.00017738947368421053, "logits/chosen": 13.240272521972656, "logits/rejected": 13.240272521972656, "logps/chosen": -4882.2900390625, "logps/rejected": -4882.2900390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.3818359375, "rewards/margins": 0.0, "rewards/rejected": -485.3818359375, "step": 1082 }, { "epoch": 11.4, "grad_norm": 1.499010068073403e-06, "learning_rate": 0.00017736842105263158, "logits/chosen": 13.262801170349121, "logits/rejected": 13.262801170349121, "logps/chosen": -5178.076171875, "logps/rejected": -5178.076171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.8692016601562, "rewards/margins": 0.0, "rewards/rejected": -514.8692016601562, "step": 1083 }, { "epoch": 11.410526315789474, "grad_norm": 1.2322376505835564e-06, "learning_rate": 0.00017734736842105265, "logits/chosen": 13.204524040222168, "logits/rejected": 13.204524040222168, "logps/chosen": -2966.556640625, "logps/rejected": -2966.556640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.9294738769531, "rewards/margins": 0.0, "rewards/rejected": -293.9294738769531, "step": 1084 }, { "epoch": 11.421052631578947, "grad_norm": 1.6729188700992381e-06, "learning_rate": 0.0001773263157894737, "logits/chosen": 13.210382461547852, "logits/rejected": 13.210382461547852, "logps/chosen": -3774.716796875, "logps/rejected": -3774.716796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.6025695800781, "rewards/margins": 0.0, "rewards/rejected": -374.6025695800781, "step": 1085 }, { "epoch": 11.431578947368422, "grad_norm": 1.41476141379826e-06, "learning_rate": 0.00017730526315789475, "logits/chosen": 13.279061317443848, "logits/rejected": 13.279061317443848, "logps/chosen": -4881.751953125, "logps/rejected": -4881.751953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.3280334472656, "rewards/margins": 0.0, "rewards/rejected": -485.3280334472656, "step": 1086 }, { "epoch": 11.442105263157895, "grad_norm": 1.3853941709385253e-06, "learning_rate": 0.0001772842105263158, "logits/chosen": 13.228163719177246, "logits/rejected": 13.228163719177246, "logps/chosen": -3995.802734375, "logps/rejected": -3995.802734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.7529602050781, "rewards/margins": 0.0, "rewards/rejected": -396.7529602050781, "step": 1087 }, { "epoch": 11.452631578947368, "grad_norm": 1.7602915249881335e-06, "learning_rate": 0.00017726315789473685, "logits/chosen": 13.295538902282715, "logits/rejected": 13.295538902282715, "logps/chosen": -4881.90380859375, "logps/rejected": -4881.90380859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.3432312011719, "rewards/margins": 0.0, "rewards/rejected": -485.3432312011719, "step": 1088 }, { "epoch": 11.463157894736842, "grad_norm": 1.343849703516753e-06, "learning_rate": 0.0001772421052631579, "logits/chosen": 13.286849975585938, "logits/rejected": 13.286849975585938, "logps/chosen": -4324.69921875, "logps/rejected": -4324.69921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.2850646972656, "rewards/margins": 0.0, "rewards/rejected": -429.2850646972656, "step": 1089 }, { "epoch": 11.473684210526315, "grad_norm": 1.4026127246324904e-06, "learning_rate": 0.00017722105263157895, "logits/chosen": 13.257658004760742, "logits/rejected": 13.257658004760742, "logps/chosen": -3541.0703125, "logps/rejected": -3541.0703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.0811462402344, "rewards/margins": 0.0, "rewards/rejected": -351.0811462402344, "step": 1090 }, { "epoch": 11.48421052631579, "grad_norm": 1.318843260378344e-06, "learning_rate": 0.0001772, "logits/chosen": 13.319183349609375, "logits/rejected": 13.319183349609375, "logps/chosen": -4882.43115234375, "logps/rejected": -4882.43115234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.3959655761719, "rewards/margins": 0.0, "rewards/rejected": -485.3959655761719, "step": 1091 }, { "epoch": 11.494736842105263, "grad_norm": 1.5861633073654957e-06, "learning_rate": 0.00017717894736842107, "logits/chosen": 13.263603210449219, "logits/rejected": 13.263603210449219, "logps/chosen": -3995.583984375, "logps/rejected": -3995.583984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.7310791015625, "rewards/margins": 0.0, "rewards/rejected": -396.7310791015625, "step": 1092 }, { "epoch": 11.505263157894737, "grad_norm": 1.3234729294708814e-06, "learning_rate": 0.00017715789473684212, "logits/chosen": 13.307832717895508, "logits/rejected": 13.307832717895508, "logps/chosen": -4325.40234375, "logps/rejected": -4325.40234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.3553771972656, "rewards/margins": 0.0, "rewards/rejected": -429.3553771972656, "step": 1093 }, { "epoch": 11.51578947368421, "grad_norm": 9.968987342290347e-07, "learning_rate": 0.00017713684210526315, "logits/chosen": 13.269745826721191, "logits/rejected": 13.269745826721191, "logps/chosen": -3541.666015625, "logps/rejected": -3541.666015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.1407165527344, "rewards/margins": 0.0, "rewards/rejected": -351.1407165527344, "step": 1094 }, { "epoch": 11.526315789473685, "grad_norm": 1.7469242266088258e-06, "learning_rate": 0.00017711578947368422, "logits/chosen": 13.322668075561523, "logits/rejected": 13.322668075561523, "logps/chosen": -4882.68701171875, "logps/rejected": -4882.68701171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.4215393066406, "rewards/margins": 0.0, "rewards/rejected": -485.4215393066406, "step": 1095 }, { "epoch": 11.536842105263158, "grad_norm": 9.031431886796781e-07, "learning_rate": 0.00017709473684210527, "logits/chosen": 13.272272109985352, "logits/rejected": 13.272272109985352, "logps/chosen": -3542.076171875, "logps/rejected": -3542.076171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.1817321777344, "rewards/margins": 0.0, "rewards/rejected": -351.1817321777344, "step": 1096 }, { "epoch": 11.547368421052632, "grad_norm": 8.914427667150449e-07, "learning_rate": 0.00017707368421052632, "logits/chosen": 13.276670455932617, "logits/rejected": 13.276670455932617, "logps/chosen": -3542.412109375, "logps/rejected": -3542.412109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.21533203125, "rewards/margins": 0.0, "rewards/rejected": -351.21533203125, "step": 1097 }, { "epoch": 11.557894736842105, "grad_norm": 8.729178375688207e-07, "learning_rate": 0.00017705263157894737, "logits/chosen": 13.282588958740234, "logits/rejected": 13.282588958740234, "logps/chosen": -3542.728515625, "logps/rejected": -3542.728515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.2469787597656, "rewards/margins": 0.0, "rewards/rejected": -351.2469787597656, "step": 1098 }, { "epoch": 11.568421052631578, "grad_norm": 1.7410309283150127e-06, "learning_rate": 0.00017703157894736845, "logits/chosen": 13.342294692993164, "logits/rejected": 13.342294692993164, "logps/chosen": -4882.4658203125, "logps/rejected": -4882.4658203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.3994140625, "rewards/margins": 0.0, "rewards/rejected": -485.3994140625, "step": 1099 }, { "epoch": 11.578947368421053, "grad_norm": 8.881889357326145e-07, "learning_rate": 0.00017701052631578947, "logits/chosen": 13.29837703704834, "logits/rejected": 13.29837703704834, "logps/chosen": -3543.39453125, "logps/rejected": -3543.39453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3135681152344, "rewards/margins": 0.0, "rewards/rejected": -351.3135681152344, "step": 1100 }, { "epoch": 11.578947368421053, "eval_logits/chosen": 13.338384628295898, "eval_logits/rejected": 13.338384628295898, "eval_logps/chosen": -4311.4453125, "eval_logps/rejected": -4311.4453125, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -428.2413635253906, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -428.2413635253906, "eval_runtime": 4.2993, "eval_samples_per_second": 2.326, "eval_steps_per_second": 2.326, "step": 1100 }, { "epoch": 11.589473684210526, "grad_norm": 1.3109354313201038e-06, "learning_rate": 0.00017698947368421052, "logits/chosen": 13.37167739868164, "logits/rejected": 13.37167739868164, "logps/chosen": -5178.341796875, "logps/rejected": -5178.341796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.895751953125, "rewards/margins": 0.0, "rewards/rejected": -514.895751953125, "step": 1101 }, { "epoch": 11.6, "grad_norm": 1.0846333680092357e-06, "learning_rate": 0.0001769684210526316, "logits/chosen": 13.314481735229492, "logits/rejected": 13.314481735229492, "logps/chosen": -3544.10546875, "logps/rejected": -3544.10546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3846740722656, "rewards/margins": 0.0, "rewards/rejected": -351.3846740722656, "step": 1102 }, { "epoch": 11.610526315789473, "grad_norm": 1.0255636198053253e-06, "learning_rate": 0.00017694736842105264, "logits/chosen": 13.322269439697266, "logits/rejected": 13.322269439697266, "logps/chosen": -3544.44921875, "logps/rejected": -3544.44921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.4190368652344, "rewards/margins": 0.0, "rewards/rejected": -351.4190368652344, "step": 1103 }, { "epoch": 11.621052631578948, "grad_norm": 1.0394411447123275e-06, "learning_rate": 0.0001769263157894737, "logits/chosen": 13.313828468322754, "logits/rejected": 13.313828468322754, "logps/chosen": -2671.3203125, "logps/rejected": -2671.3203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.3504333496094, "rewards/margins": 0.0, "rewards/rejected": -264.3504333496094, "step": 1104 }, { "epoch": 11.631578947368421, "grad_norm": 1.5713408174633514e-06, "learning_rate": 0.00017690526315789474, "logits/chosen": 13.326851844787598, "logits/rejected": 13.326851844787598, "logps/chosen": -3995.94921875, "logps/rejected": -3995.94921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.7676086425781, "rewards/margins": 0.0, "rewards/rejected": -396.7676086425781, "step": 1105 }, { "epoch": 11.642105263157895, "grad_norm": 2.2194963094079867e-06, "learning_rate": 0.00017688421052631582, "logits/chosen": 13.398720741271973, "logits/rejected": 13.398720741271973, "logps/chosen": -5178.267578125, "logps/rejected": -5178.267578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.8883666992188, "rewards/margins": 0.0, "rewards/rejected": -514.8883666992188, "step": 1106 }, { "epoch": 11.652631578947368, "grad_norm": 7.660092364858428e-07, "learning_rate": 0.00017686315789473684, "logits/chosen": 13.33483600616455, "logits/rejected": 13.33483600616455, "logps/chosen": -3545.88671875, "logps/rejected": -3545.88671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.56280517578125, "rewards/margins": 0.0, "rewards/rejected": -351.56280517578125, "step": 1107 }, { "epoch": 11.663157894736843, "grad_norm": 1.2355778835626552e-06, "learning_rate": 0.0001768421052631579, "logits/chosen": 13.369561195373535, "logits/rejected": 13.369561195373535, "logps/chosen": -4327.91796875, "logps/rejected": -4327.91796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.60693359375, "rewards/margins": 0.0, "rewards/rejected": -429.60693359375, "step": 1108 }, { "epoch": 11.673684210526316, "grad_norm": 1.5587136203976115e-06, "learning_rate": 0.00017682105263157897, "logits/chosen": 13.324060440063477, "logits/rejected": 13.324060440063477, "logps/chosen": -3995.958984375, "logps/rejected": -3995.958984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.7685852050781, "rewards/margins": 0.0, "rewards/rejected": -396.7685852050781, "step": 1109 }, { "epoch": 11.68421052631579, "grad_norm": 1.2366976989142131e-06, "learning_rate": 0.00017680000000000001, "logits/chosen": 13.309664726257324, "logits/rejected": 13.309664726257324, "logps/chosen": -2671.439453125, "logps/rejected": -2671.439453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.36236572265625, "rewards/margins": 0.0, "rewards/rejected": -264.36236572265625, "step": 1110 }, { "epoch": 11.694736842105263, "grad_norm": 1.6387425603170414e-06, "learning_rate": 0.00017677894736842106, "logits/chosen": 13.319222450256348, "logits/rejected": 13.319222450256348, "logps/chosen": -4284.74609375, "logps/rejected": -4284.74609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.67724609375, "rewards/margins": 0.0, "rewards/rejected": -425.67724609375, "step": 1111 }, { "epoch": 11.705263157894738, "grad_norm": 1.3411945474217646e-06, "learning_rate": 0.0001767578947368421, "logits/chosen": 13.297146797180176, "logits/rejected": 13.297146797180176, "logps/chosen": -3996.51953125, "logps/rejected": -3996.51953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.82464599609375, "rewards/margins": 0.0, "rewards/rejected": -396.82464599609375, "step": 1112 }, { "epoch": 11.715789473684211, "grad_norm": 8.575942729294184e-07, "learning_rate": 0.00017673684210526316, "logits/chosen": 13.289884567260742, "logits/rejected": 13.289884567260742, "logps/chosen": -2967.37109375, "logps/rejected": -2967.37109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.01092529296875, "rewards/margins": 0.0, "rewards/rejected": -294.01092529296875, "step": 1113 }, { "epoch": 11.726315789473684, "grad_norm": 8.83433528997557e-07, "learning_rate": 0.0001767157894736842, "logits/chosen": 13.260509490966797, "logits/rejected": 13.260509490966797, "logps/chosen": -2672.458984375, "logps/rejected": -2672.458984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.4643249511719, "rewards/margins": 0.0, "rewards/rejected": -264.4643249511719, "step": 1114 }, { "epoch": 11.736842105263158, "grad_norm": 8.83172560861567e-07, "learning_rate": 0.00017669473684210526, "logits/chosen": 13.247953414916992, "logits/rejected": 13.247953414916992, "logps/chosen": -2672.71875, "logps/rejected": -2672.71875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.49029541015625, "rewards/margins": 0.0, "rewards/rejected": -264.49029541015625, "step": 1115 }, { "epoch": 11.74736842105263, "grad_norm": 1.8588118564366596e-06, "learning_rate": 0.00017667368421052634, "logits/chosen": 13.254203796386719, "logits/rejected": 13.254203796386719, "logps/chosen": -3756.74609375, "logps/rejected": -3756.74609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.7563171386719, "rewards/margins": 0.0, "rewards/rejected": -372.7563171386719, "step": 1116 }, { "epoch": 11.757894736842106, "grad_norm": 1.3917166370447376e-06, "learning_rate": 0.0001766526315789474, "logits/chosen": 13.24231243133545, "logits/rejected": 13.24231243133545, "logps/chosen": -3756.95703125, "logps/rejected": -3756.95703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.77740478515625, "rewards/margins": 0.0, "rewards/rejected": -372.77740478515625, "step": 1117 }, { "epoch": 11.76842105263158, "grad_norm": 1.5146015357458964e-06, "learning_rate": 0.00017663157894736844, "logits/chosen": 13.291352272033691, "logits/rejected": 13.291352272033691, "logps/chosen": -5176.9404296875, "logps/rejected": -5176.9404296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.755615234375, "rewards/margins": 0.0, "rewards/rejected": -514.755615234375, "step": 1118 }, { "epoch": 11.778947368421052, "grad_norm": 1.2394157238304615e-06, "learning_rate": 0.00017661052631578949, "logits/chosen": 13.221146583557129, "logits/rejected": 13.221146583557129, "logps/chosen": -3546.5751953125, "logps/rejected": -3546.5751953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.63165283203125, "rewards/margins": 0.0, "rewards/rejected": -351.63165283203125, "step": 1119 }, { "epoch": 11.789473684210526, "grad_norm": 1.8103849015460582e-06, "learning_rate": 0.00017658947368421053, "logits/chosen": 13.210124969482422, "logits/rejected": 13.210124969482422, "logps/chosen": -3775.94140625, "logps/rejected": -3775.94140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.72503662109375, "rewards/margins": 0.0, "rewards/rejected": -374.72503662109375, "step": 1120 }, { "epoch": 11.8, "grad_norm": 1.1883638535437058e-06, "learning_rate": 0.00017656842105263158, "logits/chosen": 13.2149076461792, "logits/rejected": 13.2149076461792, "logps/chosen": -3546.697265625, "logps/rejected": -3546.697265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.64385986328125, "rewards/margins": 0.0, "rewards/rejected": -351.64385986328125, "step": 1121 }, { "epoch": 11.810526315789474, "grad_norm": 1.2033098073516157e-06, "learning_rate": 0.00017654736842105263, "logits/chosen": 13.205141067504883, "logits/rejected": 13.205141067504883, "logps/chosen": -3997.81640625, "logps/rejected": -3997.81640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9543151855469, "rewards/margins": 0.0, "rewards/rejected": -396.9543151855469, "step": 1122 }, { "epoch": 11.821052631578947, "grad_norm": 1.589643829902343e-06, "learning_rate": 0.00017652631578947368, "logits/chosen": 13.206165313720703, "logits/rejected": 13.206165313720703, "logps/chosen": -3776.43359375, "logps/rejected": -3776.43359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.7742614746094, "rewards/margins": 0.0, "rewards/rejected": -374.7742614746094, "step": 1123 }, { "epoch": 11.83157894736842, "grad_norm": 8.367911163986719e-07, "learning_rate": 0.00017650526315789476, "logits/chosen": 13.212361335754395, "logits/rejected": 13.212361335754395, "logps/chosen": -3547.021484375, "logps/rejected": -3547.021484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.67626953125, "rewards/margins": 0.0, "rewards/rejected": -351.67626953125, "step": 1124 }, { "epoch": 11.842105263157894, "grad_norm": 1.5414308336403337e-06, "learning_rate": 0.0001764842105263158, "logits/chosen": 13.205819129943848, "logits/rejected": 13.205819129943848, "logps/chosen": -3777.16015625, "logps/rejected": -3777.16015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.8468933105469, "rewards/margins": 0.0, "rewards/rejected": -374.8468933105469, "step": 1125 }, { "epoch": 11.852631578947369, "grad_norm": 1.4040676887816517e-06, "learning_rate": 0.00017646315789473683, "logits/chosen": 13.207777976989746, "logits/rejected": 13.207777976989746, "logps/chosen": -3777.712890625, "logps/rejected": -3777.712890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.9021911621094, "rewards/margins": 0.0, "rewards/rejected": -374.9021911621094, "step": 1126 }, { "epoch": 11.863157894736842, "grad_norm": 1.2888542642031098e-06, "learning_rate": 0.0001764421052631579, "logits/chosen": 13.249421119689941, "logits/rejected": 13.249421119689941, "logps/chosen": -4329.033203125, "logps/rejected": -4329.033203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.7184753417969, "rewards/margins": 0.0, "rewards/rejected": -429.7184753417969, "step": 1127 }, { "epoch": 11.873684210526315, "grad_norm": 1.69546319739311e-06, "learning_rate": 0.00017642105263157896, "logits/chosen": 13.215753555297852, "logits/rejected": 13.215753555297852, "logps/chosen": -4287.22314453125, "logps/rejected": -4287.22314453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.9249572753906, "rewards/margins": 0.0, "rewards/rejected": -425.9249572753906, "step": 1128 }, { "epoch": 11.884210526315789, "grad_norm": 1.188335545521113e-06, "learning_rate": 0.0001764, "logits/chosen": 13.210471153259277, "logits/rejected": 13.210471153259277, "logps/chosen": -2968.7529296875, "logps/rejected": -2968.7529296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.14910888671875, "rewards/margins": 0.0, "rewards/rejected": -294.14910888671875, "step": 1129 }, { "epoch": 11.894736842105264, "grad_norm": 1.3131199239069247e-06, "learning_rate": 0.00017637894736842105, "logits/chosen": 13.204744338989258, "logits/rejected": 13.204744338989258, "logps/chosen": -3779.94140625, "logps/rejected": -3779.94140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -375.1250305175781, "rewards/margins": 0.0, "rewards/rejected": -375.1250305175781, "step": 1130 }, { "epoch": 11.905263157894737, "grad_norm": 7.731790105935943e-07, "learning_rate": 0.00017635789473684213, "logits/chosen": 13.209800720214844, "logits/rejected": 13.209800720214844, "logps/chosen": -3546.8916015625, "logps/rejected": -3546.8916015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.6632995605469, "rewards/margins": 0.0, "rewards/rejected": -351.6632995605469, "step": 1131 }, { "epoch": 11.91578947368421, "grad_norm": 1.980549313884694e-06, "learning_rate": 0.00017633684210526315, "logits/chosen": 13.270223617553711, "logits/rejected": 13.270223617553711, "logps/chosen": -5173.8330078125, "logps/rejected": -5173.8330078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.4448852539062, "rewards/margins": 0.0, "rewards/rejected": -514.4448852539062, "step": 1132 }, { "epoch": 11.926315789473684, "grad_norm": 1.5226237337628845e-06, "learning_rate": 0.0001763157894736842, "logits/chosen": 13.201573371887207, "logits/rejected": 13.201573371887207, "logps/chosen": -3996.80859375, "logps/rejected": -3996.80859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.8535461425781, "rewards/margins": 0.0, "rewards/rejected": -396.8535461425781, "step": 1133 }, { "epoch": 11.936842105263159, "grad_norm": 1.4161789749778109e-06, "learning_rate": 0.00017629473684210528, "logits/chosen": 13.21018123626709, "logits/rejected": 13.21018123626709, "logps/chosen": -4288.837890625, "logps/rejected": -4288.837890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.08642578125, "rewards/margins": 0.0, "rewards/rejected": -426.08642578125, "step": 1134 }, { "epoch": 11.947368421052632, "grad_norm": 2.034541466855444e-06, "learning_rate": 0.00017627368421052633, "logits/chosen": 13.268797874450684, "logits/rejected": 13.268797874450684, "logps/chosen": -5173.50341796875, "logps/rejected": -5173.50341796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.4119262695312, "rewards/margins": 0.0, "rewards/rejected": -514.4119262695312, "step": 1135 }, { "epoch": 11.957894736842105, "grad_norm": 1.365693606203422e-06, "learning_rate": 0.00017625263157894738, "logits/chosen": 13.193127632141113, "logits/rejected": 13.193127632141113, "logps/chosen": -3996.896484375, "logps/rejected": -3996.896484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.8623352050781, "rewards/margins": 0.0, "rewards/rejected": -396.8623352050781, "step": 1136 }, { "epoch": 11.968421052631578, "grad_norm": 1.0118862974195508e-06, "learning_rate": 0.00017623157894736843, "logits/chosen": 13.196903228759766, "logits/rejected": 13.196903228759766, "logps/chosen": -3546.19921875, "logps/rejected": -3546.19921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.59405517578125, "rewards/margins": 0.0, "rewards/rejected": -351.59405517578125, "step": 1137 }, { "epoch": 11.978947368421053, "grad_norm": 9.494154937783605e-07, "learning_rate": 0.0001762105263157895, "logits/chosen": 13.174653053283691, "logits/rejected": 13.174653053283691, "logps/chosen": -2675.28173828125, "logps/rejected": -2675.28173828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.74658203125, "rewards/margins": 0.0, "rewards/rejected": -264.74658203125, "step": 1138 }, { "epoch": 11.989473684210527, "grad_norm": 8.6102767227203e-07, "learning_rate": 0.00017618947368421052, "logits/chosen": 13.169896125793457, "logits/rejected": 13.169896125793457, "logps/chosen": -2675.30322265625, "logps/rejected": -2675.30322265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.7487487792969, "rewards/margins": 0.0, "rewards/rejected": -264.7487487792969, "step": 1139 }, { "epoch": 12.0, "grad_norm": 1.532814735583088e-06, "learning_rate": 0.00017616842105263157, "logits/chosen": 13.172043800354004, "logits/rejected": 13.172043800354004, "logps/chosen": -3997.353515625, "logps/rejected": -3997.353515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9080505371094, "rewards/margins": 0.0, "rewards/rejected": -396.9080505371094, "step": 1140 }, { "epoch": 12.010526315789473, "grad_norm": 2.001610027946299e-06, "learning_rate": 0.00017614736842105265, "logits/chosen": 13.223211288452148, "logits/rejected": 13.223211288452148, "logps/chosen": -4875.69970703125, "logps/rejected": -4875.69970703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.7228088378906, "rewards/margins": 0.0, "rewards/rejected": -484.7228088378906, "step": 1141 }, { "epoch": 12.021052631578947, "grad_norm": 1.4257593647926114e-06, "learning_rate": 0.0001761263157894737, "logits/chosen": 13.205347061157227, "logits/rejected": 13.205347061157227, "logps/chosen": -4328.4833984375, "logps/rejected": -4328.4833984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.6634826660156, "rewards/margins": 0.0, "rewards/rejected": -429.6634826660156, "step": 1142 }, { "epoch": 12.031578947368422, "grad_norm": 1.1494424825286842e-06, "learning_rate": 0.00017610526315789475, "logits/chosen": 13.158509254455566, "logits/rejected": 13.158509254455566, "logps/chosen": -3997.775390625, "logps/rejected": -3997.775390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9502258300781, "rewards/margins": 0.0, "rewards/rejected": -396.9502258300781, "step": 1143 }, { "epoch": 12.042105263157895, "grad_norm": 2.3392249204334803e-06, "learning_rate": 0.0001760842105263158, "logits/chosen": 13.226460456848145, "logits/rejected": 13.226460456848145, "logps/chosen": -5173.70361328125, "logps/rejected": -5173.70361328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.4319458007812, "rewards/margins": 0.0, "rewards/rejected": -514.4319458007812, "step": 1144 }, { "epoch": 12.052631578947368, "grad_norm": 9.74021077126963e-07, "learning_rate": 0.00017606315789473685, "logits/chosen": 13.162618637084961, "logits/rejected": 13.162618637084961, "logps/chosen": -3545.4638671875, "logps/rejected": -3545.4638671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.5205078125, "rewards/margins": 0.0, "rewards/rejected": -351.5205078125, "step": 1145 }, { "epoch": 12.063157894736841, "grad_norm": 8.510671705153072e-07, "learning_rate": 0.0001760421052631579, "logits/chosen": 13.16342830657959, "logits/rejected": 13.16342830657959, "logps/chosen": -3545.2705078125, "logps/rejected": -3545.2705078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.5011901855469, "rewards/margins": 0.0, "rewards/rejected": -351.5011901855469, "step": 1146 }, { "epoch": 12.073684210526316, "grad_norm": 8.024080671020783e-07, "learning_rate": 0.00017602105263157895, "logits/chosen": 13.149815559387207, "logits/rejected": 13.149815559387207, "logps/chosen": -2675.3203125, "logps/rejected": -2675.3203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.7504577636719, "rewards/margins": 0.0, "rewards/rejected": -264.7504577636719, "step": 1147 }, { "epoch": 12.08421052631579, "grad_norm": 1.6834076177474344e-06, "learning_rate": 0.00017600000000000002, "logits/chosen": 13.15926456451416, "logits/rejected": 13.15926456451416, "logps/chosen": -3998.701171875, "logps/rejected": -3998.701171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0428161621094, "rewards/margins": 0.0, "rewards/rejected": -397.0428161621094, "step": 1148 }, { "epoch": 12.094736842105263, "grad_norm": 1.5382627225335455e-06, "learning_rate": 0.00017597894736842107, "logits/chosen": 13.157814979553223, "logits/rejected": 13.157814979553223, "logps/chosen": -3998.921875, "logps/rejected": -3998.921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.06488037109375, "rewards/margins": 0.0, "rewards/rejected": -397.06488037109375, "step": 1149 }, { "epoch": 12.105263157894736, "grad_norm": 1.5462560440937523e-06, "learning_rate": 0.00017595789473684212, "logits/chosen": 13.224363327026367, "logits/rejected": 13.224363327026367, "logps/chosen": -5174.9169921875, "logps/rejected": -5174.9169921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.5532836914062, "rewards/margins": 0.0, "rewards/rejected": -514.5532836914062, "step": 1150 }, { "epoch": 12.105263157894736, "eval_logits/chosen": 13.188931465148926, "eval_logits/rejected": 13.188931465148926, "eval_logps/chosen": -4310.96630859375, "eval_logps/rejected": -4310.96630859375, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -428.1934509277344, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -428.1934509277344, "eval_runtime": 4.4204, "eval_samples_per_second": 2.262, "eval_steps_per_second": 2.262, "step": 1150 }, { "epoch": 12.115789473684211, "grad_norm": 1.0568624020379502e-06, "learning_rate": 0.00017593684210526314, "logits/chosen": 13.157355308532715, "logits/rejected": 13.157355308532715, "logps/chosen": -2968.5068359375, "logps/rejected": -2968.5068359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.1244812011719, "rewards/margins": 0.0, "rewards/rejected": -294.1244812011719, "step": 1151 }, { "epoch": 12.126315789473685, "grad_norm": 1.1804228279288509e-06, "learning_rate": 0.00017591578947368422, "logits/chosen": 13.154706001281738, "logits/rejected": 13.154706001281738, "logps/chosen": -2968.431640625, "logps/rejected": -2968.431640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.1169738769531, "rewards/margins": 0.0, "rewards/rejected": -294.1169738769531, "step": 1152 }, { "epoch": 12.136842105263158, "grad_norm": 1.3160945400159108e-06, "learning_rate": 0.00017589473684210527, "logits/chosen": 13.190837860107422, "logits/rejected": 13.190837860107422, "logps/chosen": -4328.025390625, "logps/rejected": -4328.025390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.61767578125, "rewards/margins": 0.0, "rewards/rejected": -429.61767578125, "step": 1153 }, { "epoch": 12.147368421052631, "grad_norm": 9.006783443510358e-07, "learning_rate": 0.00017587368421052632, "logits/chosen": 13.138744354248047, "logits/rejected": 13.138744354248047, "logps/chosen": -2675.0927734375, "logps/rejected": -2675.0927734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.7276916503906, "rewards/margins": 0.0, "rewards/rejected": -264.7276916503906, "step": 1154 }, { "epoch": 12.157894736842104, "grad_norm": 1.6588943481110618e-06, "learning_rate": 0.00017585263157894737, "logits/chosen": 13.204049110412598, "logits/rejected": 13.204049110412598, "logps/chosen": -4877.3251953125, "logps/rejected": -4877.3251953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.8853454589844, "rewards/margins": 0.0, "rewards/rejected": -484.8853454589844, "step": 1155 }, { "epoch": 12.16842105263158, "grad_norm": 2.179196371798753e-06, "learning_rate": 0.00017583157894736844, "logits/chosen": 13.219549179077148, "logits/rejected": 13.219549179077148, "logps/chosen": -5175.49853515625, "logps/rejected": -5175.49853515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6114501953125, "rewards/margins": 0.0, "rewards/rejected": -514.6114501953125, "step": 1156 }, { "epoch": 12.178947368421053, "grad_norm": 8.472194394926191e-07, "learning_rate": 0.0001758105263157895, "logits/chosen": 13.167341232299805, "logits/rejected": 13.167341232299805, "logps/chosen": -3545.0927734375, "logps/rejected": -3545.0927734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.4833984375, "rewards/margins": 0.0, "rewards/rejected": -351.4833984375, "step": 1157 }, { "epoch": 12.189473684210526, "grad_norm": 8.551896257813496e-07, "learning_rate": 0.00017578947368421052, "logits/chosen": 13.175485610961914, "logits/rejected": 13.175485610961914, "logps/chosen": -2968.9130859375, "logps/rejected": -2968.9130859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.1651306152344, "rewards/margins": 0.0, "rewards/rejected": -294.1651306152344, "step": 1158 }, { "epoch": 12.2, "grad_norm": 1.011404378914449e-06, "learning_rate": 0.0001757684210526316, "logits/chosen": 13.16876220703125, "logits/rejected": 13.16876220703125, "logps/chosen": -2675.162109375, "logps/rejected": -2675.162109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.734619140625, "rewards/margins": 0.0, "rewards/rejected": -264.734619140625, "step": 1159 }, { "epoch": 12.210526315789474, "grad_norm": 9.106698257710377e-07, "learning_rate": 0.00017574736842105264, "logits/chosen": 13.194976806640625, "logits/rejected": 13.194976806640625, "logps/chosen": -3545.2578125, "logps/rejected": -3545.2578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.4999084472656, "rewards/margins": 0.0, "rewards/rejected": -351.4999084472656, "step": 1160 }, { "epoch": 12.221052631578948, "grad_norm": 1.2417212928994559e-06, "learning_rate": 0.0001757263157894737, "logits/chosen": 13.19233226776123, "logits/rejected": 13.19233226776123, "logps/chosen": -3998.916015625, "logps/rejected": -3998.916015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0643005371094, "rewards/margins": 0.0, "rewards/rejected": -397.0643005371094, "step": 1161 }, { "epoch": 12.23157894736842, "grad_norm": 1.2281788031032193e-06, "learning_rate": 0.00017570526315789474, "logits/chosen": 13.196739196777344, "logits/rejected": 13.196739196777344, "logps/chosen": -3998.84765625, "logps/rejected": -3998.84765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0574645996094, "rewards/margins": 0.0, "rewards/rejected": -397.0574645996094, "step": 1162 }, { "epoch": 12.242105263157894, "grad_norm": 7.93373601482017e-07, "learning_rate": 0.00017568421052631582, "logits/chosen": 13.208879470825195, "logits/rejected": 13.208879470825195, "logps/chosen": -3545.5859375, "logps/rejected": -3545.5859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.53271484375, "rewards/margins": 0.0, "rewards/rejected": -351.53271484375, "step": 1163 }, { "epoch": 12.25263157894737, "grad_norm": 1.1000822723872261e-06, "learning_rate": 0.00017566315789473684, "logits/chosen": 13.210315704345703, "logits/rejected": 13.210315704345703, "logps/chosen": -3757.365234375, "logps/rejected": -3757.365234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.8182067871094, "rewards/margins": 0.0, "rewards/rejected": -372.8182067871094, "step": 1164 }, { "epoch": 12.263157894736842, "grad_norm": 1.4750858099432662e-06, "learning_rate": 0.0001756421052631579, "logits/chosen": 13.244125366210938, "logits/rejected": 13.244125366210938, "logps/chosen": -4328.5107421875, "logps/rejected": -4328.5107421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.6662292480469, "rewards/margins": 0.0, "rewards/rejected": -429.6662292480469, "step": 1165 }, { "epoch": 12.273684210526316, "grad_norm": 7.83409063842555e-07, "learning_rate": 0.00017562105263157896, "logits/chosen": 13.211919784545898, "logits/rejected": 13.211919784545898, "logps/chosen": -3545.8154296875, "logps/rejected": -3545.8154296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.5556640625, "rewards/margins": 0.0, "rewards/rejected": -351.5556640625, "step": 1166 }, { "epoch": 12.284210526315789, "grad_norm": 8.064207008828816e-07, "learning_rate": 0.0001756, "logits/chosen": 13.212503433227539, "logits/rejected": 13.212503433227539, "logps/chosen": -2969.1298828125, "logps/rejected": -2969.1298828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.1867980957031, "rewards/margins": 0.0, "rewards/rejected": -294.1867980957031, "step": 1167 }, { "epoch": 12.294736842105262, "grad_norm": 7.703820301685482e-07, "learning_rate": 0.00017557894736842106, "logits/chosen": 13.216180801391602, "logits/rejected": 13.216180801391602, "logps/chosen": -3546.19140625, "logps/rejected": -3546.19140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.59326171875, "rewards/margins": 0.0, "rewards/rejected": -351.59326171875, "step": 1168 }, { "epoch": 12.305263157894737, "grad_norm": 1.4685492715216242e-06, "learning_rate": 0.0001755578947368421, "logits/chosen": 13.211734771728516, "logits/rejected": 13.211734771728516, "logps/chosen": -3776.330078125, "logps/rejected": -3776.330078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.7638854980469, "rewards/margins": 0.0, "rewards/rejected": -374.7638854980469, "step": 1169 }, { "epoch": 12.31578947368421, "grad_norm": 1.6122794477269053e-06, "learning_rate": 0.0001755368421052632, "logits/chosen": 13.268331527709961, "logits/rejected": 13.268331527709961, "logps/chosen": -4876.88037109375, "logps/rejected": -4876.88037109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.84088134765625, "rewards/margins": 0.0, "rewards/rejected": -484.84088134765625, "step": 1170 }, { "epoch": 12.326315789473684, "grad_norm": 1.5342782262450783e-06, "learning_rate": 0.0001755157894736842, "logits/chosen": 13.269502639770508, "logits/rejected": 13.269502639770508, "logps/chosen": -4877.0869140625, "logps/rejected": -4877.0869140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.8615417480469, "rewards/margins": 0.0, "rewards/rejected": -484.8615417480469, "step": 1171 }, { "epoch": 12.336842105263157, "grad_norm": 9.192314678330149e-07, "learning_rate": 0.00017549473684210526, "logits/chosen": 13.204910278320312, "logits/rejected": 13.204910278320312, "logps/chosen": -2674.8876953125, "logps/rejected": -2674.8876953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.7071838378906, "rewards/margins": 0.0, "rewards/rejected": -264.7071838378906, "step": 1172 }, { "epoch": 12.347368421052632, "grad_norm": 1.0172065003644093e-06, "learning_rate": 0.00017547368421052634, "logits/chosen": 13.221443176269531, "logits/rejected": 13.221443176269531, "logps/chosen": -3758.3583984375, "logps/rejected": -3758.3583984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.91754150390625, "rewards/margins": 0.0, "rewards/rejected": -372.91754150390625, "step": 1173 }, { "epoch": 12.357894736842105, "grad_norm": 7.945783977447718e-07, "learning_rate": 0.00017545263157894738, "logits/chosen": 13.219433784484863, "logits/rejected": 13.219433784484863, "logps/chosen": -3546.3076171875, "logps/rejected": -3546.3076171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.6048889160156, "rewards/margins": 0.0, "rewards/rejected": -351.6048889160156, "step": 1174 }, { "epoch": 12.368421052631579, "grad_norm": 1.0575088253972353e-06, "learning_rate": 0.00017543157894736843, "logits/chosen": 13.218424797058105, "logits/rejected": 13.218424797058105, "logps/chosen": -3758.75390625, "logps/rejected": -3758.75390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.95709228515625, "rewards/margins": 0.0, "rewards/rejected": -372.95709228515625, "step": 1175 }, { "epoch": 12.378947368421052, "grad_norm": 8.342209980582993e-07, "learning_rate": 0.00017541052631578948, "logits/chosen": 13.199202537536621, "logits/rejected": 13.199202537536621, "logps/chosen": -2674.888671875, "logps/rejected": -2674.888671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.707275390625, "rewards/margins": 0.0, "rewards/rejected": -264.707275390625, "step": 1176 }, { "epoch": 12.389473684210527, "grad_norm": 1.578581191097328e-06, "learning_rate": 0.00017538947368421053, "logits/chosen": 13.214439392089844, "logits/rejected": 13.214439392089844, "logps/chosen": -4287.619140625, "logps/rejected": -4287.619140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.9645690917969, "rewards/margins": 0.0, "rewards/rejected": -425.9645690917969, "step": 1177 }, { "epoch": 12.4, "grad_norm": 1.4678345223728684e-06, "learning_rate": 0.00017536842105263158, "logits/chosen": 13.2723388671875, "logits/rejected": 13.2723388671875, "logps/chosen": -5175.3310546875, "logps/rejected": -5175.3310546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.5946655273438, "rewards/margins": 0.0, "rewards/rejected": -514.5946655273438, "step": 1178 }, { "epoch": 12.410526315789474, "grad_norm": 1.457522216696816e-06, "learning_rate": 0.00017534736842105263, "logits/chosen": 13.203184127807617, "logits/rejected": 13.203184127807617, "logps/chosen": -3777.3095703125, "logps/rejected": -3777.3095703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.8618469238281, "rewards/margins": 0.0, "rewards/rejected": -374.8618469238281, "step": 1179 }, { "epoch": 12.421052631578947, "grad_norm": 1.8166482504966552e-06, "learning_rate": 0.0001753263157894737, "logits/chosen": 13.256892204284668, "logits/rejected": 13.256892204284668, "logps/chosen": -4878.2294921875, "logps/rejected": -4878.2294921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.9757995605469, "rewards/margins": 0.0, "rewards/rejected": -484.9757995605469, "step": 1180 }, { "epoch": 12.431578947368422, "grad_norm": 1.1513600384205347e-06, "learning_rate": 0.00017530526315789476, "logits/chosen": 13.206685066223145, "logits/rejected": 13.206685066223145, "logps/chosen": -3759.626953125, "logps/rejected": -3759.626953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -373.0444030761719, "rewards/margins": 0.0, "rewards/rejected": -373.0444030761719, "step": 1181 }, { "epoch": 12.442105263157895, "grad_norm": 1.6320326494678739e-06, "learning_rate": 0.0001752842105263158, "logits/chosen": 13.262678146362305, "logits/rejected": 13.262678146362305, "logps/chosen": -5175.26953125, "logps/rejected": -5175.26953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.5885620117188, "rewards/margins": 0.0, "rewards/rejected": -514.5885620117188, "step": 1182 }, { "epoch": 12.452631578947368, "grad_norm": 1.380205389978073e-06, "learning_rate": 0.00017526315789473683, "logits/chosen": 13.19819164276123, "logits/rejected": 13.19819164276123, "logps/chosen": -4288.6201171875, "logps/rejected": -4288.6201171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.0646667480469, "rewards/margins": 0.0, "rewards/rejected": -426.0646667480469, "step": 1183 }, { "epoch": 12.463157894736842, "grad_norm": 1.4973530824136105e-06, "learning_rate": 0.0001752421052631579, "logits/chosen": 13.255668640136719, "logits/rejected": 13.255668640136719, "logps/chosen": -5175.7021484375, "logps/rejected": -5175.7021484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6317749023438, "rewards/margins": 0.0, "rewards/rejected": -514.6317749023438, "step": 1184 }, { "epoch": 12.473684210526315, "grad_norm": 8.26027758193959e-07, "learning_rate": 0.00017522105263157895, "logits/chosen": 13.19247817993164, "logits/rejected": 13.19247817993164, "logps/chosen": -3545.4443359375, "logps/rejected": -3545.4443359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.5185546875, "rewards/margins": 0.0, "rewards/rejected": -351.5185546875, "step": 1185 }, { "epoch": 12.48421052631579, "grad_norm": 8.63258264871547e-07, "learning_rate": 0.0001752, "logits/chosen": 13.19344711303711, "logits/rejected": 13.19344711303711, "logps/chosen": -3545.576171875, "logps/rejected": -3545.576171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.53173828125, "rewards/margins": 0.0, "rewards/rejected": -351.53173828125, "step": 1186 }, { "epoch": 12.494736842105263, "grad_norm": 8.619325626568752e-07, "learning_rate": 0.00017517894736842105, "logits/chosen": 13.19863510131836, "logits/rejected": 13.19863510131836, "logps/chosen": -3545.6513671875, "logps/rejected": -3545.6513671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.5392761230469, "rewards/margins": 0.0, "rewards/rejected": -351.5392761230469, "step": 1187 }, { "epoch": 12.505263157894737, "grad_norm": 1.373716258967761e-06, "learning_rate": 0.00017515789473684213, "logits/chosen": 13.255412101745605, "logits/rejected": 13.255412101745605, "logps/chosen": -4879.18701171875, "logps/rejected": -4879.18701171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.071533203125, "rewards/margins": 0.0, "rewards/rejected": -485.071533203125, "step": 1188 }, { "epoch": 12.51578947368421, "grad_norm": 1.444881149836874e-06, "learning_rate": 0.00017513684210526318, "logits/chosen": 13.263691902160645, "logits/rejected": 13.263691902160645, "logps/chosen": -4879.3876953125, "logps/rejected": -4879.3876953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.09161376953125, "rewards/margins": 0.0, "rewards/rejected": -485.09161376953125, "step": 1189 }, { "epoch": 12.526315789473685, "grad_norm": 1.5663051726733102e-06, "learning_rate": 0.0001751157894736842, "logits/chosen": 13.208550453186035, "logits/rejected": 13.208550453186035, "logps/chosen": -3995.5078125, "logps/rejected": -3995.5078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.7234802246094, "rewards/margins": 0.0, "rewards/rejected": -396.7234802246094, "step": 1190 }, { "epoch": 12.536842105263158, "grad_norm": 1.49425943618553e-06, "learning_rate": 0.00017509473684210528, "logits/chosen": 13.209768295288086, "logits/rejected": 13.209768295288086, "logps/chosen": -3995.65234375, "logps/rejected": -3995.65234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.7379150390625, "rewards/margins": 0.0, "rewards/rejected": -396.7379150390625, "step": 1191 }, { "epoch": 12.547368421052632, "grad_norm": 1.3053418115305249e-06, "learning_rate": 0.00017507368421052633, "logits/chosen": 13.21781063079834, "logits/rejected": 13.21781063079834, "logps/chosen": -4289.62890625, "logps/rejected": -4289.62890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.16552734375, "rewards/margins": 0.0, "rewards/rejected": -426.16552734375, "step": 1192 }, { "epoch": 12.557894736842105, "grad_norm": 1.270706889044959e-06, "learning_rate": 0.00017505263157894738, "logits/chosen": 13.199355125427246, "logits/rejected": 13.199355125427246, "logps/chosen": -3995.9765625, "logps/rejected": -3995.9765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.7703552246094, "rewards/margins": 0.0, "rewards/rejected": -396.7703552246094, "step": 1193 }, { "epoch": 12.568421052631578, "grad_norm": 1.5296182027668692e-06, "learning_rate": 0.00017503157894736842, "logits/chosen": 13.19531536102295, "logits/rejected": 13.19531536102295, "logps/chosen": -3777.271484375, "logps/rejected": -3777.271484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.8580322265625, "rewards/margins": 0.0, "rewards/rejected": -374.8580322265625, "step": 1194 }, { "epoch": 12.578947368421053, "grad_norm": 8.08208028502122e-07, "learning_rate": 0.0001750105263157895, "logits/chosen": 13.193829536437988, "logits/rejected": 13.193829536437988, "logps/chosen": -2968.283203125, "logps/rejected": -2968.283203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.1021423339844, "rewards/margins": 0.0, "rewards/rejected": -294.1021423339844, "step": 1195 }, { "epoch": 12.589473684210526, "grad_norm": 1.5503273971262388e-06, "learning_rate": 0.00017498947368421052, "logits/chosen": 13.250874519348145, "logits/rejected": 13.250874519348145, "logps/chosen": -5176.5908203125, "logps/rejected": -5176.5908203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.7206420898438, "rewards/margins": 0.0, "rewards/rejected": -514.7206420898438, "step": 1196 }, { "epoch": 12.6, "grad_norm": 9.686530120234238e-07, "learning_rate": 0.00017496842105263157, "logits/chosen": 13.183822631835938, "logits/rejected": 13.183822631835938, "logps/chosen": -3544.3095703125, "logps/rejected": -3544.3095703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.40509033203125, "rewards/margins": 0.0, "rewards/rejected": -351.40509033203125, "step": 1197 }, { "epoch": 12.610526315789473, "grad_norm": 1.7548613868711982e-06, "learning_rate": 0.00017494736842105265, "logits/chosen": 13.23482894897461, "logits/rejected": 13.23482894897461, "logps/chosen": -4880.33984375, "logps/rejected": -4880.33984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.18682861328125, "rewards/margins": 0.0, "rewards/rejected": -485.18682861328125, "step": 1198 }, { "epoch": 12.621052631578948, "grad_norm": 1.1824447483377298e-06, "learning_rate": 0.0001749263157894737, "logits/chosen": 13.164804458618164, "logits/rejected": 13.164804458618164, "logps/chosen": -2672.935546875, "logps/rejected": -2672.935546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.511962890625, "rewards/margins": 0.0, "rewards/rejected": -264.511962890625, "step": 1199 }, { "epoch": 12.631578947368421, "grad_norm": 1.5380776403617347e-06, "learning_rate": 0.00017490526315789475, "logits/chosen": 13.178834915161133, "logits/rejected": 13.178834915161133, "logps/chosen": -3777.666015625, "logps/rejected": -3777.666015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.8974914550781, "rewards/margins": 0.0, "rewards/rejected": -374.8974914550781, "step": 1200 }, { "epoch": 12.631578947368421, "eval_logits/chosen": 13.219343185424805, "eval_logits/rejected": 13.219343185424805, "eval_logps/chosen": -4310.83984375, "eval_logps/rejected": -4310.83984375, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -428.18084716796875, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -428.18084716796875, "eval_runtime": 4.6477, "eval_samples_per_second": 2.152, "eval_steps_per_second": 2.152, "step": 1200 }, { "epoch": 12.642105263157895, "grad_norm": 1.29360205392004e-06, "learning_rate": 0.0001748842105263158, "logits/chosen": 13.240039825439453, "logits/rejected": 13.240039825439453, "logps/chosen": -4880.54296875, "logps/rejected": -4880.54296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.2071228027344, "rewards/margins": 0.0, "rewards/rejected": -485.2071228027344, "step": 1201 }, { "epoch": 12.652631578947368, "grad_norm": 9.248939250028343e-07, "learning_rate": 0.00017486315789473685, "logits/chosen": 13.171520233154297, "logits/rejected": 13.171520233154297, "logps/chosen": -2672.8818359375, "logps/rejected": -2672.8818359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.506591796875, "rewards/margins": 0.0, "rewards/rejected": -264.506591796875, "step": 1202 }, { "epoch": 12.663157894736843, "grad_norm": 1.7157086631414131e-06, "learning_rate": 0.0001748421052631579, "logits/chosen": 13.257827758789062, "logits/rejected": 13.257827758789062, "logps/chosen": -5176.35205078125, "logps/rejected": -5176.35205078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.69677734375, "rewards/margins": 0.0, "rewards/rejected": -514.69677734375, "step": 1203 }, { "epoch": 12.673684210526316, "grad_norm": 1.2350144515949069e-06, "learning_rate": 0.00017482105263157894, "logits/chosen": 13.182551383972168, "logits/rejected": 13.182551383972168, "logps/chosen": -3997.763671875, "logps/rejected": -3997.763671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9490661621094, "rewards/margins": 0.0, "rewards/rejected": -396.9490661621094, "step": 1204 }, { "epoch": 12.68421052631579, "grad_norm": 1.0536100489844102e-06, "learning_rate": 0.00017480000000000002, "logits/chosen": 13.194211959838867, "logits/rejected": 13.194211959838867, "logps/chosen": -3758.677734375, "logps/rejected": -3758.677734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.949462890625, "rewards/margins": 0.0, "rewards/rejected": -372.949462890625, "step": 1205 }, { "epoch": 12.694736842105263, "grad_norm": 1.496665959166421e-06, "learning_rate": 0.00017477894736842107, "logits/chosen": 13.176555633544922, "logits/rejected": 13.176555633544922, "logps/chosen": -3997.82421875, "logps/rejected": -3997.82421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9551086425781, "rewards/margins": 0.0, "rewards/rejected": -396.9551086425781, "step": 1206 }, { "epoch": 12.705263157894738, "grad_norm": 1.4589624015570735e-06, "learning_rate": 0.00017475789473684212, "logits/chosen": 13.23509693145752, "logits/rejected": 13.23509693145752, "logps/chosen": -4881.1796875, "logps/rejected": -4881.1796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.27081298828125, "rewards/margins": 0.0, "rewards/rejected": -485.27081298828125, "step": 1207 }, { "epoch": 12.715789473684211, "grad_norm": 1.5126262269404833e-06, "learning_rate": 0.00017473684210526317, "logits/chosen": 13.1687593460083, "logits/rejected": 13.1687593460083, "logps/chosen": -3778.708984375, "logps/rejected": -3778.708984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -375.0018005371094, "rewards/margins": 0.0, "rewards/rejected": -375.0018005371094, "step": 1208 }, { "epoch": 12.726315789473684, "grad_norm": 1.347111037830473e-06, "learning_rate": 0.00017471578947368422, "logits/chosen": 13.205496788024902, "logits/rejected": 13.205496788024902, "logps/chosen": -4324.451171875, "logps/rejected": -4324.451171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.26025390625, "rewards/margins": 0.0, "rewards/rejected": -429.26025390625, "step": 1209 }, { "epoch": 12.736842105263158, "grad_norm": 9.442120472158422e-07, "learning_rate": 0.00017469473684210527, "logits/chosen": 13.167729377746582, "logits/rejected": 13.167729377746582, "logps/chosen": -3542.4921875, "logps/rejected": -3542.4921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.2233581542969, "rewards/margins": 0.0, "rewards/rejected": -351.2233581542969, "step": 1210 }, { "epoch": 12.74736842105263, "grad_norm": 1.1260599421802908e-06, "learning_rate": 0.00017467368421052632, "logits/chosen": 13.153641700744629, "logits/rejected": 13.153641700744629, "logps/chosen": -3998.673828125, "logps/rejected": -3998.673828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0400695800781, "rewards/margins": 0.0, "rewards/rejected": -397.0400695800781, "step": 1211 }, { "epoch": 12.757894736842106, "grad_norm": 1.48184381032479e-06, "learning_rate": 0.00017465263157894737, "logits/chosen": 13.15683650970459, "logits/rejected": 13.15683650970459, "logps/chosen": -3779.37109375, "logps/rejected": -3779.37109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -375.0679931640625, "rewards/margins": 0.0, "rewards/rejected": -375.0679931640625, "step": 1212 }, { "epoch": 12.76842105263158, "grad_norm": 1.5160769635258475e-06, "learning_rate": 0.00017463157894736844, "logits/chosen": 13.227375030517578, "logits/rejected": 13.227375030517578, "logps/chosen": -5175.3994140625, "logps/rejected": -5175.3994140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6015014648438, "rewards/margins": 0.0, "rewards/rejected": -514.6015014648438, "step": 1213 }, { "epoch": 12.778947368421052, "grad_norm": 1.5666516901546856e-06, "learning_rate": 0.0001746105263157895, "logits/chosen": 13.21477222442627, "logits/rejected": 13.21477222442627, "logps/chosen": -4881.1376953125, "logps/rejected": -4881.1376953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.2666015625, "rewards/margins": 0.0, "rewards/rejected": -485.2666015625, "step": 1214 }, { "epoch": 12.789473684210526, "grad_norm": 1.4243105397326872e-06, "learning_rate": 0.0001745894736842105, "logits/chosen": 13.14836311340332, "logits/rejected": 13.14836311340332, "logps/chosen": -3999.669921875, "logps/rejected": -3999.669921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1396789550781, "rewards/margins": 0.0, "rewards/rejected": -397.1396789550781, "step": 1215 }, { "epoch": 12.8, "grad_norm": 1.4788988664804492e-06, "learning_rate": 0.0001745684210526316, "logits/chosen": 13.213777542114258, "logits/rejected": 13.213777542114258, "logps/chosen": -4881.03466796875, "logps/rejected": -4881.03466796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.2563171386719, "rewards/margins": 0.0, "rewards/rejected": -485.2563171386719, "step": 1216 }, { "epoch": 12.810526315789474, "grad_norm": 1.0042375606644782e-06, "learning_rate": 0.00017454736842105264, "logits/chosen": 13.158173561096191, "logits/rejected": 13.158173561096191, "logps/chosen": -3757.86328125, "logps/rejected": -3757.86328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.8680114746094, "rewards/margins": 0.0, "rewards/rejected": -372.8680114746094, "step": 1217 }, { "epoch": 12.821052631578947, "grad_norm": 1.6326533796018339e-06, "learning_rate": 0.0001745263157894737, "logits/chosen": 13.223214149475098, "logits/rejected": 13.223214149475098, "logps/chosen": -5174.9873046875, "logps/rejected": -5174.9873046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.560302734375, "rewards/margins": 0.0, "rewards/rejected": -514.560302734375, "step": 1218 }, { "epoch": 12.83157894736842, "grad_norm": 1.5520402030233527e-06, "learning_rate": 0.00017450526315789474, "logits/chosen": 13.223483085632324, "logits/rejected": 13.223483085632324, "logps/chosen": -5175.1748046875, "logps/rejected": -5175.1748046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.5790405273438, "rewards/margins": 0.0, "rewards/rejected": -514.5790405273438, "step": 1219 }, { "epoch": 12.842105263157894, "grad_norm": 1.0565128150119563e-06, "learning_rate": 0.00017448421052631581, "logits/chosen": 13.136785507202148, "logits/rejected": 13.136785507202148, "logps/chosen": -2672.26171875, "logps/rejected": -2672.26171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.444580078125, "rewards/margins": 0.0, "rewards/rejected": -264.444580078125, "step": 1220 }, { "epoch": 12.852631578947369, "grad_norm": 1.404454678777256e-06, "learning_rate": 0.00017446315789473684, "logits/chosen": 13.16103744506836, "logits/rejected": 13.16103744506836, "logps/chosen": -4286.90625, "logps/rejected": -4286.90625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.8932800292969, "rewards/margins": 0.0, "rewards/rejected": -425.8932800292969, "step": 1221 }, { "epoch": 12.863157894736842, "grad_norm": 1.1864430007335613e-06, "learning_rate": 0.00017444210526315789, "logits/chosen": 13.14920425415039, "logits/rejected": 13.14920425415039, "logps/chosen": -3999.20703125, "logps/rejected": -3999.20703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0933837890625, "rewards/margins": 0.0, "rewards/rejected": -397.0933837890625, "step": 1222 }, { "epoch": 12.873684210526315, "grad_norm": 1.3955528856968158e-06, "learning_rate": 0.00017442105263157896, "logits/chosen": 13.160967826843262, "logits/rejected": 13.160967826843262, "logps/chosen": -4287.162109375, "logps/rejected": -4287.162109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.9188537597656, "rewards/margins": 0.0, "rewards/rejected": -425.9188537597656, "step": 1223 }, { "epoch": 12.884210526315789, "grad_norm": 1.1030474524886813e-06, "learning_rate": 0.0001744, "logits/chosen": 13.143211364746094, "logits/rejected": 13.143211364746094, "logps/chosen": -3999.69140625, "logps/rejected": -3999.69140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1418151855469, "rewards/margins": 0.0, "rewards/rejected": -397.1418151855469, "step": 1224 }, { "epoch": 12.894736842105264, "grad_norm": 1.0810474577738205e-06, "learning_rate": 0.00017437894736842106, "logits/chosen": 13.135416030883789, "logits/rejected": 13.135416030883789, "logps/chosen": -4000.001953125, "logps/rejected": -4000.001953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1728820800781, "rewards/margins": 0.0, "rewards/rejected": -397.1728820800781, "step": 1225 }, { "epoch": 12.905263157894737, "grad_norm": 1.5307930425478844e-06, "learning_rate": 0.0001743578947368421, "logits/chosen": 13.195619583129883, "logits/rejected": 13.195619583129883, "logps/chosen": -4880.837890625, "logps/rejected": -4880.837890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.23663330078125, "rewards/margins": 0.0, "rewards/rejected": -485.23663330078125, "step": 1226 }, { "epoch": 12.91578947368421, "grad_norm": 1.5578589227516204e-06, "learning_rate": 0.00017433684210526319, "logits/chosen": 13.124464988708496, "logits/rejected": 13.124464988708496, "logps/chosen": -3778.41015625, "logps/rejected": -3778.41015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.9718933105469, "rewards/margins": 0.0, "rewards/rejected": -374.9718933105469, "step": 1227 }, { "epoch": 12.926315789473684, "grad_norm": 1.81817688371666e-06, "learning_rate": 0.0001743157894736842, "logits/chosen": 13.194299697875977, "logits/rejected": 13.194299697875977, "logps/chosen": -5175.64404296875, "logps/rejected": -5175.64404296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6259765625, "rewards/margins": 0.0, "rewards/rejected": -514.6259765625, "step": 1228 }, { "epoch": 12.936842105263159, "grad_norm": 1.4195894664226216e-06, "learning_rate": 0.00017429473684210526, "logits/chosen": 13.122154235839844, "logits/rejected": 13.122154235839844, "logps/chosen": -4288.3291015625, "logps/rejected": -4288.3291015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.0355529785156, "rewards/margins": 0.0, "rewards/rejected": -426.0355529785156, "step": 1229 }, { "epoch": 12.947368421052632, "grad_norm": 1.5928274024190614e-06, "learning_rate": 0.00017427368421052633, "logits/chosen": 13.157710075378418, "logits/rejected": 13.157710075378418, "logps/chosen": -4324.19921875, "logps/rejected": -4324.19921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.2350769042969, "rewards/margins": 0.0, "rewards/rejected": -429.2350769042969, "step": 1230 }, { "epoch": 12.957894736842105, "grad_norm": 9.173579087473627e-07, "learning_rate": 0.00017425263157894738, "logits/chosen": 13.101690292358398, "logits/rejected": 13.101690292358398, "logps/chosen": -2672.404296875, "logps/rejected": -2672.404296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.4588317871094, "rewards/margins": 0.0, "rewards/rejected": -264.4588317871094, "step": 1231 }, { "epoch": 12.968421052631578, "grad_norm": 9.18890009415918e-07, "learning_rate": 0.00017423157894736843, "logits/chosen": 13.104532241821289, "logits/rejected": 13.104532241821289, "logps/chosen": -2672.669921875, "logps/rejected": -2672.669921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.48541259765625, "rewards/margins": 0.0, "rewards/rejected": -264.48541259765625, "step": 1232 }, { "epoch": 12.978947368421053, "grad_norm": 8.989978823592537e-07, "learning_rate": 0.00017421052631578948, "logits/chosen": 13.12751293182373, "logits/rejected": 13.12751293182373, "logps/chosen": -2966.474609375, "logps/rejected": -2966.474609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.9212646484375, "rewards/margins": 0.0, "rewards/rejected": -293.9212646484375, "step": 1233 }, { "epoch": 12.989473684210527, "grad_norm": 4.231207185512176e-06, "learning_rate": 0.00017418947368421053, "logits/chosen": 13.189374923706055, "logits/rejected": 13.189374923706055, "logps/chosen": -4880.4609375, "logps/rejected": -4880.4609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.1989440917969, "rewards/margins": 0.0, "rewards/rejected": -485.1989440917969, "step": 1234 }, { "epoch": 13.0, "grad_norm": 1.278900299439556e-06, "learning_rate": 0.00017416842105263158, "logits/chosen": 13.141138076782227, "logits/rejected": 13.141138076782227, "logps/chosen": -3540.908203125, "logps/rejected": -3540.908203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.06494140625, "rewards/margins": 0.0, "rewards/rejected": -351.06494140625, "step": 1235 }, { "epoch": 13.010526315789473, "grad_norm": 4.298643034417182e-06, "learning_rate": 0.00017414736842105263, "logits/chosen": 13.135010719299316, "logits/rejected": 13.135010719299316, "logps/chosen": -3999.578125, "logps/rejected": -3999.578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1304931640625, "rewards/margins": 0.0, "rewards/rejected": -397.1304931640625, "step": 1236 }, { "epoch": 13.021052631578947, "grad_norm": 1.3581113762484165e-06, "learning_rate": 0.0001741263157894737, "logits/chosen": 13.157971382141113, "logits/rejected": 13.157971382141113, "logps/chosen": -3758.025390625, "logps/rejected": -3758.025390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.8842468261719, "rewards/margins": 0.0, "rewards/rejected": -372.8842468261719, "step": 1237 }, { "epoch": 13.031578947368422, "grad_norm": 8.696854933987197e-07, "learning_rate": 0.00017410526315789475, "logits/chosen": 13.139606475830078, "logits/rejected": 13.139606475830078, "logps/chosen": -2673.958984375, "logps/rejected": -2673.958984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.61431884765625, "rewards/margins": 0.0, "rewards/rejected": -264.61431884765625, "step": 1238 }, { "epoch": 13.042105263157895, "grad_norm": 5.090080321679125e-06, "learning_rate": 0.0001740842105263158, "logits/chosen": 13.165019989013672, "logits/rejected": 13.165019989013672, "logps/chosen": -4289.61669921875, "logps/rejected": -4289.61669921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.164306640625, "rewards/margins": 0.0, "rewards/rejected": -426.164306640625, "step": 1239 }, { "epoch": 13.052631578947368, "grad_norm": 1.8682176232687198e-06, "learning_rate": 0.00017406315789473685, "logits/chosen": 13.158531188964844, "logits/rejected": 13.158531188964844, "logps/chosen": -3778.431640625, "logps/rejected": -3778.431640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.97406005859375, "rewards/margins": 0.0, "rewards/rejected": -374.97406005859375, "step": 1240 }, { "epoch": 13.063157894736841, "grad_norm": 2.378750195930479e-06, "learning_rate": 0.0001740421052631579, "logits/chosen": 13.16057014465332, "logits/rejected": 13.16057014465332, "logps/chosen": -2967.625, "logps/rejected": -2967.625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.03631591796875, "rewards/margins": 0.0, "rewards/rejected": -294.03631591796875, "step": 1241 }, { "epoch": 13.073684210526316, "grad_norm": 2.5235326575057115e-06, "learning_rate": 0.00017402105263157895, "logits/chosen": 13.167869567871094, "logits/rejected": 13.167869567871094, "logps/chosen": -3758.5048828125, "logps/rejected": -3758.5048828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.93218994140625, "rewards/margins": 0.0, "rewards/rejected": -372.93218994140625, "step": 1242 }, { "epoch": 13.08421052631579, "grad_norm": 1.0824261380548705e-06, "learning_rate": 0.000174, "logits/chosen": 13.170663833618164, "logits/rejected": 13.170663833618164, "logps/chosen": -3541.8671875, "logps/rejected": -3541.8671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.1608581542969, "rewards/margins": 0.0, "rewards/rejected": -351.1608581542969, "step": 1243 }, { "epoch": 13.094736842105263, "grad_norm": 1.986631787076476e-06, "learning_rate": 0.00017397894736842105, "logits/chosen": 13.174274444580078, "logits/rejected": 13.174274444580078, "logps/chosen": -4291.10791015625, "logps/rejected": -4291.10791015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.3134460449219, "rewards/margins": 0.0, "rewards/rejected": -426.3134460449219, "step": 1244 }, { "epoch": 13.105263157894736, "grad_norm": 9.613373777028755e-07, "learning_rate": 0.00017395789473684213, "logits/chosen": 13.155138969421387, "logits/rejected": 13.155138969421387, "logps/chosen": -2675.404296875, "logps/rejected": -2675.404296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.75885009765625, "rewards/margins": 0.0, "rewards/rejected": -264.75885009765625, "step": 1245 }, { "epoch": 13.115789473684211, "grad_norm": 2.8787717383238487e-06, "learning_rate": 0.00017393684210526318, "logits/chosen": 13.171771049499512, "logits/rejected": 13.171771049499512, "logps/chosen": -3779.2490234375, "logps/rejected": -3779.2490234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -375.0557861328125, "rewards/margins": 0.0, "rewards/rejected": -375.0557861328125, "step": 1246 }, { "epoch": 13.126315789473685, "grad_norm": 9.68525910138851e-07, "learning_rate": 0.0001739157894736842, "logits/chosen": 13.179234504699707, "logits/rejected": 13.179234504699707, "logps/chosen": -3542.28515625, "logps/rejected": -3542.28515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.20263671875, "rewards/margins": 0.0, "rewards/rejected": -351.20263671875, "step": 1247 }, { "epoch": 13.136842105263158, "grad_norm": 2.1995188035361934e-06, "learning_rate": 0.00017389473684210527, "logits/chosen": 13.164140701293945, "logits/rejected": 13.164140701293945, "logps/chosen": -3998.693359375, "logps/rejected": -3998.693359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0420227050781, "rewards/margins": 0.0, "rewards/rejected": -397.0420227050781, "step": 1248 }, { "epoch": 13.147368421052631, "grad_norm": 2.7806797788798576e-06, "learning_rate": 0.00017387368421052632, "logits/chosen": 13.156556129455566, "logits/rejected": 13.156556129455566, "logps/chosen": -3998.7265625, "logps/rejected": -3998.7265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.04534912109375, "rewards/margins": 0.0, "rewards/rejected": -397.04534912109375, "step": 1249 }, { "epoch": 13.157894736842104, "grad_norm": 1.0588154282231699e-06, "learning_rate": 0.00017385263157894737, "logits/chosen": 13.135560989379883, "logits/rejected": 13.135560989379883, "logps/chosen": -2676.119140625, "logps/rejected": -2676.119140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.830322265625, "rewards/margins": 0.0, "rewards/rejected": -264.830322265625, "step": 1250 }, { "epoch": 13.157894736842104, "eval_logits/chosen": 13.177592277526855, "eval_logits/rejected": 13.177592277526855, "eval_logps/chosen": -4309.9306640625, "eval_logps/rejected": -4309.9306640625, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -428.0899353027344, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -428.0899353027344, "eval_runtime": 4.3262, "eval_samples_per_second": 2.311, "eval_steps_per_second": 2.311, "step": 1250 }, { "epoch": 13.16842105263158, "grad_norm": 1.6972080629784614e-06, "learning_rate": 0.00017383157894736842, "logits/chosen": 13.1998872756958, "logits/rejected": 13.1998872756958, "logps/chosen": -4878.42431640625, "logps/rejected": -4878.42431640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.9952697753906, "rewards/margins": 0.0, "rewards/rejected": -484.9952697753906, "step": 1251 }, { "epoch": 13.178947368421053, "grad_norm": 2.2566489406017354e-06, "learning_rate": 0.0001738105263157895, "logits/chosen": 13.137022018432617, "logits/rejected": 13.137022018432617, "logps/chosen": -3542.4072265625, "logps/rejected": -3542.4072265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.21484375, "rewards/margins": 0.0, "rewards/rejected": -351.21484375, "step": 1252 }, { "epoch": 13.189473684210526, "grad_norm": 2.3852032882132335e-06, "learning_rate": 0.00017378947368421052, "logits/chosen": 13.186307907104492, "logits/rejected": 13.186307907104492, "logps/chosen": -4878.13134765625, "logps/rejected": -4878.13134765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.9659729003906, "rewards/margins": 0.0, "rewards/rejected": -484.9659729003906, "step": 1253 }, { "epoch": 13.2, "grad_norm": 1.001463942884584e-06, "learning_rate": 0.00017376842105263157, "logits/chosen": 13.128952026367188, "logits/rejected": 13.128952026367188, "logps/chosen": -2969.6259765625, "logps/rejected": -2969.6259765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.2364196777344, "rewards/margins": 0.0, "rewards/rejected": -294.2364196777344, "step": 1254 }, { "epoch": 13.210526315789474, "grad_norm": 2.3843738290452166e-06, "learning_rate": 0.00017374736842105265, "logits/chosen": 13.166271209716797, "logits/rejected": 13.166271209716797, "logps/chosen": -4325.814453125, "logps/rejected": -4325.814453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.3965759277344, "rewards/margins": 0.0, "rewards/rejected": -429.3965759277344, "step": 1255 }, { "epoch": 13.221052631578948, "grad_norm": 1.1960969459323678e-06, "learning_rate": 0.0001737263157894737, "logits/chosen": 13.117654800415039, "logits/rejected": 13.117654800415039, "logps/chosen": -3998.9453125, "logps/rejected": -3998.9453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0672302246094, "rewards/margins": 0.0, "rewards/rejected": -397.0672302246094, "step": 1256 }, { "epoch": 13.23157894736842, "grad_norm": 2.0701395442301873e-06, "learning_rate": 0.00017370526315789474, "logits/chosen": 13.196760177612305, "logits/rejected": 13.196760177612305, "logps/chosen": -5171.2568359375, "logps/rejected": -5171.2568359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.187255859375, "rewards/margins": 0.0, "rewards/rejected": -514.187255859375, "step": 1257 }, { "epoch": 13.242105263157894, "grad_norm": 2.019994553847937e-06, "learning_rate": 0.0001736842105263158, "logits/chosen": 13.11418342590332, "logits/rejected": 13.11418342590332, "logps/chosen": -3999.14453125, "logps/rejected": -3999.14453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0871276855469, "rewards/margins": 0.0, "rewards/rejected": -397.0871276855469, "step": 1258 }, { "epoch": 13.25263157894737, "grad_norm": 1.033930630001123e-06, "learning_rate": 0.00017366315789473687, "logits/chosen": 13.123323440551758, "logits/rejected": 13.123323440551758, "logps/chosen": -3542.9091796875, "logps/rejected": -3542.9091796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.2650451660156, "rewards/margins": 0.0, "rewards/rejected": -351.2650451660156, "step": 1259 }, { "epoch": 13.263157894736842, "grad_norm": 9.739521829033038e-07, "learning_rate": 0.0001736421052631579, "logits/chosen": 13.119747161865234, "logits/rejected": 13.119747161865234, "logps/chosen": -3543.3935546875, "logps/rejected": -3543.3935546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3134765625, "rewards/margins": 0.0, "rewards/rejected": -351.3134765625, "step": 1260 }, { "epoch": 13.273684210526316, "grad_norm": 2.752431100816466e-06, "learning_rate": 0.00017362105263157894, "logits/chosen": 13.173813819885254, "logits/rejected": 13.173813819885254, "logps/chosen": -4878.8154296875, "logps/rejected": -4878.8154296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.0343933105469, "rewards/margins": 0.0, "rewards/rejected": -485.0343933105469, "step": 1261 }, { "epoch": 13.284210526315789, "grad_norm": 1.5259577139659086e-06, "learning_rate": 0.00017360000000000002, "logits/chosen": 13.176005363464355, "logits/rejected": 13.176005363464355, "logps/chosen": -4879.44189453125, "logps/rejected": -4879.44189453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.0970153808594, "rewards/margins": 0.0, "rewards/rejected": -485.0970153808594, "step": 1262 }, { "epoch": 13.294736842105262, "grad_norm": 1.538273522783129e-06, "learning_rate": 0.00017357894736842107, "logits/chosen": 13.179133415222168, "logits/rejected": 13.179133415222168, "logps/chosen": -4879.802734375, "logps/rejected": -4879.802734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.13311767578125, "rewards/margins": 0.0, "rewards/rejected": -485.13311767578125, "step": 1263 }, { "epoch": 13.305263157894737, "grad_norm": 1.694831439635891e-06, "learning_rate": 0.00017355789473684212, "logits/chosen": 13.132376670837402, "logits/rejected": 13.132376670837402, "logps/chosen": -3543.6337890625, "logps/rejected": -3543.6337890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3374938964844, "rewards/margins": 0.0, "rewards/rejected": -351.3374938964844, "step": 1264 }, { "epoch": 13.31578947368421, "grad_norm": 1.55275188262749e-06, "learning_rate": 0.00017353684210526317, "logits/chosen": 13.198293685913086, "logits/rejected": 13.198293685913086, "logps/chosen": -4880.35546875, "logps/rejected": -4880.35546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.1883850097656, "rewards/margins": 0.0, "rewards/rejected": -485.1883850097656, "step": 1265 }, { "epoch": 13.326315789473684, "grad_norm": 1.3666732456840691e-06, "learning_rate": 0.00017351578947368422, "logits/chosen": 13.14201545715332, "logits/rejected": 13.14201545715332, "logps/chosen": -3999.75, "logps/rejected": -3999.75, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1476745605469, "rewards/margins": 0.0, "rewards/rejected": -397.1476745605469, "step": 1266 }, { "epoch": 13.336842105263157, "grad_norm": 1.4160054888634477e-06, "learning_rate": 0.00017349473684210526, "logits/chosen": 13.16655445098877, "logits/rejected": 13.16655445098877, "logps/chosen": -3543.83984375, "logps/rejected": -3543.83984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3581237792969, "rewards/margins": 0.0, "rewards/rejected": -351.3581237792969, "step": 1267 }, { "epoch": 13.347368421052632, "grad_norm": 1.955471134351683e-06, "learning_rate": 0.00017347368421052631, "logits/chosen": 13.16107177734375, "logits/rejected": 13.16107177734375, "logps/chosen": -3999.6640625, "logps/rejected": -3999.6640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.13909912109375, "rewards/margins": 0.0, "rewards/rejected": -397.13909912109375, "step": 1268 }, { "epoch": 13.357894736842105, "grad_norm": 1.1480233297334053e-06, "learning_rate": 0.0001734526315789474, "logits/chosen": 13.156723976135254, "logits/rejected": 13.156723976135254, "logps/chosen": -2673.8017578125, "logps/rejected": -2673.8017578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.5986022949219, "rewards/margins": 0.0, "rewards/rejected": -264.5986022949219, "step": 1269 }, { "epoch": 13.368421052631579, "grad_norm": 9.004695016301412e-07, "learning_rate": 0.00017343157894736844, "logits/chosen": 13.160689353942871, "logits/rejected": 13.160689353942871, "logps/chosen": -2673.66796875, "logps/rejected": -2673.66796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.585205078125, "rewards/margins": 0.0, "rewards/rejected": -264.585205078125, "step": 1270 }, { "epoch": 13.378947368421052, "grad_norm": 2.334185182917281e-06, "learning_rate": 0.0001734105263157895, "logits/chosen": 13.22353744506836, "logits/rejected": 13.22353744506836, "logps/chosen": -4325.568359375, "logps/rejected": -4325.568359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.3719787597656, "rewards/margins": 0.0, "rewards/rejected": -429.3719787597656, "step": 1271 }, { "epoch": 13.389473684210527, "grad_norm": 2.050010152743198e-06, "learning_rate": 0.0001733894736842105, "logits/chosen": 13.185863494873047, "logits/rejected": 13.185863494873047, "logps/chosen": -4287.62451171875, "logps/rejected": -4287.62451171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.965087890625, "rewards/margins": 0.0, "rewards/rejected": -425.965087890625, "step": 1272 }, { "epoch": 13.4, "grad_norm": 1.47646130699286e-06, "learning_rate": 0.0001733684210526316, "logits/chosen": 13.224810600280762, "logits/rejected": 13.224810600280762, "logps/chosen": -4325.912109375, "logps/rejected": -4325.912109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.4063415527344, "rewards/margins": 0.0, "rewards/rejected": -429.4063415527344, "step": 1273 }, { "epoch": 13.410526315789474, "grad_norm": 1.1658304401862551e-06, "learning_rate": 0.00017334736842105264, "logits/chosen": 13.187033653259277, "logits/rejected": 13.187033653259277, "logps/chosen": -2968.05859375, "logps/rejected": -2968.05859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.0796813964844, "rewards/margins": 0.0, "rewards/rejected": -294.0796813964844, "step": 1274 }, { "epoch": 13.421052631578947, "grad_norm": 2.672120899660513e-06, "learning_rate": 0.00017332631578947369, "logits/chosen": 13.256819725036621, "logits/rejected": 13.256819725036621, "logps/chosen": -5173.73828125, "logps/rejected": -5173.73828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.4354248046875, "rewards/margins": 0.0, "rewards/rejected": -514.4354248046875, "step": 1275 }, { "epoch": 13.431578947368422, "grad_norm": 1.5713484344814788e-06, "learning_rate": 0.00017330526315789474, "logits/chosen": 13.174192428588867, "logits/rejected": 13.174192428588867, "logps/chosen": -4000.41015625, "logps/rejected": -4000.41015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.2137145996094, "rewards/margins": 0.0, "rewards/rejected": -397.2137145996094, "step": 1276 }, { "epoch": 13.442105263157895, "grad_norm": 1.8625045186126954e-06, "learning_rate": 0.0001732842105263158, "logits/chosen": 13.251129150390625, "logits/rejected": 13.251129150390625, "logps/chosen": -5174.11474609375, "logps/rejected": -5174.11474609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.4730834960938, "rewards/margins": 0.0, "rewards/rejected": -514.4730834960938, "step": 1277 }, { "epoch": 13.452631578947368, "grad_norm": 1.4260409670896479e-06, "learning_rate": 0.00017326315789473686, "logits/chosen": 13.181211471557617, "logits/rejected": 13.181211471557617, "logps/chosen": -3757.837890625, "logps/rejected": -3757.837890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.865478515625, "rewards/margins": 0.0, "rewards/rejected": -372.865478515625, "step": 1278 }, { "epoch": 13.463157894736842, "grad_norm": 1.514201471763954e-06, "learning_rate": 0.00017324210526315788, "logits/chosen": 13.172541618347168, "logits/rejected": 13.172541618347168, "logps/chosen": -3777.443359375, "logps/rejected": -3777.443359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.8752136230469, "rewards/margins": 0.0, "rewards/rejected": -374.8752136230469, "step": 1279 }, { "epoch": 13.473684210526315, "grad_norm": 1.5647108284611022e-06, "learning_rate": 0.00017322105263157896, "logits/chosen": 13.17613410949707, "logits/rejected": 13.17613410949707, "logps/chosen": -4287.66015625, "logps/rejected": -4287.66015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.9686584472656, "rewards/margins": 0.0, "rewards/rejected": -425.9686584472656, "step": 1280 }, { "epoch": 13.48421052631579, "grad_norm": 2.22114476855495e-06, "learning_rate": 0.0001732, "logits/chosen": 13.242491722106934, "logits/rejected": 13.242491722106934, "logps/chosen": -5175.3720703125, "logps/rejected": -5175.3720703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.5988159179688, "rewards/margins": 0.0, "rewards/rejected": -514.5988159179688, "step": 1281 }, { "epoch": 13.494736842105263, "grad_norm": 1.0163159913645359e-06, "learning_rate": 0.00017317894736842106, "logits/chosen": 13.173091888427734, "logits/rejected": 13.173091888427734, "logps/chosen": -3758.30859375, "logps/rejected": -3758.30859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.9125671386719, "rewards/margins": 0.0, "rewards/rejected": -372.9125671386719, "step": 1282 }, { "epoch": 13.505263157894737, "grad_norm": 9.761752153281122e-07, "learning_rate": 0.0001731578947368421, "logits/chosen": 13.173741340637207, "logits/rejected": 13.173741340637207, "logps/chosen": -3544.259765625, "logps/rejected": -3544.259765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.4001159667969, "rewards/margins": 0.0, "rewards/rejected": -351.4001159667969, "step": 1283 }, { "epoch": 13.51578947368421, "grad_norm": 1.6959684216999449e-06, "learning_rate": 0.00017313684210526318, "logits/chosen": 13.15912914276123, "logits/rejected": 13.15912914276123, "logps/chosen": -3999.373046875, "logps/rejected": -3999.373046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1099853515625, "rewards/margins": 0.0, "rewards/rejected": -397.1099853515625, "step": 1284 }, { "epoch": 13.526315789473685, "grad_norm": 1.46214756568952e-06, "learning_rate": 0.0001731157894736842, "logits/chosen": 13.164461135864258, "logits/rejected": 13.164461135864258, "logps/chosen": -3777.279296875, "logps/rejected": -3777.279296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.85882568359375, "rewards/margins": 0.0, "rewards/rejected": -374.85882568359375, "step": 1285 }, { "epoch": 13.536842105263158, "grad_norm": 1.5509444892813917e-06, "learning_rate": 0.00017309473684210526, "logits/chosen": 13.238690376281738, "logits/rejected": 13.238690376281738, "logps/chosen": -5176.623046875, "logps/rejected": -5176.623046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.723876953125, "rewards/margins": 0.0, "rewards/rejected": -514.723876953125, "step": 1286 }, { "epoch": 13.547368421052632, "grad_norm": 1.7104727021433064e-06, "learning_rate": 0.00017307368421052633, "logits/chosen": 13.226909637451172, "logits/rejected": 13.226909637451172, "logps/chosen": -4879.6728515625, "logps/rejected": -4879.6728515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.1201171875, "rewards/margins": 0.0, "rewards/rejected": -485.1201171875, "step": 1287 }, { "epoch": 13.557894736842105, "grad_norm": 1.0637072591634933e-06, "learning_rate": 0.00017305263157894738, "logits/chosen": 13.172325134277344, "logits/rejected": 13.172325134277344, "logps/chosen": -2968.44140625, "logps/rejected": -2968.44140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.1179504394531, "rewards/margins": 0.0, "rewards/rejected": -294.1179504394531, "step": 1288 }, { "epoch": 13.568421052631578, "grad_norm": 1.2580004522533272e-06, "learning_rate": 0.00017303157894736843, "logits/chosen": 13.232382774353027, "logits/rejected": 13.232382774353027, "logps/chosen": -4879.8671875, "logps/rejected": -4879.8671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.1395568847656, "rewards/margins": 0.0, "rewards/rejected": -485.1395568847656, "step": 1289 }, { "epoch": 13.578947368421053, "grad_norm": 9.166489007839118e-07, "learning_rate": 0.00017301052631578948, "logits/chosen": 13.154160499572754, "logits/rejected": 13.154160499572754, "logps/chosen": -2673.2763671875, "logps/rejected": -2673.2763671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.5460510253906, "rewards/margins": 0.0, "rewards/rejected": -264.5460510253906, "step": 1290 }, { "epoch": 13.589473684210526, "grad_norm": 2.135568138328381e-06, "learning_rate": 0.00017298947368421056, "logits/chosen": 13.238767623901367, "logits/rejected": 13.238767623901367, "logps/chosen": -4880.556640625, "logps/rejected": -4880.556640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.20849609375, "rewards/margins": 0.0, "rewards/rejected": -485.20849609375, "step": 1291 }, { "epoch": 13.6, "grad_norm": 1.4988737575549749e-06, "learning_rate": 0.00017296842105263158, "logits/chosen": 13.18018627166748, "logits/rejected": 13.18018627166748, "logps/chosen": -3778.162109375, "logps/rejected": -3778.162109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.9471130371094, "rewards/margins": 0.0, "rewards/rejected": -374.9471130371094, "step": 1292 }, { "epoch": 13.610526315789473, "grad_norm": 1.0861937198569649e-06, "learning_rate": 0.00017294736842105263, "logits/chosen": 13.191993713378906, "logits/rejected": 13.191993713378906, "logps/chosen": -3759.056640625, "logps/rejected": -3759.056640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.98736572265625, "rewards/margins": 0.0, "rewards/rejected": -372.98736572265625, "step": 1293 }, { "epoch": 13.621052631578948, "grad_norm": 1.4736940556758782e-06, "learning_rate": 0.0001729263157894737, "logits/chosen": 13.25374984741211, "logits/rejected": 13.25374984741211, "logps/chosen": -4880.7998046875, "logps/rejected": -4880.7998046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.2328186035156, "rewards/margins": 0.0, "rewards/rejected": -485.2328186035156, "step": 1294 }, { "epoch": 13.631578947368421, "grad_norm": 1.1962823691646918e-06, "learning_rate": 0.00017290526315789475, "logits/chosen": 13.172516822814941, "logits/rejected": 13.172516822814941, "logps/chosen": -2673.259765625, "logps/rejected": -2673.259765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.5444030761719, "rewards/margins": 0.0, "rewards/rejected": -264.5444030761719, "step": 1295 }, { "epoch": 13.642105263157895, "grad_norm": 8.691646371516981e-07, "learning_rate": 0.0001728842105263158, "logits/chosen": 13.202339172363281, "logits/rejected": 13.202339172363281, "logps/chosen": -2968.2421875, "logps/rejected": -2968.2421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.0980224609375, "rewards/margins": 0.0, "rewards/rejected": -294.0980224609375, "step": 1296 }, { "epoch": 13.652631578947368, "grad_norm": 1.582819322720752e-06, "learning_rate": 0.00017286315789473685, "logits/chosen": 13.187515258789062, "logits/rejected": 13.187515258789062, "logps/chosen": -3997.517578125, "logps/rejected": -3997.517578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9244384765625, "rewards/margins": 0.0, "rewards/rejected": -396.9244384765625, "step": 1297 }, { "epoch": 13.663157894736843, "grad_norm": 3.0316725769807817e-06, "learning_rate": 0.0001728421052631579, "logits/chosen": 13.27031135559082, "logits/rejected": 13.27031135559082, "logps/chosen": -5176.4931640625, "logps/rejected": -5176.4931640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.7108764648438, "rewards/margins": 0.0, "rewards/rejected": -514.7108764648438, "step": 1298 }, { "epoch": 13.673684210526316, "grad_norm": 1.8782495772029506e-06, "learning_rate": 0.00017282105263157895, "logits/chosen": 13.197049140930176, "logits/rejected": 13.197049140930176, "logps/chosen": -4289.2333984375, "logps/rejected": -4289.2333984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.1259765625, "rewards/margins": 0.0, "rewards/rejected": -426.1259765625, "step": 1299 }, { "epoch": 13.68421052631579, "grad_norm": 1.786511802492896e-06, "learning_rate": 0.0001728, "logits/chosen": 13.179607391357422, "logits/rejected": 13.179607391357422, "logps/chosen": -3997.408203125, "logps/rejected": -3997.408203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.91351318359375, "rewards/margins": 0.0, "rewards/rejected": -396.91351318359375, "step": 1300 }, { "epoch": 13.68421052631579, "eval_logits/chosen": 13.220492362976074, "eval_logits/rejected": 13.220492362976074, "eval_logps/chosen": -4311.43017578125, "eval_logps/rejected": -4311.43017578125, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -428.2398376464844, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -428.2398376464844, "eval_runtime": 4.4434, "eval_samples_per_second": 2.251, "eval_steps_per_second": 2.251, "step": 1300 }, { "epoch": 13.694736842105263, "grad_norm": 1.7261189668715815e-06, "learning_rate": 0.00017277894736842108, "logits/chosen": 13.256749153137207, "logits/rejected": 13.256749153137207, "logps/chosen": -5176.52001953125, "logps/rejected": -5176.52001953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.7135620117188, "rewards/margins": 0.0, "rewards/rejected": -514.7135620117188, "step": 1301 }, { "epoch": 13.705263157894738, "grad_norm": 1.0206708793703e-06, "learning_rate": 0.00017275789473684212, "logits/chosen": 13.154136657714844, "logits/rejected": 13.154136657714844, "logps/chosen": -2673.3359375, "logps/rejected": -2673.3359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.552001953125, "rewards/margins": 0.0, "rewards/rejected": -264.552001953125, "step": 1302 }, { "epoch": 13.715789473684211, "grad_norm": 1.119861849474546e-06, "learning_rate": 0.00017273684210526317, "logits/chosen": 13.173050880432129, "logits/rejected": 13.173050880432129, "logps/chosen": -3542.158203125, "logps/rejected": -3542.158203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.18994140625, "rewards/margins": 0.0, "rewards/rejected": -351.18994140625, "step": 1303 }, { "epoch": 13.726315789473684, "grad_norm": 1.6815436083561508e-06, "learning_rate": 0.0001727157894736842, "logits/chosen": 13.161462783813477, "logits/rejected": 13.161462783813477, "logps/chosen": -3779.09375, "logps/rejected": -3779.09375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -375.0402526855469, "rewards/margins": 0.0, "rewards/rejected": -375.0402526855469, "step": 1304 }, { "epoch": 13.736842105263158, "grad_norm": 1.0372693850513315e-06, "learning_rate": 0.00017269473684210527, "logits/chosen": 13.166621208190918, "logits/rejected": 13.166621208190918, "logps/chosen": -3542.416015625, "logps/rejected": -3542.416015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.2157287597656, "rewards/margins": 0.0, "rewards/rejected": -351.2157287597656, "step": 1305 }, { "epoch": 13.74736842105263, "grad_norm": 1.5220086879708106e-06, "learning_rate": 0.00017267368421052632, "logits/chosen": 13.23691177368164, "logits/rejected": 13.23691177368164, "logps/chosen": -5176.58203125, "logps/rejected": -5176.58203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.7197875976562, "rewards/margins": 0.0, "rewards/rejected": -514.7197875976562, "step": 1306 }, { "epoch": 13.757894736842106, "grad_norm": 1.469584958613268e-06, "learning_rate": 0.00017265263157894737, "logits/chosen": 13.17170238494873, "logits/rejected": 13.17170238494873, "logps/chosen": -3542.32421875, "logps/rejected": -3542.32421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.20654296875, "rewards/margins": 0.0, "rewards/rejected": -351.20654296875, "step": 1307 }, { "epoch": 13.76842105263158, "grad_norm": 1.43769204896671e-06, "learning_rate": 0.00017263157894736842, "logits/chosen": 13.16417407989502, "logits/rejected": 13.16417407989502, "logps/chosen": -3997.89453125, "logps/rejected": -3997.89453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9621276855469, "rewards/margins": 0.0, "rewards/rejected": -396.9621276855469, "step": 1308 }, { "epoch": 13.778947368421052, "grad_norm": 1.658162091189297e-06, "learning_rate": 0.0001726105263157895, "logits/chosen": 13.219422340393066, "logits/rejected": 13.219422340393066, "logps/chosen": -4326.048828125, "logps/rejected": -4326.048828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.4200134277344, "rewards/margins": 0.0, "rewards/rejected": -429.4200134277344, "step": 1309 }, { "epoch": 13.789473684210526, "grad_norm": 8.727628824090061e-07, "learning_rate": 0.00017258947368421055, "logits/chosen": 13.190666198730469, "logits/rejected": 13.190666198730469, "logps/chosen": -2967.962890625, "logps/rejected": -2967.962890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.0700988769531, "rewards/margins": 0.0, "rewards/rejected": -294.0700988769531, "step": 1310 }, { "epoch": 13.8, "grad_norm": 3.942923740396509e-06, "learning_rate": 0.00017256842105263157, "logits/chosen": 13.250381469726562, "logits/rejected": 13.250381469726562, "logps/chosen": -4880.02392578125, "logps/rejected": -4880.02392578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.1552429199219, "rewards/margins": 0.0, "rewards/rejected": -485.1552429199219, "step": 1311 }, { "epoch": 13.810526315789474, "grad_norm": 1.4225306586013176e-06, "learning_rate": 0.00017254736842105264, "logits/chosen": 13.25527286529541, "logits/rejected": 13.25527286529541, "logps/chosen": -4880.912109375, "logps/rejected": -4880.912109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.2440490722656, "rewards/margins": 0.0, "rewards/rejected": -485.2440490722656, "step": 1312 }, { "epoch": 13.821052631578947, "grad_norm": 2.981441184601863e-06, "learning_rate": 0.0001725263157894737, "logits/chosen": 13.194138526916504, "logits/rejected": 13.194138526916504, "logps/chosen": -3778.814453125, "logps/rejected": -3778.814453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -375.0123291015625, "rewards/margins": 0.0, "rewards/rejected": -375.0123291015625, "step": 1313 }, { "epoch": 13.83157894736842, "grad_norm": 2.0837317151745083e-06, "learning_rate": 0.00017250526315789474, "logits/chosen": 13.191129684448242, "logits/rejected": 13.191129684448242, "logps/chosen": -3997.51953125, "logps/rejected": -3997.51953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9246520996094, "rewards/margins": 0.0, "rewards/rejected": -396.9246520996094, "step": 1314 }, { "epoch": 13.842105263157894, "grad_norm": 9.167787879960088e-07, "learning_rate": 0.0001724842105263158, "logits/chosen": 13.182260513305664, "logits/rejected": 13.182260513305664, "logps/chosen": -2673.736328125, "logps/rejected": -2673.736328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.592041015625, "rewards/margins": 0.0, "rewards/rejected": -264.592041015625, "step": 1315 }, { "epoch": 13.852631578947369, "grad_norm": 3.7157267342990963e-06, "learning_rate": 0.00017246315789473687, "logits/chosen": 13.274850845336914, "logits/rejected": 13.274850845336914, "logps/chosen": -5176.7109375, "logps/rejected": -5176.7109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.732666015625, "rewards/margins": 0.0, "rewards/rejected": -514.732666015625, "step": 1316 }, { "epoch": 13.863157894736842, "grad_norm": 1.0027770258602686e-06, "learning_rate": 0.0001724421052631579, "logits/chosen": 13.207159996032715, "logits/rejected": 13.207159996032715, "logps/chosen": -3543.87109375, "logps/rejected": -3543.87109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3612365722656, "rewards/margins": 0.0, "rewards/rejected": -351.3612365722656, "step": 1317 }, { "epoch": 13.873684210526315, "grad_norm": 1.4301142527983757e-06, "learning_rate": 0.00017242105263157894, "logits/chosen": 13.190265655517578, "logits/rejected": 13.190265655517578, "logps/chosen": -3997.947265625, "logps/rejected": -3997.947265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9674072265625, "rewards/margins": 0.0, "rewards/rejected": -396.9674072265625, "step": 1318 }, { "epoch": 13.884210526315789, "grad_norm": 2.0466534351726295e-06, "learning_rate": 0.00017240000000000002, "logits/chosen": 13.200031280517578, "logits/rejected": 13.200031280517578, "logps/chosen": -3758.0693359375, "logps/rejected": -3758.0693359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.8886413574219, "rewards/margins": 0.0, "rewards/rejected": -372.8886413574219, "step": 1319 }, { "epoch": 13.894736842105264, "grad_norm": 1.5260916370607447e-06, "learning_rate": 0.00017237894736842107, "logits/chosen": 13.19677448272705, "logits/rejected": 13.19677448272705, "logps/chosen": -3544.154296875, "logps/rejected": -3544.154296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3895568847656, "rewards/margins": 0.0, "rewards/rejected": -351.3895568847656, "step": 1320 }, { "epoch": 13.905263157894737, "grad_norm": 1.5502948826906504e-06, "learning_rate": 0.00017235789473684211, "logits/chosen": 13.181499481201172, "logits/rejected": 13.181499481201172, "logps/chosen": -3998.498046875, "logps/rejected": -3998.498046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0224914550781, "rewards/margins": 0.0, "rewards/rejected": -397.0224914550781, "step": 1321 }, { "epoch": 13.91578947368421, "grad_norm": 1.4038795370652224e-06, "learning_rate": 0.00017233684210526316, "logits/chosen": 13.24766731262207, "logits/rejected": 13.24766731262207, "logps/chosen": -4880.6533203125, "logps/rejected": -4880.6533203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.2181701660156, "rewards/margins": 0.0, "rewards/rejected": -485.2181701660156, "step": 1322 }, { "epoch": 13.926315789473684, "grad_norm": 9.854934432951268e-07, "learning_rate": 0.0001723157894736842, "logits/chosen": 13.16574478149414, "logits/rejected": 13.16574478149414, "logps/chosen": -2673.9072265625, "logps/rejected": -2673.9072265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.609130859375, "rewards/margins": 0.0, "rewards/rejected": -264.609130859375, "step": 1323 }, { "epoch": 13.936842105263159, "grad_norm": 1.7348602341371588e-06, "learning_rate": 0.00017229473684210526, "logits/chosen": 13.190699577331543, "logits/rejected": 13.190699577331543, "logps/chosen": -3544.298828125, "logps/rejected": -3544.298828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.4040222167969, "rewards/margins": 0.0, "rewards/rejected": -351.4040222167969, "step": 1324 }, { "epoch": 13.947368421052632, "grad_norm": 1.4018485217093257e-06, "learning_rate": 0.0001722736842105263, "logits/chosen": 13.245927810668945, "logits/rejected": 13.245927810668945, "logps/chosen": -4880.5869140625, "logps/rejected": -4880.5869140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.2115173339844, "rewards/margins": 0.0, "rewards/rejected": -485.2115173339844, "step": 1325 }, { "epoch": 13.957894736842105, "grad_norm": 1.5222434512907057e-06, "learning_rate": 0.0001722526315789474, "logits/chosen": 13.194190979003906, "logits/rejected": 13.194190979003906, "logps/chosen": -4287.30615234375, "logps/rejected": -4287.30615234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.9332580566406, "rewards/margins": 0.0, "rewards/rejected": -425.9332580566406, "step": 1326 }, { "epoch": 13.968421052631578, "grad_norm": 2.366171656831284e-06, "learning_rate": 0.00017223157894736844, "logits/chosen": 13.230754852294922, "logits/rejected": 13.230754852294922, "logps/chosen": -4326.416015625, "logps/rejected": -4326.416015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.4567565917969, "rewards/margins": 0.0, "rewards/rejected": -429.4567565917969, "step": 1327 }, { "epoch": 13.978947368421053, "grad_norm": 2.2065630673751002e-06, "learning_rate": 0.0001722105263157895, "logits/chosen": 13.260734558105469, "logits/rejected": 13.260734558105469, "logps/chosen": -5176.2802734375, "logps/rejected": -5176.2802734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6896362304688, "rewards/margins": 0.0, "rewards/rejected": -514.6896362304688, "step": 1328 }, { "epoch": 13.989473684210527, "grad_norm": 3.0406204132304993e-06, "learning_rate": 0.00017218947368421054, "logits/chosen": 13.262333869934082, "logits/rejected": 13.262333869934082, "logps/chosen": -5176.4462890625, "logps/rejected": -5176.4462890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.7062377929688, "rewards/margins": 0.0, "rewards/rejected": -514.7062377929688, "step": 1329 }, { "epoch": 14.0, "grad_norm": 3.0065550618019188e-06, "learning_rate": 0.00017216842105263159, "logits/chosen": 13.265579223632812, "logits/rejected": 13.265579223632812, "logps/chosen": -5176.94140625, "logps/rejected": -5176.94140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.7557373046875, "rewards/margins": 0.0, "rewards/rejected": -514.7557373046875, "step": 1330 }, { "epoch": 14.010526315789473, "grad_norm": 1.303809085584362e-06, "learning_rate": 0.00017214736842105263, "logits/chosen": 13.205971717834473, "logits/rejected": 13.205971717834473, "logps/chosen": -3758.37109375, "logps/rejected": -3758.37109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.9187927246094, "rewards/margins": 0.0, "rewards/rejected": -372.9187927246094, "step": 1331 }, { "epoch": 14.021052631578947, "grad_norm": 1.5817903431525338e-06, "learning_rate": 0.00017212631578947368, "logits/chosen": 13.20882511138916, "logits/rejected": 13.20882511138916, "logps/chosen": -3758.20703125, "logps/rejected": -3758.20703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.90240478515625, "rewards/margins": 0.0, "rewards/rejected": -372.90240478515625, "step": 1332 }, { "epoch": 14.031578947368422, "grad_norm": 2.65092535300937e-06, "learning_rate": 0.00017210526315789476, "logits/chosen": 13.195255279541016, "logits/rejected": 13.195255279541016, "logps/chosen": -3998.2421875, "logps/rejected": -3998.2421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9969177246094, "rewards/margins": 0.0, "rewards/rejected": -396.9969177246094, "step": 1333 }, { "epoch": 14.042105263157895, "grad_norm": 1.4598131201637443e-06, "learning_rate": 0.0001720842105263158, "logits/chosen": 13.193469047546387, "logits/rejected": 13.193469047546387, "logps/chosen": -3998.203125, "logps/rejected": -3998.203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9930114746094, "rewards/margins": 0.0, "rewards/rejected": -396.9930114746094, "step": 1334 }, { "epoch": 14.052631578947368, "grad_norm": 2.649177076818887e-06, "learning_rate": 0.00017206315789473686, "logits/chosen": 13.252957344055176, "logits/rejected": 13.252957344055176, "logps/chosen": -4880.5791015625, "logps/rejected": -4880.5791015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.21075439453125, "rewards/margins": 0.0, "rewards/rejected": -485.21075439453125, "step": 1335 }, { "epoch": 14.063157894736841, "grad_norm": 1.0034248134616064e-06, "learning_rate": 0.00017204210526315788, "logits/chosen": 13.19357681274414, "logits/rejected": 13.19357681274414, "logps/chosen": -2967.359375, "logps/rejected": -2967.359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.0097351074219, "rewards/margins": 0.0, "rewards/rejected": -294.0097351074219, "step": 1336 }, { "epoch": 14.073684210526316, "grad_norm": 8.169994544005021e-06, "learning_rate": 0.00017202105263157896, "logits/chosen": 13.253226280212402, "logits/rejected": 13.253226280212402, "logps/chosen": -5177.7568359375, "logps/rejected": -5177.7568359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.8372802734375, "rewards/margins": 0.0, "rewards/rejected": -514.8372802734375, "step": 1337 }, { "epoch": 14.08421052631579, "grad_norm": 9.519492891740811e-07, "learning_rate": 0.000172, "logits/chosen": 13.159282684326172, "logits/rejected": 13.159282684326172, "logps/chosen": -2672.634765625, "logps/rejected": -2672.634765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.4819030761719, "rewards/margins": 0.0, "rewards/rejected": -264.4819030761719, "step": 1338 }, { "epoch": 14.094736842105263, "grad_norm": 1.65651690622326e-05, "learning_rate": 0.00017197894736842106, "logits/chosen": 13.237701416015625, "logits/rejected": 13.237701416015625, "logps/chosen": -5176.2880859375, "logps/rejected": -5176.2880859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6903686523438, "rewards/margins": 0.0, "rewards/rejected": -514.6903686523438, "step": 1339 }, { "epoch": 14.105263157894736, "grad_norm": 7.876771633164026e-06, "learning_rate": 0.0001719578947368421, "logits/chosen": 13.184174537658691, "logits/rejected": 13.184174537658691, "logps/chosen": -3774.2265625, "logps/rejected": -3774.2265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.5535583496094, "rewards/margins": 0.0, "rewards/rejected": -374.5535583496094, "step": 1340 }, { "epoch": 14.115789473684211, "grad_norm": 1.3585263332061004e-05, "learning_rate": 0.00017193684210526318, "logits/chosen": 13.225566864013672, "logits/rejected": 13.225566864013672, "logps/chosen": -4323.64453125, "logps/rejected": -4323.64453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.1795959472656, "rewards/margins": 0.0, "rewards/rejected": -429.1795959472656, "step": 1341 }, { "epoch": 14.126315789473685, "grad_norm": 1.6314090771629708e-06, "learning_rate": 0.0001719157894736842, "logits/chosen": 13.169658660888672, "logits/rejected": 13.169658660888672, "logps/chosen": -3776.046875, "logps/rejected": -3776.046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.7355651855469, "rewards/margins": 0.0, "rewards/rejected": -374.7355651855469, "step": 1342 }, { "epoch": 14.136842105263158, "grad_norm": 2.9886099582654424e-05, "learning_rate": 0.00017189473684210525, "logits/chosen": 13.212135314941406, "logits/rejected": 13.212135314941406, "logps/chosen": -5173.58642578125, "logps/rejected": -5173.58642578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.4202270507812, "rewards/margins": 0.0, "rewards/rejected": -514.4202270507812, "step": 1343 }, { "epoch": 14.147368421052631, "grad_norm": 1.2741395948978607e-05, "learning_rate": 0.00017187368421052633, "logits/chosen": 13.180377960205078, "logits/rejected": 13.180377960205078, "logps/chosen": -3754.025390625, "logps/rejected": -3754.025390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.4842224121094, "rewards/margins": 0.0, "rewards/rejected": -372.4842224121094, "step": 1344 }, { "epoch": 14.157894736842104, "grad_norm": 1.555896051286254e-05, "learning_rate": 0.00017185263157894738, "logits/chosen": 13.168469429016113, "logits/rejected": 13.168469429016113, "logps/chosen": -3533.8203125, "logps/rejected": -3533.8203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -350.3561706542969, "rewards/margins": 0.0, "rewards/rejected": -350.3561706542969, "step": 1345 }, { "epoch": 14.16842105263158, "grad_norm": 5.479887022374896e-06, "learning_rate": 0.00017183157894736843, "logits/chosen": 13.175041198730469, "logits/rejected": 13.175041198730469, "logps/chosen": -2964.75, "logps/rejected": -2964.75, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.7488098144531, "rewards/margins": 0.0, "rewards/rejected": -293.7488098144531, "step": 1346 }, { "epoch": 14.178947368421053, "grad_norm": 2.8144702355348272e-06, "learning_rate": 0.00017181052631578948, "logits/chosen": 13.155779838562012, "logits/rejected": 13.155779838562012, "logps/chosen": -4287.39453125, "logps/rejected": -4287.39453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.9421081542969, "rewards/margins": 0.0, "rewards/rejected": -425.9421081542969, "step": 1347 }, { "epoch": 14.189473684210526, "grad_norm": 1.4167173503665254e-05, "learning_rate": 0.00017178947368421055, "logits/chosen": 13.124217987060547, "logits/rejected": 13.124217987060547, "logps/chosen": -3539.01171875, "logps/rejected": -3539.01171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -350.87530517578125, "rewards/margins": 0.0, "rewards/rejected": -350.87530517578125, "step": 1348 }, { "epoch": 14.2, "grad_norm": 8.512510248692706e-06, "learning_rate": 0.00017176842105263158, "logits/chosen": 13.127250671386719, "logits/rejected": 13.127250671386719, "logps/chosen": -3995.546875, "logps/rejected": -3995.546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.7273864746094, "rewards/margins": 0.0, "rewards/rejected": -396.7273864746094, "step": 1349 }, { "epoch": 14.210526315789474, "grad_norm": 1.8842674762709066e-06, "learning_rate": 0.00017174736842105262, "logits/chosen": 13.14427375793457, "logits/rejected": 13.14427375793457, "logps/chosen": -2672.650390625, "logps/rejected": -2672.650390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.48345947265625, "rewards/margins": 0.0, "rewards/rejected": -264.48345947265625, "step": 1350 }, { "epoch": 14.210526315789474, "eval_logits/chosen": 13.205266952514648, "eval_logits/rejected": 13.205266952514648, "eval_logps/chosen": -4303.63818359375, "eval_logps/rejected": -4303.63818359375, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -427.460693359375, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -427.460693359375, "eval_runtime": 4.2485, "eval_samples_per_second": 2.354, "eval_steps_per_second": 2.354, "step": 1350 }, { "epoch": 14.221052631578948, "grad_norm": 7.026199000392808e-06, "learning_rate": 0.0001717263157894737, "logits/chosen": 13.177156448364258, "logits/rejected": 13.177156448364258, "logps/chosen": -2963.359375, "logps/rejected": -2963.359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.6097412109375, "rewards/margins": 0.0, "rewards/rejected": -293.6097412109375, "step": 1351 }, { "epoch": 14.23157894736842, "grad_norm": 1.765816705301404e-05, "learning_rate": 0.00017170526315789475, "logits/chosen": 13.15739917755127, "logits/rejected": 13.15739917755127, "logps/chosen": -3988.13671875, "logps/rejected": -3988.13671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -395.9863586425781, "rewards/margins": 0.0, "rewards/rejected": -395.9863586425781, "step": 1352 }, { "epoch": 14.242105263157894, "grad_norm": 7.770182492095046e-06, "learning_rate": 0.0001716842105263158, "logits/chosen": 13.227485656738281, "logits/rejected": 13.227485656738281, "logps/chosen": -5177.908203125, "logps/rejected": -5177.908203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.8524169921875, "rewards/margins": 0.0, "rewards/rejected": -514.8524169921875, "step": 1353 }, { "epoch": 14.25263157894737, "grad_norm": 2.5974733944167383e-05, "learning_rate": 0.00017166315789473685, "logits/chosen": 13.068532943725586, "logits/rejected": 13.068532943725586, "logps/chosen": -3530.765625, "logps/rejected": -3530.765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -350.0506896972656, "rewards/margins": 0.0, "rewards/rejected": -350.0506896972656, "step": 1354 }, { "epoch": 14.263157894736842, "grad_norm": 2.8610100343939848e-05, "learning_rate": 0.0001716421052631579, "logits/chosen": 13.18753719329834, "logits/rejected": 13.18753719329834, "logps/chosen": -5172.0068359375, "logps/rejected": -5172.0068359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.2622680664062, "rewards/margins": 0.0, "rewards/rejected": -514.2622680664062, "step": 1355 }, { "epoch": 14.273684210526316, "grad_norm": 1.2156935554230586e-05, "learning_rate": 0.00017162105263157895, "logits/chosen": 13.169296264648438, "logits/rejected": 13.169296264648438, "logps/chosen": -3538.7587890625, "logps/rejected": -3538.7587890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -350.8500061035156, "rewards/margins": 0.0, "rewards/rejected": -350.8500061035156, "step": 1356 }, { "epoch": 14.284210526315789, "grad_norm": 3.180498242727481e-05, "learning_rate": 0.0001716, "logits/chosen": 13.19412899017334, "logits/rejected": 13.19412899017334, "logps/chosen": -4302.65234375, "logps/rejected": -4302.65234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -427.08038330078125, "rewards/margins": 0.0, "rewards/rejected": -427.08038330078125, "step": 1357 }, { "epoch": 14.294736842105262, "grad_norm": 1.5012937183200847e-05, "learning_rate": 0.00017157894736842107, "logits/chosen": 13.164214134216309, "logits/rejected": 13.164214134216309, "logps/chosen": -3533.70703125, "logps/rejected": -3533.70703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -350.3448181152344, "rewards/margins": 0.0, "rewards/rejected": -350.3448181152344, "step": 1358 }, { "epoch": 14.305263157894737, "grad_norm": 5.59252612220007e-06, "learning_rate": 0.00017155789473684212, "logits/chosen": 13.21369457244873, "logits/rejected": 13.21369457244873, "logps/chosen": -4877.013671875, "logps/rejected": -4877.013671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.8542175292969, "rewards/margins": 0.0, "rewards/rejected": -484.8542175292969, "step": 1359 }, { "epoch": 14.31578947368421, "grad_norm": 1.5276284102583304e-05, "learning_rate": 0.00017153684210526317, "logits/chosen": 13.001856803894043, "logits/rejected": 13.001856803894043, "logps/chosen": -2658.439453125, "logps/rejected": -2658.439453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -263.0623474121094, "rewards/margins": 0.0, "rewards/rejected": -263.0623474121094, "step": 1360 }, { "epoch": 14.326315789473684, "grad_norm": 4.242912109475583e-05, "learning_rate": 0.00017151578947368422, "logits/chosen": 12.979222297668457, "logits/rejected": 12.979222297668457, "logps/chosen": -3965.40625, "logps/rejected": -3965.40625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -393.71331787109375, "rewards/margins": 0.0, "rewards/rejected": -393.71331787109375, "step": 1361 }, { "epoch": 14.336842105263157, "grad_norm": 1.6982289707812015e-06, "learning_rate": 0.00017149473684210527, "logits/chosen": 13.156967163085938, "logits/rejected": 13.156967163085938, "logps/chosen": -3776.45703125, "logps/rejected": -3776.45703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.7765808105469, "rewards/margins": 0.0, "rewards/rejected": -374.7765808105469, "step": 1362 }, { "epoch": 14.347368421052632, "grad_norm": 1.9396415154915303e-05, "learning_rate": 0.00017147368421052632, "logits/chosen": 13.17154312133789, "logits/rejected": 13.17154312133789, "logps/chosen": -3979.978515625, "logps/rejected": -3979.978515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -395.1705322265625, "rewards/margins": 0.0, "rewards/rejected": -395.1705322265625, "step": 1363 }, { "epoch": 14.357894736842105, "grad_norm": 1.6068575860117562e-05, "learning_rate": 0.00017145263157894737, "logits/chosen": 13.17044734954834, "logits/rejected": 13.17044734954834, "logps/chosen": -3524.6103515625, "logps/rejected": -3524.6103515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -349.4351501464844, "rewards/margins": 0.0, "rewards/rejected": -349.4351501464844, "step": 1364 }, { "epoch": 14.368421052631579, "grad_norm": 1.9298457118566148e-05, "learning_rate": 0.00017143157894736845, "logits/chosen": 13.162466049194336, "logits/rejected": 13.162466049194336, "logps/chosen": -3977.45703125, "logps/rejected": -3977.45703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -394.91839599609375, "rewards/margins": 0.0, "rewards/rejected": -394.91839599609375, "step": 1365 }, { "epoch": 14.378947368421052, "grad_norm": 1.4766707863600459e-05, "learning_rate": 0.0001714105263157895, "logits/chosen": 13.214545249938965, "logits/rejected": 13.214545249938965, "logps/chosen": -4871.7392578125, "logps/rejected": -4871.7392578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.3267517089844, "rewards/margins": 0.0, "rewards/rejected": -484.3267517089844, "step": 1366 }, { "epoch": 14.389473684210527, "grad_norm": 1.4433329852181487e-05, "learning_rate": 0.00017138947368421054, "logits/chosen": 13.064927101135254, "logits/rejected": 13.064927101135254, "logps/chosen": -3990.587890625, "logps/rejected": -3990.587890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.2314758300781, "rewards/margins": 0.0, "rewards/rejected": -396.2314758300781, "step": 1367 }, { "epoch": 14.4, "grad_norm": 2.533439692342654e-05, "learning_rate": 0.00017136842105263157, "logits/chosen": 12.99682331085205, "logits/rejected": 12.99682331085205, "logps/chosen": -3522.75390625, "logps/rejected": -3522.75390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -349.24951171875, "rewards/margins": 0.0, "rewards/rejected": -349.24951171875, "step": 1368 }, { "epoch": 14.410526315789474, "grad_norm": 3.089273741352372e-05, "learning_rate": 0.00017134736842105264, "logits/chosen": 13.113932609558105, "logits/rejected": 13.113932609558105, "logps/chosen": -5161.814453125, "logps/rejected": -5161.814453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -513.2430419921875, "rewards/margins": 0.0, "rewards/rejected": -513.2430419921875, "step": 1369 }, { "epoch": 14.421052631578947, "grad_norm": 9.777106697583804e-07, "learning_rate": 0.0001713263157894737, "logits/chosen": 13.1111478805542, "logits/rejected": 13.1111478805542, "logps/chosen": -2672.65234375, "logps/rejected": -2672.65234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.483642578125, "rewards/margins": 0.0, "rewards/rejected": -264.483642578125, "step": 1370 }, { "epoch": 14.431578947368422, "grad_norm": 6.444215614465065e-06, "learning_rate": 0.00017130526315789474, "logits/chosen": 13.156900405883789, "logits/rejected": 13.156900405883789, "logps/chosen": -2958.296875, "logps/rejected": -2958.296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.1034851074219, "rewards/margins": 0.0, "rewards/rejected": -293.1034851074219, "step": 1371 }, { "epoch": 14.442105263157895, "grad_norm": 8.70741769176675e-06, "learning_rate": 0.0001712842105263158, "logits/chosen": 13.146058082580566, "logits/rejected": 13.146058082580566, "logps/chosen": -2946.986328125, "logps/rejected": -2946.986328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -291.9724426269531, "rewards/margins": 0.0, "rewards/rejected": -291.9724426269531, "step": 1372 }, { "epoch": 14.452631578947368, "grad_norm": 1.7357309843646362e-05, "learning_rate": 0.00017126315789473687, "logits/chosen": 13.128440856933594, "logits/rejected": 13.128440856933594, "logps/chosen": -3744.365234375, "logps/rejected": -3744.365234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -371.5674133300781, "rewards/margins": 0.0, "rewards/rejected": -371.5674133300781, "step": 1373 }, { "epoch": 14.463157894736842, "grad_norm": 2.121261968568433e-05, "learning_rate": 0.0001712421052631579, "logits/chosen": 13.164679527282715, "logits/rejected": 13.164679527282715, "logps/chosen": -4298.451171875, "logps/rejected": -4298.451171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.6602478027344, "rewards/margins": 0.0, "rewards/rejected": -426.6602478027344, "step": 1374 }, { "epoch": 14.473684210526315, "grad_norm": 1.280099149880698e-05, "learning_rate": 0.00017122105263157894, "logits/chosen": 13.112738609313965, "logits/rejected": 13.112738609313965, "logps/chosen": -3988.1796875, "logps/rejected": -3988.1796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -395.99066162109375, "rewards/margins": 0.0, "rewards/rejected": -395.99066162109375, "step": 1375 }, { "epoch": 14.48421052631579, "grad_norm": 1.5633984276064439e-06, "learning_rate": 0.00017120000000000001, "logits/chosen": 13.05401611328125, "logits/rejected": 13.05401611328125, "logps/chosen": -3998.109375, "logps/rejected": -3998.109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9836120605469, "rewards/margins": 0.0, "rewards/rejected": -396.9836120605469, "step": 1376 }, { "epoch": 14.494736842105263, "grad_norm": 2.510229023755528e-05, "learning_rate": 0.00017117894736842106, "logits/chosen": 13.020404815673828, "logits/rejected": 13.020404815673828, "logps/chosen": -4862.0576171875, "logps/rejected": -4862.0576171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -483.3586120605469, "rewards/margins": 0.0, "rewards/rejected": -483.3586120605469, "step": 1377 }, { "epoch": 14.505263157894737, "grad_norm": 3.04246241285e-05, "learning_rate": 0.0001711578947368421, "logits/chosen": 12.989005088806152, "logits/rejected": 12.989005088806152, "logps/chosen": -4857.396484375, "logps/rejected": -4857.396484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -482.8924865722656, "rewards/margins": 0.0, "rewards/rejected": -482.8924865722656, "step": 1378 }, { "epoch": 14.51578947368421, "grad_norm": 4.315393653087085e-06, "learning_rate": 0.00017113684210526316, "logits/chosen": 12.965651512145996, "logits/rejected": 12.965651512145996, "logps/chosen": -2669.421875, "logps/rejected": -2669.421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.1606140136719, "rewards/margins": 0.0, "rewards/rejected": -264.1606140136719, "step": 1379 }, { "epoch": 14.526315789473685, "grad_norm": 1.223992171617283e-06, "learning_rate": 0.00017111578947368424, "logits/chosen": 13.045852661132812, "logits/rejected": 13.045852661132812, "logps/chosen": -3542.16796875, "logps/rejected": -3542.16796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.19091796875, "rewards/margins": 0.0, "rewards/rejected": -351.19091796875, "step": 1380 }, { "epoch": 14.536842105263158, "grad_norm": 1.3810269592795521e-05, "learning_rate": 0.00017109473684210526, "logits/chosen": 13.118818283081055, "logits/rejected": 13.118818283081055, "logps/chosen": -4869.65185546875, "logps/rejected": -4869.65185546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.1180114746094, "rewards/margins": 0.0, "rewards/rejected": -484.1180114746094, "step": 1381 }, { "epoch": 14.547368421052632, "grad_norm": 6.774283519916935e-06, "learning_rate": 0.0001710736842105263, "logits/chosen": 13.078365325927734, "logits/rejected": 13.078365325927734, "logps/chosen": -3537.509765625, "logps/rejected": -3537.509765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -350.72509765625, "rewards/margins": 0.0, "rewards/rejected": -350.72509765625, "step": 1382 }, { "epoch": 14.557894736842105, "grad_norm": 7.00138161846553e-06, "learning_rate": 0.00017105263157894739, "logits/chosen": 13.082615852355957, "logits/rejected": 13.082615852355957, "logps/chosen": -3750.982421875, "logps/rejected": -3750.982421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.179931640625, "rewards/margins": 0.0, "rewards/rejected": -372.179931640625, "step": 1383 }, { "epoch": 14.568421052631578, "grad_norm": 1.6010828403523192e-05, "learning_rate": 0.00017103157894736844, "logits/chosen": 13.148509979248047, "logits/rejected": 13.148509979248047, "logps/chosen": -5164.42578125, "logps/rejected": -5164.42578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -513.504150390625, "rewards/margins": 0.0, "rewards/rejected": -513.504150390625, "step": 1384 }, { "epoch": 14.578947368421053, "grad_norm": 1.1382093134670868e-06, "learning_rate": 0.00017101052631578948, "logits/chosen": 13.071215629577637, "logits/rejected": 13.071215629577637, "logps/chosen": -3542.55078125, "logps/rejected": -3542.55078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.2292175292969, "rewards/margins": 0.0, "rewards/rejected": -351.2292175292969, "step": 1385 }, { "epoch": 14.589473684210526, "grad_norm": 7.105096756276907e-06, "learning_rate": 0.00017098947368421053, "logits/chosen": 13.12106704711914, "logits/rejected": 13.12106704711914, "logps/chosen": -5171.47705078125, "logps/rejected": -5171.47705078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.2092895507812, "rewards/margins": 0.0, "rewards/rejected": -514.2092895507812, "step": 1386 }, { "epoch": 14.6, "grad_norm": 1.8247093976242468e-05, "learning_rate": 0.00017096842105263158, "logits/chosen": 13.034560203552246, "logits/rejected": 13.034560203552246, "logps/chosen": -4314.28125, "logps/rejected": -4314.28125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -428.2432556152344, "rewards/margins": 0.0, "rewards/rejected": -428.2432556152344, "step": 1387 }, { "epoch": 14.610526315789473, "grad_norm": 1.1161985639773775e-05, "learning_rate": 0.00017094736842105263, "logits/chosen": 13.062606811523438, "logits/rejected": 13.062606811523438, "logps/chosen": -4280.14208984375, "logps/rejected": -4280.14208984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.21685791015625, "rewards/margins": 0.0, "rewards/rejected": -425.21685791015625, "step": 1388 }, { "epoch": 14.621052631578948, "grad_norm": 2.6693758172768867e-06, "learning_rate": 0.00017092631578947368, "logits/chosen": 13.179910659790039, "logits/rejected": 13.179910659790039, "logps/chosen": -4878.9443359375, "logps/rejected": -4878.9443359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.0472717285156, "rewards/margins": 0.0, "rewards/rejected": -485.0472717285156, "step": 1389 }, { "epoch": 14.631578947368421, "grad_norm": 8.390725838580693e-07, "learning_rate": 0.00017090526315789476, "logits/chosen": 13.135778427124023, "logits/rejected": 13.135778427124023, "logps/chosen": -2673.55078125, "logps/rejected": -2673.55078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.573486328125, "rewards/margins": 0.0, "rewards/rejected": -264.573486328125, "step": 1390 }, { "epoch": 14.642105263157895, "grad_norm": 3.430728838793584e-06, "learning_rate": 0.0001708842105263158, "logits/chosen": 13.19646167755127, "logits/rejected": 13.19646167755127, "logps/chosen": -3542.49609375, "logps/rejected": -3542.49609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.2237243652344, "rewards/margins": 0.0, "rewards/rejected": -351.2237243652344, "step": 1391 }, { "epoch": 14.652631578947368, "grad_norm": 1.7772032151697204e-05, "learning_rate": 0.00017086315789473686, "logits/chosen": 13.278853416442871, "logits/rejected": 13.278853416442871, "logps/chosen": -5165.3515625, "logps/rejected": -5165.3515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -513.5967407226562, "rewards/margins": 0.0, "rewards/rejected": -513.5967407226562, "step": 1392 }, { "epoch": 14.663157894736843, "grad_norm": 2.0008653791592224e-06, "learning_rate": 0.00017084210526315788, "logits/chosen": 13.20356273651123, "logits/rejected": 13.20356273651123, "logps/chosen": -2672.498046875, "logps/rejected": -2672.498046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.4682312011719, "rewards/margins": 0.0, "rewards/rejected": -264.4682312011719, "step": 1393 }, { "epoch": 14.673684210526316, "grad_norm": 1.9669969333335757e-06, "learning_rate": 0.00017082105263157896, "logits/chosen": 13.22119140625, "logits/rejected": 13.22119140625, "logps/chosen": -3777.6875, "logps/rejected": -3777.6875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.8996276855469, "rewards/margins": 0.0, "rewards/rejected": -374.8996276855469, "step": 1394 }, { "epoch": 14.68421052631579, "grad_norm": 1.116115413424268e-06, "learning_rate": 0.0001708, "logits/chosen": 13.217914581298828, "logits/rejected": 13.217914581298828, "logps/chosen": -3757.6015625, "logps/rejected": -3757.6015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.84185791015625, "rewards/margins": 0.0, "rewards/rejected": -372.84185791015625, "step": 1395 }, { "epoch": 14.694736842105263, "grad_norm": 2.5807366910157725e-06, "learning_rate": 0.00017077894736842105, "logits/chosen": 13.215991020202637, "logits/rejected": 13.215991020202637, "logps/chosen": -3544.076171875, "logps/rejected": -3544.076171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3817443847656, "rewards/margins": 0.0, "rewards/rejected": -351.3817443847656, "step": 1396 }, { "epoch": 14.705263157894738, "grad_norm": 5.147968295204919e-06, "learning_rate": 0.00017075789473684213, "logits/chosen": 13.202549934387207, "logits/rejected": 13.202549934387207, "logps/chosen": -3776.275390625, "logps/rejected": -3776.275390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.7584228515625, "rewards/margins": 0.0, "rewards/rejected": -374.7584228515625, "step": 1397 }, { "epoch": 14.715789473684211, "grad_norm": 2.7654589302983368e-06, "learning_rate": 0.00017073684210526318, "logits/chosen": 13.183501243591309, "logits/rejected": 13.183501243591309, "logps/chosen": -2672.6220703125, "logps/rejected": -2672.6220703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.4806213378906, "rewards/margins": 0.0, "rewards/rejected": -264.4806213378906, "step": 1398 }, { "epoch": 14.726315789473684, "grad_norm": 3.7479319416888757e-06, "learning_rate": 0.00017071578947368423, "logits/chosen": 13.294772148132324, "logits/rejected": 13.294772148132324, "logps/chosen": -5176.44091796875, "logps/rejected": -5176.44091796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.7056884765625, "rewards/margins": 0.0, "rewards/rejected": -514.7056884765625, "step": 1399 }, { "epoch": 14.736842105263158, "grad_norm": 2.0623833734134678e-06, "learning_rate": 0.00017069473684210525, "logits/chosen": 13.232860565185547, "logits/rejected": 13.232860565185547, "logps/chosen": -2967.2158203125, "logps/rejected": -2967.2158203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.9953918457031, "rewards/margins": 0.0, "rewards/rejected": -293.9953918457031, "step": 1400 }, { "epoch": 14.736842105263158, "eval_logits/chosen": 13.278051376342773, "eval_logits/rejected": 13.278051376342773, "eval_logps/chosen": -4311.49755859375, "eval_logps/rejected": -4311.49755859375, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -428.24664306640625, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -428.24664306640625, "eval_runtime": 4.3236, "eval_samples_per_second": 2.313, "eval_steps_per_second": 2.313, "step": 1400 }, { "epoch": 14.74736842105263, "grad_norm": 2.655626303749159e-06, "learning_rate": 0.00017067368421052633, "logits/chosen": 13.240413665771484, "logits/rejected": 13.240413665771484, "logps/chosen": -3778.259765625, "logps/rejected": -3778.259765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.9568786621094, "rewards/margins": 0.0, "rewards/rejected": -374.9568786621094, "step": 1401 }, { "epoch": 14.757894736842106, "grad_norm": 1.0872782922888291e-06, "learning_rate": 0.00017065263157894738, "logits/chosen": 13.25534439086914, "logits/rejected": 13.25534439086914, "logps/chosen": -3758.271484375, "logps/rejected": -3758.271484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.9088439941406, "rewards/margins": 0.0, "rewards/rejected": -372.9088439941406, "step": 1402 }, { "epoch": 14.76842105263158, "grad_norm": 4.671528586186469e-06, "learning_rate": 0.00017063157894736843, "logits/chosen": 13.32196044921875, "logits/rejected": 13.32196044921875, "logps/chosen": -4878.6728515625, "logps/rejected": -4878.6728515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.0201110839844, "rewards/margins": 0.0, "rewards/rejected": -485.0201110839844, "step": 1403 }, { "epoch": 14.778947368421052, "grad_norm": 2.7528462851478253e-06, "learning_rate": 0.00017061052631578948, "logits/chosen": 13.2721529006958, "logits/rejected": 13.2721529006958, "logps/chosen": -4287.81298828125, "logps/rejected": -4287.81298828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.98394775390625, "rewards/margins": 0.0, "rewards/rejected": -425.98394775390625, "step": 1404 }, { "epoch": 14.789473684210526, "grad_norm": 2.0953516468580347e-06, "learning_rate": 0.00017058947368421055, "logits/chosen": 13.246936798095703, "logits/rejected": 13.246936798095703, "logps/chosen": -3995.33203125, "logps/rejected": -3995.33203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.7059020996094, "rewards/margins": 0.0, "rewards/rejected": -396.7059020996094, "step": 1405 }, { "epoch": 14.8, "grad_norm": 4.696082669397583e-06, "learning_rate": 0.00017056842105263157, "logits/chosen": 13.311169624328613, "logits/rejected": 13.311169624328613, "logps/chosen": -5176.23046875, "logps/rejected": -5176.23046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6846313476562, "rewards/margins": 0.0, "rewards/rejected": -514.6846313476562, "step": 1406 }, { "epoch": 14.810526315789474, "grad_norm": 1.7624211068323348e-06, "learning_rate": 0.00017054736842105262, "logits/chosen": 13.294036865234375, "logits/rejected": 13.294036865234375, "logps/chosen": -5176.8974609375, "logps/rejected": -5176.8974609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.7513427734375, "rewards/margins": 0.0, "rewards/rejected": -514.7513427734375, "step": 1407 }, { "epoch": 14.821052631578947, "grad_norm": 2.4072714950307272e-06, "learning_rate": 0.0001705263157894737, "logits/chosen": 13.267194747924805, "logits/rejected": 13.267194747924805, "logps/chosen": -4879.583984375, "logps/rejected": -4879.583984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.1112365722656, "rewards/margins": 0.0, "rewards/rejected": -485.1112365722656, "step": 1408 }, { "epoch": 14.83157894736842, "grad_norm": 4.572812031256035e-06, "learning_rate": 0.00017050526315789475, "logits/chosen": 13.201088905334473, "logits/rejected": 13.201088905334473, "logps/chosen": -4288.3095703125, "logps/rejected": -4288.3095703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.0335998535156, "rewards/margins": 0.0, "rewards/rejected": -426.0335998535156, "step": 1409 }, { "epoch": 14.842105263157894, "grad_norm": 4.520805305219255e-06, "learning_rate": 0.0001704842105263158, "logits/chosen": 13.195809364318848, "logits/rejected": 13.195809364318848, "logps/chosen": -4288.4296875, "logps/rejected": -4288.4296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.0456237792969, "rewards/margins": 0.0, "rewards/rejected": -426.0456237792969, "step": 1410 }, { "epoch": 14.852631578947369, "grad_norm": 1.9461765532469144e-06, "learning_rate": 0.00017046315789473685, "logits/chosen": 13.164036750793457, "logits/rejected": 13.164036750793457, "logps/chosen": -2673.326171875, "logps/rejected": -2673.326171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.551025390625, "rewards/margins": 0.0, "rewards/rejected": -264.551025390625, "step": 1411 }, { "epoch": 14.863157894736842, "grad_norm": 1.728772417664004e-06, "learning_rate": 0.0001704421052631579, "logits/chosen": 13.183733940124512, "logits/rejected": 13.183733940124512, "logps/chosen": -3995.220703125, "logps/rejected": -3995.220703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.69476318359375, "rewards/margins": 0.0, "rewards/rejected": -396.69476318359375, "step": 1412 }, { "epoch": 14.873684210526315, "grad_norm": 1.232051545230206e-06, "learning_rate": 0.00017042105263157895, "logits/chosen": 13.204034805297852, "logits/rejected": 13.204034805297852, "logps/chosen": -4290.5673828125, "logps/rejected": -4290.5673828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.2593688964844, "rewards/margins": 0.0, "rewards/rejected": -426.2593688964844, "step": 1413 }, { "epoch": 14.884210526315789, "grad_norm": 1.7505884670754313e-06, "learning_rate": 0.0001704, "logits/chosen": 13.182990074157715, "logits/rejected": 13.182990074157715, "logps/chosen": -3995.57421875, "logps/rejected": -3995.57421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.7301025390625, "rewards/margins": 0.0, "rewards/rejected": -396.7301025390625, "step": 1414 }, { "epoch": 14.894736842105264, "grad_norm": 1.1477078487587278e-06, "learning_rate": 0.00017037894736842107, "logits/chosen": 13.187840461730957, "logits/rejected": 13.187840461730957, "logps/chosen": -3543.3994140625, "logps/rejected": -3543.3994140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3140563964844, "rewards/margins": 0.0, "rewards/rejected": -351.3140563964844, "step": 1415 }, { "epoch": 14.905263157894737, "grad_norm": 5.2062227950955275e-06, "learning_rate": 0.00017035789473684212, "logits/chosen": 13.2350435256958, "logits/rejected": 13.2350435256958, "logps/chosen": -4879.0400390625, "logps/rejected": -4879.0400390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.0568542480469, "rewards/margins": 0.0, "rewards/rejected": -485.0568542480469, "step": 1416 }, { "epoch": 14.91578947368421, "grad_norm": 1.4766152389711351e-06, "learning_rate": 0.00017033684210526317, "logits/chosen": 13.200243949890137, "logits/rejected": 13.200243949890137, "logps/chosen": -4326.447265625, "logps/rejected": -4326.447265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.4598693847656, "rewards/margins": 0.0, "rewards/rejected": -429.4598693847656, "step": 1417 }, { "epoch": 14.926315789473684, "grad_norm": 3.0300182061182568e-06, "learning_rate": 0.00017031578947368422, "logits/chosen": 13.221475601196289, "logits/rejected": 13.221475601196289, "logps/chosen": -4879.955078125, "logps/rejected": -4879.955078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.1483459472656, "rewards/margins": 0.0, "rewards/rejected": -485.1483459472656, "step": 1418 }, { "epoch": 14.936842105263159, "grad_norm": 9.390932405040076e-07, "learning_rate": 0.00017029473684210527, "logits/chosen": 13.138678550720215, "logits/rejected": 13.138678550720215, "logps/chosen": -2674.5126953125, "logps/rejected": -2674.5126953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.669677734375, "rewards/margins": 0.0, "rewards/rejected": -264.669677734375, "step": 1419 }, { "epoch": 14.947368421052632, "grad_norm": 1.936387889145408e-06, "learning_rate": 0.00017027368421052632, "logits/chosen": 13.155964851379395, "logits/rejected": 13.155964851379395, "logps/chosen": -3542.1923828125, "logps/rejected": -3542.1923828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.193359375, "rewards/margins": 0.0, "rewards/rejected": -351.193359375, "step": 1420 }, { "epoch": 14.957894736842105, "grad_norm": 1.9245119347033324e-06, "learning_rate": 0.00017025263157894737, "logits/chosen": 13.207898139953613, "logits/rejected": 13.207898139953613, "logps/chosen": -4880.283203125, "logps/rejected": -4880.283203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.18115234375, "rewards/margins": 0.0, "rewards/rejected": -485.18115234375, "step": 1421 }, { "epoch": 14.968421052631578, "grad_norm": 2.437748662487138e-06, "learning_rate": 0.00017023157894736844, "logits/chosen": 13.208243370056152, "logits/rejected": 13.208243370056152, "logps/chosen": -4880.294921875, "logps/rejected": -4880.294921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.1823425292969, "rewards/margins": 0.0, "rewards/rejected": -485.1823425292969, "step": 1422 }, { "epoch": 14.978947368421053, "grad_norm": 2.550890712882392e-06, "learning_rate": 0.0001702105263157895, "logits/chosen": 13.211731910705566, "logits/rejected": 13.211731910705566, "logps/chosen": -4880.76220703125, "logps/rejected": -4880.76220703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.22906494140625, "rewards/margins": 0.0, "rewards/rejected": -485.22906494140625, "step": 1423 }, { "epoch": 14.989473684210527, "grad_norm": 2.1681428279407555e-06, "learning_rate": 0.00017018947368421054, "logits/chosen": 13.151742935180664, "logits/rejected": 13.151742935180664, "logps/chosen": -3997.357421875, "logps/rejected": -3997.357421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9084167480469, "rewards/margins": 0.0, "rewards/rejected": -396.9084167480469, "step": 1424 }, { "epoch": 15.0, "grad_norm": 1.658588416830753e-06, "learning_rate": 0.00017016842105263156, "logits/chosen": 13.15597915649414, "logits/rejected": 13.15597915649414, "logps/chosen": -3997.861328125, "logps/rejected": -3997.861328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9588317871094, "rewards/margins": 0.0, "rewards/rejected": -396.9588317871094, "step": 1425 }, { "epoch": 15.010526315789473, "grad_norm": 1.2933398920722539e-06, "learning_rate": 0.00017014736842105264, "logits/chosen": 13.156377792358398, "logits/rejected": 13.156377792358398, "logps/chosen": -3998.240234375, "logps/rejected": -3998.240234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9967041015625, "rewards/margins": 0.0, "rewards/rejected": -396.9967041015625, "step": 1426 }, { "epoch": 15.021052631578947, "grad_norm": 9.10378957996727e-07, "learning_rate": 0.0001701263157894737, "logits/chosen": 13.14017391204834, "logits/rejected": 13.14017391204834, "logps/chosen": -2673.80859375, "logps/rejected": -2673.80859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.5992736816406, "rewards/margins": 0.0, "rewards/rejected": -264.5992736816406, "step": 1427 }, { "epoch": 15.031578947368422, "grad_norm": 8.363235224351229e-07, "learning_rate": 0.00017010526315789474, "logits/chosen": 13.135923385620117, "logits/rejected": 13.135923385620117, "logps/chosen": -2673.908203125, "logps/rejected": -2673.908203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.6092224121094, "rewards/margins": 0.0, "rewards/rejected": -264.6092224121094, "step": 1428 }, { "epoch": 15.042105263157895, "grad_norm": 1.8444816305418499e-06, "learning_rate": 0.0001700842105263158, "logits/chosen": 13.142634391784668, "logits/rejected": 13.142634391784668, "logps/chosen": -3999.39453125, "logps/rejected": -3999.39453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1121520996094, "rewards/margins": 0.0, "rewards/rejected": -397.1121520996094, "step": 1429 }, { "epoch": 15.052631578947368, "grad_norm": 8.300196441268781e-07, "learning_rate": 0.00017006315789473686, "logits/chosen": 13.121553421020508, "logits/rejected": 13.121553421020508, "logps/chosen": -2674.248046875, "logps/rejected": -2674.248046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.6432189941406, "rewards/margins": 0.0, "rewards/rejected": -264.6432189941406, "step": 1430 }, { "epoch": 15.063157894736841, "grad_norm": 3.89398519473616e-06, "learning_rate": 0.00017004210526315791, "logits/chosen": 13.18561840057373, "logits/rejected": 13.18561840057373, "logps/chosen": -4882.1416015625, "logps/rejected": -4882.1416015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.36700439453125, "rewards/margins": 0.0, "rewards/rejected": -485.36700439453125, "step": 1431 }, { "epoch": 15.073684210526316, "grad_norm": 9.39815095080121e-07, "learning_rate": 0.00017002105263157894, "logits/chosen": 13.12387466430664, "logits/rejected": 13.12387466430664, "logps/chosen": -3541.009765625, "logps/rejected": -3541.009765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.0751037597656, "rewards/margins": 0.0, "rewards/rejected": -351.0751037597656, "step": 1432 }, { "epoch": 15.08421052631579, "grad_norm": 1.432331600881298e-06, "learning_rate": 0.00017, "logits/chosen": 13.124300003051758, "logits/rejected": 13.124300003051758, "logps/chosen": -4290.08984375, "logps/rejected": -4290.08984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.2116394042969, "rewards/margins": 0.0, "rewards/rejected": -426.2116394042969, "step": 1433 }, { "epoch": 15.094736842105263, "grad_norm": 1.4277686659625033e-06, "learning_rate": 0.00016997894736842106, "logits/chosen": 13.105451583862305, "logits/rejected": 13.105451583862305, "logps/chosen": -3757.8203125, "logps/rejected": -3757.8203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.8637390136719, "rewards/margins": 0.0, "rewards/rejected": -372.8637390136719, "step": 1434 }, { "epoch": 15.105263157894736, "grad_norm": 1.6716501249902649e-06, "learning_rate": 0.0001699578947368421, "logits/chosen": 13.164885520935059, "logits/rejected": 13.164885520935059, "logps/chosen": -5173.11865234375, "logps/rejected": -5173.11865234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.3734741210938, "rewards/margins": 0.0, "rewards/rejected": -514.3734741210938, "step": 1435 }, { "epoch": 15.115789473684211, "grad_norm": 1.701274868537439e-06, "learning_rate": 0.00016993684210526316, "logits/chosen": 13.162287712097168, "logits/rejected": 13.162287712097168, "logps/chosen": -5173.2021484375, "logps/rejected": -5173.2021484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.3817749023438, "rewards/margins": 0.0, "rewards/rejected": -514.3817749023438, "step": 1436 }, { "epoch": 15.126315789473685, "grad_norm": 2.2852630081615644e-06, "learning_rate": 0.00016991578947368424, "logits/chosen": 13.1080904006958, "logits/rejected": 13.1080904006958, "logps/chosen": -4289.6025390625, "logps/rejected": -4289.6025390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.16290283203125, "rewards/margins": 0.0, "rewards/rejected": -426.16290283203125, "step": 1437 }, { "epoch": 15.136842105263158, "grad_norm": 2.016254939007922e-06, "learning_rate": 0.00016989473684210526, "logits/chosen": 13.092247009277344, "logits/rejected": 13.092247009277344, "logps/chosen": -4000.6171875, "logps/rejected": -4000.6171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.2344055175781, "rewards/margins": 0.0, "rewards/rejected": -397.2344055175781, "step": 1438 }, { "epoch": 15.147368421052631, "grad_norm": 1.459816530768876e-06, "learning_rate": 0.0001698736842105263, "logits/chosen": 13.082552909851074, "logits/rejected": 13.082552909851074, "logps/chosen": -2674.099609375, "logps/rejected": -2674.099609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.6283874511719, "rewards/margins": 0.0, "rewards/rejected": -264.6283874511719, "step": 1439 }, { "epoch": 15.157894736842104, "grad_norm": 1.1665380270642345e-06, "learning_rate": 0.00016985263157894738, "logits/chosen": 13.103185653686523, "logits/rejected": 13.103185653686523, "logps/chosen": -4001.138671875, "logps/rejected": -4001.138671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.28656005859375, "rewards/margins": 0.0, "rewards/rejected": -397.28656005859375, "step": 1440 }, { "epoch": 15.16842105263158, "grad_norm": 2.090728685288923e-06, "learning_rate": 0.00016983157894736843, "logits/chosen": 13.180010795593262, "logits/rejected": 13.180010795593262, "logps/chosen": -5173.89599609375, "logps/rejected": -5173.89599609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.451171875, "rewards/margins": 0.0, "rewards/rejected": -514.451171875, "step": 1441 }, { "epoch": 15.178947368421053, "grad_norm": 1.0571292250460829e-06, "learning_rate": 0.00016981052631578948, "logits/chosen": 13.112493515014648, "logits/rejected": 13.112493515014648, "logps/chosen": -4001.369140625, "logps/rejected": -4001.369140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.3096008300781, "rewards/margins": 0.0, "rewards/rejected": -397.3096008300781, "step": 1442 }, { "epoch": 15.189473684210526, "grad_norm": 1.5526258039244567e-06, "learning_rate": 0.00016978947368421053, "logits/chosen": 13.149299621582031, "logits/rejected": 13.149299621582031, "logps/chosen": -4324.1875, "logps/rejected": -4324.1875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.23388671875, "rewards/margins": 0.0, "rewards/rejected": -429.23388671875, "step": 1443 }, { "epoch": 15.2, "grad_norm": 1.6236004967140616e-06, "learning_rate": 0.00016976842105263158, "logits/chosen": 13.119844436645508, "logits/rejected": 13.119844436645508, "logps/chosen": -4001.435546875, "logps/rejected": -4001.435546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.3162536621094, "rewards/margins": 0.0, "rewards/rejected": -397.3162536621094, "step": 1444 }, { "epoch": 15.210526315789474, "grad_norm": 9.579836159900879e-07, "learning_rate": 0.00016974736842105263, "logits/chosen": 13.129072189331055, "logits/rejected": 13.129072189331055, "logps/chosen": -3541.03515625, "logps/rejected": -3541.03515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.07763671875, "rewards/margins": 0.0, "rewards/rejected": -351.07763671875, "step": 1445 }, { "epoch": 15.221052631578948, "grad_norm": 1.7672175545158098e-06, "learning_rate": 0.00016972631578947368, "logits/chosen": 13.120251655578613, "logits/rejected": 13.120251655578613, "logps/chosen": -4001.60546875, "logps/rejected": -4001.60546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.3332214355469, "rewards/margins": 0.0, "rewards/rejected": -397.3332214355469, "step": 1446 }, { "epoch": 15.23157894736842, "grad_norm": 1.3236641507319291e-06, "learning_rate": 0.00016970526315789476, "logits/chosen": 13.14944076538086, "logits/rejected": 13.14944076538086, "logps/chosen": -4324.9453125, "logps/rejected": -4324.9453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.3096618652344, "rewards/margins": 0.0, "rewards/rejected": -429.3096618652344, "step": 1447 }, { "epoch": 15.242105263157894, "grad_norm": 1.3325736745173344e-06, "learning_rate": 0.0001696842105263158, "logits/chosen": 13.112533569335938, "logits/rejected": 13.112533569335938, "logps/chosen": -4002.443359375, "logps/rejected": -4002.443359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.4170227050781, "rewards/margins": 0.0, "rewards/rejected": -397.4170227050781, "step": 1448 }, { "epoch": 15.25263157894737, "grad_norm": 1.0645068186931894e-06, "learning_rate": 0.00016966315789473685, "logits/chosen": 13.113541603088379, "logits/rejected": 13.113541603088379, "logps/chosen": -3758.01171875, "logps/rejected": -3758.01171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.88287353515625, "rewards/margins": 0.0, "rewards/rejected": -372.88287353515625, "step": 1449 }, { "epoch": 15.263157894736842, "grad_norm": 1.0816390840773238e-06, "learning_rate": 0.0001696421052631579, "logits/chosen": 13.101728439331055, "logits/rejected": 13.101728439331055, "logps/chosen": -2967.015625, "logps/rejected": -2967.015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.9753723144531, "rewards/margins": 0.0, "rewards/rejected": -293.9753723144531, "step": 1450 }, { "epoch": 15.263157894736842, "eval_logits/chosen": 13.13160514831543, "eval_logits/rejected": 13.13160514831543, "eval_logps/chosen": -4309.7705078125, "eval_logps/rejected": -4309.7705078125, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -428.07391357421875, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -428.07391357421875, "eval_runtime": 4.2283, "eval_samples_per_second": 2.365, "eval_steps_per_second": 2.365, "step": 1450 }, { "epoch": 15.273684210526316, "grad_norm": 1.2565507176987012e-06, "learning_rate": 0.00016962105263157895, "logits/chosen": 13.11225700378418, "logits/rejected": 13.11225700378418, "logps/chosen": -4289.29296875, "logps/rejected": -4289.29296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.1319274902344, "rewards/margins": 0.0, "rewards/rejected": -426.1319274902344, "step": 1451 }, { "epoch": 15.284210526315789, "grad_norm": 1.4384738733497215e-06, "learning_rate": 0.0001696, "logits/chosen": 13.09652328491211, "logits/rejected": 13.09652328491211, "logps/chosen": -3541.744140625, "logps/rejected": -3541.744140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.1485290527344, "rewards/margins": 0.0, "rewards/rejected": -351.1485290527344, "step": 1452 }, { "epoch": 15.294736842105262, "grad_norm": 1.4498164091492072e-06, "learning_rate": 0.00016957894736842105, "logits/chosen": 13.093693733215332, "logits/rejected": 13.093693733215332, "logps/chosen": -3758.248046875, "logps/rejected": -3758.248046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.906494140625, "rewards/margins": 0.0, "rewards/rejected": -372.906494140625, "step": 1453 }, { "epoch": 15.305263157894737, "grad_norm": 1.8467335394234397e-06, "learning_rate": 0.00016955789473684213, "logits/chosen": 13.094812393188477, "logits/rejected": 13.094812393188477, "logps/chosen": -3775.763671875, "logps/rejected": -3775.763671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.7072448730469, "rewards/margins": 0.0, "rewards/rejected": -374.7072448730469, "step": 1454 }, { "epoch": 15.31578947368421, "grad_norm": 2.647607288963627e-06, "learning_rate": 0.00016953684210526318, "logits/chosen": 13.159708023071289, "logits/rejected": 13.159708023071289, "logps/chosen": -5174.69873046875, "logps/rejected": -5174.69873046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.5314331054688, "rewards/margins": 0.0, "rewards/rejected": -514.5314331054688, "step": 1455 }, { "epoch": 15.326315789473684, "grad_norm": 1.782884623935388e-06, "learning_rate": 0.00016951578947368423, "logits/chosen": 13.152277946472168, "logits/rejected": 13.152277946472168, "logps/chosen": -4878.61669921875, "logps/rejected": -4878.61669921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.0144958496094, "rewards/margins": 0.0, "rewards/rejected": -485.0144958496094, "step": 1456 }, { "epoch": 15.336842105263157, "grad_norm": 1.2863453093814314e-06, "learning_rate": 0.00016949473684210525, "logits/chosen": 13.116669654846191, "logits/rejected": 13.116669654846191, "logps/chosen": -4289.73583984375, "logps/rejected": -4289.73583984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.1762390136719, "rewards/margins": 0.0, "rewards/rejected": -426.1762390136719, "step": 1457 }, { "epoch": 15.347368421052632, "grad_norm": 1.8738066955847898e-06, "learning_rate": 0.00016947368421052633, "logits/chosen": 13.173236846923828, "logits/rejected": 13.173236846923828, "logps/chosen": -5175.06787109375, "logps/rejected": -5175.06787109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.568359375, "rewards/margins": 0.0, "rewards/rejected": -514.568359375, "step": 1458 }, { "epoch": 15.357894736842105, "grad_norm": 1.9695919490914093e-06, "learning_rate": 0.00016945263157894737, "logits/chosen": 13.11767864227295, "logits/rejected": 13.11767864227295, "logps/chosen": -3776.349609375, "logps/rejected": -3776.349609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.7658386230469, "rewards/margins": 0.0, "rewards/rejected": -374.7658386230469, "step": 1459 }, { "epoch": 15.368421052631579, "grad_norm": 1.5616370774296229e-06, "learning_rate": 0.00016943157894736842, "logits/chosen": 13.127367973327637, "logits/rejected": 13.127367973327637, "logps/chosen": -3543.095703125, "logps/rejected": -3543.095703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.28369140625, "rewards/margins": 0.0, "rewards/rejected": -351.28369140625, "step": 1460 }, { "epoch": 15.378947368421052, "grad_norm": 1.3086751096125226e-06, "learning_rate": 0.00016941052631578947, "logits/chosen": 13.191308975219727, "logits/rejected": 13.191308975219727, "logps/chosen": -4879.3369140625, "logps/rejected": -4879.3369140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.0865173339844, "rewards/margins": 0.0, "rewards/rejected": -485.0865173339844, "step": 1461 }, { "epoch": 15.389473684210527, "grad_norm": 1.0606236173771322e-06, "learning_rate": 0.00016938947368421055, "logits/chosen": 13.142932891845703, "logits/rejected": 13.142932891845703, "logps/chosen": -4000.73046875, "logps/rejected": -4000.73046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.2457275390625, "rewards/margins": 0.0, "rewards/rejected": -397.2457275390625, "step": 1462 }, { "epoch": 15.4, "grad_norm": 9.446542890145793e-07, "learning_rate": 0.00016936842105263157, "logits/chosen": 13.138042449951172, "logits/rejected": 13.138042449951172, "logps/chosen": -2673.4072265625, "logps/rejected": -2673.4072265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.55914306640625, "rewards/margins": 0.0, "rewards/rejected": -264.55914306640625, "step": 1463 }, { "epoch": 15.410526315789474, "grad_norm": 2.5797266971494537e-06, "learning_rate": 0.00016934736842105262, "logits/chosen": 13.229494094848633, "logits/rejected": 13.229494094848633, "logps/chosen": -5176.02880859375, "logps/rejected": -5176.02880859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6644897460938, "rewards/margins": 0.0, "rewards/rejected": -514.6644897460938, "step": 1464 }, { "epoch": 15.421052631578947, "grad_norm": 1.0844455573533196e-06, "learning_rate": 0.0001693263157894737, "logits/chosen": 13.173554420471191, "logits/rejected": 13.173554420471191, "logps/chosen": -3759.541015625, "logps/rejected": -3759.541015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -373.0357971191406, "rewards/margins": 0.0, "rewards/rejected": -373.0357971191406, "step": 1465 }, { "epoch": 15.431578947368422, "grad_norm": 1.138741708928137e-06, "learning_rate": 0.00016930526315789475, "logits/chosen": 13.177787780761719, "logits/rejected": 13.177787780761719, "logps/chosen": -3759.552734375, "logps/rejected": -3759.552734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -373.0369567871094, "rewards/margins": 0.0, "rewards/rejected": -373.0369567871094, "step": 1466 }, { "epoch": 15.442105263157895, "grad_norm": 2.0837719603150617e-06, "learning_rate": 0.0001692842105263158, "logits/chosen": 13.241813659667969, "logits/rejected": 13.241813659667969, "logps/chosen": -5176.51611328125, "logps/rejected": -5176.51611328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.7131958007812, "rewards/margins": 0.0, "rewards/rejected": -514.7131958007812, "step": 1467 }, { "epoch": 15.452631578947368, "grad_norm": 1.546308453725942e-06, "learning_rate": 0.00016926315789473684, "logits/chosen": 13.185162544250488, "logits/rejected": 13.185162544250488, "logps/chosen": -3778.318359375, "logps/rejected": -3778.318359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.9627380371094, "rewards/margins": 0.0, "rewards/rejected": -374.9627380371094, "step": 1468 }, { "epoch": 15.463157894736842, "grad_norm": 1.0050520131699159e-06, "learning_rate": 0.00016924210526315792, "logits/chosen": 13.16478443145752, "logits/rejected": 13.16478443145752, "logps/chosen": -2673.3095703125, "logps/rejected": -2673.3095703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.54937744140625, "rewards/margins": 0.0, "rewards/rejected": -264.54937744140625, "step": 1469 }, { "epoch": 15.473684210526315, "grad_norm": 1.3186632941142307e-06, "learning_rate": 0.00016922105263157894, "logits/chosen": 13.191641807556152, "logits/rejected": 13.191641807556152, "logps/chosen": -3543.771484375, "logps/rejected": -3543.771484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3512878417969, "rewards/margins": 0.0, "rewards/rejected": -351.3512878417969, "step": 1470 }, { "epoch": 15.48421052631579, "grad_norm": 1.375670763081871e-06, "learning_rate": 0.0001692, "logits/chosen": 13.245780944824219, "logits/rejected": 13.245780944824219, "logps/chosen": -4880.0185546875, "logps/rejected": -4880.0185546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.1546936035156, "rewards/margins": 0.0, "rewards/rejected": -485.1546936035156, "step": 1471 }, { "epoch": 15.494736842105263, "grad_norm": 9.17223019314406e-07, "learning_rate": 0.00016917894736842107, "logits/chosen": 13.180889129638672, "logits/rejected": 13.180889129638672, "logps/chosen": -2673.4375, "logps/rejected": -2673.4375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.5621643066406, "rewards/margins": 0.0, "rewards/rejected": -264.5621643066406, "step": 1472 }, { "epoch": 15.505263157894737, "grad_norm": 9.388008948008064e-07, "learning_rate": 0.00016915789473684212, "logits/chosen": 13.20954704284668, "logits/rejected": 13.20954704284668, "logps/chosen": -3544.189453125, "logps/rejected": -3544.189453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.39306640625, "rewards/margins": 0.0, "rewards/rejected": -351.39306640625, "step": 1473 }, { "epoch": 15.51578947368421, "grad_norm": 8.480238875563373e-07, "learning_rate": 0.00016913684210526317, "logits/chosen": 13.216889381408691, "logits/rejected": 13.216889381408691, "logps/chosen": -3544.5859375, "logps/rejected": -3544.5859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.4327087402344, "rewards/margins": 0.0, "rewards/rejected": -351.4327087402344, "step": 1474 }, { "epoch": 15.526315789473685, "grad_norm": 8.220295057981275e-07, "learning_rate": 0.00016911578947368422, "logits/chosen": 13.203375816345215, "logits/rejected": 13.203375816345215, "logps/chosen": -2674.0849609375, "logps/rejected": -2674.0849609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.6269226074219, "rewards/margins": 0.0, "rewards/rejected": -264.6269226074219, "step": 1475 }, { "epoch": 15.536842105263158, "grad_norm": 7.893989391050127e-07, "learning_rate": 0.00016909473684210527, "logits/chosen": 13.233211517333984, "logits/rejected": 13.233211517333984, "logps/chosen": -3544.9033203125, "logps/rejected": -3544.9033203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.4644470214844, "rewards/margins": 0.0, "rewards/rejected": -351.4644470214844, "step": 1476 }, { "epoch": 15.547368421052632, "grad_norm": 2.275001406815136e-06, "learning_rate": 0.00016907368421052632, "logits/chosen": 13.230999946594238, "logits/rejected": 13.230999946594238, "logps/chosen": -3996.65234375, "logps/rejected": -3996.65234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.8379211425781, "rewards/margins": 0.0, "rewards/rejected": -396.8379211425781, "step": 1477 }, { "epoch": 15.557894736842105, "grad_norm": 8.026908631109109e-07, "learning_rate": 0.00016905263157894736, "logits/chosen": 13.231245040893555, "logits/rejected": 13.231245040893555, "logps/chosen": -2968.416015625, "logps/rejected": -2968.416015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.11541748046875, "rewards/margins": 0.0, "rewards/rejected": -294.11541748046875, "step": 1478 }, { "epoch": 15.568421052631578, "grad_norm": 7.65748609410366e-07, "learning_rate": 0.00016903157894736844, "logits/chosen": 13.237869262695312, "logits/rejected": 13.237869262695312, "logps/chosen": -3545.455078125, "logps/rejected": -3545.455078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.5196228027344, "rewards/margins": 0.0, "rewards/rejected": -351.5196228027344, "step": 1479 }, { "epoch": 15.578947368421053, "grad_norm": 1.9207375316909747e-06, "learning_rate": 0.0001690105263157895, "logits/chosen": 13.28357982635498, "logits/rejected": 13.28357982635498, "logps/chosen": -4879.654296875, "logps/rejected": -4879.654296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.1182556152344, "rewards/margins": 0.0, "rewards/rejected": -485.1182556152344, "step": 1480 }, { "epoch": 15.589473684210526, "grad_norm": 8.145337915266282e-07, "learning_rate": 0.00016898947368421054, "logits/chosen": 13.211501121520996, "logits/rejected": 13.211501121520996, "logps/chosen": -2674.8994140625, "logps/rejected": -2674.8994140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.7083435058594, "rewards/margins": 0.0, "rewards/rejected": -264.7083435058594, "step": 1481 }, { "epoch": 15.6, "grad_norm": 1.0985201015500934e-06, "learning_rate": 0.0001689684210526316, "logits/chosen": 13.23165225982666, "logits/rejected": 13.23165225982666, "logps/chosen": -3545.8486328125, "logps/rejected": -3545.8486328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.5589904785156, "rewards/margins": 0.0, "rewards/rejected": -351.5589904785156, "step": 1482 }, { "epoch": 15.610526315789473, "grad_norm": 1.3010347856834414e-06, "learning_rate": 0.00016894736842105264, "logits/chosen": 13.229446411132812, "logits/rejected": 13.229446411132812, "logps/chosen": -3759.5693359375, "logps/rejected": -3759.5693359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -373.03863525390625, "rewards/margins": 0.0, "rewards/rejected": -373.03863525390625, "step": 1483 }, { "epoch": 15.621052631578948, "grad_norm": 1.4617021406593267e-06, "learning_rate": 0.0001689263157894737, "logits/chosen": 13.28757381439209, "logits/rejected": 13.28757381439209, "logps/chosen": -5177.04248046875, "logps/rejected": -5177.04248046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.7658081054688, "rewards/margins": 0.0, "rewards/rejected": -514.7658081054688, "step": 1484 }, { "epoch": 15.631578947368421, "grad_norm": 1.2254095054231584e-06, "learning_rate": 0.00016890526315789474, "logits/chosen": 13.277099609375, "logits/rejected": 13.277099609375, "logps/chosen": -4879.13427734375, "logps/rejected": -4879.13427734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.0662536621094, "rewards/margins": 0.0, "rewards/rejected": -485.0662536621094, "step": 1485 }, { "epoch": 15.642105263157895, "grad_norm": 1.277786623177235e-06, "learning_rate": 0.0001688842105263158, "logits/chosen": 13.277246475219727, "logits/rejected": 13.277246475219727, "logps/chosen": -4879.3466796875, "logps/rejected": -4879.3466796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.0874938964844, "rewards/margins": 0.0, "rewards/rejected": -485.0874938964844, "step": 1486 }, { "epoch": 15.652631578947368, "grad_norm": 1.4210424978955416e-06, "learning_rate": 0.00016886315789473686, "logits/chosen": 13.219776153564453, "logits/rejected": 13.219776153564453, "logps/chosen": -2968.2109375, "logps/rejected": -2968.2109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.09490966796875, "rewards/margins": 0.0, "rewards/rejected": -294.09490966796875, "step": 1487 }, { "epoch": 15.663157894736843, "grad_norm": 1.5450937098648865e-06, "learning_rate": 0.0001688421052631579, "logits/chosen": 13.233162879943848, "logits/rejected": 13.233162879943848, "logps/chosen": -3778.689453125, "logps/rejected": -3778.689453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.9998474121094, "rewards/margins": 0.0, "rewards/rejected": -374.9998474121094, "step": 1488 }, { "epoch": 15.673684210526316, "grad_norm": 1.0246831152471714e-06, "learning_rate": 0.00016882105263157893, "logits/chosen": 13.235774040222168, "logits/rejected": 13.235774040222168, "logps/chosen": -3546.1328125, "logps/rejected": -3546.1328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.58740234375, "rewards/margins": 0.0, "rewards/rejected": -351.58740234375, "step": 1489 }, { "epoch": 15.68421052631579, "grad_norm": 8.195901841645536e-07, "learning_rate": 0.0001688, "logits/chosen": 13.241355895996094, "logits/rejected": 13.241355895996094, "logps/chosen": -3546.193359375, "logps/rejected": -3546.193359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.5934753417969, "rewards/margins": 0.0, "rewards/rejected": -351.5934753417969, "step": 1490 }, { "epoch": 15.694736842105263, "grad_norm": 3.489791424726718e-06, "learning_rate": 0.00016877894736842106, "logits/chosen": 13.30412769317627, "logits/rejected": 13.30412769317627, "logps/chosen": -5176.5517578125, "logps/rejected": -5176.5517578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.7167358398438, "rewards/margins": 0.0, "rewards/rejected": -514.7167358398438, "step": 1491 }, { "epoch": 15.705263157894738, "grad_norm": 1.3516136050384375e-06, "learning_rate": 0.0001687578947368421, "logits/chosen": 13.275618553161621, "logits/rejected": 13.275618553161621, "logps/chosen": -4326.9931640625, "logps/rejected": -4326.9931640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.51446533203125, "rewards/margins": 0.0, "rewards/rejected": -429.51446533203125, "step": 1492 }, { "epoch": 15.715789473684211, "grad_norm": 1.6686075241523213e-06, "learning_rate": 0.00016873684210526316, "logits/chosen": 13.245379447937012, "logits/rejected": 13.245379447937012, "logps/chosen": -3995.63671875, "logps/rejected": -3995.63671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.7363586425781, "rewards/margins": 0.0, "rewards/rejected": -396.7363586425781, "step": 1493 }, { "epoch": 15.726315789473684, "grad_norm": 2.5728875243657967e-06, "learning_rate": 0.00016871578947368423, "logits/chosen": 13.30986213684082, "logits/rejected": 13.30986213684082, "logps/chosen": -5177.07421875, "logps/rejected": -5177.07421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.7689819335938, "rewards/margins": 0.0, "rewards/rejected": -514.7689819335938, "step": 1494 }, { "epoch": 15.736842105263158, "grad_norm": 1.4438797961702221e-06, "learning_rate": 0.00016869473684210526, "logits/chosen": 13.261364936828613, "logits/rejected": 13.261364936828613, "logps/chosen": -4287.255859375, "logps/rejected": -4287.255859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.92822265625, "rewards/margins": 0.0, "rewards/rejected": -425.92822265625, "step": 1495 }, { "epoch": 15.74736842105263, "grad_norm": 1.357111841571168e-06, "learning_rate": 0.0001686736842105263, "logits/chosen": 13.235664367675781, "logits/rejected": 13.235664367675781, "logps/chosen": -3995.97265625, "logps/rejected": -3995.97265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.76995849609375, "rewards/margins": 0.0, "rewards/rejected": -396.76995849609375, "step": 1496 }, { "epoch": 15.757894736842106, "grad_norm": 1.9852880086546065e-06, "learning_rate": 0.00016865263157894738, "logits/chosen": 13.28403091430664, "logits/rejected": 13.28403091430664, "logps/chosen": -4880.365234375, "logps/rejected": -4880.365234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.1893615722656, "rewards/margins": 0.0, "rewards/rejected": -485.1893615722656, "step": 1497 }, { "epoch": 15.76842105263158, "grad_norm": 1.8101868590747472e-06, "learning_rate": 0.00016863157894736843, "logits/chosen": 13.219038009643555, "logits/rejected": 13.219038009643555, "logps/chosen": -3996.298828125, "logps/rejected": -3996.298828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.8025817871094, "rewards/margins": 0.0, "rewards/rejected": -396.8025817871094, "step": 1498 }, { "epoch": 15.778947368421052, "grad_norm": 2.113829168592929e-06, "learning_rate": 0.00016861052631578948, "logits/chosen": 13.209192276000977, "logits/rejected": 13.209192276000977, "logps/chosen": -2967.923828125, "logps/rejected": -2967.923828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.0661926269531, "rewards/margins": 0.0, "rewards/rejected": -294.0661926269531, "step": 1499 }, { "epoch": 15.789473684210526, "grad_norm": 1.437405785509327e-06, "learning_rate": 0.00016858947368421053, "logits/chosen": 13.205979347229004, "logits/rejected": 13.205979347229004, "logps/chosen": -3997.12890625, "logps/rejected": -3997.12890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.8855895996094, "rewards/margins": 0.0, "rewards/rejected": -396.8855895996094, "step": 1500 }, { "epoch": 15.789473684210526, "eval_logits/chosen": 13.236076354980469, "eval_logits/rejected": 13.236076354980469, "eval_logps/chosen": -4311.6533203125, "eval_logps/rejected": -4311.6533203125, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -428.26220703125, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -428.26220703125, "eval_runtime": 4.5412, "eval_samples_per_second": 2.202, "eval_steps_per_second": 2.202, "step": 1500 }, { "epoch": 15.8, "grad_norm": 1.4228317013476044e-06, "learning_rate": 0.0001685684210526316, "logits/chosen": 13.210309982299805, "logits/rejected": 13.210309982299805, "logps/chosen": -3545.455078125, "logps/rejected": -3545.455078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.5196228027344, "rewards/margins": 0.0, "rewards/rejected": -351.5196228027344, "step": 1501 }, { "epoch": 15.810526315789474, "grad_norm": 1.2206730843900004e-06, "learning_rate": 0.00016854736842105263, "logits/chosen": 13.196483612060547, "logits/rejected": 13.196483612060547, "logps/chosen": -3998.130859375, "logps/rejected": -3998.130859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.98577880859375, "rewards/margins": 0.0, "rewards/rejected": -396.98577880859375, "step": 1502 }, { "epoch": 15.821052631578947, "grad_norm": 1.321790136898926e-06, "learning_rate": 0.00016852631578947368, "logits/chosen": 13.20278549194336, "logits/rejected": 13.20278549194336, "logps/chosen": -3778.0625, "logps/rejected": -3778.0625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.9371337890625, "rewards/margins": 0.0, "rewards/rejected": -374.9371337890625, "step": 1503 }, { "epoch": 15.83157894736842, "grad_norm": 8.005339964256564e-07, "learning_rate": 0.00016850526315789475, "logits/chosen": 13.18626880645752, "logits/rejected": 13.18626880645752, "logps/chosen": -2968.583984375, "logps/rejected": -2968.583984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.1322021484375, "rewards/margins": 0.0, "rewards/rejected": -294.1322021484375, "step": 1504 }, { "epoch": 15.842105263157894, "grad_norm": 9.207843163494545e-07, "learning_rate": 0.0001684842105263158, "logits/chosen": 13.168615341186523, "logits/rejected": 13.168615341186523, "logps/chosen": -2673.6474609375, "logps/rejected": -2673.6474609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.5831604003906, "rewards/margins": 0.0, "rewards/rejected": -264.5831604003906, "step": 1505 }, { "epoch": 15.852631578947369, "grad_norm": 3.915900833817432e-06, "learning_rate": 0.00016846315789473685, "logits/chosen": 13.228930473327637, "logits/rejected": 13.228930473327637, "logps/chosen": -4879.8671875, "logps/rejected": -4879.8671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.1395568847656, "rewards/margins": 0.0, "rewards/rejected": -485.1395568847656, "step": 1506 }, { "epoch": 15.863157894736842, "grad_norm": 7.730230322522402e-07, "learning_rate": 0.0001684421052631579, "logits/chosen": 13.165517807006836, "logits/rejected": 13.165517807006836, "logps/chosen": -2968.626953125, "logps/rejected": -2968.626953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.1365051269531, "rewards/margins": 0.0, "rewards/rejected": -294.1365051269531, "step": 1507 }, { "epoch": 15.873684210526315, "grad_norm": 1.2821412838093238e-06, "learning_rate": 0.00016842105263157895, "logits/chosen": 13.167649269104004, "logits/rejected": 13.167649269104004, "logps/chosen": -3778.802734375, "logps/rejected": -3778.802734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -375.01116943359375, "rewards/margins": 0.0, "rewards/rejected": -375.01116943359375, "step": 1508 }, { "epoch": 15.884210526315789, "grad_norm": 1.4347260730573907e-06, "learning_rate": 0.0001684, "logits/chosen": 13.203887939453125, "logits/rejected": 13.203887939453125, "logps/chosen": -4880.7763671875, "logps/rejected": -4880.7763671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.23046875, "rewards/margins": 0.0, "rewards/rejected": -485.23046875, "step": 1509 }, { "epoch": 15.894736842105264, "grad_norm": 1.2377859093248844e-06, "learning_rate": 0.00016837894736842105, "logits/chosen": 13.199368476867676, "logits/rejected": 13.199368476867676, "logps/chosen": -4880.734375, "logps/rejected": -4880.734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.2262878417969, "rewards/margins": 0.0, "rewards/rejected": -485.2262878417969, "step": 1510 }, { "epoch": 15.905263157894737, "grad_norm": 1.5752422086734441e-06, "learning_rate": 0.00016835789473684213, "logits/chosen": 13.195993423461914, "logits/rejected": 13.195993423461914, "logps/chosen": -4880.8310546875, "logps/rejected": -4880.8310546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.2359313964844, "rewards/margins": 0.0, "rewards/rejected": -485.2359313964844, "step": 1511 }, { "epoch": 15.91578947368421, "grad_norm": 2.0205939108564053e-06, "learning_rate": 0.00016833684210526318, "logits/chosen": 13.194650650024414, "logits/rejected": 13.194650650024414, "logps/chosen": -4881.19775390625, "logps/rejected": -4881.19775390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.2726135253906, "rewards/margins": 0.0, "rewards/rejected": -485.2726135253906, "step": 1512 }, { "epoch": 15.926315789473684, "grad_norm": 4.246265234542079e-06, "learning_rate": 0.00016831578947368422, "logits/chosen": 13.16921615600586, "logits/rejected": 13.16921615600586, "logps/chosen": -4324.853515625, "logps/rejected": -4324.853515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.3005065917969, "rewards/margins": 0.0, "rewards/rejected": -429.3005065917969, "step": 1513 }, { "epoch": 15.936842105263159, "grad_norm": 1.3122472637405735e-06, "learning_rate": 0.00016829473684210527, "logits/chosen": 13.204113006591797, "logits/rejected": 13.204113006591797, "logps/chosen": -4882.04345703125, "logps/rejected": -4882.04345703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.357177734375, "rewards/margins": 0.0, "rewards/rejected": -485.357177734375, "step": 1514 }, { "epoch": 15.947368421052632, "grad_norm": 1.4962099612603197e-06, "learning_rate": 0.00016827368421052632, "logits/chosen": 13.176742553710938, "logits/rejected": 13.176742553710938, "logps/chosen": -4288.4716796875, "logps/rejected": -4288.4716796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.0498046875, "rewards/margins": 0.0, "rewards/rejected": -426.0498046875, "step": 1515 }, { "epoch": 15.957894736842105, "grad_norm": 2.5174356323987013e-06, "learning_rate": 0.00016825263157894737, "logits/chosen": 13.22823429107666, "logits/rejected": 13.22823429107666, "logps/chosen": -5173.9306640625, "logps/rejected": -5173.9306640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.4546508789062, "rewards/margins": 0.0, "rewards/rejected": -514.4546508789062, "step": 1516 }, { "epoch": 15.968421052631578, "grad_norm": 1.5843913843127666e-06, "learning_rate": 0.00016823157894736842, "logits/chosen": 13.20238208770752, "logits/rejected": 13.20238208770752, "logps/chosen": -4325.05859375, "logps/rejected": -4325.05859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.3210144042969, "rewards/margins": 0.0, "rewards/rejected": -429.3210144042969, "step": 1517 }, { "epoch": 15.978947368421053, "grad_norm": 1.2926406043334282e-06, "learning_rate": 0.0001682105263157895, "logits/chosen": 13.189013481140137, "logits/rejected": 13.189013481140137, "logps/chosen": -3780.138671875, "logps/rejected": -3780.138671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -375.1447448730469, "rewards/margins": 0.0, "rewards/rejected": -375.1447448730469, "step": 1518 }, { "epoch": 15.989473684210527, "grad_norm": 9.168121550828801e-07, "learning_rate": 0.00016818947368421055, "logits/chosen": 13.193624496459961, "logits/rejected": 13.193624496459961, "logps/chosen": -3542.181640625, "logps/rejected": -3542.181640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.1922912597656, "rewards/margins": 0.0, "rewards/rejected": -351.1922912597656, "step": 1519 }, { "epoch": 16.0, "grad_norm": 3.7533593513217056e-06, "learning_rate": 0.0001681684210526316, "logits/chosen": 13.256009101867676, "logits/rejected": 13.256009101867676, "logps/chosen": -5174.01318359375, "logps/rejected": -5174.01318359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.462890625, "rewards/margins": 0.0, "rewards/rejected": -514.462890625, "step": 1520 }, { "epoch": 16.010526315789473, "grad_norm": 9.518960837340273e-07, "learning_rate": 0.00016814736842105262, "logits/chosen": 13.205791473388672, "logits/rejected": 13.205791473388672, "logps/chosen": -3542.416015625, "logps/rejected": -3542.416015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.2157287597656, "rewards/margins": 0.0, "rewards/rejected": -351.2157287597656, "step": 1521 }, { "epoch": 16.021052631578947, "grad_norm": 1.2076296798113617e-06, "learning_rate": 0.0001681263157894737, "logits/chosen": 13.201488494873047, "logits/rejected": 13.201488494873047, "logps/chosen": -3999.0078125, "logps/rejected": -3999.0078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0734558105469, "rewards/margins": 0.0, "rewards/rejected": -397.0734558105469, "step": 1522 }, { "epoch": 16.03157894736842, "grad_norm": 1.3971101679999265e-06, "learning_rate": 0.00016810526315789474, "logits/chosen": 13.22538948059082, "logits/rejected": 13.22538948059082, "logps/chosen": -4288.8740234375, "logps/rejected": -4288.8740234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.0900573730469, "rewards/margins": 0.0, "rewards/rejected": -426.0900573730469, "step": 1523 }, { "epoch": 16.042105263157893, "grad_norm": 1.7650563677307218e-06, "learning_rate": 0.0001680842105263158, "logits/chosen": 13.214360237121582, "logits/rejected": 13.214360237121582, "logps/chosen": -3542.40625, "logps/rejected": -3542.40625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.2147521972656, "rewards/margins": 0.0, "rewards/rejected": -351.2147521972656, "step": 1524 }, { "epoch": 16.05263157894737, "grad_norm": 1.375620968246949e-06, "learning_rate": 0.00016806315789473684, "logits/chosen": 13.206888198852539, "logits/rejected": 13.206888198852539, "logps/chosen": -3998.708984375, "logps/rejected": -3998.708984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0435791015625, "rewards/margins": 0.0, "rewards/rejected": -397.0435791015625, "step": 1525 }, { "epoch": 16.063157894736843, "grad_norm": 1.6022536328819115e-06, "learning_rate": 0.00016804210526315792, "logits/chosen": 13.264443397521973, "logits/rejected": 13.264443397521973, "logps/chosen": -4882.97412109375, "logps/rejected": -4882.97412109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.45025634765625, "rewards/margins": 0.0, "rewards/rejected": -485.45025634765625, "step": 1526 }, { "epoch": 16.073684210526316, "grad_norm": 1.1701546327458345e-06, "learning_rate": 0.00016802105263157894, "logits/chosen": 13.192530632019043, "logits/rejected": 13.192530632019043, "logps/chosen": -2672.140625, "logps/rejected": -2672.140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.4324645996094, "rewards/margins": 0.0, "rewards/rejected": -264.4324645996094, "step": 1527 }, { "epoch": 16.08421052631579, "grad_norm": 1.153848984358774e-06, "learning_rate": 0.000168, "logits/chosen": 13.205900192260742, "logits/rejected": 13.205900192260742, "logps/chosen": -3999.07421875, "logps/rejected": -3999.07421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0801086425781, "rewards/margins": 0.0, "rewards/rejected": -397.0801086425781, "step": 1528 }, { "epoch": 16.094736842105263, "grad_norm": 1.0798538596645813e-06, "learning_rate": 0.00016797894736842107, "logits/chosen": 13.26051139831543, "logits/rejected": 13.26051139831543, "logps/chosen": -4882.5927734375, "logps/rejected": -4882.5927734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.412109375, "rewards/margins": 0.0, "rewards/rejected": -485.412109375, "step": 1529 }, { "epoch": 16.105263157894736, "grad_norm": 1.3760369483861723e-06, "learning_rate": 0.00016795789473684212, "logits/chosen": 13.1987886428833, "logits/rejected": 13.1987886428833, "logps/chosen": -2967.771484375, "logps/rejected": -2967.771484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.05096435546875, "rewards/margins": 0.0, "rewards/rejected": -294.05096435546875, "step": 1530 }, { "epoch": 16.11578947368421, "grad_norm": 1.2300752132432535e-06, "learning_rate": 0.00016793684210526317, "logits/chosen": 13.212223052978516, "logits/rejected": 13.212223052978516, "logps/chosen": -3779.744140625, "logps/rejected": -3779.744140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -375.1053161621094, "rewards/margins": 0.0, "rewards/rejected": -375.1053161621094, "step": 1531 }, { "epoch": 16.126315789473683, "grad_norm": 1.8733865090325708e-06, "learning_rate": 0.00016791578947368421, "logits/chosen": 13.219634056091309, "logits/rejected": 13.219634056091309, "logps/chosen": -4289.3134765625, "logps/rejected": -4289.3134765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.1340026855469, "rewards/margins": 0.0, "rewards/rejected": -426.1340026855469, "step": 1532 }, { "epoch": 16.13684210526316, "grad_norm": 1.3331396075955126e-06, "learning_rate": 0.00016789473684210526, "logits/chosen": 13.207313537597656, "logits/rejected": 13.207313537597656, "logps/chosen": -3779.89453125, "logps/rejected": -3779.89453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -375.1203308105469, "rewards/margins": 0.0, "rewards/rejected": -375.1203308105469, "step": 1533 }, { "epoch": 16.147368421052633, "grad_norm": 1.3025571661273716e-06, "learning_rate": 0.0001678736842105263, "logits/chosen": 13.223761558532715, "logits/rejected": 13.223761558532715, "logps/chosen": -4326.466796875, "logps/rejected": -4326.466796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.4618225097656, "rewards/margins": 0.0, "rewards/rejected": -429.4618225097656, "step": 1534 }, { "epoch": 16.157894736842106, "grad_norm": 8.950647725214367e-07, "learning_rate": 0.00016785263157894736, "logits/chosen": 13.197572708129883, "logits/rejected": 13.197572708129883, "logps/chosen": -3542.939453125, "logps/rejected": -3542.939453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.26806640625, "rewards/margins": 0.0, "rewards/rejected": -351.26806640625, "step": 1535 }, { "epoch": 16.16842105263158, "grad_norm": 1.7535032839077758e-06, "learning_rate": 0.00016783157894736844, "logits/chosen": 13.184588432312012, "logits/rejected": 13.184588432312012, "logps/chosen": -3999.541015625, "logps/rejected": -3999.541015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1268005371094, "rewards/margins": 0.0, "rewards/rejected": -397.1268005371094, "step": 1536 }, { "epoch": 16.178947368421053, "grad_norm": 9.275355523641338e-07, "learning_rate": 0.0001678105263157895, "logits/chosen": 13.188555717468262, "logits/rejected": 13.188555717468262, "logps/chosen": -3543.091796875, "logps/rejected": -3543.091796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.2832946777344, "rewards/margins": 0.0, "rewards/rejected": -351.2832946777344, "step": 1537 }, { "epoch": 16.189473684210526, "grad_norm": 9.93465391729842e-07, "learning_rate": 0.00016778947368421054, "logits/chosen": 13.184117317199707, "logits/rejected": 13.184117317199707, "logps/chosen": -3543.158203125, "logps/rejected": -3543.158203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.2899475097656, "rewards/margins": 0.0, "rewards/rejected": -351.2899475097656, "step": 1538 }, { "epoch": 16.2, "grad_norm": 1.221544380314299e-06, "learning_rate": 0.0001677684210526316, "logits/chosen": 13.18503475189209, "logits/rejected": 13.18503475189209, "logps/chosen": -3781.2265625, "logps/rejected": -3781.2265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -375.2535400390625, "rewards/margins": 0.0, "rewards/rejected": -375.2535400390625, "step": 1539 }, { "epoch": 16.210526315789473, "grad_norm": 1.1523267176016816e-06, "learning_rate": 0.00016774736842105264, "logits/chosen": 13.169259071350098, "logits/rejected": 13.169259071350098, "logps/chosen": -3999.501953125, "logps/rejected": -3999.501953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1228942871094, "rewards/margins": 0.0, "rewards/rejected": -397.1228942871094, "step": 1540 }, { "epoch": 16.221052631578946, "grad_norm": 1.2691651818386163e-06, "learning_rate": 0.00016772631578947369, "logits/chosen": 13.178340911865234, "logits/rejected": 13.178340911865234, "logps/chosen": -3781.39453125, "logps/rejected": -3781.39453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -375.2703552246094, "rewards/margins": 0.0, "rewards/rejected": -375.2703552246094, "step": 1541 }, { "epoch": 16.231578947368423, "grad_norm": 9.442172768103774e-07, "learning_rate": 0.00016770526315789473, "logits/chosen": 13.14732837677002, "logits/rejected": 13.14732837677002, "logps/chosen": -2673.548828125, "logps/rejected": -2673.548828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.57330322265625, "rewards/margins": 0.0, "rewards/rejected": -264.57330322265625, "step": 1542 }, { "epoch": 16.242105263157896, "grad_norm": 1.3646114211951499e-06, "learning_rate": 0.0001676842105263158, "logits/chosen": 13.164592742919922, "logits/rejected": 13.164592742919922, "logps/chosen": -3757.373046875, "logps/rejected": -3757.373046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.8190002441406, "rewards/margins": 0.0, "rewards/rejected": -372.8190002441406, "step": 1543 }, { "epoch": 16.25263157894737, "grad_norm": 2.3008640255284263e-06, "learning_rate": 0.00016766315789473686, "logits/chosen": 13.185821533203125, "logits/rejected": 13.185821533203125, "logps/chosen": -4327.1298828125, "logps/rejected": -4327.1298828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.52813720703125, "rewards/margins": 0.0, "rewards/rejected": -429.52813720703125, "step": 1544 }, { "epoch": 16.263157894736842, "grad_norm": 2.661973439899157e-06, "learning_rate": 0.0001676421052631579, "logits/chosen": 13.220733642578125, "logits/rejected": 13.220733642578125, "logps/chosen": -5172.6591796875, "logps/rejected": -5172.6591796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.3275146484375, "rewards/margins": 0.0, "rewards/rejected": -514.3275146484375, "step": 1545 }, { "epoch": 16.273684210526316, "grad_norm": 1.3652403367814259e-06, "learning_rate": 0.00016762105263157896, "logits/chosen": 13.153447151184082, "logits/rejected": 13.153447151184082, "logps/chosen": -3999.9140625, "logps/rejected": -3999.9140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1640930175781, "rewards/margins": 0.0, "rewards/rejected": -397.1640930175781, "step": 1546 }, { "epoch": 16.28421052631579, "grad_norm": 1.607982198947866e-06, "learning_rate": 0.0001676, "logits/chosen": 13.17337417602539, "logits/rejected": 13.17337417602539, "logps/chosen": -4290.24560546875, "logps/rejected": -4290.24560546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.2272033691406, "rewards/margins": 0.0, "rewards/rejected": -426.2272033691406, "step": 1547 }, { "epoch": 16.294736842105262, "grad_norm": 1.1096141179223196e-06, "learning_rate": 0.00016757894736842106, "logits/chosen": 13.156723976135254, "logits/rejected": 13.156723976135254, "logps/chosen": -3757.8818359375, "logps/rejected": -3757.8818359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.869873046875, "rewards/margins": 0.0, "rewards/rejected": -372.869873046875, "step": 1548 }, { "epoch": 16.305263157894736, "grad_norm": 1.1814041727120639e-06, "learning_rate": 0.0001675578947368421, "logits/chosen": 13.151932716369629, "logits/rejected": 13.151932716369629, "logps/chosen": -3757.9375, "logps/rejected": -3757.9375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.8754577636719, "rewards/margins": 0.0, "rewards/rejected": -372.8754577636719, "step": 1549 }, { "epoch": 16.31578947368421, "grad_norm": 2.057674237221363e-06, "learning_rate": 0.00016753684210526318, "logits/chosen": 13.206674575805664, "logits/rejected": 13.206674575805664, "logps/chosen": -5172.7880859375, "logps/rejected": -5172.7880859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.3403930664062, "rewards/margins": 0.0, "rewards/rejected": -514.3403930664062, "step": 1550 }, { "epoch": 16.31578947368421, "eval_logits/chosen": 13.176271438598633, "eval_logits/rejected": 13.176271438598633, "eval_logps/chosen": -4310.7607421875, "eval_logps/rejected": -4310.7607421875, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -428.1729431152344, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -428.1729431152344, "eval_runtime": 4.2869, "eval_samples_per_second": 2.333, "eval_steps_per_second": 2.333, "step": 1550 }, { "epoch": 16.326315789473686, "grad_norm": 1.1810367368525476e-06, "learning_rate": 0.00016751578947368423, "logits/chosen": 13.15052318572998, "logits/rejected": 13.15052318572998, "logps/chosen": -3544.3505859375, "logps/rejected": -3544.3505859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.4091796875, "rewards/margins": 0.0, "rewards/rejected": -351.4091796875, "step": 1551 }, { "epoch": 16.33684210526316, "grad_norm": 1.806598902476253e-06, "learning_rate": 0.00016749473684210528, "logits/chosen": 13.210683822631836, "logits/rejected": 13.210683822631836, "logps/chosen": -5173.298828125, "logps/rejected": -5173.298828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.3914794921875, "rewards/margins": 0.0, "rewards/rejected": -514.3914794921875, "step": 1552 }, { "epoch": 16.347368421052632, "grad_norm": 1.7224299426743528e-06, "learning_rate": 0.0001674736842105263, "logits/chosen": 13.218097686767578, "logits/rejected": 13.218097686767578, "logps/chosen": -5174.03369140625, "logps/rejected": -5174.03369140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.4649658203125, "rewards/margins": 0.0, "rewards/rejected": -514.4649658203125, "step": 1553 }, { "epoch": 16.357894736842105, "grad_norm": 1.440164396626642e-06, "learning_rate": 0.00016745263157894738, "logits/chosen": 13.159080505371094, "logits/rejected": 13.159080505371094, "logps/chosen": -2968.6787109375, "logps/rejected": -2968.6787109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.1416931152344, "rewards/margins": 0.0, "rewards/rejected": -294.1416931152344, "step": 1554 }, { "epoch": 16.36842105263158, "grad_norm": 1.0518540420889622e-06, "learning_rate": 0.00016743157894736843, "logits/chosen": 13.186163902282715, "logits/rejected": 13.186163902282715, "logps/chosen": -3544.818359375, "logps/rejected": -3544.818359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.4559631347656, "rewards/margins": 0.0, "rewards/rejected": -351.4559631347656, "step": 1555 }, { "epoch": 16.378947368421052, "grad_norm": 8.651535949866229e-07, "learning_rate": 0.00016741052631578948, "logits/chosen": 13.201345443725586, "logits/rejected": 13.201345443725586, "logps/chosen": -3545.4052734375, "logps/rejected": -3545.4052734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.5146484375, "rewards/margins": 0.0, "rewards/rejected": -351.5146484375, "step": 1556 }, { "epoch": 16.389473684210525, "grad_norm": 9.128787041845499e-07, "learning_rate": 0.00016738947368421053, "logits/chosen": 13.204204559326172, "logits/rejected": 13.204204559326172, "logps/chosen": -2969.244140625, "logps/rejected": -2969.244140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.1982116699219, "rewards/margins": 0.0, "rewards/rejected": -294.1982116699219, "step": 1557 }, { "epoch": 16.4, "grad_norm": 2.8977399324503494e-06, "learning_rate": 0.0001673684210526316, "logits/chosen": 13.275843620300293, "logits/rejected": 13.275843620300293, "logps/chosen": -4878.45263671875, "logps/rejected": -4878.45263671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.99810791015625, "rewards/margins": 0.0, "rewards/rejected": -484.99810791015625, "step": 1558 }, { "epoch": 16.410526315789475, "grad_norm": 2.60602064372506e-06, "learning_rate": 0.00016734736842105263, "logits/chosen": 13.284257888793945, "logits/rejected": 13.284257888793945, "logps/chosen": -4878.1826171875, "logps/rejected": -4878.1826171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -484.9710998535156, "rewards/margins": 0.0, "rewards/rejected": -484.9710998535156, "step": 1559 }, { "epoch": 16.42105263157895, "grad_norm": 2.481759565853281e-06, "learning_rate": 0.00016732631578947368, "logits/chosen": 13.297558784484863, "logits/rejected": 13.297558784484863, "logps/chosen": -5177.32373046875, "logps/rejected": -5177.32373046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.7939453125, "rewards/margins": 0.0, "rewards/rejected": -514.7939453125, "step": 1560 }, { "epoch": 16.431578947368422, "grad_norm": 1.0028124961536378e-06, "learning_rate": 0.00016730526315789475, "logits/chosen": 13.22165584564209, "logits/rejected": 13.22165584564209, "logps/chosen": -2673.4052734375, "logps/rejected": -2673.4052734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.5589294433594, "rewards/margins": 0.0, "rewards/rejected": -264.5589294433594, "step": 1561 }, { "epoch": 16.442105263157895, "grad_norm": 1.7059057881851913e-06, "learning_rate": 0.0001672842105263158, "logits/chosen": 13.232525825500488, "logits/rejected": 13.232525825500488, "logps/chosen": -3996.466796875, "logps/rejected": -3996.466796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.8193664550781, "rewards/margins": 0.0, "rewards/rejected": -396.8193664550781, "step": 1562 }, { "epoch": 16.45263157894737, "grad_norm": 2.424517560939421e-06, "learning_rate": 0.00016726315789473685, "logits/chosen": 13.22361946105957, "logits/rejected": 13.22361946105957, "logps/chosen": -2967.9365234375, "logps/rejected": -2967.9365234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.0674743652344, "rewards/margins": 0.0, "rewards/rejected": -294.0674743652344, "step": 1563 }, { "epoch": 16.46315789473684, "grad_norm": 2.439674972265493e-06, "learning_rate": 0.0001672421052631579, "logits/chosen": 13.241548538208008, "logits/rejected": 13.241548538208008, "logps/chosen": -3545.689453125, "logps/rejected": -3545.689453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.5430603027344, "rewards/margins": 0.0, "rewards/rejected": -351.5430603027344, "step": 1564 }, { "epoch": 16.473684210526315, "grad_norm": 1.7806263485908858e-06, "learning_rate": 0.00016722105263157895, "logits/chosen": 13.2506685256958, "logits/rejected": 13.2506685256958, "logps/chosen": -3777.9892578125, "logps/rejected": -3777.9892578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.9298095703125, "rewards/margins": 0.0, "rewards/rejected": -374.9298095703125, "step": 1565 }, { "epoch": 16.48421052631579, "grad_norm": 1.2744127388941706e-06, "learning_rate": 0.0001672, "logits/chosen": 13.240317344665527, "logits/rejected": 13.240317344665527, "logps/chosen": -2969.0400390625, "logps/rejected": -2969.0400390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.1778259277344, "rewards/margins": 0.0, "rewards/rejected": -294.1778259277344, "step": 1566 }, { "epoch": 16.49473684210526, "grad_norm": 2.0518414203252178e-06, "learning_rate": 0.00016717894736842105, "logits/chosen": 13.24996280670166, "logits/rejected": 13.24996280670166, "logps/chosen": -3996.28125, "logps/rejected": -3996.28125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.8008117675781, "rewards/margins": 0.0, "rewards/rejected": -396.8008117675781, "step": 1567 }, { "epoch": 16.50526315789474, "grad_norm": 1.7397899227944436e-06, "learning_rate": 0.00016715789473684212, "logits/chosen": 13.262529373168945, "logits/rejected": 13.262529373168945, "logps/chosen": -3777.9306640625, "logps/rejected": -3777.9306640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.9239501953125, "rewards/margins": 0.0, "rewards/rejected": -374.9239501953125, "step": 1568 }, { "epoch": 16.51578947368421, "grad_norm": 3.3266060199821368e-06, "learning_rate": 0.00016713684210526317, "logits/chosen": 13.296720504760742, "logits/rejected": 13.296720504760742, "logps/chosen": -4878.798828125, "logps/rejected": -4878.798828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.03271484375, "rewards/margins": 0.0, "rewards/rejected": -485.03271484375, "step": 1569 }, { "epoch": 16.526315789473685, "grad_norm": 1.233409875567304e-06, "learning_rate": 0.00016711578947368422, "logits/chosen": 13.269818305969238, "logits/rejected": 13.269818305969238, "logps/chosen": -4328.2216796875, "logps/rejected": -4328.2216796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.6372985839844, "rewards/margins": 0.0, "rewards/rejected": -429.6372985839844, "step": 1570 }, { "epoch": 16.53684210526316, "grad_norm": 1.755433572725451e-06, "learning_rate": 0.00016709473684210527, "logits/chosen": 13.225573539733887, "logits/rejected": 13.225573539733887, "logps/chosen": -3996.615234375, "logps/rejected": -3996.615234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.8341979980469, "rewards/margins": 0.0, "rewards/rejected": -396.8341979980469, "step": 1571 }, { "epoch": 16.54736842105263, "grad_norm": 9.790144304133719e-07, "learning_rate": 0.00016707368421052632, "logits/chosen": 13.223475456237793, "logits/rejected": 13.223475456237793, "logps/chosen": -3545.7998046875, "logps/rejected": -3545.7998046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.5541076660156, "rewards/margins": 0.0, "rewards/rejected": -351.5541076660156, "step": 1572 }, { "epoch": 16.557894736842105, "grad_norm": 1.1840425031550694e-06, "learning_rate": 0.00016705263157894737, "logits/chosen": 13.198973655700684, "logits/rejected": 13.198973655700684, "logps/chosen": -3997.5546875, "logps/rejected": -3997.5546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.92816162109375, "rewards/margins": 0.0, "rewards/rejected": -396.92816162109375, "step": 1573 }, { "epoch": 16.568421052631578, "grad_norm": 1.8935512571260915e-06, "learning_rate": 0.00016703157894736842, "logits/chosen": 13.195361137390137, "logits/rejected": 13.195361137390137, "logps/chosen": -3758.8916015625, "logps/rejected": -3758.8916015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.9708557128906, "rewards/margins": 0.0, "rewards/rejected": -372.9708557128906, "step": 1574 }, { "epoch": 16.57894736842105, "grad_norm": 1.1107524642284261e-06, "learning_rate": 0.0001670105263157895, "logits/chosen": 13.163818359375, "logits/rejected": 13.163818359375, "logps/chosen": -2673.5859375, "logps/rejected": -2673.5859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.5769958496094, "rewards/margins": 0.0, "rewards/rejected": -264.5769958496094, "step": 1575 }, { "epoch": 16.589473684210525, "grad_norm": 1.747143869579304e-06, "learning_rate": 0.00016698947368421055, "logits/chosen": 13.229965209960938, "logits/rejected": 13.229965209960938, "logps/chosen": -5176.796875, "logps/rejected": -5176.796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.7412719726562, "rewards/margins": 0.0, "rewards/rejected": -514.7412719726562, "step": 1576 }, { "epoch": 16.6, "grad_norm": 1.7563052097102627e-06, "learning_rate": 0.0001669684210526316, "logits/chosen": 13.214452743530273, "logits/rejected": 13.214452743530273, "logps/chosen": -4879.61328125, "logps/rejected": -4879.61328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.1141662597656, "rewards/margins": 0.0, "rewards/rejected": -485.1141662597656, "step": 1577 }, { "epoch": 16.610526315789475, "grad_norm": 1.0877309932766366e-06, "learning_rate": 0.00016694736842105262, "logits/chosen": 13.156359672546387, "logits/rejected": 13.156359672546387, "logps/chosen": -3998.810546875, "logps/rejected": -3998.810546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0537414550781, "rewards/margins": 0.0, "rewards/rejected": -397.0537414550781, "step": 1578 }, { "epoch": 16.621052631578948, "grad_norm": 1.4249728792492533e-06, "learning_rate": 0.0001669263157894737, "logits/chosen": 13.17270278930664, "logits/rejected": 13.17270278930664, "logps/chosen": -4287.84228515625, "logps/rejected": -4287.84228515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.98687744140625, "rewards/margins": 0.0, "rewards/rejected": -425.98687744140625, "step": 1579 }, { "epoch": 16.63157894736842, "grad_norm": 1.1871371725646895e-06, "learning_rate": 0.00016690526315789474, "logits/chosen": 13.196175575256348, "logits/rejected": 13.196175575256348, "logps/chosen": -4880.28125, "logps/rejected": -4880.28125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.18096923828125, "rewards/margins": 0.0, "rewards/rejected": -485.18096923828125, "step": 1580 }, { "epoch": 16.642105263157895, "grad_norm": 1.0823998763953568e-06, "learning_rate": 0.0001668842105263158, "logits/chosen": 13.146787643432617, "logits/rejected": 13.146787643432617, "logps/chosen": -3759.208984375, "logps/rejected": -3759.208984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -373.0025939941406, "rewards/margins": 0.0, "rewards/rejected": -373.0025939941406, "step": 1581 }, { "epoch": 16.652631578947368, "grad_norm": 1.1689540997394943e-06, "learning_rate": 0.00016686315789473687, "logits/chosen": 13.184684753417969, "logits/rejected": 13.184684753417969, "logps/chosen": -4880.6171875, "logps/rejected": -4880.6171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.2145690917969, "rewards/margins": 0.0, "rewards/rejected": -485.2145690917969, "step": 1582 }, { "epoch": 16.66315789473684, "grad_norm": 1.5519713087996934e-06, "learning_rate": 0.00016684210526315792, "logits/chosen": 13.149711608886719, "logits/rejected": 13.149711608886719, "logps/chosen": -4288.66259765625, "logps/rejected": -4288.66259765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.06890869140625, "rewards/margins": 0.0, "rewards/rejected": -426.06890869140625, "step": 1583 }, { "epoch": 16.673684210526314, "grad_norm": 1.1183192327735014e-06, "learning_rate": 0.00016682105263157894, "logits/chosen": 13.122364044189453, "logits/rejected": 13.122364044189453, "logps/chosen": -2968.58203125, "logps/rejected": -2968.58203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.13201904296875, "rewards/margins": 0.0, "rewards/rejected": -294.13201904296875, "step": 1584 }, { "epoch": 16.68421052631579, "grad_norm": 1.2538896498881513e-06, "learning_rate": 0.0001668, "logits/chosen": 13.12143611907959, "logits/rejected": 13.12143611907959, "logps/chosen": -3999.919921875, "logps/rejected": -3999.919921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1646728515625, "rewards/margins": 0.0, "rewards/rejected": -397.1646728515625, "step": 1585 }, { "epoch": 16.694736842105264, "grad_norm": 2.0074051008123206e-06, "learning_rate": 0.00016677894736842107, "logits/chosen": 13.181496620178223, "logits/rejected": 13.181496620178223, "logps/chosen": -5175.79296875, "logps/rejected": -5175.79296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.640869140625, "rewards/margins": 0.0, "rewards/rejected": -514.640869140625, "step": 1586 }, { "epoch": 16.705263157894738, "grad_norm": 8.466049621347338e-07, "learning_rate": 0.00016675789473684211, "logits/chosen": 13.109195709228516, "logits/rejected": 13.109195709228516, "logps/chosen": -2673.37109375, "logps/rejected": -2673.37109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.5555114746094, "rewards/margins": 0.0, "rewards/rejected": -264.5555114746094, "step": 1587 }, { "epoch": 16.71578947368421, "grad_norm": 1.3248028380985488e-06, "learning_rate": 0.00016673684210526316, "logits/chosen": 13.134121894836426, "logits/rejected": 13.134121894836426, "logps/chosen": -3778.740234375, "logps/rejected": -3778.740234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -375.0049133300781, "rewards/margins": 0.0, "rewards/rejected": -375.0049133300781, "step": 1588 }, { "epoch": 16.726315789473684, "grad_norm": 1.9373460418137256e-06, "learning_rate": 0.0001667157894736842, "logits/chosen": 13.154068946838379, "logits/rejected": 13.154068946838379, "logps/chosen": -4326.181640625, "logps/rejected": -4326.181640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.4333190917969, "rewards/margins": 0.0, "rewards/rejected": -429.4333190917969, "step": 1589 }, { "epoch": 16.736842105263158, "grad_norm": 1.785628114703286e-06, "learning_rate": 0.0001666947368421053, "logits/chosen": 13.187432289123535, "logits/rejected": 13.187432289123535, "logps/chosen": -5175.6162109375, "logps/rejected": -5175.6162109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6232299804688, "rewards/margins": 0.0, "rewards/rejected": -514.6232299804688, "step": 1590 }, { "epoch": 16.74736842105263, "grad_norm": 9.664781828178093e-07, "learning_rate": 0.0001666736842105263, "logits/chosen": 13.145255088806152, "logits/rejected": 13.145255088806152, "logps/chosen": -3542.837890625, "logps/rejected": -3542.837890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.2579040527344, "rewards/margins": 0.0, "rewards/rejected": -351.2579040527344, "step": 1591 }, { "epoch": 16.757894736842104, "grad_norm": 8.376316600333666e-07, "learning_rate": 0.00016665263157894736, "logits/chosen": 13.132930755615234, "logits/rejected": 13.132930755615234, "logps/chosen": -2673.841796875, "logps/rejected": -2673.841796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.60260009765625, "rewards/margins": 0.0, "rewards/rejected": -264.60260009765625, "step": 1592 }, { "epoch": 16.768421052631577, "grad_norm": 2.3407110347761773e-06, "learning_rate": 0.00016663157894736844, "logits/chosen": 13.215516090393066, "logits/rejected": 13.215516090393066, "logps/chosen": -5175.9033203125, "logps/rejected": -5175.9033203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6519165039062, "rewards/margins": 0.0, "rewards/rejected": -514.6519165039062, "step": 1593 }, { "epoch": 16.778947368421054, "grad_norm": 9.087800094675913e-07, "learning_rate": 0.00016661052631578949, "logits/chosen": 13.177078247070312, "logits/rejected": 13.177078247070312, "logps/chosen": -3543.1328125, "logps/rejected": -3543.1328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.28741455078125, "rewards/margins": 0.0, "rewards/rejected": -351.28741455078125, "step": 1594 }, { "epoch": 16.789473684210527, "grad_norm": 1.2974768424101057e-06, "learning_rate": 0.00016658947368421054, "logits/chosen": 13.229546546936035, "logits/rejected": 13.229546546936035, "logps/chosen": -4881.5537109375, "logps/rejected": -4881.5537109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.3081970214844, "rewards/margins": 0.0, "rewards/rejected": -485.3081970214844, "step": 1595 }, { "epoch": 16.8, "grad_norm": 8.345022592948226e-07, "learning_rate": 0.00016656842105263158, "logits/chosen": 13.180034637451172, "logits/rejected": 13.180034637451172, "logps/chosen": -2673.927734375, "logps/rejected": -2673.927734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.6111755371094, "rewards/margins": 0.0, "rewards/rejected": -264.6111755371094, "step": 1596 }, { "epoch": 16.810526315789474, "grad_norm": 1.6810772649478167e-06, "learning_rate": 0.00016654736842105263, "logits/chosen": 13.26390552520752, "logits/rejected": 13.26390552520752, "logps/chosen": -5177.091796875, "logps/rejected": -5177.091796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.770751953125, "rewards/margins": 0.0, "rewards/rejected": -514.770751953125, "step": 1597 }, { "epoch": 16.821052631578947, "grad_norm": 1.2773365369866951e-06, "learning_rate": 0.00016652631578947368, "logits/chosen": 13.213223457336426, "logits/rejected": 13.213223457336426, "logps/chosen": -3998.63671875, "logps/rejected": -3998.63671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0363464355469, "rewards/margins": 0.0, "rewards/rejected": -397.0363464355469, "step": 1598 }, { "epoch": 16.83157894736842, "grad_norm": 8.511328815075103e-07, "learning_rate": 0.00016650526315789473, "logits/chosen": 13.210891723632812, "logits/rejected": 13.210891723632812, "logps/chosen": -2674.048828125, "logps/rejected": -2674.048828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.623291015625, "rewards/margins": 0.0, "rewards/rejected": -264.623291015625, "step": 1599 }, { "epoch": 16.842105263157894, "grad_norm": 1.1729697462214972e-06, "learning_rate": 0.0001664842105263158, "logits/chosen": 13.278419494628906, "logits/rejected": 13.278419494628906, "logps/chosen": -4881.03466796875, "logps/rejected": -4881.03466796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.2563171386719, "rewards/margins": 0.0, "rewards/rejected": -485.2563171386719, "step": 1600 }, { "epoch": 16.842105263157894, "eval_logits/chosen": 13.265612602233887, "eval_logits/rejected": 13.265612602233887, "eval_logps/chosen": -4311.87646484375, "eval_logps/rejected": -4311.87646484375, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -428.28448486328125, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -428.28448486328125, "eval_runtime": 4.2993, "eval_samples_per_second": 2.326, "eval_steps_per_second": 2.326, "step": 1600 }, { "epoch": 16.852631578947367, "grad_norm": 1.272752911063435e-06, "learning_rate": 0.00016646315789473686, "logits/chosen": 13.282668113708496, "logits/rejected": 13.282668113708496, "logps/chosen": -4881.19921875, "logps/rejected": -4881.19921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.27276611328125, "rewards/margins": 0.0, "rewards/rejected": -485.27276611328125, "step": 1601 }, { "epoch": 16.863157894736844, "grad_norm": 1.1911339470316307e-06, "learning_rate": 0.0001664421052631579, "logits/chosen": 13.232366561889648, "logits/rejected": 13.232366561889648, "logps/chosen": -3998.216796875, "logps/rejected": -3998.216796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9943542480469, "rewards/margins": 0.0, "rewards/rejected": -396.9943542480469, "step": 1602 }, { "epoch": 16.873684210526317, "grad_norm": 1.1970423656748608e-06, "learning_rate": 0.00016642105263157896, "logits/chosen": 13.231376647949219, "logits/rejected": 13.231376647949219, "logps/chosen": -3998.2421875, "logps/rejected": -3998.2421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9969177246094, "rewards/margins": 0.0, "rewards/rejected": -396.9969177246094, "step": 1603 }, { "epoch": 16.88421052631579, "grad_norm": 1.3990925253892783e-06, "learning_rate": 0.0001664, "logits/chosen": 13.240729331970215, "logits/rejected": 13.240729331970215, "logps/chosen": -3543.373046875, "logps/rejected": -3543.373046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3114318847656, "rewards/margins": 0.0, "rewards/rejected": -351.3114318847656, "step": 1604 }, { "epoch": 16.894736842105264, "grad_norm": 1.425261757503904e-06, "learning_rate": 0.00016637894736842106, "logits/chosen": 13.248808860778809, "logits/rejected": 13.248808860778809, "logps/chosen": -4288.2763671875, "logps/rejected": -4288.2763671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.0302734375, "rewards/margins": 0.0, "rewards/rejected": -426.0302734375, "step": 1605 }, { "epoch": 16.905263157894737, "grad_norm": 2.318427050340688e-06, "learning_rate": 0.0001663578947368421, "logits/chosen": 13.25756549835205, "logits/rejected": 13.25756549835205, "logps/chosen": -4326.1875, "logps/rejected": -4326.1875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.43389892578125, "rewards/margins": 0.0, "rewards/rejected": -429.43389892578125, "step": 1606 }, { "epoch": 16.91578947368421, "grad_norm": 8.237370252572873e-07, "learning_rate": 0.00016633684210526318, "logits/chosen": 13.212408065795898, "logits/rejected": 13.212408065795898, "logps/chosen": -2674.447265625, "logps/rejected": -2674.447265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.66314697265625, "rewards/margins": 0.0, "rewards/rejected": -264.66314697265625, "step": 1607 }, { "epoch": 16.926315789473684, "grad_norm": 1.3481775340551394e-06, "learning_rate": 0.00016631578947368423, "logits/chosen": 13.270916938781738, "logits/rejected": 13.270916938781738, "logps/chosen": -4881.7822265625, "logps/rejected": -4881.7822265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.3310546875, "rewards/margins": 0.0, "rewards/rejected": -485.3310546875, "step": 1608 }, { "epoch": 16.936842105263157, "grad_norm": 1.6446851986984257e-06, "learning_rate": 0.00016629473684210528, "logits/chosen": 13.268778800964355, "logits/rejected": 13.268778800964355, "logps/chosen": -4881.5537109375, "logps/rejected": -4881.5537109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.3081970214844, "rewards/margins": 0.0, "rewards/rejected": -485.3081970214844, "step": 1609 }, { "epoch": 16.94736842105263, "grad_norm": 8.176473329513101e-07, "learning_rate": 0.0001662736842105263, "logits/chosen": 13.206258773803711, "logits/rejected": 13.206258773803711, "logps/chosen": -2674.796875, "logps/rejected": -2674.796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.6980895996094, "rewards/margins": 0.0, "rewards/rejected": -264.6980895996094, "step": 1610 }, { "epoch": 16.957894736842107, "grad_norm": 2.6329346383136e-06, "learning_rate": 0.00016625263157894738, "logits/chosen": 13.271363258361816, "logits/rejected": 13.271363258361816, "logps/chosen": -5177.3896484375, "logps/rejected": -5177.3896484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.800537109375, "rewards/margins": 0.0, "rewards/rejected": -514.800537109375, "step": 1611 }, { "epoch": 16.96842105263158, "grad_norm": 9.865083256954676e-07, "learning_rate": 0.00016623157894736843, "logits/chosen": 13.2173433303833, "logits/rejected": 13.2173433303833, "logps/chosen": -3758.169921875, "logps/rejected": -3758.169921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.898681640625, "rewards/margins": 0.0, "rewards/rejected": -372.898681640625, "step": 1612 }, { "epoch": 16.978947368421053, "grad_norm": 9.488172167948505e-07, "learning_rate": 0.00016621052631578948, "logits/chosen": 13.21692180633545, "logits/rejected": 13.21692180633545, "logps/chosen": -3543.259765625, "logps/rejected": -3543.259765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.30010986328125, "rewards/margins": 0.0, "rewards/rejected": -351.30010986328125, "step": 1613 }, { "epoch": 16.989473684210527, "grad_norm": 1.335289198323153e-06, "learning_rate": 0.00016618947368421053, "logits/chosen": 13.263039588928223, "logits/rejected": 13.263039588928223, "logps/chosen": -5177.6787109375, "logps/rejected": -5177.6787109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.8294677734375, "rewards/margins": 0.0, "rewards/rejected": -514.8294677734375, "step": 1614 }, { "epoch": 17.0, "grad_norm": 1.1219150337637984e-06, "learning_rate": 0.0001661684210526316, "logits/chosen": 13.25462532043457, "logits/rejected": 13.25462532043457, "logps/chosen": -4882.509765625, "logps/rejected": -4882.509765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.40380859375, "rewards/margins": 0.0, "rewards/rejected": -485.40380859375, "step": 1615 }, { "epoch": 17.010526315789473, "grad_norm": 1.3002828609387507e-06, "learning_rate": 0.00016614736842105262, "logits/chosen": 13.218510627746582, "logits/rejected": 13.218510627746582, "logps/chosen": -3543.177734375, "logps/rejected": -3543.177734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.2919006347656, "rewards/margins": 0.0, "rewards/rejected": -351.2919006347656, "step": 1616 }, { "epoch": 17.021052631578947, "grad_norm": 1.2810995713152806e-06, "learning_rate": 0.00016612631578947367, "logits/chosen": 13.261174201965332, "logits/rejected": 13.261174201965332, "logps/chosen": -4882.78515625, "logps/rejected": -4882.78515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.4313659667969, "rewards/margins": 0.0, "rewards/rejected": -485.4313659667969, "step": 1617 }, { "epoch": 17.03157894736842, "grad_norm": 1.475021804253629e-06, "learning_rate": 0.00016610526315789475, "logits/chosen": 13.213229179382324, "logits/rejected": 13.213229179382324, "logps/chosen": -2966.763671875, "logps/rejected": -2966.763671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.9501647949219, "rewards/margins": 0.0, "rewards/rejected": -293.9501647949219, "step": 1618 }, { "epoch": 17.042105263157893, "grad_norm": 9.967370715457946e-07, "learning_rate": 0.0001660842105263158, "logits/chosen": 13.27358341217041, "logits/rejected": 13.27358341217041, "logps/chosen": -4882.57568359375, "logps/rejected": -4882.57568359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.410400390625, "rewards/margins": 0.0, "rewards/rejected": -485.410400390625, "step": 1619 }, { "epoch": 17.05263157894737, "grad_norm": 1.3451509630613145e-06, "learning_rate": 0.00016606315789473685, "logits/chosen": 13.245838165283203, "logits/rejected": 13.245838165283203, "logps/chosen": -3776.876953125, "logps/rejected": -3776.876953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.8185729980469, "rewards/margins": 0.0, "rewards/rejected": -374.8185729980469, "step": 1620 }, { "epoch": 17.063157894736843, "grad_norm": 8.665405744068266e-07, "learning_rate": 0.0001660421052631579, "logits/chosen": 13.248611450195312, "logits/rejected": 13.248611450195312, "logps/chosen": -3543.697265625, "logps/rejected": -3543.697265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3438415527344, "rewards/margins": 0.0, "rewards/rejected": -351.3438415527344, "step": 1621 }, { "epoch": 17.073684210526316, "grad_norm": 1.7614240732655162e-06, "learning_rate": 0.00016602105263157897, "logits/chosen": 13.240443229675293, "logits/rejected": 13.240443229675293, "logps/chosen": -3997.587890625, "logps/rejected": -3997.587890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9314880371094, "rewards/margins": 0.0, "rewards/rejected": -396.9314880371094, "step": 1622 }, { "epoch": 17.08421052631579, "grad_norm": 8.349593372258823e-07, "learning_rate": 0.000166, "logits/chosen": 13.254756927490234, "logits/rejected": 13.254756927490234, "logps/chosen": -3543.798828125, "logps/rejected": -3543.798828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.35400390625, "rewards/margins": 0.0, "rewards/rejected": -351.35400390625, "step": 1623 }, { "epoch": 17.094736842105263, "grad_norm": 9.612680287318653e-07, "learning_rate": 0.00016597894736842105, "logits/chosen": 13.25146770477295, "logits/rejected": 13.25146770477295, "logps/chosen": -3758.263671875, "logps/rejected": -3758.263671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.9080505371094, "rewards/margins": 0.0, "rewards/rejected": -372.9080505371094, "step": 1624 }, { "epoch": 17.105263157894736, "grad_norm": 1.274058945455181e-06, "learning_rate": 0.00016595789473684212, "logits/chosen": 13.292013168334961, "logits/rejected": 13.292013168334961, "logps/chosen": -4883.044921875, "logps/rejected": -4883.044921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.45733642578125, "rewards/margins": 0.0, "rewards/rejected": -485.45733642578125, "step": 1625 }, { "epoch": 17.11578947368421, "grad_norm": 1.0444042572999024e-06, "learning_rate": 0.00016593684210526317, "logits/chosen": 13.24094295501709, "logits/rejected": 13.24094295501709, "logps/chosen": -2967.138671875, "logps/rejected": -2967.138671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.9876708984375, "rewards/margins": 0.0, "rewards/rejected": -293.9876708984375, "step": 1626 }, { "epoch": 17.126315789473683, "grad_norm": 8.29914313271729e-07, "learning_rate": 0.00016591578947368422, "logits/chosen": 13.25707721710205, "logits/rejected": 13.25707721710205, "logps/chosen": -3544.16015625, "logps/rejected": -3544.16015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.39013671875, "rewards/margins": 0.0, "rewards/rejected": -351.39013671875, "step": 1627 }, { "epoch": 17.13684210526316, "grad_norm": 8.177157724276185e-07, "learning_rate": 0.00016589473684210527, "logits/chosen": 13.260442733764648, "logits/rejected": 13.260442733764648, "logps/chosen": -3544.287109375, "logps/rejected": -3544.287109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.40283203125, "rewards/margins": 0.0, "rewards/rejected": -351.40283203125, "step": 1628 }, { "epoch": 17.147368421052633, "grad_norm": 8.585454338572163e-07, "learning_rate": 0.00016587368421052632, "logits/chosen": 13.243003845214844, "logits/rejected": 13.243003845214844, "logps/chosen": -2673.169921875, "logps/rejected": -2673.169921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.535400390625, "rewards/margins": 0.0, "rewards/rejected": -264.535400390625, "step": 1629 }, { "epoch": 17.157894736842106, "grad_norm": 8.492467600262898e-07, "learning_rate": 0.00016585263157894737, "logits/chosen": 13.246129035949707, "logits/rejected": 13.246129035949707, "logps/chosen": -2673.228515625, "logps/rejected": -2673.228515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.541259765625, "rewards/margins": 0.0, "rewards/rejected": -264.541259765625, "step": 1630 }, { "epoch": 17.16842105263158, "grad_norm": 7.809761086718936e-07, "learning_rate": 0.00016583157894736842, "logits/chosen": 13.269388198852539, "logits/rejected": 13.269388198852539, "logps/chosen": -3544.669921875, "logps/rejected": -3544.669921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.4411315917969, "rewards/margins": 0.0, "rewards/rejected": -351.4411315917969, "step": 1631 }, { "epoch": 17.178947368421053, "grad_norm": 7.648363862244878e-07, "learning_rate": 0.0001658105263157895, "logits/chosen": 13.272018432617188, "logits/rejected": 13.272018432617188, "logps/chosen": -3544.994140625, "logps/rejected": -3544.994140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.4735412597656, "rewards/margins": 0.0, "rewards/rejected": -351.4735412597656, "step": 1632 }, { "epoch": 17.189473684210526, "grad_norm": 1.338182528343168e-06, "learning_rate": 0.00016578947368421054, "logits/chosen": 13.297073364257812, "logits/rejected": 13.297073364257812, "logps/chosen": -4325.95703125, "logps/rejected": -4325.95703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.4108581542969, "rewards/margins": 0.0, "rewards/rejected": -429.4108581542969, "step": 1633 }, { "epoch": 17.2, "grad_norm": 1.7346213780911057e-06, "learning_rate": 0.0001657684210526316, "logits/chosen": 13.26230525970459, "logits/rejected": 13.26230525970459, "logps/chosen": -3997.34765625, "logps/rejected": -3997.34765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9074401855469, "rewards/margins": 0.0, "rewards/rejected": -396.9074401855469, "step": 1634 }, { "epoch": 17.210526315789473, "grad_norm": 1.5153933645706275e-06, "learning_rate": 0.00016574736842105264, "logits/chosen": 13.256904602050781, "logits/rejected": 13.256904602050781, "logps/chosen": -3997.357421875, "logps/rejected": -3997.357421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9084167480469, "rewards/margins": 0.0, "rewards/rejected": -396.9084167480469, "step": 1635 }, { "epoch": 17.221052631578946, "grad_norm": 8.730449962968123e-07, "learning_rate": 0.0001657263157894737, "logits/chosen": 13.260743141174316, "logits/rejected": 13.260743141174316, "logps/chosen": -3546.0361328125, "logps/rejected": -3546.0361328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.5777282714844, "rewards/margins": 0.0, "rewards/rejected": -351.5777282714844, "step": 1636 }, { "epoch": 17.231578947368423, "grad_norm": 1.2017012522846926e-06, "learning_rate": 0.00016570526315789474, "logits/chosen": 13.289908409118652, "logits/rejected": 13.289908409118652, "logps/chosen": -4881.3134765625, "logps/rejected": -4881.3134765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.2841796875, "rewards/margins": 0.0, "rewards/rejected": -485.2841796875, "step": 1637 }, { "epoch": 17.242105263157896, "grad_norm": 1.467969013901893e-06, "learning_rate": 0.0001656842105263158, "logits/chosen": 13.24655532836914, "logits/rejected": 13.24655532836914, "logps/chosen": -3777.841796875, "logps/rejected": -3777.841796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.9150695800781, "rewards/margins": 0.0, "rewards/rejected": -374.9150695800781, "step": 1638 }, { "epoch": 17.25263157894737, "grad_norm": 1.6740277715143748e-06, "learning_rate": 0.00016566315789473687, "logits/chosen": 13.23279857635498, "logits/rejected": 13.23279857635498, "logps/chosen": -3758.751953125, "logps/rejected": -3758.751953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.9568786621094, "rewards/margins": 0.0, "rewards/rejected": -372.9568786621094, "step": 1639 }, { "epoch": 17.263157894736842, "grad_norm": 1.254528115168796e-06, "learning_rate": 0.00016564210526315792, "logits/chosen": 13.232345581054688, "logits/rejected": 13.232345581054688, "logps/chosen": -3546.5546875, "logps/rejected": -3546.5546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.6296081542969, "rewards/margins": 0.0, "rewards/rejected": -351.6296081542969, "step": 1640 }, { "epoch": 17.273684210526316, "grad_norm": 8.505404025527241e-07, "learning_rate": 0.00016562105263157896, "logits/chosen": 13.207818031311035, "logits/rejected": 13.207818031311035, "logps/chosen": -2674.3125, "logps/rejected": -2674.3125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.649658203125, "rewards/margins": 0.0, "rewards/rejected": -264.649658203125, "step": 1641 }, { "epoch": 17.28421052631579, "grad_norm": 1.5906676935628639e-06, "learning_rate": 0.0001656, "logits/chosen": 13.275137901306152, "logits/rejected": 13.275137901306152, "logps/chosen": -5175.3525390625, "logps/rejected": -5175.3525390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.5968627929688, "rewards/margins": 0.0, "rewards/rejected": -514.5968627929688, "step": 1642 }, { "epoch": 17.294736842105262, "grad_norm": 1.3361074024942354e-06, "learning_rate": 0.00016557894736842106, "logits/chosen": 13.26642894744873, "logits/rejected": 13.26642894744873, "logps/chosen": -4880.49853515625, "logps/rejected": -4880.49853515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.20269775390625, "rewards/margins": 0.0, "rewards/rejected": -485.20269775390625, "step": 1643 }, { "epoch": 17.305263157894736, "grad_norm": 1.2978564427612582e-06, "learning_rate": 0.0001655578947368421, "logits/chosen": 13.253207206726074, "logits/rejected": 13.253207206726074, "logps/chosen": -4327.404296875, "logps/rejected": -4327.404296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.5555725097656, "rewards/margins": 0.0, "rewards/rejected": -429.5555725097656, "step": 1644 }, { "epoch": 17.31578947368421, "grad_norm": 9.405152354702295e-07, "learning_rate": 0.00016553684210526316, "logits/chosen": 13.229449272155762, "logits/rejected": 13.229449272155762, "logps/chosen": -3759.2548828125, "logps/rejected": -3759.2548828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -373.0071716308594, "rewards/margins": 0.0, "rewards/rejected": -373.0071716308594, "step": 1645 }, { "epoch": 17.326315789473686, "grad_norm": 2.9235459351184545e-06, "learning_rate": 0.0001655157894736842, "logits/chosen": 13.281408309936523, "logits/rejected": 13.281408309936523, "logps/chosen": -5175.4765625, "logps/rejected": -5175.4765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6092529296875, "rewards/margins": 0.0, "rewards/rejected": -514.6092529296875, "step": 1646 }, { "epoch": 17.33684210526316, "grad_norm": 1.6358704897356802e-06, "learning_rate": 0.0001654947368421053, "logits/chosen": 13.272923469543457, "logits/rejected": 13.272923469543457, "logps/chosen": -4880.3291015625, "logps/rejected": -4880.3291015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.1857604980469, "rewards/margins": 0.0, "rewards/rejected": -485.1857604980469, "step": 1647 }, { "epoch": 17.347368421052632, "grad_norm": 1.3586352451966377e-06, "learning_rate": 0.0001654736842105263, "logits/chosen": 13.220980644226074, "logits/rejected": 13.220980644226074, "logps/chosen": -3997.91015625, "logps/rejected": -3997.91015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9637145996094, "rewards/margins": 0.0, "rewards/rejected": -396.9637145996094, "step": 1648 }, { "epoch": 17.357894736842105, "grad_norm": 1.3687122191186063e-06, "learning_rate": 0.00016545263157894736, "logits/chosen": 13.217361450195312, "logits/rejected": 13.217361450195312, "logps/chosen": -2968.2177734375, "logps/rejected": -2968.2177734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.0955810546875, "rewards/margins": 0.0, "rewards/rejected": -294.0955810546875, "step": 1649 }, { "epoch": 17.36842105263158, "grad_norm": 1.3353009080674383e-06, "learning_rate": 0.00016543157894736843, "logits/chosen": 13.234546661376953, "logits/rejected": 13.234546661376953, "logps/chosen": -3778.525390625, "logps/rejected": -3778.525390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.9834289550781, "rewards/margins": 0.0, "rewards/rejected": -374.9834289550781, "step": 1650 }, { "epoch": 17.36842105263158, "eval_logits/chosen": 13.251673698425293, "eval_logits/rejected": 13.251673698425293, "eval_logps/chosen": -4311.59033203125, "eval_logps/rejected": -4311.59033203125, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -428.25592041015625, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -428.25592041015625, "eval_runtime": 4.2501, "eval_samples_per_second": 2.353, "eval_steps_per_second": 2.353, "step": 1650 }, { "epoch": 17.378947368421052, "grad_norm": 1.0953107221212122e-06, "learning_rate": 0.00016541052631578948, "logits/chosen": 13.23139762878418, "logits/rejected": 13.23139762878418, "logps/chosen": -3546.4423828125, "logps/rejected": -3546.4423828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.6183776855469, "rewards/margins": 0.0, "rewards/rejected": -351.6183776855469, "step": 1651 }, { "epoch": 17.389473684210525, "grad_norm": 1.3938832807980361e-06, "learning_rate": 0.00016538947368421053, "logits/chosen": 13.218032836914062, "logits/rejected": 13.218032836914062, "logps/chosen": -2968.259765625, "logps/rejected": -2968.259765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.09979248046875, "rewards/margins": 0.0, "rewards/rejected": -294.09979248046875, "step": 1652 }, { "epoch": 17.4, "grad_norm": 1.5852797332627233e-06, "learning_rate": 0.00016536842105263158, "logits/chosen": 13.246777534484863, "logits/rejected": 13.246777534484863, "logps/chosen": -4287.09619140625, "logps/rejected": -4287.09619140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.9122619628906, "rewards/margins": 0.0, "rewards/rejected": -425.9122619628906, "step": 1653 }, { "epoch": 17.410526315789475, "grad_norm": 1.7589213712199125e-06, "learning_rate": 0.00016534736842105263, "logits/chosen": 13.286881446838379, "logits/rejected": 13.286881446838379, "logps/chosen": -5176.388671875, "logps/rejected": -5176.388671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.700439453125, "rewards/margins": 0.0, "rewards/rejected": -514.700439453125, "step": 1654 }, { "epoch": 17.42105263157895, "grad_norm": 1.5455976836165064e-06, "learning_rate": 0.00016532631578947368, "logits/chosen": 13.230978965759277, "logits/rejected": 13.230978965759277, "logps/chosen": -3998.046875, "logps/rejected": -3998.046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9773864746094, "rewards/margins": 0.0, "rewards/rejected": -396.9773864746094, "step": 1655 }, { "epoch": 17.431578947368422, "grad_norm": 1.4634289300374803e-06, "learning_rate": 0.00016530526315789473, "logits/chosen": 13.22904109954834, "logits/rejected": 13.22904109954834, "logps/chosen": -3998.103515625, "logps/rejected": -3998.103515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9830322265625, "rewards/margins": 0.0, "rewards/rejected": -396.9830322265625, "step": 1656 }, { "epoch": 17.442105263157895, "grad_norm": 1.4717924159413087e-06, "learning_rate": 0.0001652842105263158, "logits/chosen": 13.281997680664062, "logits/rejected": 13.281997680664062, "logps/chosen": -5176.4345703125, "logps/rejected": -5176.4345703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.7050170898438, "rewards/margins": 0.0, "rewards/rejected": -514.7050170898438, "step": 1657 }, { "epoch": 17.45263157894737, "grad_norm": 8.434948881586024e-07, "learning_rate": 0.00016526315789473686, "logits/chosen": 13.211052894592285, "logits/rejected": 13.211052894592285, "logps/chosen": -2673.8974609375, "logps/rejected": -2673.8974609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.608154296875, "rewards/margins": 0.0, "rewards/rejected": -264.608154296875, "step": 1658 }, { "epoch": 17.46315789473684, "grad_norm": 8.573423997404461e-07, "learning_rate": 0.0001652421052631579, "logits/chosen": 13.206940650939941, "logits/rejected": 13.206940650939941, "logps/chosen": -2673.7265625, "logps/rejected": -2673.7265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.591064453125, "rewards/margins": 0.0, "rewards/rejected": -264.591064453125, "step": 1659 }, { "epoch": 17.473684210526315, "grad_norm": 1.327583049715031e-06, "learning_rate": 0.00016522105263157895, "logits/chosen": 13.2329740524292, "logits/rejected": 13.2329740524292, "logps/chosen": -4288.6162109375, "logps/rejected": -4288.6162109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.06427001953125, "rewards/margins": 0.0, "rewards/rejected": -426.06427001953125, "step": 1660 }, { "epoch": 17.48421052631579, "grad_norm": 1.3415901776170358e-06, "learning_rate": 0.0001652, "logits/chosen": 13.227052688598633, "logits/rejected": 13.227052688598633, "logps/chosen": -4288.80615234375, "logps/rejected": -4288.80615234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.083251953125, "rewards/margins": 0.0, "rewards/rejected": -426.083251953125, "step": 1661 }, { "epoch": 17.49473684210526, "grad_norm": 1.8072954617309733e-06, "learning_rate": 0.00016517894736842105, "logits/chosen": 13.255829811096191, "logits/rejected": 13.255829811096191, "logps/chosen": -5177.033203125, "logps/rejected": -5177.033203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.764892578125, "rewards/margins": 0.0, "rewards/rejected": -514.764892578125, "step": 1662 }, { "epoch": 17.50526315789474, "grad_norm": 1.4737507854079013e-06, "learning_rate": 0.0001651578947368421, "logits/chosen": 13.206585884094238, "logits/rejected": 13.206585884094238, "logps/chosen": -3544.9296875, "logps/rejected": -3544.9296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.46710205078125, "rewards/margins": 0.0, "rewards/rejected": -351.46710205078125, "step": 1663 }, { "epoch": 17.51578947368421, "grad_norm": 1.1374694395271945e-06, "learning_rate": 0.00016513684210526318, "logits/chosen": 13.189765930175781, "logits/rejected": 13.189765930175781, "logps/chosen": -3998.576171875, "logps/rejected": -3998.576171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0303039550781, "rewards/margins": 0.0, "rewards/rejected": -397.0303039550781, "step": 1664 }, { "epoch": 17.526315789473685, "grad_norm": 1.2963898825546494e-06, "learning_rate": 0.00016511578947368423, "logits/chosen": 13.206446647644043, "logits/rejected": 13.206446647644043, "logps/chosen": -3778.44140625, "logps/rejected": -3778.44140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.97503662109375, "rewards/margins": 0.0, "rewards/rejected": -374.97503662109375, "step": 1665 }, { "epoch": 17.53684210526316, "grad_norm": 1.1174919336554012e-06, "learning_rate": 0.00016509473684210528, "logits/chosen": 13.18382740020752, "logits/rejected": 13.18382740020752, "logps/chosen": -3998.845703125, "logps/rejected": -3998.845703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0572509765625, "rewards/margins": 0.0, "rewards/rejected": -397.0572509765625, "step": 1666 }, { "epoch": 17.54736842105263, "grad_norm": 1.3727382111028419e-06, "learning_rate": 0.00016507368421052633, "logits/chosen": 13.23823070526123, "logits/rejected": 13.23823070526123, "logps/chosen": -5177.3037109375, "logps/rejected": -5177.3037109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.7919311523438, "rewards/margins": 0.0, "rewards/rejected": -514.7919311523438, "step": 1667 }, { "epoch": 17.557894736842105, "grad_norm": 1.0609696801111568e-06, "learning_rate": 0.00016505263157894738, "logits/chosen": 13.176491737365723, "logits/rejected": 13.176491737365723, "logps/chosen": -2968.94140625, "logps/rejected": -2968.94140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.1679382324219, "rewards/margins": 0.0, "rewards/rejected": -294.1679382324219, "step": 1668 }, { "epoch": 17.568421052631578, "grad_norm": 1.5158143469307106e-06, "learning_rate": 0.00016503157894736843, "logits/chosen": 13.192623138427734, "logits/rejected": 13.192623138427734, "logps/chosen": -3778.525390625, "logps/rejected": -3778.525390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.9834289550781, "rewards/margins": 0.0, "rewards/rejected": -374.9834289550781, "step": 1669 }, { "epoch": 17.57894736842105, "grad_norm": 1.6550102373003028e-06, "learning_rate": 0.00016501052631578947, "logits/chosen": 13.172411918640137, "logits/rejected": 13.172411918640137, "logps/chosen": -3999.150390625, "logps/rejected": -3999.150390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0877380371094, "rewards/margins": 0.0, "rewards/rejected": -397.0877380371094, "step": 1670 }, { "epoch": 17.589473684210525, "grad_norm": 1.703960379018099e-06, "learning_rate": 0.00016498947368421055, "logits/chosen": 13.216747283935547, "logits/rejected": 13.216747283935547, "logps/chosen": -4879.1630859375, "logps/rejected": -4879.1630859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.06915283203125, "rewards/margins": 0.0, "rewards/rejected": -485.06915283203125, "step": 1671 }, { "epoch": 17.6, "grad_norm": 1.2773125490639359e-06, "learning_rate": 0.0001649684210526316, "logits/chosen": 13.197542190551758, "logits/rejected": 13.197542190551758, "logps/chosen": -4327.16796875, "logps/rejected": -4327.16796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.5319519042969, "rewards/margins": 0.0, "rewards/rejected": -429.5319519042969, "step": 1672 }, { "epoch": 17.610526315789475, "grad_norm": 8.086529987849644e-07, "learning_rate": 0.00016494736842105265, "logits/chosen": 13.149407386779785, "logits/rejected": 13.149407386779785, "logps/chosen": -2674.2578125, "logps/rejected": -2674.2578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.6441955566406, "rewards/margins": 0.0, "rewards/rejected": -264.6441955566406, "step": 1673 }, { "epoch": 17.621052631578948, "grad_norm": 1.176727778329223e-06, "learning_rate": 0.00016492631578947367, "logits/chosen": 13.150078773498535, "logits/rejected": 13.150078773498535, "logps/chosen": -3999.626953125, "logps/rejected": -3999.626953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1353759765625, "rewards/margins": 0.0, "rewards/rejected": -397.1353759765625, "step": 1674 }, { "epoch": 17.63157894736842, "grad_norm": 9.85017550192424e-07, "learning_rate": 0.00016490526315789475, "logits/chosen": 13.16022777557373, "logits/rejected": 13.16022777557373, "logps/chosen": -3543.904296875, "logps/rejected": -3543.904296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.36456298828125, "rewards/margins": 0.0, "rewards/rejected": -351.36456298828125, "step": 1675 }, { "epoch": 17.642105263157895, "grad_norm": 8.099822252916056e-07, "learning_rate": 0.0001648842105263158, "logits/chosen": 13.134472846984863, "logits/rejected": 13.134472846984863, "logps/chosen": -2674.35546875, "logps/rejected": -2674.35546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.6539611816406, "rewards/margins": 0.0, "rewards/rejected": -264.6539611816406, "step": 1676 }, { "epoch": 17.652631578947368, "grad_norm": 1.276584384868329e-06, "learning_rate": 0.00016486315789473685, "logits/chosen": 13.14843463897705, "logits/rejected": 13.14843463897705, "logps/chosen": -3758.064453125, "logps/rejected": -3758.064453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.8881530761719, "rewards/margins": 0.0, "rewards/rejected": -372.8881530761719, "step": 1677 }, { "epoch": 17.66315789473684, "grad_norm": 8.137351983350527e-07, "learning_rate": 0.0001648421052631579, "logits/chosen": 13.129046440124512, "logits/rejected": 13.129046440124512, "logps/chosen": -2674.59375, "logps/rejected": -2674.59375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.67779541015625, "rewards/margins": 0.0, "rewards/rejected": -264.67779541015625, "step": 1678 }, { "epoch": 17.673684210526314, "grad_norm": 1.6141307241923641e-06, "learning_rate": 0.00016482105263157897, "logits/chosen": 13.169686317443848, "logits/rejected": 13.169686317443848, "logps/chosen": -4327.4873046875, "logps/rejected": -4327.4873046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.5638732910156, "rewards/margins": 0.0, "rewards/rejected": -429.5638732910156, "step": 1679 }, { "epoch": 17.68421052631579, "grad_norm": 9.15808357149217e-07, "learning_rate": 0.0001648, "logits/chosen": 13.149499893188477, "logits/rejected": 13.149499893188477, "logps/chosen": -3543.8193359375, "logps/rejected": -3543.8193359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3560485839844, "rewards/margins": 0.0, "rewards/rejected": -351.3560485839844, "step": 1680 }, { "epoch": 17.694736842105264, "grad_norm": 1.5173853853411856e-06, "learning_rate": 0.00016477894736842104, "logits/chosen": 13.18626594543457, "logits/rejected": 13.18626594543457, "logps/chosen": -4879.27490234375, "logps/rejected": -4879.27490234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.080322265625, "rewards/margins": 0.0, "rewards/rejected": -485.080322265625, "step": 1681 }, { "epoch": 17.705263157894738, "grad_norm": 1.6809143517093617e-06, "learning_rate": 0.00016475789473684212, "logits/chosen": 13.190747261047363, "logits/rejected": 13.190747261047363, "logps/chosen": -4879.158203125, "logps/rejected": -4879.158203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.06866455078125, "rewards/margins": 0.0, "rewards/rejected": -485.06866455078125, "step": 1682 }, { "epoch": 17.71578947368421, "grad_norm": 2.2709521090291673e-06, "learning_rate": 0.00016473684210526317, "logits/chosen": 13.205517768859863, "logits/rejected": 13.205517768859863, "logps/chosen": -5175.84228515625, "logps/rejected": -5175.84228515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6458129882812, "rewards/margins": 0.0, "rewards/rejected": -514.6458129882812, "step": 1683 }, { "epoch": 17.726315789473684, "grad_norm": 7.800434218552255e-07, "learning_rate": 0.00016471578947368422, "logits/chosen": 13.148836135864258, "logits/rejected": 13.148836135864258, "logps/chosen": -2675.6943359375, "logps/rejected": -2675.6943359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.787841796875, "rewards/margins": 0.0, "rewards/rejected": -264.787841796875, "step": 1684 }, { "epoch": 17.736842105263158, "grad_norm": 1.6296972944473964e-06, "learning_rate": 0.00016469473684210527, "logits/chosen": 13.183969497680664, "logits/rejected": 13.183969497680664, "logps/chosen": -4289.72900390625, "logps/rejected": -4289.72900390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.175537109375, "rewards/margins": 0.0, "rewards/rejected": -426.175537109375, "step": 1685 }, { "epoch": 17.74736842105263, "grad_norm": 1.197213350678794e-06, "learning_rate": 0.00016467368421052632, "logits/chosen": 13.215919494628906, "logits/rejected": 13.215919494628906, "logps/chosen": -4879.90234375, "logps/rejected": -4879.90234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.14306640625, "rewards/margins": 0.0, "rewards/rejected": -485.14306640625, "step": 1686 }, { "epoch": 17.757894736842104, "grad_norm": 1.3737014796788571e-06, "learning_rate": 0.00016465263157894737, "logits/chosen": 13.171225547790527, "logits/rejected": 13.171225547790527, "logps/chosen": -2968.603515625, "logps/rejected": -2968.603515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.1341552734375, "rewards/margins": 0.0, "rewards/rejected": -294.1341552734375, "step": 1687 }, { "epoch": 17.768421052631577, "grad_norm": 8.144222078954044e-07, "learning_rate": 0.00016463157894736842, "logits/chosen": 13.171070098876953, "logits/rejected": 13.171070098876953, "logps/chosen": -2675.91015625, "logps/rejected": -2675.91015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.8094177246094, "rewards/margins": 0.0, "rewards/rejected": -264.8094177246094, "step": 1688 }, { "epoch": 17.778947368421054, "grad_norm": 1.3077759604129824e-06, "learning_rate": 0.0001646105263157895, "logits/chosen": 13.200596809387207, "logits/rejected": 13.200596809387207, "logps/chosen": -3779.298828125, "logps/rejected": -3779.298828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -375.0607604980469, "rewards/margins": 0.0, "rewards/rejected": -375.0607604980469, "step": 1689 }, { "epoch": 17.789473684210527, "grad_norm": 1.3314678426468163e-06, "learning_rate": 0.00016458947368421054, "logits/chosen": 13.23664665222168, "logits/rejected": 13.23664665222168, "logps/chosen": -4880.37109375, "logps/rejected": -4880.37109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.18994140625, "rewards/margins": 0.0, "rewards/rejected": -485.18994140625, "step": 1690 }, { "epoch": 17.8, "grad_norm": 1.4908573575667106e-06, "learning_rate": 0.0001645684210526316, "logits/chosen": 13.249382019042969, "logits/rejected": 13.249382019042969, "logps/chosen": -5175.990234375, "logps/rejected": -5175.990234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6605834960938, "rewards/margins": 0.0, "rewards/rejected": -514.6605834960938, "step": 1691 }, { "epoch": 17.810526315789474, "grad_norm": 1.1765446288336534e-06, "learning_rate": 0.00016454736842105264, "logits/chosen": 13.194196701049805, "logits/rejected": 13.194196701049805, "logps/chosen": -3998.376953125, "logps/rejected": -3998.376953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0103759765625, "rewards/margins": 0.0, "rewards/rejected": -397.0103759765625, "step": 1692 }, { "epoch": 17.821052631578947, "grad_norm": 1.171123130916385e-06, "learning_rate": 0.0001645263157894737, "logits/chosen": 13.194611549377441, "logits/rejected": 13.194611549377441, "logps/chosen": -3998.416015625, "logps/rejected": -3998.416015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0142822265625, "rewards/margins": 0.0, "rewards/rejected": -397.0142822265625, "step": 1693 }, { "epoch": 17.83157894736842, "grad_norm": 1.1897999456778052e-06, "learning_rate": 0.00016450526315789474, "logits/chosen": 13.242565155029297, "logits/rejected": 13.242565155029297, "logps/chosen": -4881.2607421875, "logps/rejected": -4881.2607421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.2789001464844, "rewards/margins": 0.0, "rewards/rejected": -485.2789001464844, "step": 1694 }, { "epoch": 17.842105263157894, "grad_norm": 1.289170654672489e-06, "learning_rate": 0.0001644842105263158, "logits/chosen": 13.207611083984375, "logits/rejected": 13.207611083984375, "logps/chosen": -3779.822265625, "logps/rejected": -3779.822265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -375.1131286621094, "rewards/margins": 0.0, "rewards/rejected": -375.1131286621094, "step": 1695 }, { "epoch": 17.852631578947367, "grad_norm": 1.3572716852650046e-06, "learning_rate": 0.00016446315789473686, "logits/chosen": 13.207937240600586, "logits/rejected": 13.207937240600586, "logps/chosen": -4289.74853515625, "logps/rejected": -4289.74853515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.177490234375, "rewards/margins": 0.0, "rewards/rejected": -426.177490234375, "step": 1696 }, { "epoch": 17.863157894736844, "grad_norm": 1.566055289003998e-06, "learning_rate": 0.0001644421052631579, "logits/chosen": 13.238598823547363, "logits/rejected": 13.238598823547363, "logps/chosen": -5175.99609375, "logps/rejected": -5175.99609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6611938476562, "rewards/margins": 0.0, "rewards/rejected": -514.6611938476562, "step": 1697 }, { "epoch": 17.873684210526317, "grad_norm": 1.6482883893331746e-06, "learning_rate": 0.00016442105263157896, "logits/chosen": 13.185717582702637, "logits/rejected": 13.185717582702637, "logps/chosen": -3758.48828125, "logps/rejected": -3758.48828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.9305114746094, "rewards/margins": 0.0, "rewards/rejected": -372.9305114746094, "step": 1698 }, { "epoch": 17.88421052631579, "grad_norm": 2.168531864299439e-06, "learning_rate": 0.0001644, "logits/chosen": 13.209497451782227, "logits/rejected": 13.209497451782227, "logps/chosen": -4326.32421875, "logps/rejected": -4326.32421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.44757080078125, "rewards/margins": 0.0, "rewards/rejected": -429.44757080078125, "step": 1699 }, { "epoch": 17.894736842105264, "grad_norm": 1.0153877383345389e-06, "learning_rate": 0.00016437894736842106, "logits/chosen": 13.223252296447754, "logits/rejected": 13.223252296447754, "logps/chosen": -4882.1025390625, "logps/rejected": -4882.1025390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.36309814453125, "rewards/margins": 0.0, "rewards/rejected": -485.36309814453125, "step": 1700 }, { "epoch": 17.894736842105264, "eval_logits/chosen": 13.209467887878418, "eval_logits/rejected": 13.209467887878418, "eval_logps/chosen": -4311.66064453125, "eval_logps/rejected": -4311.66064453125, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -428.26287841796875, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -428.26287841796875, "eval_runtime": 4.2898, "eval_samples_per_second": 2.331, "eval_steps_per_second": 2.331, "step": 1700 }, { "epoch": 17.905263157894737, "grad_norm": 1.3979546338305227e-06, "learning_rate": 0.0001643578947368421, "logits/chosen": 13.171424865722656, "logits/rejected": 13.171424865722656, "logps/chosen": -3998.767578125, "logps/rejected": -3998.767578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0494384765625, "rewards/margins": 0.0, "rewards/rejected": -397.0494384765625, "step": 1701 }, { "epoch": 17.91578947368421, "grad_norm": 9.574965815772885e-07, "learning_rate": 0.00016433684210526316, "logits/chosen": 13.180306434631348, "logits/rejected": 13.180306434631348, "logps/chosen": -3758.78125, "logps/rejected": -3758.78125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.9598083496094, "rewards/margins": 0.0, "rewards/rejected": -372.9598083496094, "step": 1702 }, { "epoch": 17.926315789473684, "grad_norm": 2.7593716822593706e-06, "learning_rate": 0.00016431578947368424, "logits/chosen": 13.224714279174805, "logits/rejected": 13.224714279174805, "logps/chosen": -5175.970703125, "logps/rejected": -5175.970703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6586303710938, "rewards/margins": 0.0, "rewards/rejected": -514.6586303710938, "step": 1703 }, { "epoch": 17.936842105263157, "grad_norm": 1.6290850908262655e-06, "learning_rate": 0.00016429473684210529, "logits/chosen": 13.161206245422363, "logits/rejected": 13.161206245422363, "logps/chosen": -3999.0625, "logps/rejected": -3999.0625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0789489746094, "rewards/margins": 0.0, "rewards/rejected": -397.0789489746094, "step": 1704 }, { "epoch": 17.94736842105263, "grad_norm": 1.6467810155518237e-06, "learning_rate": 0.0001642736842105263, "logits/chosen": 13.215140342712402, "logits/rejected": 13.215140342712402, "logps/chosen": -5176.349609375, "logps/rejected": -5176.349609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.696533203125, "rewards/margins": 0.0, "rewards/rejected": -514.696533203125, "step": 1705 }, { "epoch": 17.957894736842107, "grad_norm": 1.444196300326439e-06, "learning_rate": 0.00016425263157894736, "logits/chosen": 13.21228313446045, "logits/rejected": 13.21228313446045, "logps/chosen": -5176.732421875, "logps/rejected": -5176.732421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.7348022460938, "rewards/margins": 0.0, "rewards/rejected": -514.7348022460938, "step": 1706 }, { "epoch": 17.96842105263158, "grad_norm": 1.2963780591235263e-06, "learning_rate": 0.00016423157894736843, "logits/chosen": 13.174302101135254, "logits/rejected": 13.174302101135254, "logps/chosen": -4289.6875, "logps/rejected": -4289.6875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.17138671875, "rewards/margins": 0.0, "rewards/rejected": -426.17138671875, "step": 1707 }, { "epoch": 17.978947368421053, "grad_norm": 2.2712229110766202e-06, "learning_rate": 0.00016421052631578948, "logits/chosen": 13.200567245483398, "logits/rejected": 13.200567245483398, "logps/chosen": -4882.18701171875, "logps/rejected": -4882.18701171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.3715515136719, "rewards/margins": 0.0, "rewards/rejected": -485.3715515136719, "step": 1708 }, { "epoch": 17.989473684210527, "grad_norm": 1.8442843838784029e-06, "learning_rate": 0.00016418947368421053, "logits/chosen": 13.167006492614746, "logits/rejected": 13.167006492614746, "logps/chosen": -3541.791015625, "logps/rejected": -3541.791015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.1532287597656, "rewards/margins": 0.0, "rewards/rejected": -351.1532287597656, "step": 1709 }, { "epoch": 18.0, "grad_norm": 1.325768153037643e-06, "learning_rate": 0.00016416842105263158, "logits/chosen": 13.156643867492676, "logits/rejected": 13.156643867492676, "logps/chosen": -3998.87109375, "logps/rejected": -3998.87109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0597839355469, "rewards/margins": 0.0, "rewards/rejected": -397.0597839355469, "step": 1710 }, { "epoch": 18.010526315789473, "grad_norm": 1.3008568657824071e-06, "learning_rate": 0.00016414736842105266, "logits/chosen": 13.186636924743652, "logits/rejected": 13.186636924743652, "logps/chosen": -4289.2861328125, "logps/rejected": -4289.2861328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.1312561035156, "rewards/margins": 0.0, "rewards/rejected": -426.1312561035156, "step": 1711 }, { "epoch": 18.021052631578947, "grad_norm": 1.221750153490575e-06, "learning_rate": 0.00016412631578947368, "logits/chosen": 13.217198371887207, "logits/rejected": 13.217198371887207, "logps/chosen": -4882.2001953125, "logps/rejected": -4882.2001953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.37286376953125, "rewards/margins": 0.0, "rewards/rejected": -485.37286376953125, "step": 1712 }, { "epoch": 18.03157894736842, "grad_norm": 9.961817113435245e-07, "learning_rate": 0.00016410526315789473, "logits/chosen": 13.184962272644043, "logits/rejected": 13.184962272644043, "logps/chosen": -3542.19921875, "logps/rejected": -3542.19921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.1940612792969, "rewards/margins": 0.0, "rewards/rejected": -351.1940612792969, "step": 1713 }, { "epoch": 18.042105263157893, "grad_norm": 9.931683280228754e-07, "learning_rate": 0.0001640842105263158, "logits/chosen": 13.184737205505371, "logits/rejected": 13.184737205505371, "logps/chosen": -3758.923828125, "logps/rejected": -3758.923828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.9740905761719, "rewards/margins": 0.0, "rewards/rejected": -372.9740905761719, "step": 1714 }, { "epoch": 18.05263157894737, "grad_norm": 2.1246914911898784e-06, "learning_rate": 0.00016406315789473685, "logits/chosen": 13.237903594970703, "logits/rejected": 13.237903594970703, "logps/chosen": -5178.34765625, "logps/rejected": -5178.34765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.8963623046875, "rewards/margins": 0.0, "rewards/rejected": -514.8963623046875, "step": 1715 }, { "epoch": 18.063157894736843, "grad_norm": 9.134220135820215e-07, "learning_rate": 0.0001640421052631579, "logits/chosen": 13.179265975952148, "logits/rejected": 13.179265975952148, "logps/chosen": -2671.7890625, "logps/rejected": -2671.7890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.3973083496094, "rewards/margins": 0.0, "rewards/rejected": -264.3973083496094, "step": 1716 }, { "epoch": 18.073684210526316, "grad_norm": 8.840111718200205e-07, "learning_rate": 0.00016402105263157895, "logits/chosen": 13.205976486206055, "logits/rejected": 13.205976486206055, "logps/chosen": -3542.48046875, "logps/rejected": -3542.48046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.22216796875, "rewards/margins": 0.0, "rewards/rejected": -351.22216796875, "step": 1717 }, { "epoch": 18.08421052631579, "grad_norm": 9.021367759487475e-07, "learning_rate": 0.000164, "logits/chosen": 13.190559387207031, "logits/rejected": 13.190559387207031, "logps/chosen": -2672.080078125, "logps/rejected": -2672.080078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.4264221191406, "rewards/margins": 0.0, "rewards/rejected": -264.4264221191406, "step": 1718 }, { "epoch": 18.094736842105263, "grad_norm": 1.8655525764188496e-06, "learning_rate": 0.00016397894736842105, "logits/chosen": 13.260146141052246, "logits/rejected": 13.260146141052246, "logps/chosen": -5178.826171875, "logps/rejected": -5178.826171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.9442138671875, "rewards/margins": 0.0, "rewards/rejected": -514.9442138671875, "step": 1719 }, { "epoch": 18.105263157894736, "grad_norm": 9.820496416068636e-07, "learning_rate": 0.0001639578947368421, "logits/chosen": 13.218685150146484, "logits/rejected": 13.218685150146484, "logps/chosen": -3759.435546875, "logps/rejected": -3759.435546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -373.0252380371094, "rewards/margins": 0.0, "rewards/rejected": -373.0252380371094, "step": 1720 }, { "epoch": 18.11578947368421, "grad_norm": 1.2514809668573434e-06, "learning_rate": 0.00016393684210526318, "logits/chosen": 13.209930419921875, "logits/rejected": 13.209930419921875, "logps/chosen": -3998.107421875, "logps/rejected": -3998.107421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9834289550781, "rewards/margins": 0.0, "rewards/rejected": -396.9834289550781, "step": 1721 }, { "epoch": 18.126315789473683, "grad_norm": 8.620293101557763e-07, "learning_rate": 0.00016391578947368423, "logits/chosen": 13.206565856933594, "logits/rejected": 13.206565856933594, "logps/chosen": -2672.3046875, "logps/rejected": -2672.3046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.4488830566406, "rewards/margins": 0.0, "rewards/rejected": -264.4488830566406, "step": 1722 }, { "epoch": 18.13684210526316, "grad_norm": 9.048561082636297e-07, "learning_rate": 0.00016389473684210528, "logits/chosen": 13.205591201782227, "logits/rejected": 13.205591201782227, "logps/chosen": -2672.458984375, "logps/rejected": -2672.458984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.4643249511719, "rewards/margins": 0.0, "rewards/rejected": -264.4643249511719, "step": 1723 }, { "epoch": 18.147368421052633, "grad_norm": 1.1921965779038146e-06, "learning_rate": 0.00016387368421052632, "logits/chosen": 13.20761775970459, "logits/rejected": 13.20761775970459, "logps/chosen": -3997.89453125, "logps/rejected": -3997.89453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9621276855469, "rewards/margins": 0.0, "rewards/rejected": -396.9621276855469, "step": 1724 }, { "epoch": 18.157894736842106, "grad_norm": 1.7906684206536738e-06, "learning_rate": 0.00016385263157894737, "logits/chosen": 13.215581893920898, "logits/rejected": 13.215581893920898, "logps/chosen": -3759.201171875, "logps/rejected": -3759.201171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -373.0018005371094, "rewards/margins": 0.0, "rewards/rejected": -373.0018005371094, "step": 1725 }, { "epoch": 18.16842105263158, "grad_norm": 1.5402041526613175e-06, "learning_rate": 0.00016383157894736842, "logits/chosen": 13.25319766998291, "logits/rejected": 13.25319766998291, "logps/chosen": -4881.15185546875, "logps/rejected": -4881.15185546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.2680358886719, "rewards/margins": 0.0, "rewards/rejected": -485.2680358886719, "step": 1726 }, { "epoch": 18.178947368421053, "grad_norm": 1.3679416497325292e-06, "learning_rate": 0.00016381052631578947, "logits/chosen": 13.223785400390625, "logits/rejected": 13.223785400390625, "logps/chosen": -4289.30859375, "logps/rejected": -4289.30859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.1335144042969, "rewards/margins": 0.0, "rewards/rejected": -426.1335144042969, "step": 1727 }, { "epoch": 18.189473684210526, "grad_norm": 8.217092499762657e-07, "learning_rate": 0.00016378947368421055, "logits/chosen": 13.189509391784668, "logits/rejected": 13.189509391784668, "logps/chosen": -2673.619140625, "logps/rejected": -2673.619140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.580322265625, "rewards/margins": 0.0, "rewards/rejected": -264.580322265625, "step": 1728 }, { "epoch": 18.2, "grad_norm": 1.1977446092714672e-06, "learning_rate": 0.0001637684210526316, "logits/chosen": 13.188858985900879, "logits/rejected": 13.188858985900879, "logps/chosen": -3998.232421875, "logps/rejected": -3998.232421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9959411621094, "rewards/margins": 0.0, "rewards/rejected": -396.9959411621094, "step": 1729 }, { "epoch": 18.210526315789473, "grad_norm": 9.371688065584749e-07, "learning_rate": 0.00016374736842105265, "logits/chosen": 13.200185775756836, "logits/rejected": 13.200185775756836, "logps/chosen": -3543.51953125, "logps/rejected": -3543.51953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3260803222656, "rewards/margins": 0.0, "rewards/rejected": -351.3260803222656, "step": 1730 }, { "epoch": 18.221052631578946, "grad_norm": 9.086565455618256e-07, "learning_rate": 0.0001637263157894737, "logits/chosen": 13.195902824401855, "logits/rejected": 13.195902824401855, "logps/chosen": -3543.580078125, "logps/rejected": -3543.580078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3321228027344, "rewards/margins": 0.0, "rewards/rejected": -351.3321228027344, "step": 1731 }, { "epoch": 18.231578947368423, "grad_norm": 8.61796252138447e-07, "learning_rate": 0.00016370526315789475, "logits/chosen": 13.194711685180664, "logits/rejected": 13.194711685180664, "logps/chosen": -3543.75390625, "logps/rejected": -3543.75390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3495178222656, "rewards/margins": 0.0, "rewards/rejected": -351.3495178222656, "step": 1732 }, { "epoch": 18.242105263157896, "grad_norm": 8.233304811255948e-07, "learning_rate": 0.0001636842105263158, "logits/chosen": 13.19621467590332, "logits/rejected": 13.19621467590332, "logps/chosen": -3544.06640625, "logps/rejected": -3544.06640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3807678222656, "rewards/margins": 0.0, "rewards/rejected": -351.3807678222656, "step": 1733 }, { "epoch": 18.25263157894737, "grad_norm": 1.9305980458739214e-06, "learning_rate": 0.00016366315789473684, "logits/chosen": 13.24044418334961, "logits/rejected": 13.24044418334961, "logps/chosen": -5178.00537109375, "logps/rejected": -5178.00537109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.8621215820312, "rewards/margins": 0.0, "rewards/rejected": -514.8621215820312, "step": 1734 }, { "epoch": 18.263157894736842, "grad_norm": 1.5832422377570765e-06, "learning_rate": 0.00016364210526315792, "logits/chosen": 13.184294700622559, "logits/rejected": 13.184294700622559, "logps/chosen": -3998.580078125, "logps/rejected": -3998.580078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.03070068359375, "rewards/margins": 0.0, "rewards/rejected": -397.03070068359375, "step": 1735 }, { "epoch": 18.273684210526316, "grad_norm": 1.7293010614594095e-06, "learning_rate": 0.00016362105263157897, "logits/chosen": 13.207387924194336, "logits/rejected": 13.207387924194336, "logps/chosen": -4289.455078125, "logps/rejected": -4289.455078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.1481628417969, "rewards/margins": 0.0, "rewards/rejected": -426.1481628417969, "step": 1736 }, { "epoch": 18.28421052631579, "grad_norm": 1.5562432054139208e-06, "learning_rate": 0.0001636, "logits/chosen": 13.177202224731445, "logits/rejected": 13.177202224731445, "logps/chosen": -2967.9921875, "logps/rejected": -2967.9921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.0730285644531, "rewards/margins": 0.0, "rewards/rejected": -294.0730285644531, "step": 1737 }, { "epoch": 18.294736842105262, "grad_norm": 1.3520865422833594e-06, "learning_rate": 0.00016357894736842104, "logits/chosen": 13.23664665222168, "logits/rejected": 13.23664665222168, "logps/chosen": -5177.64599609375, "logps/rejected": -5177.64599609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.826171875, "rewards/margins": 0.0, "rewards/rejected": -514.826171875, "step": 1738 }, { "epoch": 18.305263157894736, "grad_norm": 9.935459956977866e-07, "learning_rate": 0.00016355789473684212, "logits/chosen": 13.196292877197266, "logits/rejected": 13.196292877197266, "logps/chosen": -3544.83984375, "logps/rejected": -3544.83984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.4580993652344, "rewards/margins": 0.0, "rewards/rejected": -351.4580993652344, "step": 1739 }, { "epoch": 18.31578947368421, "grad_norm": 1.3221443850852665e-06, "learning_rate": 0.00016353684210526317, "logits/chosen": 13.234463691711426, "logits/rejected": 13.234463691711426, "logps/chosen": -4879.3125, "logps/rejected": -4879.3125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.0840759277344, "rewards/margins": 0.0, "rewards/rejected": -485.0840759277344, "step": 1740 }, { "epoch": 18.326315789473686, "grad_norm": 1.1604338396864478e-06, "learning_rate": 0.00016351578947368422, "logits/chosen": 13.18543529510498, "logits/rejected": 13.18543529510498, "logps/chosen": -3998.95703125, "logps/rejected": -3998.95703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0683898925781, "rewards/margins": 0.0, "rewards/rejected": -397.0683898925781, "step": 1741 }, { "epoch": 18.33684210526316, "grad_norm": 1.1535853445820976e-06, "learning_rate": 0.00016349473684210527, "logits/chosen": 13.185277938842773, "logits/rejected": 13.185277938842773, "logps/chosen": -3999.001953125, "logps/rejected": -3999.001953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0728759765625, "rewards/margins": 0.0, "rewards/rejected": -397.0728759765625, "step": 1742 }, { "epoch": 18.347368421052632, "grad_norm": 1.194940750792739e-06, "learning_rate": 0.00016347368421052634, "logits/chosen": 13.195455551147461, "logits/rejected": 13.195455551147461, "logps/chosen": -3759.8857421875, "logps/rejected": -3759.8857421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -373.0702819824219, "rewards/margins": 0.0, "rewards/rejected": -373.0702819824219, "step": 1743 }, { "epoch": 18.357894736842105, "grad_norm": 1.3847267155142617e-06, "learning_rate": 0.00016345263157894736, "logits/chosen": 13.179770469665527, "logits/rejected": 13.179770469665527, "logps/chosen": -2968.576171875, "logps/rejected": -2968.576171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.1314392089844, "rewards/margins": 0.0, "rewards/rejected": -294.1314392089844, "step": 1744 }, { "epoch": 18.36842105263158, "grad_norm": 1.594595460119308e-06, "learning_rate": 0.0001634315789473684, "logits/chosen": 13.238062858581543, "logits/rejected": 13.238062858581543, "logps/chosen": -5177.8115234375, "logps/rejected": -5177.8115234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.8427124023438, "rewards/margins": 0.0, "rewards/rejected": -514.8427124023438, "step": 1745 }, { "epoch": 18.378947368421052, "grad_norm": 1.6247003031821805e-06, "learning_rate": 0.0001634105263157895, "logits/chosen": 13.17969036102295, "logits/rejected": 13.17969036102295, "logps/chosen": -3999.08984375, "logps/rejected": -3999.08984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0816650390625, "rewards/margins": 0.0, "rewards/rejected": -397.0816650390625, "step": 1746 }, { "epoch": 18.389473684210525, "grad_norm": 7.692754593335849e-07, "learning_rate": 0.00016338947368421054, "logits/chosen": 13.195625305175781, "logits/rejected": 13.195625305175781, "logps/chosen": -3545.607421875, "logps/rejected": -3545.607421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.5348815917969, "rewards/margins": 0.0, "rewards/rejected": -351.5348815917969, "step": 1747 }, { "epoch": 18.4, "grad_norm": 1.3056379657427897e-06, "learning_rate": 0.0001633684210526316, "logits/chosen": 13.214991569519043, "logits/rejected": 13.214991569519043, "logps/chosen": -4326.58203125, "logps/rejected": -4326.58203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.4733581542969, "rewards/margins": 0.0, "rewards/rejected": -429.4733581542969, "step": 1748 }, { "epoch": 18.410526315789475, "grad_norm": 1.0942079597953125e-06, "learning_rate": 0.00016334736842105264, "logits/chosen": 13.174125671386719, "logits/rejected": 13.174125671386719, "logps/chosen": -2968.7578125, "logps/rejected": -2968.7578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.14959716796875, "rewards/margins": 0.0, "rewards/rejected": -294.14959716796875, "step": 1749 }, { "epoch": 18.42105263157895, "grad_norm": 2.1360276605264517e-06, "learning_rate": 0.0001633263157894737, "logits/chosen": 13.196590423583984, "logits/rejected": 13.196590423583984, "logps/chosen": -4289.212890625, "logps/rejected": -4289.212890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.1239318847656, "rewards/margins": 0.0, "rewards/rejected": -426.1239318847656, "step": 1750 }, { "epoch": 18.42105263157895, "eval_logits/chosen": 13.2058687210083, "eval_logits/rejected": 13.2058687210083, "eval_logps/chosen": -4311.08837890625, "eval_logps/rejected": -4311.08837890625, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -428.20556640625, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -428.20556640625, "eval_runtime": 4.2818, "eval_samples_per_second": 2.335, "eval_steps_per_second": 2.335, "step": 1750 }, { "epoch": 18.431578947368422, "grad_norm": 1.6378935470129363e-06, "learning_rate": 0.00016330526315789474, "logits/chosen": 13.220552444458008, "logits/rejected": 13.220552444458008, "logps/chosen": -4879.091796875, "logps/rejected": -4879.091796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.06201171875, "rewards/margins": 0.0, "rewards/rejected": -485.06201171875, "step": 1751 }, { "epoch": 18.442105263157895, "grad_norm": 1.196119342239399e-06, "learning_rate": 0.00016328421052631579, "logits/chosen": 13.164741516113281, "logits/rejected": 13.164741516113281, "logps/chosen": -3999.990234375, "logps/rejected": -3999.990234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1717224121094, "rewards/margins": 0.0, "rewards/rejected": -397.1717224121094, "step": 1752 }, { "epoch": 18.45263157894737, "grad_norm": 1.747448095557047e-06, "learning_rate": 0.00016326315789473686, "logits/chosen": 13.20039176940918, "logits/rejected": 13.20039176940918, "logps/chosen": -4327.0859375, "logps/rejected": -4327.0859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.52374267578125, "rewards/margins": 0.0, "rewards/rejected": -429.52374267578125, "step": 1753 }, { "epoch": 18.46315789473684, "grad_norm": 1.3808701169182314e-06, "learning_rate": 0.0001632421052631579, "logits/chosen": 13.217162132263184, "logits/rejected": 13.217162132263184, "logps/chosen": -5177.5419921875, "logps/rejected": -5177.5419921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.8157958984375, "rewards/margins": 0.0, "rewards/rejected": -514.8157958984375, "step": 1754 }, { "epoch": 18.473684210526315, "grad_norm": 1.3913941074861214e-06, "learning_rate": 0.00016322105263157896, "logits/chosen": 13.216073989868164, "logits/rejected": 13.216073989868164, "logps/chosen": -5177.658203125, "logps/rejected": -5177.658203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.827392578125, "rewards/margins": 0.0, "rewards/rejected": -514.827392578125, "step": 1755 }, { "epoch": 18.48421052631579, "grad_norm": 1.474475766372052e-06, "learning_rate": 0.0001632, "logits/chosen": 13.159319877624512, "logits/rejected": 13.159319877624512, "logps/chosen": -2969.203125, "logps/rejected": -2969.203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.1941223144531, "rewards/margins": 0.0, "rewards/rejected": -294.1941223144531, "step": 1756 }, { "epoch": 18.49473684210526, "grad_norm": 1.266942490474321e-06, "learning_rate": 0.00016317894736842106, "logits/chosen": 13.214132308959961, "logits/rejected": 13.214132308959961, "logps/chosen": -4879.36767578125, "logps/rejected": -4879.36767578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.089599609375, "rewards/margins": 0.0, "rewards/rejected": -485.089599609375, "step": 1757 }, { "epoch": 18.50526315789474, "grad_norm": 1.2730267826555064e-06, "learning_rate": 0.0001631578947368421, "logits/chosen": 13.20785140991211, "logits/rejected": 13.20785140991211, "logps/chosen": -4328.140625, "logps/rejected": -4328.140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.62921142578125, "rewards/margins": 0.0, "rewards/rejected": -429.62921142578125, "step": 1758 }, { "epoch": 18.51578947368421, "grad_norm": 8.870155170370708e-07, "learning_rate": 0.00016313684210526316, "logits/chosen": 13.171010971069336, "logits/rejected": 13.171010971069336, "logps/chosen": -2673.05859375, "logps/rejected": -2673.05859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.5242614746094, "rewards/margins": 0.0, "rewards/rejected": -264.5242614746094, "step": 1759 }, { "epoch": 18.526315789473685, "grad_norm": 8.619770142104244e-07, "learning_rate": 0.00016311578947368423, "logits/chosen": 13.196556091308594, "logits/rejected": 13.196556091308594, "logps/chosen": -3545.423828125, "logps/rejected": -3545.423828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.5165100097656, "rewards/margins": 0.0, "rewards/rejected": -351.5165100097656, "step": 1760 }, { "epoch": 18.53684210526316, "grad_norm": 1.8582466054795077e-06, "learning_rate": 0.00016309473684210528, "logits/chosen": 13.205060958862305, "logits/rejected": 13.205060958862305, "logps/chosen": -3775.66015625, "logps/rejected": -3775.66015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.6968994140625, "rewards/margins": 0.0, "rewards/rejected": -374.6968994140625, "step": 1761 }, { "epoch": 18.54736842105263, "grad_norm": 8.518305776306079e-07, "learning_rate": 0.00016307368421052633, "logits/chosen": 13.191962242126465, "logits/rejected": 13.191962242126465, "logps/chosen": -2969.541015625, "logps/rejected": -2969.541015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.2279052734375, "rewards/margins": 0.0, "rewards/rejected": -294.2279052734375, "step": 1762 }, { "epoch": 18.557894736842105, "grad_norm": 8.548731216251326e-07, "learning_rate": 0.00016305263157894735, "logits/chosen": 13.211217880249023, "logits/rejected": 13.211217880249023, "logps/chosen": -3545.353515625, "logps/rejected": -3545.353515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.5094909667969, "rewards/margins": 0.0, "rewards/rejected": -351.5094909667969, "step": 1763 }, { "epoch": 18.568421052631578, "grad_norm": 1.6722326563467504e-06, "learning_rate": 0.00016303157894736843, "logits/chosen": 13.219381332397461, "logits/rejected": 13.219381332397461, "logps/chosen": -3776.5546875, "logps/rejected": -3776.5546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.7863464355469, "rewards/margins": 0.0, "rewards/rejected": -374.7863464355469, "step": 1764 }, { "epoch": 18.57894736842105, "grad_norm": 1.6722440250305226e-06, "learning_rate": 0.00016301052631578948, "logits/chosen": 13.255590438842773, "logits/rejected": 13.255590438842773, "logps/chosen": -4879.55859375, "logps/rejected": -4879.55859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.10870361328125, "rewards/margins": 0.0, "rewards/rejected": -485.10870361328125, "step": 1765 }, { "epoch": 18.589473684210525, "grad_norm": 1.3035553365625674e-06, "learning_rate": 0.00016298947368421053, "logits/chosen": 13.228759765625, "logits/rejected": 13.228759765625, "logps/chosen": -3777.3603515625, "logps/rejected": -3777.3603515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.8669128417969, "rewards/margins": 0.0, "rewards/rejected": -374.8669128417969, "step": 1766 }, { "epoch": 18.6, "grad_norm": 1.1575800726859597e-06, "learning_rate": 0.0001629684210526316, "logits/chosen": 13.207669258117676, "logits/rejected": 13.207669258117676, "logps/chosen": -3998.458984375, "logps/rejected": -3998.458984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0185852050781, "rewards/margins": 0.0, "rewards/rejected": -397.0185852050781, "step": 1767 }, { "epoch": 18.610526315789475, "grad_norm": 1.2824010582335177e-06, "learning_rate": 0.00016294736842105265, "logits/chosen": 13.207022666931152, "logits/rejected": 13.207022666931152, "logps/chosen": -3998.443359375, "logps/rejected": -3998.443359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.01702880859375, "rewards/margins": 0.0, "rewards/rejected": -397.01702880859375, "step": 1768 }, { "epoch": 18.621052631578948, "grad_norm": 1.4964734873501584e-06, "learning_rate": 0.00016292631578947368, "logits/chosen": 13.224481582641602, "logits/rejected": 13.224481582641602, "logps/chosen": -3545.59375, "logps/rejected": -3545.59375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.53350830078125, "rewards/margins": 0.0, "rewards/rejected": -351.53350830078125, "step": 1769 }, { "epoch": 18.63157894736842, "grad_norm": 1.755260882418952e-06, "learning_rate": 0.00016290526315789473, "logits/chosen": 13.260469436645508, "logits/rejected": 13.260469436645508, "logps/chosen": -4880.009765625, "logps/rejected": -4880.009765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.15380859375, "rewards/margins": 0.0, "rewards/rejected": -485.15380859375, "step": 1770 }, { "epoch": 18.642105263157895, "grad_norm": 1.6575957033637678e-06, "learning_rate": 0.0001628842105263158, "logits/chosen": 13.268829345703125, "logits/rejected": 13.268829345703125, "logps/chosen": -5176.8935546875, "logps/rejected": -5176.8935546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.7509155273438, "rewards/margins": 0.0, "rewards/rejected": -514.7509155273438, "step": 1771 }, { "epoch": 18.652631578947368, "grad_norm": 1.2034778364977683e-06, "learning_rate": 0.00016286315789473685, "logits/chosen": 13.264095306396484, "logits/rejected": 13.264095306396484, "logps/chosen": -4880.46630859375, "logps/rejected": -4880.46630859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.199462890625, "rewards/margins": 0.0, "rewards/rejected": -485.199462890625, "step": 1772 }, { "epoch": 18.66315789473684, "grad_norm": 1.132873308051785e-06, "learning_rate": 0.0001628421052631579, "logits/chosen": 13.208897590637207, "logits/rejected": 13.208897590637207, "logps/chosen": -3999.048828125, "logps/rejected": -3999.048828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.07757568359375, "rewards/margins": 0.0, "rewards/rejected": -397.07757568359375, "step": 1773 }, { "epoch": 18.673684210526314, "grad_norm": 1.4973380757510313e-06, "learning_rate": 0.00016282105263157895, "logits/chosen": 13.271554946899414, "logits/rejected": 13.271554946899414, "logps/chosen": -5176.4873046875, "logps/rejected": -5176.4873046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.7103271484375, "rewards/margins": 0.0, "rewards/rejected": -514.7103271484375, "step": 1774 }, { "epoch": 18.68421052631579, "grad_norm": 8.517868650415039e-07, "learning_rate": 0.0001628, "logits/chosen": 13.207213401794434, "logits/rejected": 13.207213401794434, "logps/chosen": -2673.1953125, "logps/rejected": -2673.1953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.5379333496094, "rewards/margins": 0.0, "rewards/rejected": -264.5379333496094, "step": 1775 }, { "epoch": 18.694736842105264, "grad_norm": 1.6368485376005992e-06, "learning_rate": 0.00016277894736842105, "logits/chosen": 13.270036697387695, "logits/rejected": 13.270036697387695, "logps/chosen": -5176.90673828125, "logps/rejected": -5176.90673828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.7522583007812, "rewards/margins": 0.0, "rewards/rejected": -514.7522583007812, "step": 1776 }, { "epoch": 18.705263157894738, "grad_norm": 1.1331502491884748e-06, "learning_rate": 0.0001627578947368421, "logits/chosen": 13.205471992492676, "logits/rejected": 13.205471992492676, "logps/chosen": -3999.43359375, "logps/rejected": -3999.43359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1160583496094, "rewards/margins": 0.0, "rewards/rejected": -397.1160583496094, "step": 1777 }, { "epoch": 18.71578947368421, "grad_norm": 9.49476543610217e-07, "learning_rate": 0.00016273684210526317, "logits/chosen": 13.22486686706543, "logits/rejected": 13.22486686706543, "logps/chosen": -3544.505859375, "logps/rejected": -3544.505859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.4247131347656, "rewards/margins": 0.0, "rewards/rejected": -351.4247131347656, "step": 1778 }, { "epoch": 18.726315789473684, "grad_norm": 1.0547296369622927e-06, "learning_rate": 0.00016271578947368422, "logits/chosen": 13.25903034210205, "logits/rejected": 13.25903034210205, "logps/chosen": -4881.5244140625, "logps/rejected": -4881.5244140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.3052673339844, "rewards/margins": 0.0, "rewards/rejected": -485.3052673339844, "step": 1779 }, { "epoch": 18.736842105263158, "grad_norm": 1.4396492815649253e-06, "learning_rate": 0.00016269473684210527, "logits/chosen": 13.228307723999023, "logits/rejected": 13.228307723999023, "logps/chosen": -4287.298828125, "logps/rejected": -4287.298828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.9325256347656, "rewards/margins": 0.0, "rewards/rejected": -425.9325256347656, "step": 1780 }, { "epoch": 18.74736842105263, "grad_norm": 2.24858717956522e-06, "learning_rate": 0.00016267368421052632, "logits/chosen": 13.241291046142578, "logits/rejected": 13.241291046142578, "logps/chosen": -4327.0, "logps/rejected": -4327.0, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.51513671875, "rewards/margins": 0.0, "rewards/rejected": -429.51513671875, "step": 1781 }, { "epoch": 18.757894736842104, "grad_norm": 1.0807607395690866e-06, "learning_rate": 0.00016265263157894737, "logits/chosen": 13.196743965148926, "logits/rejected": 13.196743965148926, "logps/chosen": -3999.537109375, "logps/rejected": -3999.537109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.12640380859375, "rewards/margins": 0.0, "rewards/rejected": -397.12640380859375, "step": 1782 }, { "epoch": 18.768421052631577, "grad_norm": 1.2052894362568622e-06, "learning_rate": 0.00016263157894736842, "logits/chosen": 13.218193054199219, "logits/rejected": 13.218193054199219, "logps/chosen": -3779.65625, "logps/rejected": -3779.65625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -375.0965270996094, "rewards/margins": 0.0, "rewards/rejected": -375.0965270996094, "step": 1783 }, { "epoch": 18.778947368421054, "grad_norm": 1.1812438742708764e-06, "learning_rate": 0.00016261052631578947, "logits/chosen": 13.214422225952148, "logits/rejected": 13.214422225952148, "logps/chosen": -3779.775390625, "logps/rejected": -3779.775390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -375.1084289550781, "rewards/margins": 0.0, "rewards/rejected": -375.1084289550781, "step": 1784 }, { "epoch": 18.789473684210527, "grad_norm": 8.895038376977027e-07, "learning_rate": 0.00016258947368421055, "logits/chosen": 13.208364486694336, "logits/rejected": 13.208364486694336, "logps/chosen": -3544.01953125, "logps/rejected": -3544.01953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3760681152344, "rewards/margins": 0.0, "rewards/rejected": -351.3760681152344, "step": 1785 }, { "epoch": 18.8, "grad_norm": 1.5744819847895997e-06, "learning_rate": 0.0001625684210526316, "logits/chosen": 13.2423095703125, "logits/rejected": 13.2423095703125, "logps/chosen": -4881.88037109375, "logps/rejected": -4881.88037109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.34088134765625, "rewards/margins": 0.0, "rewards/rejected": -485.34088134765625, "step": 1786 }, { "epoch": 18.810526315789474, "grad_norm": 1.2158124036432127e-06, "learning_rate": 0.00016254736842105265, "logits/chosen": 13.199462890625, "logits/rejected": 13.199462890625, "logps/chosen": -3757.240234375, "logps/rejected": -3757.240234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.80572509765625, "rewards/margins": 0.0, "rewards/rejected": -372.80572509765625, "step": 1787 }, { "epoch": 18.821052631578947, "grad_norm": 1.4959591680963058e-06, "learning_rate": 0.0001625263157894737, "logits/chosen": 13.241324424743652, "logits/rejected": 13.241324424743652, "logps/chosen": -4882.1474609375, "logps/rejected": -4882.1474609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.3675842285156, "rewards/margins": 0.0, "rewards/rejected": -485.3675842285156, "step": 1788 }, { "epoch": 18.83157894736842, "grad_norm": 1.2322303746259422e-06, "learning_rate": 0.00016250526315789474, "logits/chosen": 13.193045616149902, "logits/rejected": 13.193045616149902, "logps/chosen": -2967.212890625, "logps/rejected": -2967.212890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -293.9950866699219, "rewards/margins": 0.0, "rewards/rejected": -293.9950866699219, "step": 1789 }, { "epoch": 18.842105263157894, "grad_norm": 1.706665102574334e-06, "learning_rate": 0.0001624842105263158, "logits/chosen": 13.215299606323242, "logits/rejected": 13.215299606323242, "logps/chosen": -4288.0556640625, "logps/rejected": -4288.0556640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.0082092285156, "rewards/margins": 0.0, "rewards/rejected": -426.0082092285156, "step": 1790 }, { "epoch": 18.852631578947367, "grad_norm": 1.2062840824000887e-06, "learning_rate": 0.00016246315789473684, "logits/chosen": 13.213371276855469, "logits/rejected": 13.213371276855469, "logps/chosen": -3781.076171875, "logps/rejected": -3781.076171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -375.2384948730469, "rewards/margins": 0.0, "rewards/rejected": -375.2384948730469, "step": 1791 }, { "epoch": 18.863157894736844, "grad_norm": 1.1165700470883166e-06, "learning_rate": 0.00016244210526315792, "logits/chosen": 13.188092231750488, "logits/rejected": 13.188092231750488, "logps/chosen": -3999.748046875, "logps/rejected": -3999.748046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1474914550781, "rewards/margins": 0.0, "rewards/rejected": -397.1474914550781, "step": 1792 }, { "epoch": 18.873684210526317, "grad_norm": 8.589752837906417e-07, "learning_rate": 0.00016242105263157897, "logits/chosen": 13.185833930969238, "logits/rejected": 13.185833930969238, "logps/chosen": -2672.76953125, "logps/rejected": -2672.76953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.495361328125, "rewards/margins": 0.0, "rewards/rejected": -264.495361328125, "step": 1793 }, { "epoch": 18.88421052631579, "grad_norm": 8.52804532769369e-07, "learning_rate": 0.00016240000000000002, "logits/chosen": 13.182662010192871, "logits/rejected": 13.182662010192871, "logps/chosen": -2672.9296875, "logps/rejected": -2672.9296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.5113830566406, "rewards/margins": 0.0, "rewards/rejected": -264.5113830566406, "step": 1794 }, { "epoch": 18.894736842105264, "grad_norm": 1.524874505776097e-06, "learning_rate": 0.00016237894736842104, "logits/chosen": 13.193991661071777, "logits/rejected": 13.193991661071777, "logps/chosen": -3757.8955078125, "logps/rejected": -3757.8955078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.8712463378906, "rewards/margins": 0.0, "rewards/rejected": -372.8712463378906, "step": 1795 }, { "epoch": 18.905263157894737, "grad_norm": 1.5084017377375858e-06, "learning_rate": 0.00016235789473684212, "logits/chosen": 13.241434097290039, "logits/rejected": 13.241434097290039, "logps/chosen": -5175.3798828125, "logps/rejected": -5175.3798828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.5995483398438, "rewards/margins": 0.0, "rewards/rejected": -514.5995483398438, "step": 1796 }, { "epoch": 18.91578947368421, "grad_norm": 1.1134924307043548e-06, "learning_rate": 0.00016233684210526317, "logits/chosen": 13.200626373291016, "logits/rejected": 13.200626373291016, "logps/chosen": -3782.296875, "logps/rejected": -3782.296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -375.3605651855469, "rewards/margins": 0.0, "rewards/rejected": -375.3605651855469, "step": 1797 }, { "epoch": 18.926315789473684, "grad_norm": 9.63920228969073e-07, "learning_rate": 0.00016231578947368421, "logits/chosen": 13.197031021118164, "logits/rejected": 13.197031021118164, "logps/chosen": -3543.177734375, "logps/rejected": -3543.177734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.2919006347656, "rewards/margins": 0.0, "rewards/rejected": -351.2919006347656, "step": 1798 }, { "epoch": 18.936842105263157, "grad_norm": 1.1202159839740489e-06, "learning_rate": 0.0001622947368421053, "logits/chosen": 13.174903869628906, "logits/rejected": 13.174903869628906, "logps/chosen": -3999.482421875, "logps/rejected": -3999.482421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1209411621094, "rewards/margins": 0.0, "rewards/rejected": -397.1209411621094, "step": 1799 }, { "epoch": 18.94736842105263, "grad_norm": 1.1833983535325387e-06, "learning_rate": 0.00016227368421052634, "logits/chosen": 13.229084014892578, "logits/rejected": 13.229084014892578, "logps/chosen": -4882.0439453125, "logps/rejected": -4882.0439453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.35723876953125, "rewards/margins": 0.0, "rewards/rejected": -485.35723876953125, "step": 1800 }, { "epoch": 18.94736842105263, "eval_logits/chosen": 13.212472915649414, "eval_logits/rejected": 13.212472915649414, "eval_logps/chosen": -4311.494140625, "eval_logps/rejected": -4311.494140625, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -428.2461853027344, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -428.2461853027344, "eval_runtime": 4.4235, "eval_samples_per_second": 2.261, "eval_steps_per_second": 2.261, "step": 1800 }, { "epoch": 18.957894736842107, "grad_norm": 8.207575206142792e-07, "learning_rate": 0.00016225263157894736, "logits/chosen": 13.172099113464355, "logits/rejected": 13.172099113464355, "logps/chosen": -2674.26953125, "logps/rejected": -2674.26953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.6453552246094, "rewards/margins": 0.0, "rewards/rejected": -264.6453552246094, "step": 1801 }, { "epoch": 18.96842105263158, "grad_norm": 1.1158606412209338e-06, "learning_rate": 0.0001622315789473684, "logits/chosen": 13.22603988647461, "logits/rejected": 13.22603988647461, "logps/chosen": -4882.111328125, "logps/rejected": -4882.111328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.3639831542969, "rewards/margins": 0.0, "rewards/rejected": -485.3639831542969, "step": 1802 }, { "epoch": 18.978947368421053, "grad_norm": 1.7575513311385293e-06, "learning_rate": 0.0001622105263157895, "logits/chosen": 13.214081764221191, "logits/rejected": 13.214081764221191, "logps/chosen": -4325.6953125, "logps/rejected": -4325.6953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.3846740722656, "rewards/margins": 0.0, "rewards/rejected": -429.3846740722656, "step": 1803 }, { "epoch": 18.989473684210527, "grad_norm": 1.5989957091733231e-06, "learning_rate": 0.00016218947368421054, "logits/chosen": 13.23378849029541, "logits/rejected": 13.23378849029541, "logps/chosen": -5174.8564453125, "logps/rejected": -5174.8564453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.5472412109375, "rewards/margins": 0.0, "rewards/rejected": -514.5472412109375, "step": 1804 }, { "epoch": 19.0, "grad_norm": 1.1448355508036911e-06, "learning_rate": 0.00016216842105263159, "logits/chosen": 13.22888469696045, "logits/rejected": 13.22888469696045, "logps/chosen": -4882.234375, "logps/rejected": -4882.234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.37628173828125, "rewards/margins": 0.0, "rewards/rejected": -485.37628173828125, "step": 1805 }, { "epoch": 19.010526315789473, "grad_norm": 1.2935046243001125e-06, "learning_rate": 0.00016214736842105264, "logits/chosen": 13.184165000915527, "logits/rejected": 13.184165000915527, "logps/chosen": -2967.94921875, "logps/rejected": -2967.94921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.0687255859375, "rewards/margins": 0.0, "rewards/rejected": -294.0687255859375, "step": 1806 }, { "epoch": 19.021052631578947, "grad_norm": 1.386531266689417e-06, "learning_rate": 0.00016212631578947368, "logits/chosen": 13.227188110351562, "logits/rejected": 13.227188110351562, "logps/chosen": -4325.90234375, "logps/rejected": -4325.90234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.4053649902344, "rewards/margins": 0.0, "rewards/rejected": -429.4053649902344, "step": 1807 }, { "epoch": 19.03157894736842, "grad_norm": 1.4685055020891014e-06, "learning_rate": 0.00016210526315789473, "logits/chosen": 13.186352729797363, "logits/rejected": 13.186352729797363, "logps/chosen": -3999.365234375, "logps/rejected": -3999.365234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.1092224121094, "rewards/margins": 0.0, "rewards/rejected": -397.1092224121094, "step": 1808 }, { "epoch": 19.042105263157893, "grad_norm": 1.2376596032481757e-06, "learning_rate": 0.00016208421052631578, "logits/chosen": 13.231396675109863, "logits/rejected": 13.231396675109863, "logps/chosen": -4326.2919921875, "logps/rejected": -4326.2919921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.4443359375, "rewards/margins": 0.0, "rewards/rejected": -429.4443359375, "step": 1809 }, { "epoch": 19.05263157894737, "grad_norm": 1.2141348406657926e-06, "learning_rate": 0.00016206315789473686, "logits/chosen": 13.23031997680664, "logits/rejected": 13.23031997680664, "logps/chosen": -4326.7333984375, "logps/rejected": -4326.7333984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.4884948730469, "rewards/margins": 0.0, "rewards/rejected": -429.4884948730469, "step": 1810 }, { "epoch": 19.063157894736843, "grad_norm": 2.3427364794770256e-06, "learning_rate": 0.0001620421052631579, "logits/chosen": 13.246783256530762, "logits/rejected": 13.246783256530762, "logps/chosen": -5175.3828125, "logps/rejected": -5175.3828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.599853515625, "rewards/margins": 0.0, "rewards/rejected": -514.599853515625, "step": 1811 }, { "epoch": 19.073684210526316, "grad_norm": 7.989654022821924e-07, "learning_rate": 0.00016202105263157896, "logits/chosen": 13.186037063598633, "logits/rejected": 13.186037063598633, "logps/chosen": -2674.52734375, "logps/rejected": -2674.52734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.671142578125, "rewards/margins": 0.0, "rewards/rejected": -264.671142578125, "step": 1812 }, { "epoch": 19.08421052631579, "grad_norm": 1.1549470855243271e-06, "learning_rate": 0.000162, "logits/chosen": 13.242344856262207, "logits/rejected": 13.242344856262207, "logps/chosen": -4882.19140625, "logps/rejected": -4882.19140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.3719787597656, "rewards/margins": 0.0, "rewards/rejected": -485.3719787597656, "step": 1813 }, { "epoch": 19.094736842105263, "grad_norm": 8.248579774772224e-07, "learning_rate": 0.00016197894736842106, "logits/chosen": 13.189043045043945, "logits/rejected": 13.189043045043945, "logps/chosen": -2674.630859375, "logps/rejected": -2674.630859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.6814880371094, "rewards/margins": 0.0, "rewards/rejected": -264.6814880371094, "step": 1814 }, { "epoch": 19.105263157894736, "grad_norm": 1.630377028050134e-06, "learning_rate": 0.0001619578947368421, "logits/chosen": 13.254777908325195, "logits/rejected": 13.254777908325195, "logps/chosen": -5176.736328125, "logps/rejected": -5176.736328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.7352294921875, "rewards/margins": 0.0, "rewards/rejected": -514.7352294921875, "step": 1815 }, { "epoch": 19.11578947368421, "grad_norm": 1.6961309938778868e-06, "learning_rate": 0.00016193684210526316, "logits/chosen": 13.259284973144531, "logits/rejected": 13.259284973144531, "logps/chosen": -5176.9248046875, "logps/rejected": -5176.9248046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.7540893554688, "rewards/margins": 0.0, "rewards/rejected": -514.7540893554688, "step": 1816 }, { "epoch": 19.126315789473683, "grad_norm": 1.3150581708032405e-06, "learning_rate": 0.00016191578947368423, "logits/chosen": 13.200679779052734, "logits/rejected": 13.200679779052734, "logps/chosen": -3998.30859375, "logps/rejected": -3998.30859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0035400390625, "rewards/margins": 0.0, "rewards/rejected": -397.0035400390625, "step": 1817 }, { "epoch": 19.13684210526316, "grad_norm": 8.613211548436084e-07, "learning_rate": 0.00016189473684210528, "logits/chosen": 13.204448699951172, "logits/rejected": 13.204448699951172, "logps/chosen": -2674.6162109375, "logps/rejected": -2674.6162109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.6800231933594, "rewards/margins": 0.0, "rewards/rejected": -264.6800231933594, "step": 1818 }, { "epoch": 19.147368421052633, "grad_norm": 1.0789335647132248e-06, "learning_rate": 0.00016187368421052633, "logits/chosen": 13.226563453674316, "logits/rejected": 13.226563453674316, "logps/chosen": -3544.240234375, "logps/rejected": -3544.240234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3981628417969, "rewards/margins": 0.0, "rewards/rejected": -351.3981628417969, "step": 1819 }, { "epoch": 19.157894736842106, "grad_norm": 9.315833722212119e-07, "learning_rate": 0.00016185263157894738, "logits/chosen": 13.230652809143066, "logits/rejected": 13.230652809143066, "logps/chosen": -3544.158203125, "logps/rejected": -3544.158203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.38995361328125, "rewards/margins": 0.0, "rewards/rejected": -351.38995361328125, "step": 1820 }, { "epoch": 19.16842105263158, "grad_norm": 8.662165100759012e-07, "learning_rate": 0.00016183157894736843, "logits/chosen": 13.237730026245117, "logits/rejected": 13.237730026245117, "logps/chosen": -3544.154296875, "logps/rejected": -3544.154296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.3895568847656, "rewards/margins": 0.0, "rewards/rejected": -351.3895568847656, "step": 1821 }, { "epoch": 19.178947368421053, "grad_norm": 1.006268121273024e-06, "learning_rate": 0.00016181052631578948, "logits/chosen": 13.239742279052734, "logits/rejected": 13.239742279052734, "logps/chosen": -3758.587890625, "logps/rejected": -3758.587890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.94049072265625, "rewards/margins": 0.0, "rewards/rejected": -372.94049072265625, "step": 1822 }, { "epoch": 19.189473684210526, "grad_norm": 1.0211598464593408e-06, "learning_rate": 0.00016178947368421053, "logits/chosen": 13.24504566192627, "logits/rejected": 13.24504566192627, "logps/chosen": -3758.546875, "logps/rejected": -3758.546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.9363708496094, "rewards/margins": 0.0, "rewards/rejected": -372.9363708496094, "step": 1823 }, { "epoch": 19.2, "grad_norm": 1.6710969248379115e-06, "learning_rate": 0.0001617684210526316, "logits/chosen": 13.280534744262695, "logits/rejected": 13.280534744262695, "logps/chosen": -4329.84765625, "logps/rejected": -4329.84765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -429.7998962402344, "rewards/margins": 0.0, "rewards/rejected": -429.7998962402344, "step": 1824 }, { "epoch": 19.210526315789473, "grad_norm": 1.0648620900610695e-06, "learning_rate": 0.00016174736842105265, "logits/chosen": 13.250589370727539, "logits/rejected": 13.250589370727539, "logps/chosen": -3758.7060546875, "logps/rejected": -3758.7060546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -372.9523010253906, "rewards/margins": 0.0, "rewards/rejected": -372.9523010253906, "step": 1825 }, { "epoch": 19.221052631578946, "grad_norm": 2.909375325543806e-06, "learning_rate": 0.00016172631578947368, "logits/chosen": 13.261029243469238, "logits/rejected": 13.261029243469238, "logps/chosen": -4286.9296875, "logps/rejected": -4286.9296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -425.8955993652344, "rewards/margins": 0.0, "rewards/rejected": -425.8955993652344, "step": 1826 }, { "epoch": 19.231578947368423, "grad_norm": 8.360383390026982e-07, "learning_rate": 0.00016170526315789472, "logits/chosen": 13.229317665100098, "logits/rejected": 13.229317665100098, "logps/chosen": -2675.111328125, "logps/rejected": -2675.111328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.72955322265625, "rewards/margins": 0.0, "rewards/rejected": -264.72955322265625, "step": 1827 }, { "epoch": 19.242105263157896, "grad_norm": 1.2091257985957782e-06, "learning_rate": 0.0001616842105263158, "logits/chosen": 13.21916675567627, "logits/rejected": 13.21916675567627, "logps/chosen": -3996.8671875, "logps/rejected": -3996.8671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.8594055175781, "rewards/margins": 0.0, "rewards/rejected": -396.8594055175781, "step": 1828 }, { "epoch": 19.25263157894737, "grad_norm": 1.8102182366419584e-06, "learning_rate": 0.00016166315789473685, "logits/chosen": 13.21895694732666, "logits/rejected": 13.21895694732666, "logps/chosen": -2968.1845703125, "logps/rejected": -2968.1845703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.0922546386719, "rewards/margins": 0.0, "rewards/rejected": -294.0922546386719, "step": 1829 }, { "epoch": 19.263157894736842, "grad_norm": 1.3274924413053668e-06, "learning_rate": 0.0001616421052631579, "logits/chosen": 13.228885650634766, "logits/rejected": 13.228885650634766, "logps/chosen": -3545.7734375, "logps/rejected": -3545.7734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.5514831542969, "rewards/margins": 0.0, "rewards/rejected": -351.5514831542969, "step": 1830 }, { "epoch": 19.273684210526316, "grad_norm": 1.4063456319490797e-06, "learning_rate": 0.00016162105263157895, "logits/chosen": 13.266222953796387, "logits/rejected": 13.266222953796387, "logps/chosen": -4879.04736328125, "logps/rejected": -4879.04736328125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.0575866699219, "rewards/margins": 0.0, "rewards/rejected": -485.0575866699219, "step": 1831 }, { "epoch": 19.28421052631579, "grad_norm": 1.2195123417768627e-06, "learning_rate": 0.00016160000000000002, "logits/chosen": 13.20736312866211, "logits/rejected": 13.20736312866211, "logps/chosen": -3996.8515625, "logps/rejected": -3996.8515625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.85784912109375, "rewards/margins": 0.0, "rewards/rejected": -396.85784912109375, "step": 1832 }, { "epoch": 19.294736842105262, "grad_norm": 1.2364076837911853e-06, "learning_rate": 0.00016157894736842105, "logits/chosen": 13.20590877532959, "logits/rejected": 13.20590877532959, "logps/chosen": -3996.908203125, "logps/rejected": -3996.908203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.8634948730469, "rewards/margins": 0.0, "rewards/rejected": -396.8634948730469, "step": 1833 }, { "epoch": 19.305263157894736, "grad_norm": 7.871368552514468e-07, "learning_rate": 0.0001615578947368421, "logits/chosen": 13.202531814575195, "logits/rejected": 13.202531814575195, "logps/chosen": -2675.5341796875, "logps/rejected": -2675.5341796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.7718200683594, "rewards/margins": 0.0, "rewards/rejected": -264.7718200683594, "step": 1834 }, { "epoch": 19.31578947368421, "grad_norm": 1.3683973065781174e-06, "learning_rate": 0.00016153684210526317, "logits/chosen": 13.19500732421875, "logits/rejected": 13.19500732421875, "logps/chosen": -3998.021484375, "logps/rejected": -3998.021484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -396.9748229980469, "rewards/margins": 0.0, "rewards/rejected": -396.9748229980469, "step": 1835 }, { "epoch": 19.326315789473686, "grad_norm": 8.103224331534875e-07, "learning_rate": 0.00016151578947368422, "logits/chosen": 13.187421798706055, "logits/rejected": 13.187421798706055, "logps/chosen": -2675.7939453125, "logps/rejected": -2675.7939453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -264.7978210449219, "rewards/margins": 0.0, "rewards/rejected": -264.7978210449219, "step": 1836 }, { "epoch": 19.33684210526316, "grad_norm": 7.7311335644481e-07, "learning_rate": 0.00016149473684210527, "logits/chosen": 13.199485778808594, "logits/rejected": 13.199485778808594, "logps/chosen": -3545.5712890625, "logps/rejected": -3545.5712890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.53125, "rewards/margins": 0.0, "rewards/rejected": -351.53125, "step": 1837 }, { "epoch": 19.347368421052632, "grad_norm": 1.428639848199964e-06, "learning_rate": 0.00016147368421052632, "logits/chosen": 13.193339347839355, "logits/rejected": 13.193339347839355, "logps/chosen": -3776.21484375, "logps/rejected": -3776.21484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.75238037109375, "rewards/margins": 0.0, "rewards/rejected": -374.75238037109375, "step": 1838 }, { "epoch": 19.357894736842105, "grad_norm": 1.1254073797317687e-06, "learning_rate": 0.00016145263157894737, "logits/chosen": 13.160295486450195, "logits/rejected": 13.160295486450195, "logps/chosen": -3999.005859375, "logps/rejected": -3999.005859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.0732727050781, "rewards/margins": 0.0, "rewards/rejected": -397.0732727050781, "step": 1839 }, { "epoch": 19.36842105263158, "grad_norm": 1.3588329466074356e-06, "learning_rate": 0.00016143157894736842, "logits/chosen": 13.176033020019531, "logits/rejected": 13.176033020019531, "logps/chosen": -3777.0634765625, "logps/rejected": -3777.0634765625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -374.8372497558594, "rewards/margins": 0.0, "rewards/rejected": -374.8372497558594, "step": 1840 }, { "epoch": 19.378947368421052, "grad_norm": 1.33735113649891e-06, "learning_rate": 0.00016141052631578947, "logits/chosen": 13.201398849487305, "logits/rejected": 13.201398849487305, "logps/chosen": -4878.69873046875, "logps/rejected": -4878.69873046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.022705078125, "rewards/margins": 0.0, "rewards/rejected": -485.022705078125, "step": 1841 }, { "epoch": 19.389473684210525, "grad_norm": 1.5353456319644465e-06, "learning_rate": 0.00016138947368421054, "logits/chosen": 13.20130443572998, "logits/rejected": 13.20130443572998, "logps/chosen": -5175.6748046875, "logps/rejected": -5175.6748046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -514.6290893554688, "rewards/margins": 0.0, "rewards/rejected": -514.6290893554688, "step": 1842 }, { "epoch": 19.4, "grad_norm": 1.4818235740676755e-06, "learning_rate": 0.0001613684210526316, "logits/chosen": 13.18979263305664, "logits/rejected": 13.18979263305664, "logps/chosen": -4878.943359375, "logps/rejected": -4878.943359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.04718017578125, "rewards/margins": 0.0, "rewards/rejected": -485.04718017578125, "step": 1843 }, { "epoch": 19.410526315789475, "grad_norm": 1.6471550452479278e-06, "learning_rate": 0.00016134736842105264, "logits/chosen": 13.186290740966797, "logits/rejected": 13.186290740966797, "logps/chosen": -4879.10546875, "logps/rejected": -4879.10546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.0633850097656, "rewards/margins": 0.0, "rewards/rejected": -485.0633850097656, "step": 1844 }, { "epoch": 19.42105263157895, "grad_norm": 1.7259623064092011e-06, "learning_rate": 0.0001613263157894737, "logits/chosen": 13.150799751281738, "logits/rejected": 13.150799751281738, "logps/chosen": -4288.2705078125, "logps/rejected": -4288.2705078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -426.0296936035156, "rewards/margins": 0.0, "rewards/rejected": -426.0296936035156, "step": 1845 }, { "epoch": 19.431578947368422, "grad_norm": 1.8720892285273294e-06, "learning_rate": 0.00016130526315789474, "logits/chosen": 13.13432502746582, "logits/rejected": 13.13432502746582, "logps/chosen": -2968.306640625, "logps/rejected": -2968.306640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -294.1044616699219, "rewards/margins": 0.0, "rewards/rejected": -294.1044616699219, "step": 1846 }, { "epoch": 19.442105263157895, "grad_norm": 9.840515531323035e-07, "learning_rate": 0.0001612842105263158, "logits/chosen": 13.150936126708984, "logits/rejected": 13.150936126708984, "logps/chosen": -3544.3681640625, "logps/rejected": -3544.3681640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.41094970703125, "rewards/margins": 0.0, "rewards/rejected": -351.41094970703125, "step": 1847 }, { "epoch": 19.45263157894737, "grad_norm": 1.1795752925536362e-06, "learning_rate": 0.00016126315789473684, "logits/chosen": 13.131905555725098, "logits/rejected": 13.131905555725098, "logps/chosen": -4000.6953125, "logps/rejected": -4000.6953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -397.2422180175781, "rewards/margins": 0.0, "rewards/rejected": -397.2422180175781, "step": 1848 }, { "epoch": 19.46315789473684, "grad_norm": 1.0176698879149626e-06, "learning_rate": 0.00016124210526315792, "logits/chosen": 13.160609245300293, "logits/rejected": 13.160609245300293, "logps/chosen": -3544.1650390625, "logps/rejected": -3544.1650390625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -351.390625, "rewards/margins": 0.0, "rewards/rejected": -351.390625, "step": 1849 }, { "epoch": 19.473684210526315, "grad_norm": 4.105552761757281e-06, "learning_rate": 0.00016122105263157897, "logits/chosen": 13.200133323669434, "logits/rejected": 13.200133323669434, "logps/chosen": -4880.45849609375, "logps/rejected": -4880.45849609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -485.1986999511719, "rewards/margins": 0.0, "rewards/rejected": -485.1986999511719, "step": 1850 }, { "epoch": 19.473684210526315, "eval_logits/chosen": 13.179985046386719, "eval_logits/rejected": 13.179985046386719, "eval_logps/chosen": -4311.1513671875, "eval_logps/rejected": -4311.1513671875, "eval_loss": 0.6931471824645996, "eval_rewards/accuracies": 0.0, "eval_rewards/chosen": -428.21197509765625, "eval_rewards/margins": 0.0, "eval_rewards/rejected": -428.21197509765625, "eval_runtime": 4.3529, "eval_samples_per_second": 2.297, "eval_steps_per_second": 2.297, "step": 1850 } ], "logging_steps": 1, "max_steps": 9500, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }