{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 10000.0, "global_step": 156, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006418613980543576, "grad_norm": 32.048425683927725, "learning_rate": 4.999493072462126e-07, "logits/chosen": -2.115234375, "logits/rejected": -1.5445556640625, "logps/chosen": -132.65625, "logps/rejected": -181.029296875, "loss": 0.703399658203125, "rewards/accuracies": 0.439453125, "rewards/chosen": -0.015892624855041504, "rewards/margins": -0.00758051872253418, "rewards/rejected": -0.008310675621032715, "step": 1 }, { "epoch": 0.06418613980543576, "grad_norm": 23.578875399152054, "learning_rate": 4.949476630105669e-07, "logits/chosen": -2.146253824234009, "logits/rejected": -1.5375298261642456, "logps/chosen": -127.29991149902344, "logps/rejected": -182.31988525390625, "loss": 0.6579203075832791, "rewards/accuracies": 0.5796440839767456, "rewards/chosen": 0.001607447862625122, "rewards/margins": 0.09492193162441254, "rewards/rejected": -0.09333191812038422, "step": 10 }, { "epoch": 0.12837227961087153, "grad_norm": 16.096083288887513, "learning_rate": 4.799948609147061e-07, "logits/chosen": -2.1872315406799316, "logits/rejected": -1.5583984851837158, "logps/chosen": -127.17167663574219, "logps/rejected": -188.09335327148438, "loss": 0.5185166358947754, "rewards/accuracies": 0.7955077886581421, "rewards/chosen": 0.10318219661712646, "rewards/margins": 0.5961636304855347, "rewards/rejected": -0.49292677640914917, "step": 20 }, { "epoch": 0.1925584194163073, "grad_norm": 16.84839050494715, "learning_rate": 4.557459664734141e-07, "logits/chosen": -2.217529296875, "logits/rejected": -1.5862548351287842, "logps/chosen": -125.22636413574219, "logps/rejected": -189.91366577148438, "loss": 0.41579198837280273, "rewards/accuracies": 0.850781261920929, "rewards/chosen": 0.16709718108177185, "rewards/margins": 1.2345550060272217, "rewards/rejected": -1.067326307296753, "step": 30 }, { "epoch": 0.25674455922174305, "grad_norm": 8.87416271370084, "learning_rate": 4.2318108837739986e-07, "logits/chosen": -2.2668824195861816, "logits/rejected": -1.598077416419983, "logps/chosen": -127.8832015991211, "logps/rejected": -192.91796875, "loss": 0.35149335861206055, "rewards/accuracies": 0.866406261920929, "rewards/chosen": 0.12800344824790955, "rewards/margins": 1.7798080444335938, "rewards/rejected": -1.651770830154419, "step": 40 }, { "epoch": 0.3209306990271788, "grad_norm": 8.909019678687008, "learning_rate": 3.8361645653195024e-07, "logits/chosen": -2.3679442405700684, "logits/rejected": -1.644537329673767, "logps/chosen": -127.83222961425781, "logps/rejected": -205.20272827148438, "loss": 0.30472755432128906, "rewards/accuracies": 0.8851562738418579, "rewards/chosen": 0.2481112778186798, "rewards/margins": 2.5448379516601562, "rewards/rejected": -2.296844482421875, "step": 50 }, { "epoch": 0.3851168388326146, "grad_norm": 9.080316543515908, "learning_rate": 3.3865122176063385e-07, "logits/chosen": -2.381664991378784, "logits/rejected": -1.673553466796875, "logps/chosen": -126.41679382324219, "logps/rejected": -204.1730499267578, "loss": 0.3040948390960693, "rewards/accuracies": 0.883007824420929, "rewards/chosen": 0.16746802628040314, "rewards/margins": 2.610337734222412, "rewards/rejected": -2.443005323410034, "step": 60 }, { "epoch": 0.44930297863805035, "grad_norm": 7.319834355840221, "learning_rate": 2.9010282021444005e-07, "logits/chosen": -2.3894896507263184, "logits/rejected": -1.654962182044983, "logps/chosen": -126.53593444824219, "logps/rejected": -207.21328735351562, "loss": 0.2680961608886719, "rewards/accuracies": 0.8951171636581421, "rewards/chosen": 0.16773858666419983, "rewards/margins": 2.727093458175659, "rewards/rejected": -2.5592041015625, "step": 70 }, { "epoch": 0.5134891184434861, "grad_norm": 11.714525445790118, "learning_rate": 2.399335149726463e-07, "logits/chosen": -2.400561571121216, "logits/rejected": -1.6822998523712158, "logps/chosen": -126.2464828491211, "logps/rejected": -208.8488311767578, "loss": 0.27129082679748534, "rewards/accuracies": 0.8958984613418579, "rewards/chosen": 0.0786014050245285, "rewards/margins": 2.6517059803009033, "rewards/rejected": -2.5730834007263184, "step": 80 }, { "epoch": 0.5776752582489219, "grad_norm": 13.05044359804742, "learning_rate": 1.9017108392811062e-07, "logits/chosen": -2.4099974632263184, "logits/rejected": -1.6946532726287842, "logps/chosen": -129.740234375, "logps/rejected": -212.8679656982422, "loss": 0.25001063346862795, "rewards/accuracies": 0.9019531011581421, "rewards/chosen": 0.07523002475500107, "rewards/margins": 2.694448947906494, "rewards/rejected": -2.618884325027466, "step": 90 }, { "epoch": 0.6418613980543576, "grad_norm": 11.826565551460419, "learning_rate": 1.428268596492364e-07, "logits/chosen": -2.4135499000549316, "logits/rejected": -1.6916077136993408, "logps/chosen": -128.24374389648438, "logps/rejected": -200.87850952148438, "loss": 0.2475870132446289, "rewards/accuracies": 0.9037109613418579, "rewards/chosen": 0.13076062500476837, "rewards/margins": 2.663525342941284, "rewards/rejected": -2.5332884788513184, "step": 100 }, { "epoch": 0.7060475378597934, "grad_norm": 19.350657646267706, "learning_rate": 9.981443394050524e-08, "logits/chosen": -2.416271924972534, "logits/rejected": -1.668573021888733, "logps/chosen": -124.5199203491211, "logps/rejected": -204.02774047851562, "loss": 0.24730167388916016, "rewards/accuracies": 0.8970702886581421, "rewards/chosen": 0.06585326045751572, "rewards/margins": 2.7098052501678467, "rewards/rejected": -2.6437134742736816, "step": 110 }, { "epoch": 0.7702336776652292, "grad_norm": 14.914033483862706, "learning_rate": 6.28723129572247e-08, "logits/chosen": -2.42510986328125, "logits/rejected": -1.6651611328125, "logps/chosen": -129.416015625, "logps/rejected": -211.32461547851562, "loss": 0.2530521869659424, "rewards/accuracies": 0.8990234136581421, "rewards/chosen": 0.03038964234292507, "rewards/margins": 2.786761522293091, "rewards/rejected": -2.7561402320861816, "step": 120 }, { "epoch": 0.834419817470665, "grad_norm": 12.452750762359729, "learning_rate": 3.349364905389032e-08, "logits/chosen": -2.4213013648986816, "logits/rejected": -1.6964843273162842, "logps/chosen": -126.28125, "logps/rejected": -202.9324188232422, "loss": 0.24086828231811525, "rewards/accuracies": 0.904101550579071, "rewards/chosen": 0.0428071990609169, "rewards/margins": 2.784435987472534, "rewards/rejected": -2.741345167160034, "step": 130 }, { "epoch": 0.8986059572761007, "grad_norm": 7.337706041982024, "learning_rate": 1.2865889513213628e-08, "logits/chosen": -2.431103467941284, "logits/rejected": -1.689788818359375, "logps/chosen": -127.52030944824219, "logps/rejected": -207.2761688232422, "loss": 0.24082815647125244, "rewards/accuracies": 0.9056640863418579, "rewards/chosen": 0.055707789957523346, "rewards/margins": 2.8203492164611816, "rewards/rejected": -2.7646727561950684, "step": 140 }, { "epoch": 0.9627920970815365, "grad_norm": 9.490755329142848, "learning_rate": 1.8227814754865067e-09, "logits/chosen": -2.4349732398986816, "logits/rejected": -1.698211669921875, "logps/chosen": -129.65625, "logps/rejected": -206.28164672851562, "loss": 0.24139628410339356, "rewards/accuracies": 0.9046875238418579, "rewards/chosen": 0.04055643081665039, "rewards/margins": 2.805835008621216, "rewards/rejected": -2.7651429176330566, "step": 150 } ], "logging_steps": 10, "max_steps": 156, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2243593606337659e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }