{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999917239096251, "eval_steps": 500, "global_step": 6041, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05015310767193578, "grad_norm": 4.3741655349731445, "learning_rate": 5.008264462809917e-07, "logits/chosen": -0.5391724109649658, "logits/rejected": -0.4776774048805237, "logps/chosen": -73.29891204833984, "logps/rejected": -13.118536949157715, "loss": 0.684, "rewards/accuracies": 0.589108943939209, "rewards/chosen": 0.016623780131340027, "rewards/margins": 0.01931975968182087, "rewards/rejected": -0.0026959802489727736, "step": 303 }, { "epoch": 0.10030621534387156, "grad_norm": 3.5748157501220703, "learning_rate": 9.998160412067696e-07, "logits/chosen": -0.5328251719474792, "logits/rejected": -0.4811299741268158, "logps/chosen": -66.50557708740234, "logps/rejected": -15.574967384338379, "loss": 0.4581, "rewards/accuracies": 0.8675742745399475, "rewards/chosen": 0.6051633954048157, "rewards/margins": 0.8234596848487854, "rewards/rejected": -0.21829627454280853, "step": 606 }, { "epoch": 0.15045932301580733, "grad_norm": 3.891946315765381, "learning_rate": 9.440765268579838e-07, "logits/chosen": -0.5248011946678162, "logits/rejected": -0.4659003019332886, "logps/chosen": -61.22736358642578, "logps/rejected": -34.781681060791016, "loss": 0.0781, "rewards/accuracies": 0.9843234419822693, "rewards/chosen": 1.5499842166900635, "rewards/margins": 3.709665298461914, "rewards/rejected": -2.1596810817718506, "step": 909 }, { "epoch": 0.20061243068774312, "grad_norm": 0.11657057702541351, "learning_rate": 8.883370125091979e-07, "logits/chosen": -0.48526012897491455, "logits/rejected": -0.4090143144130707, "logps/chosen": -56.97617721557617, "logps/rejected": -48.22561264038086, "loss": 0.0338, "rewards/accuracies": 0.9867987036705017, "rewards/chosen": 1.691601276397705, "rewards/margins": 5.236583709716797, "rewards/rejected": -3.544982433319092, "step": 1212 }, { "epoch": 0.2507655383596789, "grad_norm": 0.1646522730588913, "learning_rate": 8.325974981604121e-07, "logits/chosen": -0.43419143557548523, "logits/rejected": -0.3306835889816284, "logps/chosen": -56.835514068603516, "logps/rejected": -55.82384490966797, "loss": 0.027, "rewards/accuracies": 0.9913366436958313, "rewards/chosen": 1.7533217668533325, "rewards/margins": 6.046737194061279, "rewards/rejected": -4.2934160232543945, "step": 1515 }, { "epoch": 0.30091864603161467, "grad_norm": 16.816612243652344, "learning_rate": 7.768579838116262e-07, "logits/chosen": -0.4011126756668091, "logits/rejected": -0.2737236022949219, "logps/chosen": -55.48396301269531, "logps/rejected": -65.16657257080078, "loss": 0.0307, "rewards/accuracies": 0.9892739653587341, "rewards/chosen": 1.696967601776123, "rewards/margins": 6.927124500274658, "rewards/rejected": -5.230156898498535, "step": 1818 }, { "epoch": 0.3510717537035504, "grad_norm": 2.5598514080047607, "learning_rate": 7.211184694628402e-07, "logits/chosen": -0.37529540061950684, "logits/rejected": -0.23136167228221893, "logps/chosen": -57.458560943603516, "logps/rejected": -76.030517578125, "loss": 0.0153, "rewards/accuracies": 0.9954620599746704, "rewards/chosen": 1.6613062620162964, "rewards/margins": 7.948617935180664, "rewards/rejected": -6.287312030792236, "step": 2121 }, { "epoch": 0.40122486137548624, "grad_norm": 0.03665272891521454, "learning_rate": 6.653789551140544e-07, "logits/chosen": -0.32625895738601685, "logits/rejected": -0.1696743667125702, "logps/chosen": -56.47751998901367, "logps/rejected": -82.79418182373047, "loss": 0.0185, "rewards/accuracies": 0.9929868578910828, "rewards/chosen": 1.56876802444458, "rewards/margins": 8.616827964782715, "rewards/rejected": -7.048060417175293, "step": 2424 }, { "epoch": 0.451377969047422, "grad_norm": 0.0068373712711036205, "learning_rate": 6.096394407652685e-07, "logits/chosen": -0.33455565571784973, "logits/rejected": -0.17350350320339203, "logps/chosen": -57.92192840576172, "logps/rejected": -88.568603515625, "loss": 0.0217, "rewards/accuracies": 0.9913366436958313, "rewards/chosen": 1.5466176271438599, "rewards/margins": 9.043907165527344, "rewards/rejected": -7.497289657592773, "step": 2727 }, { "epoch": 0.5015310767193578, "grad_norm": 1.1985265016555786, "learning_rate": 5.538999264164827e-07, "logits/chosen": -0.32679906487464905, "logits/rejected": -0.16868844628334045, "logps/chosen": -57.272857666015625, "logps/rejected": -90.3719711303711, "loss": 0.0215, "rewards/accuracies": 0.9929868578910828, "rewards/chosen": 1.4742236137390137, "rewards/margins": 9.2846097946167, "rewards/rejected": -7.8103861808776855, "step": 3030 }, { "epoch": 0.5516841843912935, "grad_norm": 0.004047638736665249, "learning_rate": 4.981604120676968e-07, "logits/chosen": -0.3204115033149719, "logits/rejected": -0.17468391358852386, "logps/chosen": -58.51227951049805, "logps/rejected": -94.06062316894531, "loss": 0.0278, "rewards/accuracies": 0.9905115962028503, "rewards/chosen": 1.527059555053711, "rewards/margins": 9.519412994384766, "rewards/rejected": -7.992353916168213, "step": 3333 }, { "epoch": 0.6018372920632293, "grad_norm": 0.3182278573513031, "learning_rate": 4.4242089771891094e-07, "logits/chosen": -0.3149339258670807, "logits/rejected": -0.15087205171585083, "logps/chosen": -57.350006103515625, "logps/rejected": -94.76825714111328, "loss": 0.0276, "rewards/accuracies": 0.9900990128517151, "rewards/chosen": 1.4863520860671997, "rewards/margins": 9.664637565612793, "rewards/rejected": -8.178285598754883, "step": 3636 }, { "epoch": 0.6519903997351651, "grad_norm": 0.016533929854631424, "learning_rate": 3.866813833701251e-07, "logits/chosen": -0.3052721619606018, "logits/rejected": -0.1389181911945343, "logps/chosen": -58.16946792602539, "logps/rejected": -95.81718444824219, "loss": 0.0245, "rewards/accuracies": 0.9909241199493408, "rewards/chosen": 1.4919017553329468, "rewards/margins": 9.79186725616455, "rewards/rejected": -8.299964904785156, "step": 3939 }, { "epoch": 0.7021435074071009, "grad_norm": 0.010237179696559906, "learning_rate": 3.3094186902133917e-07, "logits/chosen": -0.3208546042442322, "logits/rejected": -0.1598815768957138, "logps/chosen": -58.12910079956055, "logps/rejected": -97.86547088623047, "loss": 0.0245, "rewards/accuracies": 0.9913366436958313, "rewards/chosen": 1.4368475675582886, "rewards/margins": 9.900219917297363, "rewards/rejected": -8.463372230529785, "step": 4242 }, { "epoch": 0.7522966150790367, "grad_norm": 0.008713229559361935, "learning_rate": 2.752023546725533e-07, "logits/chosen": -0.3229035437107086, "logits/rejected": -0.15609696507453918, "logps/chosen": -57.847740173339844, "logps/rejected": -98.02397918701172, "loss": 0.019, "rewards/accuracies": 0.9925742745399475, "rewards/chosen": 1.491112232208252, "rewards/margins": 10.048946380615234, "rewards/rejected": -8.557833671569824, "step": 4545 }, { "epoch": 0.8024497227509725, "grad_norm": 0.0016923310467973351, "learning_rate": 2.1946284032376748e-07, "logits/chosen": -0.3198649287223816, "logits/rejected": -0.1555498093366623, "logps/chosen": -57.79204177856445, "logps/rejected": -97.5900650024414, "loss": 0.0152, "rewards/accuracies": 0.9958746433258057, "rewards/chosen": 1.5458145141601562, "rewards/margins": 9.990068435668945, "rewards/rejected": -8.444254875183105, "step": 4848 }, { "epoch": 0.8526028304229082, "grad_norm": 0.005034138448536396, "learning_rate": 1.637233259749816e-07, "logits/chosen": -0.31206631660461426, "logits/rejected": -0.13810566067695618, "logps/chosen": -60.38402557373047, "logps/rejected": -97.9999008178711, "loss": 0.0182, "rewards/accuracies": 0.9938119053840637, "rewards/chosen": 1.5842220783233643, "rewards/margins": 10.07706356048584, "rewards/rejected": -8.492840766906738, "step": 5151 }, { "epoch": 0.902755938094844, "grad_norm": 0.01207835040986538, "learning_rate": 1.0798381162619573e-07, "logits/chosen": -0.3065117299556732, "logits/rejected": -0.13630090653896332, "logps/chosen": -59.60470199584961, "logps/rejected": -98.72499084472656, "loss": 0.0196, "rewards/accuracies": 0.9933993816375732, "rewards/chosen": 1.5179524421691895, "rewards/margins": 10.089001655578613, "rewards/rejected": -8.571049690246582, "step": 5454 }, { "epoch": 0.9529090457667798, "grad_norm": 0.008624515496194363, "learning_rate": 5.224429727740986e-08, "logits/chosen": -0.298076331615448, "logits/rejected": -0.12163959443569183, "logps/chosen": -61.02180862426758, "logps/rejected": -98.43746185302734, "loss": 0.0201, "rewards/accuracies": 0.9942244291305542, "rewards/chosen": 1.5929877758026123, "rewards/margins": 10.118744850158691, "rewards/rejected": -8.525758743286133, "step": 5757 }, { "epoch": 0.999917239096251, "step": 6041, "total_flos": 3.6566231979141366e+18, "train_loss": 0.08035980544132106, "train_runtime": 90068.102, "train_samples_per_second": 0.537, "train_steps_per_second": 0.067 } ], "logging_steps": 303, "max_steps": 6041, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.6566231979141366e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }