{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999015651146766, "eval_steps": 500, "global_step": 5079, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05000492174426617, "grad_norm": 5.337889194488525, "learning_rate": 5e-07, "logits/chosen": -0.5339647531509399, "logits/rejected": -0.47227513790130615, "logps/chosen": -71.58138275146484, "logps/rejected": -13.533102989196777, "loss": 0.6865, "rewards/accuracies": 0.5871062874794006, "rewards/chosen": 0.012079809792339802, "rewards/margins": 0.013962473720312119, "rewards/rejected": -0.001882663695141673, "step": 254 }, { "epoch": 0.10000984348853234, "grad_norm": 5.7681660652160645, "learning_rate": 1e-06, "logits/chosen": -0.524512529373169, "logits/rejected": -0.470420241355896, "logps/chosen": -68.49125671386719, "logps/rejected": -14.996256828308105, "loss": 0.5103, "rewards/accuracies": 0.8449802994728088, "rewards/chosen": 0.4529332220554352, "rewards/margins": 0.5830409526824951, "rewards/rejected": -0.13010773062705994, "step": 508 }, { "epoch": 0.1500147652327985, "grad_norm": 1.7311837673187256, "learning_rate": 9.44432290527237e-07, "logits/chosen": -0.5406456589698792, "logits/rejected": -0.49012085795402527, "logps/chosen": -61.494964599609375, "logps/rejected": -28.769521713256836, "loss": 0.1423, "rewards/accuracies": 0.9788385629653931, "rewards/chosen": 1.3764965534210205, "rewards/margins": 2.9073357582092285, "rewards/rejected": -1.5308390855789185, "step": 762 }, { "epoch": 0.20001968697706468, "grad_norm": 4.213954925537109, "learning_rate": 8.888645810544738e-07, "logits/chosen": -0.5104743838310242, "logits/rejected": -0.43841081857681274, "logps/chosen": -56.98832702636719, "logps/rejected": -44.329437255859375, "loss": 0.0364, "rewards/accuracies": 0.9872047305107117, "rewards/chosen": 1.6219738721847534, "rewards/margins": 4.794076919555664, "rewards/rejected": -3.1721031665802, "step": 1016 }, { "epoch": 0.25002460872133087, "grad_norm": 0.14951969683170319, "learning_rate": 8.332968715817108e-07, "logits/chosen": -0.4608861804008484, "logits/rejected": -0.3742350935935974, "logps/chosen": -55.1967887878418, "logps/rejected": -51.54916000366211, "loss": 0.0258, "rewards/accuracies": 0.9886810779571533, "rewards/chosen": 1.6978679895401, "rewards/margins": 5.571296691894531, "rewards/rejected": -3.8734288215637207, "step": 1270 }, { "epoch": 0.300029530465597, "grad_norm": 0.11377181112766266, "learning_rate": 7.777291621089477e-07, "logits/chosen": -0.4032284915447235, "logits/rejected": -0.2789752185344696, "logps/chosen": -57.66849899291992, "logps/rejected": -57.404354095458984, "loss": 0.0181, "rewards/accuracies": 0.9936023354530334, "rewards/chosen": 1.815442681312561, "rewards/margins": 6.287877559661865, "rewards/rejected": -4.472434043884277, "step": 1524 }, { "epoch": 0.3500344522098632, "grad_norm": 61.85912322998047, "learning_rate": 7.221614526361847e-07, "logits/chosen": -0.39197683334350586, "logits/rejected": -0.2711484432220459, "logps/chosen": -56.51285934448242, "logps/rejected": -66.6546630859375, "loss": 0.0243, "rewards/accuracies": 0.9906495809555054, "rewards/chosen": 1.6953144073486328, "rewards/margins": 6.998918533325195, "rewards/rejected": -5.3036041259765625, "step": 1778 }, { "epoch": 0.40003937395412936, "grad_norm": 0.09933885931968689, "learning_rate": 6.665937431634215e-07, "logits/chosen": -0.35676872730255127, "logits/rejected": -0.21091562509536743, "logps/chosen": -58.933048248291016, "logps/rejected": -72.55093383789062, "loss": 0.022, "rewards/accuracies": 0.9906495809555054, "rewards/chosen": 1.709058165550232, "rewards/margins": 7.6529622077941895, "rewards/rejected": -5.943903923034668, "step": 2032 }, { "epoch": 0.4500442956983955, "grad_norm": 0.3078814446926117, "learning_rate": 6.110260336906585e-07, "logits/chosen": -0.3692930340766907, "logits/rejected": -0.21181651949882507, "logps/chosen": -56.33852767944336, "logps/rejected": -78.77378845214844, "loss": 0.0193, "rewards/accuracies": 0.9916338324546814, "rewards/chosen": 1.583296775817871, "rewards/margins": 8.18080997467041, "rewards/rejected": -6.597513675689697, "step": 2286 }, { "epoch": 0.5000492174426617, "grad_norm": 0.026140812784433365, "learning_rate": 5.554583242178954e-07, "logits/chosen": -0.36246979236602783, "logits/rejected": -0.21610520780086517, "logps/chosen": -56.37531280517578, "logps/rejected": -81.92805480957031, "loss": 0.0298, "rewards/accuracies": 0.9886810779571533, "rewards/chosen": 1.608428955078125, "rewards/margins": 8.545919418334961, "rewards/rejected": -6.9374895095825195, "step": 2540 }, { "epoch": 0.5500541391869278, "grad_norm": 0.0485980287194252, "learning_rate": 4.998906147451324e-07, "logits/chosen": -0.35004737973213196, "logits/rejected": -0.1877668797969818, "logps/chosen": -56.29869079589844, "logps/rejected": -84.21609497070312, "loss": 0.0229, "rewards/accuracies": 0.9901574850082397, "rewards/chosen": 1.6171692609786987, "rewards/margins": 8.883115768432617, "rewards/rejected": -7.265947341918945, "step": 2794 }, { "epoch": 0.600059060931194, "grad_norm": 0.13149231672286987, "learning_rate": 4.4432290527236927e-07, "logits/chosen": -0.3316061198711395, "logits/rejected": -0.17913725972175598, "logps/chosen": -57.6456413269043, "logps/rejected": -89.23247528076172, "loss": 0.0144, "rewards/accuracies": 0.9960629940032959, "rewards/chosen": 1.622791051864624, "rewards/margins": 9.201993942260742, "rewards/rejected": -7.579202651977539, "step": 3048 }, { "epoch": 0.6500639826754602, "grad_norm": 0.03225807845592499, "learning_rate": 3.887551957996062e-07, "logits/chosen": -0.3303147554397583, "logits/rejected": -0.16745421290397644, "logps/chosen": -57.618045806884766, "logps/rejected": -90.39539337158203, "loss": 0.0165, "rewards/accuracies": 0.9921259880065918, "rewards/chosen": 1.5876142978668213, "rewards/margins": 9.376455307006836, "rewards/rejected": -7.788840293884277, "step": 3302 }, { "epoch": 0.7000689044197264, "grad_norm": 0.024763241410255432, "learning_rate": 3.3318748632684314e-07, "logits/chosen": -0.3256986141204834, "logits/rejected": -0.1574079841375351, "logps/chosen": -59.10237121582031, "logps/rejected": -92.18179321289062, "loss": 0.0203, "rewards/accuracies": 0.9901574850082397, "rewards/chosen": 1.5978204011917114, "rewards/margins": 9.474949836730957, "rewards/rejected": -7.877129554748535, "step": 3556 }, { "epoch": 0.7500738261639925, "grad_norm": 0.18685077130794525, "learning_rate": 2.7761977685408005e-07, "logits/chosen": -0.31320706009864807, "logits/rejected": -0.14784303307533264, "logps/chosen": -58.15943908691406, "logps/rejected": -92.56378936767578, "loss": 0.0265, "rewards/accuracies": 0.9906495809555054, "rewards/chosen": 1.5292613506317139, "rewards/margins": 9.509092330932617, "rewards/rejected": -7.979831218719482, "step": 3810 }, { "epoch": 0.8000787479082587, "grad_norm": 0.01488853245973587, "learning_rate": 2.22052067381317e-07, "logits/chosen": -0.3247720003128052, "logits/rejected": -0.15560078620910645, "logps/chosen": -56.86127471923828, "logps/rejected": -93.76302337646484, "loss": 0.0189, "rewards/accuracies": 0.9931102395057678, "rewards/chosen": 1.586428165435791, "rewards/margins": 9.719764709472656, "rewards/rejected": -8.133337020874023, "step": 4064 }, { "epoch": 0.8500836696525249, "grad_norm": 4.113521575927734, "learning_rate": 1.6648435790855392e-07, "logits/chosen": -0.31977561116218567, "logits/rejected": -0.164890855550766, "logps/chosen": -56.98260498046875, "logps/rejected": -95.06165313720703, "loss": 0.0233, "rewards/accuracies": 0.9921259880065918, "rewards/chosen": 1.4957386255264282, "rewards/margins": 9.688507080078125, "rewards/rejected": -8.192767143249512, "step": 4318 }, { "epoch": 0.900088591396791, "grad_norm": 0.0642678439617157, "learning_rate": 1.1091664843579085e-07, "logits/chosen": -0.3185438811779022, "logits/rejected": -0.1604050248861313, "logps/chosen": -57.956459045410156, "logps/rejected": -96.31430053710938, "loss": 0.0202, "rewards/accuracies": 0.9916338324546814, "rewards/chosen": 1.4874851703643799, "rewards/margins": 9.749979019165039, "rewards/rejected": -8.262493133544922, "step": 4572 }, { "epoch": 0.9500935131410572, "grad_norm": 0.0038960117381066084, "learning_rate": 5.534893896302778e-08, "logits/chosen": -0.3140643537044525, "logits/rejected": -0.1565851867198944, "logps/chosen": -59.727909088134766, "logps/rejected": -95.10466003417969, "loss": 0.0173, "rewards/accuracies": 0.9936023354530334, "rewards/chosen": 1.604878544807434, "rewards/margins": 9.842850685119629, "rewards/rejected": -8.237971305847168, "step": 4826 }, { "epoch": 0.9999015651146766, "step": 5079, "total_flos": 3.074560994106409e+18, "train_loss": 0.08538520530960554, "train_runtime": 75713.4377, "train_samples_per_second": 0.537, "train_steps_per_second": 0.067 } ], "logging_steps": 254, "max_steps": 5079, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.074560994106409e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }