{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999067251189254, "eval_steps": 500, "global_step": 2680, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04999533625594627, "grad_norm": 5.705146789550781, "learning_rate": 5e-07, "logits/chosen": -0.5462052822113037, "logits/rejected": -0.46075063943862915, "logps/chosen": -69.6404800415039, "logps/rejected": -10.503483772277832, "loss": 0.69, "rewards/accuracies": 0.5541044473648071, "rewards/chosen": 0.005677139386534691, "rewards/margins": 0.006748478394001722, "rewards/rejected": -0.0010713385418057442, "step": 134 }, { "epoch": 0.09999067251189254, "grad_norm": 4.263132572174072, "learning_rate": 1e-06, "logits/chosen": -0.5453211665153503, "logits/rejected": -0.4581734836101532, "logps/chosen": -71.13265228271484, "logps/rejected": -11.624285697937012, "loss": 0.6205, "rewards/accuracies": 0.9160447716712952, "rewards/chosen": 0.14199481904506683, "rewards/margins": 0.16203062236309052, "rewards/rejected": -0.020035814493894577, "step": 268 }, { "epoch": 0.14998600876783882, "grad_norm": 1.5373331308364868, "learning_rate": 9.444444444444444e-07, "logits/chosen": -0.545798122882843, "logits/rejected": -0.4811278283596039, "logps/chosen": -61.10158920288086, "logps/rejected": -15.043923377990723, "loss": 0.3159, "rewards/accuracies": 0.9869402647018433, "rewards/chosen": 0.9139772653579712, "rewards/margins": 1.3145134449005127, "rewards/rejected": -0.4005362391471863, "step": 402 }, { "epoch": 0.1999813450237851, "grad_norm": 0.5485444068908691, "learning_rate": 8.888888888888888e-07, "logits/chosen": -0.5420589447021484, "logits/rejected": -0.49538397789001465, "logps/chosen": -52.57987976074219, "logps/rejected": -23.705102920532227, "loss": 0.1156, "rewards/accuracies": 0.9906716346740723, "rewards/chosen": 1.417620301246643, "rewards/margins": 2.7980282306671143, "rewards/rejected": -1.3804079294204712, "step": 536 }, { "epoch": 0.24997668127973136, "grad_norm": 0.19278554618358612, "learning_rate": 8.333333333333333e-07, "logits/chosen": -0.5059043765068054, "logits/rejected": -0.4366276264190674, "logps/chosen": -55.41277313232422, "logps/rejected": -34.412296295166016, "loss": 0.05, "rewards/accuracies": 0.9850745797157288, "rewards/chosen": 1.6597167253494263, "rewards/margins": 4.129410266876221, "rewards/rejected": -2.469693660736084, "step": 670 }, { "epoch": 0.29997201753567765, "grad_norm": 0.5303434729576111, "learning_rate": 7.777777777777778e-07, "logits/chosen": -0.4820927679538727, "logits/rejected": -0.3845590353012085, "logps/chosen": -52.056922912597656, "logps/rejected": -41.084842681884766, "loss": 0.0391, "rewards/accuracies": 0.9860074520111084, "rewards/chosen": 1.6465966701507568, "rewards/margins": 4.822764873504639, "rewards/rejected": -3.1761679649353027, "step": 804 }, { "epoch": 0.3499673537916239, "grad_norm": 0.06953659653663635, "learning_rate": 7.222222222222221e-07, "logits/chosen": -0.4638223350048065, "logits/rejected": -0.3506713807582855, "logps/chosen": -52.924964904785156, "logps/rejected": -46.31932830810547, "loss": 0.0416, "rewards/accuracies": 0.9850745797157288, "rewards/chosen": 1.6556929349899292, "rewards/margins": 5.256211757659912, "rewards/rejected": -3.6005189418792725, "step": 938 }, { "epoch": 0.3999626900475702, "grad_norm": 0.13936500251293182, "learning_rate": 6.666666666666666e-07, "logits/chosen": -0.4642048180103302, "logits/rejected": -0.3347207307815552, "logps/chosen": -51.326290130615234, "logps/rejected": -47.818946838378906, "loss": 0.03, "rewards/accuracies": 0.9860074520111084, "rewards/chosen": 1.7104003429412842, "rewards/margins": 5.57481575012207, "rewards/rejected": -3.864415407180786, "step": 1072 }, { "epoch": 0.44995802630351645, "grad_norm": 0.10513754934072495, "learning_rate": 6.111111111111112e-07, "logits/chosen": -0.4259939193725586, "logits/rejected": -0.29308220744132996, "logps/chosen": -52.058380126953125, "logps/rejected": -50.53205871582031, "loss": 0.0353, "rewards/accuracies": 0.9813432693481445, "rewards/chosen": 1.7411428689956665, "rewards/margins": 5.778336048126221, "rewards/rejected": -4.037193775177002, "step": 1206 }, { "epoch": 0.4999533625594627, "grad_norm": 15.070347785949707, "learning_rate": 5.555555555555555e-07, "logits/chosen": -0.4217334985733032, "logits/rejected": -0.2778339684009552, "logps/chosen": -49.258907318115234, "logps/rejected": -52.084754943847656, "loss": 0.0386, "rewards/accuracies": 0.9785447716712952, "rewards/chosen": 1.7135344743728638, "rewards/margins": 5.933449745178223, "rewards/rejected": -4.21991491317749, "step": 1340 }, { "epoch": 0.549948698815409, "grad_norm": 0.07054832577705383, "learning_rate": 5e-07, "logits/chosen": -0.41517916321754456, "logits/rejected": -0.25792089104652405, "logps/chosen": -51.234615325927734, "logps/rejected": -53.174564361572266, "loss": 0.0263, "rewards/accuracies": 0.9850745797157288, "rewards/chosen": 1.7893245220184326, "rewards/margins": 6.14929723739624, "rewards/rejected": -4.3599724769592285, "step": 1474 }, { "epoch": 0.5999440350713553, "grad_norm": 1.686726689338684, "learning_rate": 4.444444444444444e-07, "logits/chosen": -0.3858945369720459, "logits/rejected": -0.22172169387340546, "logps/chosen": -51.00803756713867, "logps/rejected": -54.77092361450195, "loss": 0.0446, "rewards/accuracies": 0.9776118993759155, "rewards/chosen": 1.7425472736358643, "rewards/margins": 6.227110385894775, "rewards/rejected": -4.48456335067749, "step": 1608 }, { "epoch": 0.6499393713273015, "grad_norm": 6.180748462677002, "learning_rate": 3.888888888888889e-07, "logits/chosen": -0.3795308768749237, "logits/rejected": -0.213688462972641, "logps/chosen": -50.311119079589844, "logps/rejected": -57.47030258178711, "loss": 0.0178, "rewards/accuracies": 0.9944029450416565, "rewards/chosen": 1.7370685338974, "rewards/margins": 6.562605381011963, "rewards/rejected": -4.825536727905273, "step": 1742 }, { "epoch": 0.6999347075832478, "grad_norm": 1.3855689764022827, "learning_rate": 3.333333333333333e-07, "logits/chosen": -0.37974509596824646, "logits/rejected": -0.1997498720884323, "logps/chosen": -54.77206802368164, "logps/rejected": -58.50139617919922, "loss": 0.0329, "rewards/accuracies": 0.983208954334259, "rewards/chosen": 1.8076502084732056, "rewards/margins": 6.584301948547363, "rewards/rejected": -4.776651859283447, "step": 1876 }, { "epoch": 0.7499300438391941, "grad_norm": 3.921687364578247, "learning_rate": 2.7777777777777776e-07, "logits/chosen": -0.37658217549324036, "logits/rejected": -0.20967237651348114, "logps/chosen": -52.74870681762695, "logps/rejected": -60.60834503173828, "loss": 0.0284, "rewards/accuracies": 0.9878731369972229, "rewards/chosen": 1.7630212306976318, "rewards/margins": 6.771360397338867, "rewards/rejected": -5.008338928222656, "step": 2010 }, { "epoch": 0.7999253800951404, "grad_norm": 0.07866105437278748, "learning_rate": 2.222222222222222e-07, "logits/chosen": -0.3648207485675812, "logits/rejected": -0.1908709853887558, "logps/chosen": -52.76331329345703, "logps/rejected": -61.42716598510742, "loss": 0.0236, "rewards/accuracies": 0.9897387623786926, "rewards/chosen": 1.758689522743225, "rewards/margins": 6.900312900543213, "rewards/rejected": -5.141623497009277, "step": 2144 }, { "epoch": 0.8499207163510867, "grad_norm": 0.038292620331048965, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -0.3632016181945801, "logits/rejected": -0.20296621322631836, "logps/chosen": -51.63236618041992, "logps/rejected": -64.2586898803711, "loss": 0.0385, "rewards/accuracies": 0.9822760820388794, "rewards/chosen": 1.6954851150512695, "rewards/margins": 6.922590732574463, "rewards/rejected": -5.227106094360352, "step": 2278 }, { "epoch": 0.8999160526070329, "grad_norm": 0.05809802561998367, "learning_rate": 1.111111111111111e-07, "logits/chosen": -0.3890366554260254, "logits/rejected": -0.21105322241783142, "logps/chosen": -52.29930114746094, "logps/rejected": -61.95232009887695, "loss": 0.0161, "rewards/accuracies": 0.9934701323509216, "rewards/chosen": 1.7759125232696533, "rewards/margins": 6.998130798339844, "rewards/rejected": -5.2222185134887695, "step": 2412 }, { "epoch": 0.9499113888629792, "grad_norm": 0.06595258414745331, "learning_rate": 5.555555555555555e-08, "logits/chosen": -0.37663742899894714, "logits/rejected": -0.19746284186840057, "logps/chosen": -52.22157669067383, "logps/rejected": -63.86149597167969, "loss": 0.0283, "rewards/accuracies": 0.9822760820388794, "rewards/chosen": 1.6980862617492676, "rewards/margins": 7.005319118499756, "rewards/rejected": -5.307232856750488, "step": 2546 }, { "epoch": 0.9999067251189254, "grad_norm": 0.06881717592477798, "learning_rate": 0.0, "logits/chosen": -0.35994064807891846, "logits/rejected": -0.19044946134090424, "logps/chosen": -53.963340759277344, "logps/rejected": -64.53665924072266, "loss": 0.0188, "rewards/accuracies": 0.9925373196601868, "rewards/chosen": 1.7544020414352417, "rewards/margins": 7.084873676300049, "rewards/rejected": -5.330471515655518, "step": 2680 }, { "epoch": 0.9999067251189254, "step": 2680, "total_flos": 1.5261823890782945e+18, "train_loss": 0.11258992188012422, "train_runtime": 37304.1181, "train_samples_per_second": 0.575, "train_steps_per_second": 0.072 } ], "logging_steps": 134, "max_steps": 2680, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5261823890782945e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }