{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999219055056618, "eval_steps": 500, "global_step": 3201, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05029285435376806, "grad_norm": 3.709867000579834, "learning_rate": 5.015576323987538e-07, "logits/chosen": -0.5341636538505554, "logits/rejected": -0.4424578845500946, "logps/chosen": -72.9678726196289, "logps/rejected": -9.562939643859863, "loss": 0.6897, "rewards/accuracies": 0.5854037404060364, "rewards/chosen": 0.006333178840577602, "rewards/margins": 0.007243483327329159, "rewards/rejected": -0.0009103047195822, "step": 161 }, { "epoch": 0.10058570870753612, "grad_norm": 4.478637218475342, "learning_rate": 9.996527777777777e-07, "logits/chosen": -0.5466479063034058, "logits/rejected": -0.4609982669353485, "logps/chosen": -67.95721435546875, "logps/rejected": -10.801457405090332, "loss": 0.5858, "rewards/accuracies": 0.9448757767677307, "rewards/chosen": 0.21187880635261536, "rewards/margins": 0.2563822567462921, "rewards/rejected": -0.04450342431664467, "step": 322 }, { "epoch": 0.15087856306130418, "grad_norm": 3.006552219390869, "learning_rate": 9.4375e-07, "logits/chosen": -0.5257240533828735, "logits/rejected": -0.4737217128276825, "logps/chosen": -55.05992126464844, "logps/rejected": -17.820192337036133, "loss": 0.2339, "rewards/accuracies": 0.9798136949539185, "rewards/chosen": 1.0976394414901733, "rewards/margins": 1.828528642654419, "rewards/rejected": -0.7308891415596008, "step": 483 }, { "epoch": 0.20117141741507225, "grad_norm": 3.329214572906494, "learning_rate": 8.878472222222221e-07, "logits/chosen": -0.5365005135536194, "logits/rejected": -0.4758988916873932, "logps/chosen": -56.26280975341797, "logps/rejected": -29.085416793823242, "loss": 0.0705, "rewards/accuracies": 0.9852484464645386, "rewards/chosen": 1.5725164413452148, "rewards/margins": 3.528273344039917, "rewards/rejected": -1.9557571411132812, "step": 644 }, { "epoch": 0.2514642717688403, "grad_norm": 0.14665871858596802, "learning_rate": 8.319444444444444e-07, "logits/chosen": -0.48797789216041565, "logits/rejected": -0.402078241109848, "logps/chosen": -51.26579666137695, "logps/rejected": -40.8376350402832, "loss": 0.0447, "rewards/accuracies": 0.9790372848510742, "rewards/chosen": 1.6765810251235962, "rewards/margins": 4.7392683029174805, "rewards/rejected": -3.062687397003174, "step": 805 }, { "epoch": 0.30175712612260835, "grad_norm": 0.1438707709312439, "learning_rate": 7.760416666666666e-07, "logits/chosen": -0.48938173055648804, "logits/rejected": -0.3820492923259735, "logps/chosen": -50.945064544677734, "logps/rejected": -47.827144622802734, "loss": 0.0257, "rewards/accuracies": 0.989130437374115, "rewards/chosen": 1.6951720714569092, "rewards/margins": 5.385878562927246, "rewards/rejected": -3.6907060146331787, "step": 966 }, { "epoch": 0.3520499804763764, "grad_norm": 2.6210033893585205, "learning_rate": 7.201388888888889e-07, "logits/chosen": -0.47775155305862427, "logits/rejected": -0.3484514653682709, "logps/chosen": -51.115570068359375, "logps/rejected": -48.93259811401367, "loss": 0.0334, "rewards/accuracies": 0.9852484464645386, "rewards/chosen": 1.7170895338058472, "rewards/margins": 5.645313262939453, "rewards/rejected": -3.928223133087158, "step": 1127 }, { "epoch": 0.4023428348301445, "grad_norm": 4.137578964233398, "learning_rate": 6.642361111111111e-07, "logits/chosen": -0.418182373046875, "logits/rejected": -0.27966105937957764, "logps/chosen": -50.84815979003906, "logps/rejected": -52.41380310058594, "loss": 0.031, "rewards/accuracies": 0.9836956858634949, "rewards/chosen": 1.7334542274475098, "rewards/margins": 5.92018985748291, "rewards/rejected": -4.1867356300354, "step": 1288 }, { "epoch": 0.45263568918391256, "grad_norm": 0.07434514909982681, "learning_rate": 6.083333333333333e-07, "logits/chosen": -0.4359574019908905, "logits/rejected": -0.27854442596435547, "logps/chosen": -52.3484992980957, "logps/rejected": -55.29338073730469, "loss": 0.0264, "rewards/accuracies": 0.9883540272712708, "rewards/chosen": 1.8382827043533325, "rewards/margins": 6.365357398986816, "rewards/rejected": -4.527073860168457, "step": 1449 }, { "epoch": 0.5029285435376806, "grad_norm": 0.17919230461120605, "learning_rate": 5.524305555555555e-07, "logits/chosen": -0.4308469891548157, "logits/rejected": -0.27532947063446045, "logps/chosen": -52.922943115234375, "logps/rejected": -57.74534225463867, "loss": 0.0238, "rewards/accuracies": 0.9860248565673828, "rewards/chosen": 1.7562286853790283, "rewards/margins": 6.457979679107666, "rewards/rejected": -4.701751708984375, "step": 1610 }, { "epoch": 0.5532213978914486, "grad_norm": 0.026971790939569473, "learning_rate": 4.965277777777777e-07, "logits/chosen": -0.3874114155769348, "logits/rejected": -0.2123931497335434, "logps/chosen": -54.096187591552734, "logps/rejected": -60.510658264160156, "loss": 0.0233, "rewards/accuracies": 0.9860248565673828, "rewards/chosen": 1.8518177270889282, "rewards/margins": 6.871143341064453, "rewards/rejected": -5.019325256347656, "step": 1771 }, { "epoch": 0.6035142522452167, "grad_norm": 0.09704186022281647, "learning_rate": 4.4062499999999996e-07, "logits/chosen": -0.37683001160621643, "logits/rejected": -0.19427433609962463, "logps/chosen": -54.42588424682617, "logps/rejected": -61.70278549194336, "loss": 0.0221, "rewards/accuracies": 0.986801266670227, "rewards/chosen": 1.8429250717163086, "rewards/margins": 7.024946212768555, "rewards/rejected": -5.182021141052246, "step": 1932 }, { "epoch": 0.6538071065989848, "grad_norm": 0.023908786475658417, "learning_rate": 3.8472222222222225e-07, "logits/chosen": -0.4016348421573639, "logits/rejected": -0.23176224529743195, "logps/chosen": -51.2470588684082, "logps/rejected": -64.10649871826172, "loss": 0.0229, "rewards/accuracies": 0.9883540272712708, "rewards/chosen": 1.6802482604980469, "rewards/margins": 7.057994365692139, "rewards/rejected": -5.377746105194092, "step": 2093 }, { "epoch": 0.7040999609527528, "grad_norm": 33.38786697387695, "learning_rate": 3.2881944444444443e-07, "logits/chosen": -0.38065940141677856, "logits/rejected": -0.19005167484283447, "logps/chosen": -52.53501892089844, "logps/rejected": -65.53170013427734, "loss": 0.0286, "rewards/accuracies": 0.986801266670227, "rewards/chosen": 1.750819444656372, "rewards/margins": 7.3389058113098145, "rewards/rejected": -5.5880866050720215, "step": 2254 }, { "epoch": 0.7543928153065209, "grad_norm": 33.96625518798828, "learning_rate": 2.729166666666666e-07, "logits/chosen": -0.36966800689697266, "logits/rejected": -0.18849784135818481, "logps/chosen": -51.86404800415039, "logps/rejected": -67.312255859375, "loss": 0.0249, "rewards/accuracies": 0.9883540272712708, "rewards/chosen": 1.648945927619934, "rewards/margins": 7.407442092895508, "rewards/rejected": -5.758496284484863, "step": 2415 }, { "epoch": 0.804685669660289, "grad_norm": 0.029783952981233597, "learning_rate": 2.1701388888888887e-07, "logits/chosen": -0.3718484044075012, "logits/rejected": -0.18486632406711578, "logps/chosen": -54.5934944152832, "logps/rejected": -68.00869750976562, "loss": 0.0317, "rewards/accuracies": 0.9852484464645386, "rewards/chosen": 1.6592566967010498, "rewards/margins": 7.394908905029297, "rewards/rejected": -5.735651969909668, "step": 2576 }, { "epoch": 0.854978524014057, "grad_norm": 0.015878599137067795, "learning_rate": 1.611111111111111e-07, "logits/chosen": -0.37658101320266724, "logits/rejected": -0.2032197117805481, "logps/chosen": -51.02374267578125, "logps/rejected": -68.18423461914062, "loss": 0.0295, "rewards/accuracies": 0.9852484464645386, "rewards/chosen": 1.6066235303878784, "rewards/margins": 7.458284854888916, "rewards/rejected": -5.851661205291748, "step": 2737 }, { "epoch": 0.9052713783678251, "grad_norm": 0.03441372141242027, "learning_rate": 1.0520833333333333e-07, "logits/chosen": -0.3815793991088867, "logits/rejected": -0.1944185197353363, "logps/chosen": -51.63225555419922, "logps/rejected": -68.67254638671875, "loss": 0.0241, "rewards/accuracies": 0.989130437374115, "rewards/chosen": 1.6394439935684204, "rewards/margins": 7.522336959838867, "rewards/rejected": -5.882892608642578, "step": 2898 }, { "epoch": 0.9555642327215931, "grad_norm": 0.028507934883236885, "learning_rate": 4.9305555555555555e-08, "logits/chosen": -0.35245007276535034, "logits/rejected": -0.17326129972934723, "logps/chosen": -54.05128479003906, "logps/rejected": -69.85031127929688, "loss": 0.0227, "rewards/accuracies": 0.9906832575798035, "rewards/chosen": 1.6639856100082397, "rewards/margins": 7.611756801605225, "rewards/rejected": -5.947770595550537, "step": 3059 }, { "epoch": 0.9999219055056618, "step": 3201, "total_flos": 1.8190117587218596e+18, "train_loss": 0.10125412571545654, "train_runtime": 44496.0824, "train_samples_per_second": 0.576, "train_steps_per_second": 0.072 } ], "logging_steps": 161, "max_steps": 3201, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8190117587218596e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }