{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3616146092302129, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007232292184604258, "grad_norm": 0.24258430302143097, "learning_rate": 4.977500000000001e-06, "log_odds_chosen": 0.024490734562277794, "log_odds_ratio": -0.8170725703239441, "logits/chosen": -1.157371997833252, "logits/rejected": -1.3074114322662354, "logps/chosen": -5.802087306976318, "logps/rejected": -5.824903964996338, "loss": 5.930429458618164, "nll_loss": 5.848721981048584, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.5802086591720581, "rewards/margins": 0.002281700726598501, "rewards/rejected": -0.5824903845787048, "step": 10 }, { "epoch": 0.014464584369208515, "grad_norm": 0.2757987976074219, "learning_rate": 4.9525000000000004e-06, "log_odds_chosen": 0.07715226709842682, "log_odds_ratio": -0.824821949005127, "logits/chosen": -1.2339580059051514, "logits/rejected": -1.3473726511001587, "logps/chosen": -5.762179851531982, "logps/rejected": -5.835549354553223, "loss": 5.804796600341797, "nll_loss": 5.722315311431885, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": -0.5762180089950562, "rewards/margins": 0.007336919195950031, "rewards/rejected": -0.5835549235343933, "step": 20 }, { "epoch": 0.021696876553812774, "grad_norm": 0.335886150598526, "learning_rate": 4.927500000000001e-06, "log_odds_chosen": 0.1334013044834137, "log_odds_ratio": -0.7585476636886597, "logits/chosen": -1.2495059967041016, "logits/rejected": -1.377275824546814, "logps/chosen": -5.738375663757324, "logps/rejected": -5.869871616363525, "loss": 5.848030853271484, "nll_loss": 5.772176265716553, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.5738375782966614, "rewards/margins": 0.013149544596672058, "rewards/rejected": -0.5869871377944946, "step": 30 }, { "epoch": 0.02892916873841703, "grad_norm": 0.3094758987426758, "learning_rate": 4.902500000000001e-06, "log_odds_chosen": NaN, "log_odds_ratio": NaN, "logits/chosen": -1.2641137838363647, "logits/rejected": -1.3740476369857788, "logps/chosen": -5.545978546142578, "logps/rejected": NaN, "loss": 6.1603240966796875, "nll_loss": 5.585268974304199, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": -0.5545979738235474, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 40 }, { "epoch": 0.03616146092302129, "grad_norm": 0.32100710272789, "learning_rate": 4.8775e-06, "log_odds_chosen": NaN, "log_odds_ratio": NaN, "logits/chosen": -1.30973219871521, "logits/rejected": -1.4038114547729492, "logps/chosen": NaN, "logps/rejected": -5.679649829864502, "loss": 5.871247863769531, "nll_loss": 5.540013790130615, "rewards/accuracies": 0.578125, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": -0.567965030670166, "step": 50 }, { "epoch": 0.04339375310762555, "grad_norm": 0.37317097187042236, "learning_rate": 4.8525000000000006e-06, "log_odds_chosen": 0.08813583850860596, "log_odds_ratio": -0.7889271974563599, "logits/chosen": -1.2617380619049072, "logits/rejected": -1.3692435026168823, "logps/chosen": -5.557856559753418, "logps/rejected": -5.643843173980713, "loss": 5.632916259765625, "nll_loss": 5.554023742675781, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.5557857155799866, "rewards/margins": 0.008598615415394306, "rewards/rejected": -0.5643843412399292, "step": 60 }, { "epoch": 0.050626045292229804, "grad_norm": 0.2950762212276459, "learning_rate": 4.827500000000001e-06, "log_odds_chosen": 0.12244565784931183, "log_odds_ratio": -0.7549425959587097, "logits/chosen": -1.312510371208191, "logits/rejected": -1.41781485080719, "logps/chosen": -5.558300018310547, "logps/rejected": -5.677781581878662, "loss": 5.611359786987305, "nll_loss": 5.535863876342773, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": -0.5558300018310547, "rewards/margins": 0.01194816268980503, "rewards/rejected": -0.5677782297134399, "step": 70 }, { "epoch": 0.05785833747683406, "grad_norm": 0.3010028600692749, "learning_rate": 4.8025e-06, "log_odds_chosen": 0.17995290458202362, "log_odds_ratio": -0.7352453470230103, "logits/chosen": -1.3281229734420776, "logits/rejected": -1.4346697330474854, "logps/chosen": -5.496776580810547, "logps/rejected": -5.675050735473633, "loss": 5.57300033569336, "nll_loss": 5.499476432800293, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.5496777296066284, "rewards/margins": 0.017827384173870087, "rewards/rejected": -0.5675050616264343, "step": 80 }, { "epoch": 0.06509062966143832, "grad_norm": 0.2564737796783447, "learning_rate": 4.7775e-06, "log_odds_chosen": 0.08067800104618073, "log_odds_ratio": -0.757804274559021, "logits/chosen": -1.2539831399917603, "logits/rejected": -1.3801463842391968, "logps/chosen": -5.533335208892822, "logps/rejected": -5.611950874328613, "loss": 5.607465744018555, "nll_loss": 5.531683921813965, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.5533335208892822, "rewards/margins": 0.007861590944230556, "rewards/rejected": -0.5611951351165771, "step": 90 }, { "epoch": 0.07232292184604258, "grad_norm": 0.2745928168296814, "learning_rate": 4.752500000000001e-06, "log_odds_chosen": NaN, "log_odds_ratio": NaN, "logits/chosen": -1.1503195762634277, "logits/rejected": -1.3092117309570312, "logps/chosen": -5.674561500549316, "logps/rejected": NaN, "loss": 5.882052612304688, "nll_loss": 5.633638381958008, "rewards/accuracies": 0.546875, "rewards/chosen": -0.5674561262130737, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 100 }, { "epoch": 0.07955521403064683, "grad_norm": 0.22206123173236847, "learning_rate": 4.7275e-06, "log_odds_chosen": NaN, "log_odds_ratio": NaN, "logits/chosen": -1.3089696168899536, "logits/rejected": -1.4873392581939697, "logps/chosen": -5.392054557800293, "logps/rejected": NaN, "loss": 5.727831649780273, "nll_loss": 5.414012908935547, "rewards/accuracies": 0.515625, "rewards/chosen": -0.5392054319381714, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 110 }, { "epoch": 0.0867875062152511, "grad_norm": 0.19849246740341187, "learning_rate": 4.7025e-06, "log_odds_chosen": 0.09243413805961609, "log_odds_ratio": -0.7682880759239197, "logits/chosen": -1.3715227842330933, "logits/rejected": -1.4874814748764038, "logps/chosen": -5.332463264465332, "logps/rejected": -5.421862602233887, "loss": 5.385763931274414, "nll_loss": 5.308935642242432, "rewards/accuracies": 0.5, "rewards/chosen": -0.5332463383674622, "rewards/margins": 0.00893993116915226, "rewards/rejected": -0.5421862602233887, "step": 120 }, { "epoch": 0.09401979839985536, "grad_norm": 0.20989899337291718, "learning_rate": 4.6775000000000005e-06, "log_odds_chosen": 0.07261505722999573, "log_odds_ratio": -0.7669461965560913, "logits/chosen": -1.2831826210021973, "logits/rejected": -1.3855401277542114, "logps/chosen": -5.570550441741943, "logps/rejected": -5.639805793762207, "loss": 5.608541107177734, "nll_loss": 5.531846046447754, "rewards/accuracies": 0.534375011920929, "rewards/chosen": -0.5570551156997681, "rewards/margins": 0.006925526075065136, "rewards/rejected": -0.5639805793762207, "step": 130 }, { "epoch": 0.10125209058445961, "grad_norm": 0.24246586859226227, "learning_rate": 4.652500000000001e-06, "log_odds_chosen": 0.13198330998420715, "log_odds_ratio": -0.7554014921188354, "logits/chosen": -1.3736729621887207, "logits/rejected": -1.4753162860870361, "logps/chosen": -5.387119770050049, "logps/rejected": -5.51568078994751, "loss": 5.4450115203857425, "nll_loss": 5.369471549987793, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.5387119650840759, "rewards/margins": 0.01285608857870102, "rewards/rejected": -0.5515680909156799, "step": 140 }, { "epoch": 0.10848438276906387, "grad_norm": 0.22028212249279022, "learning_rate": 4.6275e-06, "log_odds_chosen": 0.1945369690656662, "log_odds_ratio": -0.7150241732597351, "logits/chosen": -1.222019076347351, "logits/rejected": -1.3116670846939087, "logps/chosen": -5.559757232666016, "logps/rejected": -5.74977970123291, "loss": 5.611997985839844, "nll_loss": 5.540493965148926, "rewards/accuracies": 0.559374988079071, "rewards/chosen": -0.5559757351875305, "rewards/margins": 0.01900230534374714, "rewards/rejected": -0.5749779939651489, "step": 150 }, { "epoch": 0.11571667495366812, "grad_norm": 0.24762941896915436, "learning_rate": 4.6025e-06, "log_odds_chosen": 0.12068144977092743, "log_odds_ratio": -0.7480632066726685, "logits/chosen": -1.2661784887313843, "logits/rejected": -1.3999755382537842, "logps/chosen": -5.444934844970703, "logps/rejected": -5.563178062438965, "loss": 5.528120040893555, "nll_loss": 5.453313827514648, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": -0.544493556022644, "rewards/margins": 0.011824256740510464, "rewards/rejected": -0.5563178062438965, "step": 160 }, { "epoch": 0.12294896713827239, "grad_norm": 0.24410110712051392, "learning_rate": 4.577500000000001e-06, "log_odds_chosen": 0.07302852720022202, "log_odds_ratio": -0.7880190014839172, "logits/chosen": -1.2591315507888794, "logits/rejected": -1.379748821258545, "logps/chosen": -5.474093437194824, "logps/rejected": -5.544507026672363, "loss": 5.533624267578125, "nll_loss": 5.454824447631836, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.5474093556404114, "rewards/margins": 0.00704141054302454, "rewards/rejected": -0.5544507503509521, "step": 170 }, { "epoch": 0.13018125932287664, "grad_norm": 0.238921120762825, "learning_rate": 4.5525e-06, "log_odds_chosen": NaN, "log_odds_ratio": NaN, "logits/chosen": -1.2266502380371094, "logits/rejected": -1.3530925512313843, "logps/chosen": -5.5178141593933105, "logps/rejected": NaN, "loss": 5.79667854309082, "nll_loss": 5.490872859954834, "rewards/accuracies": 0.546875, "rewards/chosen": -0.551781415939331, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 180 }, { "epoch": 0.1374135515074809, "grad_norm": 0.24503076076507568, "learning_rate": 4.5275e-06, "log_odds_chosen": 0.03844783455133438, "log_odds_ratio": -0.7719030976295471, "logits/chosen": -1.2923024892807007, "logits/rejected": -1.4297457933425903, "logps/chosen": -5.426344871520996, "logps/rejected": -5.463438987731934, "loss": 5.509374618530273, "nll_loss": 5.432183265686035, "rewards/accuracies": 0.4468750059604645, "rewards/chosen": -0.5426343679428101, "rewards/margins": 0.003709450364112854, "rewards/rejected": -0.5463439226150513, "step": 190 }, { "epoch": 0.14464584369208516, "grad_norm": 0.2474403828382492, "learning_rate": 4.5025000000000005e-06, "log_odds_chosen": 0.15352819859981537, "log_odds_ratio": -0.7516843676567078, "logits/chosen": -1.238797664642334, "logits/rejected": -1.3516855239868164, "logps/chosen": -5.482898235321045, "logps/rejected": -5.631514549255371, "loss": 5.482235336303711, "nll_loss": 5.407067775726318, "rewards/accuracies": 0.578125, "rewards/chosen": -0.5482897758483887, "rewards/margins": 0.01486161071807146, "rewards/rejected": -0.563151478767395, "step": 200 }, { "epoch": 0.1518781358766894, "grad_norm": 0.24647395312786102, "learning_rate": 4.4775e-06, "log_odds_chosen": 0.06791242212057114, "log_odds_ratio": -0.7611157894134521, "logits/chosen": -1.2040866613388062, "logits/rejected": -1.325791835784912, "logps/chosen": -5.527557373046875, "logps/rejected": -5.593737602233887, "loss": 5.5501853942871096, "nll_loss": 5.4740729331970215, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.5527557134628296, "rewards/margins": 0.00661806296557188, "rewards/rejected": -0.5593737959861755, "step": 210 }, { "epoch": 0.15911042806129366, "grad_norm": 0.3191209137439728, "learning_rate": 4.4525e-06, "log_odds_chosen": 0.11287051439285278, "log_odds_ratio": -0.744554877281189, "logits/chosen": -1.2233508825302124, "logits/rejected": -1.334123134613037, "logps/chosen": -5.541935920715332, "logps/rejected": -5.6515703201293945, "loss": 5.557551574707031, "nll_loss": 5.483095169067383, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.5541935563087463, "rewards/margins": 0.010963483713567257, "rewards/rejected": -0.5651570558547974, "step": 220 }, { "epoch": 0.16634272024589794, "grad_norm": 0.3177518844604492, "learning_rate": 4.4275e-06, "log_odds_chosen": NaN, "log_odds_ratio": NaN, "logits/chosen": -1.2862989902496338, "logits/rejected": -1.396356225013733, "logps/chosen": -5.400467872619629, "logps/rejected": NaN, "loss": 5.670994567871094, "nll_loss": 5.330209255218506, "rewards/accuracies": 0.503125011920929, "rewards/chosen": -0.5400468111038208, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 230 }, { "epoch": 0.1735750124305022, "grad_norm": 0.2698371410369873, "learning_rate": 4.4025e-06, "log_odds_chosen": 0.10866693407297134, "log_odds_ratio": -0.7732769250869751, "logits/chosen": -1.2277987003326416, "logits/rejected": -1.3508949279785156, "logps/chosen": -5.505708694458008, "logps/rejected": -5.6127214431762695, "loss": 5.515193176269531, "nll_loss": 5.437865257263184, "rewards/accuracies": 0.534375011920929, "rewards/chosen": -0.5505709052085876, "rewards/margins": 0.010701271705329418, "rewards/rejected": -0.5612722039222717, "step": 240 }, { "epoch": 0.18080730461510644, "grad_norm": 0.2867446839809418, "learning_rate": 4.3775e-06, "log_odds_chosen": NaN, "log_odds_ratio": NaN, "logits/chosen": -1.2118570804595947, "logits/rejected": -1.3736122846603394, "logps/chosen": NaN, "logps/rejected": -5.480542182922363, "loss": 5.575833892822265, "nll_loss": 5.324864387512207, "rewards/accuracies": 0.528124988079071, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": -0.5480541586875916, "step": 250 }, { "epoch": 0.18803959679971072, "grad_norm": 0.31495073437690735, "learning_rate": 4.3525e-06, "log_odds_chosen": 0.008091190829873085, "log_odds_ratio": -0.8035072088241577, "logits/chosen": -1.214800477027893, "logits/rejected": -1.3568775653839111, "logps/chosen": -5.552915096282959, "logps/rejected": -5.560267925262451, "loss": 5.492018127441407, "nll_loss": 5.411666393280029, "rewards/accuracies": 0.5, "rewards/chosen": -0.5552915334701538, "rewards/margins": 0.00073523836908862, "rewards/rejected": -0.5560267567634583, "step": 260 }, { "epoch": 0.19527188898431497, "grad_norm": 0.3158721327781677, "learning_rate": 4.3275000000000005e-06, "log_odds_chosen": 0.12867891788482666, "log_odds_ratio": -0.7282706499099731, "logits/chosen": -1.1992931365966797, "logits/rejected": -1.3094508647918701, "logps/chosen": -5.397171497344971, "logps/rejected": -5.522560119628906, "loss": 5.501744079589844, "nll_loss": 5.42891788482666, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.5397171378135681, "rewards/margins": 0.012538868002593517, "rewards/rejected": -0.5522559881210327, "step": 270 }, { "epoch": 0.20250418116891922, "grad_norm": NaN, "learning_rate": 4.302500000000001e-06, "log_odds_chosen": NaN, "log_odds_ratio": NaN, "logits/chosen": -1.1986209154129028, "logits/rejected": -1.325928807258606, "logps/chosen": -5.435647964477539, "logps/rejected": NaN, "loss": 5.748141479492188, "nll_loss": 5.416440010070801, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.5435648560523987, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 280 }, { "epoch": 0.2097364733535235, "grad_norm": 0.3696766197681427, "learning_rate": 4.2775e-06, "log_odds_chosen": -0.056973300874233246, "log_odds_ratio": -0.8312661051750183, "logits/chosen": -1.1497722864151, "logits/rejected": -1.2519116401672363, "logps/chosen": -5.443070888519287, "logps/rejected": -5.3841233253479, "loss": 5.432368469238281, "nll_loss": 5.349241733551025, "rewards/accuracies": 0.5, "rewards/chosen": -0.5443071126937866, "rewards/margins": -0.005894799251109362, "rewards/rejected": -0.5384122729301453, "step": 290 }, { "epoch": 0.21696876553812774, "grad_norm": 0.4062056839466095, "learning_rate": 4.2525e-06, "log_odds_chosen": 0.06658594310283661, "log_odds_ratio": -0.7710601687431335, "logits/chosen": -1.152179479598999, "logits/rejected": -1.2460296154022217, "logps/chosen": -5.450706958770752, "logps/rejected": -5.514741897583008, "loss": 5.461360168457031, "nll_loss": 5.384252548217773, "rewards/accuracies": 0.484375, "rewards/chosen": -0.5450707077980042, "rewards/margins": 0.006403499282896519, "rewards/rejected": -0.5514742136001587, "step": 300 }, { "epoch": 0.224201057722732, "grad_norm": 0.35252928733825684, "learning_rate": 4.227500000000001e-06, "log_odds_chosen": NaN, "log_odds_ratio": NaN, "logits/chosen": -1.11289381980896, "logits/rejected": -1.1899579763412476, "logps/chosen": NaN, "logps/rejected": -5.594590187072754, "loss": 5.755791473388672, "nll_loss": 5.4381585121154785, "rewards/accuracies": 0.503125011920929, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": -0.5594589710235596, "step": 310 }, { "epoch": 0.23143334990733624, "grad_norm": 0.32109105587005615, "learning_rate": 4.202500000000001e-06, "log_odds_chosen": NaN, "log_odds_ratio": NaN, "logits/chosen": -1.1014893054962158, "logits/rejected": -1.2668919563293457, "logps/chosen": NaN, "logps/rejected": -5.533962249755859, "loss": 5.670769500732422, "nll_loss": 5.3555803298950195, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": -0.5533961653709412, "step": 320 }, { "epoch": 0.23866564209194052, "grad_norm": 0.3987061083316803, "learning_rate": 4.1775e-06, "log_odds_chosen": 0.09470056742429733, "log_odds_ratio": -0.7665299773216248, "logits/chosen": -1.0869171619415283, "logits/rejected": -1.1906062364578247, "logps/chosen": -5.4382758140563965, "logps/rejected": -5.529966831207275, "loss": 5.484853363037109, "nll_loss": 5.408199787139893, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.5438276529312134, "rewards/margins": 0.009169066324830055, "rewards/rejected": -0.5529965758323669, "step": 330 }, { "epoch": 0.24589793427654477, "grad_norm": 0.3762986958026886, "learning_rate": 4.1525000000000005e-06, "log_odds_chosen": NaN, "log_odds_ratio": NaN, "logits/chosen": -1.0889309644699097, "logits/rejected": -1.210901141166687, "logps/chosen": NaN, "logps/rejected": -5.512875556945801, "loss": 5.707857513427735, "nll_loss": 5.374106407165527, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": -0.5512875318527222, "step": 340 }, { "epoch": 0.253130226461149, "grad_norm": 0.3539881706237793, "learning_rate": 4.127500000000001e-06, "log_odds_chosen": 0.05990752577781677, "log_odds_ratio": -0.7520009279251099, "logits/chosen": -1.1000460386276245, "logits/rejected": -1.1998263597488403, "logps/chosen": -5.449780464172363, "logps/rejected": -5.5063323974609375, "loss": 5.4952552795410154, "nll_loss": 5.420053958892822, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.5449780225753784, "rewards/margins": 0.005655230488628149, "rewards/rejected": -0.5506333112716675, "step": 350 }, { "epoch": 0.26036251864575327, "grad_norm": 0.4491257071495056, "learning_rate": 4.1025e-06, "log_odds_chosen": 0.0024302334059029818, "log_odds_ratio": -0.803369402885437, "logits/chosen": -1.1746547222137451, "logits/rejected": -1.2685177326202393, "logps/chosen": -5.358044624328613, "logps/rejected": -5.358481407165527, "loss": 5.37115707397461, "nll_loss": 5.290821075439453, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": -0.5358044505119324, "rewards/margins": 4.368703957879916e-05, "rewards/rejected": -0.5358482003211975, "step": 360 }, { "epoch": 0.2675948108303575, "grad_norm": 0.4225010275840759, "learning_rate": 4.0775e-06, "log_odds_chosen": NaN, "log_odds_ratio": NaN, "logits/chosen": -1.1588351726531982, "logits/rejected": -1.2908666133880615, "logps/chosen": -5.33625602722168, "logps/rejected": NaN, "loss": 5.6588897705078125, "nll_loss": 5.325121879577637, "rewards/accuracies": 0.534375011920929, "rewards/chosen": -0.533625602722168, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 370 }, { "epoch": 0.2748271030149618, "grad_norm": 0.39807629585266113, "learning_rate": 4.052500000000001e-06, "log_odds_chosen": 0.10123734176158905, "log_odds_ratio": -0.7437968850135803, "logits/chosen": -1.1793785095214844, "logits/rejected": -1.3256503343582153, "logps/chosen": -5.213901996612549, "logps/rejected": -5.312169551849365, "loss": 5.233790588378906, "nll_loss": 5.159411430358887, "rewards/accuracies": 0.53125, "rewards/chosen": -0.5213901996612549, "rewards/margins": 0.009826736524701118, "rewards/rejected": -0.5312169790267944, "step": 380 }, { "epoch": 0.2820593951995661, "grad_norm": 0.4834080934524536, "learning_rate": 4.0275e-06, "log_odds_chosen": 0.07742507755756378, "log_odds_ratio": -0.7695199847221375, "logits/chosen": -1.0857242345809937, "logits/rejected": -1.2077770233154297, "logps/chosen": -5.406026363372803, "logps/rejected": -5.479708671569824, "loss": 5.404788970947266, "nll_loss": 5.3278374671936035, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": -0.5406026840209961, "rewards/margins": 0.007368179503828287, "rewards/rejected": -0.5479708313941956, "step": 390 }, { "epoch": 0.2892916873841703, "grad_norm": 0.4307977259159088, "learning_rate": 4.0025e-06, "log_odds_chosen": 0.13691337406635284, "log_odds_ratio": -0.7147995233535767, "logits/chosen": -1.1390448808670044, "logits/rejected": -1.2907403707504272, "logps/chosen": -5.265523433685303, "logps/rejected": -5.39796257019043, "loss": 5.341230010986328, "nll_loss": 5.269749641418457, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.5265523791313171, "rewards/margins": 0.013243894092738628, "rewards/rejected": -0.5397962331771851, "step": 400 }, { "epoch": 0.2965239795687746, "grad_norm": 0.40860849618911743, "learning_rate": 3.9775000000000005e-06, "log_odds_chosen": NaN, "log_odds_ratio": NaN, "logits/chosen": -1.1594798564910889, "logits/rejected": -1.288938283920288, "logps/chosen": -5.337071895599365, "logps/rejected": NaN, "loss": 5.633375549316407, "nll_loss": 5.293572425842285, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.5337072014808655, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 410 }, { "epoch": 0.3037562717533788, "grad_norm": 0.5024765729904175, "learning_rate": 3.9525e-06, "log_odds_chosen": 0.14180947840213776, "log_odds_ratio": -0.716079831123352, "logits/chosen": -1.05372154712677, "logits/rejected": -1.1723723411560059, "logps/chosen": -5.377732753753662, "logps/rejected": -5.514806747436523, "loss": 5.390876770019531, "nll_loss": 5.319269180297852, "rewards/accuracies": 0.534375011920929, "rewards/chosen": -0.5377733707427979, "rewards/margins": 0.013707393780350685, "rewards/rejected": -0.5514807105064392, "step": 420 }, { "epoch": 0.3109885639379831, "grad_norm": 0.43874257802963257, "learning_rate": 3.9275e-06, "log_odds_chosen": 0.041213370859622955, "log_odds_ratio": -0.7626296281814575, "logits/chosen": -1.0877110958099365, "logits/rejected": -1.2165416479110718, "logps/chosen": -5.347620010375977, "logps/rejected": -5.385829925537109, "loss": 5.355945587158203, "nll_loss": 5.279682636260986, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": -0.5347620248794556, "rewards/margins": 0.0038209576159715652, "rewards/rejected": -0.538582980632782, "step": 430 }, { "epoch": 0.3182208561225873, "grad_norm": 0.4173244535923004, "learning_rate": 3.9025e-06, "log_odds_chosen": 0.11863790452480316, "log_odds_ratio": -0.7459925413131714, "logits/chosen": -1.1519792079925537, "logits/rejected": -1.3020139932632446, "logps/chosen": -5.2671380043029785, "logps/rejected": -5.38327693939209, "loss": 5.298666381835938, "nll_loss": 5.224067211151123, "rewards/accuracies": 0.578125, "rewards/chosen": -0.5267137885093689, "rewards/margins": 0.011613896116614342, "rewards/rejected": -0.538327693939209, "step": 440 }, { "epoch": 0.32545314830719163, "grad_norm": 0.4220745861530304, "learning_rate": 3.8775000000000006e-06, "log_odds_chosen": NaN, "log_odds_ratio": NaN, "logits/chosen": -1.0332152843475342, "logits/rejected": -1.1924374103546143, "logps/chosen": NaN, "logps/rejected": -5.533570766448975, "loss": 5.6558387756347654, "nll_loss": 5.354229927062988, "rewards/accuracies": 0.528124988079071, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": -0.5533571243286133, "step": 450 }, { "epoch": 0.3326854404917959, "grad_norm": 0.4184921085834503, "learning_rate": 3.8525e-06, "log_odds_chosen": 0.0849083662033081, "log_odds_ratio": -0.7574716210365295, "logits/chosen": -1.0905861854553223, "logits/rejected": -1.18590247631073, "logps/chosen": -5.311153888702393, "logps/rejected": -5.390518665313721, "loss": 5.285702133178711, "nll_loss": 5.209954261779785, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": -0.5311154127120972, "rewards/margins": 0.007936512120068073, "rewards/rejected": -0.53905189037323, "step": 460 }, { "epoch": 0.33991773267640013, "grad_norm": 0.4340634047985077, "learning_rate": 3.8275e-06, "log_odds_chosen": 0.12307295948266983, "log_odds_ratio": -0.7435885667800903, "logits/chosen": -1.1616504192352295, "logits/rejected": -1.275611162185669, "logps/chosen": -5.210690498352051, "logps/rejected": -5.329777717590332, "loss": 5.172726058959961, "nll_loss": 5.098366737365723, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.5210691094398499, "rewards/margins": 0.011908676475286484, "rewards/rejected": -0.5329777598381042, "step": 470 }, { "epoch": 0.3471500248610044, "grad_norm": 0.5348623394966125, "learning_rate": 3.8025e-06, "log_odds_chosen": 0.10718987882137299, "log_odds_ratio": -0.7553779482841492, "logits/chosen": -1.069937825202942, "logits/rejected": -1.2259876728057861, "logps/chosen": -5.3289594650268555, "logps/rejected": -5.430902004241943, "loss": 5.337881469726563, "nll_loss": 5.262343406677246, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.5328959822654724, "rewards/margins": 0.010194242931902409, "rewards/rejected": -0.5430902242660522, "step": 480 }, { "epoch": 0.35438231704560863, "grad_norm": 0.5375458002090454, "learning_rate": 3.7775000000000003e-06, "log_odds_chosen": -0.0630793422460556, "log_odds_ratio": -0.8402652740478516, "logits/chosen": -1.0562111139297485, "logits/rejected": -1.1850430965423584, "logps/chosen": -5.353094100952148, "logps/rejected": -5.288957595825195, "loss": 5.370440673828125, "nll_loss": 5.286414623260498, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5353094339370728, "rewards/margins": -0.006413729395717382, "rewards/rejected": -0.5288957357406616, "step": 490 }, { "epoch": 0.3616146092302129, "grad_norm": 0.5694707036018372, "learning_rate": 3.7525e-06, "log_odds_chosen": 0.1951710283756256, "log_odds_ratio": -0.7369016408920288, "logits/chosen": -1.1101913452148438, "logits/rejected": -1.2623517513275146, "logps/chosen": -5.364739418029785, "logps/rejected": -5.554258823394775, "loss": 5.355496597290039, "nll_loss": 5.281806945800781, "rewards/accuracies": 0.546875, "rewards/chosen": -0.5364739298820496, "rewards/margins": 0.018951958045363426, "rewards/rejected": -0.5554260015487671, "step": 500 } ], "logging_steps": 10, "max_steps": 2000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }