{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984301412872841, "eval_steps": 100, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020931449502878076, "grad_norm": 4.875121866371553, "learning_rate": 4.166666666666666e-09, "logits/chosen": -2.238138437271118, "logits/rejected": -2.554456949234009, "logps/chosen": -443.7523193359375, "logps/rejected": -491.8927001953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.020931449502878074, "grad_norm": 5.553929970393955, "learning_rate": 4.166666666666667e-08, "logits/chosen": -2.4126930236816406, "logits/rejected": -2.5005030632019043, "logps/chosen": -418.43328857421875, "logps/rejected": -405.0360107421875, "loss": 0.6929, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0017023859545588493, "rewards/margins": 0.00048581857117824256, "rewards/rejected": 0.0012165673542767763, "step": 10 }, { "epoch": 0.04186289900575615, "grad_norm": 4.513029874801273, "learning_rate": 8.333333333333334e-08, "logits/chosen": -2.208683490753174, "logits/rejected": -2.485910415649414, "logps/chosen": -428.45208740234375, "logps/rejected": -408.13763427734375, "loss": 0.6933, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.0008482746779918671, "rewards/margins": -0.00037219192017801106, "rewards/rejected": 0.0012204666854813695, "step": 20 }, { "epoch": 0.06279434850863422, "grad_norm": 4.637552468831084, "learning_rate": 1.25e-07, "logits/chosen": -2.224863290786743, "logits/rejected": -2.4407901763916016, "logps/chosen": -398.6038818359375, "logps/rejected": -367.05999755859375, "loss": 0.6924, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.0041518621146678925, "rewards/margins": 0.0011339159682393074, "rewards/rejected": -0.005285778548568487, "step": 30 }, { "epoch": 0.0837257980115123, "grad_norm": 4.657136939144448, "learning_rate": 1.6666666666666668e-07, "logits/chosen": -2.3235936164855957, "logits/rejected": -2.4915928840637207, "logps/chosen": -372.97442626953125, "logps/rejected": -390.05841064453125, "loss": 0.6899, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.019573217257857323, "rewards/margins": 0.007190874312072992, "rewards/rejected": -0.026764091104269028, "step": 40 }, { "epoch": 0.10465724751439037, "grad_norm": 4.947790369246717, "learning_rate": 1.9998927475076103e-07, "logits/chosen": -2.1541531085968018, "logits/rejected": -2.355862855911255, "logps/chosen": -408.7329406738281, "logps/rejected": -406.50347900390625, "loss": 0.6855, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.04146841913461685, "rewards/margins": 0.02013658545911312, "rewards/rejected": -0.061604999005794525, "step": 50 }, { "epoch": 0.12558869701726844, "grad_norm": 6.135445605235113, "learning_rate": 1.9961413253717213e-07, "logits/chosen": -2.120229482650757, "logits/rejected": -2.287370204925537, "logps/chosen": -376.740234375, "logps/rejected": -386.8778381347656, "loss": 0.678, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.08536554872989655, "rewards/margins": 0.03690432757139206, "rewards/rejected": -0.12226986885070801, "step": 60 }, { "epoch": 0.14652014652014653, "grad_norm": 5.2300665585071835, "learning_rate": 1.9870502626379125e-07, "logits/chosen": -2.208547830581665, "logits/rejected": -2.316659927368164, "logps/chosen": -425.2916564941406, "logps/rejected": -429.31463623046875, "loss": 0.6673, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.14128030836582184, "rewards/margins": 0.05471445247530937, "rewards/rejected": -0.1959947645664215, "step": 70 }, { "epoch": 0.1674515960230246, "grad_norm": 6.361729619349137, "learning_rate": 1.9726682903510838e-07, "logits/chosen": -1.8886642456054688, "logits/rejected": -2.2390127182006836, "logps/chosen": -470.6441955566406, "logps/rejected": -419.4126892089844, "loss": 0.6583, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2689892053604126, "rewards/margins": 0.07578183710575104, "rewards/rejected": -0.34477105736732483, "step": 80 }, { "epoch": 0.18838304552590268, "grad_norm": 7.250967252041406, "learning_rate": 1.9530725005474194e-07, "logits/chosen": -2.3355867862701416, "logits/rejected": -2.404792070388794, "logps/chosen": -411.76806640625, "logps/rejected": -441.7333068847656, "loss": 0.6355, "rewards/accuracies": 0.625, "rewards/chosen": -0.42172950506210327, "rewards/margins": 0.12971071898937225, "rewards/rejected": -0.5514402985572815, "step": 90 }, { "epoch": 0.20931449502878074, "grad_norm": 7.1454110672964335, "learning_rate": 1.9283679330160724e-07, "logits/chosen": -2.2639448642730713, "logits/rejected": -2.5537800788879395, "logps/chosen": -477.0587463378906, "logps/rejected": -489.705810546875, "loss": 0.6351, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6349204778671265, "rewards/margins": 0.18245458602905273, "rewards/rejected": -0.8173751831054688, "step": 100 }, { "epoch": 0.20931449502878074, "eval_logits/chosen": -2.2922377586364746, "eval_logits/rejected": -2.4565351009368896, "eval_logps/chosen": -472.2982177734375, "eval_logps/rejected": -487.7696533203125, "eval_loss": 0.6359348893165588, "eval_rewards/accuracies": 0.6746031641960144, "eval_rewards/chosen": -0.675361156463623, "eval_rewards/margins": 0.2425757199525833, "eval_rewards/rejected": -0.9179368615150452, "eval_runtime": 88.9262, "eval_samples_per_second": 22.491, "eval_steps_per_second": 0.708, "step": 100 }, { "epoch": 0.2302459445316588, "grad_norm": 9.360622478279684, "learning_rate": 1.898687012251826e-07, "logits/chosen": -2.217447280883789, "logits/rejected": -2.3863320350646973, "logps/chosen": -481.96990966796875, "logps/rejected": -499.48345947265625, "loss": 0.6311, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.7452836036682129, "rewards/margins": 0.209157794713974, "rewards/rejected": -0.9544414281845093, "step": 110 }, { "epoch": 0.25117739403453687, "grad_norm": 7.953755036427896, "learning_rate": 1.8641888376168482e-07, "logits/chosen": -2.2092318534851074, "logits/rejected": -2.2929816246032715, "logps/chosen": -454.405517578125, "logps/rejected": -497.1351623535156, "loss": 0.6209, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.7448408007621765, "rewards/margins": 0.29463425278663635, "rewards/rejected": -1.0394752025604248, "step": 120 }, { "epoch": 0.272108843537415, "grad_norm": 8.821105331401093, "learning_rate": 1.8250583305165094e-07, "logits/chosen": -2.2061495780944824, "logits/rejected": -2.3711869716644287, "logps/chosen": -472.7056579589844, "logps/rejected": -487.33880615234375, "loss": 0.6204, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.7287603616714478, "rewards/margins": 0.2083979845046997, "rewards/rejected": -0.9371584057807922, "step": 130 }, { "epoch": 0.29304029304029305, "grad_norm": 9.167325969849378, "learning_rate": 1.78150524316067e-07, "logits/chosen": -2.2468433380126953, "logits/rejected": -2.466036319732666, "logps/chosen": -501.697021484375, "logps/rejected": -497.5772399902344, "loss": 0.6195, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7497612237930298, "rewards/margins": 0.30543631315231323, "rewards/rejected": -1.0551974773406982, "step": 140 }, { "epoch": 0.3139717425431711, "grad_norm": 10.828055616866019, "learning_rate": 1.7337630342238038e-07, "logits/chosen": -2.163837432861328, "logits/rejected": -2.328864574432373, "logps/chosen": -474.3462829589844, "logps/rejected": -480.0904846191406, "loss": 0.621, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7979869246482849, "rewards/margins": 0.22926858067512512, "rewards/rejected": -1.0272555351257324, "step": 150 }, { "epoch": 0.3349031920460492, "grad_norm": 9.907119624068729, "learning_rate": 1.682087617430782e-07, "logits/chosen": -2.1256282329559326, "logits/rejected": -2.4207208156585693, "logps/chosen": -476.00933837890625, "logps/rejected": -491.25799560546875, "loss": 0.6148, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.8471584320068359, "rewards/margins": 0.2906045913696289, "rewards/rejected": -1.1377630233764648, "step": 160 }, { "epoch": 0.35583464154892724, "grad_norm": 10.130673374633192, "learning_rate": 1.6267559897763025e-07, "logits/chosen": -2.240748405456543, "logits/rejected": -2.3730461597442627, "logps/chosen": -466.5884704589844, "logps/rejected": -470.2240295410156, "loss": 0.6136, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8948806524276733, "rewards/margins": 0.24292059242725372, "rewards/rejected": -1.137801170349121, "step": 170 }, { "epoch": 0.37676609105180536, "grad_norm": 12.664244024162585, "learning_rate": 1.5680647467311557e-07, "logits/chosen": -2.3886361122131348, "logits/rejected": -2.48551344871521, "logps/chosen": -466.68115234375, "logps/rejected": -481.260498046875, "loss": 0.589, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8065212965011597, "rewards/margins": 0.28530603647232056, "rewards/rejected": -1.091827392578125, "step": 180 }, { "epoch": 0.3976975405546834, "grad_norm": 15.413041204374277, "learning_rate": 1.506328492394303e-07, "logits/chosen": -2.425926685333252, "logits/rejected": -2.436657190322876, "logps/chosen": -480.2686462402344, "logps/rejected": -514.1541137695312, "loss": 0.6247, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.0268957614898682, "rewards/margins": 0.26106053590774536, "rewards/rejected": -1.2879562377929688, "step": 190 }, { "epoch": 0.4186289900575615, "grad_norm": 16.30024056431674, "learning_rate": 1.4418781531128634e-07, "logits/chosen": -2.3286993503570557, "logits/rejected": -2.387202024459839, "logps/chosen": -454.547119140625, "logps/rejected": -511.773681640625, "loss": 0.6101, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8713696599006653, "rewards/margins": 0.2568342685699463, "rewards/rejected": -1.1282037496566772, "step": 200 }, { "epoch": 0.4186289900575615, "eval_logits/chosen": -2.293304443359375, "eval_logits/rejected": -2.447746753692627, "eval_logps/chosen": -484.72442626953125, "eval_logps/rejected": -515.6393432617188, "eval_loss": 0.5989560484886169, "eval_rewards/accuracies": 0.7142857313156128, "eval_rewards/chosen": -0.7996230125427246, "eval_rewards/margins": 0.39701077342033386, "eval_rewards/rejected": -1.1966338157653809, "eval_runtime": 88.7991, "eval_samples_per_second": 22.523, "eval_steps_per_second": 0.709, "step": 200 }, { "epoch": 0.43956043956043955, "grad_norm": 12.590959189684769, "learning_rate": 1.375059203609562e-07, "logits/chosen": -2.251105785369873, "logits/rejected": -2.49545955657959, "logps/chosen": -514.7989501953125, "logps/rejected": -508.8777770996094, "loss": 0.6036, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9383622407913208, "rewards/margins": 0.3089975416660309, "rewards/rejected": -1.2473597526550293, "step": 210 }, { "epoch": 0.4604918890633176, "grad_norm": 32.27211919256004, "learning_rate": 1.306229815126159e-07, "logits/chosen": -2.374002456665039, "logits/rejected": -2.5104002952575684, "logps/chosen": -453.17889404296875, "logps/rejected": -502.31829833984375, "loss": 0.5905, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0016330480575562, "rewards/margins": 0.3531147539615631, "rewards/rejected": -1.3547478914260864, "step": 220 }, { "epoch": 0.48142333856619574, "grad_norm": 11.074374701972996, "learning_rate": 1.2357589355094274e-07, "logits/chosen": -2.240893602371216, "logits/rejected": -2.4365756511688232, "logps/chosen": -464.9483947753906, "logps/rejected": -497.55950927734375, "loss": 0.6032, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8673335909843445, "rewards/margins": 0.4288042187690735, "rewards/rejected": -1.2961379289627075, "step": 230 }, { "epoch": 0.5023547880690737, "grad_norm": 13.608161796310325, "learning_rate": 1.1640243115310217e-07, "logits/chosen": -2.263231039047241, "logits/rejected": -2.374429225921631, "logps/chosen": -483.5979919433594, "logps/rejected": -511.7247009277344, "loss": 0.5829, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8133866190910339, "rewards/margins": 0.35704511404037476, "rewards/rejected": -1.1704318523406982, "step": 240 }, { "epoch": 0.5232862375719518, "grad_norm": 14.904992006409358, "learning_rate": 1.0914104640422679e-07, "logits/chosen": -2.312152862548828, "logits/rejected": -2.504575490951538, "logps/chosen": -487.4195861816406, "logps/rejected": -509.62213134765625, "loss": 0.5914, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9289507865905762, "rewards/margins": 0.4651150703430176, "rewards/rejected": -1.3940656185150146, "step": 250 }, { "epoch": 0.54421768707483, "grad_norm": 32.859126344847056, "learning_rate": 1.0183066268176774e-07, "logits/chosen": -2.452216863632202, "logits/rejected": -2.5787224769592285, "logps/chosen": -454.101318359375, "logps/rejected": -491.07708740234375, "loss": 0.5958, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.8231406211853027, "rewards/margins": 0.37211668491363525, "rewards/rejected": -1.1952574253082275, "step": 260 }, { "epoch": 0.565149136577708, "grad_norm": 16.410575278967542, "learning_rate": 9.451046601356724e-08, "logits/chosen": -2.4211385250091553, "logits/rejected": -2.5718777179718018, "logps/chosen": -482.42889404296875, "logps/rejected": -517.08447265625, "loss": 0.5968, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.7412260174751282, "rewards/margins": 0.46059027314186096, "rewards/rejected": -1.201816439628601, "step": 270 }, { "epoch": 0.5860805860805861, "grad_norm": 14.64481409505789, "learning_rate": 8.721969502803953e-08, "logits/chosen": -2.414080858230591, "logits/rejected": -2.641306161880493, "logps/chosen": -471.8504943847656, "logps/rejected": -492.3824157714844, "loss": 0.6088, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9498642086982727, "rewards/margins": 0.3709770143032074, "rewards/rejected": -1.3208411931991577, "step": 280 }, { "epoch": 0.6070120355834642, "grad_norm": 21.87484189841818, "learning_rate": 7.999743062239557e-08, "logits/chosen": -2.5216970443725586, "logits/rejected": -2.5266430377960205, "logps/chosen": -452.1351623535156, "logps/rejected": -507.50408935546875, "loss": 0.5975, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9256707429885864, "rewards/margins": 0.38739797472953796, "rewards/rejected": -1.3130687475204468, "step": 290 }, { "epoch": 0.6279434850863422, "grad_norm": 13.23460942074812, "learning_rate": 7.28823864763583e-08, "logits/chosen": -2.3628604412078857, "logits/rejected": -2.5071964263916016, "logps/chosen": -530.2737426757812, "logps/rejected": -534.9356689453125, "loss": 0.5738, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9033306241035461, "rewards/margins": 0.409872442483902, "rewards/rejected": -1.313202977180481, "step": 300 }, { "epoch": 0.6279434850863422, "eval_logits/chosen": -2.3505780696868896, "eval_logits/rejected": -2.500311851501465, "eval_logps/chosen": -511.9820861816406, "eval_logps/rejected": -562.04541015625, "eval_loss": 0.5819065570831299, "eval_rewards/accuracies": 0.7142857313156128, "eval_rewards/chosen": -1.0721999406814575, "eval_rewards/margins": 0.5884942412376404, "eval_rewards/rejected": -1.6606942415237427, "eval_runtime": 88.8035, "eval_samples_per_second": 22.522, "eval_steps_per_second": 0.709, "step": 300 }, { "epoch": 0.6488749345892203, "grad_norm": 23.240653261962176, "learning_rate": 6.591270153428288e-08, "logits/chosen": -2.3066353797912598, "logits/rejected": -2.4188685417175293, "logps/chosen": -530.1605224609375, "logps/rejected": -555.5882568359375, "loss": 0.5816, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0851608514785767, "rewards/margins": 0.6294665932655334, "rewards/rejected": -1.7146275043487549, "step": 310 }, { "epoch": 0.6698063840920984, "grad_norm": 12.35925417664361, "learning_rate": 5.912573556804452e-08, "logits/chosen": -2.4511845111846924, "logits/rejected": -2.5960700511932373, "logps/chosen": -462.8910217285156, "logps/rejected": -492.77459716796875, "loss": 0.5721, "rewards/accuracies": 0.75, "rewards/chosen": -0.9141901135444641, "rewards/margins": 0.49542441964149475, "rewards/rejected": -1.4096145629882812, "step": 320 }, { "epoch": 0.6907378335949764, "grad_norm": 19.635922794228048, "learning_rate": 5.255786891654399e-08, "logits/chosen": -2.2881722450256348, "logits/rejected": -2.3245983123779297, "logps/chosen": -490.61956787109375, "logps/rejected": -528.5936279296875, "loss": 0.5831, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0118191242218018, "rewards/margins": 0.3562072217464447, "rewards/rejected": -1.3680263757705688, "step": 330 }, { "epoch": 0.7116692830978545, "grad_norm": 34.0341920873177, "learning_rate": 4.624430747529102e-08, "logits/chosen": -2.2541534900665283, "logits/rejected": -2.3677923679351807, "logps/chosen": -520.711181640625, "logps/rejected": -555.8665771484375, "loss": 0.5771, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1455854177474976, "rewards/margins": 0.44834479689598083, "rewards/rejected": -1.5939301252365112, "step": 340 }, { "epoch": 0.7326007326007326, "grad_norm": 20.184086200131315, "learning_rate": 4.0218893981385925e-08, "logits/chosen": -2.336240291595459, "logits/rejected": -2.5228190422058105, "logps/chosen": -490.032470703125, "logps/rejected": -514.3966064453125, "loss": 0.5772, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1221544742584229, "rewards/margins": 0.41546517610549927, "rewards/rejected": -1.5376195907592773, "step": 350 }, { "epoch": 0.7535321821036107, "grad_norm": 14.840705395348046, "learning_rate": 3.45139266054715e-08, "logits/chosen": -2.3588707447052, "logits/rejected": -2.5286855697631836, "logps/chosen": -525.8394775390625, "logps/rejected": -543.2139892578125, "loss": 0.5961, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9700316190719604, "rewards/margins": 0.42892080545425415, "rewards/rejected": -1.3989523649215698, "step": 360 }, { "epoch": 0.7744636316064888, "grad_norm": 12.56992511385935, "learning_rate": 2.9159985823062993e-08, "logits/chosen": -2.4362387657165527, "logits/rejected": -2.588212251663208, "logps/chosen": -469.63018798828125, "logps/rejected": -491.34185791015625, "loss": 0.5787, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9046362638473511, "rewards/margins": 0.42833614349365234, "rewards/rejected": -1.332972526550293, "step": 370 }, { "epoch": 0.7953950811093669, "grad_norm": 14.216122099186137, "learning_rate": 2.4185770493280577e-08, "logits/chosen": -2.4785826206207275, "logits/rejected": -2.5475876331329346, "logps/chosen": -463.3335876464844, "logps/rejected": -562.8516235351562, "loss": 0.5816, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0568846464157104, "rewards/margins": 0.6403349041938782, "rewards/rejected": -1.6972196102142334, "step": 380 }, { "epoch": 0.8163265306122449, "grad_norm": 17.166403382209694, "learning_rate": 1.9617944023656108e-08, "logits/chosen": -2.3412299156188965, "logits/rejected": -2.431159257888794, "logps/chosen": -569.6896362304688, "logps/rejected": -604.4752197265625, "loss": 0.5647, "rewards/accuracies": 0.71875, "rewards/chosen": -1.135259985923767, "rewards/margins": 0.5612015724182129, "rewards/rejected": -1.6964616775512695, "step": 390 }, { "epoch": 0.837257980115123, "grad_norm": 25.5326876410102, "learning_rate": 1.5480991445620538e-08, "logits/chosen": -2.438910961151123, "logits/rejected": -2.621582269668579, "logps/chosen": -477.71551513671875, "logps/rejected": -516.8345336914062, "loss": 0.5808, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0198707580566406, "rewards/margins": 0.485908567905426, "rewards/rejected": -1.5057791471481323, "step": 400 }, { "epoch": 0.837257980115123, "eval_logits/chosen": -2.4454309940338135, "eval_logits/rejected": -2.60603404045105, "eval_logps/chosen": -509.0269470214844, "eval_logps/rejected": -557.9309692382812, "eval_loss": 0.5776250958442688, "eval_rewards/accuracies": 0.7063491940498352, "eval_rewards/chosen": -1.042648196220398, "eval_rewards/margins": 0.5769018530845642, "eval_rewards/rejected": -1.619550108909607, "eval_runtime": 88.8844, "eval_samples_per_second": 22.501, "eval_steps_per_second": 0.709, "step": 400 }, { "epoch": 0.858189429618001, "grad_norm": 12.2363803809367, "learning_rate": 1.1797088166794e-08, "logits/chosen": -2.327822208404541, "logits/rejected": -2.539658308029175, "logps/chosen": -523.35693359375, "logps/rejected": -556.1873168945312, "loss": 0.5837, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0230482816696167, "rewards/margins": 0.5963117480278015, "rewards/rejected": -1.6193599700927734, "step": 410 }, { "epoch": 0.8791208791208791, "grad_norm": 17.1630701293683, "learning_rate": 8.585981103608341e-09, "logits/chosen": -2.3502843379974365, "logits/rejected": -2.5074477195739746, "logps/chosen": -481.4237365722656, "logps/rejected": -559.5806884765625, "loss": 0.567, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0329768657684326, "rewards/margins": 0.5681900978088379, "rewards/rejected": -1.6011669635772705, "step": 420 }, { "epoch": 0.9000523286237572, "grad_norm": 16.184790708379772, "learning_rate": 5.864882831430273e-09, "logits/chosen": -2.352280378341675, "logits/rejected": -2.436026096343994, "logps/chosen": -513.5238647460938, "logps/rejected": -551.8958129882812, "loss": 0.5755, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0582252740859985, "rewards/margins": 0.5332168340682983, "rewards/rejected": -1.5914418697357178, "step": 430 }, { "epoch": 0.9209837781266352, "grad_norm": 17.526839475687186, "learning_rate": 3.6483793195745682e-09, "logits/chosen": -2.3311455249786377, "logits/rejected": -2.440988063812256, "logps/chosen": -482.4281311035156, "logps/rejected": -498.60345458984375, "loss": 0.5787, "rewards/accuracies": 0.625, "rewards/chosen": -1.0323375463485718, "rewards/margins": 0.4054194390773773, "rewards/rejected": -1.4377570152282715, "step": 440 }, { "epoch": 0.9419152276295133, "grad_norm": 14.705602904039639, "learning_rate": 1.9483517457776433e-09, "logits/chosen": -2.2350025177001953, "logits/rejected": -2.3830924034118652, "logps/chosen": -490.513427734375, "logps/rejected": -551.2727661132812, "loss": 0.579, "rewards/accuracies": 0.75, "rewards/chosen": -1.0369895696640015, "rewards/margins": 0.5606644153594971, "rewards/rejected": -1.597654104232788, "step": 450 }, { "epoch": 0.9628466771323915, "grad_norm": 15.228089724513376, "learning_rate": 7.739128092312918e-10, "logits/chosen": -2.281054973602295, "logits/rejected": -2.4768524169921875, "logps/chosen": -496.84814453125, "logps/rejected": -510.46258544921875, "loss": 0.579, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0984748601913452, "rewards/margins": 0.47915878891944885, "rewards/rejected": -1.5776336193084717, "step": 460 }, { "epoch": 0.9837781266352695, "grad_norm": 17.607957497609636, "learning_rate": 1.313578835593465e-10, "logits/chosen": -2.3311634063720703, "logits/rejected": -2.4415996074676514, "logps/chosen": -519.3492431640625, "logps/rejected": -541.9041137695312, "loss": 0.5694, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0364539623260498, "rewards/margins": 0.33034905791282654, "rewards/rejected": -1.3668031692504883, "step": 470 }, { "epoch": 0.9984301412872841, "step": 477, "total_flos": 0.0, "train_loss": 0.6095632167232361, "train_runtime": 6900.3625, "train_samples_per_second": 8.86, "train_steps_per_second": 0.069 } ], "logging_steps": 10, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }