davidanugraha's picture
Upload folder using huggingface_hub
8cebc90 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 610,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009842116054951815,
"grad_norm": 2.7811431884765625,
"learning_rate": 3.278688524590164e-08,
"logits/chosen": -2.533964157104492,
"logits/rejected": -2.5735201835632324,
"logps/chosen": -0.4404516816139221,
"logps/rejected": -0.44272005558013916,
"loss": 1.3164,
"rewards/accuracies": 0.4895833432674408,
"rewards/chosen": -0.8809033632278442,
"rewards/margins": 0.00453670509159565,
"rewards/rejected": -0.8854400515556335,
"step": 3
},
{
"epoch": 0.01968423210990363,
"grad_norm": 3.193603992462158,
"learning_rate": 8.196721311475409e-08,
"logits/chosen": -2.5366404056549072,
"logits/rejected": -2.580451250076294,
"logps/chosen": -0.4687342345714569,
"logps/rejected": -0.4700230658054352,
"loss": 1.319,
"rewards/accuracies": 0.515625,
"rewards/chosen": -0.9374685287475586,
"rewards/margins": 0.002577731851488352,
"rewards/rejected": -0.9400461316108704,
"step": 6
},
{
"epoch": 0.029526348164855443,
"grad_norm": 2.600137233734131,
"learning_rate": 1.3114754098360656e-07,
"logits/chosen": -2.457185983657837,
"logits/rejected": -2.439577579498291,
"logps/chosen": -0.4295300841331482,
"logps/rejected": -0.44163596630096436,
"loss": 1.3021,
"rewards/accuracies": 0.5260416865348816,
"rewards/chosen": -0.8590601682662964,
"rewards/margins": 0.024211766198277473,
"rewards/rejected": -0.8832719326019287,
"step": 9
},
{
"epoch": 0.03936846421980726,
"grad_norm": 3.330892324447632,
"learning_rate": 1.80327868852459e-07,
"logits/chosen": -2.6398239135742188,
"logits/rejected": -2.584653615951538,
"logps/chosen": -0.4457446336746216,
"logps/rejected": -0.4714476466178894,
"loss": 1.283,
"rewards/accuracies": 0.5520833134651184,
"rewards/chosen": -0.8914893269538879,
"rewards/margins": 0.05140605568885803,
"rewards/rejected": -0.9428953528404236,
"step": 12
},
{
"epoch": 0.04921058027475907,
"grad_norm": 3.791318416595459,
"learning_rate": 2.2950819672131146e-07,
"logits/chosen": -2.695323944091797,
"logits/rejected": -2.656032085418701,
"logps/chosen": -0.44835442304611206,
"logps/rejected": -0.45536863803863525,
"loss": 1.3095,
"rewards/accuracies": 0.484375,
"rewards/chosen": -0.8967088460922241,
"rewards/margins": 0.0140285175293684,
"rewards/rejected": -0.9107372760772705,
"step": 15
},
{
"epoch": 0.059052696329710885,
"grad_norm": 3.3407955169677734,
"learning_rate": 2.786885245901639e-07,
"logits/chosen": -2.5617423057556152,
"logits/rejected": -2.535289764404297,
"logps/chosen": -0.4351993799209595,
"logps/rejected": -0.4376446008682251,
"loss": 1.3161,
"rewards/accuracies": 0.4791666865348816,
"rewards/chosen": -0.870398759841919,
"rewards/margins": 0.004890482872724533,
"rewards/rejected": -0.8752892017364502,
"step": 18
},
{
"epoch": 0.0688948123846627,
"grad_norm": 2.501546621322632,
"learning_rate": 3.2786885245901637e-07,
"logits/chosen": -2.4582276344299316,
"logits/rejected": -2.462825059890747,
"logps/chosen": -0.4415985345840454,
"logps/rejected": -0.45250964164733887,
"loss": 1.3035,
"rewards/accuracies": 0.546875,
"rewards/chosen": -0.8831970691680908,
"rewards/margins": 0.021822253242135048,
"rewards/rejected": -0.9050193428993225,
"step": 21
},
{
"epoch": 0.07873692843961452,
"grad_norm": 3.018726348876953,
"learning_rate": 3.770491803278688e-07,
"logits/chosen": -2.4941272735595703,
"logits/rejected": -2.504913568496704,
"logps/chosen": -0.4402480721473694,
"logps/rejected": -0.43463078141212463,
"loss": 1.3275,
"rewards/accuracies": 0.463541716337204,
"rewards/chosen": -0.8804961442947388,
"rewards/margins": -0.011234622448682785,
"rewards/rejected": -0.8692615032196045,
"step": 24
},
{
"epoch": 0.08857904449456633,
"grad_norm": 2.763169527053833,
"learning_rate": 4.2622950819672127e-07,
"logits/chosen": -2.6146433353424072,
"logits/rejected": -2.6000146865844727,
"logps/chosen": -0.4407660961151123,
"logps/rejected": -0.44831183552742004,
"loss": 1.3084,
"rewards/accuracies": 0.5104166865348816,
"rewards/chosen": -0.8815321922302246,
"rewards/margins": 0.015091471374034882,
"rewards/rejected": -0.8966236114501953,
"step": 27
},
{
"epoch": 0.09842116054951815,
"grad_norm": 2.839801073074341,
"learning_rate": 4.754098360655737e-07,
"logits/chosen": -2.450800895690918,
"logits/rejected": -2.3901357650756836,
"logps/chosen": -0.4461948275566101,
"logps/rejected": -0.4516681730747223,
"loss": 1.3124,
"rewards/accuracies": 0.5052083134651184,
"rewards/chosen": -0.8923896551132202,
"rewards/margins": 0.010946739464998245,
"rewards/rejected": -0.9033364057540894,
"step": 30
},
{
"epoch": 0.10826327660446997,
"grad_norm": 3.3853325843811035,
"learning_rate": 5.245901639344262e-07,
"logits/chosen": -2.3206844329833984,
"logits/rejected": -2.391359329223633,
"logps/chosen": -0.4440017640590668,
"logps/rejected": -0.43253856897354126,
"loss": 1.3369,
"rewards/accuracies": 0.453125,
"rewards/chosen": -0.8880034685134888,
"rewards/margins": -0.022926393896341324,
"rewards/rejected": -0.8650770783424377,
"step": 33
},
{
"epoch": 0.11810539265942177,
"grad_norm": 3.0044500827789307,
"learning_rate": 5.737704918032786e-07,
"logits/chosen": -2.3636105060577393,
"logits/rejected": -2.3779592514038086,
"logps/chosen": -0.44306662678718567,
"logps/rejected": -0.439554363489151,
"loss": 1.3247,
"rewards/accuracies": 0.4739583432674408,
"rewards/chosen": -0.8861331939697266,
"rewards/margins": -0.007024487480521202,
"rewards/rejected": -0.879108726978302,
"step": 36
},
{
"epoch": 0.1279475087143736,
"grad_norm": 2.8635122776031494,
"learning_rate": 6.229508196721311e-07,
"logits/chosen": -2.499943733215332,
"logits/rejected": -2.465728282928467,
"logps/chosen": -0.44928643107414246,
"logps/rejected": -0.457317590713501,
"loss": 1.3083,
"rewards/accuracies": 0.5364583730697632,
"rewards/chosen": -0.8985728621482849,
"rewards/margins": 0.016062280163168907,
"rewards/rejected": -0.914635181427002,
"step": 39
},
{
"epoch": 0.1377896247693254,
"grad_norm": 2.9235432147979736,
"learning_rate": 6.721311475409835e-07,
"logits/chosen": -2.313934326171875,
"logits/rejected": -2.4113597869873047,
"logps/chosen": -0.43200892210006714,
"logps/rejected": -0.4239245653152466,
"loss": 1.3305,
"rewards/accuracies": 0.4322916865348816,
"rewards/chosen": -0.8640178442001343,
"rewards/margins": -0.01616874523460865,
"rewards/rejected": -0.8478491306304932,
"step": 42
},
{
"epoch": 0.1476317408242772,
"grad_norm": 2.583570957183838,
"learning_rate": 7.21311475409836e-07,
"logits/chosen": -2.4851250648498535,
"logits/rejected": -2.4660589694976807,
"logps/chosen": -0.42572450637817383,
"logps/rejected": -0.4340115785598755,
"loss": 1.3069,
"rewards/accuracies": 0.5052083730697632,
"rewards/chosen": -0.8514490723609924,
"rewards/margins": 0.016574125736951828,
"rewards/rejected": -0.868023157119751,
"step": 45
},
{
"epoch": 0.15747385687922905,
"grad_norm": 2.7927021980285645,
"learning_rate": 7.704918032786884e-07,
"logits/chosen": -2.5673110485076904,
"logits/rejected": -2.5053553581237793,
"logps/chosen": -0.42316704988479614,
"logps/rejected": -0.41728323698043823,
"loss": 1.3275,
"rewards/accuracies": 0.4427083432674408,
"rewards/chosen": -0.8463341593742371,
"rewards/margins": -0.011767696589231491,
"rewards/rejected": -0.8345664739608765,
"step": 48
},
{
"epoch": 0.16731597293418085,
"grad_norm": 2.707350730895996,
"learning_rate": 8.196721311475409e-07,
"logits/chosen": -2.493770122528076,
"logits/rejected": -2.497191905975342,
"logps/chosen": -0.413898766040802,
"logps/rejected": -0.42288342118263245,
"loss": 1.3051,
"rewards/accuracies": 0.5260416865348816,
"rewards/chosen": -0.8277975916862488,
"rewards/margins": 0.01796923577785492,
"rewards/rejected": -0.8457668423652649,
"step": 51
},
{
"epoch": 0.17715808898913266,
"grad_norm": 2.8383336067199707,
"learning_rate": 8.688524590163933e-07,
"logits/chosen": -2.3650898933410645,
"logits/rejected": -2.4355216026306152,
"logps/chosen": -0.41062265634536743,
"logps/rejected": -0.40795671939849854,
"loss": 1.3217,
"rewards/accuracies": 0.4739583432674408,
"rewards/chosen": -0.8212453126907349,
"rewards/margins": -0.005331846419721842,
"rewards/rejected": -0.8159134387969971,
"step": 54
},
{
"epoch": 0.1870002050440845,
"grad_norm": 3.264463424682617,
"learning_rate": 9.180327868852458e-07,
"logits/chosen": -2.3384056091308594,
"logits/rejected": -2.3722102642059326,
"logps/chosen": -0.415290892124176,
"logps/rejected": -0.4106101989746094,
"loss": 1.3245,
"rewards/accuracies": 0.4687500298023224,
"rewards/chosen": -0.830581784248352,
"rewards/margins": -0.009361350908875465,
"rewards/rejected": -0.8212203979492188,
"step": 57
},
{
"epoch": 0.1968423210990363,
"grad_norm": 2.702704429626465,
"learning_rate": 9.672131147540984e-07,
"logits/chosen": -2.6796839237213135,
"logits/rejected": -2.70339298248291,
"logps/chosen": -0.38649609684944153,
"logps/rejected": -0.389847993850708,
"loss": 1.3119,
"rewards/accuracies": 0.5208333730697632,
"rewards/chosen": -0.7729922533035278,
"rewards/margins": 0.0067038037814199924,
"rewards/rejected": -0.7796960473060608,
"step": 60
},
{
"epoch": 0.2066844371539881,
"grad_norm": 3.295572519302368,
"learning_rate": 9.981785063752275e-07,
"logits/chosen": -2.549654483795166,
"logits/rejected": -2.5630557537078857,
"logps/chosen": -0.3998144268989563,
"logps/rejected": -0.4005410671234131,
"loss": 1.3159,
"rewards/accuracies": 0.4583333432674408,
"rewards/chosen": -0.7996287941932678,
"rewards/margins": 0.0014533549547195435,
"rewards/rejected": -0.8010821342468262,
"step": 63
},
{
"epoch": 0.21652655320893993,
"grad_norm": 3.1199636459350586,
"learning_rate": 9.927140255009107e-07,
"logits/chosen": -2.619666337966919,
"logits/rejected": -2.6088433265686035,
"logps/chosen": -0.3825004994869232,
"logps/rejected": -0.3724328875541687,
"loss": 1.3313,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.7650009393692017,
"rewards/margins": -0.020135192200541496,
"rewards/rejected": -0.7448657751083374,
"step": 66
},
{
"epoch": 0.22636866926389174,
"grad_norm": 2.7287650108337402,
"learning_rate": 9.872495446265937e-07,
"logits/chosen": -2.1145222187042236,
"logits/rejected": -2.1984646320343018,
"logps/chosen": -0.3847331702709198,
"logps/rejected": -0.3833593726158142,
"loss": 1.318,
"rewards/accuracies": 0.4739583730697632,
"rewards/chosen": -0.7694664001464844,
"rewards/margins": -0.0027476283721625805,
"rewards/rejected": -0.7667187452316284,
"step": 69
},
{
"epoch": 0.23621078531884354,
"grad_norm": 3.2083938121795654,
"learning_rate": 9.817850637522768e-07,
"logits/chosen": -2.5148842334747314,
"logits/rejected": -2.515829086303711,
"logps/chosen": -0.37398943305015564,
"logps/rejected": -0.37285110354423523,
"loss": 1.3179,
"rewards/accuracies": 0.4843750298023224,
"rewards/chosen": -0.7479788661003113,
"rewards/margins": -0.0022766790352761745,
"rewards/rejected": -0.7457021474838257,
"step": 72
},
{
"epoch": 0.24605290137379537,
"grad_norm": 2.843623638153076,
"learning_rate": 9.7632058287796e-07,
"logits/chosen": -2.1537227630615234,
"logits/rejected": -2.186638355255127,
"logps/chosen": -0.3700721561908722,
"logps/rejected": -0.3716978430747986,
"loss": 1.3131,
"rewards/accuracies": 0.5520833730697632,
"rewards/chosen": -0.7401443123817444,
"rewards/margins": 0.0032513481564819813,
"rewards/rejected": -0.7433956861495972,
"step": 75
},
{
"epoch": 0.2558950174287472,
"grad_norm": 2.9954681396484375,
"learning_rate": 9.70856102003643e-07,
"logits/chosen": -2.37508487701416,
"logits/rejected": -2.3783116340637207,
"logps/chosen": -0.3550634980201721,
"logps/rejected": -0.36521193385124207,
"loss": 1.3005,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7101269960403442,
"rewards/margins": 0.020296888425946236,
"rewards/rejected": -0.7304238677024841,
"step": 78
},
{
"epoch": 0.265737133483699,
"grad_norm": 2.9920599460601807,
"learning_rate": 9.65391621129326e-07,
"logits/chosen": -2.3454084396362305,
"logits/rejected": -2.2591981887817383,
"logps/chosen": -0.3407334089279175,
"logps/rejected": -0.3508540987968445,
"loss": 1.3001,
"rewards/accuracies": 0.6041666865348816,
"rewards/chosen": -0.6814668774604797,
"rewards/margins": 0.02024134062230587,
"rewards/rejected": -0.701708197593689,
"step": 81
},
{
"epoch": 0.2755792495386508,
"grad_norm": 2.8075363636016846,
"learning_rate": 9.599271402550091e-07,
"logits/chosen": -2.3530383110046387,
"logits/rejected": -2.3570096492767334,
"logps/chosen": -0.3530980944633484,
"logps/rejected": -0.34843918681144714,
"loss": 1.3219,
"rewards/accuracies": 0.484375,
"rewards/chosen": -0.7061961889266968,
"rewards/margins": -0.009317765012383461,
"rewards/rejected": -0.6968783736228943,
"step": 84
},
{
"epoch": 0.28542136559360265,
"grad_norm": 3.062509536743164,
"learning_rate": 9.54462659380692e-07,
"logits/chosen": -2.3423635959625244,
"logits/rejected": -2.3681883811950684,
"logps/chosen": -0.3502524793148041,
"logps/rejected": -0.3471028804779053,
"loss": 1.3204,
"rewards/accuracies": 0.4895833432674408,
"rewards/chosen": -0.7005049586296082,
"rewards/margins": -0.006299168802797794,
"rewards/rejected": -0.6942057013511658,
"step": 87
},
{
"epoch": 0.2952634816485544,
"grad_norm": 2.8204712867736816,
"learning_rate": 9.489981785063752e-07,
"logits/chosen": -2.2016220092773438,
"logits/rejected": -2.176506996154785,
"logps/chosen": -0.32860067486763,
"logps/rejected": -0.3329722583293915,
"loss": 1.3082,
"rewards/accuracies": 0.5677083730697632,
"rewards/chosen": -0.6572014093399048,
"rewards/margins": 0.00874313898384571,
"rewards/rejected": -0.665944516658783,
"step": 90
},
{
"epoch": 0.30510559770350626,
"grad_norm": 2.8776941299438477,
"learning_rate": 9.435336976320582e-07,
"logits/chosen": -2.272819995880127,
"logits/rejected": -2.2633821964263916,
"logps/chosen": -0.33269160985946655,
"logps/rejected": -0.3359033465385437,
"loss": 1.3102,
"rewards/accuracies": 0.5104166865348816,
"rewards/chosen": -0.6653832197189331,
"rewards/margins": 0.006423423532396555,
"rewards/rejected": -0.6718066930770874,
"step": 93
},
{
"epoch": 0.3149477137584581,
"grad_norm": 2.593137264251709,
"learning_rate": 9.380692167577413e-07,
"logits/chosen": -2.213297128677368,
"logits/rejected": -2.2387821674346924,
"logps/chosen": -0.3308699429035187,
"logps/rejected": -0.3250887393951416,
"loss": 1.3232,
"rewards/accuracies": 0.4739583730697632,
"rewards/chosen": -0.6617398262023926,
"rewards/margins": -0.011562440544366837,
"rewards/rejected": -0.6501774191856384,
"step": 96
},
{
"epoch": 0.32478982981340987,
"grad_norm": 2.9904048442840576,
"learning_rate": 9.326047358834243e-07,
"logits/chosen": -2.3887205123901367,
"logits/rejected": -2.4502334594726562,
"logps/chosen": -0.3256734609603882,
"logps/rejected": -0.32411491870880127,
"loss": 1.3169,
"rewards/accuracies": 0.4635416865348816,
"rewards/chosen": -0.6513469219207764,
"rewards/margins": -0.0031170835718512535,
"rewards/rejected": -0.6482298970222473,
"step": 99
},
{
"epoch": 0.3346319458683617,
"grad_norm": 2.659723997116089,
"learning_rate": 9.271402550091074e-07,
"logits/chosen": -2.0707240104675293,
"logits/rejected": -2.098159074783325,
"logps/chosen": -0.3261147737503052,
"logps/rejected": -0.32286298274993896,
"loss": 1.3197,
"rewards/accuracies": 0.4895833730697632,
"rewards/chosen": -0.6522295475006104,
"rewards/margins": -0.006503552198410034,
"rewards/rejected": -0.6457259654998779,
"step": 102
},
{
"epoch": 0.34447406192331353,
"grad_norm": 2.617114782333374,
"learning_rate": 9.216757741347905e-07,
"logits/chosen": -2.3909406661987305,
"logits/rejected": -2.2843799591064453,
"logps/chosen": -0.31216752529144287,
"logps/rejected": -0.31390637159347534,
"loss": 1.3118,
"rewards/accuracies": 0.484375,
"rewards/chosen": -0.624334990978241,
"rewards/margins": 0.0034777685068547726,
"rewards/rejected": -0.6278128027915955,
"step": 105
},
{
"epoch": 0.3543161779782653,
"grad_norm": 2.900043249130249,
"learning_rate": 9.162112932604735e-07,
"logits/chosen": -2.1978025436401367,
"logits/rejected": -2.2727980613708496,
"logps/chosen": -0.3117530345916748,
"logps/rejected": -0.30951735377311707,
"loss": 1.3177,
"rewards/accuracies": 0.479166716337204,
"rewards/chosen": -0.6235060691833496,
"rewards/margins": -0.004471416585147381,
"rewards/rejected": -0.6190346479415894,
"step": 108
},
{
"epoch": 0.36415829403321714,
"grad_norm": 2.785658359527588,
"learning_rate": 9.107468123861566e-07,
"logits/chosen": -2.2299718856811523,
"logits/rejected": -2.2097911834716797,
"logps/chosen": -0.3144725561141968,
"logps/rejected": -0.31601589918136597,
"loss": 1.3121,
"rewards/accuracies": 0.484375,
"rewards/chosen": -0.6289451122283936,
"rewards/margins": 0.003086656332015991,
"rewards/rejected": -0.6320317983627319,
"step": 111
},
{
"epoch": 0.374000410088169,
"grad_norm": 3.3421730995178223,
"learning_rate": 9.052823315118397e-07,
"logits/chosen": -2.1038126945495605,
"logits/rejected": -2.0839600563049316,
"logps/chosen": -0.3226383328437805,
"logps/rejected": -0.32167741656303406,
"loss": 1.3162,
"rewards/accuracies": 0.4791666865348816,
"rewards/chosen": -0.645276665687561,
"rewards/margins": -0.0019218978704884648,
"rewards/rejected": -0.6433548331260681,
"step": 114
},
{
"epoch": 0.38384252614312075,
"grad_norm": 2.9325544834136963,
"learning_rate": 8.998178506375227e-07,
"logits/chosen": -2.20025634765625,
"logits/rejected": -2.2536721229553223,
"logps/chosen": -0.311615526676178,
"logps/rejected": -0.31010982394218445,
"loss": 1.3166,
"rewards/accuracies": 0.515625,
"rewards/chosen": -0.623231053352356,
"rewards/margins": -0.0030114587862044573,
"rewards/rejected": -0.6202195882797241,
"step": 117
},
{
"epoch": 0.3936846421980726,
"grad_norm": 2.7917957305908203,
"learning_rate": 8.943533697632057e-07,
"logits/chosen": -2.3032007217407227,
"logits/rejected": -2.2467172145843506,
"logps/chosen": -0.31110310554504395,
"logps/rejected": -0.30983883142471313,
"loss": 1.316,
"rewards/accuracies": 0.4531250298023224,
"rewards/chosen": -0.6222062110900879,
"rewards/margins": -0.002528547076508403,
"rewards/rejected": -0.6196776628494263,
"step": 120
},
{
"epoch": 0.4035267582530244,
"grad_norm": 2.7737972736358643,
"learning_rate": 8.888888888888888e-07,
"logits/chosen": -2.2247915267944336,
"logits/rejected": -2.185898542404175,
"logps/chosen": -0.30768126249313354,
"logps/rejected": -0.31419891119003296,
"loss": 1.3047,
"rewards/accuracies": 0.5416666865348816,
"rewards/chosen": -0.6153625249862671,
"rewards/margins": 0.013035254552960396,
"rewards/rejected": -0.6283978223800659,
"step": 123
},
{
"epoch": 0.4133688743079762,
"grad_norm": 2.8648312091827393,
"learning_rate": 8.834244080145718e-07,
"logits/chosen": -2.2792012691497803,
"logits/rejected": -2.2944419384002686,
"logps/chosen": -0.3053058981895447,
"logps/rejected": -0.30649513006210327,
"loss": 1.3127,
"rewards/accuracies": 0.515625,
"rewards/chosen": -0.6106117963790894,
"rewards/margins": 0.0023784590885043144,
"rewards/rejected": -0.6129902601242065,
"step": 126
},
{
"epoch": 0.42321099036292803,
"grad_norm": 3.244568109512329,
"learning_rate": 8.77959927140255e-07,
"logits/chosen": -2.3420162200927734,
"logits/rejected": -2.3962247371673584,
"logps/chosen": -0.29792097210884094,
"logps/rejected": -0.3045887351036072,
"loss": 1.3048,
"rewards/accuracies": 0.5572916865348816,
"rewards/chosen": -0.5958418846130371,
"rewards/margins": 0.01333555020391941,
"rewards/rejected": -0.6091774702072144,
"step": 129
},
{
"epoch": 0.43305310641787986,
"grad_norm": 3.226536989212036,
"learning_rate": 8.724954462659381e-07,
"logits/chosen": -2.076824188232422,
"logits/rejected": -2.115086793899536,
"logps/chosen": -0.30216705799102783,
"logps/rejected": -0.3021796941757202,
"loss": 1.3144,
"rewards/accuracies": 0.4479166865348816,
"rewards/chosen": -0.6043341159820557,
"rewards/margins": 2.5328248739242554e-05,
"rewards/rejected": -0.6043593883514404,
"step": 132
},
{
"epoch": 0.44289522247283164,
"grad_norm": 2.785240411758423,
"learning_rate": 8.670309653916211e-07,
"logits/chosen": -2.142348289489746,
"logits/rejected": -2.162036418914795,
"logps/chosen": -0.3002782166004181,
"logps/rejected": -0.29516181349754333,
"loss": 1.3219,
"rewards/accuracies": 0.4843750298023224,
"rewards/chosen": -0.6005564332008362,
"rewards/margins": -0.010232776403427124,
"rewards/rejected": -0.5903236269950867,
"step": 135
},
{
"epoch": 0.4527373385277835,
"grad_norm": 2.65671968460083,
"learning_rate": 8.615664845173042e-07,
"logits/chosen": -2.1252198219299316,
"logits/rejected": -2.1430177688598633,
"logps/chosen": -0.3017180562019348,
"logps/rejected": -0.31176209449768066,
"loss": 1.2998,
"rewards/accuracies": 0.5625000596046448,
"rewards/chosen": -0.6034361124038696,
"rewards/margins": 0.02008809708058834,
"rewards/rejected": -0.6235241889953613,
"step": 138
},
{
"epoch": 0.4625794545827353,
"grad_norm": 2.574594020843506,
"learning_rate": 8.561020036429873e-07,
"logits/chosen": -2.204986095428467,
"logits/rejected": -2.273808002471924,
"logps/chosen": -0.3037709593772888,
"logps/rejected": -0.3029387295246124,
"loss": 1.3155,
"rewards/accuracies": 0.4947916865348816,
"rewards/chosen": -0.6075419187545776,
"rewards/margins": -0.0016644850838929415,
"rewards/rejected": -0.6058773994445801,
"step": 141
},
{
"epoch": 0.4724215706376871,
"grad_norm": 3.181304931640625,
"learning_rate": 8.506375227686703e-07,
"logits/chosen": -2.242096424102783,
"logits/rejected": -2.195438861846924,
"logps/chosen": -0.30111944675445557,
"logps/rejected": -0.3043938875198364,
"loss": 1.3097,
"rewards/accuracies": 0.5104167461395264,
"rewards/chosen": -0.6022388935089111,
"rewards/margins": 0.00654886569827795,
"rewards/rejected": -0.6087877750396729,
"step": 144
},
{
"epoch": 0.4822636866926389,
"grad_norm": 3.2186405658721924,
"learning_rate": 8.451730418943533e-07,
"logits/chosen": -2.217180013656616,
"logits/rejected": -2.3594436645507812,
"logps/chosen": -0.3065149486064911,
"logps/rejected": -0.3052757978439331,
"loss": 1.3163,
"rewards/accuracies": 0.5208333730697632,
"rewards/chosen": -0.6130298972129822,
"rewards/margins": -0.002478264272212982,
"rewards/rejected": -0.6105515956878662,
"step": 147
},
{
"epoch": 0.49210580274759075,
"grad_norm": 3.3180651664733887,
"learning_rate": 8.397085610200364e-07,
"logits/chosen": -2.209411859512329,
"logits/rejected": -2.153970718383789,
"logps/chosen": -0.30137962102890015,
"logps/rejected": -0.3014276325702667,
"loss": 1.3146,
"rewards/accuracies": 0.5260417461395264,
"rewards/chosen": -0.6027592420578003,
"rewards/margins": 9.602296631783247e-05,
"rewards/rejected": -0.6028553247451782,
"step": 150
},
{
"epoch": 0.5019479188025425,
"grad_norm": 2.8915350437164307,
"learning_rate": 8.342440801457194e-07,
"logits/chosen": -2.2405731678009033,
"logits/rejected": -2.197042465209961,
"logps/chosen": -0.2927592694759369,
"logps/rejected": -0.2949146032333374,
"loss": 1.3112,
"rewards/accuracies": 0.5833333730697632,
"rewards/chosen": -0.5855185985565186,
"rewards/margins": 0.0043106647208333015,
"rewards/rejected": -0.5898292064666748,
"step": 153
},
{
"epoch": 0.5117900348574944,
"grad_norm": 3.5988616943359375,
"learning_rate": 8.287795992714025e-07,
"logits/chosen": -2.556368112564087,
"logits/rejected": -2.5539543628692627,
"logps/chosen": -0.29329127073287964,
"logps/rejected": -0.30116719007492065,
"loss": 1.3027,
"rewards/accuracies": 0.5364583730697632,
"rewards/chosen": -0.5865825414657593,
"rewards/margins": 0.015751861035823822,
"rewards/rejected": -0.6023343801498413,
"step": 156
},
{
"epoch": 0.5216321509124462,
"grad_norm": 3.1660470962524414,
"learning_rate": 8.233151183970856e-07,
"logits/chosen": -2.296084403991699,
"logits/rejected": -2.319601058959961,
"logps/chosen": -0.29829874634742737,
"logps/rejected": -0.3049090504646301,
"loss": 1.3047,
"rewards/accuracies": 0.5208333730697632,
"rewards/chosen": -0.5965974926948547,
"rewards/margins": 0.013220642693340778,
"rewards/rejected": -0.6098181009292603,
"step": 159
},
{
"epoch": 0.531474266967398,
"grad_norm": 3.3348488807678223,
"learning_rate": 8.178506375227686e-07,
"logits/chosen": -2.1131765842437744,
"logits/rejected": -2.0808582305908203,
"logps/chosen": -0.3017235994338989,
"logps/rejected": -0.30382847785949707,
"loss": 1.3113,
"rewards/accuracies": 0.5104166865348816,
"rewards/chosen": -0.6034471988677979,
"rewards/margins": 0.004209776874631643,
"rewards/rejected": -0.6076569557189941,
"step": 162
},
{
"epoch": 0.5413163830223499,
"grad_norm": 3.311103582382202,
"learning_rate": 8.123861566484517e-07,
"logits/chosen": -2.4615557193756104,
"logits/rejected": -2.393066644668579,
"logps/chosen": -0.296355664730072,
"logps/rejected": -0.30595722794532776,
"loss": 1.3007,
"rewards/accuracies": 0.5677083730697632,
"rewards/chosen": -0.592711329460144,
"rewards/margins": 0.01920315995812416,
"rewards/rejected": -0.6119144558906555,
"step": 165
},
{
"epoch": 0.5511584990773016,
"grad_norm": 3.7719385623931885,
"learning_rate": 8.069216757741348e-07,
"logits/chosen": -2.6202847957611084,
"logits/rejected": -2.6656908988952637,
"logps/chosen": -0.299972265958786,
"logps/rejected": -0.2976400554180145,
"loss": 1.318,
"rewards/accuracies": 0.5104166865348816,
"rewards/chosen": -0.5999445915222168,
"rewards/margins": -0.004664432257413864,
"rewards/rejected": -0.595280110836029,
"step": 168
},
{
"epoch": 0.5610006151322534,
"grad_norm": 2.891984462738037,
"learning_rate": 8.014571948998177e-07,
"logits/chosen": -2.2948923110961914,
"logits/rejected": -2.3426403999328613,
"logps/chosen": -0.2913352847099304,
"logps/rejected": -0.29462340474128723,
"loss": 1.3094,
"rewards/accuracies": 0.515625,
"rewards/chosen": -0.5826705098152161,
"rewards/margins": 0.006576266605407,
"rewards/rejected": -0.5892468690872192,
"step": 171
},
{
"epoch": 0.5708427311872053,
"grad_norm": 3.1071033477783203,
"learning_rate": 7.959927140255008e-07,
"logits/chosen": -2.54439115524292,
"logits/rejected": -2.484774112701416,
"logps/chosen": -0.2987141013145447,
"logps/rejected": -0.29878202080726624,
"loss": 1.3143,
"rewards/accuracies": 0.5052083730697632,
"rewards/chosen": -0.5974282026290894,
"rewards/margins": 0.0001358254812657833,
"rewards/rejected": -0.5975640416145325,
"step": 174
},
{
"epoch": 0.5806848472421571,
"grad_norm": 3.294955253601074,
"learning_rate": 7.905282331511839e-07,
"logits/chosen": -2.5297629833221436,
"logits/rejected": -2.6556806564331055,
"logps/chosen": -0.2948570251464844,
"logps/rejected": -0.292624831199646,
"loss": 1.3182,
"rewards/accuracies": 0.4687500298023224,
"rewards/chosen": -0.5897141098976135,
"rewards/margins": -0.004464422352612019,
"rewards/rejected": -0.585249662399292,
"step": 177
},
{
"epoch": 0.5905269632971089,
"grad_norm": 3.587242364883423,
"learning_rate": 7.850637522768669e-07,
"logits/chosen": -2.43650484085083,
"logits/rejected": -2.4716460704803467,
"logps/chosen": -0.3128420114517212,
"logps/rejected": -0.30596795678138733,
"loss": 1.3251,
"rewards/accuracies": 0.4739583730697632,
"rewards/chosen": -0.6256840229034424,
"rewards/margins": -0.01374807208776474,
"rewards/rejected": -0.6119359135627747,
"step": 180
},
{
"epoch": 0.6003690793520607,
"grad_norm": 3.2697906494140625,
"learning_rate": 7.795992714025501e-07,
"logits/chosen": -2.6053969860076904,
"logits/rejected": -2.605846881866455,
"logps/chosen": -0.30022186040878296,
"logps/rejected": -0.2978324294090271,
"loss": 1.3181,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.6004437208175659,
"rewards/margins": -0.004778880625963211,
"rewards/rejected": -0.5956648588180542,
"step": 183
},
{
"epoch": 0.6102111954070125,
"grad_norm": 5.84366512298584,
"learning_rate": 7.741347905282332e-07,
"logits/chosen": -2.2874748706817627,
"logits/rejected": -2.3448486328125,
"logps/chosen": -0.2971286177635193,
"logps/rejected": -0.29523584246635437,
"loss": 1.3177,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.5942572355270386,
"rewards/margins": -0.0037855547852814198,
"rewards/rejected": -0.590471625328064,
"step": 186
},
{
"epoch": 0.6200533114619643,
"grad_norm": 3.066098213195801,
"learning_rate": 7.686703096539162e-07,
"logits/chosen": -2.271486520767212,
"logits/rejected": -2.32065486907959,
"logps/chosen": -0.29515424370765686,
"logps/rejected": -0.30097514390945435,
"loss": 1.306,
"rewards/accuracies": 0.4895833730697632,
"rewards/chosen": -0.5903085470199585,
"rewards/margins": 0.011641697026789188,
"rewards/rejected": -0.6019502878189087,
"step": 189
},
{
"epoch": 0.6298954275169162,
"grad_norm": 3.2268269062042236,
"learning_rate": 7.632058287795993e-07,
"logits/chosen": -2.565883159637451,
"logits/rejected": -2.644766330718994,
"logps/chosen": -0.28512534499168396,
"logps/rejected": -0.28749996423721313,
"loss": 1.3108,
"rewards/accuracies": 0.4895833730697632,
"rewards/chosen": -0.5702506899833679,
"rewards/margins": 0.004749252460896969,
"rewards/rejected": -0.5749999284744263,
"step": 192
},
{
"epoch": 0.639737543571868,
"grad_norm": 3.53078031539917,
"learning_rate": 7.577413479052824e-07,
"logits/chosen": -2.412142515182495,
"logits/rejected": -2.4293620586395264,
"logps/chosen": -0.2922815978527069,
"logps/rejected": -0.2959037125110626,
"loss": 1.3091,
"rewards/accuracies": 0.5364583730697632,
"rewards/chosen": -0.5845631957054138,
"rewards/margins": 0.0072442456148564816,
"rewards/rejected": -0.5918074250221252,
"step": 195
},
{
"epoch": 0.6495796596268197,
"grad_norm": 2.991565227508545,
"learning_rate": 7.522768670309653e-07,
"logits/chosen": -2.1828391551971436,
"logits/rejected": -2.2708098888397217,
"logps/chosen": -0.2814570367336273,
"logps/rejected": -0.2823142409324646,
"loss": 1.3132,
"rewards/accuracies": 0.479166716337204,
"rewards/chosen": -0.5629140734672546,
"rewards/margins": 0.001714351586997509,
"rewards/rejected": -0.5646284222602844,
"step": 198
},
{
"epoch": 0.6594217756817716,
"grad_norm": 3.2695488929748535,
"learning_rate": 7.468123861566484e-07,
"logits/chosen": -2.651029109954834,
"logits/rejected": -2.5657577514648438,
"logps/chosen": -0.2821059226989746,
"logps/rejected": -0.288613885641098,
"loss": 1.3049,
"rewards/accuracies": 0.5885416865348816,
"rewards/chosen": -0.564211905002594,
"rewards/margins": 0.013015862554311752,
"rewards/rejected": -0.577227771282196,
"step": 201
},
{
"epoch": 0.6692638917367234,
"grad_norm": 3.027118444442749,
"learning_rate": 7.413479052823315e-07,
"logits/chosen": -2.4459362030029297,
"logits/rejected": -2.4375133514404297,
"logps/chosen": -0.2884170114994049,
"logps/rejected": -0.29369017481803894,
"loss": 1.3066,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.576833963394165,
"rewards/margins": 0.010546308942139149,
"rewards/rejected": -0.5873803496360779,
"step": 204
},
{
"epoch": 0.6791060077916752,
"grad_norm": 3.216331720352173,
"learning_rate": 7.358834244080145e-07,
"logits/chosen": -2.2358179092407227,
"logits/rejected": -2.272609233856201,
"logps/chosen": -0.29196596145629883,
"logps/rejected": -0.29673081636428833,
"loss": 1.3078,
"rewards/accuracies": 0.5520833730697632,
"rewards/chosen": -0.5839319229125977,
"rewards/margins": 0.009529721923172474,
"rewards/rejected": -0.5934616327285767,
"step": 207
},
{
"epoch": 0.6889481238466271,
"grad_norm": 3.1151554584503174,
"learning_rate": 7.304189435336976e-07,
"logits/chosen": -2.1243269443511963,
"logits/rejected": -2.25223708152771,
"logps/chosen": -0.28017833828926086,
"logps/rejected": -0.28699177503585815,
"loss": 1.3044,
"rewards/accuracies": 0.5729166865348816,
"rewards/chosen": -0.5603566765785217,
"rewards/margins": 0.013626816682517529,
"rewards/rejected": -0.5739835500717163,
"step": 210
},
{
"epoch": 0.6987902399015788,
"grad_norm": 2.807512044906616,
"learning_rate": 7.249544626593807e-07,
"logits/chosen": -2.404712677001953,
"logits/rejected": -2.342726945877075,
"logps/chosen": -0.274359792470932,
"logps/rejected": -0.27572181820869446,
"loss": 1.3124,
"rewards/accuracies": 0.494791716337204,
"rewards/chosen": -0.548719584941864,
"rewards/margins": 0.002724003978073597,
"rewards/rejected": -0.5514436364173889,
"step": 213
},
{
"epoch": 0.7086323559565306,
"grad_norm": 2.903782606124878,
"learning_rate": 7.194899817850637e-07,
"logits/chosen": -2.2218027114868164,
"logits/rejected": -2.2185091972351074,
"logps/chosen": -0.2856960594654083,
"logps/rejected": -0.2866899073123932,
"loss": 1.3127,
"rewards/accuracies": 0.515625,
"rewards/chosen": -0.5713921189308167,
"rewards/margins": 0.001987707568332553,
"rewards/rejected": -0.5733798742294312,
"step": 216
},
{
"epoch": 0.7184744720114825,
"grad_norm": 2.9293315410614014,
"learning_rate": 7.140255009107468e-07,
"logits/chosen": -2.249828338623047,
"logits/rejected": -2.340233087539673,
"logps/chosen": -0.2771129608154297,
"logps/rejected": -0.2712464928627014,
"loss": 1.3229,
"rewards/accuracies": 0.4687500298023224,
"rewards/chosen": -0.5542259216308594,
"rewards/margins": -0.011732938699424267,
"rewards/rejected": -0.5424929857254028,
"step": 219
},
{
"epoch": 0.7283165880664343,
"grad_norm": 3.209792137145996,
"learning_rate": 7.085610200364299e-07,
"logits/chosen": -2.6383891105651855,
"logits/rejected": -2.5945024490356445,
"logps/chosen": -0.275699257850647,
"logps/rejected": -0.2788482904434204,
"loss": 1.3099,
"rewards/accuracies": 0.5104166865348816,
"rewards/chosen": -0.551398515701294,
"rewards/margins": 0.0062980144284665585,
"rewards/rejected": -0.5576965808868408,
"step": 222
},
{
"epoch": 0.7381587041213861,
"grad_norm": 3.3083884716033936,
"learning_rate": 7.030965391621128e-07,
"logits/chosen": -2.1804957389831543,
"logits/rejected": -2.2220664024353027,
"logps/chosen": -0.2919140160083771,
"logps/rejected": -0.28688621520996094,
"loss": 1.3218,
"rewards/accuracies": 0.4739583432674408,
"rewards/chosen": -0.5838280320167542,
"rewards/margins": -0.01005559042096138,
"rewards/rejected": -0.5737724304199219,
"step": 225
},
{
"epoch": 0.748000820176338,
"grad_norm": 3.429899215698242,
"learning_rate": 6.976320582877959e-07,
"logits/chosen": -2.5299549102783203,
"logits/rejected": -2.5133347511291504,
"logps/chosen": -0.27780574560165405,
"logps/rejected": -0.2840309739112854,
"loss": 1.3053,
"rewards/accuracies": 0.4635416865348816,
"rewards/chosen": -0.5556114912033081,
"rewards/margins": 0.01245046779513359,
"rewards/rejected": -0.568061888217926,
"step": 228
},
{
"epoch": 0.7578429362312897,
"grad_norm": 3.071539878845215,
"learning_rate": 6.92167577413479e-07,
"logits/chosen": -2.312579393386841,
"logits/rejected": -2.415515422821045,
"logps/chosen": -0.27620574831962585,
"logps/rejected": -0.27957403659820557,
"loss": 1.3096,
"rewards/accuracies": 0.5364583730697632,
"rewards/chosen": -0.5524114966392517,
"rewards/margins": 0.006736626382917166,
"rewards/rejected": -0.5591480731964111,
"step": 231
},
{
"epoch": 0.7676850522862415,
"grad_norm": 3.4358389377593994,
"learning_rate": 6.86703096539162e-07,
"logits/chosen": -2.452547550201416,
"logits/rejected": -2.4726529121398926,
"logps/chosen": -0.284848153591156,
"logps/rejected": -0.2895206809043884,
"loss": 1.3076,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.569696307182312,
"rewards/margins": 0.009345123544335365,
"rewards/rejected": -0.5790413618087769,
"step": 234
},
{
"epoch": 0.7775271683411934,
"grad_norm": 3.199431896209717,
"learning_rate": 6.812386156648452e-07,
"logits/chosen": -2.2307732105255127,
"logits/rejected": -2.2812376022338867,
"logps/chosen": -0.29158759117126465,
"logps/rejected": -0.2949972152709961,
"loss": 1.3096,
"rewards/accuracies": 0.515625,
"rewards/chosen": -0.5831751823425293,
"rewards/margins": 0.006819295696914196,
"rewards/rejected": -0.5899944305419922,
"step": 237
},
{
"epoch": 0.7873692843961452,
"grad_norm": 2.6213815212249756,
"learning_rate": 6.757741347905283e-07,
"logits/chosen": -2.1308844089508057,
"logits/rejected": -2.17814302444458,
"logps/chosen": -0.27422723174095154,
"logps/rejected": -0.2723295986652374,
"loss": 1.3171,
"rewards/accuracies": 0.5104166865348816,
"rewards/chosen": -0.5484545230865479,
"rewards/margins": -0.0037952661514282227,
"rewards/rejected": -0.5446591973304749,
"step": 240
},
{
"epoch": 0.797211400451097,
"grad_norm": 2.6146914958953857,
"learning_rate": 6.703096539162113e-07,
"logits/chosen": -2.217012405395508,
"logits/rejected": -2.30794095993042,
"logps/chosen": -0.2751448452472687,
"logps/rejected": -0.2789701223373413,
"loss": 1.3088,
"rewards/accuracies": 0.5052083730697632,
"rewards/chosen": -0.5502896904945374,
"rewards/margins": 0.007650562096387148,
"rewards/rejected": -0.5579402446746826,
"step": 243
},
{
"epoch": 0.8070535165060488,
"grad_norm": 3.266503095626831,
"learning_rate": 6.648451730418944e-07,
"logits/chosen": -2.3135907649993896,
"logits/rejected": -2.2477428913116455,
"logps/chosen": -0.2836703360080719,
"logps/rejected": -0.28257811069488525,
"loss": 1.3161,
"rewards/accuracies": 0.4739583730697632,
"rewards/chosen": -0.5673407316207886,
"rewards/margins": -0.002184514421969652,
"rewards/rejected": -0.5651561617851257,
"step": 246
},
{
"epoch": 0.8168956325610006,
"grad_norm": 3.104182481765747,
"learning_rate": 6.593806921675775e-07,
"logits/chosen": -2.328064441680908,
"logits/rejected": -2.3774280548095703,
"logps/chosen": -0.276460200548172,
"logps/rejected": -0.2795277237892151,
"loss": 1.3097,
"rewards/accuracies": 0.546875,
"rewards/chosen": -0.552920401096344,
"rewards/margins": 0.006135078612715006,
"rewards/rejected": -0.559055507183075,
"step": 249
},
{
"epoch": 0.8267377486159524,
"grad_norm": 3.145758867263794,
"learning_rate": 6.539162112932604e-07,
"logits/chosen": -2.0514845848083496,
"logits/rejected": -2.1181387901306152,
"logps/chosen": -0.2676684558391571,
"logps/rejected": -0.2710112929344177,
"loss": 1.3094,
"rewards/accuracies": 0.515625,
"rewards/chosen": -0.5353369116783142,
"rewards/margins": 0.006685652770102024,
"rewards/rejected": -0.5420225262641907,
"step": 252
},
{
"epoch": 0.8365798646709043,
"grad_norm": 2.8558096885681152,
"learning_rate": 6.484517304189435e-07,
"logits/chosen": -2.133450984954834,
"logits/rejected": -2.174488067626953,
"logps/chosen": -0.2785017192363739,
"logps/rejected": -0.27986931800842285,
"loss": 1.3122,
"rewards/accuracies": 0.4895833730697632,
"rewards/chosen": -0.557003378868103,
"rewards/margins": 0.0027352517936378717,
"rewards/rejected": -0.5597386360168457,
"step": 255
},
{
"epoch": 0.8464219807258561,
"grad_norm": 2.6255104541778564,
"learning_rate": 6.429872495446266e-07,
"logits/chosen": -2.120138168334961,
"logits/rejected": -2.167184829711914,
"logps/chosen": -0.2761051654815674,
"logps/rejected": -0.27183249592781067,
"loss": 1.3209,
"rewards/accuracies": 0.4843750298023224,
"rewards/chosen": -0.5522103309631348,
"rewards/margins": -0.008545313030481339,
"rewards/rejected": -0.5436649918556213,
"step": 258
},
{
"epoch": 0.8562640967808078,
"grad_norm": 3.049882173538208,
"learning_rate": 6.375227686703096e-07,
"logits/chosen": -2.144340991973877,
"logits/rejected": -2.2074198722839355,
"logps/chosen": -0.28039246797561646,
"logps/rejected": -0.2806190848350525,
"loss": 1.314,
"rewards/accuracies": 0.515625,
"rewards/chosen": -0.5607849359512329,
"rewards/margins": 0.00045326724648475647,
"rewards/rejected": -0.5612382292747498,
"step": 261
},
{
"epoch": 0.8661062128357597,
"grad_norm": 3.1524369716644287,
"learning_rate": 6.320582877959927e-07,
"logits/chosen": -2.2970197200775146,
"logits/rejected": -2.2662580013275146,
"logps/chosen": -0.2740846872329712,
"logps/rejected": -0.27690619230270386,
"loss": 1.3101,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.5481693744659424,
"rewards/margins": 0.005643073469400406,
"rewards/rejected": -0.5538123846054077,
"step": 264
},
{
"epoch": 0.8759483288907115,
"grad_norm": 2.8429999351501465,
"learning_rate": 6.265938069216758e-07,
"logits/chosen": -2.36765456199646,
"logits/rejected": -2.348548650741577,
"logps/chosen": -0.2780293822288513,
"logps/rejected": -0.2812359631061554,
"loss": 1.3096,
"rewards/accuracies": 0.5104166865348816,
"rewards/chosen": -0.5560587644577026,
"rewards/margins": 0.00641320226714015,
"rewards/rejected": -0.5624719262123108,
"step": 267
},
{
"epoch": 0.8857904449456633,
"grad_norm": 3.0595552921295166,
"learning_rate": 6.211293260473588e-07,
"logits/chosen": -2.509822368621826,
"logits/rejected": -2.49569034576416,
"logps/chosen": -0.2772996425628662,
"logps/rejected": -0.28799355030059814,
"loss": 1.299,
"rewards/accuracies": 0.5729166865348816,
"rewards/chosen": -0.5545992851257324,
"rewards/margins": 0.021387770771980286,
"rewards/rejected": -0.5759870409965515,
"step": 270
},
{
"epoch": 0.8956325610006152,
"grad_norm": 3.2352564334869385,
"learning_rate": 6.156648451730419e-07,
"logits/chosen": -2.1871042251586914,
"logits/rejected": -2.1954808235168457,
"logps/chosen": -0.27809447050094604,
"logps/rejected": -0.2834944427013397,
"loss": 1.3066,
"rewards/accuracies": 0.5364583730697632,
"rewards/chosen": -0.5561889410018921,
"rewards/margins": 0.010799943469464779,
"rewards/rejected": -0.5669888854026794,
"step": 273
},
{
"epoch": 0.905474677055567,
"grad_norm": 2.931501865386963,
"learning_rate": 6.102003642987249e-07,
"logits/chosen": -2.225825309753418,
"logits/rejected": -2.2486400604248047,
"logps/chosen": -0.280958890914917,
"logps/rejected": -0.2846115827560425,
"loss": 1.3091,
"rewards/accuracies": 0.5052083730697632,
"rewards/chosen": -0.561917781829834,
"rewards/margins": 0.0073054153472185135,
"rewards/rejected": -0.569223165512085,
"step": 276
},
{
"epoch": 0.9153167931105187,
"grad_norm": 3.286054849624634,
"learning_rate": 6.047358834244079e-07,
"logits/chosen": -2.196420669555664,
"logits/rejected": -2.27847957611084,
"logps/chosen": -0.2832242250442505,
"logps/rejected": -0.28931811451911926,
"loss": 1.306,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.5664485096931458,
"rewards/margins": 0.012187773361802101,
"rewards/rejected": -0.5786362290382385,
"step": 279
},
{
"epoch": 0.9251589091654706,
"grad_norm": 2.9873507022857666,
"learning_rate": 5.99271402550091e-07,
"logits/chosen": -2.2308907508850098,
"logits/rejected": -2.211625576019287,
"logps/chosen": -0.28117528557777405,
"logps/rejected": -0.28368309140205383,
"loss": 1.3107,
"rewards/accuracies": 0.515625,
"rewards/chosen": -0.5623506307601929,
"rewards/margins": 0.005015634000301361,
"rewards/rejected": -0.5673662424087524,
"step": 282
},
{
"epoch": 0.9350010252204224,
"grad_norm": 3.100102663040161,
"learning_rate": 5.93806921675774e-07,
"logits/chosen": -2.177976369857788,
"logits/rejected": -2.0842134952545166,
"logps/chosen": -0.27946293354034424,
"logps/rejected": -0.28198277950286865,
"loss": 1.3106,
"rewards/accuracies": 0.515625,
"rewards/chosen": -0.5589258670806885,
"rewards/margins": 0.005039653740823269,
"rewards/rejected": -0.5639654994010925,
"step": 285
},
{
"epoch": 0.9448431412753742,
"grad_norm": 3.292073965072632,
"learning_rate": 5.883424408014571e-07,
"logits/chosen": -2.5255026817321777,
"logits/rejected": -2.5117194652557373,
"logps/chosen": -0.27470558881759644,
"logps/rejected": -0.2794868052005768,
"loss": 1.3077,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5494111776351929,
"rewards/margins": 0.009562441147863865,
"rewards/rejected": -0.5589736104011536,
"step": 288
},
{
"epoch": 0.954685257330326,
"grad_norm": 2.966644048690796,
"learning_rate": 5.828779599271403e-07,
"logits/chosen": -2.282688856124878,
"logits/rejected": -2.3302011489868164,
"logps/chosen": -0.281721293926239,
"logps/rejected": -0.2839483320713043,
"loss": 1.311,
"rewards/accuracies": 0.5052083134651184,
"rewards/chosen": -0.563442587852478,
"rewards/margins": 0.004454084672033787,
"rewards/rejected": -0.5678966641426086,
"step": 291
},
{
"epoch": 0.9645273733852778,
"grad_norm": 3.1071043014526367,
"learning_rate": 5.774134790528234e-07,
"logits/chosen": -2.2296414375305176,
"logits/rejected": -2.2465267181396484,
"logps/chosen": -0.27823498845100403,
"logps/rejected": -0.27505937218666077,
"loss": 1.319,
"rewards/accuracies": 0.4895833730697632,
"rewards/chosen": -0.5564700365066528,
"rewards/margins": -0.006351290736347437,
"rewards/rejected": -0.5501187443733215,
"step": 294
},
{
"epoch": 0.9743694894402296,
"grad_norm": 3.342630386352539,
"learning_rate": 5.719489981785064e-07,
"logits/chosen": -2.301790714263916,
"logits/rejected": -2.232879638671875,
"logps/chosen": -0.28276318311691284,
"logps/rejected": -0.2836660146713257,
"loss": 1.3133,
"rewards/accuracies": 0.5729166865348816,
"rewards/chosen": -0.5655263662338257,
"rewards/margins": 0.0018056412227451801,
"rewards/rejected": -0.5673320293426514,
"step": 297
},
{
"epoch": 0.9842116054951815,
"grad_norm": 2.8772573471069336,
"learning_rate": 5.664845173041895e-07,
"logits/chosen": -2.7162556648254395,
"logits/rejected": -2.6399483680725098,
"logps/chosen": -0.276977002620697,
"logps/rejected": -0.27825504541397095,
"loss": 1.3131,
"rewards/accuracies": 0.4791666865348816,
"rewards/chosen": -0.553954005241394,
"rewards/margins": 0.00255610141903162,
"rewards/rejected": -0.5565100908279419,
"step": 300
},
{
"epoch": 0.9940537215501333,
"grad_norm": 2.8912346363067627,
"learning_rate": 5.610200364298725e-07,
"logits/chosen": -2.5115840435028076,
"logits/rejected": -2.4576642513275146,
"logps/chosen": -0.2784987688064575,
"logps/rejected": -0.2799062728881836,
"loss": 1.3124,
"rewards/accuracies": 0.5364583730697632,
"rewards/chosen": -0.556997537612915,
"rewards/margins": 0.00281504332087934,
"rewards/rejected": -0.5598125457763672,
"step": 303
},
{
"epoch": 1.0032807053516506,
"grad_norm": 3.153007745742798,
"learning_rate": 5.555555555555555e-07,
"logits/chosen": -2.0730063915252686,
"logits/rejected": -2.148167371749878,
"logps/chosen": -0.28567373752593994,
"logps/rejected": -0.30247652530670166,
"loss": 1.21,
"rewards/accuracies": 0.6166666746139526,
"rewards/chosen": -0.5713474750518799,
"rewards/margins": 0.03360557556152344,
"rewards/rejected": -0.6049530506134033,
"step": 306
},
{
"epoch": 1.0131228214066024,
"grad_norm": 3.5555901527404785,
"learning_rate": 5.500910746812386e-07,
"logits/chosen": -2.525221824645996,
"logits/rejected": -2.485888957977295,
"logps/chosen": -0.26666826009750366,
"logps/rejected": -0.29451531171798706,
"loss": 1.2742,
"rewards/accuracies": 0.6822916865348816,
"rewards/chosen": -0.5333365201950073,
"rewards/margins": 0.055694133043289185,
"rewards/rejected": -0.5890306234359741,
"step": 309
},
{
"epoch": 1.0229649374615541,
"grad_norm": 3.351257085800171,
"learning_rate": 5.446265938069217e-07,
"logits/chosen": -2.073094129562378,
"logits/rejected": -2.082315444946289,
"logps/chosen": -0.2812093496322632,
"logps/rejected": -0.30162906646728516,
"loss": 1.2852,
"rewards/accuracies": 0.6197916865348816,
"rewards/chosen": -0.5624186992645264,
"rewards/margins": 0.04083945229649544,
"rewards/rejected": -0.6032581329345703,
"step": 312
},
{
"epoch": 1.0328070535165061,
"grad_norm": 3.3620002269744873,
"learning_rate": 5.391621129326047e-07,
"logits/chosen": -2.327975034713745,
"logits/rejected": -2.2632720470428467,
"logps/chosen": -0.28332576155662537,
"logps/rejected": -0.30366650223731995,
"loss": 1.2851,
"rewards/accuracies": 0.6614583730697632,
"rewards/chosen": -0.5666515231132507,
"rewards/margins": 0.04068151116371155,
"rewards/rejected": -0.6073330044746399,
"step": 315
},
{
"epoch": 1.042649169571458,
"grad_norm": 3.8589353561401367,
"learning_rate": 5.336976320582878e-07,
"logits/chosen": -2.3901515007019043,
"logits/rejected": -2.3333704471588135,
"logps/chosen": -0.27824974060058594,
"logps/rejected": -0.3028562366962433,
"loss": 1.2793,
"rewards/accuracies": 0.6510416865348816,
"rewards/chosen": -0.5564994215965271,
"rewards/margins": 0.04921308159828186,
"rewards/rejected": -0.6057125329971313,
"step": 318
},
{
"epoch": 1.0524912856264097,
"grad_norm": 2.8481204509735107,
"learning_rate": 5.282331511839709e-07,
"logits/chosen": -2.478609323501587,
"logits/rejected": -2.4115850925445557,
"logps/chosen": -0.2729317545890808,
"logps/rejected": -0.2993568778038025,
"loss": 1.2768,
"rewards/accuracies": 0.6354166865348816,
"rewards/chosen": -0.5458635687828064,
"rewards/margins": 0.05285021662712097,
"rewards/rejected": -0.598713755607605,
"step": 321
},
{
"epoch": 1.0623334016813615,
"grad_norm": 3.818915367126465,
"learning_rate": 5.227686703096539e-07,
"logits/chosen": -2.4622111320495605,
"logits/rejected": -2.5216636657714844,
"logps/chosen": -0.27243781089782715,
"logps/rejected": -0.3012939691543579,
"loss": 1.2731,
"rewards/accuracies": 0.6770833730697632,
"rewards/chosen": -0.5448756217956543,
"rewards/margins": 0.05771230533719063,
"rewards/rejected": -0.6025879383087158,
"step": 324
},
{
"epoch": 1.0721755177363133,
"grad_norm": 4.67787504196167,
"learning_rate": 5.17304189435337e-07,
"logits/chosen": -2.3278470039367676,
"logits/rejected": -2.316049575805664,
"logps/chosen": -0.29923686385154724,
"logps/rejected": -0.3328157663345337,
"loss": 1.2673,
"rewards/accuracies": 0.6510417461395264,
"rewards/chosen": -0.5984737277030945,
"rewards/margins": 0.06715783476829529,
"rewards/rejected": -0.6656315326690674,
"step": 327
},
{
"epoch": 1.082017633791265,
"grad_norm": 4.611724853515625,
"learning_rate": 5.118397085610199e-07,
"logits/chosen": -2.601393461227417,
"logits/rejected": -2.50719952583313,
"logps/chosen": -0.2986273169517517,
"logps/rejected": -0.3138871192932129,
"loss": 1.2934,
"rewards/accuracies": 0.5520833730697632,
"rewards/chosen": -0.5972546339035034,
"rewards/margins": 0.030519628897309303,
"rewards/rejected": -0.6277742385864258,
"step": 330
},
{
"epoch": 1.091859749846217,
"grad_norm": 4.721919536590576,
"learning_rate": 5.06375227686703e-07,
"logits/chosen": -2.552699565887451,
"logits/rejected": -2.6258749961853027,
"logps/chosen": -0.28720822930336,
"logps/rejected": -0.3122625946998596,
"loss": 1.2794,
"rewards/accuracies": 0.6041666865348816,
"rewards/chosen": -0.5744165182113647,
"rewards/margins": 0.050108686089515686,
"rewards/rejected": -0.6245251893997192,
"step": 333
},
{
"epoch": 1.1017018659011688,
"grad_norm": 5.033515453338623,
"learning_rate": 5.009107468123861e-07,
"logits/chosen": -2.828622817993164,
"logits/rejected": -2.8359427452087402,
"logps/chosen": -0.29961028695106506,
"logps/rejected": -0.3207009732723236,
"loss": 1.2851,
"rewards/accuracies": 0.5885416865348816,
"rewards/chosen": -0.5992205739021301,
"rewards/margins": 0.04218133166432381,
"rewards/rejected": -0.641402006149292,
"step": 336
},
{
"epoch": 1.1115439819561206,
"grad_norm": 5.343788146972656,
"learning_rate": 4.954462659380693e-07,
"logits/chosen": -3.03733229637146,
"logits/rejected": -3.0399529933929443,
"logps/chosen": -0.30108141899108887,
"logps/rejected": -0.32776015996932983,
"loss": 1.2779,
"rewards/accuracies": 0.5885416865348816,
"rewards/chosen": -0.602162778377533,
"rewards/margins": 0.05335747450590134,
"rewards/rejected": -0.6555203199386597,
"step": 339
},
{
"epoch": 1.1213860980110724,
"grad_norm": 5.3723602294921875,
"learning_rate": 4.899817850637522e-07,
"logits/chosen": -2.981997013092041,
"logits/rejected": -2.949530839920044,
"logps/chosen": -0.3047109544277191,
"logps/rejected": -0.33689817786216736,
"loss": 1.2696,
"rewards/accuracies": 0.6197917461395264,
"rewards/chosen": -0.6094219088554382,
"rewards/margins": 0.06437446177005768,
"rewards/rejected": -0.6737963557243347,
"step": 342
},
{
"epoch": 1.1312282140660241,
"grad_norm": 4.727919101715088,
"learning_rate": 4.845173041894353e-07,
"logits/chosen": -2.695420980453491,
"logits/rejected": -2.6776719093322754,
"logps/chosen": -0.30032098293304443,
"logps/rejected": -0.3254402279853821,
"loss": 1.2794,
"rewards/accuracies": 0.5677083730697632,
"rewards/chosen": -0.6006419658660889,
"rewards/margins": 0.0502384789288044,
"rewards/rejected": -0.6508804559707642,
"step": 345
},
{
"epoch": 1.141070330120976,
"grad_norm": 5.740645885467529,
"learning_rate": 4.790528233151183e-07,
"logits/chosen": -2.89811372756958,
"logits/rejected": -2.8297605514526367,
"logps/chosen": -0.3110088109970093,
"logps/rejected": -0.3469686210155487,
"loss": 1.2643,
"rewards/accuracies": 0.6822916865348816,
"rewards/chosen": -0.6220176219940186,
"rewards/margins": 0.07191960513591766,
"rewards/rejected": -0.6939372420310974,
"step": 348
},
{
"epoch": 1.150912446175928,
"grad_norm": 5.677977561950684,
"learning_rate": 4.735883424408014e-07,
"logits/chosen": -2.919649362564087,
"logits/rejected": -2.7204208374023438,
"logps/chosen": -0.31897199153900146,
"logps/rejected": -0.3537505269050598,
"loss": 1.2661,
"rewards/accuracies": 0.6510416865348816,
"rewards/chosen": -0.6379439830780029,
"rewards/margins": 0.06955704838037491,
"rewards/rejected": -0.7075010538101196,
"step": 351
},
{
"epoch": 1.1607545622308797,
"grad_norm": 5.252042293548584,
"learning_rate": 4.681238615664845e-07,
"logits/chosen": -2.69753360748291,
"logits/rejected": -2.699644088745117,
"logps/chosen": -0.3222702145576477,
"logps/rejected": -0.3525330424308777,
"loss": 1.2736,
"rewards/accuracies": 0.6145833730697632,
"rewards/chosen": -0.6445404291152954,
"rewards/margins": 0.06052564084529877,
"rewards/rejected": -0.7050661444664001,
"step": 354
},
{
"epoch": 1.1705966782858315,
"grad_norm": 6.118551254272461,
"learning_rate": 4.6265938069216755e-07,
"logits/chosen": -2.956627368927002,
"logits/rejected": -2.8907008171081543,
"logps/chosen": -0.31340500712394714,
"logps/rejected": -0.3473677635192871,
"loss": 1.2679,
"rewards/accuracies": 0.609375,
"rewards/chosen": -0.6268100738525391,
"rewards/margins": 0.06792548298835754,
"rewards/rejected": -0.6947354674339294,
"step": 357
},
{
"epoch": 1.1804387943407832,
"grad_norm": 6.526240348815918,
"learning_rate": 4.5719489981785067e-07,
"logits/chosen": -2.988039493560791,
"logits/rejected": -2.947343349456787,
"logps/chosen": -0.3325228691101074,
"logps/rejected": -0.3738601505756378,
"loss": 1.2573,
"rewards/accuracies": 0.6354166865348816,
"rewards/chosen": -0.6650457382202148,
"rewards/margins": 0.08267448842525482,
"rewards/rejected": -0.7477203011512756,
"step": 360
},
{
"epoch": 1.190280910395735,
"grad_norm": 6.204545497894287,
"learning_rate": 4.517304189435337e-07,
"logits/chosen": -2.952932357788086,
"logits/rejected": -2.989872455596924,
"logps/chosen": -0.34380555152893066,
"logps/rejected": -0.37077945470809937,
"loss": 1.2777,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.6876111030578613,
"rewards/margins": 0.053947802633047104,
"rewards/rejected": -0.7415589094161987,
"step": 363
},
{
"epoch": 1.2001230264506868,
"grad_norm": 9.464421272277832,
"learning_rate": 4.4626593806921675e-07,
"logits/chosen": -3.19895601272583,
"logits/rejected": -3.2810332775115967,
"logps/chosen": -0.3429660499095917,
"logps/rejected": -0.37942421436309814,
"loss": 1.2644,
"rewards/accuracies": 0.6354166865348816,
"rewards/chosen": -0.6859321594238281,
"rewards/margins": 0.07291623204946518,
"rewards/rejected": -0.7588483095169067,
"step": 366
},
{
"epoch": 1.2099651425056388,
"grad_norm": 7.448987007141113,
"learning_rate": 4.408014571948998e-07,
"logits/chosen": -2.8745474815368652,
"logits/rejected": -2.9124350547790527,
"logps/chosen": -0.3625420033931732,
"logps/rejected": -0.40313583612442017,
"loss": 1.2592,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.7250840663909912,
"rewards/margins": 0.0811876431107521,
"rewards/rejected": -0.8062716722488403,
"step": 369
},
{
"epoch": 1.2198072585605906,
"grad_norm": 8.745219230651855,
"learning_rate": 4.353369763205829e-07,
"logits/chosen": -3.0614566802978516,
"logits/rejected": -3.113219738006592,
"logps/chosen": -0.3707306683063507,
"logps/rejected": -0.40356966853141785,
"loss": 1.2707,
"rewards/accuracies": 0.6458333730697632,
"rewards/chosen": -0.7414613962173462,
"rewards/margins": 0.06567797809839249,
"rewards/rejected": -0.8071393966674805,
"step": 372
},
{
"epoch": 1.2296493746155424,
"grad_norm": 6.997180461883545,
"learning_rate": 4.298724954462659e-07,
"logits/chosen": -2.9044456481933594,
"logits/rejected": -3.052482843399048,
"logps/chosen": -0.3646923005580902,
"logps/rejected": -0.39414650201797485,
"loss": 1.2752,
"rewards/accuracies": 0.6354166865348816,
"rewards/chosen": -0.7293846011161804,
"rewards/margins": 0.05890839919447899,
"rewards/rejected": -0.7882929444313049,
"step": 375
},
{
"epoch": 1.2394914906704941,
"grad_norm": 10.55185317993164,
"learning_rate": 4.2440801457194896e-07,
"logits/chosen": -3.2242932319641113,
"logits/rejected": -3.0834662914276123,
"logps/chosen": -0.4069775640964508,
"logps/rejected": -0.463001012802124,
"loss": 1.2405,
"rewards/accuracies": 0.6354166865348816,
"rewards/chosen": -0.8139551281929016,
"rewards/margins": 0.11204691231250763,
"rewards/rejected": -0.9260020852088928,
"step": 378
},
{
"epoch": 1.249333606725446,
"grad_norm": 20.299375534057617,
"learning_rate": 4.1894353369763203e-07,
"logits/chosen": -3.1378321647644043,
"logits/rejected": -3.108181953430176,
"logps/chosen": -0.40514567494392395,
"logps/rejected": -0.44149208068847656,
"loss": 1.2673,
"rewards/accuracies": 0.6458333730697632,
"rewards/chosen": -0.8102913498878479,
"rewards/margins": 0.07269280403852463,
"rewards/rejected": -0.8829841613769531,
"step": 381
},
{
"epoch": 1.2591757227803977,
"grad_norm": 8.039090156555176,
"learning_rate": 4.134790528233151e-07,
"logits/chosen": -2.9066882133483887,
"logits/rejected": -2.9090116024017334,
"logps/chosen": -0.4023885130882263,
"logps/rejected": -0.44079360365867615,
"loss": 1.2646,
"rewards/accuracies": 0.5989583730697632,
"rewards/chosen": -0.8047770261764526,
"rewards/margins": 0.07681018859148026,
"rewards/rejected": -0.8815872073173523,
"step": 384
},
{
"epoch": 1.2690178388353495,
"grad_norm": 9.490564346313477,
"learning_rate": 4.0801457194899816e-07,
"logits/chosen": -3.021369457244873,
"logits/rejected": -2.905494213104248,
"logps/chosen": -0.409457266330719,
"logps/rejected": -0.4580920338630676,
"loss": 1.2519,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.818914532661438,
"rewards/margins": 0.09726953506469727,
"rewards/rejected": -0.9161840677261353,
"step": 387
},
{
"epoch": 1.2788599548903015,
"grad_norm": 9.488383293151855,
"learning_rate": 4.0255009107468123e-07,
"logits/chosen": -3.3218090534210205,
"logits/rejected": -3.2660317420959473,
"logps/chosen": -0.41564124822616577,
"logps/rejected": -0.43702036142349243,
"loss": 1.2879,
"rewards/accuracies": 0.6145833730697632,
"rewards/chosen": -0.8312824368476868,
"rewards/margins": 0.04275830462574959,
"rewards/rejected": -0.8740407228469849,
"step": 390
},
{
"epoch": 1.2887020709452532,
"grad_norm": 9.484984397888184,
"learning_rate": 3.970856102003643e-07,
"logits/chosen": -3.089040756225586,
"logits/rejected": -3.155805826187134,
"logps/chosen": -0.4346025586128235,
"logps/rejected": -0.4728645384311676,
"loss": 1.2674,
"rewards/accuracies": 0.5729166865348816,
"rewards/chosen": -0.869205117225647,
"rewards/margins": 0.07652393728494644,
"rewards/rejected": -0.9457290768623352,
"step": 393
},
{
"epoch": 1.298544187000205,
"grad_norm": 10.901666641235352,
"learning_rate": 3.9162112932604736e-07,
"logits/chosen": -2.9061741828918457,
"logits/rejected": -2.9783828258514404,
"logps/chosen": -0.4548156261444092,
"logps/rejected": -0.4865460991859436,
"loss": 1.2736,
"rewards/accuracies": 0.5885416865348816,
"rewards/chosen": -0.9096312522888184,
"rewards/margins": 0.06346089392900467,
"rewards/rejected": -0.9730921387672424,
"step": 396
},
{
"epoch": 1.3083863030551568,
"grad_norm": 10.082768440246582,
"learning_rate": 3.8615664845173043e-07,
"logits/chosen": -3.2704148292541504,
"logits/rejected": -3.3333544731140137,
"logps/chosen": -0.4401400685310364,
"logps/rejected": -0.47681349515914917,
"loss": 1.2666,
"rewards/accuracies": 0.6302083730697632,
"rewards/chosen": -0.8802801370620728,
"rewards/margins": 0.07334680110216141,
"rewards/rejected": -0.9536269903182983,
"step": 399
},
{
"epoch": 1.3182284191101088,
"grad_norm": 8.94642448425293,
"learning_rate": 3.8069216757741344e-07,
"logits/chosen": -3.1391327381134033,
"logits/rejected": -3.2130041122436523,
"logps/chosen": -0.405760794878006,
"logps/rejected": -0.4274853765964508,
"loss": 1.289,
"rewards/accuracies": 0.5208333730697632,
"rewards/chosen": -0.8115215301513672,
"rewards/margins": 0.043449223041534424,
"rewards/rejected": -0.8549707531929016,
"step": 402
},
{
"epoch": 1.3280705351650606,
"grad_norm": 8.592528343200684,
"learning_rate": 3.752276867030965e-07,
"logits/chosen": -2.9488821029663086,
"logits/rejected": -2.944669485092163,
"logps/chosen": -0.40197545289993286,
"logps/rejected": -0.4468863308429718,
"loss": 1.2564,
"rewards/accuracies": 0.6458333730697632,
"rewards/chosen": -0.8039509654045105,
"rewards/margins": 0.08982174843549728,
"rewards/rejected": -0.8937727212905884,
"step": 405
},
{
"epoch": 1.3379126512200123,
"grad_norm": 8.592171669006348,
"learning_rate": 3.697632058287796e-07,
"logits/chosen": -3.245591878890991,
"logits/rejected": -3.060716390609741,
"logps/chosen": -0.39691171050071716,
"logps/rejected": -0.45668381452560425,
"loss": 1.2334,
"rewards/accuracies": 0.6927083730697632,
"rewards/chosen": -0.7938233613967896,
"rewards/margins": 0.11954419314861298,
"rewards/rejected": -0.9133676290512085,
"step": 408
},
{
"epoch": 1.3477547672749641,
"grad_norm": 8.549724578857422,
"learning_rate": 3.6429872495446264e-07,
"logits/chosen": -3.181145191192627,
"logits/rejected": -3.142042875289917,
"logps/chosen": -0.4377954304218292,
"logps/rejected": -0.48063966631889343,
"loss": 1.2598,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.8755908012390137,
"rewards/margins": 0.085688516497612,
"rewards/rejected": -0.9612793326377869,
"step": 411
},
{
"epoch": 1.357596883329916,
"grad_norm": 9.25687026977539,
"learning_rate": 3.5883424408014566e-07,
"logits/chosen": -2.8549139499664307,
"logits/rejected": -2.8353519439697266,
"logps/chosen": -0.43043509125709534,
"logps/rejected": -0.47761547565460205,
"loss": 1.252,
"rewards/accuracies": 0.6041666865348816,
"rewards/chosen": -0.8608702421188354,
"rewards/margins": 0.09436076134443283,
"rewards/rejected": -0.9552309513092041,
"step": 414
},
{
"epoch": 1.3674389993848677,
"grad_norm": 10.600887298583984,
"learning_rate": 3.533697632058288e-07,
"logits/chosen": -3.306717872619629,
"logits/rejected": -3.220644235610962,
"logps/chosen": -0.44229361414909363,
"logps/rejected": -0.4992312490940094,
"loss": 1.2405,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.884587287902832,
"rewards/margins": 0.11387524008750916,
"rewards/rejected": -0.9984624981880188,
"step": 417
},
{
"epoch": 1.3772811154398195,
"grad_norm": 9.458456993103027,
"learning_rate": 3.4790528233151184e-07,
"logits/chosen": -3.274623394012451,
"logits/rejected": -3.216095447540283,
"logps/chosen": -0.42786625027656555,
"logps/rejected": -0.47491148114204407,
"loss": 1.2528,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.8557324409484863,
"rewards/margins": 0.0940905213356018,
"rewards/rejected": -0.9498229622840881,
"step": 420
},
{
"epoch": 1.3871232314947715,
"grad_norm": 10.303235054016113,
"learning_rate": 3.424408014571949e-07,
"logits/chosen": -3.2499632835388184,
"logits/rejected": -3.265618324279785,
"logps/chosen": -0.44693538546562195,
"logps/rejected": -0.49852439761161804,
"loss": 1.2465,
"rewards/accuracies": 0.6458333730697632,
"rewards/chosen": -0.8938708305358887,
"rewards/margins": 0.103178009390831,
"rewards/rejected": -0.9970487952232361,
"step": 423
},
{
"epoch": 1.3969653475497232,
"grad_norm": 11.34437370300293,
"learning_rate": 3.369763205828779e-07,
"logits/chosen": -3.083287239074707,
"logits/rejected": -2.965116500854492,
"logps/chosen": -0.4754757583141327,
"logps/rejected": -0.5378195643424988,
"loss": 1.234,
"rewards/accuracies": 0.5885417461395264,
"rewards/chosen": -0.9509515166282654,
"rewards/margins": 0.12468753010034561,
"rewards/rejected": -1.0756391286849976,
"step": 426
},
{
"epoch": 1.406807463604675,
"grad_norm": 10.716517448425293,
"learning_rate": 3.31511839708561e-07,
"logits/chosen": -2.8256187438964844,
"logits/rejected": -2.988969326019287,
"logps/chosen": -0.46769338846206665,
"logps/rejected": -0.509919285774231,
"loss": 1.2604,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.9353867769241333,
"rewards/margins": 0.0844518169760704,
"rewards/rejected": -1.019838571548462,
"step": 429
},
{
"epoch": 1.4166495796596268,
"grad_norm": 11.292407989501953,
"learning_rate": 3.2604735883424406e-07,
"logits/chosen": -3.2503647804260254,
"logits/rejected": -3.2309374809265137,
"logps/chosen": -0.47588497400283813,
"logps/rejected": -0.5366601943969727,
"loss": 1.2343,
"rewards/accuracies": 0.6614583730697632,
"rewards/chosen": -0.9517699480056763,
"rewards/margins": 0.12155050784349442,
"rewards/rejected": -1.0733203887939453,
"step": 432
},
{
"epoch": 1.4264916957145786,
"grad_norm": 10.401873588562012,
"learning_rate": 3.205828779599271e-07,
"logits/chosen": -2.9665112495422363,
"logits/rejected": -3.125314950942993,
"logps/chosen": -0.46969038248062134,
"logps/rejected": -0.5004762411117554,
"loss": 1.2755,
"rewards/accuracies": 0.578125,
"rewards/chosen": -0.9393807649612427,
"rewards/margins": 0.0615716427564621,
"rewards/rejected": -1.0009524822235107,
"step": 435
},
{
"epoch": 1.4363338117695306,
"grad_norm": 11.833475112915039,
"learning_rate": 3.151183970856102e-07,
"logits/chosen": -3.078540325164795,
"logits/rejected": -3.116511821746826,
"logps/chosen": -0.48638832569122314,
"logps/rejected": -0.5369706153869629,
"loss": 1.2495,
"rewards/accuracies": 0.6145833730697632,
"rewards/chosen": -0.9727766513824463,
"rewards/margins": 0.10116463154554367,
"rewards/rejected": -1.0739412307739258,
"step": 438
},
{
"epoch": 1.4461759278244823,
"grad_norm": 9.525228500366211,
"learning_rate": 3.096539162112932e-07,
"logits/chosen": -3.2219226360321045,
"logits/rejected": -3.1760897636413574,
"logps/chosen": -0.4626237154006958,
"logps/rejected": -0.5069611072540283,
"loss": 1.2574,
"rewards/accuracies": 0.609375,
"rewards/chosen": -0.9252474308013916,
"rewards/margins": 0.08867473900318146,
"rewards/rejected": -1.0139222145080566,
"step": 441
},
{
"epoch": 1.4560180438794341,
"grad_norm": 10.4345121383667,
"learning_rate": 3.041894353369763e-07,
"logits/chosen": -3.1432931423187256,
"logits/rejected": -3.2274186611175537,
"logps/chosen": -0.4711434245109558,
"logps/rejected": -0.5244415998458862,
"loss": 1.2465,
"rewards/accuracies": 0.5833333730697632,
"rewards/chosen": -0.9422869086265564,
"rewards/margins": 0.10659627616405487,
"rewards/rejected": -1.0488831996917725,
"step": 444
},
{
"epoch": 1.465860159934386,
"grad_norm": 11.221490859985352,
"learning_rate": 2.987249544626594e-07,
"logits/chosen": -3.4779763221740723,
"logits/rejected": -3.4805781841278076,
"logps/chosen": -0.44780024886131287,
"logps/rejected": -0.4945845901966095,
"loss": 1.2547,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.8956004977226257,
"rewards/margins": 0.0935685932636261,
"rewards/rejected": -0.9891691207885742,
"step": 447
},
{
"epoch": 1.4757022759893377,
"grad_norm": 11.709156036376953,
"learning_rate": 2.9326047358834246e-07,
"logits/chosen": -3.1414666175842285,
"logits/rejected": -3.1980159282684326,
"logps/chosen": -0.47789841890335083,
"logps/rejected": -0.5440502166748047,
"loss": 1.2274,
"rewards/accuracies": 0.6770833730697632,
"rewards/chosen": -0.9557968378067017,
"rewards/margins": 0.1323036104440689,
"rewards/rejected": -1.0881004333496094,
"step": 450
},
{
"epoch": 1.4855443920442895,
"grad_norm": 10.401344299316406,
"learning_rate": 2.8779599271402547e-07,
"logits/chosen": -3.1782946586608887,
"logits/rejected": -3.184023857116699,
"logps/chosen": -0.46204572916030884,
"logps/rejected": -0.4961879849433899,
"loss": 1.2697,
"rewards/accuracies": 0.6145833730697632,
"rewards/chosen": -0.9240914583206177,
"rewards/margins": 0.06828457117080688,
"rewards/rejected": -0.9923759698867798,
"step": 453
},
{
"epoch": 1.4953865080992412,
"grad_norm": 11.5233793258667,
"learning_rate": 2.8233151183970854e-07,
"logits/chosen": -3.2461302280426025,
"logits/rejected": -3.2496047019958496,
"logps/chosen": -0.4670412540435791,
"logps/rejected": -0.5199131965637207,
"loss": 1.2448,
"rewards/accuracies": 0.6354166865348816,
"rewards/chosen": -0.934082567691803,
"rewards/margins": 0.1057438999414444,
"rewards/rejected": -1.0398263931274414,
"step": 456
},
{
"epoch": 1.505228624154193,
"grad_norm": 12.667001724243164,
"learning_rate": 2.768670309653916e-07,
"logits/chosen": -3.2025842666625977,
"logits/rejected": -3.323841094970703,
"logps/chosen": -0.49344322085380554,
"logps/rejected": -0.5365071296691895,
"loss": 1.2605,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.9868864417076111,
"rewards/margins": 0.0861278772354126,
"rewards/rejected": -1.073014259338379,
"step": 459
},
{
"epoch": 1.515070740209145,
"grad_norm": 10.09226131439209,
"learning_rate": 2.7140255009107467e-07,
"logits/chosen": -3.169337749481201,
"logits/rejected": -3.1390461921691895,
"logps/chosen": -0.4897007346153259,
"logps/rejected": -0.5485378503799438,
"loss": 1.2382,
"rewards/accuracies": 0.6458333730697632,
"rewards/chosen": -0.9794015288352966,
"rewards/margins": 0.11767425388097763,
"rewards/rejected": -1.0970757007598877,
"step": 462
},
{
"epoch": 1.5249128562640968,
"grad_norm": 11.221296310424805,
"learning_rate": 2.659380692167577e-07,
"logits/chosen": -3.3969221115112305,
"logits/rejected": -3.386396884918213,
"logps/chosen": -0.47860053181648254,
"logps/rejected": -0.5197103023529053,
"loss": 1.2622,
"rewards/accuracies": 0.5885417461395264,
"rewards/chosen": -0.9572010636329651,
"rewards/margins": 0.08221958577632904,
"rewards/rejected": -1.0394206047058105,
"step": 465
},
{
"epoch": 1.5347549723190486,
"grad_norm": 11.274225234985352,
"learning_rate": 2.6047358834244075e-07,
"logits/chosen": -2.8993306159973145,
"logits/rejected": -2.910104513168335,
"logps/chosen": -0.49471789598464966,
"logps/rejected": -0.5440719723701477,
"loss": 1.2494,
"rewards/accuracies": 0.6197916865348816,
"rewards/chosen": -0.9894358515739441,
"rewards/margins": 0.0987081229686737,
"rewards/rejected": -1.0881439447402954,
"step": 468
},
{
"epoch": 1.5445970883740006,
"grad_norm": 11.793699264526367,
"learning_rate": 2.5500910746812387e-07,
"logits/chosen": -3.160764694213867,
"logits/rejected": -3.1699819564819336,
"logps/chosen": -0.5097307562828064,
"logps/rejected": -0.556024432182312,
"loss": 1.2529,
"rewards/accuracies": 0.6510416865348816,
"rewards/chosen": -1.0194615125656128,
"rewards/margins": 0.09258747845888138,
"rewards/rejected": -1.112048864364624,
"step": 471
},
{
"epoch": 1.5544392044289523,
"grad_norm": 10.38713264465332,
"learning_rate": 2.495446265938069e-07,
"logits/chosen": -3.261507749557495,
"logits/rejected": -3.2834768295288086,
"logps/chosen": -0.4616134762763977,
"logps/rejected": -0.5000269412994385,
"loss": 1.2651,
"rewards/accuracies": 0.6041667461395264,
"rewards/chosen": -0.9232269525527954,
"rewards/margins": 0.07682683318853378,
"rewards/rejected": -1.000053882598877,
"step": 474
},
{
"epoch": 1.5642813204839041,
"grad_norm": 11.302956581115723,
"learning_rate": 2.4408014571949e-07,
"logits/chosen": -3.1922333240509033,
"logits/rejected": -3.187087059020996,
"logps/chosen": -0.5025749206542969,
"logps/rejected": -0.5681424736976624,
"loss": 1.2272,
"rewards/accuracies": 0.7135417461395264,
"rewards/chosen": -1.0051498413085938,
"rewards/margins": 0.1311352252960205,
"rewards/rejected": -1.1362850666046143,
"step": 477
},
{
"epoch": 1.574123436538856,
"grad_norm": 11.857656478881836,
"learning_rate": 2.38615664845173e-07,
"logits/chosen": -3.1751718521118164,
"logits/rejected": -3.2345504760742188,
"logps/chosen": -0.4723814129829407,
"logps/rejected": -0.5178726315498352,
"loss": 1.2564,
"rewards/accuracies": 0.5833333730697632,
"rewards/chosen": -0.9447628259658813,
"rewards/margins": 0.09098244458436966,
"rewards/rejected": -1.0357452630996704,
"step": 480
},
{
"epoch": 1.5839655525938077,
"grad_norm": 11.975550651550293,
"learning_rate": 2.3315118397085608e-07,
"logits/chosen": -3.2860124111175537,
"logits/rejected": -3.2100272178649902,
"logps/chosen": -0.5020599961280823,
"logps/rejected": -0.5415343046188354,
"loss": 1.2634,
"rewards/accuracies": 0.609375,
"rewards/chosen": -1.0041199922561646,
"rewards/margins": 0.07894869148731232,
"rewards/rejected": -1.083068609237671,
"step": 483
},
{
"epoch": 1.5938076686487594,
"grad_norm": 10.477987289428711,
"learning_rate": 2.2768670309653915e-07,
"logits/chosen": -3.417156934738159,
"logits/rejected": -3.4442646503448486,
"logps/chosen": -0.4748501181602478,
"logps/rejected": -0.5157420039176941,
"loss": 1.2617,
"rewards/accuracies": 0.5572916865348816,
"rewards/chosen": -0.9497002959251404,
"rewards/margins": 0.08178383111953735,
"rewards/rejected": -1.0314841270446777,
"step": 486
},
{
"epoch": 1.6036497847037112,
"grad_norm": 10.905505180358887,
"learning_rate": 2.222222222222222e-07,
"logits/chosen": -3.232025384902954,
"logits/rejected": -3.3174943923950195,
"logps/chosen": -0.48250630497932434,
"logps/rejected": -0.5197854042053223,
"loss": 1.2651,
"rewards/accuracies": 0.6041666865348816,
"rewards/chosen": -0.9650125503540039,
"rewards/margins": 0.07455817610025406,
"rewards/rejected": -1.039570689201355,
"step": 489
},
{
"epoch": 1.613491900758663,
"grad_norm": 9.681890487670898,
"learning_rate": 2.1675774134790528e-07,
"logits/chosen": -3.3105580806732178,
"logits/rejected": -3.2920236587524414,
"logps/chosen": -0.480990469455719,
"logps/rejected": -0.5195989608764648,
"loss": 1.2647,
"rewards/accuracies": 0.5989583730697632,
"rewards/chosen": -0.961980938911438,
"rewards/margins": 0.07721701264381409,
"rewards/rejected": -1.0391979217529297,
"step": 492
},
{
"epoch": 1.6233340168136148,
"grad_norm": 11.535338401794434,
"learning_rate": 2.1129326047358833e-07,
"logits/chosen": -3.3885207176208496,
"logits/rejected": -3.4516897201538086,
"logps/chosen": -0.4919550120830536,
"logps/rejected": -0.537667989730835,
"loss": 1.256,
"rewards/accuracies": 0.5520833730697632,
"rewards/chosen": -0.9839099645614624,
"rewards/margins": 0.09142596274614334,
"rewards/rejected": -1.07533597946167,
"step": 495
},
{
"epoch": 1.6331761328685668,
"grad_norm": 11.257711410522461,
"learning_rate": 2.058287795992714e-07,
"logits/chosen": -3.418330669403076,
"logits/rejected": -3.4276938438415527,
"logps/chosen": -0.4669812321662903,
"logps/rejected": -0.5167567729949951,
"loss": 1.251,
"rewards/accuracies": 0.609375,
"rewards/chosen": -0.9339624643325806,
"rewards/margins": 0.09955108910799026,
"rewards/rejected": -1.0335136651992798,
"step": 498
},
{
"epoch": 1.6430182489235186,
"grad_norm": 31.353042602539062,
"learning_rate": 2.0036429872495443e-07,
"logits/chosen": -3.2747483253479004,
"logits/rejected": -3.2000725269317627,
"logps/chosen": -0.49033480882644653,
"logps/rejected": -0.5444329380989075,
"loss": 1.2428,
"rewards/accuracies": 0.6406250596046448,
"rewards/chosen": -0.9806696176528931,
"rewards/margins": 0.10819630324840546,
"rewards/rejected": -1.0888659954071045,
"step": 501
},
{
"epoch": 1.6528603649784703,
"grad_norm": 12.651705741882324,
"learning_rate": 1.9489981785063753e-07,
"logits/chosen": -2.9816644191741943,
"logits/rejected": -3.024106979370117,
"logps/chosen": -0.5482050180435181,
"logps/rejected": -0.5978180170059204,
"loss": 1.251,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.0964101552963257,
"rewards/margins": 0.09922590851783752,
"rewards/rejected": -1.1956360340118408,
"step": 504
},
{
"epoch": 1.6627024810334223,
"grad_norm": 11.137748718261719,
"learning_rate": 1.894353369763206e-07,
"logits/chosen": -3.3126227855682373,
"logits/rejected": -3.354325532913208,
"logps/chosen": -0.4819042980670929,
"logps/rejected": -0.5410703420639038,
"loss": 1.2365,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.9638086557388306,
"rewards/margins": 0.11833213269710541,
"rewards/rejected": -1.0821408033370972,
"step": 507
},
{
"epoch": 1.672544597088374,
"grad_norm": 11.545525550842285,
"learning_rate": 1.8397085610200363e-07,
"logits/chosen": -3.5107603073120117,
"logits/rejected": -3.5072875022888184,
"logps/chosen": -0.515521764755249,
"logps/rejected": -0.5749427080154419,
"loss": 1.2368,
"rewards/accuracies": 0.6822916865348816,
"rewards/chosen": -1.0310436487197876,
"rewards/margins": 0.11884190142154694,
"rewards/rejected": -1.1498854160308838,
"step": 510
},
{
"epoch": 1.6823867131433259,
"grad_norm": 10.51858139038086,
"learning_rate": 1.785063752276867e-07,
"logits/chosen": -3.210937023162842,
"logits/rejected": -3.261808395385742,
"logps/chosen": -0.5181245803833008,
"logps/rejected": -0.5811792612075806,
"loss": 1.2335,
"rewards/accuracies": 0.6354166865348816,
"rewards/chosen": -1.0362491607666016,
"rewards/margins": 0.12610934674739838,
"rewards/rejected": -1.1623585224151611,
"step": 513
},
{
"epoch": 1.6922288291982777,
"grad_norm": 12.993363380432129,
"learning_rate": 1.7304189435336974e-07,
"logits/chosen": -2.966552972793579,
"logits/rejected": -2.980912685394287,
"logps/chosen": -0.568572461605072,
"logps/rejected": -0.6211986541748047,
"loss": 1.2491,
"rewards/accuracies": 0.6302083730697632,
"rewards/chosen": -1.137144923210144,
"rewards/margins": 0.10525240004062653,
"rewards/rejected": -1.2423973083496094,
"step": 516
},
{
"epoch": 1.7020709452532294,
"grad_norm": 10.730545043945312,
"learning_rate": 1.6757741347905283e-07,
"logits/chosen": -3.201765298843384,
"logits/rejected": -3.2614786624908447,
"logps/chosen": -0.49252548813819885,
"logps/rejected": -0.5455992221832275,
"loss": 1.2456,
"rewards/accuracies": 0.6302083730697632,
"rewards/chosen": -0.9850510358810425,
"rewards/margins": 0.10614749789237976,
"rewards/rejected": -1.091198444366455,
"step": 519
},
{
"epoch": 1.7119130613081812,
"grad_norm": 12.557461738586426,
"learning_rate": 1.6211293260473587e-07,
"logits/chosen": -3.0382239818573,
"logits/rejected": -3.0993409156799316,
"logps/chosen": -0.5261319875717163,
"logps/rejected": -0.5957756042480469,
"loss": 1.2264,
"rewards/accuracies": 0.6614583730697632,
"rewards/chosen": -1.0522639751434326,
"rewards/margins": 0.13928718864917755,
"rewards/rejected": -1.1915510892868042,
"step": 522
},
{
"epoch": 1.721755177363133,
"grad_norm": 12.95654296875,
"learning_rate": 1.5664845173041894e-07,
"logits/chosen": -3.4481818675994873,
"logits/rejected": -3.484841823577881,
"logps/chosen": -0.5029021501541138,
"logps/rejected": -0.5526931285858154,
"loss": 1.2491,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.0058043003082275,
"rewards/margins": 0.09958191215991974,
"rewards/rejected": -1.1053862571716309,
"step": 525
},
{
"epoch": 1.7315972934180848,
"grad_norm": 13.081609725952148,
"learning_rate": 1.5118397085610198e-07,
"logits/chosen": -3.1820714473724365,
"logits/rejected": -3.1990368366241455,
"logps/chosen": -0.5167220830917358,
"logps/rejected": -0.5637269020080566,
"loss": 1.2527,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.0334441661834717,
"rewards/margins": 0.094009630382061,
"rewards/rejected": -1.1274538040161133,
"step": 528
},
{
"epoch": 1.7414394094730365,
"grad_norm": 11.695033073425293,
"learning_rate": 1.4571948998178507e-07,
"logits/chosen": -3.054007053375244,
"logits/rejected": -3.153001308441162,
"logps/chosen": -0.5482903718948364,
"logps/rejected": -0.6053259372711182,
"loss": 1.2394,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.0965806245803833,
"rewards/margins": 0.11407110095024109,
"rewards/rejected": -1.2106518745422363,
"step": 531
},
{
"epoch": 1.7512815255279885,
"grad_norm": 12.978899955749512,
"learning_rate": 1.402550091074681e-07,
"logits/chosen": -3.115644693374634,
"logits/rejected": -3.0674333572387695,
"logps/chosen": -0.5462538003921509,
"logps/rejected": -0.600370466709137,
"loss": 1.2462,
"rewards/accuracies": 0.5833333730697632,
"rewards/chosen": -1.0925076007843018,
"rewards/margins": 0.10823334753513336,
"rewards/rejected": -1.2007410526275635,
"step": 534
},
{
"epoch": 1.7611236415829403,
"grad_norm": 11.404801368713379,
"learning_rate": 1.3479052823315118e-07,
"logits/chosen": -3.090611219406128,
"logits/rejected": -3.2334389686584473,
"logps/chosen": -0.5118279457092285,
"logps/rejected": -0.5650106072425842,
"loss": 1.2462,
"rewards/accuracies": 0.6041666865348816,
"rewards/chosen": -1.023655891418457,
"rewards/margins": 0.10636530816555023,
"rewards/rejected": -1.1300212144851685,
"step": 537
},
{
"epoch": 1.770965757637892,
"grad_norm": 12.541303634643555,
"learning_rate": 1.2932604735883425e-07,
"logits/chosen": -3.310518264770508,
"logits/rejected": -3.390636444091797,
"logps/chosen": -0.525303840637207,
"logps/rejected": -0.5825842618942261,
"loss": 1.2412,
"rewards/accuracies": 0.6197916865348816,
"rewards/chosen": -1.050607681274414,
"rewards/margins": 0.11456100642681122,
"rewards/rejected": -1.1651686429977417,
"step": 540
},
{
"epoch": 1.780807873692844,
"grad_norm": 14.414772987365723,
"learning_rate": 1.238615664845173e-07,
"logits/chosen": -2.993222713470459,
"logits/rejected": -3.050335168838501,
"logps/chosen": -0.5568434000015259,
"logps/rejected": -0.5955438613891602,
"loss": 1.2667,
"rewards/accuracies": 0.5625000596046448,
"rewards/chosen": -1.1136868000030518,
"rewards/margins": 0.07740084081888199,
"rewards/rejected": -1.1910877227783203,
"step": 543
},
{
"epoch": 1.7906499897477959,
"grad_norm": 11.817216873168945,
"learning_rate": 1.1839708561020035e-07,
"logits/chosen": -3.2628841400146484,
"logits/rejected": -3.3687920570373535,
"logps/chosen": -0.5100204944610596,
"logps/rejected": -0.5642228722572327,
"loss": 1.2437,
"rewards/accuracies": 0.6614583730697632,
"rewards/chosen": -1.0200409889221191,
"rewards/margins": 0.10840488970279694,
"rewards/rejected": -1.1284458637237549,
"step": 546
},
{
"epoch": 1.8004921058027477,
"grad_norm": 12.728991508483887,
"learning_rate": 1.1293260473588342e-07,
"logits/chosen": -3.1474390029907227,
"logits/rejected": -3.1595215797424316,
"logps/chosen": -0.5479636788368225,
"logps/rejected": -0.6197019815444946,
"loss": 1.2228,
"rewards/accuracies": 0.6354166865348816,
"rewards/chosen": -1.095927357673645,
"rewards/margins": 0.14347663521766663,
"rewards/rejected": -1.2394039630889893,
"step": 549
},
{
"epoch": 1.8103342218576994,
"grad_norm": 12.561689376831055,
"learning_rate": 1.0746812386156647e-07,
"logits/chosen": -2.919260025024414,
"logits/rejected": -3.0359532833099365,
"logps/chosen": -0.5316007137298584,
"logps/rejected": -0.578840970993042,
"loss": 1.2546,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.0632013082504272,
"rewards/margins": 0.09448058158159256,
"rewards/rejected": -1.157681941986084,
"step": 552
},
{
"epoch": 1.8201763379126512,
"grad_norm": 13.989924430847168,
"learning_rate": 1.0200364298724954e-07,
"logits/chosen": -3.202847957611084,
"logits/rejected": -3.3060855865478516,
"logps/chosen": -0.5514649748802185,
"logps/rejected": -0.6039638519287109,
"loss": 1.2515,
"rewards/accuracies": 0.5833333730697632,
"rewards/chosen": -1.102929949760437,
"rewards/margins": 0.10499779880046844,
"rewards/rejected": -1.2079277038574219,
"step": 555
},
{
"epoch": 1.830018453967603,
"grad_norm": 13.695045471191406,
"learning_rate": 9.653916211293261e-08,
"logits/chosen": -2.9378960132598877,
"logits/rejected": -2.944990634918213,
"logps/chosen": -0.5494365692138672,
"logps/rejected": -0.6013669371604919,
"loss": 1.2493,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.0988731384277344,
"rewards/margins": 0.10386063158512115,
"rewards/rejected": -1.2027339935302734,
"step": 558
},
{
"epoch": 1.8398605700225548,
"grad_norm": 14.38819694519043,
"learning_rate": 9.107468123861566e-08,
"logits/chosen": -2.811845302581787,
"logits/rejected": -2.8457605838775635,
"logps/chosen": -0.5353372097015381,
"logps/rejected": -0.5632250905036926,
"loss": 1.2837,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.0706744194030762,
"rewards/margins": 0.055775657296180725,
"rewards/rejected": -1.1264501810073853,
"step": 561
},
{
"epoch": 1.8497026860775065,
"grad_norm": 11.79210376739502,
"learning_rate": 8.561020036429873e-08,
"logits/chosen": -3.3166909217834473,
"logits/rejected": -3.299351215362549,
"logps/chosen": -0.5189695358276367,
"logps/rejected": -0.572175145149231,
"loss": 1.2461,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.0379390716552734,
"rewards/margins": 0.10641118884086609,
"rewards/rejected": -1.1443501710891724,
"step": 564
},
{
"epoch": 1.8595448021324583,
"grad_norm": 12.88967227935791,
"learning_rate": 8.014571948998178e-08,
"logits/chosen": -3.1258530616760254,
"logits/rejected": -3.1050872802734375,
"logps/chosen": -0.5324106216430664,
"logps/rejected": -0.5854768753051758,
"loss": 1.2475,
"rewards/accuracies": 0.6354166865348816,
"rewards/chosen": -1.0648212432861328,
"rewards/margins": 0.1061326190829277,
"rewards/rejected": -1.1709537506103516,
"step": 567
},
{
"epoch": 1.8693869181874103,
"grad_norm": 13.419716835021973,
"learning_rate": 7.468123861566485e-08,
"logits/chosen": -2.8158140182495117,
"logits/rejected": -2.991899013519287,
"logps/chosen": -0.5541937947273254,
"logps/rejected": -0.5925447344779968,
"loss": 1.2686,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.1083875894546509,
"rewards/margins": 0.07670189440250397,
"rewards/rejected": -1.1850894689559937,
"step": 570
},
{
"epoch": 1.879229034242362,
"grad_norm": 13.015626907348633,
"learning_rate": 6.92167577413479e-08,
"logits/chosen": -3.060314655303955,
"logits/rejected": -3.1021711826324463,
"logps/chosen": -0.5426675081253052,
"logps/rejected": -0.5891798734664917,
"loss": 1.2529,
"rewards/accuracies": 0.6302083730697632,
"rewards/chosen": -1.0853350162506104,
"rewards/margins": 0.09302478283643723,
"rewards/rejected": -1.1783597469329834,
"step": 573
},
{
"epoch": 1.8890711502973139,
"grad_norm": 12.878572463989258,
"learning_rate": 6.375227686703097e-08,
"logits/chosen": -3.18039870262146,
"logits/rejected": -3.163947343826294,
"logps/chosen": -0.5458322167396545,
"logps/rejected": -0.5959421396255493,
"loss": 1.252,
"rewards/accuracies": 0.640625,
"rewards/chosen": -1.0916643142700195,
"rewards/margins": 0.1002199798822403,
"rewards/rejected": -1.1918842792510986,
"step": 576
},
{
"epoch": 1.8989132663522659,
"grad_norm": 12.223690032958984,
"learning_rate": 5.828779599271402e-08,
"logits/chosen": -3.4556312561035156,
"logits/rejected": -3.4509153366088867,
"logps/chosen": -0.5170646905899048,
"logps/rejected": -0.5667902231216431,
"loss": 1.2527,
"rewards/accuracies": 0.6197916865348816,
"rewards/chosen": -1.0341295003890991,
"rewards/margins": 0.09945093840360641,
"rewards/rejected": -1.1335804462432861,
"step": 579
},
{
"epoch": 1.9087553824072176,
"grad_norm": 11.406511306762695,
"learning_rate": 5.282331511839708e-08,
"logits/chosen": -3.1296825408935547,
"logits/rejected": -3.053649425506592,
"logps/chosen": -0.4993407130241394,
"logps/rejected": -0.557974636554718,
"loss": 1.2393,
"rewards/accuracies": 0.6145833730697632,
"rewards/chosen": -0.9986814260482788,
"rewards/margins": 0.11726780235767365,
"rewards/rejected": -1.115949273109436,
"step": 582
},
{
"epoch": 1.9185974984621694,
"grad_norm": 12.973370552062988,
"learning_rate": 4.735883424408015e-08,
"logits/chosen": -2.872734785079956,
"logits/rejected": -2.9354488849639893,
"logps/chosen": -0.5702813863754272,
"logps/rejected": -0.6181797981262207,
"loss": 1.255,
"rewards/accuracies": 0.6197916865348816,
"rewards/chosen": -1.1405627727508545,
"rewards/margins": 0.0957968458533287,
"rewards/rejected": -1.2363595962524414,
"step": 585
},
{
"epoch": 1.9284396145171212,
"grad_norm": 10.770984649658203,
"learning_rate": 4.189435336976321e-08,
"logits/chosen": -3.0323398113250732,
"logits/rejected": -3.084712266921997,
"logps/chosen": -0.5010161995887756,
"logps/rejected": -0.5550758242607117,
"loss": 1.2431,
"rewards/accuracies": 0.6197916865348816,
"rewards/chosen": -1.0020323991775513,
"rewards/margins": 0.10811936110258102,
"rewards/rejected": -1.110151767730713,
"step": 588
},
{
"epoch": 1.938281730572073,
"grad_norm": 13.19275951385498,
"learning_rate": 3.642987249544627e-08,
"logits/chosen": -3.0209498405456543,
"logits/rejected": -3.197568893432617,
"logps/chosen": -0.5342345237731934,
"logps/rejected": -0.5740767121315002,
"loss": 1.2665,
"rewards/accuracies": 0.6093750596046448,
"rewards/chosen": -1.0684690475463867,
"rewards/margins": 0.07968436926603317,
"rewards/rejected": -1.148153305053711,
"step": 591
},
{
"epoch": 1.9481238466270248,
"grad_norm": 13.59703540802002,
"learning_rate": 3.096539162112933e-08,
"logits/chosen": -3.445460319519043,
"logits/rejected": -3.4261269569396973,
"logps/chosen": -0.5216549634933472,
"logps/rejected": -0.5718774795532227,
"loss": 1.2519,
"rewards/accuracies": 0.640625,
"rewards/chosen": -1.0433099269866943,
"rewards/margins": 0.10044509172439575,
"rewards/rejected": -1.1437549591064453,
"step": 594
},
{
"epoch": 1.9579659626819765,
"grad_norm": 13.935587882995605,
"learning_rate": 2.5500910746812385e-08,
"logits/chosen": -3.1474225521087646,
"logits/rejected": -3.192647933959961,
"logps/chosen": -0.5525200366973877,
"logps/rejected": -0.6108919382095337,
"loss": 1.2421,
"rewards/accuracies": 0.5885416865348816,
"rewards/chosen": -1.1050399541854858,
"rewards/margins": 0.1167440339922905,
"rewards/rejected": -1.2217838764190674,
"step": 597
},
{
"epoch": 1.9678080787369283,
"grad_norm": 12.028646469116211,
"learning_rate": 2.0036429872495445e-08,
"logits/chosen": -2.8930859565734863,
"logits/rejected": -3.0996487140655518,
"logps/chosen": -0.5432897210121155,
"logps/rejected": -0.6004898548126221,
"loss": 1.2421,
"rewards/accuracies": 0.609375,
"rewards/chosen": -1.086579442024231,
"rewards/margins": 0.11440026760101318,
"rewards/rejected": -1.2009797096252441,
"step": 600
},
{
"epoch": 1.97765019479188,
"grad_norm": 12.551324844360352,
"learning_rate": 1.4571948998178505e-08,
"logits/chosen": -2.6679749488830566,
"logits/rejected": -2.9275968074798584,
"logps/chosen": -0.5680824518203735,
"logps/rejected": -0.6121156215667725,
"loss": 1.2617,
"rewards/accuracies": 0.5989583730697632,
"rewards/chosen": -1.136164903640747,
"rewards/margins": 0.08806635439395905,
"rewards/rejected": -1.224231243133545,
"step": 603
},
{
"epoch": 1.987492310846832,
"grad_norm": 13.448480606079102,
"learning_rate": 9.107468123861567e-09,
"logits/chosen": -2.95973801612854,
"logits/rejected": -3.03635835647583,
"logps/chosen": -0.5279879570007324,
"logps/rejected": -0.5912687182426453,
"loss": 1.2317,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.0559759140014648,
"rewards/margins": 0.1265614926815033,
"rewards/rejected": -1.1825374364852905,
"step": 606
},
{
"epoch": 1.9973344269017839,
"grad_norm": 12.023303985595703,
"learning_rate": 3.6429872495446263e-09,
"logits/chosen": -2.846055746078491,
"logits/rejected": -2.954606533050537,
"logps/chosen": -0.5345979332923889,
"logps/rejected": -0.5786259770393372,
"loss": 1.2608,
"rewards/accuracies": 0.5989583730697632,
"rewards/chosen": -1.0691958665847778,
"rewards/margins": 0.08805612474679947,
"rewards/rejected": -1.1572520732879639,
"step": 609
},
{
"epoch": 2.0,
"step": 610,
"total_flos": 174251037229056.0,
"train_loss": 1.2841152152077095,
"train_runtime": 12715.4202,
"train_samples_per_second": 3.068,
"train_steps_per_second": 0.048
}
],
"logging_steps": 3,
"max_steps": 610,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 250,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 174251037229056.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}