llama3-2k / trainer_state.json
Seohyeong Lee
add dataset
e031c07
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 1000,
"global_step": 375,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"learning_rate": 1.3157894736842104e-08,
"logits/chosen": -0.5324900150299072,
"logits/rejected": -0.5734304189682007,
"logps/chosen": -543.2296752929688,
"logps/rejected": -325.48358154296875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/mix_margin": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.08,
"learning_rate": 1.3157894736842104e-07,
"logits/chosen": -0.48575523495674133,
"logits/rejected": -0.5831019878387451,
"logps/chosen": -334.6309509277344,
"logps/rejected": -278.2859802246094,
"loss": 0.6997,
"rewards/accuracies": 0.4027777910232544,
"rewards/chosen": -0.012853524647653103,
"rewards/confidence": -0.0746772438287735,
"rewards/confidence_mean_diff": 0.0746772438287735,
"rewards/confidence_moving_diff": 0.0021637948229908943,
"rewards/margins": -0.007044664584100246,
"rewards/mix_margin": -0.007044283673167229,
"rewards/real_percentage": 14.129032135009766,
"rewards/rejected": -0.005808859597891569,
"step": 10
},
{
"epoch": 0.16,
"learning_rate": 2.631578947368421e-07,
"logits/chosen": -0.45206984877586365,
"logits/rejected": -0.4436320662498474,
"logps/chosen": -378.46478271484375,
"logps/rejected": -291.097412109375,
"loss": 0.687,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": 0.019860025495290756,
"rewards/confidence": -0.07699747383594513,
"rewards/confidence_mean_diff": 0.07699747383594513,
"rewards/confidence_moving_diff": -6.244657561182976e-05,
"rewards/margins": 0.010339610278606415,
"rewards/mix_margin": 0.010339389555156231,
"rewards/real_percentage": 11.975000381469727,
"rewards/rejected": 0.009520411491394043,
"step": 20
},
{
"epoch": 0.24,
"learning_rate": 3.9473684210526315e-07,
"logits/chosen": -0.48425692319869995,
"logits/rejected": -0.5238968133926392,
"logps/chosen": -363.4825439453125,
"logps/rejected": -330.880859375,
"loss": 0.6746,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.07081757485866547,
"rewards/confidence": -0.0583333782851696,
"rewards/confidence_mean_diff": 0.0583333782851696,
"rewards/confidence_moving_diff": 0.00017116544768214226,
"rewards/margins": 0.04097529500722885,
"rewards/mix_margin": 0.04097532853484154,
"rewards/real_percentage": 12.024999618530273,
"rewards/rejected": 0.029842281714081764,
"step": 30
},
{
"epoch": 0.32,
"learning_rate": 4.999565492409831e-07,
"logits/chosen": -0.47305864095687866,
"logits/rejected": -0.582284152507782,
"logps/chosen": -335.81610107421875,
"logps/rejected": -256.0378723144531,
"loss": 0.6474,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.16260090470314026,
"rewards/confidence": -0.11778082698583603,
"rewards/confidence_mean_diff": 0.11778082698583603,
"rewards/confidence_moving_diff": 0.0008547043544240296,
"rewards/margins": 0.05970517918467522,
"rewards/mix_margin": 0.05970512703061104,
"rewards/real_percentage": 12.074999809265137,
"rewards/rejected": 0.10289572179317474,
"step": 40
},
{
"epoch": 0.4,
"learning_rate": 4.984373579809777e-07,
"logits/chosen": -0.5092490911483765,
"logits/rejected": -0.5690798163414001,
"logps/chosen": -329.53302001953125,
"logps/rejected": -295.02294921875,
"loss": 0.5866,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.3996647000312805,
"rewards/confidence": -0.14608541131019592,
"rewards/confidence_mean_diff": 0.14608541131019592,
"rewards/confidence_moving_diff": -0.00040556181920692325,
"rewards/margins": 0.20342092216014862,
"rewards/mix_margin": 0.2034207135438919,
"rewards/real_percentage": 12.0,
"rewards/rejected": 0.1962437778711319,
"step": 50
},
{
"epoch": 0.48,
"learning_rate": 4.947607089353757e-07,
"logits/chosen": -0.4855988025665283,
"logits/rejected": -0.5692173838615417,
"logps/chosen": -365.7965393066406,
"logps/rejected": -290.7939147949219,
"loss": 0.6262,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.6372241973876953,
"rewards/confidence": -0.27295011281967163,
"rewards/confidence_mean_diff": 0.27295011281967163,
"rewards/confidence_moving_diff": -8.605476614320651e-05,
"rewards/margins": 0.25993281602859497,
"rewards/mix_margin": 0.2599331736564636,
"rewards/real_percentage": 12.100000381469727,
"rewards/rejected": 0.37729138135910034,
"step": 60
},
{
"epoch": 0.56,
"learning_rate": 4.889585305354435e-07,
"logits/chosen": -0.511881411075592,
"logits/rejected": -0.5559085607528687,
"logps/chosen": -374.42559814453125,
"logps/rejected": -350.49285888671875,
"loss": 0.5776,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.938610851764679,
"rewards/confidence": -0.17002172768115997,
"rewards/confidence_mean_diff": 0.17002172768115997,
"rewards/confidence_moving_diff": 0.006227460689842701,
"rewards/margins": 0.39301368594169617,
"rewards/mix_margin": 0.39301276206970215,
"rewards/real_percentage": 12.199999809265137,
"rewards/rejected": 0.5455971360206604,
"step": 70
},
{
"epoch": 0.64,
"learning_rate": 4.810812095469401e-07,
"logits/chosen": -0.4341855049133301,
"logits/rejected": -0.4922330975532532,
"logps/chosen": -382.85986328125,
"logps/rejected": -316.0860595703125,
"loss": 0.4931,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": 1.085030198097229,
"rewards/confidence": -0.36220604181289673,
"rewards/confidence_mean_diff": 0.36220604181289673,
"rewards/confidence_moving_diff": -0.004994163755327463,
"rewards/margins": 0.727383017539978,
"rewards/mix_margin": 0.7273828387260437,
"rewards/real_percentage": 11.899999618530273,
"rewards/rejected": 0.3576471507549286,
"step": 80
},
{
"epoch": 0.72,
"learning_rate": 4.711971535058109e-07,
"logits/chosen": -0.4119408130645752,
"logits/rejected": -0.5046309232711792,
"logps/chosen": -335.9080810546875,
"logps/rejected": -228.6096649169922,
"loss": 0.5641,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.9102222323417664,
"rewards/confidence": -0.18118831515312195,
"rewards/confidence_mean_diff": 0.18118831515312195,
"rewards/confidence_moving_diff": 0.0009786130394786596,
"rewards/margins": 0.6580663919448853,
"rewards/mix_margin": 0.6580665707588196,
"rewards/real_percentage": 12.074999809265137,
"rewards/rejected": 0.2521558403968811,
"step": 90
},
{
"epoch": 0.8,
"learning_rate": 4.593921966594997e-07,
"logits/chosen": -0.4459192752838135,
"logits/rejected": -0.4894910454750061,
"logps/chosen": -371.5416259765625,
"logps/rejected": -310.64239501953125,
"loss": 0.5469,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 1.2468544244766235,
"rewards/confidence": -0.45958179235458374,
"rewards/confidence_mean_diff": 0.45958179235458374,
"rewards/confidence_moving_diff": 0.004259251989424229,
"rewards/margins": 0.7577625513076782,
"rewards/mix_margin": 0.757762610912323,
"rewards/real_percentage": 12.100000381469727,
"rewards/rejected": 0.4890917241573334,
"step": 100
},
{
"epoch": 0.88,
"learning_rate": 4.457688545727496e-07,
"logits/chosen": -0.5113216042518616,
"logits/rejected": -0.5288140177726746,
"logps/chosen": -352.3919982910156,
"logps/rejected": -276.9599304199219,
"loss": 0.5222,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 1.0378813743591309,
"rewards/confidence": -0.29791101813316345,
"rewards/confidence_mean_diff": 0.29791101813316345,
"rewards/confidence_moving_diff": -0.0015016455436125398,
"rewards/margins": 0.7134403586387634,
"rewards/mix_margin": 0.7134405374526978,
"rewards/real_percentage": 11.949999809265137,
"rewards/rejected": 0.3244408965110779,
"step": 110
},
{
"epoch": 0.96,
"learning_rate": 4.3044543387098026e-07,
"logits/chosen": -0.5033639669418335,
"logits/rejected": -0.5167360901832581,
"logps/chosen": -323.29119873046875,
"logps/rejected": -265.58221435546875,
"loss": 0.5039,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 1.0098955631256104,
"rewards/confidence": -0.42422398924827576,
"rewards/confidence_mean_diff": 0.42422398924827576,
"rewards/confidence_moving_diff": 0.0008432863396592438,
"rewards/margins": 0.9732195138931274,
"rewards/mix_margin": 0.9732197523117065,
"rewards/real_percentage": 11.949999809265137,
"rewards/rejected": 0.036676160991191864,
"step": 120
},
{
"epoch": 1.04,
"learning_rate": 4.1355500485232917e-07,
"logits/chosen": -0.4795234203338623,
"logits/rejected": -0.5551981329917908,
"logps/chosen": -367.8242492675781,
"logps/rejected": -284.45062255859375,
"loss": 0.381,
"rewards/accuracies": 0.875,
"rewards/chosen": 1.1803219318389893,
"rewards/confidence": -0.1641966998577118,
"rewards/confidence_mean_diff": 0.1641966998577118,
"rewards/confidence_moving_diff": -0.009292250499129295,
"rewards/margins": 1.2318060398101807,
"rewards/mix_margin": 1.2318063974380493,
"rewards/real_percentage": 11.875,
"rewards/rejected": -0.051483988761901855,
"step": 130
},
{
"epoch": 1.12,
"learning_rate": 3.9524424589030863e-07,
"logits/chosen": -0.47544917464256287,
"logits/rejected": -0.45598697662353516,
"logps/chosen": -368.21197509765625,
"logps/rejected": -327.8885803222656,
"loss": 0.2637,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 1.4148461818695068,
"rewards/confidence": 0.12870559096336365,
"rewards/confidence_mean_diff": -0.12870559096336365,
"rewards/confidence_moving_diff": -4.9034319090424106e-05,
"rewards/margins": 1.949605941772461,
"rewards/mix_margin": 1.9496057033538818,
"rewards/real_percentage": 12.024999618530273,
"rewards/rejected": -0.5347597599029541,
"step": 140
},
{
"epoch": 1.2,
"learning_rate": 3.7567216966241556e-07,
"logits/chosen": -0.5132138133049011,
"logits/rejected": -0.5720852613449097,
"logps/chosen": -349.05706787109375,
"logps/rejected": -309.68194580078125,
"loss": 0.2546,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 1.3815641403198242,
"rewards/confidence": 0.15192195773124695,
"rewards/confidence_mean_diff": -0.15192195773124695,
"rewards/confidence_moving_diff": -0.004711526446044445,
"rewards/margins": 1.766579031944275,
"rewards/mix_margin": 1.766579031944275,
"rewards/real_percentage": 11.925000190734863,
"rewards/rejected": -0.3850148320198059,
"step": 150
},
{
"epoch": 1.28,
"learning_rate": 3.5500874226626633e-07,
"logits/chosen": -0.41593313217163086,
"logits/rejected": -0.47519993782043457,
"logps/chosen": -424.2110290527344,
"logps/rejected": -386.99688720703125,
"loss": 0.2319,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 1.594541072845459,
"rewards/confidence": 0.3976772129535675,
"rewards/confidence_mean_diff": -0.3976772129535675,
"rewards/confidence_moving_diff": 0.0034655616618692875,
"rewards/margins": 2.2624146938323975,
"rewards/mix_margin": 2.2624149322509766,
"rewards/real_percentage": 12.024999618530273,
"rewards/rejected": -0.667873740196228,
"step": 160
},
{
"epoch": 1.36,
"learning_rate": 3.334334072150074e-07,
"logits/chosen": -0.4277438223361969,
"logits/rejected": -0.44190508127212524,
"logps/chosen": -359.75262451171875,
"logps/rejected": -304.85107421875,
"loss": 0.244,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 1.3704854249954224,
"rewards/confidence": 0.43457871675491333,
"rewards/confidence_mean_diff": -0.43457871675491333,
"rewards/confidence_moving_diff": 0.0003442527668084949,
"rewards/margins": 2.137328624725342,
"rewards/mix_margin": 2.137328624725342,
"rewards/real_percentage": 12.024999618530273,
"rewards/rejected": -0.766843318939209,
"step": 170
},
{
"epoch": 1.44,
"learning_rate": 3.1113352712978995e-07,
"logits/chosen": -0.4778042733669281,
"logits/rejected": -0.5502051115036011,
"logps/chosen": -285.4638671875,
"logps/rejected": -259.30963134765625,
"loss": 0.2673,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.1096071004867554,
"rewards/confidence": 0.36355599761009216,
"rewards/confidence_mean_diff": -0.36355599761009216,
"rewards/confidence_moving_diff": 2.6996247470378876e-05,
"rewards/margins": 1.878248929977417,
"rewards/mix_margin": 1.878249168395996,
"rewards/real_percentage": 11.975000381469727,
"rewards/rejected": -0.7686418294906616,
"step": 180
},
{
"epoch": 1.52,
"learning_rate": 2.8830275666182565e-07,
"logits/chosen": -0.5888835191726685,
"logits/rejected": -0.5946951508522034,
"logps/chosen": -345.4639587402344,
"logps/rejected": -269.433349609375,
"loss": 0.2581,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.4412751197814941,
"rewards/confidence": 0.27181780338287354,
"rewards/confidence_mean_diff": -0.27181780338287354,
"rewards/confidence_moving_diff": -0.0013337878044694662,
"rewards/margins": 1.7757008075714111,
"rewards/mix_margin": 1.7756999731063843,
"rewards/real_percentage": 11.975000381469727,
"rewards/rejected": -0.3344256579875946,
"step": 190
},
{
"epoch": 1.6,
"learning_rate": 2.651393607737495e-07,
"logits/chosen": -0.43257981538772583,
"logits/rejected": -0.5586498975753784,
"logps/chosen": -332.6167907714844,
"logps/rejected": -258.5675354003906,
"loss": 0.2367,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 1.6590750217437744,
"rewards/confidence": 0.20384028553962708,
"rewards/confidence_mean_diff": -0.20384028553962708,
"rewards/confidence_moving_diff": 0.0033724855165928602,
"rewards/margins": 2.299750804901123,
"rewards/mix_margin": 2.299750804901123,
"rewards/real_percentage": 12.074999809265137,
"rewards/rejected": -0.6406754851341248,
"step": 200
},
{
"epoch": 1.68,
"learning_rate": 2.418444929845241e-07,
"logits/chosen": -0.5128785371780396,
"logits/rejected": -0.5602482557296753,
"logps/chosen": -347.55145263671875,
"logps/rejected": -316.63287353515625,
"loss": 0.2366,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.4995661973953247,
"rewards/confidence": 0.29339924454689026,
"rewards/confidence_mean_diff": -0.29339924454689026,
"rewards/confidence_moving_diff": -0.002348523121327162,
"rewards/margins": 2.240088939666748,
"rewards/mix_margin": 2.2400896549224854,
"rewards/real_percentage": 11.949999809265137,
"rewards/rejected": -0.7405228018760681,
"step": 210
},
{
"epoch": 1.76,
"learning_rate": 2.186204485297965e-07,
"logits/chosen": -0.5206685066223145,
"logits/rejected": -0.49740782380104065,
"logps/chosen": -327.6163024902344,
"logps/rejected": -312.23345947265625,
"loss": 0.259,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.3457434177398682,
"rewards/confidence": 0.5061102509498596,
"rewards/confidence_mean_diff": -0.5061102509498596,
"rewards/confidence_moving_diff": 0.0032404728699475527,
"rewards/margins": 2.1838455200195312,
"rewards/mix_margin": 2.183845281600952,
"rewards/real_percentage": 12.125,
"rewards/rejected": -0.8381019830703735,
"step": 220
},
{
"epoch": 1.84,
"learning_rate": 1.956689076074607e-07,
"logits/chosen": -0.47606563568115234,
"logits/rejected": -0.5649515986442566,
"logps/chosen": -359.9063415527344,
"logps/rejected": -272.35333251953125,
"loss": 0.2392,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.2465178966522217,
"rewards/confidence": 0.15909627079963684,
"rewards/confidence_mean_diff": -0.15909627079963684,
"rewards/confidence_moving_diff": -0.005671085324138403,
"rewards/margins": 2.0437004566192627,
"rewards/mix_margin": 2.043700695037842,
"rewards/real_percentage": 11.899999618530273,
"rewards/rejected": -0.7971823811531067,
"step": 230
},
{
"epoch": 1.92,
"learning_rate": 1.7318918396427674e-07,
"logits/chosen": -0.5379046201705933,
"logits/rejected": -0.5706161260604858,
"logps/chosen": -386.26861572265625,
"logps/rejected": -303.8609619140625,
"loss": 0.2138,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 1.7834584712982178,
"rewards/confidence": 0.13540206849575043,
"rewards/confidence_mean_diff": -0.13540206849575043,
"rewards/confidence_moving_diff": 0.0014076533261686563,
"rewards/margins": 2.3947689533233643,
"rewards/mix_margin": 2.394768476486206,
"rewards/real_percentage": 11.975000381469727,
"rewards/rejected": -0.6113101840019226,
"step": 240
},
{
"epoch": 2.0,
"learning_rate": 1.513764940330155e-07,
"logits/chosen": -0.39151811599731445,
"logits/rejected": -0.473433256149292,
"logps/chosen": -336.6163024902344,
"logps/rejected": -306.16046142578125,
"loss": 0.2558,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 1.611301064491272,
"rewards/confidence": 0.2504025101661682,
"rewards/confidence_mean_diff": -0.2504025101661682,
"rewards/confidence_moving_diff": 0.000840538355987519,
"rewards/margins": 2.209441661834717,
"rewards/mix_margin": 2.209441661834717,
"rewards/real_percentage": 12.050000190734863,
"rewards/rejected": -0.5981408357620239,
"step": 250
},
{
"epoch": 2.08,
"learning_rate": 1.304202616511362e-07,
"logits/chosen": -0.5112959742546082,
"logits/rejected": -0.5279114842414856,
"logps/chosen": -377.9098815917969,
"logps/rejected": -316.77825927734375,
"loss": 0.1627,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 2.204784631729126,
"rewards/confidence": 0.5538536906242371,
"rewards/confidence_mean_diff": -0.5538536906242371,
"rewards/confidence_moving_diff": -0.005331903696060181,
"rewards/margins": 2.7410061359405518,
"rewards/mix_margin": 2.7410056591033936,
"rewards/real_percentage": 11.774999618530273,
"rewards/rejected": -0.5362212657928467,
"step": 260
},
{
"epoch": 2.16,
"learning_rate": 1.1050247308300944e-07,
"logits/chosen": -0.48956188559532166,
"logits/rejected": -0.5282370448112488,
"logps/chosen": -370.67767333984375,
"logps/rejected": -370.94476318359375,
"loss": 0.1444,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 1.979029893875122,
"rewards/confidence": 0.7768798470497131,
"rewards/confidence_mean_diff": -0.7768798470497131,
"rewards/confidence_moving_diff": -0.008751118555665016,
"rewards/margins": 5.2986369132995605,
"rewards/mix_margin": 5.298637866973877,
"rewards/real_percentage": 11.875,
"rewards/rejected": -3.3196072578430176,
"step": 270
},
{
"epoch": 2.24,
"learning_rate": 9.179609663085594e-08,
"logits/chosen": -0.478290855884552,
"logits/rejected": -0.5842245817184448,
"logps/chosen": -354.32220458984375,
"logps/rejected": -323.82830810546875,
"loss": 0.1632,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 2.003836154937744,
"rewards/confidence": 1.0290786027908325,
"rewards/confidence_mean_diff": -1.0290786027908325,
"rewards/confidence_moving_diff": 0.009490849450230598,
"rewards/margins": 2.7980473041534424,
"rewards/mix_margin": 2.7980475425720215,
"rewards/real_percentage": 12.175000190734863,
"rewards/rejected": -0.7942115068435669,
"step": 280
},
{
"epoch": 2.32,
"learning_rate": 7.446358055867688e-08,
"logits/chosen": -0.4719129502773285,
"logits/rejected": -0.5351340174674988,
"logps/chosen": -284.57977294921875,
"logps/rejected": -244.1188507080078,
"loss": 0.1959,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.3237833976745605,
"rewards/confidence": 0.36271917819976807,
"rewards/confidence_mean_diff": -0.36271917819976807,
"rewards/confidence_moving_diff": 0.002586688846349716,
"rewards/margins": 2.3666577339172363,
"rewards/mix_margin": 2.3666574954986572,
"rewards/real_percentage": 12.074999809265137,
"rewards/rejected": -1.0428742170333862,
"step": 290
},
{
"epoch": 2.4,
"learning_rate": 5.8655442373371164e-08,
"logits/chosen": -0.581800639629364,
"logits/rejected": -0.6199262142181396,
"logps/chosen": -420.638671875,
"logps/rejected": -355.60736083984375,
"loss": 0.152,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 1.8940696716308594,
"rewards/confidence": 0.7953528165817261,
"rewards/confidence_mean_diff": -0.7953528165817261,
"rewards/confidence_moving_diff": -0.006861658301204443,
"rewards/margins": 2.8314507007598877,
"rewards/mix_margin": 2.831450939178467,
"rewards/real_percentage": 11.875,
"rewards/rejected": -0.9373809695243835,
"step": 300
},
{
"epoch": 2.48,
"learning_rate": 4.450896171388219e-08,
"logits/chosen": -0.5456718802452087,
"logits/rejected": -0.5629149079322815,
"logps/chosen": -384.376953125,
"logps/rejected": -332.3739318847656,
"loss": 0.1365,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 2.1317899227142334,
"rewards/confidence": 0.8724759221076965,
"rewards/confidence_mean_diff": -0.8724759221076965,
"rewards/confidence_moving_diff": 0.006575644016265869,
"rewards/margins": 3.160860300064087,
"rewards/mix_margin": 3.160860061645508,
"rewards/real_percentage": 12.024999618530273,
"rewards/rejected": -1.0290701389312744,
"step": 310
},
{
"epoch": 2.56,
"learning_rate": 3.214698819946879e-08,
"logits/chosen": -0.5238803625106812,
"logits/rejected": -0.5871630907058716,
"logps/chosen": -375.8655090332031,
"logps/rejected": -300.987548828125,
"loss": 0.1748,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 1.7292721271514893,
"rewards/confidence": 0.5153323411941528,
"rewards/confidence_mean_diff": -0.5153323411941528,
"rewards/confidence_moving_diff": 0.0004902526852674782,
"rewards/margins": 2.5365805625915527,
"rewards/mix_margin": 2.5365803241729736,
"rewards/real_percentage": 11.949999809265137,
"rewards/rejected": -0.8073086738586426,
"step": 320
},
{
"epoch": 2.64,
"learning_rate": 2.1676874589879908e-08,
"logits/chosen": -0.49646130204200745,
"logits/rejected": -0.5325660705566406,
"logps/chosen": -361.79986572265625,
"logps/rejected": -292.0426940917969,
"loss": 0.1878,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 1.4343092441558838,
"rewards/confidence": 0.5628241300582886,
"rewards/confidence_mean_diff": -0.5628241300582886,
"rewards/confidence_moving_diff": 0.0001227855682373047,
"rewards/margins": 2.628760576248169,
"rewards/mix_margin": 2.628760814666748,
"rewards/real_percentage": 12.024999618530273,
"rewards/rejected": -1.1944514513015747,
"step": 330
},
{
"epoch": 2.72,
"learning_rate": 1.3189544521990032e-08,
"logits/chosen": -0.5395928025245667,
"logits/rejected": -0.5778788328170776,
"logps/chosen": -332.5323791503906,
"logps/rejected": -296.8447265625,
"loss": 0.1826,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 1.4775941371917725,
"rewards/confidence": 0.5103145837783813,
"rewards/confidence_mean_diff": -0.5103145837783813,
"rewards/confidence_moving_diff": -0.003172731725499034,
"rewards/margins": 2.3591160774230957,
"rewards/mix_margin": 2.3591160774230957,
"rewards/real_percentage": 11.875,
"rewards/rejected": -0.8815220594406128,
"step": 340
},
{
"epoch": 2.8,
"learning_rate": 6.7587029187732014e-09,
"logits/chosen": -0.5066567659378052,
"logits/rejected": -0.5228812098503113,
"logps/chosen": -346.0731201171875,
"logps/rejected": -309.6691589355469,
"loss": 0.1769,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.6007888317108154,
"rewards/confidence": 0.6578723788261414,
"rewards/confidence_mean_diff": -0.6578723788261414,
"rewards/confidence_moving_diff": 0.004312982317060232,
"rewards/margins": 2.8280742168426514,
"rewards/mix_margin": 2.8280739784240723,
"rewards/real_percentage": 12.125,
"rewards/rejected": -1.2272855043411255,
"step": 350
},
{
"epoch": 2.88,
"learning_rate": 2.4401959275140437e-09,
"logits/chosen": -0.4290226399898529,
"logits/rejected": -0.4782096743583679,
"logps/chosen": -323.8050231933594,
"logps/rejected": -286.50225830078125,
"loss": 0.1644,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 1.5553691387176514,
"rewards/confidence": 0.5026761889457703,
"rewards/confidence_mean_diff": -0.5026761889457703,
"rewards/confidence_moving_diff": -0.0073528410866856575,
"rewards/margins": 2.44854474067688,
"rewards/mix_margin": 2.4485442638397217,
"rewards/real_percentage": 11.899999618530273,
"rewards/rejected": -0.8931753039360046,
"step": 360
},
{
"epoch": 2.96,
"learning_rate": 2.715259456224084e-10,
"logits/chosen": -0.5070622563362122,
"logits/rejected": -0.5159127712249756,
"logps/chosen": -368.29248046875,
"logps/rejected": -357.6094665527344,
"loss": 0.1676,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 1.714816689491272,
"rewards/confidence": 0.8204771876335144,
"rewards/confidence_mean_diff": -0.8204771876335144,
"rewards/confidence_moving_diff": 0.002654359443113208,
"rewards/margins": 2.7493791580200195,
"rewards/mix_margin": 2.7493796348571777,
"rewards/real_percentage": 12.125,
"rewards/rejected": -1.034562587738037,
"step": 370
},
{
"epoch": 3.0,
"step": 375,
"total_flos": 0.0,
"train_loss": 0.3348727149963379,
"train_runtime": 2536.3599,
"train_samples_per_second": 2.366,
"train_steps_per_second": 0.148
}
],
"logging_steps": 10,
"max_steps": 375,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 200,
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}