diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,12870 +1,4461 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 1.3333333333333333, + "epoch": 0.9111617312072893, "eval_steps": 200, - "global_step": 2500, + "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "directrewards_student/accuracies": 0.0, "epoch": 0.0, "learning_rate": 5.000000000000001e-07, - "logits/chosen": -0.3314533233642578, - "logits/chosen_friction": -0.33421453833580017, - "logits/rejected": -0.3311963677406311, - "logits/rejected_friction": -0.33563724160194397, - "logps/chosen": -0.5838855504989624, - "logps/chosen_friction": -0.463901549577713, - "logps/rejected": -0.6250548958778381, - "logps/rejected_friction": -0.5124940872192383, - "loss": 0.25, - "policy_friction_nll_loss": 0.4502220153808594, - "policy_nll_loss": 0.5682294368743896, - "rewards/accuracies": 0.0, - "rewards/accuracies_friction": 0.0, - "rewards/chosen": 0.0, - "rewards/chosen_fricton": 0.0, - "rewards/margins": 0.0, - "rewards/margins_friction": 0.0, - "rewards/rejected": 0.0, - "rewards/rejected_friction": 0.0, - "step": 1 - }, - { - "directrewards_student/accuracies": 0.28125, - "epoch": 0.0, - "learning_rate": 2.5e-06, - "logits/chosen": -0.3300497829914093, - "logits/chosen_friction": -0.33648085594177246, - "logits/rejected": -0.3312627971172333, - "logits/rejected_friction": -0.33952832221984863, - "logps/chosen": -0.6087216138839722, - "logps/chosen_friction": -0.48861163854599, - "logps/rejected": -0.6463426351547241, - "logps/rejected_friction": -0.53397536277771, - "loss": 0.25, - "policy_friction_nll_loss": 0.47604960203170776, - "policy_nll_loss": 0.5964580774307251, - "rewards/accuracies": 0.28125, - "rewards/accuracies_friction": 0.5, - "rewards/chosen": -1.9978731870651245e-05, - "rewards/chosen_fricton": -1.3062729522062e-05, - "rewards/margins": -3.0584619707951788e-06, - "rewards/margins_friction": 9.24617052078247e-06, - "rewards/rejected": -1.6920270354603417e-05, - "rewards/rejected_friction": -2.230889913334977e-05, - "step": 5 - }, - { - "directrewards_student/accuracies": 0.6499999761581421, - "epoch": 0.01, - "learning_rate": 5e-06, - "logits/chosen": -0.3318561613559723, - "logits/chosen_friction": -0.33736085891723633, - "logits/rejected": -0.3324087858200073, - "logits/rejected_friction": -0.3403892517089844, - "logps/chosen": -0.642116904258728, - "logps/chosen_friction": -0.5181363821029663, - "logps/rejected": -0.6696888208389282, - "logps/rejected_friction": -0.5513863563537598, - "loss": 0.25, - "policy_friction_nll_loss": 0.5101621747016907, - "policy_nll_loss": 0.6344469785690308, - "rewards/accuracies": 0.6499999761581421, - "rewards/accuracies_friction": 0.574999988079071, - "rewards/chosen": -2.8610975277842954e-05, - "rewards/chosen_fricton": -1.8187687601312064e-05, - "rewards/margins": 1.9045324734179303e-05, - "rewards/margins_friction": -1.036376488627866e-06, - "rewards/rejected": -4.765630001202226e-05, - "rewards/rejected_friction": -1.71513129316736e-05, + "logits/chosen": -0.300163596868515, + "logits/rejected": -0.3011459410190582, + "logps/chosen": -418.81268310546875, + "logps/rejected": -421.69482421875, + "loss": 0.6923, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": 0.008436297997832298, + "rewards/margins": 0.001967963995411992, + "rewards/rejected": 0.006468335632234812, "step": 10 }, { - "directrewards_student/accuracies": 0.824999988079071, - "epoch": 0.01, - "learning_rate": 4.9999914040105576e-06, - "logits/chosen": -0.3272313177585602, - "logits/chosen_friction": -0.33226650953292847, - "logits/rejected": -0.3310847878456116, - "logits/rejected_friction": -0.3380160927772522, - "logps/chosen": -0.6102452874183655, - "logps/chosen_friction": -0.49308449029922485, - "logps/rejected": -0.649484395980835, - "logps/rejected_friction": -0.5418084859848022, - "loss": 0.2499, - "policy_friction_nll_loss": 0.48209887742996216, - "policy_nll_loss": 0.5997186303138733, - "rewards/accuracies": 0.824999988079071, - "rewards/accuracies_friction": 0.800000011920929, - "rewards/chosen": -2.7420896003604867e-05, - "rewards/chosen_fricton": -7.554153853561729e-07, - "rewards/margins": 3.723971894942224e-05, - "rewards/margins_friction": 7.078632188495249e-05, - "rewards/rejected": -6.466060585808009e-05, - "rewards/rejected_friction": -7.154174090828747e-05, - "step": 15 - }, - { - "directrewards_student/accuracies": 0.824999988079071, "epoch": 0.01, - "learning_rate": 4.999965616101344e-06, - "logits/chosen": -0.32881733775138855, - "logits/chosen_friction": -0.33485737442970276, - "logits/rejected": -0.3322547376155853, - "logits/rejected_friction": -0.33989447355270386, - "logps/chosen": -0.5711789131164551, - "logps/chosen_friction": -0.45309242606163025, - "logps/rejected": -0.604923665523529, - "logps/rejected_friction": -0.49484434723854065, - "loss": 0.2498, - "policy_friction_nll_loss": 0.4459647536277771, - "policy_nll_loss": 0.5625153183937073, - "rewards/accuracies": 0.824999988079071, - "rewards/accuracies_friction": 0.824999988079071, - "rewards/chosen": -2.6578452889225446e-05, - "rewards/chosen_fricton": -2.1164196368772537e-05, - "rewards/margins": 7.541291415691376e-05, - "rewards/margins_friction": 9.230435534846038e-05, - "rewards/rejected": -0.0001019913688651286, - "rewards/rejected_friction": -0.0001134685444412753, + "learning_rate": 1.0000000000000002e-06, + "logits/chosen": -0.31174224615097046, + "logits/rejected": -0.3135172724723816, + "logps/chosen": -428.8531799316406, + "logps/rejected": -427.205810546875, + "loss": 0.6951, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.0215766541659832, + "rewards/margins": -0.0034640885423868895, + "rewards/rejected": 0.02504074200987816, "step": 20 }, { - "directrewards_student/accuracies": 0.8999999761581421, "epoch": 0.01, - "learning_rate": 4.999922636449696e-06, - "logits/chosen": -0.3217795491218567, - "logits/chosen_friction": -0.3273763954639435, - "logits/rejected": -0.324879914522171, - "logits/rejected_friction": -0.33370068669319153, - "logps/chosen": -0.589401364326477, - "logps/chosen_friction": -0.4671592116355896, - "logps/rejected": -0.6231091618537903, - "logps/rejected_friction": -0.5077701210975647, - "loss": 0.2497, - "policy_friction_nll_loss": 0.4569900631904602, - "policy_nll_loss": 0.5787663459777832, - "rewards/accuracies": 0.8999999761581421, - "rewards/accuracies_friction": 0.9750000238418579, - "rewards/chosen": -2.518646579119377e-05, - "rewards/chosen_fricton": 4.283413090888644e-06, - "rewards/margins": 0.00012288667494431138, - "rewards/margins_friction": 0.00015296251513063908, - "rewards/rejected": -0.00014807314437348396, - "rewards/rejected_friction": -0.00014867910067550838, - "step": 25 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.02, - "learning_rate": 4.999862465351179e-06, - "logits/chosen": -0.3266090750694275, - "logits/chosen_friction": -0.33185869455337524, - "logits/rejected": -0.3287024199962616, - "logits/rejected_friction": -0.33678242564201355, - "logps/chosen": -0.6144950985908508, - "logps/chosen_friction": -0.4893600046634674, - "logps/rejected": -0.6368525624275208, - "logps/rejected_friction": -0.5191283226013184, - "loss": 0.2496, - "policy_friction_nll_loss": 0.4801305830478668, - "policy_nll_loss": 0.6058436036109924, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 0.949999988079071, - "rewards/chosen": 5.797028279630467e-05, - "rewards/chosen_fricton": 0.00015906497719697654, - "rewards/margins": 0.0001840894401539117, - "rewards/margins_friction": 0.00020838812633883208, - "rewards/rejected": -0.0001261191937373951, - "rewards/rejected_friction": -4.932314550387673e-05, + "learning_rate": 1.5e-06, + "logits/chosen": -0.2996385097503662, + "logits/rejected": -0.30060532689094543, + "logps/chosen": -416.20086669921875, + "logps/rejected": -412.4971618652344, + "loss": 0.6924, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.030052989721298218, + "rewards/margins": 0.0019294738303869963, + "rewards/rejected": 0.028123509138822556, "step": 30 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.02, - "learning_rate": 4.999785103219572e-06, - "logits/chosen": -0.33666232228279114, - "logits/chosen_friction": -0.342490017414093, - "logits/rejected": -0.33894795179367065, - "logits/rejected_friction": -0.34691697359085083, - "logps/chosen": -0.6305209398269653, - "logps/chosen_friction": -0.5164369344711304, - "logps/rejected": -0.6715608835220337, - "logps/rejected_friction": -0.5667839646339417, - "loss": 0.2493, - "policy_friction_nll_loss": 0.5049845576286316, - "policy_nll_loss": 0.6191027164459229, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": 4.8966485337587073e-05, - "rewards/chosen_fricton": 0.00014157546684145927, - "rewards/margins": 0.0002854872727766633, - "rewards/margins_friction": 0.00037270496250130236, - "rewards/rejected": -0.00023652074742130935, - "rewards/rejected_friction": -0.00023112946655601263, - "step": 35 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.02, - "learning_rate": 4.9996905505868824e-06, - "logits/chosen": -0.32595768570899963, - "logits/chosen_friction": -0.33247053623199463, - "logits/rejected": -0.328016072511673, - "logits/rejected_friction": -0.3372390866279602, - "logps/chosen": -0.5594505071640015, - "logps/chosen_friction": -0.45141783356666565, - "logps/rejected": -0.5903497338294983, - "logps/rejected_friction": -0.4909297823905945, - "loss": 0.2492, - "policy_friction_nll_loss": 0.4403902590274811, - "policy_nll_loss": 0.5483871102333069, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 0.9750000238418579, - "rewards/chosen": -1.531124144094065e-05, - "rewards/chosen_fricton": 0.00010183952690567821, - "rewards/margins": 0.00033240107586607337, - "rewards/margins_friction": 0.00040705030551180243, - "rewards/rejected": -0.0003477123682387173, - "rewards/rejected_friction": -0.00030521079315803945, + "epoch": 0.01, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": -0.3022615313529968, + "logits/rejected": -0.3025739789009094, + "logps/chosen": -426.9918518066406, + "logps/rejected": -423.1588439941406, + "loss": 0.692, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 0.02528352662920952, + "rewards/margins": 0.002774887252599001, + "rewards/rejected": 0.022508641704916954, "step": 40 }, { - "directrewards_student/accuracies": 0.9750000238418579, "epoch": 0.02, - "learning_rate": 4.999578808103326e-06, - "logits/chosen": -0.32239049673080444, - "logits/chosen_friction": -0.3302277624607086, - "logits/rejected": -0.32480430603027344, - "logits/rejected_friction": -0.33383268117904663, - "logps/chosen": -0.6057384610176086, - "logps/chosen_friction": -0.48935776948928833, - "logps/rejected": -0.6407490372657776, - "logps/rejected_friction": -0.5273593664169312, - "loss": 0.249, - "policy_friction_nll_loss": 0.4759996831417084, - "policy_nll_loss": 0.5932338833808899, - "rewards/accuracies": 0.9750000238418579, - "rewards/accuracies_friction": 0.949999988079071, - "rewards/chosen": 9.49446257436648e-05, - "rewards/chosen_fricton": 0.00023312153643928468, - "rewards/margins": 0.0004690810455940664, - "rewards/margins_friction": 0.0005263808998279274, - "rewards/rejected": -0.0003741364344023168, - "rewards/rejected_friction": -0.0002932594215963036, - "step": 45 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.03, - "learning_rate": 4.999449876537335e-06, - "logits/chosen": -0.32989493012428284, - "logits/chosen_friction": -0.3369276523590088, - "logits/rejected": -0.33128267526626587, - "logits/rejected_friction": -0.34040239453315735, - "logps/chosen": -0.60160231590271, - "logps/chosen_friction": -0.4819873869419098, - "logps/rejected": -0.644713282585144, - "logps/rejected_friction": -0.5301627516746521, - "loss": 0.2487, - "policy_friction_nll_loss": 0.473734050989151, - "policy_nll_loss": 0.5929361581802368, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": 0.00011222458851989359, - "rewards/chosen_fricton": 0.0002957905817311257, - "rewards/margins": 0.000659335171803832, - "rewards/margins_friction": 0.0007767742499709129, - "rewards/rejected": -0.0005471105105243623, - "rewards/rejected_friction": -0.00048098372644744813, + "learning_rate": 2.5e-06, + "logits/chosen": -0.30438098311424255, + "logits/rejected": -0.30549854040145874, + "logps/chosen": -421.03363037109375, + "logps/rejected": -421.8212890625, + "loss": 0.6898, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 0.03513988479971886, + "rewards/margins": 0.007109012454748154, + "rewards/rejected": 0.028030872344970703, "step": 50 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.03, - "learning_rate": 4.999303756775543e-06, - "logits/chosen": -0.33514538407325745, - "logits/chosen_friction": -0.3435589075088501, - "logits/rejected": -0.33841410279273987, - "logits/rejected_friction": -0.34810203313827515, - "logps/chosen": -0.6034568548202515, - "logps/chosen_friction": -0.48961010575294495, - "logps/rejected": -0.6470072865486145, - "logps/rejected_friction": -0.5397754907608032, - "loss": 0.2484, - "policy_friction_nll_loss": 0.47952526807785034, - "policy_nll_loss": 0.5944103598594666, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": 6.542988558067009e-05, - "rewards/chosen_fricton": 0.0002793499734252691, - "rewards/margins": 0.0007501248037442565, - "rewards/margins_friction": 0.0008952980861067772, - "rewards/rejected": -0.0006846949690952897, - "rewards/rejected_friction": -0.0006159482290968299, - "step": 55 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.03, - "learning_rate": 4.999140449822787e-06, - "logits/chosen": -0.3278871178627014, - "logits/chosen_friction": -0.3354930877685547, - "logits/rejected": -0.3299504518508911, - "logits/rejected_friction": -0.33917486667633057, - "logps/chosen": -0.5952373147010803, - "logps/chosen_friction": -0.4732632637023926, - "logps/rejected": -0.6378481388092041, - "logps/rejected_friction": -0.5252599120140076, - "loss": 0.2479, - "policy_friction_nll_loss": 0.46376147866249084, - "policy_nll_loss": 0.5865526795387268, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -3.0553484975825995e-05, - "rewards/chosen_fricton": 0.00020931563631165773, - "rewards/margins": 0.0009901832090690732, - "rewards/margins_friction": 0.0011314416769891977, - "rewards/rejected": -0.001020736643113196, - "rewards/rejected_friction": -0.0009221258806064725, + "epoch": 0.02, + "learning_rate": 3e-06, + "logits/chosen": -0.30687031149864197, + "logits/rejected": -0.3071025013923645, + "logps/chosen": -417.4591369628906, + "logps/rejected": -417.7974548339844, + "loss": 0.6931, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.014512499794363976, + "rewards/margins": 0.00045255664736032486, + "rewards/rejected": 0.014059944078326225, "step": 60 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.03, - "learning_rate": 4.998959956802094e-06, - "logits/chosen": -0.34334826469421387, - "logits/chosen_friction": -0.35196560621261597, - "logits/rejected": -0.3461974561214447, - "logits/rejected_friction": -0.35653167963027954, - "logps/chosen": -0.6163719892501831, - "logps/chosen_friction": -0.49949321150779724, - "logps/rejected": -0.6602912545204163, - "logps/rejected_friction": -0.5531876683235168, - "loss": 0.2475, - "policy_friction_nll_loss": 0.4937437176704407, - "policy_nll_loss": 0.6102361679077148, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": 2.260349538119044e-05, - "rewards/chosen_fricton": 0.00031058528111316264, - "rewards/margins": 0.0011804368114098907, - "rewards/margins_friction": 0.0013661442790180445, - "rewards/rejected": -0.0011578333796933293, - "rewards/rejected_friction": -0.0010555589105933905, - "step": 65 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.04, - "learning_rate": 4.9987622789546765e-06, - "logits/chosen": -0.3344830274581909, - "logits/chosen_friction": -0.3436603248119354, - "logits/rejected": -0.339668869972229, - "logits/rejected_friction": -0.3503101170063019, - "logps/chosen": -0.5671979188919067, - "logps/chosen_friction": -0.4466860890388489, - "logps/rejected": -0.6132090091705322, - "logps/rejected_friction": -0.5015113949775696, - "loss": 0.2472, - "policy_friction_nll_loss": 0.44329914450645447, - "policy_nll_loss": 0.5640937685966492, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": 4.299484135117382e-05, - "rewards/chosen_fricton": 0.00037319929106161, - "rewards/margins": 0.0012940842425450683, - "rewards/margins_friction": 0.0014758602483198047, - "rewards/rejected": -0.0012510893866419792, - "rewards/rejected_friction": -0.0011026610154658556, + "epoch": 0.02, + "learning_rate": 3.5e-06, + "logits/chosen": -0.30733975768089294, + "logits/rejected": -0.3082950711250305, + "logps/chosen": -422.18487548828125, + "logps/rejected": -422.29052734375, + "loss": 0.6887, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.06789219379425049, + "rewards/margins": 0.00933685339987278, + "rewards/rejected": 0.05855534225702286, "step": 70 }, { - "directrewards_student/accuracies": 0.9750000238418579, - "epoch": 0.04, - "learning_rate": 4.998547417639925e-06, - "logits/chosen": -0.3224430978298187, - "logits/chosen_friction": -0.3318747878074646, - "logits/rejected": -0.3252509534358978, - "logits/rejected_friction": -0.3358073830604553, - "logps/chosen": -0.6398383975028992, - "logps/chosen_friction": -0.5184356570243835, - "logps/rejected": -0.7021744847297668, - "logps/rejected_friction": -0.5895752906799316, - "loss": 0.2461, - "policy_friction_nll_loss": 0.49901336431503296, - "policy_nll_loss": 0.6207466721534729, - "rewards/accuracies": 0.9750000238418579, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.0001823289057938382, - "rewards/chosen_fricton": 0.0002266673545818776, - "rewards/margins": 0.0018745588604360819, - "rewards/margins_friction": 0.002156948670744896, - "rewards/rejected": -0.0020568876061588526, - "rewards/rejected_friction": -0.0019302812870591879, - "step": 75 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.04, - "learning_rate": 4.998315374335394e-06, - "logits/chosen": -0.33648496866226196, - "logits/chosen_friction": -0.34589987993240356, - "logits/rejected": -0.33825454115867615, - "logits/rejected_friction": -0.3481026887893677, - "logps/chosen": -0.6123756170272827, - "logps/chosen_friction": -0.4908915162086487, - "logps/rejected": -0.6575005054473877, - "logps/rejected_friction": -0.5529050827026367, - "loss": 0.246, - "policy_friction_nll_loss": 0.48068150877952576, - "policy_nll_loss": 0.6030642986297607, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.0004563714028336108, - "rewards/chosen_fricton": -4.812577390111983e-05, - "rewards/margins": 0.0019278066465631127, - "rewards/margins_friction": 0.002276803832501173, - "rewards/rejected": -0.0023841778747737408, - "rewards/rejected_friction": -0.0023249296937137842, + "epoch": 0.02, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": -0.309120774269104, + "logits/rejected": -0.3103254437446594, + "logps/chosen": -424.8710021972656, + "logps/rejected": -423.9234924316406, + "loss": 0.6875, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0613434836268425, + "rewards/margins": 0.011882667429745197, + "rewards/rejected": 0.04946080967783928, "step": 80 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.05, - "learning_rate": 4.998066150636798e-06, - "logits/chosen": -0.33869704604148865, - "logits/chosen_friction": -0.3485885560512543, - "logits/rejected": -0.34255295991897583, - "logits/rejected_friction": -0.3524096608161926, - "logps/chosen": -0.5781728029251099, - "logps/chosen_friction": -0.4582653045654297, - "logps/rejected": -0.6287859678268433, - "logps/rejected_friction": -0.5191149711608887, - "loss": 0.2455, - "policy_friction_nll_loss": 0.44995012879371643, - "policy_nll_loss": 0.5705350637435913, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.0004226035380270332, - "rewards/chosen_fricton": -1.4264507626648992e-05, - "rewards/margins": 0.002040986204519868, - "rewards/margins_friction": 0.0023747719824314117, - "rewards/rejected": -0.0024635898880660534, - "rewards/rejected_friction": -0.0023890365846455097, - "step": 85 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.05, - "learning_rate": 4.997799748257998e-06, - "logits/chosen": -0.3416880965232849, - "logits/chosen_friction": -0.3525909185409546, - "logits/rejected": -0.344137579202652, - "logits/rejected_friction": -0.3555726408958435, - "logps/chosen": -0.6055851578712463, - "logps/chosen_friction": -0.4858245849609375, - "logps/rejected": -0.664664089679718, - "logps/rejected_friction": -0.5530509948730469, - "loss": 0.2443, - "policy_friction_nll_loss": 0.47591716051101685, - "policy_nll_loss": 0.5951492786407471, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.0007902191719040275, - "rewards/chosen_fricton": -0.00045988266356289387, - "rewards/margins": 0.0027454805094748735, - "rewards/margins_friction": 0.0030213601421564817, - "rewards/rejected": -0.003535699797794223, - "rewards/rejected_friction": -0.0034812428057193756, + "epoch": 0.03, + "learning_rate": 4.5e-06, + "logits/chosen": -0.3092747628688812, + "logits/rejected": -0.3102528750896454, + "logps/chosen": -417.56097412109375, + "logps/rejected": -420.48541259765625, + "loss": 0.6882, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.1002880111336708, + "rewards/margins": 0.010797671973705292, + "rewards/rejected": 0.08949033915996552, "step": 90 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.05, - "learning_rate": 4.997516169030985e-06, - "logits/chosen": -0.3439766764640808, - "logits/chosen_friction": -0.35428574681282043, - "logits/rejected": -0.3487510681152344, - "logits/rejected_friction": -0.3592156767845154, - "logps/chosen": -0.6095544695854187, - "logps/chosen_friction": -0.4840007722377777, - "logps/rejected": -0.6753285527229309, - "logps/rejected_friction": -0.559005856513977, - "loss": 0.2437, - "policy_friction_nll_loss": 0.47107991576194763, - "policy_nll_loss": 0.596255898475647, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.0008300847257487476, - "rewards/chosen_fricton": -0.0002638939185999334, - "rewards/margins": 0.0028340143617242575, - "rewards/margins_friction": 0.0032382875215262175, - "rewards/rejected": -0.003664099145680666, - "rewards/rejected_friction": -0.0035021814983338118, - "step": 95 - }, - { - "directrewards_student/accuracies": 0.9750000238418579, - "epoch": 0.05, - "learning_rate": 4.997215414905875e-06, - "logits/chosen": -0.35009559988975525, - "logits/chosen_friction": -0.3609665334224701, - "logits/rejected": -0.35199007391929626, - "logits/rejected_friction": -0.3622797131538391, - "logps/chosen": -0.6088067293167114, - "logps/chosen_friction": -0.4862481653690338, - "logps/rejected": -0.6731345653533936, - "logps/rejected_friction": -0.5614002346992493, - "loss": 0.2433, - "policy_friction_nll_loss": 0.47697192430496216, - "policy_nll_loss": 0.6005781888961792, - "rewards/accuracies": 0.9750000238418579, - "rewards/accuracies_friction": 0.949999988079071, - "rewards/chosen": -0.0014180821599438787, - "rewards/chosen_fricton": -0.0008968674810603261, - "rewards/margins": 0.002989314030855894, - "rewards/margins_friction": 0.003384134965017438, - "rewards/rejected": -0.004407396074384451, - "rewards/rejected_friction": -0.0042810020968317986, + "epoch": 0.03, + "learning_rate": 5e-06, + "logits/chosen": -0.3046155571937561, + "logits/rejected": -0.3053414225578308, + "logps/chosen": -417.95501708984375, + "logps/rejected": -416.2376403808594, + "loss": 0.6748, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.1339004933834076, + "rewards/margins": 0.03802730515599251, + "rewards/rejected": 0.09587319195270538, "step": 100 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.06, - "learning_rate": 4.996897487950892e-06, - "logits/chosen": -0.35082730650901794, - "logits/chosen_friction": -0.3619600534439087, - "logits/rejected": -0.3528831899166107, - "logits/rejected_friction": -0.363946795463562, - "logps/chosen": -0.5796958208084106, - "logps/chosen_friction": -0.45677414536476135, - "logps/rejected": -0.6409854888916016, - "logps/rejected_friction": -0.5271649360656738, - "loss": 0.2424, - "policy_friction_nll_loss": 0.45154300332069397, - "policy_nll_loss": 0.5744017362594604, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.0016038784524425864, - "rewards/chosen_fricton": -0.001137354294769466, - "rewards/margins": 0.0035149212926626205, - "rewards/margins_friction": 0.0038368888199329376, - "rewards/rejected": -0.005118799861520529, - "rewards/rejected_friction": -0.004974243231117725, - "step": 105 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.06, - "learning_rate": 4.996562390352354e-06, - "logits/chosen": -0.3437003493309021, - "logits/chosen_friction": -0.35601335763931274, - "logits/rejected": -0.34960511326789856, - "logits/rejected_friction": -0.3616386353969574, - "logps/chosen": -0.648828387260437, - "logps/chosen_friction": -0.5261073112487793, - "logps/rejected": -0.7201586961746216, - "logps/rejected_friction": -0.6042051315307617, - "loss": 0.2416, - "policy_friction_nll_loss": 0.5101394057273865, - "policy_nll_loss": 0.6333123445510864, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.001830583205446601, - "rewards/chosen_fricton": -0.0010888322722166777, - "rewards/margins": 0.004135648254305124, - "rewards/margins_friction": 0.004670650232583284, - "rewards/rejected": -0.0059662312269210815, - "rewards/rejected_friction": -0.005759482271969318, + "epoch": 0.03, + "learning_rate": 4.999853306957783e-06, + "logits/chosen": -0.3040740489959717, + "logits/rejected": -0.30468136072158813, + "logps/chosen": -416.46527099609375, + "logps/rejected": -415.51568603515625, + "loss": 0.6714, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.13915565609931946, + "rewards/margins": 0.045606400817632675, + "rewards/rejected": 0.09354925900697708, "step": 110 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.06, - "learning_rate": 4.996210124414656e-06, - "logits/chosen": -0.3412129580974579, - "logits/chosen_friction": -0.35247960686683655, - "logits/rejected": -0.3440452218055725, - "logits/rejected_friction": -0.355155348777771, - "logps/chosen": -0.6148606538772583, - "logps/chosen_friction": -0.48786211013793945, - "logps/rejected": -0.6944883465766907, - "logps/rejected_friction": -0.5832083821296692, - "loss": 0.2399, - "policy_friction_nll_loss": 0.47236037254333496, - "policy_nll_loss": 0.5987261533737183, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.002373677445575595, - "rewards/chosen_fricton": -0.0016889851540327072, - "rewards/margins": 0.004747648723423481, - "rewards/margins_friction": 0.005473857279866934, - "rewards/rejected": -0.007121325936168432, - "rewards/rejected_friction": -0.007162842899560928, - "step": 115 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.06, - "learning_rate": 4.9958406925602576e-06, - "logits/chosen": -0.36241182684898376, - "logits/chosen_friction": -0.3731611371040344, - "logits/rejected": -0.3646659255027771, - "logits/rejected_friction": -0.37511172890663147, - "logps/chosen": -0.6370089650154114, - "logps/chosen_friction": -0.5052780508995056, - "logps/rejected": -0.7167387008666992, - "logps/rejected_friction": -0.6017129421234131, - "loss": 0.2392, - "policy_friction_nll_loss": 0.4941805303096771, - "policy_nll_loss": 0.6254404187202454, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.0027572764083743095, - "rewards/chosen_fricton": -0.0017936134245246649, - "rewards/margins": 0.0050307405181229115, - "rewards/margins_friction": 0.005988875404000282, - "rewards/rejected": -0.007788016460835934, - "rewards/rejected_friction": -0.007782489061355591, + "epoch": 0.04, + "learning_rate": 4.99941324504621e-06, + "logits/chosen": -0.3062252104282379, + "logits/rejected": -0.30699923634529114, + "logps/chosen": -423.4345703125, + "logps/rejected": -421.33477783203125, + "loss": 0.6681, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.25515347719192505, + "rewards/margins": 0.05361776426434517, + "rewards/rejected": 0.20153570175170898, "step": 120 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.07, - "learning_rate": 4.995454097329666e-06, - "logits/chosen": -0.3570749759674072, - "logits/chosen_friction": -0.36768627166748047, - "logits/rejected": -0.3609080910682678, - "logits/rejected_friction": -0.37130722403526306, - "logps/chosen": -0.62144935131073, - "logps/chosen_friction": -0.5025854706764221, - "logps/rejected": -0.7100092768669128, - "logps/rejected_friction": -0.6085233688354492, - "loss": 0.237, - "policy_friction_nll_loss": 0.4860617220401764, - "policy_nll_loss": 0.6059733629226685, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.003347421530634165, - "rewards/chosen_fricton": -0.0026068980805575848, - "rewards/margins": 0.006213229149580002, - "rewards/margins_friction": 0.007079815957695246, - "rewards/rejected": -0.00956065021455288, - "rewards/rejected_friction": -0.00968671403825283, - "step": 125 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.07, - "learning_rate": 4.995050341381415e-06, - "logits/chosen": -0.3688618838787079, - "logits/chosen_friction": -0.38023677468299866, - "logits/rejected": -0.3727550208568573, - "logits/rejected_friction": -0.38367152214050293, - "logps/chosen": -0.5879384279251099, - "logps/chosen_friction": -0.4676678776741028, - "logps/rejected": -0.6895111799240112, - "logps/rejected_friction": -0.5875577926635742, - "loss": 0.2353, - "policy_friction_nll_loss": 0.4612109661102295, - "policy_nll_loss": 0.5806938409805298, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 0.9750000238418579, - "rewards/chosen": -0.003490444738417864, - "rewards/chosen_fricton": -0.0025712628848850727, - "rewards/margins": 0.0065823085606098175, - "rewards/margins_friction": 0.007413939572870731, - "rewards/rejected": -0.010072752833366394, - "rewards/rejected_friction": -0.009985201060771942, + "epoch": 0.04, + "learning_rate": 4.998679865908499e-06, + "logits/chosen": -0.3025161623954773, + "logits/rejected": -0.30388832092285156, + "logps/chosen": -421.076416015625, + "logps/rejected": -419.70428466796875, + "loss": 0.6432, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.36134886741638184, + "rewards/margins": 0.10863993316888809, + "rewards/rejected": 0.25270897150039673, "step": 130 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.07, - "learning_rate": 4.99462942749205e-06, - "logits/chosen": -0.3738933205604553, - "logits/chosen_friction": -0.38529112935066223, - "logits/rejected": -0.37752649188041687, - "logits/rejected_friction": -0.3880864977836609, - "logps/chosen": -0.6596763134002686, - "logps/chosen_friction": -0.5253047347068787, - "logps/rejected": -0.7767431139945984, - "logps/rejected_friction": -0.6602280735969543, - "loss": 0.2352, - "policy_friction_nll_loss": 0.5175390839576721, - "policy_nll_loss": 0.6515557765960693, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.004567156080156565, - "rewards/chosen_fricton": -0.003076119814068079, - "rewards/margins": 0.007207978516817093, - "rewards/margins_friction": 0.008442142978310585, - "rewards/rejected": -0.011775135062634945, - "rewards/rejected_friction": -0.011518261395394802, - "step": 135 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.07, - "learning_rate": 4.9941913585561065e-06, - "logits/chosen": -0.368344247341156, - "logits/chosen_friction": -0.3772405982017517, - "logits/rejected": -0.37246227264404297, - "logits/rejected_friction": -0.38048383593559265, - "logps/chosen": -0.6823196411132812, - "logps/chosen_friction": -0.5456042885780334, - "logps/rejected": -0.8101092576980591, - "logps/rejected_friction": -0.6911510825157166, - "loss": 0.233, - "policy_friction_nll_loss": 0.5293003916740417, - "policy_nll_loss": 0.6656594276428223, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.005590664688497782, - "rewards/chosen_fricton": -0.004015929065644741, - "rewards/margins": 0.008802399970591068, - "rewards/margins_friction": 0.01024430152028799, - "rewards/rejected": -0.014393064193427563, - "rewards/rejected_friction": -0.014260229654610157, + "epoch": 0.04, + "learning_rate": 4.9976532556099425e-06, + "logits/chosen": -0.29753798246383667, + "logits/rejected": -0.2986024022102356, + "logps/chosen": -423.3164978027344, + "logps/rejected": -420.72918701171875, + "loss": 0.632, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.45896610617637634, + "rewards/margins": 0.13753186166286469, + "rewards/rejected": 0.32143422961235046, "step": 140 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.08, - "learning_rate": 4.993736137586096e-06, - "logits/chosen": -0.3835095167160034, - "logits/chosen_friction": -0.3920130729675293, - "logits/rejected": -0.38750743865966797, - "logits/rejected_friction": -0.3949539363384247, - "logps/chosen": -0.6395205855369568, - "logps/chosen_friction": -0.5071371793746948, - "logps/rejected": -0.7672320008277893, - "logps/rejected_friction": -0.6599465012550354, - "loss": 0.2303, - "policy_friction_nll_loss": 0.49227601289749146, - "policy_nll_loss": 0.6246052980422974, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.006210241466760635, - "rewards/chosen_fricton": -0.004715236369520426, - "rewards/margins": 0.00946205947548151, - "rewards/margins_friction": 0.011043312028050423, - "rewards/rejected": -0.01567230001091957, - "rewards/rejected_friction": -0.01575854793190956, - "step": 145 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.08, - "learning_rate": 4.993263767712478e-06, - "logits/chosen": -0.38702213764190674, - "logits/chosen_friction": -0.39643916487693787, - "logits/rejected": -0.3921067714691162, - "logits/rejected_friction": -0.4005972743034363, - "logps/chosen": -0.6607063412666321, - "logps/chosen_friction": -0.518939197063446, - "logps/rejected": -0.7928908467292786, - "logps/rejected_friction": -0.6746068000793457, - "loss": 0.2303, - "policy_friction_nll_loss": 0.5094133615493774, - "policy_nll_loss": 0.6507609486579895, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.007442126981914043, - "rewards/chosen_fricton": -0.005324510857462883, - "rewards/margins": 0.009219849482178688, - "rewards/margins_friction": 0.011021386831998825, - "rewards/rejected": -0.016661977395415306, - "rewards/rejected_friction": -0.016345897689461708, + "epoch": 0.05, + "learning_rate": 4.99633353462781e-06, + "logits/chosen": -0.300027072429657, + "logits/rejected": -0.3015795648097992, + "logps/chosen": -413.91973876953125, + "logps/rejected": -415.4903869628906, + "loss": 0.6428, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.47876471281051636, + "rewards/margins": 0.11648330837488174, + "rewards/rejected": 0.3622814118862152, "step": 150 }, { - "directrewards_student/accuracies": 0.9750000238418579, - "epoch": 0.08, - "learning_rate": 4.99277425218364e-06, - "logits/chosen": -0.3967662453651428, - "logits/chosen_friction": -0.4063017964363098, - "logits/rejected": -0.4008919298648834, - "logits/rejected_friction": -0.4088623523712158, - "logps/chosen": -0.7018299102783203, - "logps/chosen_friction": -0.5560966730117798, - "logps/rejected": -0.8417743444442749, - "logps/rejected_friction": -0.7252101898193359, - "loss": 0.2272, - "policy_friction_nll_loss": 0.5466572046279907, - "policy_nll_loss": 0.6929546594619751, - "rewards/accuracies": 0.9750000238418579, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.008221302181482315, - "rewards/chosen_fricton": -0.005651370622217655, - "rewards/margins": 0.01100502721965313, - "rewards/margins_friction": 0.013441326096653938, - "rewards/rejected": -0.019226327538490295, - "rewards/rejected_friction": -0.019092697650194168, - "step": 155 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.09, - "learning_rate": 4.99226759436588e-06, - "logits/chosen": -0.39735832810401917, - "logits/chosen_friction": -0.40663081407546997, - "logits/rejected": -0.4018126130104065, - "logits/rejected_friction": -0.4097481667995453, - "logps/chosen": -0.6642957925796509, - "logps/chosen_friction": -0.5099631547927856, - "logps/rejected": -0.8251299858093262, - "logps/rejected_friction": -0.7092744708061218, - "loss": 0.2249, - "policy_friction_nll_loss": 0.5002068281173706, - "policy_nll_loss": 0.6525818705558777, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.008269032463431358, - "rewards/chosen_fricton": -0.004728867672383785, - "rewards/margins": 0.011942217126488686, - "rewards/margins_friction": 0.014678199775516987, - "rewards/rejected": -0.020211249589920044, - "rewards/rejected_friction": -0.019407067447900772, + "epoch": 0.05, + "learning_rate": 4.994720857837211e-06, + "logits/chosen": -0.3021107316017151, + "logits/rejected": -0.30334895849227905, + "logps/chosen": -419.66571044921875, + "logps/rejected": -420.95068359375, + "loss": 0.623, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.5215948820114136, + "rewards/margins": 0.1642296016216278, + "rewards/rejected": 0.35736531019210815, "step": 160 }, { - "directrewards_student/accuracies": 0.9750000238418579, - "epoch": 0.09, - "learning_rate": 4.991743797743377e-06, - "logits/chosen": -0.38976359367370605, - "logits/chosen_friction": -0.3999491035938263, - "logits/rejected": -0.39475566148757935, - "logits/rejected_friction": -0.4035674035549164, - "logps/chosen": -0.6658697128295898, - "logps/chosen_friction": -0.5152384042739868, - "logps/rejected": -0.8189758062362671, - "logps/rejected_friction": -0.701639711856842, - "loss": 0.2231, - "policy_friction_nll_loss": 0.5042110681533813, - "policy_nll_loss": 0.6530486345291138, - "rewards/accuracies": 0.9750000238418579, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.010016108863055706, - "rewards/chosen_fricton": -0.006795505993068218, - "rewards/margins": 0.011944970116019249, - "rewards/margins_friction": 0.014278659597039223, - "rewards/rejected": -0.02196108177304268, - "rewards/rejected_friction": -0.021074164658784866, - "step": 165 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.09, - "learning_rate": 4.991202865918171e-06, - "logits/chosen": -0.40273386240005493, - "logits/chosen_friction": -0.4123965799808502, - "logits/rejected": -0.4097481667995453, - "logits/rejected_friction": -0.4176662862300873, - "logps/chosen": -0.7326638698577881, - "logps/chosen_friction": -0.5706255435943604, - "logps/rejected": -0.9053562879562378, - "logps/rejected_friction": -0.7790883779525757, - "loss": 0.2215, - "policy_friction_nll_loss": 0.564799427986145, - "policy_nll_loss": 0.726116418838501, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.011555509641766548, - "rewards/chosen_fricton": -0.0077553391456604, - "rewards/margins": 0.013978390023112297, - "rewards/margins_friction": 0.016680995002388954, - "rewards/rejected": -0.025533899664878845, - "rewards/rejected_friction": -0.024436334148049355, + "epoch": 0.05, + "learning_rate": 4.992815414492917e-06, + "logits/chosen": -0.29045212268829346, + "logits/rejected": -0.29103735089302063, + "logps/chosen": -411.07635498046875, + "logps/rejected": -411.93463134765625, + "loss": 0.6303, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.5044211149215698, + "rewards/margins": 0.16038301587104797, + "rewards/rejected": 0.34403812885284424, "step": 170 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.09, - "learning_rate": 4.990644802610138e-06, - "logits/chosen": -0.40903764963150024, - "logits/chosen_friction": -0.41738295555114746, - "logits/rejected": -0.4147789478302002, - "logits/rejected_friction": -0.4218331277370453, - "logps/chosen": -0.7045283317565918, - "logps/chosen_friction": -0.5503469705581665, - "logps/rejected": -0.8826897740364075, - "logps/rejected_friction": -0.7644280195236206, - "loss": 0.218, - "policy_friction_nll_loss": 0.5431884527206421, - "policy_nll_loss": 0.69652259349823, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.013016602024435997, - "rewards/chosen_fricton": -0.009027759544551373, - "rewards/margins": 0.013800512067973614, - "rewards/margins_friction": 0.017072398215532303, - "rewards/rejected": -0.026817116886377335, - "rewards/rejected_friction": -0.02610015869140625, - "step": 175 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.1, - "learning_rate": 4.990069611656963e-06, - "logits/chosen": -0.42261362075805664, - "logits/chosen_friction": -0.42979201674461365, - "logits/rejected": -0.42727217078208923, - "logits/rejected_friction": -0.43392863869667053, - "logps/chosen": -0.7374964952468872, - "logps/chosen_friction": -0.5757134556770325, - "logps/rejected": -0.9290131330490112, - "logps/rejected_friction": -0.8090761303901672, - "loss": 0.2161, - "policy_friction_nll_loss": 0.5696359872817993, - "policy_nll_loss": 0.7302685379981995, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.015160107985138893, - "rewards/chosen_fricton": -0.010406826622784138, - "rewards/margins": 0.01590946689248085, - "rewards/margins_friction": 0.019323986023664474, - "rewards/rejected": -0.031069573014974594, - "rewards/rejected_friction": -0.029730811715126038, + "epoch": 0.05, + "learning_rate": 4.990617428207153e-06, + "logits/chosen": -0.29839888215065, + "logits/rejected": -0.29893797636032104, + "logps/chosen": -430.1136169433594, + "logps/rejected": -428.3583984375, + "loss": 0.6029, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.5385109782218933, + "rewards/margins": 0.22802197933197021, + "rewards/rejected": 0.3104889690876007, "step": 180 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.1, - "learning_rate": 4.9894772970141145e-06, - "logits/chosen": -0.4141605496406555, - "logits/chosen_friction": -0.41954874992370605, - "logits/rejected": -0.4202291965484619, - "logits/rejected_friction": -0.424614816904068, - "logps/chosen": -0.7721706628799438, - "logps/chosen_friction": -0.6033174395561218, - "logps/rejected": -0.9761008024215698, - "logps/rejected_friction": -0.8593313097953796, - "loss": 0.2128, - "policy_friction_nll_loss": 0.5907949209213257, - "policy_nll_loss": 0.7611840963363647, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.016601013019680977, - "rewards/chosen_fricton": -0.011196363717317581, - "rewards/margins": 0.017585884779691696, - "rewards/margins_friction": 0.022283922880887985, - "rewards/rejected": -0.034186895936727524, - "rewards/rejected_friction": -0.033480290323495865, - "step": 185 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.1, - "learning_rate": 4.988867862754816e-06, - "logits/chosen": -0.41763535141944885, - "logits/chosen_friction": -0.42315587401390076, - "logits/rejected": -0.42337122559547424, - "logits/rejected_friction": -0.42775505781173706, - "logps/chosen": -0.7826138138771057, - "logps/chosen_friction": -0.6025420427322388, - "logps/rejected": -1.0235003232955933, - "logps/rejected_friction": -0.9037812352180481, - "loss": 0.2079, - "policy_friction_nll_loss": 0.5889862179756165, - "policy_nll_loss": 0.769546627998352, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.018881764262914658, - "rewards/chosen_fricton": -0.012712238356471062, - "rewards/margins": 0.020308542996644974, - "rewards/margins_friction": 0.025638053193688393, - "rewards/rejected": -0.03919030725955963, - "rewards/rejected_friction": -0.038350291550159454, + "epoch": 0.06, + "learning_rate": 4.988127156923355e-06, + "logits/chosen": -0.2956782281398773, + "logits/rejected": -0.2963833212852478, + "logps/chosen": -415.17071533203125, + "logps/rejected": -414.58148193359375, + "loss": 0.6078, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.4606494903564453, + "rewards/margins": 0.22775804996490479, + "rewards/rejected": 0.23289147019386292, "step": 190 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.1, - "learning_rate": 4.988241313070021e-06, - "logits/chosen": -0.4240478575229645, - "logits/chosen_friction": -0.4286819398403168, - "logits/rejected": -0.4291207790374756, - "logits/rejected_friction": -0.4324862062931061, - "logps/chosen": -0.8318597078323364, - "logps/chosen_friction": -0.6378362774848938, - "logps/rejected": -1.0774534940719604, - "logps/rejected_friction": -0.955168604850769, - "loss": 0.2053, - "policy_friction_nll_loss": 0.6262146234512329, - "policy_nll_loss": 0.820708155632019, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.0212906114757061, - "rewards/chosen_fricton": -0.014426725916564465, - "rewards/margins": 0.021477866917848587, - "rewards/margins_friction": 0.027774566784501076, - "rewards/rejected": -0.04276847466826439, - "rewards/rejected_friction": -0.042201291769742966, - "step": 195 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.11, - "learning_rate": 4.98759765226838e-06, - "logits/chosen": -0.4178541302680969, - "logits/chosen_friction": -0.42234906554222107, - "logits/rejected": -0.42459768056869507, - "logits/rejected_friction": -0.4283132553100586, - "logps/chosen": -0.7865546345710754, - "logps/chosen_friction": -0.6109766960144043, - "logps/rejected": -1.0411224365234375, - "logps/rejected_friction": -0.9348773956298828, - "loss": 0.2012, - "policy_friction_nll_loss": 0.5941272377967834, - "policy_nll_loss": 0.7690792083740234, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.02233363687992096, - "rewards/chosen_fricton": -0.015303623862564564, - "rewards/margins": 0.021791942417621613, - "rewards/margins_friction": 0.028036698698997498, - "rewards/rejected": -0.04412557929754257, - "rewards/rejected_friction": -0.04334031790494919, + "epoch": 0.06, + "learning_rate": 4.985344892885899e-06, + "logits/chosen": -0.29678258299827576, + "logits/rejected": -0.2977609634399414, + "logps/chosen": -416.76275634765625, + "logps/rejected": -419.5223693847656, + "loss": 0.5821, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.44212013483047485, + "rewards/margins": 0.29724568128585815, + "rewards/rejected": 0.1448744386434555, "step": 200 }, { - "epoch": 0.11, - "eval_directrewards_student/accuracies": 0.9959999918937683, - "eval_logits/chosen": -0.4694657325744629, - "eval_logits/chosen_friction": -0.4735925793647766, - "eval_logits/rejected": -0.4758722484111786, - "eval_logits/rejected_friction": -0.47876453399658203, - "eval_logps/chosen": -0.8564494848251343, - "eval_logps/chosen_friction": -0.6548765897750854, - "eval_logps/rejected": -1.1303471326828003, - "eval_logps/rejected_friction": -1.0138707160949707, - "eval_loss": 0.19786414504051208, - "eval_policy_friction_nll_loss": 0.6548765897750854, - "eval_policy_nll_loss": 0.8564494848251343, - "eval_rewards/accuracies": 0.9959999918937683, - "eval_rewards/accuracies_friction": 0.9800000190734863, - "eval_rewards/chosen": -0.025702781975269318, - "eval_rewards/chosen_fricton": -0.01721893809735775, - "eval_rewards/margins": 0.023911969736218452, - "eval_rewards/margins_friction": 0.031752247363328934, - "eval_rewards/rejected": -0.04961474612355232, - "eval_rewards/rejected_friction": -0.04897118732333183, - "eval_runtime": 551.0568, - "eval_samples_per_second": 0.907, - "eval_steps_per_second": 0.454, + "epoch": 0.06, + "eval_logits/chosen": -0.35281771421432495, + "eval_logits/rejected": -0.35360345244407654, + "eval_logps/chosen": -408.5499267578125, + "eval_logps/rejected": -409.8388977050781, + "eval_loss": 0.5728641152381897, + "eval_rewards/accuracies": 0.7260000109672546, + "eval_rewards/chosen": 0.43412691354751587, + "eval_rewards/margins": 0.3201069235801697, + "eval_rewards/rejected": 0.11402001231908798, + "eval_runtime": 351.7745, + "eval_samples_per_second": 1.421, + "eval_steps_per_second": 1.421, "step": 200 }, { - "directrewards_student/accuracies": 0.9750000238418579, - "epoch": 0.11, - "learning_rate": 4.986936884776215e-06, - "logits/chosen": -0.4296306073665619, - "logits/chosen_friction": -0.4341045320034027, - "logits/rejected": -0.4361076354980469, - "logits/rejected_friction": -0.4398559629917145, - "logps/chosen": -0.8858259320259094, - "logps/chosen_friction": -0.6715798377990723, - "logps/rejected": -1.1616358757019043, - "logps/rejected_friction": -1.0395201444625854, - "loss": 0.1964, - "policy_friction_nll_loss": 0.6609475612640381, - "policy_nll_loss": 0.8756254315376282, - "rewards/accuracies": 0.9750000238418579, - "rewards/accuracies_friction": 0.9750000238418579, - "rewards/chosen": -0.027454610913991928, - "rewards/chosen_fricton": -0.01796814054250717, - "rewards/margins": 0.023924008011817932, - "rewards/margins_friction": 0.03221968188881874, - "rewards/rejected": -0.05137861892580986, - "rewards/rejected_friction": -0.05018782615661621, - "step": 205 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.11, - "learning_rate": 4.986259015137485e-06, - "logits/chosen": -0.4449669420719147, - "logits/chosen_friction": -0.446676105260849, - "logits/rejected": -0.45134392380714417, - "logits/rejected_friction": -0.45192980766296387, - "logps/chosen": -0.9017565846443176, - "logps/chosen_friction": -0.6855921745300293, - "logps/rejected": -1.1791397333145142, - "logps/rejected_friction": -1.0634921789169312, - "loss": 0.1948, - "policy_friction_nll_loss": 0.6774778366088867, - "policy_nll_loss": 0.8938648104667664, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.030214663594961166, - "rewards/chosen_fricton": -0.020085137337446213, - "rewards/margins": 0.025329966098070145, - "rewards/margins_friction": 0.03494200110435486, - "rewards/rejected": -0.05554462596774101, - "rewards/rejected_friction": -0.05502713844180107, + "epoch": 0.06, + "learning_rate": 4.9822709626058065e-06, + "logits/chosen": -0.29128286242485046, + "logits/rejected": -0.2920396327972412, + "logps/chosen": -416.55322265625, + "logps/rejected": -417.397216796875, + "loss": 0.5743, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.42414647340774536, + "rewards/margins": 0.3180859684944153, + "rewards/rejected": 0.1060604602098465, "step": 210 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.11, - "learning_rate": 4.985564048013761e-06, - "logits/chosen": -0.4312487244606018, - "logits/chosen_friction": -0.43202877044677734, - "logits/rejected": -0.43727749586105347, - "logits/rejected_friction": -0.43759140372276306, - "logps/chosen": -0.9737440943717957, - "logps/chosen_friction": -0.7406086921691895, - "logps/rejected": -1.2550621032714844, - "logps/rejected_friction": -1.1227493286132812, - "loss": 0.1887, - "policy_friction_nll_loss": 0.7196871638298035, - "policy_nll_loss": 0.9533294439315796, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.03457275778055191, - "rewards/chosen_fricton": -0.02362975850701332, - "rewards/margins": 0.025296147912740707, - "rewards/margins_friction": 0.03458528593182564, - "rewards/rejected": -0.059868909418582916, - "rewards/rejected_friction": -0.05821504443883896, - "step": 215 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.12, - "learning_rate": 4.984851988184184e-06, - "logits/chosen": -0.4429115653038025, - "logits/chosen_friction": -0.44250816106796265, - "logits/rejected": -0.4472218453884125, - "logits/rejected_friction": -0.44656485319137573, - "logps/chosen": -1.0061335563659668, - "logps/chosen_friction": -0.7516798973083496, - "logps/rejected": -1.320921778678894, - "logps/rejected_friction": -1.2011637687683105, - "loss": 0.1801, - "policy_friction_nll_loss": 0.7428168058395386, - "policy_nll_loss": 0.9964038133621216, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.03765086457133293, - "rewards/chosen_fricton": -0.024546844884753227, - "rewards/margins": 0.02950698509812355, - "rewards/margins_friction": 0.042336251586675644, - "rewards/rejected": -0.06715784966945648, - "rewards/rejected_friction": -0.06688309460878372, + "epoch": 0.07, + "learning_rate": 4.978905726822424e-06, + "logits/chosen": -0.29205116629600525, + "logits/rejected": -0.2932327687740326, + "logps/chosen": -429.031005859375, + "logps/rejected": -432.4542541503906, + "loss": 0.5944, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.34927603602409363, + "rewards/margins": 0.2745763659477234, + "rewards/rejected": 0.07469968497753143, "step": 220 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.12, - "learning_rate": 4.984122840545444e-06, - "logits/chosen": -0.4324073791503906, - "logits/chosen_friction": -0.4317920207977295, - "logits/rejected": -0.4399832785129547, - "logits/rejected_friction": -0.439213365316391, - "logps/chosen": -0.9478651285171509, - "logps/chosen_friction": -0.6928271055221558, - "logps/rejected": -1.2944138050079346, - "logps/rejected_friction": -1.1915733814239502, - "loss": 0.1775, - "policy_friction_nll_loss": 0.6737472414970398, - "policy_nll_loss": 0.9282517433166504, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.03653339296579361, - "rewards/chosen_fricton": -0.022479115054011345, - "rewards/margins": 0.03136512637138367, - "rewards/margins_friction": 0.04562152549624443, - "rewards/rejected": -0.06789851933717728, - "rewards/rejected_friction": -0.06810064613819122, - "step": 225 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.12, - "learning_rate": 4.983376610111733e-06, - "logits/chosen": -0.45605674386024475, - "logits/chosen_friction": -0.4536314010620117, - "logits/rejected": -0.4615457057952881, - "logits/rejected_friction": -0.45918139815330505, - "logps/chosen": -0.9893831014633179, - "logps/chosen_friction": -0.713188648223877, - "logps/rejected": -1.3852999210357666, - "logps/rejected_friction": -1.2636282444000244, - "loss": 0.1716, - "policy_friction_nll_loss": 0.6996903419494629, - "policy_nll_loss": 0.9757669568061829, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.040653884410858154, - "rewards/chosen_fricton": -0.025614067912101746, - "rewards/margins": 0.03576738387346268, - "rewards/margins_friction": 0.050214141607284546, - "rewards/rejected": -0.07642126083374023, - "rewards/rejected_friction": -0.07582820951938629, + "epoch": 0.07, + "learning_rate": 4.975249580461092e-06, + "logits/chosen": -0.29278379678726196, + "logits/rejected": -0.29318395256996155, + "logps/chosen": -415.50640869140625, + "logps/rejected": -414.65631103515625, + "loss": 0.6108, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.20576027035713196, + "rewards/margins": 0.24422487616539001, + "rewards/rejected": -0.038464583456516266, "step": 230 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.13, - "learning_rate": 4.982613302014726e-06, - "logits/chosen": -0.4438694417476654, - "logits/chosen_friction": -0.4402969479560852, - "logits/rejected": -0.44896000623703003, - "logits/rejected_friction": -0.4451751112937927, - "logps/chosen": -1.0327517986297607, - "logps/chosen_friction": -0.7390481233596802, - "logps/rejected": -1.438905954360962, - "logps/rejected_friction": -1.3272321224212646, - "loss": 0.1624, - "policy_friction_nll_loss": 0.7226125001907349, - "policy_nll_loss": 1.0154742002487183, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.044286929070949554, - "rewards/chosen_fricton": -0.02610587514936924, - "rewards/margins": 0.03707171231508255, - "rewards/margins_friction": 0.05487145110964775, - "rewards/rejected": -0.08135863393545151, - "rewards/rejected_friction": -0.08097732812166214, - "step": 235 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.13, - "learning_rate": 4.981832921503533e-06, - "logits/chosen": -0.46316203474998474, - "logits/chosen_friction": -0.4580881595611572, - "logits/rejected": -0.4682955741882324, - "logits/rejected_friction": -0.4623699188232422, - "logps/chosen": -1.0788553953170776, - "logps/chosen_friction": -0.7713611125946045, - "logps/rejected": -1.5487130880355835, - "logps/rejected_friction": -1.4329941272735596, - "loss": 0.1537, - "policy_friction_nll_loss": 0.7613205909729004, - "policy_nll_loss": 1.0648301839828491, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.048601433634757996, - "rewards/chosen_fricton": -0.029649490490555763, - "rewards/margins": 0.042994752526283264, - "rewards/margins_friction": 0.0615568645298481, - "rewards/rejected": -0.09159617871046066, - "rewards/rejected_friction": -0.09120635688304901, + "epoch": 0.07, + "learning_rate": 4.971302952586796e-06, + "logits/chosen": -0.2884067893028259, + "logits/rejected": -0.2890322208404541, + "logps/chosen": -411.9427795410156, + "logps/rejected": -418.1693420410156, + "loss": 0.553, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": 0.25125259160995483, + "rewards/margins": 0.39369240403175354, + "rewards/rejected": -0.1424398422241211, "step": 240 }, { - "directrewards_student/accuracies": 0.9750000238418579, - "epoch": 0.13, - "learning_rate": 4.981035473944667e-06, - "logits/chosen": -0.4645443558692932, - "logits/chosen_friction": -0.4591597616672516, - "logits/rejected": -0.46980157494544983, - "logits/rejected_friction": -0.4625352919101715, - "logps/chosen": -1.1394163370132446, - "logps/chosen_friction": -0.7471845149993896, - "logps/rejected": -1.6505944728851318, - "logps/rejected_friction": -1.4999287128448486, - "loss": 0.1534, - "policy_friction_nll_loss": 0.7363342046737671, - "policy_nll_loss": 1.1274616718292236, - "rewards/accuracies": 0.9750000238418579, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.052766840904951096, - "rewards/chosen_fricton": -0.026315325871109962, - "rewards/margins": 0.04792843014001846, - "rewards/margins_friction": 0.0711032822728157, - "rewards/rejected": -0.10069527477025986, - "rewards/rejected_friction": -0.09741861373186111, - "step": 245 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.13, - "learning_rate": 4.980220964822009e-06, - "logits/chosen": -0.4562797546386719, - "logits/chosen_friction": -0.45188841223716736, - "logits/rejected": -0.4612824022769928, - "logits/rejected_friction": -0.45525676012039185, - "logps/chosen": -1.1963955163955688, - "logps/chosen_friction": -0.8126664161682129, - "logps/rejected": -1.745265245437622, - "logps/rejected_friction": -1.5999141931533813, - "loss": 0.1431, - "policy_friction_nll_loss": 0.7960628867149353, - "policy_nll_loss": 1.1753641366958618, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.05832027643918991, - "rewards/chosen_fricton": -0.03138437122106552, - "rewards/margins": 0.050876159220933914, - "rewards/margins_friction": 0.07390347123146057, - "rewards/rejected": -0.10919643938541412, - "rewards/rejected_friction": -0.10528784990310669, + "epoch": 0.08, + "learning_rate": 4.967066306353816e-06, + "logits/chosen": -0.28915414214134216, + "logits/rejected": -0.29073747992515564, + "logps/chosen": -417.0771484375, + "logps/rejected": -419.65380859375, + "loss": 0.5598, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.25535959005355835, + "rewards/margins": 0.40900731086730957, + "rewards/rejected": -0.15364770591259003, "step": 250 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.14, - "learning_rate": 4.979389399736768e-06, - "logits/chosen": -0.4480307698249817, - "logits/chosen_friction": -0.4423221945762634, - "logits/rejected": -0.4539082944393158, - "logits/rejected_friction": -0.44505324959754944, - "logps/chosen": -1.260432481765747, - "logps/chosen_friction": -0.8147169947624207, - "logps/rejected": -1.8634297847747803, - "logps/rejected_friction": -1.6888949871063232, - "loss": 0.1342, - "policy_friction_nll_loss": 0.7983626127243042, - "policy_nll_loss": 1.2420291900634766, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.0646604672074318, - "rewards/chosen_fricton": -0.03252624720335007, - "rewards/margins": 0.056344129145145416, - "rewards/margins_friction": 0.08271750807762146, - "rewards/rejected": -0.12100458145141602, - "rewards/rejected_friction": -0.11524375528097153, - "step": 255 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.14, - "learning_rate": 4.978540784407445e-06, - "logits/chosen": -0.4533623158931732, - "logits/chosen_friction": -0.45006853342056274, - "logits/rejected": -0.4569335877895355, - "logits/rejected_friction": -0.45011940598487854, - "logps/chosen": -1.2111248970031738, - "logps/chosen_friction": -0.7525273561477661, - "logps/rejected": -1.865566611289978, - "logps/rejected_friction": -1.6906808614730835, - "loss": 0.1296, - "policy_friction_nll_loss": 0.7381371259689331, - "policy_nll_loss": 1.1937330961227417, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.06315203011035919, - "rewards/chosen_fricton": -0.028203774243593216, - "rewards/margins": 0.06221739202737808, - "rewards/margins_friction": 0.08972037583589554, - "rewards/rejected": -0.12536941468715668, - "rewards/rejected_friction": -0.11792413890361786, + "epoch": 0.08, + "learning_rate": 4.962540138951371e-06, + "logits/chosen": -0.2950271964073181, + "logits/rejected": -0.29611852765083313, + "logps/chosen": -420.79681396484375, + "logps/rejected": -425.1570739746094, + "loss": 0.5278, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": 0.24652545154094696, + "rewards/margins": 0.48342761397361755, + "rewards/rejected": -0.2369021624326706, "step": 260 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.14, - "learning_rate": 4.97767512466979e-06, - "logits/chosen": -0.44031715393066406, - "logits/chosen_friction": -0.4379919171333313, - "logits/rejected": -0.44390669465065, - "logits/rejected_friction": -0.43702930212020874, - "logps/chosen": -1.342224359512329, - "logps/chosen_friction": -0.8228452801704407, - "logps/rejected": -1.9965769052505493, - "logps/rejected_friction": -1.7531007528305054, - "loss": 0.122, - "policy_friction_nll_loss": 0.8129754066467285, - "policy_nll_loss": 1.3245707750320435, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.07532495260238647, - "rewards/chosen_fricton": -0.0352577306330204, - "rewards/margins": 0.06234375759959221, - "rewards/margins_friction": 0.08917827904224396, - "rewards/rejected": -0.13766871392726898, - "rewards/rejected_friction": -0.12443602085113525, - "step": 265 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.14, - "learning_rate": 4.976792426476765e-06, - "logits/chosen": -0.4328429698944092, - "logits/chosen_friction": -0.43052777647972107, - "logits/rejected": -0.43646684288978577, - "logits/rejected_friction": -0.4288904666900635, - "logps/chosen": -1.482112169265747, - "logps/chosen_friction": -0.9083009958267212, - "logps/rejected": -2.2612717151641846, - "logps/rejected_friction": -2.004518985748291, - "loss": 0.1173, - "policy_friction_nll_loss": 0.8976882696151733, - "policy_nll_loss": 1.4655816555023193, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.08560501039028168, - "rewards/chosen_fricton": -0.039742451161146164, - "rewards/margins": 0.07418496906757355, - "rewards/margins_friction": 0.10537707805633545, - "rewards/rejected": -0.15978997945785522, - "rewards/rejected_friction": -0.14511951804161072, + "epoch": 0.08, + "learning_rate": 4.957724981545276e-06, + "logits/chosen": -0.28752994537353516, + "logits/rejected": -0.2876993417739868, + "logps/chosen": -413.72808837890625, + "logps/rejected": -418.2240295410156, + "loss": 0.5369, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.144112229347229, + "rewards/margins": 0.48878079652786255, + "rewards/rejected": -0.34466850757598877, "step": 270 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.15, - "learning_rate": 4.975892695898501e-06, - "logits/chosen": -0.41538795828819275, - "logits/chosen_friction": -0.4164756238460541, - "logits/rejected": -0.4182753562927246, - "logits/rejected_friction": -0.4142269194126129, - "logps/chosen": -1.5796778202056885, - "logps/chosen_friction": -0.9275256991386414, - "logps/rejected": -2.351724147796631, - "logps/rejected_friction": -2.0266902446746826, - "loss": 0.0984, - "policy_friction_nll_loss": 0.9139469265937805, - "policy_nll_loss": 1.5543015003204346, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.09492218494415283, - "rewards/chosen_fricton": -0.04171871021389961, - "rewards/margins": 0.07312412559986115, - "rewards/margins_friction": 0.10517062246799469, - "rewards/rejected": -0.16804631054401398, - "rewards/rejected_friction": -0.146889328956604, - "step": 275 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.15, - "learning_rate": 4.974975939122257e-06, - "logits/chosen": -0.4305237829685211, - "logits/chosen_friction": -0.42976292967796326, - "logits/rejected": -0.4303351044654846, - "logits/rejected_friction": -0.42708903551101685, - "logps/chosen": -1.5031272172927856, - "logps/chosen_friction": -0.8800420761108398, - "logps/rejected": -2.330683708190918, - "logps/rejected_friction": -1.9731191396713257, - "loss": 0.0903, - "policy_friction_nll_loss": 0.8627988696098328, - "policy_nll_loss": 1.4820935726165771, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.09303892403841019, - "rewards/chosen_fricton": -0.04190041497349739, - "rewards/margins": 0.07970280945301056, - "rewards/margins_friction": 0.1055978536605835, - "rewards/rejected": -0.17274174094200134, - "rewards/rejected_friction": -0.14749827980995178, + "epoch": 0.09, + "learning_rate": 4.952621399215598e-06, + "logits/chosen": -0.29713207483291626, + "logits/rejected": -0.29806575179100037, + "logps/chosen": -420.4150390625, + "logps/rejected": -428.95513916015625, + "loss": 0.5325, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.011465489864349365, + "rewards/margins": 0.47427234053611755, + "rewards/rejected": -0.4857378602027893, "step": 280 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.15, - "learning_rate": 4.9740421624523795e-06, - "logits/chosen": -0.3914481997489929, - "logits/chosen_friction": -0.3893260061740875, - "logits/rejected": -0.38819247484207153, - "logits/rejected_friction": -0.38773077726364136, - "logps/chosen": -1.598111867904663, - "logps/chosen_friction": -0.8648148775100708, - "logps/rejected": -2.5629799365997314, - "logps/rejected_friction": -2.110851764678955, - "loss": 0.0867, - "policy_friction_nll_loss": 0.8422088623046875, - "policy_nll_loss": 1.564345121383667, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.1000422015786171, - "rewards/chosen_fricton": -0.03797713667154312, - "rewards/margins": 0.09280092269182205, - "rewards/margins_friction": 0.12022973597049713, - "rewards/rejected": -0.19284310936927795, - "rewards/rejected_friction": -0.15820686519145966, - "step": 285 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.15, - "learning_rate": 4.973091372310256e-06, - "logits/chosen": -0.3900810182094574, - "logits/chosen_friction": -0.3903324007987976, - "logits/rejected": -0.38707008957862854, - "logits/rejected_friction": -0.38567858934402466, - "logps/chosen": -1.791994333267212, - "logps/chosen_friction": -0.9946066737174988, - "logps/rejected": -2.7745025157928467, - "logps/rejected_friction": -2.3479321002960205, - "loss": 0.0704, - "policy_friction_nll_loss": 0.9597315788269043, - "policy_nll_loss": 1.7453886270523071, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 0.9750000238418579, - "rewards/chosen": -0.11902382224798203, - "rewards/chosen_fricton": -0.05024139955639839, - "rewards/margins": 0.09580688178539276, - "rewards/margins_friction": 0.13228754699230194, - "rewards/rejected": -0.214830681681633, - "rewards/rejected_friction": -0.18252897262573242, + "epoch": 0.09, + "learning_rate": 4.947229990890356e-06, + "logits/chosen": -0.285542756319046, + "logits/rejected": -0.28633180260658264, + "logps/chosen": -420.0926208496094, + "logps/rejected": -423.4457092285156, + "loss": 0.5193, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.08504172414541245, + "rewards/margins": 0.5871935486793518, + "rewards/rejected": -0.6722352504730225, "step": 290 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.16, - "learning_rate": 4.972123575234271e-06, - "logits/chosen": -0.4044423997402191, - "logits/chosen_friction": -0.39920774102211, - "logits/rejected": -0.3959832191467285, - "logits/rejected_friction": -0.39631977677345276, - "logps/chosen": -1.7616033554077148, - "logps/chosen_friction": -0.9347909092903137, - "logps/rejected": -2.9489896297454834, - "logps/rejected_friction": -2.4276034832000732, - "loss": 0.0705, - "policy_friction_nll_loss": 0.9152876138687134, - "policy_nll_loss": 1.7318710088729858, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 0.9750000238418579, - "rewards/chosen": -0.11742955446243286, - "rewards/chosen_fricton": -0.046527616679668427, - "rewards/margins": 0.11587431281805038, - "rewards/margins_friction": 0.1456282138824463, - "rewards/rejected": -0.23330383002758026, - "rewards/rejected_friction": -0.1921558678150177, - "step": 295 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.16, - "learning_rate": 4.971138777879765e-06, - "logits/chosen": -0.40576261281967163, - "logits/chosen_friction": -0.4016001224517822, - "logits/rejected": -0.39521175622940063, - "logits/rejected_friction": -0.3978345990180969, - "logps/chosen": -1.8251638412475586, - "logps/chosen_friction": -0.8887651562690735, - "logps/rejected": -3.253871202468872, - "logps/rejected_friction": -2.6621804237365723, - "loss": 0.0456, - "policy_friction_nll_loss": 0.8773303031921387, - "policy_nll_loss": 1.7983405590057373, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.12381794303655624, - "rewards/chosen_fricton": -0.042604584246873856, - "rewards/margins": 0.13954277336597443, - "rewards/margins_friction": 0.1732209175825119, - "rewards/rejected": -0.2633606791496277, - "rewards/rejected_friction": -0.21582551300525665, + "epoch": 0.09, + "learning_rate": 4.941551389275217e-06, + "logits/chosen": -0.2842163145542145, + "logits/rejected": -0.28539806604385376, + "logps/chosen": -421.17822265625, + "logps/rejected": -424.78387451171875, + "loss": 0.5631, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.19560113549232483, + "rewards/margins": 0.5197954177856445, + "rewards/rejected": -0.715396523475647, "step": 300 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.16, - "learning_rate": 4.970136987018982e-06, - "logits/chosen": -0.38239169120788574, - "logits/chosen_friction": -0.380502849817276, - "logits/rejected": -0.36894044280052185, - "logits/rejected_friction": -0.3730142116546631, - "logps/chosen": -2.0502452850341797, - "logps/chosen_friction": -1.0077028274536133, - "logps/rejected": -3.7720656394958496, - "logps/rejected_friction": -3.0153489112854004, - "loss": 0.0351, - "policy_friction_nll_loss": 0.9879337549209595, - "policy_nll_loss": 2.0071232318878174, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 0.9750000238418579, - "rewards/chosen": -0.14394401013851166, - "rewards/chosen_fricton": -0.051836490631103516, - "rewards/margins": 0.16920125484466553, - "rewards/margins_friction": 0.19742360711097717, - "rewards/rejected": -0.313145250082016, - "rewards/rejected_friction": -0.24926011264324188, - "step": 305 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.17, - "learning_rate": 4.96911820954103e-06, - "logits/chosen": -0.36577653884887695, - "logits/chosen_friction": -0.36586737632751465, - "logits/rejected": -0.3519921600818634, - "logits/rejected_friction": -0.3568960428237915, - "logps/chosen": -2.359595537185669, - "logps/chosen_friction": -1.1785389184951782, - "logps/rejected": -4.257076263427734, - "logps/rejected_friction": -3.3531723022460938, - "loss": 0.0236, - "policy_friction_nll_loss": 1.1638033390045166, - "policy_nll_loss": 2.3274807929992676, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 0.9750000238418579, - "rewards/chosen": -0.17551447451114655, - "rewards/chosen_fricton": -0.06852618604898453, - "rewards/margins": 0.18637296557426453, - "rewards/margins_friction": 0.21344831585884094, - "rewards/rejected": -0.3618874251842499, - "rewards/rejected_friction": -0.2819744944572449, + "epoch": 0.09, + "learning_rate": 4.935586260779261e-06, + "logits/chosen": -0.2907197177410126, + "logits/rejected": -0.29180362820625305, + "logps/chosen": -427.5953063964844, + "logps/rejected": -431.76788330078125, + "loss": 0.5331, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.4097444415092468, + "rewards/margins": 0.5406568646430969, + "rewards/rejected": -0.9504014253616333, "step": 310 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.17, - "learning_rate": 4.9680824524518305e-06, - "logits/chosen": -0.38078179955482483, - "logits/chosen_friction": -0.38527703285217285, - "logits/rejected": -0.3656056523323059, - "logits/rejected_friction": -0.37196841835975647, - "logps/chosen": -2.544616460800171, - "logps/chosen_friction": -1.260008692741394, - "logps/rejected": -4.5273118019104, - "logps/rejected_friction": -3.5324573516845703, - "loss": 0.0206, - "policy_friction_nll_loss": 1.2370517253875732, - "policy_nll_loss": 2.520250082015991, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.1955484002828598, - "rewards/chosen_fricton": -0.07926937192678452, - "rewards/margins": 0.19516585767269135, - "rewards/margins_friction": 0.22342698276042938, - "rewards/rejected": -0.39071425795555115, - "rewards/rejected_friction": -0.3026963770389557, - "step": 315 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.17, - "learning_rate": 4.967029722874067e-06, - "logits/chosen": -0.3538613021373749, - "logits/chosen_friction": -0.35751596093177795, - "logits/rejected": -0.33793050050735474, - "logits/rejected_friction": -0.34291940927505493, - "logps/chosen": -2.457169771194458, - "logps/chosen_friction": -1.043961763381958, - "logps/rejected": -4.879039764404297, - "logps/rejected_friction": -3.8753867149353027, - "loss": 0.015, - "policy_friction_nll_loss": 1.0183322429656982, - "policy_nll_loss": 2.409411668777466, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.18375785648822784, - "rewards/chosen_fricton": -0.054677315056324005, - "rewards/margins": 0.23851242661476135, - "rewards/margins_friction": 0.27853474020957947, - "rewards/rejected": -0.4222702980041504, - "rewards/rejected_friction": -0.33321207761764526, + "epoch": 0.1, + "learning_rate": 4.929335305436764e-06, + "logits/chosen": -0.2902284264564514, + "logits/rejected": -0.2910650670528412, + "logps/chosen": -427.05621337890625, + "logps/rejected": -427.26904296875, + "loss": 0.5694, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.4887164533138275, + "rewards/margins": 0.4573966860771179, + "rewards/rejected": -0.9461132287979126, "step": 320 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.17, - "learning_rate": 4.965960028047142e-06, - "logits/chosen": -0.3635169565677643, - "logits/chosen_friction": -0.3674304485321045, - "logits/rejected": -0.3552597463130951, - "logits/rejected_friction": -0.35583221912384033, - "logps/chosen": -2.5129504203796387, - "logps/chosen_friction": -1.0651214122772217, - "logps/rejected": -4.54482364654541, - "logps/rejected_friction": -3.691784620285034, - "loss": 0.0216, - "policy_friction_nll_loss": 1.0506494045257568, - "policy_nll_loss": 2.486193895339966, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.18993008136749268, - "rewards/chosen_fricton": -0.05690830945968628, - "rewards/margins": 0.19990795850753784, - "rewards/margins_friction": 0.2584311068058014, - "rewards/rejected": -0.3898380398750305, - "rewards/rejected_friction": -0.3153393864631653, - "step": 325 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.18, - "learning_rate": 4.964873375327125e-06, - "logits/chosen": -0.36532488465309143, - "logits/chosen_friction": -0.36841195821762085, - "logits/rejected": -0.3515672981739044, - "logits/rejected_friction": -0.3557327687740326, - "logps/chosen": -2.445390224456787, - "logps/chosen_friction": -1.0170891284942627, - "logps/rejected": -4.541645050048828, - "logps/rejected_friction": -3.547783374786377, - "loss": 0.0119, - "policy_friction_nll_loss": 1.0014126300811768, - "policy_nll_loss": 2.408135175704956, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.18750940263271332, - "rewards/chosen_fricton": -0.056276462972164154, - "rewards/margins": 0.2065151035785675, - "rewards/margins_friction": 0.24918898940086365, - "rewards/rejected": -0.39402449131011963, - "rewards/rejected_friction": -0.3054654598236084, + "epoch": 0.1, + "learning_rate": 4.922799256825052e-06, + "logits/chosen": -0.30178460478782654, + "logits/rejected": -0.3031577467918396, + "logps/chosen": -432.64544677734375, + "logps/rejected": -437.355712890625, + "loss": 0.5759, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5430983304977417, + "rewards/margins": 0.5063012838363647, + "rewards/rejected": -1.0493996143341064, "step": 330 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.18, - "learning_rate": 4.963769772186699e-06, - "logits/chosen": -0.36932340264320374, - "logits/chosen_friction": -0.3734860420227051, - "logits/rejected": -0.34818825125694275, - "logits/rejected_friction": -0.35592004656791687, - "logps/chosen": -2.5251569747924805, - "logps/chosen_friction": -1.1229045391082764, - "logps/rejected": -4.794487953186035, - "logps/rejected_friction": -3.6952719688415527, - "loss": 0.0168, - "policy_friction_nll_loss": 1.1086058616638184, - "policy_nll_loss": 2.4912803173065186, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.19488473236560822, - "rewards/chosen_fricton": -0.06687520444393158, - "rewards/margins": 0.22396719455718994, - "rewards/margins_friction": 0.253078430891037, - "rewards/rejected": -0.41885191202163696, - "rewards/rejected_friction": -0.31995365023612976, - "step": 335 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.18, - "learning_rate": 4.962649226215113e-06, - "logits/chosen": -0.357687383890152, - "logits/chosen_friction": -0.36294102668762207, - "logits/rejected": -0.34104710817337036, - "logits/rejected_friction": -0.34594088792800903, - "logps/chosen": -2.645991802215576, - "logps/chosen_friction": -1.1658198833465576, - "logps/rejected": -4.797945499420166, - "logps/rejected_friction": -3.8405258655548096, - "loss": 0.0146, - "policy_friction_nll_loss": 1.143784761428833, - "policy_nll_loss": 2.609065055847168, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.20418190956115723, - "rewards/chosen_fricton": -0.06829319149255753, - "rewards/margins": 0.21158495545387268, - "rewards/margins_friction": 0.26317933201789856, - "rewards/rejected": -0.4157668650150299, - "rewards/rejected_friction": -0.3314725160598755, + "epoch": 0.1, + "learning_rate": 4.915978881978407e-06, + "logits/chosen": -0.2879001498222351, + "logits/rejected": -0.28882110118865967, + "logps/chosen": -418.3189392089844, + "logps/rejected": -420.14349365234375, + "loss": 0.5114, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.39648348093032837, + "rewards/margins": 0.6386500000953674, + "rewards/rejected": -1.0351333618164062, "step": 340 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.18, - "learning_rate": 4.9615117451181285e-06, - "logits/chosen": -0.35200875997543335, - "logits/chosen_friction": -0.35481300950050354, - "logits/rejected": -0.34778499603271484, - "logits/rejected_friction": -0.34264642000198364, - "logps/chosen": -2.4368200302124023, - "logps/chosen_friction": -1.0737630128860474, - "logps/rejected": -4.42581033706665, - "logps/rejected_friction": -3.7999961376190186, - "loss": 0.0196, - "policy_friction_nll_loss": 1.0574160814285278, - "policy_nll_loss": 2.410555124282837, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.1856798380613327, - "rewards/chosen_fricton": -0.06085171177983284, - "rewards/margins": 0.19548361003398895, - "rewards/margins_friction": 0.26832062005996704, - "rewards/rejected": -0.38116344809532166, - "rewards/rejected_friction": -0.3291723132133484, - "step": 345 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.19, - "learning_rate": 4.960357336717966e-06, - "logits/chosen": -0.3471812605857849, - "logits/chosen_friction": -0.35167810320854187, - "logits/rejected": -0.3352203071117401, - "logits/rejected_friction": -0.3382285237312317, - "logps/chosen": -2.4521892070770264, - "logps/chosen_friction": -1.0852452516555786, - "logps/rejected": -4.665430068969727, - "logps/rejected_friction": -3.7373881340026855, - "loss": 0.0202, - "policy_friction_nll_loss": 1.0674301385879517, - "policy_nll_loss": 2.4108941555023193, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.18801641464233398, - "rewards/chosen_fricton": -0.06278781592845917, - "rewards/margins": 0.2176388055086136, - "rewards/margins_friction": 0.2608237862586975, - "rewards/rejected": -0.40565523505210876, - "rewards/rejected_friction": -0.32361161708831787, + "epoch": 0.11, + "learning_rate": 4.908874981298058e-06, + "logits/chosen": -0.29214486479759216, + "logits/rejected": -0.29305440187454224, + "logps/chosen": -421.3182678222656, + "logps/rejected": -427.06317138671875, + "loss": 0.5628, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5688936114311218, + "rewards/margins": 0.5135782957077026, + "rewards/rejected": -1.0824719667434692, "step": 350 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.19, - "learning_rate": 4.959186008953249e-06, - "logits/chosen": -0.3506630063056946, - "logits/chosen_friction": -0.35589370131492615, - "logits/rejected": -0.3355846107006073, - "logits/rejected_friction": -0.3409242033958435, - "logps/chosen": -2.652618408203125, - "logps/chosen_friction": -1.1755043268203735, - "logps/rejected": -4.9310503005981445, - "logps/rejected_friction": -3.9205069541931152, - "loss": 0.0179, - "policy_friction_nll_loss": 1.1582355499267578, - "policy_nll_loss": 2.610299587249756, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.20621177554130554, - "rewards/chosen_fricton": -0.07015891373157501, - "rewards/margins": 0.22472074627876282, - "rewards/margins_friction": 0.2708670496940613, - "rewards/rejected": -0.43093252182006836, - "rewards/rejected_friction": -0.3410259485244751, - "step": 355 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.19, - "learning_rate": 4.957997769878959e-06, - "logits/chosen": -0.3564034402370453, - "logits/chosen_friction": -0.3582731783390045, - "logits/rejected": -0.3441784083843231, - "logits/rejected_friction": -0.3449742794036865, - "logps/chosen": -2.506476879119873, - "logps/chosen_friction": -1.1213239431381226, - "logps/rejected": -4.4981536865234375, - "logps/rejected_friction": -3.630359172821045, - "loss": 0.013, - "policy_friction_nll_loss": 1.1009345054626465, - "policy_nll_loss": 2.4694981575012207, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.1938176155090332, - "rewards/chosen_fricton": -0.06635096669197083, - "rewards/margins": 0.19543695449829102, - "rewards/margins_friction": 0.24666385352611542, - "rewards/rejected": -0.3892545700073242, - "rewards/rejected_friction": -0.31301483511924744, + "epoch": 0.11, + "learning_rate": 4.901488388458247e-06, + "logits/chosen": -0.2956882119178772, + "logits/rejected": -0.29717716574668884, + "logps/chosen": -429.40850830078125, + "logps/rejected": -432.0194396972656, + "loss": 0.5326, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5142576098442078, + "rewards/margins": 0.5949846506118774, + "rewards/rejected": -1.1092422008514404, "step": 360 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.19, - "learning_rate": 4.956792627666364e-06, - "logits/chosen": -0.33197954297065735, - "logits/chosen_friction": -0.33377885818481445, - "logits/rejected": -0.32438820600509644, - "logits/rejected_friction": -0.32076358795166016, - "logps/chosen": -2.799276828765869, - "logps/chosen_friction": -1.3144527673721313, - "logps/rejected": -4.78389835357666, - "logps/rejected_friction": -3.954665422439575, - "loss": 0.017, - "policy_friction_nll_loss": 1.2802320718765259, - "policy_nll_loss": 2.7410755157470703, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2156532257795334, - "rewards/chosen_fricton": -0.07995893061161041, - "rewards/margins": 0.19506819546222687, - "rewards/margins_friction": 0.25994348526000977, - "rewards/rejected": -0.41072145104408264, - "rewards/rejected_friction": -0.339902400970459, - "step": 365 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.2, - "learning_rate": 4.95557059060298e-06, - "logits/chosen": -0.33783167600631714, - "logits/chosen_friction": -0.3397655785083771, - "logits/rejected": -0.3208128809928894, - "logits/rejected_friction": -0.3221892714500427, - "logps/chosen": -2.793142080307007, - "logps/chosen_friction": -1.3219505548477173, - "logps/rejected": -4.904260158538818, - "logps/rejected_friction": -3.9411144256591797, - "loss": 0.0088, - "policy_friction_nll_loss": 1.297162652015686, - "policy_nll_loss": 2.7543084621429443, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.21826541423797607, - "rewards/chosen_fricton": -0.08236490935087204, - "rewards/margins": 0.20836412906646729, - "rewards/margins_friction": 0.258847713470459, - "rewards/rejected": -0.42662954330444336, - "rewards/rejected_friction": -0.34121260046958923, + "epoch": 0.11, + "learning_rate": 4.893819970308394e-06, + "logits/chosen": -0.29191336035728455, + "logits/rejected": -0.2928611636161804, + "logps/chosen": -432.4073181152344, + "logps/rejected": -437.53472900390625, + "loss": 0.5255, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5648801922798157, + "rewards/margins": 0.5746434926986694, + "rewards/rejected": -1.1395236253738403, "step": 370 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.2, - "learning_rate": 4.954331667092497e-06, - "logits/chosen": -0.3415621817111969, - "logits/chosen_friction": -0.3464515209197998, - "logits/rejected": -0.32556232810020447, - "logits/rejected_friction": -0.32830914855003357, - "logps/chosen": -2.944784164428711, - "logps/chosen_friction": -1.3309084177017212, - "logps/rejected": -5.156991004943848, - "logps/rejected_friction": -4.093745231628418, - "loss": 0.0133, - "policy_friction_nll_loss": 1.3091967105865479, - "policy_nll_loss": 2.912836790084839, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2330644130706787, - "rewards/chosen_fricton": -0.08425513654947281, - "rewards/margins": 0.2181960642337799, - "rewards/margins_friction": 0.27239251136779785, - "rewards/rejected": -0.45126041769981384, - "rewards/rejected_friction": -0.35664767026901245, - "step": 375 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.2, - "learning_rate": 4.953075865654736e-06, - "logits/chosen": -0.3588234484195709, - "logits/chosen_friction": -0.36306723952293396, - "logits/rejected": -0.3445814847946167, - "logits/rejected_friction": -0.3472568988800049, - "logps/chosen": -2.5857133865356445, - "logps/chosen_friction": -1.238526701927185, - "logps/rejected": -4.746018409729004, - "logps/rejected_friction": -3.8465754985809326, - "loss": 0.009, - "policy_friction_nll_loss": 1.2124632596969604, - "policy_nll_loss": 2.5385937690734863, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.20123954117298126, - "rewards/chosen_fricton": -0.07754774391651154, - "rewards/margins": 0.21382825076580048, - "rewards/margins_friction": 0.25747185945510864, - "rewards/rejected": -0.4150678217411041, - "rewards/rejected_friction": -0.33501964807510376, + "epoch": 0.12, + "learning_rate": 4.885870626771371e-06, + "logits/chosen": -0.2915678322315216, + "logits/rejected": -0.2924065887928009, + "logps/chosen": -421.0965881347656, + "logps/rejected": -425.9581604003906, + "loss": 0.5565, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6075869798660278, + "rewards/margins": 0.576026201248169, + "rewards/rejected": -1.1836131811141968, "step": 380 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.21, - "learning_rate": 4.951803194925581e-06, - "logits/chosen": -0.3504578173160553, - "logits/chosen_friction": -0.35224640369415283, - "logits/rejected": -0.3404080271720886, - "logits/rejected_friction": -0.3358613848686218, - "logps/chosen": -2.794495105743408, - "logps/chosen_friction": -1.3014914989471436, - "logps/rejected": -4.711997032165527, - "logps/rejected_friction": -3.8738300800323486, - "loss": 0.0139, - "policy_friction_nll_loss": 1.2760770320892334, - "policy_nll_loss": 2.751171827316284, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.22045806050300598, - "rewards/chosen_fricton": -0.08297456055879593, - "rewards/margins": 0.18869347870349884, - "rewards/margins_friction": 0.25356703996658325, - "rewards/rejected": -0.40915149450302124, - "rewards/rejected_friction": -0.3365415632724762, - "step": 385 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.21, - "learning_rate": 4.950513663656924e-06, - "logits/chosen": -0.3433647155761719, - "logits/chosen_friction": -0.3474162220954895, - "logits/rejected": -0.3204881548881531, - "logits/rejected_friction": -0.3284006118774414, - "logps/chosen": -2.875647783279419, - "logps/chosen_friction": -1.3142247200012207, - "logps/rejected": -5.204892158508301, - "logps/rejected_friction": -3.973512649536133, - "loss": 0.013, - "policy_friction_nll_loss": 1.2973631620407104, - "policy_nll_loss": 2.8271312713623047, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.229364275932312, - "rewards/chosen_fricton": -0.08449070900678635, - "rewards/margins": 0.22952839732170105, - "rewards/margins_friction": 0.2621532082557678, - "rewards/rejected": -0.45889273285865784, - "rewards/rejected_friction": -0.34664395451545715, + "epoch": 0.12, + "learning_rate": 4.8776412907378845e-06, + "logits/chosen": -0.29022809863090515, + "logits/rejected": -0.2918907701969147, + "logps/chosen": -422.2085876464844, + "logps/rejected": -425.4307556152344, + "loss": 0.5346, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6413823962211609, + "rewards/margins": 0.6153510808944702, + "rewards/rejected": -1.2567334175109863, "step": 390 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.21, - "learning_rate": 4.949207280716603e-06, - "logits/chosen": -0.34024369716644287, - "logits/chosen_friction": -0.3433780074119568, - "logits/rejected": -0.3260836899280548, - "logits/rejected_friction": -0.3244568109512329, - "logps/chosen": -2.9202380180358887, - "logps/chosen_friction": -1.3234751224517822, - "logps/rejected": -4.998898983001709, - "logps/rejected_friction": -4.0874714851379395, - "loss": 0.0121, - "policy_friction_nll_loss": 1.29209566116333, - "policy_nll_loss": 2.8674979209899902, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2315751612186432, - "rewards/chosen_fricton": -0.08377519994974136, - "rewards/margins": 0.2044021189212799, - "rewards/margins_friction": 0.2720472514629364, - "rewards/rejected": -0.4359772801399231, - "rewards/rejected_friction": -0.35582244396209717, - "step": 395 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.21, - "learning_rate": 4.947884055088339e-06, - "logits/chosen": -0.3592532277107239, - "logits/chosen_friction": -0.3544307351112366, - "logits/rejected": -0.35949498414993286, - "logits/rejected_friction": -0.3402422070503235, - "logps/chosen": -2.692101240158081, - "logps/chosen_friction": -1.2672466039657593, - "logps/rejected": -4.464926719665527, - "logps/rejected_friction": -3.989372968673706, - "loss": 0.0114, - "policy_friction_nll_loss": 1.2484729290008545, - "policy_nll_loss": 2.671703815460205, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.20902463793754578, - "rewards/chosen_fricton": -0.07876187562942505, - "rewards/margins": 0.17323513329029083, - "rewards/margins_friction": 0.2674862742424011, - "rewards/rejected": -0.3822597861289978, - "rewards/rejected_friction": -0.34624817967414856, + "epoch": 0.12, + "learning_rate": 4.869132927957007e-06, + "logits/chosen": -0.2912658751010895, + "logits/rejected": -0.292255163192749, + "logps/chosen": -424.4219665527344, + "logps/rejected": -430.76885986328125, + "loss": 0.53, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.7030640840530396, + "rewards/margins": 0.6264012455940247, + "rewards/rejected": -1.329465389251709, "step": 400 }, { - "epoch": 0.21, - "eval_directrewards_student/accuracies": 1.0, - "eval_logits/chosen": -0.3922047019004822, - "eval_logits/chosen_friction": -0.38667961955070496, - "eval_logits/rejected": -0.39377009868621826, - "eval_logits/rejected_friction": -0.3687905967235565, - "eval_logps/chosen": -2.7412400245666504, - "eval_logps/chosen_friction": -1.3260095119476318, - "eval_logps/rejected": -4.419449329376221, - "eval_logps/rejected_friction": -3.9852206707000732, - "eval_loss": 0.013633196242153645, - "eval_policy_friction_nll_loss": 1.3260096311569214, - "eval_policy_nll_loss": 2.7412400245666504, - "eval_rewards/accuracies": 1.0, - "eval_rewards/accuracies_friction": 1.0, - "eval_rewards/chosen": -0.2141818255186081, - "eval_rewards/chosen_fricton": -0.08433224260807037, - "eval_rewards/margins": 0.16434316337108612, - "eval_rewards/margins_friction": 0.2617739140987396, - "eval_rewards/rejected": -0.3785249590873718, - "eval_rewards/rejected_friction": -0.3461061716079712, - "eval_runtime": 550.9269, - "eval_samples_per_second": 0.908, - "eval_steps_per_second": 0.454, + "epoch": 0.12, + "eval_logits/chosen": -0.3515583574771881, + "eval_logits/rejected": -0.35239377617836, + "eval_logps/chosen": -419.6265563964844, + "eval_logps/rejected": -424.9375, + "eval_loss": 0.5038847327232361, + "eval_rewards/accuracies": 0.7379999756813049, + "eval_rewards/chosen": -0.6735388040542603, + "eval_rewards/margins": 0.7223072648048401, + "eval_rewards/rejected": -1.3958461284637451, + "eval_runtime": 375.1774, + "eval_samples_per_second": 1.333, + "eval_steps_per_second": 1.333, "step": 400 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.22, - "learning_rate": 4.946543995871681e-06, - "logits/chosen": -0.34704190492630005, - "logits/chosen_friction": -0.3438102602958679, - "logits/rejected": -0.34628504514694214, - "logits/rejected_friction": -0.32517513632774353, - "logps/chosen": -2.7699122428894043, - "logps/chosen_friction": -1.39215087890625, - "logps/rejected": -4.665611267089844, - "logps/rejected_friction": -4.259208679199219, - "loss": 0.0092, - "policy_friction_nll_loss": 1.3531534671783447, - "policy_nll_loss": 2.72179913520813, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2144896239042282, - "rewards/chosen_fricton": -0.08816899359226227, - "rewards/margins": 0.18656989932060242, - "rewards/margins_friction": 0.28306347131729126, - "rewards/rejected": -0.4010595381259918, - "rewards/rejected_friction": -0.37123245000839233, - "step": 405 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.22, - "learning_rate": 4.945187112281936e-06, - "logits/chosen": -0.3372710645198822, - "logits/chosen_friction": -0.3417302072048187, - "logits/rejected": -0.3218538165092468, - "logits/rejected_friction": -0.3214091658592224, - "logps/chosen": -3.019611358642578, - "logps/chosen_friction": -1.3670307397842407, - "logps/rejected": -5.232946872711182, - "logps/rejected_friction": -4.155798435211182, - "loss": 0.0122, - "policy_friction_nll_loss": 1.3203260898590088, - "policy_nll_loss": 2.9537792205810547, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24028857052326202, - "rewards/chosen_fricton": -0.08793630450963974, - "rewards/margins": 0.21869663894176483, - "rewards/margins_friction": 0.2749630808830261, - "rewards/rejected": -0.45898523926734924, - "rewards/rejected_friction": -0.36289942264556885, + "epoch": 0.12, + "learning_rate": 4.860346536922834e-06, + "logits/chosen": -0.29377973079681396, + "logits/rejected": -0.294566810131073, + "logps/chosen": -429.86907958984375, + "logps/rejected": -432.5889587402344, + "loss": 0.529, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.7517430782318115, + "rewards/margins": 0.6350258588790894, + "rewards/rejected": -1.3867689371109009, "step": 410 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.22, - "learning_rate": 4.943813413650111e-06, - "logits/chosen": -0.3440500795841217, - "logits/chosen_friction": -0.3459983468055725, - "logits/rejected": -0.3315867781639099, - "logits/rejected_friction": -0.32809504866600037, - "logps/chosen": -2.92108154296875, - "logps/chosen_friction": -1.4051368236541748, - "logps/rejected": -5.0499725341796875, - "logps/rejected_friction": -4.131904602050781, - "loss": 0.0071, - "policy_friction_nll_loss": 1.349300503730774, - "policy_nll_loss": 2.8469061851501465, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2319747656583786, - "rewards/chosen_fricton": -0.09208868443965912, - "rewards/margins": 0.20922121405601501, - "rewards/margins_friction": 0.26841822266578674, - "rewards/rejected": -0.4411959648132324, - "rewards/rejected_friction": -0.36050689220428467, - "step": 415 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.22, - "learning_rate": 4.9424229094228425e-06, - "logits/chosen": -0.3392259180545807, - "logits/chosen_friction": -0.34190791845321655, - "logits/rejected": -0.327998548746109, - "logits/rejected_friction": -0.3205747604370117, - "logps/chosen": -3.0143022537231445, - "logps/chosen_friction": -1.425040364265442, - "logps/rejected": -5.042145729064941, - "logps/rejected_friction": -4.170163154602051, - "loss": 0.0081, - "policy_friction_nll_loss": 1.4001234769821167, - "policy_nll_loss": 2.9704723358154297, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23886021971702576, - "rewards/chosen_fricton": -0.09144577383995056, - "rewards/margins": 0.19974713027477264, - "rewards/margins_friction": 0.27086153626441956, - "rewards/rejected": -0.4386073648929596, - "rewards/rejected_friction": -0.3623073101043701, + "epoch": 0.13, + "learning_rate": 4.85128314875731e-06, + "logits/chosen": -0.2876330316066742, + "logits/rejected": -0.2890221178531647, + "logps/chosen": -433.5904846191406, + "logps/rejected": -438.02886962890625, + "loss": 0.5174, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7357751131057739, + "rewards/margins": 0.6541243195533752, + "rewards/rejected": -1.389899492263794, "step": 420 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.23, - "learning_rate": 4.941015609162341e-06, - "logits/chosen": -0.35913166403770447, - "logits/chosen_friction": -0.36161643266677856, - "logits/rejected": -0.3462074398994446, - "logits/rejected_friction": -0.34324946999549866, - "logps/chosen": -2.8735992908477783, - "logps/chosen_friction": -1.320122480392456, - "logps/rejected": -5.011282920837402, - "logps/rejected_friction": -4.097905158996582, - "loss": 0.0075, - "policy_friction_nll_loss": 1.3012974262237549, - "policy_nll_loss": 2.836951732635498, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.22884142398834229, - "rewards/chosen_fricton": -0.08537876605987549, - "rewards/margins": 0.2104407548904419, - "rewards/margins_friction": 0.2737954258918762, - "rewards/rejected": -0.4392821788787842, - "rewards/rejected_friction": -0.3591741919517517, - "step": 425 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.23, - "learning_rate": 4.939591522546314e-06, - "logits/chosen": -0.34825918078422546, - "logits/chosen_friction": -0.3487854599952698, - "logits/rejected": -0.33759891986846924, - "logits/rejected_friction": -0.3301500976085663, - "logps/chosen": -2.984149932861328, - "logps/chosen_friction": -1.351409673690796, - "logps/rejected": -5.1544036865234375, - "logps/rejected_friction": -4.247395992279053, - "loss": 0.0091, - "policy_friction_nll_loss": 1.3253806829452515, - "policy_nll_loss": 2.9400362968444824, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2369297444820404, - "rewards/chosen_fricton": -0.08625730127096176, - "rewards/margins": 0.213802307844162, - "rewards/margins_friction": 0.28551042079925537, - "rewards/rejected": -0.4507320821285248, - "rewards/rejected_friction": -0.37176769971847534, + "epoch": 0.13, + "learning_rate": 4.841943827089223e-06, + "logits/chosen": -0.30073267221450806, + "logits/rejected": -0.3028663098812103, + "logps/chosen": -438.89056396484375, + "logps/rejected": -444.29443359375, + "loss": 0.5427, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7469267845153809, + "rewards/margins": 0.6645030379295349, + "rewards/rejected": -1.411429762840271, "step": 430 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.23, - "learning_rate": 4.9381506593679116e-06, - "logits/chosen": -0.3645872175693512, - "logits/chosen_friction": -0.36799517273902893, - "logits/rejected": -0.3505623936653137, - "logits/rejected_friction": -0.3488561511039734, - "logps/chosen": -2.8302319049835205, - "logps/chosen_friction": -1.2796424627304077, - "logps/rejected": -4.943591117858887, - "logps/rejected_friction": -3.9940438270568848, - "loss": 0.01, - "policy_friction_nll_loss": 1.262604832649231, - "policy_nll_loss": 2.8052361011505127, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.22441408038139343, - "rewards/chosen_fricton": -0.0806896835565567, - "rewards/margins": 0.2086014300584793, - "rewards/margins_friction": 0.2680177688598633, - "rewards/rejected": -0.43301549553871155, - "rewards/rejected_friction": -0.3487074375152588, - "step": 435 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.23, - "learning_rate": 4.936693029535647e-06, - "logits/chosen": -0.36510735750198364, - "logits/chosen_friction": -0.36778753995895386, - "logits/rejected": -0.35848885774612427, - "logits/rejected_friction": -0.35037320852279663, - "logps/chosen": -2.822890520095825, - "logps/chosen_friction": -1.3064841032028198, - "logps/rejected": -4.720261096954346, - "logps/rejected_friction": -3.9292819499969482, - "loss": 0.007, - "policy_friction_nll_loss": 1.2794169187545776, - "policy_nll_loss": 2.783125877380371, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.22499039769172668, - "rewards/chosen_fricton": -0.08477078378200531, - "rewards/margins": 0.18693511188030243, - "rewards/margins_friction": 0.25845271348953247, - "rewards/rejected": -0.4119254946708679, - "rewards/rejected_friction": -0.343223512172699, + "epoch": 0.13, + "learning_rate": 4.832329667929378e-06, + "logits/chosen": -0.30401021242141724, + "logits/rejected": -0.305408775806427, + "logps/chosen": -436.4923400878906, + "logps/rejected": -443.785400390625, + "loss": 0.4856, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.7162739634513855, + "rewards/margins": 0.7617406845092773, + "rewards/rejected": -1.478014588356018, "step": 440 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.24, - "learning_rate": 4.935218643073339e-06, - "logits/chosen": -0.36008119583129883, - "logits/chosen_friction": -0.36234599351882935, - "logits/rejected": -0.35121724009513855, - "logits/rejected_friction": -0.34214964509010315, - "logps/chosen": -2.903782606124878, - "logps/chosen_friction": -1.333601474761963, - "logps/rejected": -4.922939777374268, - "logps/rejected_friction": -4.126588821411133, - "loss": 0.0105, - "policy_friction_nll_loss": 1.2957030534744263, - "policy_nll_loss": 2.854074001312256, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23140111565589905, - "rewards/chosen_fricton": -0.08558743447065353, - "rewards/margins": 0.19886091351509094, - "rewards/margins_friction": 0.2755585014820099, - "rewards/rejected": -0.43026202917099, - "rewards/rejected_friction": -0.361145943403244, - "step": 445 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.24, - "learning_rate": 4.933727510120035e-06, - "logits/chosen": -0.3626021444797516, - "logits/chosen_friction": -0.36293596029281616, - "logits/rejected": -0.34962528944015503, - "logits/rejected_friction": -0.33895203471183777, - "logps/chosen": -3.0314695835113525, - "logps/chosen_friction": -1.4067076444625854, - "logps/rejected": -5.090710163116455, - "logps/rejected_friction": -4.19690465927124, - "loss": 0.0073, - "policy_friction_nll_loss": 1.3903558254241943, - "policy_nll_loss": 3.006894111633301, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24165110290050507, - "rewards/chosen_fricton": -0.09097932279109955, - "rewards/margins": 0.20339789986610413, - "rewards/margins_friction": 0.2757622301578522, - "rewards/rejected": -0.445048987865448, - "rewards/rejected_friction": -0.3667415678501129, + "epoch": 0.14, + "learning_rate": 4.822441799541979e-06, + "logits/chosen": -0.29748016595840454, + "logits/rejected": -0.2987380027770996, + "logps/chosen": -432.2513122558594, + "logps/rejected": -439.78741455078125, + "loss": 0.5138, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.8289654850959778, + "rewards/margins": 0.7209790349006653, + "rewards/rejected": -1.549944519996643, "step": 450 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.24, - "learning_rate": 4.932219640929944e-06, - "logits/chosen": -0.35703200101852417, - "logits/chosen_friction": -0.3618159592151642, - "logits/rejected": -0.3460405170917511, - "logits/rejected_friction": -0.3427182137966156, - "logps/chosen": -2.8708226680755615, - "logps/chosen_friction": -1.2660937309265137, - "logps/rejected": -4.960195541381836, - "logps/rejected_friction": -4.008208274841309, - "loss": 0.0073, - "policy_friction_nll_loss": 1.2344942092895508, - "policy_nll_loss": 2.8179931640625, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23012569546699524, - "rewards/chosen_fricton": -0.08082834631204605, - "rewards/margins": 0.2065487802028656, - "rewards/margins_friction": 0.2712729871273041, - "rewards/rejected": -0.4366745054721832, - "rewards/rejected_friction": -0.3521013855934143, - "step": 455 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.25, - "learning_rate": 4.930695045872369e-06, - "logits/chosen": -0.3550131320953369, - "logits/chosen_friction": -0.35375291109085083, - "logits/rejected": -0.3561306595802307, - "logits/rejected_friction": -0.33387893438339233, - "logps/chosen": -2.8933029174804688, - "logps/chosen_friction": -1.3966244459152222, - "logps/rejected": -4.726448059082031, - "logps/rejected_friction": -4.334230422973633, - "loss": 0.0112, - "policy_friction_nll_loss": 1.3610707521438599, - "policy_nll_loss": 2.8479163646698, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.22765116393566132, - "rewards/chosen_fricton": -0.08902428299188614, - "rewards/margins": 0.1795242428779602, - "rewards/margins_friction": 0.2895689010620117, - "rewards/rejected": -0.40717536211013794, - "rewards/rejected_friction": -0.3785931468009949, + "epoch": 0.14, + "learning_rate": 4.812281382312222e-06, + "logits/chosen": -0.28938063979148865, + "logits/rejected": -0.2903631031513214, + "logps/chosen": -421.52337646484375, + "logps/rejected": -426.65142822265625, + "loss": 0.4934, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7328917980194092, + "rewards/margins": 0.7723864316940308, + "rewards/rejected": -1.5052781105041504, "step": 460 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.25, - "learning_rate": 4.929153735431633e-06, - "logits/chosen": -0.36902618408203125, - "logits/chosen_friction": -0.3712102472782135, - "logits/rejected": -0.3533739447593689, - "logits/rejected_friction": -0.35087013244628906, - "logps/chosen": -2.8595051765441895, - "logps/chosen_friction": -1.2675787210464478, - "logps/rejected": -5.153629302978516, - "logps/rejected_friction": -4.111886978149414, - "loss": 0.0095, - "policy_friction_nll_loss": 1.2452621459960938, - "policy_nll_loss": 2.817878484725952, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.22727620601654053, - "rewards/chosen_fricton": -0.08035627007484436, - "rewards/margins": 0.22622816264629364, - "rewards/margins_friction": 0.28057563304901123, - "rewards/rejected": -0.45350438356399536, - "rewards/rejected_friction": -0.3609318733215332, - "step": 465 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.25, - "learning_rate": 4.927595720207007e-06, - "logits/chosen": -0.3610824942588806, - "logits/chosen_friction": -0.36173102259635925, - "logits/rejected": -0.3453821539878845, - "logits/rejected_friction": -0.339348703622818, - "logps/chosen": -3.086256980895996, - "logps/chosen_friction": -1.362900733947754, - "logps/rejected": -5.236875534057617, - "logps/rejected_friction": -4.173074245452881, - "loss": 0.0077, - "policy_friction_nll_loss": 1.346858024597168, - "policy_nll_loss": 3.0594675540924072, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2462814748287201, - "rewards/chosen_fricton": -0.08591034263372421, - "rewards/margins": 0.21228058636188507, - "rewards/margins_friction": 0.2779102921485901, - "rewards/rejected": -0.45856207609176636, - "rewards/rejected_friction": -0.3638206422328949, + "epoch": 0.14, + "learning_rate": 4.801849608610119e-06, + "logits/chosen": -0.2995319366455078, + "logits/rejected": -0.3008275330066681, + "logps/chosen": -437.45916748046875, + "logps/rejected": -443.75799560546875, + "loss": 0.4984, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.9069220423698425, + "rewards/margins": 0.7756569981575012, + "rewards/rejected": -1.6825790405273438, "step": 470 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.25, - "learning_rate": 4.926021010912636e-06, - "logits/chosen": -0.3676564693450928, - "logits/chosen_friction": -0.3671501576900482, - "logits/rejected": -0.35722658038139343, - "logits/rejected_friction": -0.34304100275039673, - "logps/chosen": -3.0892724990844727, - "logps/chosen_friction": -1.4188497066497803, - "logps/rejected": -5.159282207489014, - "logps/rejected_friction": -4.2591776847839355, - "loss": 0.0061, - "policy_friction_nll_loss": 1.405046820640564, - "policy_nll_loss": 3.0743765830993652, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2461220771074295, - "rewards/chosen_fricton": -0.08985219895839691, - "rewards/margins": 0.2032013237476349, - "rewards/margins_friction": 0.279625803232193, - "rewards/rejected": -0.4493234157562256, - "rewards/rejected_friction": -0.3694780170917511, - "step": 475 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.26, - "learning_rate": 4.924429618377468e-06, - "logits/chosen": -0.3779478669166565, - "logits/chosen_friction": -0.38230809569358826, - "logits/rejected": -0.3653263449668884, - "logits/rejected_friction": -0.36338454484939575, - "logps/chosen": -2.7456631660461426, - "logps/chosen_friction": -1.182466745376587, - "logps/rejected": -4.913125038146973, - "logps/rejected_friction": -3.9212241172790527, - "loss": 0.0065, - "policy_friction_nll_loss": 1.1643885374069214, - "policy_nll_loss": 2.718877077102661, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.21863611042499542, - "rewards/chosen_fricton": -0.07378090173006058, - "rewards/margins": 0.21327419579029083, - "rewards/margins_friction": 0.26955732703208923, - "rewards/rejected": -0.43191027641296387, - "rewards/rejected_friction": -0.3433382511138916, + "epoch": 0.15, + "learning_rate": 4.7911477026505656e-06, + "logits/chosen": -0.2930867373943329, + "logits/rejected": -0.2938670516014099, + "logps/chosen": -436.32305908203125, + "logps/rejected": -439.0968322753906, + "loss": 0.4882, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.054971694946289, + "rewards/margins": 0.8004587292671204, + "rewards/rejected": -1.8554306030273438, "step": 480 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.26, - "learning_rate": 4.922821553545177e-06, - "logits/chosen": -0.36998850107192993, - "logits/chosen_friction": -0.37169763445854187, - "logits/rejected": -0.3671855032444, - "logits/rejected_friction": -0.3509330451488495, - "logps/chosen": -3.042642116546631, - "logps/chosen_friction": -1.3900835514068604, - "logps/rejected": -5.097192764282227, - "logps/rejected_friction": -4.2986159324646, - "loss": 0.0045, - "policy_friction_nll_loss": 1.3733257055282593, - "policy_nll_loss": 3.021746873855591, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24165813624858856, - "rewards/chosen_fricton": -0.08778037130832672, - "rewards/margins": 0.20209646224975586, - "rewards/margins_friction": 0.28682059049606323, - "rewards/rejected": -0.4437545835971832, - "rewards/rejected_friction": -0.37460094690322876, - "step": 485 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.26, - "learning_rate": 4.92119682747409e-06, - "logits/chosen": -0.3773697018623352, - "logits/chosen_friction": -0.37685543298721313, - "logits/rejected": -0.36741751432418823, - "logits/rejected_friction": -0.36008161306381226, - "logps/chosen": -2.7271153926849365, - "logps/chosen_friction": -1.1769622564315796, - "logps/rejected": -4.82851505279541, - "logps/rejected_friction": -3.931596279144287, - "loss": 0.0058, - "policy_friction_nll_loss": 1.1631876230239868, - "policy_nll_loss": 2.6989340782165527, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.21779516339302063, - "rewards/chosen_fricton": -0.07432660460472107, - "rewards/margins": 0.20625026524066925, - "rewards/margins_friction": 0.27071624994277954, - "rewards/rejected": -0.4240454137325287, - "rewards/rejected_friction": -0.345042884349823, + "epoch": 0.15, + "learning_rate": 4.780176920349675e-06, + "logits/chosen": -0.2880414128303528, + "logits/rejected": -0.2893609404563904, + "logps/chosen": -426.8358459472656, + "logps/rejected": -432.79248046875, + "loss": 0.5123, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.1646369695663452, + "rewards/margins": 0.739470362663269, + "rewards/rejected": -1.9041073322296143, "step": 490 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.26, - "learning_rate": 4.919555451337111e-06, - "logits/chosen": -0.36104169487953186, - "logits/chosen_friction": -0.35596364736557007, - "logits/rejected": -0.3627226948738098, - "logits/rejected_friction": -0.3365195691585541, - "logps/chosen": -2.9771728515625, - "logps/chosen_friction": -1.3325389623641968, - "logps/rejected": -4.542566299438477, - "logps/rejected_friction": -4.1334309577941895, - "loss": 0.0135, - "policy_friction_nll_loss": 1.2943031787872314, - "policy_nll_loss": 2.939058780670166, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2331182062625885, - "rewards/chosen_fricton": -0.08126387000083923, - "rewards/margins": 0.1532055288553238, - "rewards/margins_friction": 0.27599817514419556, - "rewards/rejected": -0.3863236904144287, - "rewards/rejected_friction": -0.3572620749473572, - "step": 495 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.27, - "learning_rate": 4.917897436421641e-06, - "logits/chosen": -0.36562711000442505, - "logits/chosen_friction": -0.36584752798080444, - "logits/rejected": -0.3608798384666443, - "logits/rejected_friction": -0.34487420320510864, - "logps/chosen": -2.9826807975769043, - "logps/chosen_friction": -1.3006752729415894, - "logps/rejected": -4.916216850280762, - "logps/rejected_friction": -4.198328971862793, - "loss": 0.0066, - "policy_friction_nll_loss": 1.280826210975647, - "policy_nll_loss": 2.9527883529663086, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23761311173439026, - "rewards/chosen_fricton": -0.08117898553609848, - "rewards/margins": 0.19045871496200562, - "rewards/margins_friction": 0.2859787046909332, - "rewards/rejected": -0.4280718266963959, - "rewards/rejected_friction": -0.3671576678752899, + "epoch": 0.15, + "learning_rate": 4.7689385491773934e-06, + "logits/chosen": -0.3000113070011139, + "logits/rejected": -0.3008071780204773, + "logps/chosen": -442.62860107421875, + "logps/rejected": -446.14825439453125, + "loss": 0.5871, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3722599744796753, + "rewards/margins": 0.6036561131477356, + "rewards/rejected": -1.9759161472320557, "step": 500 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.27, - "learning_rate": 4.9162227941295e-06, - "logits/chosen": -0.3699677288532257, - "logits/chosen_friction": -0.37496206164360046, - "logits/rejected": -0.352647989988327, - "logits/rejected_friction": -0.35215872526168823, - "logps/chosen": -2.982651948928833, - "logps/chosen_friction": -1.321885108947754, - "logps/rejected": -5.310145854949951, - "logps/rejected_friction": -4.216909885406494, - "loss": 0.0081, - "policy_friction_nll_loss": 1.2973592281341553, - "policy_nll_loss": 2.9507250785827637, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23992881178855896, - "rewards/chosen_fricton": -0.08489847183227539, - "rewards/margins": 0.22926945984363556, - "rewards/margins_friction": 0.2852926254272461, - "rewards/rejected": -0.4691982865333557, - "rewards/rejected_friction": -0.3701910972595215, - "step": 505 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.27, - "learning_rate": 4.9145315359768575e-06, - "logits/chosen": -0.37294796109199524, - "logits/chosen_friction": -0.37395477294921875, - "logits/rejected": -0.3696931004524231, - "logits/rejected_friction": -0.3544881045818329, - "logps/chosen": -3.0064480304718018, - "logps/chosen_friction": -1.332297444343567, - "logps/rejected": -4.86428689956665, - "logps/rejected_friction": -4.143670082092285, - "loss": 0.0075, - "policy_friction_nll_loss": 1.3054784536361694, - "policy_nll_loss": 2.9774253368377686, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2403772622346878, - "rewards/chosen_fricton": -0.08477666229009628, - "rewards/margins": 0.18324218690395355, - "rewards/margins_friction": 0.27790015935897827, - "rewards/rejected": -0.42361944913864136, - "rewards/rejected_friction": -0.36267685890197754, + "epoch": 0.15, + "learning_rate": 4.7574339080064046e-06, + "logits/chosen": -0.2956729829311371, + "logits/rejected": -0.29699647426605225, + "logps/chosen": -432.17486572265625, + "logps/rejected": -441.1890563964844, + "loss": 0.4989, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2666178941726685, + "rewards/margins": 0.7352741956710815, + "rewards/rejected": -2.001891851425171, "step": 510 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.27, - "learning_rate": 4.9128236735941415e-06, - "logits/chosen": -0.38037171959877014, - "logits/chosen_friction": -0.37804514169692993, - "logits/rejected": -0.3725968897342682, - "logits/rejected_friction": -0.3545185625553131, - "logps/chosen": -2.9758799076080322, - "logps/chosen_friction": -1.2965240478515625, - "logps/rejected": -4.943713188171387, - "logps/rejected_friction": -4.2497100830078125, - "loss": 0.0063, - "policy_friction_nll_loss": 1.2857757806777954, - "policy_nll_loss": 2.9581093788146973, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2381453961133957, - "rewards/chosen_fricton": -0.0812692791223526, - "rewards/margins": 0.19374985992908478, - "rewards/margins_friction": 0.2914428114891052, - "rewards/rejected": -0.43189531564712524, - "rewards/rejected_friction": -0.37271207571029663, - "step": 515 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.28, - "learning_rate": 4.911099218725966e-06, - "logits/chosen": -0.3569319546222687, - "logits/chosen_friction": -0.35978397727012634, - "logits/rejected": -0.3482981324195862, - "logits/rejected_friction": -0.3380506932735443, - "logps/chosen": -3.0628843307495117, - "logps/chosen_friction": -1.350184440612793, - "logps/rejected": -5.072754859924316, - "logps/rejected_friction": -4.215649604797363, - "loss": 0.0062, - "policy_friction_nll_loss": 1.313066840171814, - "policy_nll_loss": 2.9975008964538574, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24698762595653534, - "rewards/chosen_fricton": -0.08734525740146637, - "rewards/margins": 0.1974131315946579, - "rewards/margins_friction": 0.2822173833847046, - "rewards/rejected": -0.44440072774887085, - "rewards/rejected_friction": -0.36956265568733215, + "epoch": 0.16, + "learning_rate": 4.745664346957362e-06, + "logits/chosen": -0.29319706559181213, + "logits/rejected": -0.2932819724082947, + "logps/chosen": -441.1473083496094, + "logps/rejected": -443.6536560058594, + "loss": 0.5431, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1980129480361938, + "rewards/margins": 0.7274158000946045, + "rewards/rejected": -1.9254287481307983, "step": 520 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.28, - "learning_rate": 4.909358183231049e-06, - "logits/chosen": -0.3633722960948944, - "logits/chosen_friction": -0.36529648303985596, - "logits/rejected": -0.35473349690437317, - "logits/rejected_friction": -0.34482401609420776, - "logps/chosen": -2.970158100128174, - "logps/chosen_friction": -1.2897554636001587, - "logps/rejected": -4.961055278778076, - "logps/rejected_friction": -4.142546653747559, - "loss": 0.0052, - "policy_friction_nll_loss": 1.2507188320159912, - "policy_nll_loss": 2.9046480655670166, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2376093566417694, - "rewards/chosen_fricton": -0.08102350682020187, - "rewards/margins": 0.1967466175556183, - "rewards/margins_friction": 0.2823033928871155, - "rewards/rejected": -0.43435603380203247, - "rewards/rejected_friction": -0.36332690715789795, - "step": 525 - }, - { - "directrewards_student/accuracies": 0.9750000238418579, - "epoch": 0.28, - "learning_rate": 4.9076005790821265e-06, - "logits/chosen": -0.36544305086135864, - "logits/chosen_friction": -0.3654644191265106, - "logits/rejected": -0.3607226312160492, - "logits/rejected_friction": -0.34607887268066406, - "logps/chosen": -2.9406187534332275, - "logps/chosen_friction": -1.2371890544891357, - "logps/rejected": -4.835099697113037, - "logps/rejected_friction": -4.169564723968506, - "loss": 0.0077, - "policy_friction_nll_loss": 1.2142053842544556, - "policy_nll_loss": 2.9067881107330322, - "rewards/accuracies": 0.9750000238418579, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23489150404930115, - "rewards/chosen_fricton": -0.0764525905251503, - "rewards/margins": 0.186421200633049, - "rewards/margins_friction": 0.2894664406776428, - "rewards/rejected": -0.42131271958351135, - "rewards/rejected_friction": -0.36591899394989014, + "epoch": 0.16, + "learning_rate": 4.733631247240435e-06, + "logits/chosen": -0.28386861085891724, + "logits/rejected": -0.28545230627059937, + "logps/chosen": -424.7322692871094, + "logps/rejected": -432.74920654296875, + "loss": 0.5172, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3047645092010498, + "rewards/margins": 0.7416442632675171, + "rewards/rejected": -2.0464088916778564, "step": 530 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.29, - "learning_rate": 4.905826418365878e-06, - "logits/chosen": -0.3727979362010956, - "logits/chosen_friction": -0.37456947565078735, - "logits/rejected": -0.36715978384017944, - "logits/rejected_friction": -0.35148149728775024, - "logps/chosen": -3.111140727996826, - "logps/chosen_friction": -1.322655439376831, - "logps/rejected": -5.177600860595703, - "logps/rejected_friction": -4.362905502319336, - "loss": 0.0064, - "policy_friction_nll_loss": 1.3051303625106812, - "policy_nll_loss": 3.089388608932495, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2505788505077362, - "rewards/chosen_fricton": -0.08369584381580353, - "rewards/margins": 0.20267021656036377, - "rewards/margins_friction": 0.29910263419151306, - "rewards/rejected": -0.4532490670681, - "rewards/rejected_friction": -0.3827984631061554, - "step": 535 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.29, - "learning_rate": 4.9040357132828356e-06, - "logits/chosen": -0.37000876665115356, - "logits/chosen_friction": -0.3687375485897064, - "logits/rejected": -0.36584120988845825, - "logits/rejected_friction": -0.34627047181129456, - "logps/chosen": -3.0141587257385254, - "logps/chosen_friction": -1.2932196855545044, - "logps/rejected": -4.96317195892334, - "logps/rejected_friction": -4.290286540985107, - "loss": 0.0059, - "policy_friction_nll_loss": 1.2601526975631714, - "policy_nll_loss": 2.965761661529541, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2402983456850052, - "rewards/chosen_fricton": -0.08125822991132736, - "rewards/margins": 0.1910334825515747, - "rewards/margins_friction": 0.295121967792511, - "rewards/rejected": -0.4313318729400635, - "rewards/rejected_friction": -0.37638023495674133, + "epoch": 0.16, + "learning_rate": 4.721336020993228e-06, + "logits/chosen": -0.29582637548446655, + "logits/rejected": -0.2965632379055023, + "logps/chosen": -428.98992919921875, + "logps/rejected": -436.71533203125, + "loss": 0.5223, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.232280969619751, + "rewards/margins": 0.7531365752220154, + "rewards/rejected": -1.9854176044464111, "step": 540 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.29, - "learning_rate": 4.902228476147304e-06, - "logits/chosen": -0.3699187934398651, - "logits/chosen_friction": -0.3680361807346344, - "logits/rejected": -0.3704683184623718, - "logits/rejected_friction": -0.34429430961608887, - "logps/chosen": -3.176906108856201, - "logps/chosen_friction": -1.3612167835235596, - "logps/rejected": -4.985743522644043, - "logps/rejected_friction": -4.370680332183838, - "loss": 0.007, - "policy_friction_nll_loss": 1.3406118154525757, - "policy_nll_loss": 3.145676851272583, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25391003489494324, - "rewards/chosen_fricton": -0.08463708311319351, - "rewards/margins": 0.17800390720367432, - "rewards/margins_friction": 0.29753661155700684, - "rewards/rejected": -0.43191400170326233, - "rewards/rejected_friction": -0.38217365741729736, - "step": 545 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.29, - "learning_rate": 4.900404719387278e-06, - "logits/chosen": -0.36549481749534607, - "logits/chosen_friction": -0.36551564931869507, - "logits/rejected": -0.3619309961795807, - "logits/rejected_friction": -0.3443447947502136, - "logps/chosen": -3.0601348876953125, - "logps/chosen_friction": -1.3081169128417969, - "logps/rejected": -4.935264587402344, - "logps/rejected_friction": -4.266964912414551, - "loss": 0.004, - "policy_friction_nll_loss": 1.2848719358444214, - "policy_nll_loss": 3.0163626670837402, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.244129940867424, - "rewards/chosen_fricton": -0.08129890263080597, - "rewards/margins": 0.18401093780994415, - "rewards/margins_friction": 0.2915988564491272, - "rewards/rejected": -0.42814087867736816, - "rewards/rejected_friction": -0.37289777398109436, + "epoch": 0.17, + "learning_rate": 4.708780111115058e-06, + "logits/chosen": -0.3022860884666443, + "logits/rejected": -0.303489625453949, + "logps/chosen": -434.28936767578125, + "logps/rejected": -439.01043701171875, + "loss": 0.506, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1110032796859741, + "rewards/margins": 0.7986260652542114, + "rewards/rejected": -1.909629225730896, "step": 550 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.3, - "learning_rate": 4.898564455544352e-06, - "logits/chosen": -0.36489489674568176, - "logits/chosen_friction": -0.36835500597953796, - "logits/rejected": -0.3581060767173767, - "logits/rejected_friction": -0.34782707691192627, - "logps/chosen": -2.941488027572632, - "logps/chosen_friction": -1.2899830341339111, - "logps/rejected": -4.967020034790039, - "logps/rejected_friction": -4.196030616760254, - "loss": 0.0055, - "policy_friction_nll_loss": 1.2542521953582764, - "policy_nll_loss": 2.8890764713287354, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23428687453269958, - "rewards/chosen_fricton": -0.08072181046009064, - "rewards/margins": 0.20004919171333313, - "rewards/margins_friction": 0.2875484824180603, - "rewards/rejected": -0.4343360960483551, - "rewards/rejected_friction": -0.36827024817466736, - "step": 555 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.3, - "learning_rate": 4.896707697273637e-06, - "logits/chosen": -0.3737970292568207, - "logits/chosen_friction": -0.3748602271080017, - "logits/rejected": -0.3723887801170349, - "logits/rejected_friction": -0.3530288636684418, - "logps/chosen": -3.0456326007843018, - "logps/chosen_friction": -1.3191510438919067, - "logps/rejected": -4.92965841293335, - "logps/rejected_friction": -4.337514400482178, - "loss": 0.005, - "policy_friction_nll_loss": 1.2967182397842407, - "policy_nll_loss": 3.0189669132232666, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24332842230796814, - "rewards/chosen_fricton": -0.08296666294336319, - "rewards/margins": 0.1845158040523529, - "rewards/margins_friction": 0.2969358563423157, - "rewards/rejected": -0.42784422636032104, - "rewards/rejected_friction": -0.37990251183509827, + "epoch": 0.17, + "learning_rate": 4.6959649910976165e-06, + "logits/chosen": -0.3028009533882141, + "logits/rejected": -0.3035816550254822, + "logps/chosen": -433.6151428222656, + "logps/rejected": -436.40045166015625, + "loss": 0.5109, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.0793737173080444, + "rewards/margins": 0.753380537033081, + "rewards/rejected": -1.832754373550415, "step": 560 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.3, - "learning_rate": 4.894834457343673e-06, - "logits/chosen": -0.3857825994491577, - "logits/chosen_friction": -0.3840736746788025, - "logits/rejected": -0.3876180052757263, - "logits/rejected_friction": -0.3633562922477722, - "logps/chosen": -2.9170608520507812, - "logps/chosen_friction": -1.2182682752609253, - "logps/rejected": -4.7514142990112305, - "logps/rejected_friction": -4.285270690917969, - "loss": 0.0034, - "policy_friction_nll_loss": 1.2022027969360352, - "policy_nll_loss": 2.888205051422119, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23138098418712616, - "rewards/chosen_fricton": -0.07316040992736816, - "rewards/margins": 0.17967157065868378, - "rewards/margins_friction": 0.30218613147735596, - "rewards/rejected": -0.41105255484580994, - "rewards/rejected_friction": -0.37534651160240173, - "step": 565 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.3, - "learning_rate": 4.892944748636339e-06, - "logits/chosen": -0.3814513087272644, - "logits/chosen_friction": -0.3828297257423401, - "logits/rejected": -0.37094739079475403, - "logits/rejected_friction": -0.36379674077033997, - "logps/chosen": -2.8514885902404785, - "logps/chosen_friction": -1.1874829530715942, - "logps/rejected": -4.895083904266357, - "logps/rejected_friction": -4.074179172515869, - "loss": 0.0048, - "policy_friction_nll_loss": 1.1597099304199219, - "policy_nll_loss": 2.8035573959350586, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23047485947608948, - "rewards/chosen_fricton": -0.07519306242465973, - "rewards/margins": 0.20012304186820984, - "rewards/margins_friction": 0.2838483452796936, - "rewards/rejected": -0.4305979311466217, - "rewards/rejected_friction": -0.35904139280319214, + "epoch": 0.17, + "learning_rate": 4.682892164852057e-06, + "logits/chosen": -0.29320716857910156, + "logits/rejected": -0.29399818181991577, + "logps/chosen": -428.3548889160156, + "logps/rejected": -433.96124267578125, + "loss": 0.5566, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1920106410980225, + "rewards/margins": 0.6818917989730835, + "rewards/rejected": -1.8739025592803955, "step": 570 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.31, - "learning_rate": 4.89103858414677e-06, - "logits/chosen": -0.3740740418434143, - "logits/chosen_friction": -0.3741939663887024, - "logits/rejected": -0.3693850040435791, - "logits/rejected_friction": -0.35136815905570984, - "logps/chosen": -2.9761691093444824, - "logps/chosen_friction": -1.2318999767303467, - "logps/rejected": -5.0625834465026855, - "logps/rejected_friction": -4.407397747039795, - "loss": 0.0049, - "policy_friction_nll_loss": 1.2178975343704224, - "policy_nll_loss": 2.949178695678711, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23696951568126678, - "rewards/chosen_fricton": -0.0744483470916748, - "rewards/margins": 0.2056441605091095, - "rewards/margins_friction": 0.313903272151947, - "rewards/rejected": -0.4426136612892151, - "rewards/rejected_friction": -0.38835158944129944, - "step": 575 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.31, - "learning_rate": 4.889115976983259e-06, - "logits/chosen": -0.39257627725601196, - "logits/chosen_friction": -0.3935685455799103, - "logits/rejected": -0.38717812299728394, - "logits/rejected_friction": -0.3710262179374695, - "logps/chosen": -2.9415717124938965, - "logps/chosen_friction": -1.2232686281204224, - "logps/rejected": -4.915060997009277, - "logps/rejected_friction": -4.254744529724121, - "loss": 0.0054, - "policy_friction_nll_loss": 1.2167807817459106, - "policy_nll_loss": 2.929110050201416, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23713883757591248, - "rewards/chosen_fricton": -0.07668948173522949, - "rewards/margins": 0.1938779056072235, - "rewards/margins_friction": 0.2988361418247223, - "rewards/rejected": -0.431016743183136, - "rewards/rejected_friction": -0.37552565336227417, + "epoch": 0.18, + "learning_rate": 4.669563166532504e-06, + "logits/chosen": -0.29630088806152344, + "logits/rejected": -0.2984740138053894, + "logps/chosen": -428.59405517578125, + "logps/rejected": -439.8580017089844, + "loss": 0.5099, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0762312412261963, + "rewards/margins": 0.8075464963912964, + "rewards/rejected": -1.8837776184082031, "step": 580 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.31, - "learning_rate": 4.887176940367179e-06, - "logits/chosen": -0.3831252157688141, - "logits/chosen_friction": -0.3856685757637024, - "logits/rejected": -0.3747841715812683, - "logits/rejected_friction": -0.3625125288963318, - "logps/chosen": -3.0540244579315186, - "logps/chosen_friction": -1.1927152872085571, - "logps/rejected": -5.0831804275512695, - "logps/rejected_friction": -4.1354546546936035, - "loss": 0.0045, - "policy_friction_nll_loss": 1.1798673868179321, - "policy_nll_loss": 3.0246217250823975, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24760928750038147, - "rewards/chosen_fricton": -0.07334326952695847, - "rewards/margins": 0.1996861696243286, - "rewards/margins_friction": 0.29023632407188416, - "rewards/rejected": -0.4472953677177429, - "rewards/rejected_friction": -0.36357957124710083, - "step": 585 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.31, - "learning_rate": 4.885221487632877e-06, - "logits/chosen": -0.37928256392478943, - "logits/chosen_friction": -0.3812408149242401, - "logits/rejected": -0.37743598222732544, - "logits/rejected_friction": -0.35908108949661255, - "logps/chosen": -2.9218530654907227, - "logps/chosen_friction": -1.2480697631835938, - "logps/rejected": -4.7808051109313965, - "logps/rejected_friction": -4.173957824707031, - "loss": 0.0078, - "policy_friction_nll_loss": 1.2350484132766724, - "policy_nll_loss": 2.8798210620880127, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23445403575897217, - "rewards/chosen_fricton": -0.07867997884750366, - "rewards/margins": 0.18237946927547455, - "rewards/margins_friction": 0.2885260581970215, - "rewards/rejected": -0.4168334901332855, - "rewards/rejected_friction": -0.36720603704452515, + "epoch": 0.18, + "learning_rate": 4.655979560356006e-06, + "logits/chosen": -0.299476683139801, + "logits/rejected": -0.30079394578933716, + "logps/chosen": -437.24359130859375, + "logps/rejected": -444.2662048339844, + "loss": 0.4679, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0544074773788452, + "rewards/margins": 0.8957304954528809, + "rewards/rejected": -1.9501378536224365, "step": 590 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.32, - "learning_rate": 4.883249632227595e-06, - "logits/chosen": -0.37339144945144653, - "logits/chosen_friction": -0.3742247521877289, - "logits/rejected": -0.36874204874038696, - "logits/rejected_friction": -0.35345956683158875, - "logps/chosen": -2.961474895477295, - "logps/chosen_friction": -1.221327543258667, - "logps/rejected": -4.865573406219482, - "logps/rejected_friction": -4.170202732086182, - "loss": 0.0063, - "policy_friction_nll_loss": 1.1941026449203491, - "policy_nll_loss": 2.9148340225219727, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23688066005706787, - "rewards/chosen_fricton": -0.07494596391916275, - "rewards/margins": 0.18786926567554474, - "rewards/margins_friction": 0.291456401348114, - "rewards/rejected": -0.4247499108314514, - "rewards/rejected_friction": -0.36640235781669617, - "step": 595 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.32, - "learning_rate": 4.8812613877113704e-06, - "logits/chosen": -0.3844270706176758, - "logits/chosen_friction": -0.38724273443222046, - "logits/rejected": -0.3775622248649597, - "logits/rejected_friction": -0.3644481301307678, - "logps/chosen": -3.1747889518737793, - "logps/chosen_friction": -1.3294063806533813, - "logps/rejected": -5.17557954788208, - "logps/rejected_friction": -4.2473554611206055, - "loss": 0.0066, - "policy_friction_nll_loss": 1.319451093673706, - "policy_nll_loss": 3.147367000579834, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.257737398147583, - "rewards/chosen_fricton": -0.08557120710611343, - "rewards/margins": 0.19738562405109406, - "rewards/margins_friction": 0.28809845447540283, - "rewards/rejected": -0.45512300729751587, - "rewards/rejected_friction": -0.37366965413093567, + "epoch": 0.18, + "learning_rate": 4.642142940418973e-06, + "logits/chosen": -0.3016494810581207, + "logits/rejected": -0.3028479218482971, + "logps/chosen": -428.2562561035156, + "logps/rejected": -436.1544494628906, + "loss": 0.4446, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.1558116674423218, + "rewards/margins": 0.9864055514335632, + "rewards/rejected": -2.1422171592712402, "step": 600 }, { - "epoch": 0.32, - "eval_directrewards_student/accuracies": 1.0, - "eval_logits/chosen": -0.40988969802856445, - "eval_logits/chosen_friction": -0.4123106002807617, - "eval_logits/rejected": -0.403967022895813, - "eval_logits/rejected_friction": -0.38654080033302307, - "eval_logps/chosen": -3.0868656635284424, - "eval_logps/chosen_friction": -1.3004549741744995, - "eval_logps/rejected": -5.113455295562744, - "eval_logps/rejected_friction": -4.310952186584473, - "eval_loss": 0.005615592934191227, - "eval_policy_friction_nll_loss": 1.3004549741744995, - "eval_policy_nll_loss": 3.0868656635284424, - "eval_rewards/accuracies": 1.0, - "eval_rewards/accuracies_friction": 1.0, - "eval_rewards/chosen": -0.2487444281578064, - "eval_rewards/chosen_fricton": -0.08177678287029266, - "eval_rewards/margins": 0.19918113946914673, - "eval_rewards/margins_friction": 0.2969025671482086, - "eval_rewards/rejected": -0.4479255974292755, - "eval_rewards/rejected_friction": -0.3786793351173401, - "eval_runtime": 540.5215, - "eval_samples_per_second": 0.925, - "eval_steps_per_second": 0.463, + "epoch": 0.18, + "eval_logits/chosen": -0.3611030876636505, + "eval_logits/rejected": -0.36189064383506775, + "eval_logps/chosen": -425.2828674316406, + "eval_logps/rejected": -432.32147216796875, + "eval_loss": 0.4912301301956177, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": -1.239168405532837, + "eval_rewards/margins": 0.895074725151062, + "eval_rewards/rejected": -2.1342432498931885, + "eval_runtime": 376.2893, + "eval_samples_per_second": 1.329, + "eval_steps_per_second": 1.329, "step": 600 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.32, - "learning_rate": 4.879256767756949e-06, - "logits/chosen": -0.36838045716285706, - "logits/chosen_friction": -0.36952444911003113, - "logits/rejected": -0.36654213070869446, - "logits/rejected_friction": -0.34628573060035706, - "logps/chosen": -3.203096866607666, - "logps/chosen_friction": -1.362221360206604, - "logps/rejected": -5.154852390289307, - "logps/rejected_friction": -4.5021491050720215, - "loss": 0.0052, - "policy_friction_nll_loss": 1.342139482498169, - "policy_nll_loss": 3.1736302375793457, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25601083040237427, - "rewards/chosen_fricton": -0.08459682017564774, - "rewards/margins": 0.19164033234119415, - "rewards/margins_friction": 0.3093457520008087, - "rewards/rejected": -0.4476511478424072, - "rewards/rejected_friction": -0.39394259452819824, - "step": 605 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.33, - "learning_rate": 4.877235786149681e-06, - "logits/chosen": -0.36276641488075256, - "logits/chosen_friction": -0.3629451394081116, - "logits/rejected": -0.3588744103908539, - "logits/rejected_friction": -0.34162068367004395, - "logps/chosen": -3.096522808074951, - "logps/chosen_friction": -1.3487895727157593, - "logps/rejected": -4.951355934143066, - "logps/rejected_friction": -4.383577346801758, - "loss": 0.0072, - "policy_friction_nll_loss": 1.3117563724517822, - "policy_nll_loss": 3.0344691276550293, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2498159110546112, - "rewards/chosen_fricton": -0.08677727729082108, - "rewards/margins": 0.1815294772386551, - "rewards/margins_friction": 0.29865169525146484, - "rewards/rejected": -0.4313453733921051, - "rewards/rejected_friction": -0.3854289650917053, + "epoch": 0.19, + "learning_rate": 4.6280549305101065e-06, + "logits/chosen": -0.30701732635498047, + "logits/rejected": -0.30861714482307434, + "logps/chosen": -430.90643310546875, + "logps/rejected": -437.9549255371094, + "loss": 0.545, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4101296663284302, + "rewards/margins": 0.7359235286712646, + "rewards/rejected": -2.1460530757904053, "step": 610 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.33, - "learning_rate": 4.875198456787439e-06, - "logits/chosen": -0.37790223956108093, - "logits/chosen_friction": -0.37998154759407043, - "logits/rejected": -0.37085336446762085, - "logits/rejected_friction": -0.35911816358566284, - "logps/chosen": -2.8618197441101074, - "logps/chosen_friction": -1.223262906074524, - "logps/rejected": -4.816946029663086, - "logps/rejected_friction": -4.181302547454834, - "loss": 0.0041, - "policy_friction_nll_loss": 1.2001675367355347, - "policy_nll_loss": 2.8179380893707275, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2285165786743164, - "rewards/chosen_fricton": -0.07599735260009766, - "rewards/margins": 0.19185248017311096, - "rewards/margins_friction": 0.29158100485801697, - "rewards/rejected": -0.42036905884742737, - "rewards/rejected_friction": -0.3675784170627594, - "step": 615 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.33, - "learning_rate": 4.873144793680511e-06, - "logits/chosen": -0.3753190338611603, - "logits/chosen_friction": -0.37545841932296753, - "logits/rejected": -0.3708342909812927, - "logits/rejected_friction": -0.35159119963645935, - "logps/chosen": -3.0468380451202393, - "logps/chosen_friction": -1.316632866859436, - "logps/rejected": -5.022671699523926, - "logps/rejected_friction": -4.364579677581787, - "loss": 0.0056, - "policy_friction_nll_loss": 1.3007173538208008, - "policy_nll_loss": 3.021544933319092, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2441560924053192, - "rewards/chosen_fricton": -0.08259124308824539, - "rewards/margins": 0.19379955530166626, - "rewards/margins_friction": 0.29992660880088806, - "rewards/rejected": -0.43795567750930786, - "rewards/rejected_friction": -0.38251787424087524, + "epoch": 0.19, + "learning_rate": 4.61371718391983e-06, + "logits/chosen": -0.30552786588668823, + "logits/rejected": -0.30662640929222107, + "logps/chosen": -432.50506591796875, + "logps/rejected": -443.53216552734375, + "loss": 0.486, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.181477427482605, + "rewards/margins": 0.9552658796310425, + "rewards/rejected": -2.1367435455322266, "step": 620 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.33, - "learning_rate": 4.871074810951509e-06, - "logits/chosen": -0.38454872369766235, - "logits/chosen_friction": -0.38569512963294983, - "logits/rejected": -0.3823607563972473, - "logits/rejected_friction": -0.3651474118232727, - "logps/chosen": -2.9188783168792725, - "logps/chosen_friction": -1.2284024953842163, - "logps/rejected": -4.8421125411987305, - "logps/rejected_friction": -4.270394325256348, - "loss": 0.0047, - "policy_friction_nll_loss": 1.2117477655410767, - "policy_nll_loss": 2.889458417892456, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.233725905418396, - "rewards/chosen_fricton": -0.07602645456790924, - "rewards/margins": 0.18911294639110565, - "rewards/margins_friction": 0.300600528717041, - "rewards/rejected": -0.42283883690834045, - "rewards/rejected_friction": -0.37662696838378906, - "step": 625 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.34, - "learning_rate": 4.868988522835274e-06, - "logits/chosen": -0.37047240138053894, - "logits/chosen_friction": -0.372148334980011, - "logits/rejected": -0.36696720123291016, - "logits/rejected_friction": -0.35054296255111694, - "logps/chosen": -2.8974075317382812, - "logps/chosen_friction": -1.2300665378570557, - "logps/rejected": -4.736468315124512, - "logps/rejected_friction": -4.195817470550537, - "loss": 0.005, - "policy_friction_nll_loss": 1.2026625871658325, - "policy_nll_loss": 2.85752272605896, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23099641501903534, - "rewards/chosen_fricton": -0.07558579742908478, - "rewards/margins": 0.18030384182929993, - "rewards/margins_friction": 0.2921091914176941, - "rewards/rejected": -0.4113002419471741, - "rewards/rejected_friction": -0.3676949739456177, + "epoch": 0.19, + "learning_rate": 4.599131383246277e-06, + "logits/chosen": -0.308699369430542, + "logits/rejected": -0.308963418006897, + "logps/chosen": -443.76837158203125, + "logps/rejected": -452.234130859375, + "loss": 0.5178, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5818192958831787, + "rewards/margins": 0.70851069688797, + "rewards/rejected": -2.290329933166504, "step": 630 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.34, - "learning_rate": 4.866885943678774e-06, - "logits/chosen": -0.3652501106262207, - "logits/chosen_friction": -0.3681142330169678, - "logits/rejected": -0.35625049471855164, - "logits/rejected_friction": -0.34370049834251404, - "logps/chosen": -3.0641226768493652, - "logps/chosen_friction": -1.2883943319320679, - "logps/rejected": -5.033955097198486, - "logps/rejected_friction": -4.274120807647705, - "loss": 0.0038, - "policy_friction_nll_loss": 1.2607519626617432, - "policy_nll_loss": 3.0188755989074707, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24520370364189148, - "rewards/chosen_fricton": -0.08013134449720383, - "rewards/margins": 0.1940164417028427, - "rewards/margins_friction": 0.2950137257575989, - "rewards/rejected": -0.439220130443573, - "rewards/rejected_friction": -0.3751450479030609, - "step": 635 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.34, - "learning_rate": 4.864767087941008e-06, - "logits/chosen": -0.37013569474220276, - "logits/chosen_friction": -0.372783899307251, - "logits/rejected": -0.3663265109062195, - "logits/rejected_friction": -0.34835073351860046, - "logps/chosen": -3.073009967803955, - "logps/chosen_friction": -1.280197024345398, - "logps/rejected": -5.019650459289551, - "logps/rejected_friction": -4.387875080108643, - "loss": 0.0041, - "policy_friction_nll_loss": 1.2628265619277954, - "policy_nll_loss": 3.034191608428955, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24486100673675537, - "rewards/chosen_fricton": -0.07790660858154297, - "rewards/margins": 0.1913476586341858, - "rewards/margins_friction": 0.3068021833896637, - "rewards/rejected": -0.43620872497558594, - "rewards/rejected_friction": -0.38470882177352905, + "epoch": 0.19, + "learning_rate": 4.584299240197826e-06, + "logits/chosen": -0.29901835322380066, + "logits/rejected": -0.2997357249259949, + "logps/chosen": -437.3292541503906, + "logps/rejected": -438.70013427734375, + "loss": 0.4941, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4868736267089844, + "rewards/margins": 0.9190858602523804, + "rewards/rejected": -2.405959129333496, "step": 640 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.34, - "learning_rate": 4.8626319701929045e-06, - "logits/chosen": -0.3745843768119812, - "logits/chosen_friction": -0.376962274312973, - "logits/rejected": -0.3719872534275055, - "logits/rejected_friction": -0.35481345653533936, - "logps/chosen": -2.923954725265503, - "logps/chosen_friction": -1.2312438488006592, - "logps/rejected": -4.743758201599121, - "logps/rejected_friction": -4.163157939910889, - "loss": 0.0051, - "policy_friction_nll_loss": 1.2159171104431152, - "policy_nll_loss": 2.896135091781616, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23146633803844452, - "rewards/chosen_fricton": -0.0749845877289772, - "rewards/margins": 0.1779761016368866, - "rewards/margins_friction": 0.28825247287750244, - "rewards/rejected": -0.4094424843788147, - "rewards/rejected_friction": -0.36323705315589905, - "step": 645 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.35, - "learning_rate": 4.860480605117225e-06, - "logits/chosen": -0.3936045467853546, - "logits/chosen_friction": -0.3979749083518982, - "logits/rejected": -0.3819504976272583, - "logits/rejected_friction": -0.3741615414619446, - "logps/chosen": -2.8594491481781006, - "logps/chosen_friction": -1.1894302368164062, - "logps/rejected": -4.970756530761719, - "logps/rejected_friction": -4.1009345054626465, - "loss": 0.0044, - "policy_friction_nll_loss": 1.1756641864776611, - "policy_nll_loss": 2.8335211277008057, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.22908595204353333, - "rewards/chosen_fricton": -0.07329393178224564, - "rewards/margins": 0.20823995769023895, - "rewards/margins_friction": 0.2874842882156372, - "rewards/rejected": -0.4373258948326111, - "rewards/rejected_friction": -0.36077824234962463, + "epoch": 0.2, + "learning_rate": 4.569222495392227e-06, + "logits/chosen": -0.30075928568840027, + "logits/rejected": -0.30218517780303955, + "logps/chosen": -437.5245056152344, + "logps/rejected": -447.72271728515625, + "loss": 0.4425, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6097447872161865, + "rewards/margins": 1.0334211587905884, + "rewards/rejected": -2.6431655883789062, "step": 650 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.35, - "learning_rate": 4.858313007508456e-06, - "logits/chosen": -0.3642657995223999, - "logits/chosen_friction": -0.36636728048324585, - "logits/rejected": -0.35379675030708313, - "logits/rejected_friction": -0.34132882952690125, - "logps/chosen": -3.2559585571289062, - "logps/chosen_friction": -1.3138575553894043, - "logps/rejected": -5.380666732788086, - "logps/rejected_friction": -4.482171058654785, - "loss": 0.0052, - "policy_friction_nll_loss": 1.292079210281372, - "policy_nll_loss": 3.211887836456299, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.26227259635925293, - "rewards/chosen_fricton": -0.07969280332326889, - "rewards/margins": 0.2094079554080963, - "rewards/margins_friction": 0.3131193220615387, - "rewards/rejected": -0.47168055176734924, - "rewards/rejected_friction": -0.3928121030330658, - "step": 655 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.35, - "learning_rate": 4.856129192272716e-06, - "logits/chosen": -0.37704694271087646, - "logits/chosen_friction": -0.3802163004875183, - "logits/rejected": -0.36783483624458313, - "logits/rejected_friction": -0.3598151206970215, - "logps/chosen": -2.8364193439483643, - "logps/chosen_friction": -1.204508662223816, - "logps/rejected": -4.8785810470581055, - "logps/rejected_friction": -4.138361930847168, - "loss": 0.0064, - "policy_friction_nll_loss": 1.1810928583145142, - "policy_nll_loss": 2.79130220413208, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.22612512111663818, - "rewards/chosen_fricton": -0.07395689189434052, - "rewards/margins": 0.202052503824234, - "rewards/margins_friction": 0.29070720076560974, - "rewards/rejected": -0.4281776547431946, - "rewards/rejected_friction": -0.36466413736343384, + "epoch": 0.2, + "learning_rate": 4.553902918152329e-06, + "logits/chosen": -0.3034583628177643, + "logits/rejected": -0.3045238256454468, + "logps/chosen": -439.45159912109375, + "logps/rejected": -448.080322265625, + "loss": 0.4796, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.7247978448867798, + "rewards/margins": 1.0205966234207153, + "rewards/rejected": -2.745394229888916, "step": 660 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.35, - "learning_rate": 4.853929174427647e-06, - "logits/chosen": -0.3786622881889343, - "logits/chosen_friction": -0.37854382395744324, - "logits/rejected": -0.37943875789642334, - "logits/rejected_friction": -0.35848087072372437, - "logps/chosen": -3.053617000579834, - "logps/chosen_friction": -1.214845895767212, - "logps/rejected": -4.762986660003662, - "logps/rejected_friction": -4.232400894165039, - "loss": 0.0062, - "policy_friction_nll_loss": 1.197760820388794, - "policy_nll_loss": 3.0220353603363037, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24410071969032288, - "rewards/chosen_fricton": -0.07254479080438614, - "rewards/margins": 0.16692794859409332, - "rewards/margins_friction": 0.29672184586524963, - "rewards/rejected": -0.4110286831855774, - "rewards/rejected_friction": -0.36926665902137756, - "step": 665 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.36, - "learning_rate": 4.8517129691023135e-06, - "logits/chosen": -0.3752982020378113, - "logits/chosen_friction": -0.3796137571334839, - "logits/rejected": -0.368490070104599, - "logits/rejected_friction": -0.35628166794776917, - "logps/chosen": -2.9030508995056152, - "logps/chosen_friction": -1.1891720294952393, - "logps/rejected": -4.918780326843262, - "logps/rejected_friction": -4.23046875, - "loss": 0.0051, - "policy_friction_nll_loss": 1.1663764715194702, - "policy_nll_loss": 2.8648087978363037, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.22972413897514343, - "rewards/chosen_fricton": -0.07020814716815948, - "rewards/margins": 0.19838979840278625, - "rewards/margins_friction": 0.3003046214580536, - "rewards/rejected": -0.4281139373779297, - "rewards/rejected_friction": -0.37051278352737427, + "epoch": 0.2, + "learning_rate": 4.5383423062984455e-06, + "logits/chosen": -0.3042409420013428, + "logits/rejected": -0.3053613603115082, + "logps/chosen": -432.8832092285156, + "logps/rejected": -440.6971130371094, + "loss": 0.468, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.8063684701919556, + "rewards/margins": 0.9467433094978333, + "rewards/rejected": -2.7531113624572754, "step": 670 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.36, - "learning_rate": 4.849480591537097e-06, - "logits/chosen": -0.3794946074485779, - "logits/chosen_friction": -0.38210901618003845, - "logits/rejected": -0.3679543733596802, - "logits/rejected_friction": -0.3584458827972412, - "logps/chosen": -3.0573596954345703, - "logps/chosen_friction": -1.2234084606170654, - "logps/rejected": -5.120114326477051, - "logps/rejected_friction": -4.274056434631348, - "loss": 0.004, - "policy_friction_nll_loss": 1.2065708637237549, - "policy_nll_loss": 3.0257039070129395, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24495160579681396, - "rewards/chosen_fricton": -0.07385354489088058, - "rewards/margins": 0.20304660499095917, - "rewards/margins_friction": 0.3011299967765808, - "rewards/rejected": -0.44799819588661194, - "rewards/rejected_friction": -0.374983549118042, - "step": 675 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.36, - "learning_rate": 4.8472320570835925e-06, - "logits/chosen": -0.3740817904472351, - "logits/chosen_friction": -0.37646371126174927, - "logits/rejected": -0.3716391623020172, - "logits/rejected_friction": -0.3546949028968811, - "logps/chosen": -2.9149913787841797, - "logps/chosen_friction": -1.1995642185211182, - "logps/rejected": -4.810817718505859, - "logps/rejected_friction": -4.287019729614258, - "loss": 0.0043, - "policy_friction_nll_loss": 1.1769282817840576, - "policy_nll_loss": 2.8720896244049072, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2298654019832611, - "rewards/chosen_fricton": -0.07060084491968155, - "rewards/margins": 0.1864686906337738, - "rewards/margins_friction": 0.30518317222595215, - "rewards/rejected": -0.4163340926170349, - "rewards/rejected_friction": -0.3757840394973755, + "epoch": 0.21, + "learning_rate": 4.522542485937369e-06, + "logits/chosen": -0.2990413308143616, + "logits/rejected": -0.3002299666404724, + "logps/chosen": -435.41754150390625, + "logps/rejected": -442.31201171875, + "loss": 0.4606, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9637155532836914, + "rewards/margins": 0.984288215637207, + "rewards/rejected": -2.9480037689208984, "step": 680 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.37, - "learning_rate": 4.844967381204504e-06, - "logits/chosen": -0.37780824303627014, - "logits/chosen_friction": -0.379611611366272, - "logits/rejected": -0.3733257055282593, - "logits/rejected_friction": -0.35827136039733887, - "logps/chosen": -2.8700144290924072, - "logps/chosen_friction": -1.221091628074646, - "logps/rejected": -4.791624546051025, - "logps/rejected_friction": -4.2098565101623535, - "loss": 0.004, - "policy_friction_nll_loss": 1.1985007524490356, - "policy_nll_loss": 2.8246378898620605, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.22726452350616455, - "rewards/chosen_fricton": -0.07433386892080307, - "rewards/margins": 0.18845926225185394, - "rewards/margins_friction": 0.29453858733177185, - "rewards/rejected": -0.4157238006591797, - "rewards/rejected_friction": -0.3688724637031555, - "step": 685 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.37, - "learning_rate": 4.842686579473534e-06, - "logits/chosen": -0.3805032968521118, - "logits/chosen_friction": -0.38289207220077515, - "logits/rejected": -0.37737488746643066, - "logits/rejected_friction": -0.36530399322509766, - "logps/chosen": -2.7965569496154785, - "logps/chosen_friction": -1.1750874519348145, - "logps/rejected": -4.6683855056762695, - "logps/rejected_friction": -4.175545692443848, - "loss": 0.0069, - "policy_friction_nll_loss": 1.1562236547470093, - "policy_nll_loss": 2.753976583480835, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2206982672214508, - "rewards/chosen_fricton": -0.07135482877492905, - "rewards/margins": 0.18342360854148865, - "rewards/margins_friction": 0.29574063420295715, - "rewards/rejected": -0.40412193536758423, - "rewards/rejected_friction": -0.3670954406261444, + "epoch": 0.21, + "learning_rate": 4.5065053112480725e-06, + "logits/chosen": -0.3054850697517395, + "logits/rejected": -0.3073977530002594, + "logps/chosen": -433.15771484375, + "logps/rejected": -440.9410095214844, + "loss": 0.4933, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.1777331829071045, + "rewards/margins": 0.8640511631965637, + "rewards/rejected": -3.0417845249176025, "step": 690 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.37, - "learning_rate": 4.840389667575281e-06, - "logits/chosen": -0.37743574380874634, - "logits/chosen_friction": -0.3805026710033417, - "logits/rejected": -0.37306979298591614, - "logits/rejected_friction": -0.35938382148742676, - "logps/chosen": -2.974402666091919, - "logps/chosen_friction": -1.2349941730499268, - "logps/rejected": -4.895740509033203, - "logps/rejected_friction": -4.286673545837402, - "loss": 0.0042, - "policy_friction_nll_loss": 1.2229740619659424, - "policy_nll_loss": 2.955324649810791, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2380216121673584, - "rewards/chosen_fricton": -0.07489867508411407, - "rewards/margins": 0.1894858032464981, - "rewards/margins_friction": 0.3018236756324768, - "rewards/rejected": -0.4275074601173401, - "rewards/rejected_friction": -0.3767223060131073, - "step": 695 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.37, - "learning_rate": 4.8380766613051295e-06, - "logits/chosen": -0.3883698582649231, - "logits/chosen_friction": -0.3915315270423889, - "logits/rejected": -0.38328370451927185, - "logits/rejected_friction": -0.3678188920021057, - "logps/chosen": -2.9330086708068848, - "logps/chosen_friction": -1.2048887014389038, - "logps/rejected": -4.81064510345459, - "logps/rejected_friction": -4.230401039123535, - "loss": 0.0049, - "policy_friction_nll_loss": 1.1941994428634644, - "policy_nll_loss": 2.9026637077331543, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23364940285682678, - "rewards/chosen_fricton": -0.07307380437850952, - "rewards/margins": 0.18447421491146088, - "rewards/margins_friction": 0.2985583245754242, - "rewards/rejected": -0.4181235730648041, - "rewards/rejected_friction": -0.3716321587562561, + "epoch": 0.21, + "learning_rate": 4.49023266426411e-06, + "logits/chosen": -0.30031442642211914, + "logits/rejected": -0.3014809787273407, + "logps/chosen": -441.3443908691406, + "logps/rejected": -447.56231689453125, + "loss": 0.5213, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.0933678150177, + "rewards/margins": 0.8899556994438171, + "rewards/rejected": -2.983323574066162, "step": 700 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.38, - "learning_rate": 4.835747576569142e-06, - "logits/chosen": -0.377177894115448, - "logits/chosen_friction": -0.38250288367271423, - "logits/rejected": -0.36567431688308716, - "logits/rejected_friction": -0.3604253828525543, - "logps/chosen": -2.9018666744232178, - "logps/chosen_friction": -1.1926378011703491, - "logps/rejected": -5.0863423347473145, - "logps/rejected_friction": -4.2676777839660645, - "loss": 0.0069, - "policy_friction_nll_loss": 1.1733002662658691, - "policy_nll_loss": 2.862914562225342, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2303980141878128, - "rewards/chosen_fricton": -0.07115094363689423, - "rewards/margins": 0.21505656838417053, - "rewards/margins_friction": 0.3031698763370514, - "rewards/rejected": -0.44545459747314453, - "rewards/rejected_friction": -0.3743208050727844, - "step": 705 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.38, - "learning_rate": 4.833402429383947e-06, - "logits/chosen": -0.3827642798423767, - "logits/chosen_friction": -0.3888317942619324, - "logits/rejected": -0.3777785301208496, - "logits/rejected_friction": -0.36616572737693787, - "logps/chosen": -2.864455461502075, - "logps/chosen_friction": -1.2599244117736816, - "logps/rejected": -4.67069149017334, - "logps/rejected_friction": -4.0836286544799805, - "loss": 0.0072, - "policy_friction_nll_loss": 1.2413429021835327, - "policy_nll_loss": 2.8102269172668457, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2296711951494217, - "rewards/chosen_fricton": -0.08039157837629318, - "rewards/margins": 0.17803847789764404, - "rewards/margins_friction": 0.27910709381103516, - "rewards/rejected": -0.40770968794822693, - "rewards/rejected_friction": -0.35949867963790894, + "epoch": 0.22, + "learning_rate": 4.473726454652755e-06, + "logits/chosen": -0.2997979521751404, + "logits/rejected": -0.30115145444869995, + "logps/chosen": -440.00372314453125, + "logps/rejected": -449.6446838378906, + "loss": 0.4733, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.0547962188720703, + "rewards/margins": 1.1024844646453857, + "rewards/rejected": -3.157280445098877, "step": 710 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.38, - "learning_rate": 4.831041235876635e-06, - "logits/chosen": -0.374919593334198, - "logits/chosen_friction": -0.377795547246933, - "logits/rejected": -0.3760010898113251, - "logits/rejected_friction": -0.3532174527645111, - "logps/chosen": -2.9620213508605957, - "logps/chosen_friction": -1.2257423400878906, - "logps/rejected": -4.653872013092041, - "logps/rejected_friction": -4.342127799987793, - "loss": 0.0041, - "policy_friction_nll_loss": 1.2057602405548096, - "policy_nll_loss": 2.917423963546753, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2341981828212738, - "rewards/chosen_fricton": -0.07264856994152069, - "rewards/margins": 0.16557137668132782, - "rewards/margins_friction": 0.30715638399124146, - "rewards/rejected": -0.39976954460144043, - "rewards/rejected_friction": -0.37980496883392334, - "step": 715 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.38, - "learning_rate": 4.82866401228464e-06, - "logits/chosen": -0.37953734397888184, - "logits/chosen_friction": -0.38220322132110596, - "logits/rejected": -0.3705368638038635, - "logits/rejected_friction": -0.35651394724845886, - "logps/chosen": -3.029904842376709, - "logps/chosen_friction": -1.2264513969421387, - "logps/rejected": -4.98104190826416, - "logps/rejected_friction": -4.320191860198975, - "loss": 0.0044, - "policy_friction_nll_loss": 1.2068982124328613, - "policy_nll_loss": 2.989224433898926, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24114751815795898, - "rewards/chosen_fricton": -0.07239343971014023, - "rewards/margins": 0.19241128861904144, - "rewards/margins_friction": 0.30604439973831177, - "rewards/rejected": -0.43355879187583923, - "rewards/rejected_friction": -0.378437876701355, + "epoch": 0.22, + "learning_rate": 4.45698861949089e-06, + "logits/chosen": -0.3066961169242859, + "logits/rejected": -0.3076573610305786, + "logps/chosen": -442.42303466796875, + "logps/rejected": -448.47686767578125, + "loss": 0.5236, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1251652240753174, + "rewards/margins": 0.8965371251106262, + "rewards/rejected": -3.021702289581299, "step": 720 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.39, - "learning_rate": 4.8262707749556335e-06, - "logits/chosen": -0.38761788606643677, - "logits/chosen_friction": -0.39207693934440613, - "logits/rejected": -0.3782641291618347, - "logits/rejected_friction": -0.3692916929721832, - "logps/chosen": -2.957320213317871, - "logps/chosen_friction": -1.1906720399856567, - "logps/rejected": -4.881871700286865, - "logps/rejected_friction": -4.2223052978515625, - "loss": 0.0047, - "policy_friction_nll_loss": 1.1735365390777588, - "policy_nll_loss": 2.9160923957824707, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23678843677043915, - "rewards/chosen_fricton": -0.07215438783168793, - "rewards/margins": 0.18912889063358307, - "rewards/margins_friction": 0.29927390813827515, - "rewards/rejected": -0.4259173274040222, - "rewards/rejected_friction": -0.3714282512664795, - "step": 725 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.39, - "learning_rate": 4.823861540347411e-06, - "logits/chosen": -0.3863743245601654, - "logits/chosen_friction": -0.38958802819252014, - "logits/rejected": -0.38044387102127075, - "logits/rejected_friction": -0.3640071749687195, - "logps/chosen": -2.9965248107910156, - "logps/chosen_friction": -1.2145777940750122, - "logps/rejected": -4.80587100982666, - "logps/rejected_friction": -4.266003608703613, - "loss": 0.0032, - "policy_friction_nll_loss": 1.1922190189361572, - "policy_nll_loss": 2.9490387439727783, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23951056599617004, - "rewards/chosen_fricton": -0.07282309234142303, - "rewards/margins": 0.17809729278087616, - "rewards/margins_friction": 0.3017878532409668, - "rewards/rejected": -0.417607843875885, - "rewards/rejected_friction": -0.374610960483551, + "epoch": 0.22, + "learning_rate": 4.440021123037683e-06, + "logits/chosen": -0.29265230894088745, + "logits/rejected": -0.29371362924575806, + "logps/chosen": -441.66900634765625, + "logps/rejected": -450.8470153808594, + "loss": 0.5327, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.108212947845459, + "rewards/margins": 0.8388462066650391, + "rewards/rejected": -2.947059154510498, "step": 730 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.39, - "learning_rate": 4.8214363250277756e-06, - "logits/chosen": -0.3841674029827118, - "logits/chosen_friction": -0.38862431049346924, - "logits/rejected": -0.3742806315422058, - "logits/rejected_friction": -0.36324483156204224, - "logps/chosen": -3.0372061729431152, - "logps/chosen_friction": -1.260117769241333, - "logps/rejected": -4.978865146636963, - "logps/rejected_friction": -4.309460163116455, - "loss": 0.0042, - "policy_friction_nll_loss": 1.2322200536727905, - "policy_nll_loss": 3.0004451274871826, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2455594837665558, - "rewards/chosen_fricton": -0.07939223945140839, - "rewards/margins": 0.18982863426208496, - "rewards/margins_friction": 0.30012983083724976, - "rewards/rejected": -0.43538814783096313, - "rewards/rejected_friction": -0.37952202558517456, - "step": 735 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.39, - "learning_rate": 4.818995145674427e-06, - "logits/chosen": -0.3719593286514282, - "logits/chosen_friction": -0.37513524293899536, - "logits/rejected": -0.36784762144088745, - "logits/rejected_friction": -0.35148417949676514, - "logps/chosen": -3.0625061988830566, - "logps/chosen_friction": -1.2882226705551147, - "logps/rejected": -4.883954048156738, - "logps/rejected_friction": -4.470065116882324, - "loss": 0.0059, - "policy_friction_nll_loss": 1.2660655975341797, - "policy_nll_loss": 3.019824266433716, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24447543919086456, - "rewards/chosen_fricton": -0.07884252816438675, - "rewards/margins": 0.17864122986793518, - "rewards/margins_friction": 0.3138425946235657, - "rewards/rejected": -0.42311668395996094, - "rewards/rejected_friction": -0.3926851153373718, + "epoch": 0.22, + "learning_rate": 4.422825956504073e-06, + "logits/chosen": -0.3069104254245758, + "logits/rejected": -0.3083550035953522, + "logps/chosen": -449.7119140625, + "logps/rejected": -459.4678649902344, + "loss": 0.5117, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.1724143028259277, + "rewards/margins": 0.8879534602165222, + "rewards/rejected": -3.0603675842285156, "step": 740 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.4, - "learning_rate": 4.8165380190748476e-06, - "logits/chosen": -0.38366061449050903, - "logits/chosen_friction": -0.3865950107574463, - "logits/rejected": -0.37568187713623047, - "logits/rejected_friction": -0.36265602707862854, - "logps/chosen": -2.9605743885040283, - "logps/chosen_friction": -1.212777853012085, - "logps/rejected": -4.91793155670166, - "logps/rejected_friction": -4.381437301635742, - "loss": 0.0039, - "policy_friction_nll_loss": 1.1978734731674194, - "policy_nll_loss": 2.9262168407440186, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23807159066200256, - "rewards/chosen_fricton": -0.07432057708501816, - "rewards/margins": 0.19206008315086365, - "rewards/margins_friction": 0.3125899136066437, - "rewards/rejected": -0.4301316738128662, - "rewards/rejected_friction": -0.38691049814224243, - "step": 745 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.4, - "learning_rate": 4.814064962126184e-06, - "logits/chosen": -0.384343683719635, - "logits/chosen_friction": -0.3865780234336853, - "logits/rejected": -0.3766785264015198, - "logits/rejected_friction": -0.3640988767147064, - "logps/chosen": -2.9205217361450195, - "logps/chosen_friction": -1.2799651622772217, - "logps/rejected": -4.795446872711182, - "logps/rejected_friction": -4.264823913574219, - "loss": 0.0059, - "policy_friction_nll_loss": 1.242357611656189, - "policy_nll_loss": 2.8817074298858643, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23404595255851746, - "rewards/chosen_fricton": -0.08143292367458344, - "rewards/margins": 0.1845119744539261, - "rewards/margins_friction": 0.29488056898117065, - "rewards/rejected": -0.41855794191360474, - "rewards/rejected_friction": -0.3763135075569153, + "epoch": 0.23, + "learning_rate": 4.4054051378190915e-06, + "logits/chosen": -0.30406031012535095, + "logits/rejected": -0.30475375056266785, + "logps/chosen": -447.04022216796875, + "logps/rejected": -452.49658203125, + "loss": 0.493, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.098881483078003, + "rewards/margins": 0.9250394105911255, + "rewards/rejected": -3.023920774459839, "step": 750 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.4, - "learning_rate": 4.811575991835134e-06, - "logits/chosen": -0.3772605359554291, - "logits/chosen_friction": -0.3807472586631775, - "logits/rejected": -0.37339216470718384, - "logits/rejected_friction": -0.3607543110847473, - "logps/chosen": -2.996735095977783, - "logps/chosen_friction": -1.2020673751831055, - "logps/rejected": -4.718087673187256, - "logps/rejected_friction": -4.167320728302002, - "loss": 0.0049, - "policy_friction_nll_loss": 1.1769534349441528, - "policy_nll_loss": 2.9460506439208984, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23953549563884735, - "rewards/chosen_fricton": -0.07216795533895493, - "rewards/margins": 0.1690492331981659, - "rewards/margins_friction": 0.2925325632095337, - "rewards/rejected": -0.40858468413352966, - "rewards/rejected_friction": -0.3647005259990692, - "step": 755 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.41, - "learning_rate": 4.809071125317829e-06, - "logits/chosen": -0.3659014403820038, - "logits/chosen_friction": -0.36891117691993713, - "logits/rejected": -0.3578733801841736, - "logits/rejected_friction": -0.3435431718826294, - "logps/chosen": -3.116964340209961, - "logps/chosen_friction": -1.2749396562576294, - "logps/rejected": -5.029179096221924, - "logps/rejected_friction": -4.437417984008789, - "loss": 0.0032, - "policy_friction_nll_loss": 1.248363733291626, - "policy_nll_loss": 3.051529884338379, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2490025758743286, - "rewards/chosen_fricton": -0.07640896737575531, - "rewards/margins": 0.18768055737018585, - "rewards/margins_friction": 0.312004953622818, - "rewards/rejected": -0.43668311834335327, - "rewards/rejected_friction": -0.3884138762950897, + "epoch": 0.23, + "learning_rate": 4.387760711393052e-06, + "logits/chosen": -0.3125828206539154, + "logits/rejected": -0.3135472536087036, + "logps/chosen": -441.21337890625, + "logps/rejected": -446.6187438964844, + "loss": 0.5226, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.1020660400390625, + "rewards/margins": 0.8715343475341797, + "rewards/rejected": -2.973600387573242, "step": 760 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.41, - "learning_rate": 4.806550379799711e-06, - "logits/chosen": -0.3658471703529358, - "logits/chosen_friction": -0.37099626660346985, - "logits/rejected": -0.3559862971305847, - "logits/rejected_friction": -0.3471335172653198, - "logps/chosen": -3.0461158752441406, - "logps/chosen_friction": -1.2730337381362915, - "logps/rejected": -4.998549938201904, - "logps/rejected_friction": -4.35897159576416, - "loss": 0.0048, - "policy_friction_nll_loss": 1.237030267715454, - "policy_nll_loss": 2.9857640266418457, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2431354820728302, - "rewards/chosen_fricton": -0.07777302712202072, - "rewards/margins": 0.19099131226539612, - "rewards/margins_friction": 0.30372223258018494, - "rewards/rejected": -0.43412676453590393, - "rewards/rejected_friction": -0.38149523735046387, - "step": 765 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.41, - "learning_rate": 4.804013772615423e-06, - "logits/chosen": -0.3958445191383362, - "logits/chosen_friction": -0.4003049433231354, - "logits/rejected": -0.3888668417930603, - "logits/rejected_friction": -0.37467536330223083, - "logps/chosen": -2.932112693786621, - "logps/chosen_friction": -1.2128496170043945, - "logps/rejected": -4.720229148864746, - "logps/rejected_friction": -4.2299485206604, - "loss": 0.0041, - "policy_friction_nll_loss": 1.1959197521209717, - "policy_nll_loss": 2.8974974155426025, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23592177033424377, - "rewards/chosen_fricton": -0.07579877227544785, - "rewards/margins": 0.17575940489768982, - "rewards/margins_friction": 0.2977844774723053, - "rewards/rejected": -0.41168123483657837, - "rewards/rejected_friction": -0.37358325719833374, + "epoch": 0.23, + "learning_rate": 4.369894747877627e-06, + "logits/chosen": -0.30844077467918396, + "logits/rejected": -0.3093765676021576, + "logps/chosen": -439.68060302734375, + "logps/rejected": -447.6014709472656, + "loss": 0.4748, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.9754493236541748, + "rewards/margins": 1.023809552192688, + "rewards/rejected": -2.9992587566375732, "step": 770 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.41, - "learning_rate": 4.801461321208684e-06, - "logits/chosen": -0.3901572823524475, - "logits/chosen_friction": -0.39398032426834106, - "logits/rejected": -0.38570329546928406, - "logits/rejected_friction": -0.37088295817375183, - "logps/chosen": -2.8668065071105957, - "logps/chosen_friction": -1.197466254234314, - "logps/rejected": -4.70769739151001, - "logps/rejected_friction": -4.301355838775635, - "loss": 0.0041, - "policy_friction_nll_loss": 1.1737028360366821, - "policy_nll_loss": 2.830641746520996, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.22868947684764862, - "rewards/chosen_fricton": -0.07365847378969193, - "rewards/margins": 0.18000075221061707, - "rewards/margins_friction": 0.30544716119766235, - "rewards/rejected": -0.4086902141571045, - "rewards/rejected_friction": -0.3791056275367737, - "step": 775 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.42, - "learning_rate": 4.7988930431321695e-06, - "logits/chosen": -0.36885973811149597, - "logits/chosen_friction": -0.37414053082466125, - "logits/rejected": -0.3608701825141907, - "logits/rejected_friction": -0.3488486707210541, - "logps/chosen": -3.0646767616271973, - "logps/chosen_friction": -1.2240331172943115, - "logps/rejected": -5.0103230476379395, - "logps/rejected_friction": -4.4126715660095215, - "loss": 0.0034, - "policy_friction_nll_loss": 1.2057921886444092, - "policy_nll_loss": 3.0296311378479004, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24720671772956848, - "rewards/chosen_fricton": -0.07479321211576462, - "rewards/margins": 0.19040539860725403, - "rewards/margins_friction": 0.31395941972732544, - "rewards/rejected": -0.4376121163368225, - "rewards/rejected_friction": -0.38875263929367065, + "epoch": 0.24, + "learning_rate": 4.3518093439228484e-06, + "logits/chosen": -0.309563547372818, + "logits/rejected": -0.3104109764099121, + "logps/chosen": -442.0809631347656, + "logps/rejected": -449.5039978027344, + "loss": 0.4696, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.0284624099731445, + "rewards/margins": 0.8842188119888306, + "rewards/rejected": -2.9126813411712646, "step": 780 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.42, - "learning_rate": 4.796308956047393e-06, - "logits/chosen": -0.3953578770160675, - "logits/chosen_friction": -0.3981282413005829, - "logits/rejected": -0.3850625157356262, - "logits/rejected_friction": -0.3740263879299164, - "logps/chosen": -2.9131979942321777, - "logps/chosen_friction": -1.165722131729126, - "logps/rejected": -4.872370719909668, - "logps/rejected_friction": -4.198147773742676, - "loss": 0.0059, - "policy_friction_nll_loss": 1.1504223346710205, - "policy_nll_loss": 2.877537727355957, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23321180045604706, - "rewards/chosen_fricton": -0.07118720561265945, - "rewards/margins": 0.1919952929019928, - "rewards/margins_friction": 0.29851242899894714, - "rewards/rejected": -0.42520713806152344, - "rewards/rejected_friction": -0.3696995675563812, - "step": 785 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.42, - "learning_rate": 4.7937090777245835e-06, - "logits/chosen": -0.38956373929977417, - "logits/chosen_friction": -0.3918357789516449, - "logits/rejected": -0.38584306836128235, - "logits/rejected_friction": -0.3665474057197571, - "logps/chosen": -2.9928297996520996, - "logps/chosen_friction": -1.2204148769378662, - "logps/rejected": -4.768657207489014, - "logps/rejected_friction": -4.36868953704834, - "loss": 0.0035, - "policy_friction_nll_loss": 1.1991368532180786, - "policy_nll_loss": 2.955763578414917, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23950275778770447, - "rewards/chosen_fricton": -0.0742999017238617, - "rewards/margins": 0.17387959361076355, - "rewards/margins_friction": 0.3101205825805664, - "rewards/rejected": -0.4133823812007904, - "rewards/rejected_friction": -0.3844204545021057, + "epoch": 0.24, + "learning_rate": 4.333506621931056e-06, + "logits/chosen": -0.3095022737979889, + "logits/rejected": -0.3111112713813782, + "logps/chosen": -441.48736572265625, + "logps/rejected": -452.59466552734375, + "loss": 0.4302, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7804081439971924, + "rewards/margins": 1.1726500988006592, + "rewards/rejected": -2.9530580043792725, "step": 790 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.42, - "learning_rate": 4.79109342604256e-06, - "logits/chosen": -0.3922959864139557, - "logits/chosen_friction": -0.3976667523384094, - "logits/rejected": -0.38137924671173096, - "logits/rejected_friction": -0.3735475242137909, - "logps/chosen": -2.895329236984253, - "logps/chosen_friction": -1.137108564376831, - "logps/rejected": -4.865678787231445, - "logps/rejected_friction": -4.177489280700684, - "loss": 0.0047, - "policy_friction_nll_loss": 1.1207711696624756, - "policy_nll_loss": 2.856174945831299, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23496508598327637, - "rewards/chosen_fricton": -0.06934484094381332, - "rewards/margins": 0.19323372840881348, - "rewards/margins_friction": 0.2996440529823303, - "rewards/rejected": -0.42819881439208984, - "rewards/rejected_friction": -0.36898887157440186, - "step": 795 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.43, - "learning_rate": 4.788462018988616e-06, - "logits/chosen": -0.39632347226142883, - "logits/chosen_friction": -0.40111732482910156, - "logits/rejected": -0.3816663324832916, - "logits/rejected_friction": -0.3766455352306366, - "logps/chosen": -3.0518224239349365, - "logps/chosen_friction": -1.240099549293518, - "logps/rejected": -5.074397087097168, - "logps/rejected_friction": -4.206575870513916, - "loss": 0.0052, - "policy_friction_nll_loss": 1.2292897701263428, - "policy_nll_loss": 3.029637098312378, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2464655190706253, - "rewards/chosen_fricton": -0.07704909145832062, - "rewards/margins": 0.199875608086586, - "rewards/margins_friction": 0.2937766909599304, - "rewards/rejected": -0.4463411271572113, - "rewards/rejected_friction": -0.37082576751708984, + "epoch": 0.24, + "learning_rate": 4.3149887298078275e-06, + "logits/chosen": -0.3100133538246155, + "logits/rejected": -0.3110717535018921, + "logps/chosen": -439.8687438964844, + "logps/rejected": -447.7996520996094, + "loss": 0.4705, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.8933576345443726, + "rewards/margins": 1.0525233745574951, + "rewards/rejected": -2.9458811283111572, "step": 800 }, { - "epoch": 0.43, - "eval_directrewards_student/accuracies": 1.0, - "eval_logits/chosen": -0.41817161440849304, - "eval_logits/chosen_friction": -0.422261118888855, - "eval_logits/rejected": -0.40724387764930725, - "eval_logits/rejected_friction": -0.3938787281513214, - "eval_logps/chosen": -3.051438808441162, - "eval_logps/chosen_friction": -1.2548151016235352, - "eval_logps/rejected": -5.022398948669434, - "eval_logps/rejected_friction": -4.306788921356201, - "eval_loss": 0.0038001721259206533, - "eval_policy_friction_nll_loss": 1.2548149824142456, - "eval_policy_nll_loss": 3.051438808441162, - "eval_rewards/accuracies": 1.0, - "eval_rewards/accuracies_friction": 1.0, - "eval_rewards/chosen": -0.2452016919851303, - "eval_rewards/chosen_fricton": -0.07721278071403503, - "eval_rewards/margins": 0.19361825287342072, - "eval_rewards/margins_friction": 0.3010503053665161, - "eval_rewards/rejected": -0.43881991505622864, - "eval_rewards/rejected_friction": -0.37826308608055115, - "eval_runtime": 547.4651, - "eval_samples_per_second": 0.913, - "eval_steps_per_second": 0.457, + "epoch": 0.24, + "eval_logits/chosen": -0.3688412606716156, + "eval_logits/rejected": -0.3696078956127167, + "eval_logps/chosen": -432.4996643066406, + "eval_logps/rejected": -440.57073974609375, + "eval_loss": 0.4888974726200104, + "eval_rewards/accuracies": 0.7599999904632568, + "eval_rewards/chosen": -1.9608467817306519, + "eval_rewards/margins": 0.9983222484588623, + "eval_rewards/rejected": -2.9591689109802246, + "eval_runtime": 376.2946, + "eval_samples_per_second": 1.329, + "eval_steps_per_second": 1.329, "step": 800 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.43, - "learning_rate": 4.78581487465839e-06, - "logits/chosen": -0.37330538034439087, - "logits/chosen_friction": -0.3783525824546814, - "logits/rejected": -0.3639804720878601, - "logits/rejected_friction": -0.35337382555007935, - "logps/chosen": -3.011664390563965, - "logps/chosen_friction": -1.1883898973464966, - "logps/rejected": -4.930671691894531, - "logps/rejected_friction": -4.296717643737793, - "loss": 0.0023, - "policy_friction_nll_loss": 1.1652092933654785, - "policy_nll_loss": 2.9646334648132324, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24054725468158722, - "rewards/chosen_fricton": -0.07097410410642624, - "rewards/margins": 0.1882513463497162, - "rewards/margins_friction": 0.3060223162174225, - "rewards/rejected": -0.4287985861301422, - "rewards/rejected_friction": -0.3769964277744293, - "step": 805 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.43, - "learning_rate": 4.783152011255739e-06, - "logits/chosen": -0.3987380266189575, - "logits/chosen_friction": -0.4046962857246399, - "logits/rejected": -0.38463157415390015, - "logits/rejected_friction": -0.3798651695251465, - "logps/chosen": -2.8835768699645996, - "logps/chosen_friction": -1.1812783479690552, - "logps/rejected": -4.819699287414551, - "logps/rejected_friction": -4.062263488769531, - "loss": 0.0049, - "policy_friction_nll_loss": 1.1666669845581055, - "policy_nll_loss": 2.854402542114258, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23424987494945526, - "rewards/chosen_fricton": -0.0747491642832756, - "rewards/margins": 0.1905045509338379, - "rewards/margins_friction": 0.28451675176620483, - "rewards/rejected": -0.42475447058677673, - "rewards/rejected_friction": -0.35926589369773865, + "epoch": 0.25, + "learning_rate": 4.296257840709906e-06, + "logits/chosen": -0.3060837686061859, + "logits/rejected": -0.30729439854621887, + "logps/chosen": -443.59765625, + "logps/rejected": -454.3882751464844, + "loss": 0.4934, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0910544395446777, + "rewards/margins": 0.964927077293396, + "rewards/rejected": -3.0559818744659424, "step": 810 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.43, - "learning_rate": 4.7804734470926215e-06, - "logits/chosen": -0.3709632158279419, - "logits/chosen_friction": -0.3723723888397217, - "logits/rejected": -0.3632492125034332, - "logits/rejected_friction": -0.3476722836494446, - "logps/chosen": -3.131695508956909, - "logps/chosen_friction": -1.2873214483261108, - "logps/rejected": -4.95932149887085, - "logps/rejected_friction": -4.416628360748291, - "loss": 0.0032, - "policy_friction_nll_loss": 1.2591087818145752, - "policy_nll_loss": 3.0854384899139404, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2525792717933655, - "rewards/chosen_fricton": -0.07918061316013336, - "rewards/margins": 0.17894655466079712, - "rewards/margins_friction": 0.30812591314315796, - "rewards/rejected": -0.431525856256485, - "rewards/rejected_friction": -0.3873065114021301, - "step": 815 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.44, - "learning_rate": 4.777779200588963e-06, - "logits/chosen": -0.3605310916900635, - "logits/chosen_friction": -0.36285313963890076, - "logits/rejected": -0.35064369440078735, - "logits/rejected_friction": -0.33785122632980347, - "logps/chosen": -3.209343671798706, - "logps/chosen_friction": -1.2769298553466797, - "logps/rejected": -5.054333686828613, - "logps/rejected_friction": -4.403319358825684, - "loss": 0.005, - "policy_friction_nll_loss": 1.238539218902588, - "policy_nll_loss": 3.1390223503112793, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2610393762588501, - "rewards/chosen_fricton": -0.07972871512174606, - "rewards/margins": 0.18003277480602264, - "rewards/margins_friction": 0.30752694606781006, - "rewards/rejected": -0.44107216596603394, - "rewards/rejected_friction": -0.38725563883781433, + "epoch": 0.25, + "learning_rate": 4.277316152790177e-06, + "logits/chosen": -0.3090333938598633, + "logits/rejected": -0.3097476363182068, + "logps/chosen": -446.78564453125, + "logps/rejected": -453.74859619140625, + "loss": 0.5066, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.2516064643859863, + "rewards/margins": 0.9105945825576782, + "rewards/rejected": -3.162201404571533, "step": 820 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.44, - "learning_rate": 4.775069290272538e-06, - "logits/chosen": -0.3702056109905243, - "logits/chosen_friction": -0.37393343448638916, - "logits/rejected": -0.3602065443992615, - "logits/rejected_friction": -0.3480204641819, - "logps/chosen": -3.159524917602539, - "logps/chosen_friction": -1.2692396640777588, - "logps/rejected": -5.095942497253418, - "logps/rejected_friction": -4.434847831726074, - "loss": 0.003, - "policy_friction_nll_loss": 1.2571123838424683, - "policy_nll_loss": 3.13330340385437, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.255905419588089, - "rewards/chosen_fricton": -0.07853098213672638, - "rewards/margins": 0.18975184857845306, - "rewards/margins_friction": 0.3120003640651703, - "rewards/rejected": -0.44565725326538086, - "rewards/rejected_friction": -0.39053139090538025, - "step": 825 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.44, - "learning_rate": 4.772343734778834e-06, - "logits/chosen": -0.38169604539871216, - "logits/chosen_friction": -0.3848908543586731, - "logits/rejected": -0.37226423621177673, - "logits/rejected_friction": -0.3586348593235016, - "logps/chosen": -3.2531142234802246, - "logps/chosen_friction": -1.349050521850586, - "logps/rejected": -5.0882134437561035, - "logps/rejected_friction": -4.436912536621094, - "loss": 0.0045, - "policy_friction_nll_loss": 1.3249070644378662, - "policy_nll_loss": 3.21948504447937, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2645121216773987, - "rewards/chosen_fricton": -0.08604630827903748, - "rewards/margins": 0.18019908666610718, - "rewards/margins_friction": 0.3046807050704956, - "rewards/rejected": -0.4447111487388611, - "rewards/rejected_friction": -0.3907269835472107, + "epoch": 0.25, + "learning_rate": 4.2581658889397e-06, + "logits/chosen": -0.2983805537223816, + "logits/rejected": -0.29977601766586304, + "logps/chosen": -434.3565979003906, + "logps/rejected": -444.49542236328125, + "loss": 0.4289, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.9821250438690186, + "rewards/margins": 1.0745497941970825, + "rewards/rejected": -3.0566749572753906, "step": 830 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.45, - "learning_rate": 4.769602552850926e-06, - "logits/chosen": -0.37095361948013306, - "logits/chosen_friction": -0.3756476938724518, - "logits/rejected": -0.3626479506492615, - "logits/rejected_friction": -0.3494853675365448, - "logps/chosen": -3.117929697036743, - "logps/chosen_friction": -1.2722599506378174, - "logps/rejected": -4.9621663093566895, - "logps/rejected_friction": -4.468001365661621, - "loss": 0.0028, - "policy_friction_nll_loss": 1.2433900833129883, - "policy_nll_loss": 3.064082384109497, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25118589401245117, - "rewards/chosen_fricton": -0.07822006940841675, - "rewards/margins": 0.1815846562385559, - "rewards/margins_friction": 0.3157891035079956, - "rewards/rejected": -0.43277058005332947, - "rewards/rejected_friction": -0.39400920271873474, - "step": 835 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.45, - "learning_rate": 4.766845763339353e-06, - "logits/chosen": -0.37238067388534546, - "logits/chosen_friction": -0.375211626291275, - "logits/rejected": -0.3669355809688568, - "logits/rejected_friction": -0.3503592014312744, - "logps/chosen": -3.0723278522491455, - "logps/chosen_friction": -1.243667483329773, - "logps/rejected": -4.912738800048828, - "logps/rejected_friction": -4.464780330657959, - "loss": 0.0025, - "policy_friction_nll_loss": 1.222880244255066, - "policy_nll_loss": 3.0323498249053955, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.246707484126091, - "rewards/chosen_fricton": -0.0764845460653305, - "rewards/margins": 0.18053199350833893, - "rewards/margins_friction": 0.3176756501197815, - "rewards/rejected": -0.4272395074367523, - "rewards/rejected_friction": -0.39416012167930603, + "epoch": 0.26, + "learning_rate": 4.238809296526847e-06, + "logits/chosen": -0.30951178073883057, + "logits/rejected": -0.31038326025009155, + "logps/chosen": -453.48419189453125, + "logps/rejected": -461.54833984375, + "loss": 0.523, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.3027613162994385, + "rewards/margins": 0.8140772581100464, + "rewards/rejected": -3.1168384552001953, "step": 840 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.45, - "learning_rate": 4.76407338520198e-06, - "logits/chosen": -0.3742605149745941, - "logits/chosen_friction": -0.3803965449333191, - "logits/rejected": -0.36571890115737915, - "logits/rejected_friction": -0.3535844385623932, - "logps/chosen": -3.1570074558258057, - "logps/chosen_friction": -1.3173205852508545, - "logps/rejected": -4.933920383453369, - "logps/rejected_friction": -4.444101333618164, - "loss": 0.0037, - "policy_friction_nll_loss": 1.301795244216919, - "policy_nll_loss": 3.131974697113037, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25552043318748474, - "rewards/chosen_fricton": -0.08243141323328018, - "rewards/margins": 0.1746748387813568, - "rewards/margins_friction": 0.3092886805534363, - "rewards/rejected": -0.43019527196884155, - "rewards/rejected_friction": -0.39172008633613586, - "step": 845 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.45, - "learning_rate": 4.761285437503875e-06, - "logits/chosen": -0.3646545112133026, - "logits/chosen_friction": -0.3702804446220398, - "logits/rejected": -0.3529558777809143, - "logits/rejected_friction": -0.3445180654525757, - "logps/chosen": -3.0766944885253906, - "logps/chosen_friction": -1.234919548034668, - "logps/rejected": -5.0553154945373535, - "logps/rejected_friction": -4.377143383026123, - "loss": 0.0035, - "policy_friction_nll_loss": 1.2055364847183228, - "policy_nll_loss": 3.007896900177002, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2485547512769699, - "rewards/chosen_fricton": -0.07574470341205597, - "rewards/margins": 0.19493496417999268, - "rewards/margins_friction": 0.31118863821029663, - "rewards/rejected": -0.4434897005558014, - "rewards/rejected_friction": -0.3869333863258362, + "epoch": 0.26, + "learning_rate": 4.219248647133559e-06, + "logits/chosen": -0.3112717568874359, + "logits/rejected": -0.3124113082885742, + "logps/chosen": -437.2984313964844, + "logps/rejected": -447.7862243652344, + "loss": 0.4619, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.230332612991333, + "rewards/margins": 1.0623276233673096, + "rewards/rejected": -3.2926604747772217, "step": 850 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.46, - "learning_rate": 4.758481939417173e-06, - "logits/chosen": -0.38000568747520447, - "logits/chosen_friction": -0.3845260739326477, - "logits/rejected": -0.3692302107810974, - "logits/rejected_friction": -0.3569611608982086, - "logps/chosen": -3.110448122024536, - "logps/chosen_friction": -1.2250968217849731, - "logps/rejected": -5.091402053833008, - "logps/rejected_friction": -4.398796081542969, - "loss": 0.0033, - "policy_friction_nll_loss": 1.210333228111267, - "policy_nll_loss": 3.0683255195617676, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25227051973342896, - "rewards/chosen_fricton": -0.07514631003141403, - "rewards/margins": 0.19438929855823517, - "rewards/margins_friction": 0.31305253505706787, - "rewards/rejected": -0.44665980339050293, - "rewards/rejected_friction": -0.3881988525390625, - "step": 855 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.46, - "learning_rate": 4.755662910220945e-06, - "logits/chosen": -0.38142889738082886, - "logits/chosen_friction": -0.3829438388347626, - "logits/rejected": -0.3769550919532776, - "logits/rejected_friction": -0.3569730222225189, - "logps/chosen": -3.130767583847046, - "logps/chosen_friction": -1.2956466674804688, - "logps/rejected": -4.77437162399292, - "logps/rejected_friction": -4.445256233215332, - "loss": 0.0029, - "policy_friction_nll_loss": 1.2803198099136353, - "policy_nll_loss": 3.100527286529541, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.251865416765213, - "rewards/chosen_fricton": -0.08078990876674652, - "rewards/margins": 0.16131436824798584, - "rewards/margins_friction": 0.31116002798080444, - "rewards/rejected": -0.41317978501319885, - "rewards/rejected_friction": -0.3919498920440674, + "epoch": 0.26, + "learning_rate": 4.19948623628877e-06, + "logits/chosen": -0.3127744495868683, + "logits/rejected": -0.31366902589797974, + "logps/chosen": -451.15966796875, + "logps/rejected": -458.08154296875, + "loss": 0.5186, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.2668843269348145, + "rewards/margins": 0.8956031799316406, + "rewards/rejected": -3.162487268447876, "step": 860 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.46, - "learning_rate": 4.752828369301069e-06, - "logits/chosen": -0.37123069167137146, - "logits/chosen_friction": -0.37709981203079224, - "logits/rejected": -0.36522412300109863, - "logits/rejected_friction": -0.35086554288864136, - "logps/chosen": -3.2150135040283203, - "logps/chosen_friction": -1.3081588745117188, - "logps/rejected": -5.090475559234619, - "logps/rejected_friction": -4.558204174041748, - "loss": 0.0034, - "policy_friction_nll_loss": 1.2816603183746338, - "policy_nll_loss": 3.1760566234588623, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.26027074456214905, - "rewards/chosen_fricton": -0.08184390515089035, - "rewards/margins": 0.18389835953712463, - "rewards/margins_friction": 0.32009556889533997, - "rewards/rejected": -0.4441691040992737, - "rewards/rejected_friction": -0.4019394814968109, - "step": 865 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.46, - "learning_rate": 4.74997833615009e-06, - "logits/chosen": -0.38410595059394836, - "logits/chosen_friction": -0.3908304274082184, - "logits/rejected": -0.3698596954345703, - "logits/rejected_friction": -0.3648473620414734, - "logps/chosen": -3.1172146797180176, - "logps/chosen_friction": -1.242893934249878, - "logps/rejected": -5.0778303146362305, - "logps/rejected_friction": -4.290308952331543, - "loss": 0.0025, - "policy_friction_nll_loss": 1.2302000522613525, - "policy_nll_loss": 3.0928916931152344, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2528523802757263, - "rewards/chosen_fricton": -0.07767560333013535, - "rewards/margins": 0.19359707832336426, - "rewards/margins_friction": 0.3013288974761963, - "rewards/rejected": -0.4464494287967682, - "rewards/rejected_friction": -0.37900450825691223, + "epoch": 0.26, + "learning_rate": 4.179524383199016e-06, + "logits/chosen": -0.30885085463523865, + "logits/rejected": -0.3100178837776184, + "logps/chosen": -445.05670166015625, + "logps/rejected": -453.55328369140625, + "loss": 0.4533, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.1777684688568115, + "rewards/margins": 1.1419053077697754, + "rewards/rejected": -3.319674253463745, "step": 870 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.47, - "learning_rate": 4.747112830367093e-06, - "logits/chosen": -0.37253063917160034, - "logits/chosen_friction": -0.37703707814216614, - "logits/rejected": -0.36333316564559937, - "logits/rejected_friction": -0.3513241708278656, - "logps/chosen": -3.2721476554870605, - "logps/chosen_friction": -1.284793496131897, - "logps/rejected": -5.15109395980835, - "logps/rejected_friction": -4.5078277587890625, - "loss": 0.0031, - "policy_friction_nll_loss": 1.272761583328247, - "policy_nll_loss": 3.240360975265503, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.26576346158981323, - "rewards/chosen_fricton": -0.07949931919574738, - "rewards/margins": 0.184799924492836, - "rewards/margins_friction": 0.31834104657173157, - "rewards/rejected": -0.45056337118148804, - "rewards/rejected_friction": -0.39784038066864014, - "step": 875 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.47, - "learning_rate": 4.744231871657563e-06, - "logits/chosen": -0.3745035231113434, - "logits/chosen_friction": -0.3787258267402649, - "logits/rejected": -0.36864176392555237, - "logits/rejected_friction": -0.3532297611236572, - "logps/chosen": -3.2208569049835205, - "logps/chosen_friction": -1.291297197341919, - "logps/rejected": -4.995157718658447, - "logps/rejected_friction": -4.528962135314941, - "loss": 0.0029, - "policy_friction_nll_loss": 1.2729023694992065, - "policy_nll_loss": 3.1954948902130127, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25991129875183105, - "rewards/chosen_fricton": -0.0788605660200119, - "rewards/margins": 0.17442360520362854, - "rewards/margins_friction": 0.31998690962791443, - "rewards/rejected": -0.434334933757782, - "rewards/rejected_friction": -0.39884746074676514, + "epoch": 0.27, + "learning_rate": 4.159365430476262e-06, + "logits/chosen": -0.30774661898612976, + "logits/rejected": -0.3091534674167633, + "logps/chosen": -445.9901428222656, + "logps/rejected": -453.9535217285156, + "loss": 0.4711, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2285873889923096, + "rewards/margins": 1.0858628749847412, + "rewards/rejected": -3.31445050239563, "step": 880 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.47, - "learning_rate": 4.741335479833254e-06, - "logits/chosen": -0.3850293755531311, - "logits/chosen_friction": -0.388772189617157, - "logits/rejected": -0.3782188296318054, - "logits/rejected_friction": -0.36333826184272766, - "logps/chosen": -3.1232738494873047, - "logps/chosen_friction": -1.2634518146514893, - "logps/rejected": -4.8632659912109375, - "logps/rejected_friction": -4.413657188415527, - "loss": 0.0028, - "policy_friction_nll_loss": 1.2482755184173584, - "policy_nll_loss": 3.0993857383728027, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2517077922821045, - "rewards/chosen_fricton": -0.07810337841510773, - "rewards/margins": 0.17090348899364471, - "rewards/margins_friction": 0.3112645149230957, - "rewards/rejected": -0.4226113259792328, - "rewards/rejected_friction": -0.38936784863471985, - "step": 885 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.47, - "learning_rate": 4.738423674812048e-06, - "logits/chosen": -0.37982380390167236, - "logits/chosen_friction": -0.3837817311286926, - "logits/rejected": -0.3695020079612732, - "logits/rejected_friction": -0.3589261472225189, - "logps/chosen": -2.9654364585876465, - "logps/chosen_friction": -1.2220933437347412, - "logps/rejected": -4.891234397888184, - "logps/rejected_friction": -4.309747695922852, - "loss": 0.0053, - "policy_friction_nll_loss": 1.2017436027526855, - "policy_nll_loss": 2.9296631813049316, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23820164799690247, - "rewards/chosen_fricton": -0.07540035992860794, - "rewards/margins": 0.1896674782037735, - "rewards/margins_friction": 0.30519628524780273, - "rewards/rejected": -0.42786914110183716, - "rewards/rejected_friction": -0.38059666752815247, + "epoch": 0.27, + "learning_rate": 4.139011743862991e-06, + "logits/chosen": -0.31220975518226624, + "logits/rejected": -0.31295710802078247, + "logps/chosen": -437.8184509277344, + "logps/rejected": -450.45611572265625, + "loss": 0.4411, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.122331142425537, + "rewards/margins": 1.2842220067977905, + "rewards/rejected": -3.406553268432617, "step": 890 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.48, - "learning_rate": 4.735496476617821e-06, - "logits/chosen": -0.37700599431991577, - "logits/chosen_friction": -0.38310706615448, - "logits/rejected": -0.3635532259941101, - "logits/rejected_friction": -0.35641413927078247, - "logps/chosen": -3.0571391582489014, - "logps/chosen_friction": -1.203147530555725, - "logps/rejected": -5.030953884124756, - "logps/rejected_friction": -4.370713233947754, - "loss": 0.0021, - "policy_friction_nll_loss": 1.1810581684112549, - "policy_nll_loss": 3.018392562866211, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2465175837278366, - "rewards/chosen_fricton": -0.07265119254589081, - "rewards/margins": 0.19503185153007507, - "rewards/margins_friction": 0.3138144612312317, - "rewards/rejected": -0.4415494501590729, - "rewards/rejected_friction": -0.3864656388759613, - "step": 895 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.48, - "learning_rate": 4.732553905380305e-06, - "logits/chosen": -0.37222373485565186, - "logits/chosen_friction": -0.37809017300605774, - "logits/rejected": -0.36407554149627686, - "logits/rejected_friction": -0.3499669134616852, - "logps/chosen": -3.2026124000549316, - "logps/chosen_friction": -1.2577171325683594, - "logps/rejected": -5.0561065673828125, - "logps/rejected_friction": -4.48679256439209, - "loss": 0.0027, - "policy_friction_nll_loss": 1.2409099340438843, - "policy_nll_loss": 3.1699025630950928, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25716686248779297, - "rewards/chosen_fricton": -0.07539502531290054, - "rewards/margins": 0.18177616596221924, - "rewards/margins_friction": 0.31847840547561646, - "rewards/rejected": -0.4389430582523346, - "rewards/rejected_friction": -0.393873393535614, + "epoch": 0.27, + "learning_rate": 4.11846571195457e-06, + "logits/chosen": -0.30749282240867615, + "logits/rejected": -0.3092586398124695, + "logps/chosen": -445.489013671875, + "logps/rejected": -456.45361328125, + "loss": 0.4331, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.122631788253784, + "rewards/margins": 1.2757575511932373, + "rewards/rejected": -3.3983893394470215, "step": 900 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.48, - "learning_rate": 4.72959598133495e-06, - "logits/chosen": -0.36829525232315063, - "logits/chosen_friction": -0.3736148774623871, - "logits/rejected": -0.3586757481098175, - "logits/rejected_friction": -0.348945677280426, - "logps/chosen": -2.928995370864868, - "logps/chosen_friction": -1.1963233947753906, - "logps/rejected": -4.704957962036133, - "logps/rejected_friction": -4.249531269073486, - "loss": 0.0033, - "policy_friction_nll_loss": 1.1705007553100586, - "policy_nll_loss": 2.871469259262085, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23445963859558105, - "rewards/chosen_fricton": -0.0733371451497078, - "rewards/margins": 0.1747424155473709, - "rewards/margins_friction": 0.3013378977775574, - "rewards/rejected": -0.40920203924179077, - "rewards/rejected_friction": -0.37467503547668457, - "step": 905 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.49, - "learning_rate": 4.726622724822781e-06, - "logits/chosen": -0.37969183921813965, - "logits/chosen_friction": -0.3850950300693512, - "logits/rejected": -0.3660884499549866, - "logits/rejected_friction": -0.3581054210662842, - "logps/chosen": -3.1021275520324707, - "logps/chosen_friction": -1.1778042316436768, - "logps/rejected": -4.997735023498535, - "logps/rejected_friction": -4.318004608154297, - "loss": 0.0032, - "policy_friction_nll_loss": 1.1682085990905762, - "policy_nll_loss": 3.081631898880005, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2515637278556824, - "rewards/chosen_fricton": -0.07080941647291183, - "rewards/margins": 0.18619711697101593, - "rewards/margins_friction": 0.3099055588245392, - "rewards/rejected": -0.4377608299255371, - "rewards/rejected_friction": -0.38071495294570923, + "epoch": 0.28, + "learning_rate": 4.0977297459189405e-06, + "logits/chosen": -0.31161195039749146, + "logits/rejected": -0.3124944865703583, + "logps/chosen": -448.9032287597656, + "logps/rejected": -456.729248046875, + "loss": 0.4549, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.3067939281463623, + "rewards/margins": 1.165810227394104, + "rewards/rejected": -3.472604274749756, "step": 910 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.49, - "learning_rate": 4.723634156290265e-06, - "logits/chosen": -0.37474170327186584, - "logits/chosen_friction": -0.3808595538139343, - "logits/rejected": -0.35845497250556946, - "logits/rejected_friction": -0.35341668128967285, - "logps/chosen": -3.0669007301330566, - "logps/chosen_friction": -1.2114055156707764, - "logps/rejected": -5.010428428649902, - "logps/rejected_friction": -4.353379726409912, - "loss": 0.0044, - "policy_friction_nll_loss": 1.187885046005249, - "policy_nll_loss": 3.0254178047180176, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24718399345874786, - "rewards/chosen_fricton": -0.07341562211513519, - "rewards/margins": 0.1906377375125885, - "rewards/margins_friction": 0.3096485733985901, - "rewards/rejected": -0.43782171607017517, - "rewards/rejected_friction": -0.3830642104148865, - "step": 915 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.49, - "learning_rate": 4.720630296289165e-06, - "logits/chosen": -0.3719572126865387, - "logits/chosen_friction": -0.3800339698791504, - "logits/rejected": -0.35617193579673767, - "logits/rejected_friction": -0.35364288091659546, - "logps/chosen": -3.0318493843078613, - "logps/chosen_friction": -1.1815329790115356, - "logps/rejected": -4.978213310241699, - "logps/rejected_friction": -4.236151218414307, - "loss": 0.0038, - "policy_friction_nll_loss": 1.1633524894714355, - "policy_nll_loss": 2.9806289672851562, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2458246648311615, - "rewards/chosen_fricton": -0.0723811537027359, - "rewards/margins": 0.19131243228912354, - "rewards/margins_friction": 0.3013635277748108, - "rewards/rejected": -0.43713706731796265, - "rewards/rejected_friction": -0.3737446665763855, + "epoch": 0.28, + "learning_rate": 4.076806279213656e-06, + "logits/chosen": -0.311604380607605, + "logits/rejected": -0.312518447637558, + "logps/chosen": -438.07916259765625, + "logps/rejected": -450.47381591796875, + "loss": 0.4232, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.2351415157318115, + "rewards/margins": 1.1904911994934082, + "rewards/rejected": -3.425632953643799, "step": 920 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.49, - "learning_rate": 4.717611165476399e-06, - "logits/chosen": -0.36159271001815796, - "logits/chosen_friction": -0.36791855096817017, - "logits/rejected": -0.35091111063957214, - "logits/rejected_friction": -0.33992961049079895, - "logps/chosen": -3.1395275592803955, - "logps/chosen_friction": -1.2759071588516235, - "logps/rejected": -4.867236137390137, - "logps/rejected_friction": -4.411207675933838, - "loss": 0.0035, - "policy_friction_nll_loss": 1.2580757141113281, - "policy_nll_loss": 3.0958292484283447, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25319287180900574, - "rewards/chosen_fricton": -0.07833616435527802, - "rewards/margins": 0.1686895787715912, - "rewards/margins_friction": 0.3087218701839447, - "rewards/rejected": -0.4218824505805969, - "rewards/rejected_friction": -0.3870580494403839, - "step": 925 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.5, - "learning_rate": 4.7145767846139e-06, - "logits/chosen": -0.35186392068862915, - "logits/chosen_friction": -0.3564409613609314, - "logits/rejected": -0.3418453633785248, - "logits/rejected_friction": -0.33104053139686584, - "logps/chosen": -3.093787670135498, - "logps/chosen_friction": -1.2066270112991333, - "logps/rejected": -4.813014984130859, - "logps/rejected_friction": -4.330706596374512, - "loss": 0.0039, - "policy_friction_nll_loss": 1.1704752445220947, - "policy_nll_loss": 3.0231239795684814, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25042179226875305, - "rewards/chosen_fricton": -0.07274355739355087, - "rewards/margins": 0.16883182525634766, - "rewards/margins_friction": 0.3088993728160858, - "rewards/rejected": -0.4192535877227783, - "rewards/rejected_friction": -0.3816429078578949, + "epoch": 0.28, + "learning_rate": 4.055697767300302e-06, + "logits/chosen": -0.3170091211795807, + "logits/rejected": -0.31755563616752625, + "logps/chosen": -442.83758544921875, + "logps/rejected": -450.9444885253906, + "loss": 0.5088, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.335662841796875, + "rewards/margins": 1.0687111616134644, + "rewards/rejected": -3.40437388420105, "step": 930 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.5, - "learning_rate": 4.7115271745684745e-06, - "logits/chosen": -0.3680647909641266, - "logits/chosen_friction": -0.37428194284439087, - "logits/rejected": -0.3591741621494293, - "logits/rejected_friction": -0.3462345600128174, - "logps/chosen": -3.0251574516296387, - "logps/chosen_friction": -1.2108650207519531, - "logps/rejected": -4.855663299560547, - "logps/rejected_friction": -4.3857903480529785, - "loss": 0.004, - "policy_friction_nll_loss": 1.1915853023529053, - "policy_nll_loss": 2.9857101440429688, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24400432407855988, - "rewards/chosen_fricton": -0.07381459325551987, - "rewards/margins": 0.179687961935997, - "rewards/margins_friction": 0.31364721059799194, - "rewards/rejected": -0.4236923158168793, - "rewards/rejected_friction": -0.3874618411064148, - "step": 935 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.5, - "learning_rate": 4.708462356311652e-06, - "logits/chosen": -0.37373116612434387, - "logits/chosen_friction": -0.3798629641532898, - "logits/rejected": -0.36289387941360474, - "logits/rejected_friction": -0.3515613079071045, - "logps/chosen": -3.158050298690796, - "logps/chosen_friction": -1.2454431056976318, - "logps/rejected": -5.052344799041748, - "logps/rejected_friction": -4.528155326843262, - "loss": 0.0029, - "policy_friction_nll_loss": 1.2328623533248901, - "policy_nll_loss": 3.1268675327301025, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2561054229736328, - "rewards/chosen_fricton": -0.07649224996566772, - "rewards/margins": 0.18684165179729462, - "rewards/margins_friction": 0.32505181431770325, - "rewards/rejected": -0.44294705986976624, - "rewards/rejected_friction": -0.4015440344810486, + "epoch": 0.29, + "learning_rate": 4.034406687356344e-06, + "logits/chosen": -0.3176030218601227, + "logits/rejected": -0.31867748498916626, + "logps/chosen": -438.16229248046875, + "logps/rejected": -446.01806640625, + "loss": 0.5146, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.501446008682251, + "rewards/margins": 0.903441309928894, + "rewards/rejected": -3.4048874378204346, "step": 940 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.5, - "learning_rate": 4.705382350919553e-06, - "logits/chosen": -0.3695526421070099, - "logits/chosen_friction": -0.3734152019023895, - "logits/rejected": -0.36031022667884827, - "logits/rejected_friction": -0.3465050458908081, - "logps/chosen": -2.9277586936950684, - "logps/chosen_friction": -1.1957721710205078, - "logps/rejected": -4.7566375732421875, - "logps/rejected_friction": -4.4269256591796875, - "loss": 0.003, - "policy_friction_nll_loss": 1.1668603420257568, - "policy_nll_loss": 2.8835577964782715, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23465684056282043, - "rewards/chosen_fricton": -0.07236681133508682, - "rewards/margins": 0.18011906743049622, - "rewards/margins_friction": 0.3193545341491699, - "rewards/rejected": -0.4147758483886719, - "rewards/rejected_friction": -0.39172133803367615, - "step": 945 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.51, - "learning_rate": 4.70228717957273e-06, - "logits/chosen": -0.3685542047023773, - "logits/chosen_friction": -0.3719711899757385, - "logits/rejected": -0.36283940076828003, - "logits/rejected_friction": -0.3433905243873596, - "logps/chosen": -3.091089963912964, - "logps/chosen_friction": -1.253240942955017, - "logps/rejected": -4.688680171966553, - "logps/rejected_friction": -4.492508888244629, - "loss": 0.0034, - "policy_friction_nll_loss": 1.2310667037963867, - "policy_nll_loss": 3.0489625930786133, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24868431687355042, - "rewards/chosen_fricton": -0.07597470283508301, - "rewards/margins": 0.1559428572654724, - "rewards/margins_friction": 0.31957897543907166, - "rewards/rejected": -0.40462714433670044, - "rewards/rejected_friction": -0.39555373787879944, + "epoch": 0.29, + "learning_rate": 4.012935537984414e-06, + "logits/chosen": -0.31417202949523926, + "logits/rejected": -0.3148192763328552, + "logps/chosen": -435.503173828125, + "logps/rejected": -444.60400390625, + "loss": 0.5049, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.4111835956573486, + "rewards/margins": 0.8773403167724609, + "rewards/rejected": -3.2885234355926514, "step": 950 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.51, - "learning_rate": 4.699176863556031e-06, - "logits/chosen": -0.37517625093460083, - "logits/chosen_friction": -0.38226670026779175, - "logits/rejected": -0.36184418201446533, - "logits/rejected_friction": -0.35245680809020996, - "logps/chosen": -3.1055166721343994, - "logps/chosen_friction": -1.2122324705123901, - "logps/rejected": -4.989890098571777, - "logps/rejected_friction": -4.490630149841309, - "loss": 0.0018, - "policy_friction_nll_loss": 1.1980751752853394, - "policy_nll_loss": 3.076606035232544, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25098809599876404, - "rewards/chosen_fricton": -0.07364396750926971, - "rewards/margins": 0.1852022111415863, - "rewards/margins_friction": 0.32400473952293396, - "rewards/rejected": -0.4361903667449951, - "rewards/rejected_friction": -0.3976486921310425, - "step": 955 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.51, - "learning_rate": 4.696051424258451e-06, - "logits/chosen": -0.3645961582660675, - "logits/chosen_friction": -0.3680470585823059, - "logits/rejected": -0.3529554605484009, - "logits/rejected_friction": -0.33757421374320984, - "logps/chosen": -3.3926219940185547, - "logps/chosen_friction": -1.3659181594848633, - "logps/rejected": -5.141293525695801, - "logps/rejected_friction": -4.717015266418457, - "loss": 0.0023, - "policy_friction_nll_loss": 1.352970838546753, - "policy_nll_loss": 3.3713417053222656, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2730044722557068, - "rewards/chosen_fricton": -0.08194348961114883, - "rewards/margins": 0.17166946828365326, - "rewards/margins_friction": 0.3315955102443695, - "rewards/rejected": -0.44467395544052124, - "rewards/rejected_friction": -0.41353899240493774, + "epoch": 0.29, + "learning_rate": 3.991286838919086e-06, + "logits/chosen": -0.30995315313339233, + "logits/rejected": -0.31148335337638855, + "logps/chosen": -440.8172912597656, + "logps/rejected": -452.94268798828125, + "loss": 0.4584, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.2748212814331055, + "rewards/margins": 1.0933376550674438, + "rewards/rejected": -3.3681588172912598, "step": 960 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.51, - "learning_rate": 4.6929108831729855e-06, - "logits/chosen": -0.39045339822769165, - "logits/chosen_friction": -0.3964537978172302, - "logits/rejected": -0.3760530352592468, - "logits/rejected_friction": -0.3692043423652649, - "logps/chosen": -3.0272793769836426, - "logps/chosen_friction": -1.2239001989364624, - "logps/rejected": -4.885192394256592, - "logps/rejected_friction": -4.3622002601623535, - "loss": 0.0026, - "policy_friction_nll_loss": 1.2097351551055908, - "policy_nll_loss": 2.9998161792755127, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24493379890918732, - "rewards/chosen_fricton": -0.07599469274282455, - "rewards/margins": 0.18230721354484558, - "rewards/margins_friction": 0.30957695841789246, - "rewards/rejected": -0.4272409975528717, - "rewards/rejected_friction": -0.3855716586112976, - "step": 965 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.52, - "learning_rate": 4.68975526189648e-06, - "logits/chosen": -0.3691980242729187, - "logits/chosen_friction": -0.3765380382537842, - "logits/rejected": -0.3550460636615753, - "logits/rejected_friction": -0.3497599959373474, - "logps/chosen": -3.0006136894226074, - "logps/chosen_friction": -1.1877535581588745, - "logps/rejected": -4.825511932373047, - "logps/rejected_friction": -4.291228294372559, - "loss": 0.0029, - "policy_friction_nll_loss": 1.1641912460327148, - "policy_nll_loss": 2.946385622024536, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24273400008678436, - "rewards/chosen_fricton": -0.07166536152362823, - "rewards/margins": 0.17898327112197876, - "rewards/margins_friction": 0.30652326345443726, - "rewards/rejected": -0.42171725630760193, - "rewards/rejected_friction": -0.3781886398792267, + "epoch": 0.29, + "learning_rate": 3.969463130731183e-06, + "logits/chosen": -0.3108167052268982, + "logits/rejected": -0.31207841634750366, + "logps/chosen": -443.76409912109375, + "logps/rejected": -456.50286865234375, + "loss": 0.4063, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.237427234649658, + "rewards/margins": 1.2730433940887451, + "rewards/rejected": -3.5104706287384033, "step": 970 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.52, - "learning_rate": 4.6865845821294854e-06, - "logits/chosen": -0.3875858187675476, - "logits/chosen_friction": -0.39462655782699585, - "logits/rejected": -0.37028393149375916, - "logits/rejected_friction": -0.36725321412086487, - "logps/chosen": -3.0342907905578613, - "logps/chosen_friction": -1.1321302652359009, - "logps/rejected": -5.032505989074707, - "logps/rejected_friction": -4.314020156860352, - "loss": 0.0033, - "policy_friction_nll_loss": 1.1165480613708496, - "policy_nll_loss": 3.00178599357605, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24586519598960876, - "rewards/chosen_fricton": -0.06784720718860626, - "rewards/margins": 0.1969476044178009, - "rewards/margins_friction": 0.3144725263118744, - "rewards/rejected": -0.44281283020973206, - "rewards/rejected_friction": -0.38231974840164185, - "step": 975 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.52, - "learning_rate": 4.683398865676105e-06, - "logits/chosen": -0.37378427386283875, - "logits/chosen_friction": -0.37451300024986267, - "logits/rejected": -0.3693489134311676, - "logits/rejected_friction": -0.34673863649368286, - "logps/chosen": -3.0839810371398926, - "logps/chosen_friction": -1.2718274593353271, - "logps/rejected": -4.666837215423584, - "logps/rejected_friction": -4.5508222579956055, - "loss": 0.0037, - "policy_friction_nll_loss": 1.2586541175842285, - "policy_nll_loss": 3.058352470397949, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24626879394054413, - "rewards/chosen_fricton": -0.07684121280908585, - "rewards/margins": 0.1544613540172577, - "rewards/margins_friction": 0.32353124022483826, - "rewards/rejected": -0.4007301330566406, - "rewards/rejected_friction": -0.4003724157810211, + "epoch": 0.3, + "learning_rate": 3.947466974529622e-06, + "logits/chosen": -0.3074961304664612, + "logits/rejected": -0.30913347005844116, + "logps/chosen": -451.47320556640625, + "logps/rejected": -461.1958923339844, + "loss": 0.4688, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.525216817855835, + "rewards/margins": 1.2734287977218628, + "rewards/rejected": -3.798645496368408, "step": 980 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.53, - "learning_rate": 4.680198134443845e-06, - "logits/chosen": -0.3780021369457245, - "logits/chosen_friction": -0.3840213418006897, - "logits/rejected": -0.36253467202186584, - "logits/rejected_friction": -0.35874563455581665, - "logps/chosen": -2.90437912940979, - "logps/chosen_friction": -1.1432725191116333, - "logps/rejected": -4.819657802581787, - "logps/rejected_friction": -4.2755351066589355, - "loss": 0.0044, - "policy_friction_nll_loss": 1.1253334283828735, - "policy_nll_loss": 2.865833282470703, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23346486687660217, - "rewards/chosen_fricton": -0.06937471032142639, - "rewards/margins": 0.18823638558387756, - "rewards/margins_friction": 0.3090466260910034, - "rewards/rejected": -0.42170119285583496, - "rewards/rejected_friction": -0.3784213662147522, - "step": 985 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.53, - "learning_rate": 4.676982410443469e-06, - "logits/chosen": -0.3780950605869293, - "logits/chosen_friction": -0.383195698261261, - "logits/rejected": -0.3636656105518341, - "logits/rejected_friction": -0.35555005073547363, - "logps/chosen": -2.9947571754455566, - "logps/chosen_friction": -1.1640506982803345, - "logps/rejected": -4.844972133636475, - "logps/rejected_friction": -4.366303443908691, - "loss": 0.0052, - "policy_friction_nll_loss": 1.1442415714263916, - "policy_nll_loss": 2.952759027481079, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24033291637897491, - "rewards/chosen_fricton": -0.06870372593402863, - "rewards/margins": 0.18184246122837067, - "rewards/margins_friction": 0.3162197470664978, - "rewards/rejected": -0.4221753478050232, - "rewards/rejected_friction": -0.3849234879016876, + "epoch": 0.3, + "learning_rate": 3.925300951660859e-06, + "logits/chosen": -0.3098825216293335, + "logits/rejected": -0.3106127381324768, + "logps/chosen": -449.3988342285156, + "logps/rejected": -455.8897399902344, + "loss": 0.4974, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.6619412899017334, + "rewards/margins": 1.0037035942077637, + "rewards/rejected": -3.665644884109497, "step": 990 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.53, - "learning_rate": 4.673751715788841e-06, - "logits/chosen": -0.3741362392902374, - "logits/chosen_friction": -0.3777056336402893, - "logits/rejected": -0.3637619614601135, - "logits/rejected_friction": -0.3518379032611847, - "logps/chosen": -3.132763385772705, - "logps/chosen_friction": -1.2316515445709229, - "logps/rejected": -4.781726360321045, - "logps/rejected_friction": -4.359641075134277, - "loss": 0.0042, - "policy_friction_nll_loss": 1.2234172821044922, - "policy_nll_loss": 3.105370044708252, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2530985176563263, - "rewards/chosen_fricton": -0.07464580982923508, - "rewards/margins": 0.16216585040092468, - "rewards/margins_friction": 0.3089084029197693, - "rewards/rejected": -0.4152643084526062, - "rewards/rejected_friction": -0.38355422019958496, - "step": 995 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.53, - "learning_rate": 4.670506072696773e-06, - "logits/chosen": -0.371518075466156, - "logits/chosen_friction": -0.3795081377029419, - "logits/rejected": -0.35420235991477966, - "logits/rejected_friction": -0.3524012267589569, - "logps/chosen": -3.1237683296203613, - "logps/chosen_friction": -1.2014914751052856, - "logps/rejected": -5.166347503662109, - "logps/rejected_friction": -4.467172622680664, - "loss": 0.0029, - "policy_friction_nll_loss": 1.1922687292099, - "policy_nll_loss": 3.081488609313965, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25400134921073914, - "rewards/chosen_fricton": -0.07356332242488861, - "rewards/margins": 0.20087292790412903, - "rewards/margins_friction": 0.3222697377204895, - "rewards/rejected": -0.45487427711486816, - "rewards/rejected_friction": -0.3958330750465393, + "epoch": 0.3, + "learning_rate": 3.9029676634059565e-06, + "logits/chosen": -0.31196895241737366, + "logits/rejected": -0.3131485879421234, + "logps/chosen": -451.7205505371094, + "logps/rejected": -461.85467529296875, + "loss": 0.4296, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.469062328338623, + "rewards/margins": 1.1974780559539795, + "rewards/rejected": -3.6665406227111816, "step": 1000 }, { - "epoch": 0.53, - "eval_directrewards_student/accuracies": 1.0, - "eval_logits/chosen": -0.4071754813194275, - "eval_logits/chosen_friction": -0.41341301798820496, - "eval_logits/rejected": -0.3904404938220978, - "eval_logits/rejected_friction": -0.38157445192337036, - "eval_logps/chosen": -3.1261370182037354, - "eval_logps/chosen_friction": -1.24674391746521, - "eval_logps/rejected": -4.986579895019531, - "eval_logps/rejected_friction": -4.3812665939331055, - "eval_loss": 0.0026232702657580376, - "eval_policy_friction_nll_loss": 1.24674391746521, - "eval_policy_nll_loss": 3.1261370182037354, - "eval_rewards/accuracies": 1.0, - "eval_rewards/accuracies_friction": 1.0, - "eval_rewards/chosen": -0.2526715099811554, - "eval_rewards/chosen_fricton": -0.07640566676855087, - "eval_rewards/margins": 0.18256652355194092, - "eval_rewards/margins_friction": 0.3093050718307495, - "eval_rewards/rejected": -0.4352380335330963, - "eval_rewards/rejected_friction": -0.385710746049881, - "eval_runtime": 549.2162, - "eval_samples_per_second": 0.91, - "eval_steps_per_second": 0.455, + "epoch": 0.3, + "eval_logits/chosen": -0.3799600601196289, + "eval_logits/rejected": -0.3806193768978119, + "eval_logps/chosen": -436.8405456542969, + "eval_logps/rejected": -445.942626953125, + "eval_loss": 0.48261019587516785, + "eval_rewards/accuracies": 0.7459999918937683, + "eval_rewards/chosen": -2.3949320316314697, + "eval_rewards/margins": 1.1014209985733032, + "eval_rewards/rejected": -3.4963533878326416, + "eval_runtime": 377.1489, + "eval_samples_per_second": 1.326, + "eval_steps_per_second": 1.326, "step": 1000 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.54, - "learning_rate": 4.6672455034868775e-06, - "logits/chosen": -0.374799907207489, - "logits/chosen_friction": -0.38051724433898926, - "logits/rejected": -0.35578733682632446, - "logits/rejected_friction": -0.3547082543373108, - "logps/chosen": -3.036132335662842, - "logps/chosen_friction": -1.2264569997787476, - "logps/rejected": -4.915272235870361, - "logps/rejected_friction": -4.231269359588623, - "loss": 0.0032, - "policy_friction_nll_loss": 1.2118504047393799, - "policy_nll_loss": 3.012233018875122, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24629385769367218, - "rewards/chosen_fricton": -0.07671640813350677, - "rewards/margins": 0.18465812504291534, - "rewards/margins_friction": 0.2968679964542389, - "rewards/rejected": -0.4309520125389099, - "rewards/rejected_friction": -0.37358441948890686, - "step": 1005 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.54, - "learning_rate": 4.663970030581408e-06, - "logits/chosen": -0.36469775438308716, - "logits/chosen_friction": -0.3682125210762024, - "logits/rejected": -0.3548581302165985, - "logits/rejected_friction": -0.34119123220443726, - "logps/chosen": -3.072834014892578, - "logps/chosen_friction": -1.2291991710662842, - "logps/rejected": -4.770666599273682, - "logps/rejected_friction": -4.381205081939697, - "loss": 0.0026, - "policy_friction_nll_loss": 1.2165277004241943, - "policy_nll_loss": 3.042050838470459, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.245834082365036, - "rewards/chosen_fricton": -0.07342320680618286, - "rewards/margins": 0.16598615050315857, - "rewards/margins_friction": 0.3107037842273712, - "rewards/rejected": -0.4118202328681946, - "rewards/rejected_friction": -0.3841269612312317, + "epoch": 0.31, + "learning_rate": 3.880469730675311e-06, + "logits/chosen": -0.31937772035598755, + "logits/rejected": -0.3201027512550354, + "logps/chosen": -444.93267822265625, + "logps/rejected": -454.3338317871094, + "loss": 0.4744, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.4238359928131104, + "rewards/margins": 1.1197197437286377, + "rewards/rejected": -3.543555736541748, "step": 1010 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.54, - "learning_rate": 4.660679676505111e-06, - "logits/chosen": -0.3714122474193573, - "logits/chosen_friction": -0.37456613779067993, - "logits/rejected": -0.3601820766925812, - "logits/rejected_friction": -0.3472662568092346, - "logps/chosen": -3.1400444507598877, - "logps/chosen_friction": -1.2788856029510498, - "logps/rejected": -4.897824287414551, - "logps/rejected_friction": -4.492516994476318, - "loss": 0.0021, - "policy_friction_nll_loss": 1.259484052658081, - "policy_nll_loss": 3.1049931049346924, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25229310989379883, - "rewards/chosen_fricton": -0.07804089039564133, - "rewards/margins": 0.17292067408561707, - "rewards/margins_friction": 0.31765347719192505, - "rewards/rejected": -0.4252137541770935, - "rewards/rejected_friction": -0.3956943452358246, - "step": 1015 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.54, - "learning_rate": 4.657374463885063e-06, - "logits/chosen": -0.36567530035972595, - "logits/chosen_friction": -0.37142518162727356, - "logits/rejected": -0.350445032119751, - "logits/rejected_friction": -0.3458746373653412, - "logps/chosen": -2.9477200508117676, - "logps/chosen_friction": -1.217098593711853, - "logps/rejected": -4.846413612365723, - "logps/rejected_friction": -4.183579444885254, - "loss": 0.0023, - "policy_friction_nll_loss": 1.1917505264282227, - "policy_nll_loss": 2.9068028926849365, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23734703660011292, - "rewards/chosen_fricton": -0.07518737018108368, - "rewards/margins": 0.18735137581825256, - "rewards/margins_friction": 0.29354315996170044, - "rewards/rejected": -0.4246984124183655, - "rewards/rejected_friction": -0.36873048543930054, + "epoch": 0.31, + "learning_rate": 3.857809793701082e-06, + "logits/chosen": -0.3155730664730072, + "logits/rejected": -0.31668931245803833, + "logps/chosen": -447.9942932128906, + "logps/rejected": -458.11199951171875, + "loss": 0.4398, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.2663698196411133, + "rewards/margins": 1.3081058263778687, + "rewards/rejected": -3.5744757652282715, "step": 1020 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.55, - "learning_rate": 4.654054415450523e-06, - "logits/chosen": -0.3616432249546051, - "logits/chosen_friction": -0.36732372641563416, - "logits/rejected": -0.35136255621910095, - "logits/rejected_friction": -0.3418830335140228, - "logps/chosen": -3.1253502368927, - "logps/chosen_friction": -1.1920180320739746, - "logps/rejected": -4.850667476654053, - "logps/rejected_friction": -4.240795135498047, - "loss": 0.0043, - "policy_friction_nll_loss": 1.1673812866210938, - "policy_nll_loss": 3.055177688598633, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2524583339691162, - "rewards/chosen_fricton": -0.07119692862033844, - "rewards/margins": 0.1701110452413559, - "rewards/margins_friction": 0.3018214702606201, - "rewards/rejected": -0.4225694239139557, - "rewards/rejected_friction": -0.37301844358444214, - "step": 1025 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.55, - "learning_rate": 4.650719554032773e-06, - "logits/chosen": -0.36856725811958313, - "logits/chosen_friction": -0.37209707498550415, - "logits/rejected": -0.35832419991493225, - "logits/rejected_friction": -0.3437844216823578, - "logps/chosen": -3.1988401412963867, - "logps/chosen_friction": -1.2443304061889648, - "logps/rejected": -4.925429821014404, - "logps/rejected_friction": -4.472928047180176, - "loss": 0.0026, - "policy_friction_nll_loss": 1.2292373180389404, - "policy_nll_loss": 3.1713833808898926, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25695037841796875, - "rewards/chosen_fricton": -0.07355404645204544, - "rewards/margins": 0.16997134685516357, - "rewards/margins_friction": 0.3197566866874695, - "rewards/rejected": -0.42692169547080994, - "rewards/rejected_friction": -0.3933107256889343, + "epoch": 0.31, + "learning_rate": 3.834990511727341e-06, + "logits/chosen": -0.3186780512332916, + "logits/rejected": -0.32040825486183167, + "logps/chosen": -445.6949768066406, + "logps/rejected": -458.57244873046875, + "loss": 0.4537, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.343160629272461, + "rewards/margins": 1.2446506023406982, + "rewards/rejected": -3.587811231613159, "step": 1030 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.55, - "learning_rate": 4.647369902564959e-06, - "logits/chosen": -0.38325849175453186, - "logits/chosen_friction": -0.3889881670475006, - "logits/rejected": -0.3742901682853699, - "logits/rejected_friction": -0.3605496287345886, - "logps/chosen": -3.0758674144744873, - "logps/chosen_friction": -1.2071397304534912, - "logps/rejected": -4.788763523101807, - "logps/rejected_friction": -4.406740665435791, - "loss": 0.0026, - "policy_friction_nll_loss": 1.197741985321045, - "policy_nll_loss": 3.0572962760925293, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24724180996418, - "rewards/chosen_fricton": -0.07261103391647339, - "rewards/margins": 0.16850602626800537, - "rewards/margins_friction": 0.3164690136909485, - "rewards/rejected": -0.4157477915287018, - "rewards/rejected_friction": -0.3890800476074219, - "step": 1035 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.55, - "learning_rate": 4.644005484081937e-06, - "logits/chosen": -0.3637835383415222, - "logits/chosen_friction": -0.36921465396881104, - "logits/rejected": -0.3535746932029724, - "logits/rejected_friction": -0.3425424098968506, - "logps/chosen": -3.0873525142669678, - "logps/chosen_friction": -1.1701781749725342, - "logps/rejected": -4.873215675354004, - "logps/rejected_friction": -4.415847301483154, - "loss": 0.0022, - "policy_friction_nll_loss": 1.152662992477417, - "policy_nll_loss": 3.043126344680786, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2472967803478241, - "rewards/chosen_fricton": -0.06806384027004242, - "rewards/margins": 0.17485937476158142, - "rewards/margins_friction": 0.3199538588523865, - "rewards/rejected": -0.4221561551094055, - "rewards/rejected_friction": -0.3880176842212677, + "epoch": 0.32, + "learning_rate": 3.812014562698002e-06, + "logits/chosen": -0.320089191198349, + "logits/rejected": -0.3210357427597046, + "logps/chosen": -441.82354736328125, + "logps/rejected": -449.4639587402344, + "loss": 0.5402, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.470724582672119, + "rewards/margins": 0.9423478841781616, + "rewards/rejected": -3.4130725860595703, "step": 1040 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.56, - "learning_rate": 4.6406263217201095e-06, - "logits/chosen": -0.35965457558631897, - "logits/chosen_friction": -0.3660734295845032, - "logits/rejected": -0.3487134575843811, - "logits/rejected_friction": -0.3402903079986572, - "logps/chosen": -3.0207598209381104, - "logps/chosen_friction": -1.2652195692062378, - "logps/rejected": -4.816288948059082, - "logps/rejected_friction": -4.356043815612793, - "loss": 0.0056, - "policy_friction_nll_loss": 1.220547080039978, - "policy_nll_loss": 2.9585022926330566, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24168309569358826, - "rewards/chosen_fricton": -0.07784043997526169, - "rewards/margins": 0.17721618711948395, - "rewards/margins_friction": 0.3063153922557831, - "rewards/rejected": -0.4188992977142334, - "rewards/rejected_friction": -0.384155809879303, - "step": 1045 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.56, - "learning_rate": 4.637232438717274e-06, - "logits/chosen": -0.38082361221313477, - "logits/chosen_friction": -0.38749244809150696, - "logits/rejected": -0.3639315962791443, - "logits/rejected_friction": -0.3602389693260193, - "logps/chosen": -2.981699228286743, - "logps/chosen_friction": -1.1486411094665527, - "logps/rejected": -4.899061679840088, - "logps/rejected_friction": -4.200716495513916, - "loss": 0.0025, - "policy_friction_nll_loss": 1.1381720304489136, - "policy_nll_loss": 2.9542527198791504, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24101948738098145, - "rewards/chosen_fricton": -0.06897245347499847, - "rewards/margins": 0.18897728621959686, - "rewards/margins_friction": 0.3017999529838562, - "rewards/rejected": -0.4299967885017395, - "rewards/rejected_friction": -0.37077245116233826, + "epoch": 0.32, + "learning_rate": 3.788884642942555e-06, + "logits/chosen": -0.32223668694496155, + "logits/rejected": -0.32441529631614685, + "logps/chosen": -444.36328125, + "logps/rejected": -457.4508361816406, + "loss": 0.4432, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.3148508071899414, + "rewards/margins": 1.2026770114898682, + "rewards/rejected": -3.5175278186798096, "step": 1050 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.56, - "learning_rate": 4.6338238584124554e-06, - "logits/chosen": -0.3762897849082947, - "logits/chosen_friction": -0.38511165976524353, - "logits/rejected": -0.35907474160194397, - "logits/rejected_friction": -0.3553256392478943, - "logps/chosen": -2.98772931098938, - "logps/chosen_friction": -1.1636947393417358, - "logps/rejected": -4.8927998542785645, - "logps/rejected_friction": -4.241664886474609, - "loss": 0.0029, - "policy_friction_nll_loss": 1.1514594554901123, - "policy_nll_loss": 2.959547758102417, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2409096658229828, - "rewards/chosen_fricton": -0.07027852535247803, - "rewards/margins": 0.1873967945575714, - "rewards/margins_friction": 0.30391743779182434, - "rewards/rejected": -0.4283064305782318, - "rewards/rejected_friction": -0.3741959035396576, - "step": 1055 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.57, - "learning_rate": 4.6304006042457495e-06, - "logits/chosen": -0.36704739928245544, - "logits/chosen_friction": -0.37259453535079956, - "logits/rejected": -0.3580113351345062, - "logits/rejected_friction": -0.3443640470504761, - "logps/chosen": -3.0953316688537598, - "logps/chosen_friction": -1.2058355808258057, - "logps/rejected": -4.841550350189209, - "logps/rejected_friction": -4.441107273101807, - "loss": 0.0019, - "policy_friction_nll_loss": 1.1881023645401, - "policy_nll_loss": 3.064936876296997, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24828505516052246, - "rewards/chosen_fricton": -0.07128670066595078, - "rewards/margins": 0.17153121531009674, - "rewards/margins_friction": 0.31964820623397827, - "rewards/rejected": -0.419816255569458, - "rewards/rejected_friction": -0.39093491435050964, + "epoch": 0.32, + "learning_rate": 3.765603466859635e-06, + "logits/chosen": -0.31094425916671753, + "logits/rejected": -0.3124980330467224, + "logps/chosen": -439.57025146484375, + "logps/rejected": -453.1656799316406, + "loss": 0.4585, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.423621654510498, + "rewards/margins": 1.1653788089752197, + "rewards/rejected": -3.589000701904297, "step": 1060 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.57, - "learning_rate": 4.626962699758163e-06, - "logits/chosen": -0.3705756664276123, - "logits/chosen_friction": -0.3754631280899048, - "logits/rejected": -0.3591746985912323, - "logits/rejected_friction": -0.3502013087272644, - "logps/chosen": -3.0799148082733154, - "logps/chosen_friction": -1.1587469577789307, - "logps/rejected": -4.864030838012695, - "logps/rejected_friction": -4.3332414627075195, - "loss": 0.0026, - "policy_friction_nll_loss": 1.1382522583007812, - "policy_nll_loss": 3.0472424030303955, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24795517325401306, - "rewards/chosen_fricton": -0.06787263602018356, - "rewards/margins": 0.17523761093616486, - "rewards/margins_friction": 0.31322431564331055, - "rewards/rejected": -0.4231927990913391, - "rewards/rejected_friction": -0.3810969293117523, - "step": 1065 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.57, - "learning_rate": 4.623510168591446e-06, - "logits/chosen": -0.3664952218532562, - "logits/chosen_friction": -0.37238770723342896, - "logits/rejected": -0.35473695397377014, - "logits/rejected_friction": -0.3425300717353821, - "logps/chosen": -3.2536263465881348, - "logps/chosen_friction": -1.2192201614379883, - "logps/rejected": -5.11106538772583, - "logps/rejected_friction": -4.563320159912109, - "loss": 0.0024, - "policy_friction_nll_loss": 1.2047474384307861, - "policy_nll_loss": 3.2262866497039795, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.26275795698165894, - "rewards/chosen_fricton": -0.07137826830148697, - "rewards/margins": 0.1822093427181244, - "rewards/margins_friction": 0.33018380403518677, - "rewards/rejected": -0.44496726989746094, - "rewards/rejected_friction": -0.40156206488609314, + "epoch": 0.32, + "learning_rate": 3.7421737665984807e-06, + "logits/chosen": -0.32444941997528076, + "logits/rejected": -0.3258149325847626, + "logps/chosen": -444.17742919921875, + "logps/rejected": -454.7518005371094, + "loss": 0.485, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.465573310852051, + "rewards/margins": 1.1133558750152588, + "rewards/rejected": -3.5789291858673096, "step": 1070 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.57, - "learning_rate": 4.620043034487939e-06, - "logits/chosen": -0.37423640489578247, - "logits/chosen_friction": -0.3794357180595398, - "logits/rejected": -0.36634719371795654, - "logits/rejected_friction": -0.35180288553237915, - "logps/chosen": -3.0850884914398193, - "logps/chosen_friction": -1.1695349216461182, - "logps/rejected": -4.808932304382324, - "logps/rejected_friction": -4.456028938293457, - "loss": 0.0021, - "policy_friction_nll_loss": 1.1554557085037231, - "policy_nll_loss": 3.0548644065856934, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2485073059797287, - "rewards/chosen_fricton": -0.06870485842227936, - "rewards/margins": 0.168967142701149, - "rewards/margins_friction": 0.3243796229362488, - "rewards/rejected": -0.4174744188785553, - "rewards/rejected_friction": -0.39308446645736694, - "step": 1075 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.58, - "learning_rate": 4.616561321290398e-06, - "logits/chosen": -0.36667269468307495, - "logits/chosen_friction": -0.37086063623428345, - "logits/rejected": -0.35781770944595337, - "logits/rejected_friction": -0.3426331877708435, - "logps/chosen": -3.044682264328003, - "logps/chosen_friction": -1.182642936706543, - "logps/rejected": -4.787392616271973, - "logps/rejected_friction": -4.543292999267578, - "loss": 0.002, - "policy_friction_nll_loss": 1.1603862047195435, - "policy_nll_loss": 2.996687173843384, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24478872120380402, - "rewards/chosen_fricton": -0.06914345175027847, - "rewards/margins": 0.16992580890655518, - "rewards/margins_friction": 0.33082959055900574, - "rewards/rejected": -0.4147145748138428, - "rewards/rejected_friction": -0.3999730050563812, + "epoch": 0.33, + "learning_rate": 3.7185982917382986e-06, + "logits/chosen": -0.32046034932136536, + "logits/rejected": -0.3209912180900574, + "logps/chosen": -449.0337829589844, + "logps/rejected": -456.1290588378906, + "loss": 0.5036, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.421260356903076, + "rewards/margins": 1.0578956604003906, + "rewards/rejected": -3.479156017303467, "step": 1080 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.58, - "learning_rate": 4.6130650529418394e-06, - "logits/chosen": -0.36826851963996887, - "logits/chosen_friction": -0.3717716932296753, - "logits/rejected": -0.3584999442100525, - "logits/rejected_friction": -0.34458640217781067, - "logps/chosen": -3.143925428390503, - "logps/chosen_friction": -1.169560194015503, - "logps/rejected": -4.975898742675781, - "logps/rejected_friction": -4.4094743728637695, - "loss": 0.0024, - "policy_friction_nll_loss": 1.1589118242263794, - "policy_nll_loss": 3.1059093475341797, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2533486783504486, - "rewards/chosen_fricton": -0.06803341209888458, - "rewards/margins": 0.1791292428970337, - "rewards/margins_friction": 0.3194468915462494, - "rewards/rejected": -0.4324779510498047, - "rewards/rejected_friction": -0.3874802589416504, - "step": 1085 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.58, - "learning_rate": 4.6095542534853734e-06, - "logits/chosen": -0.371587336063385, - "logits/chosen_friction": -0.37760108709335327, - "logits/rejected": -0.3555970788002014, - "logits/rejected_friction": -0.34689557552337646, - "logps/chosen": -3.0456509590148926, - "logps/chosen_friction": -1.2287527322769165, - "logps/rejected": -4.908329486846924, - "logps/rejected_friction": -4.340825080871582, - "loss": 0.002, - "policy_friction_nll_loss": 1.212834119796753, - "policy_nll_loss": 3.0085301399230957, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24436406791210175, - "rewards/chosen_fricton": -0.0743357390165329, - "rewards/margins": 0.18296189606189728, - "rewards/margins_friction": 0.3073204457759857, - "rewards/rejected": -0.4273260235786438, - "rewards/rejected_friction": -0.3816561698913574, + "epoch": 0.33, + "learning_rate": 3.6948798089655913e-06, + "logits/chosen": -0.3232346177101135, + "logits/rejected": -0.3241461217403412, + "logps/chosen": -448.1339416503906, + "logps/rejected": -455.69970703125, + "loss": 0.4664, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.2551751136779785, + "rewards/margins": 1.1089966297149658, + "rewards/rejected": -3.3641715049743652, "step": 1090 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.58, - "learning_rate": 4.606028947064034e-06, - "logits/chosen": -0.36051782965660095, - "logits/chosen_friction": -0.3658129572868347, - "logits/rejected": -0.3456508219242096, - "logits/rejected_friction": -0.3367251753807068, - "logps/chosen": -3.183877944946289, - "logps/chosen_friction": -1.2536100149154663, - "logps/rejected": -5.028787136077881, - "logps/rejected_friction": -4.4506964683532715, - "loss": 0.0014, - "policy_friction_nll_loss": 1.2344648838043213, - "policy_nll_loss": 3.142618179321289, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2570490539073944, - "rewards/chosen_fricton": -0.07575374096632004, - "rewards/margins": 0.18177969753742218, - "rewards/margins_friction": 0.316478967666626, - "rewards/rejected": -0.4388287663459778, - "rewards/rejected_friction": -0.3922327160835266, - "step": 1095 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.59, - "learning_rate": 4.602489157920619e-06, - "logits/chosen": -0.35758960247039795, - "logits/chosen_friction": -0.362274706363678, - "logits/rejected": -0.3488675355911255, - "logits/rejected_friction": -0.3338541090488434, - "logps/chosen": -3.2293498516082764, - "logps/chosen_friction": -1.2823388576507568, - "logps/rejected": -4.9878034591674805, - "logps/rejected_friction": -4.581715106964111, - "loss": 0.0022, - "policy_friction_nll_loss": 1.2509936094284058, - "policy_nll_loss": 3.172642469406128, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25988712906837463, - "rewards/chosen_fricton": -0.07667706161737442, - "rewards/margins": 0.17233756184577942, - "rewards/margins_friction": 0.3255767226219177, - "rewards/rejected": -0.43222469091415405, - "rewards/rejected_friction": -0.40225377678871155, + "epoch": 0.33, + "learning_rate": 3.671021101749476e-06, + "logits/chosen": -0.3160512447357178, + "logits/rejected": -0.3167613744735718, + "logps/chosen": -434.97698974609375, + "logps/rejected": -441.48065185546875, + "loss": 0.4634, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.224177122116089, + "rewards/margins": 1.158850908279419, + "rewards/rejected": -3.383028507232666, "step": 1100 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.59, - "learning_rate": 4.598934910397521e-06, - "logits/chosen": -0.3768940567970276, - "logits/chosen_friction": -0.3836180865764618, - "logits/rejected": -0.36368197202682495, - "logits/rejected_friction": -0.35540276765823364, - "logps/chosen": -3.186720609664917, - "logps/chosen_friction": -1.2080342769622803, - "logps/rejected": -4.9918012619018555, - "logps/rejected_friction": -4.415738105773926, - "loss": 0.003, - "policy_friction_nll_loss": 1.197378158569336, - "policy_nll_loss": 3.1612396240234375, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2583980858325958, - "rewards/chosen_fricton": -0.07342822104692459, - "rewards/margins": 0.17734625935554504, - "rewards/margins_friction": 0.3169638514518738, - "rewards/rejected": -0.43574434518814087, - "rewards/rejected_friction": -0.39039212465286255, - "step": 1105 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.59, - "learning_rate": 4.59536622893656e-06, - "logits/chosen": -0.367135226726532, - "logits/chosen_friction": -0.3728356957435608, - "logits/rejected": -0.35370272397994995, - "logits/rejected_friction": -0.34470289945602417, - "logps/chosen": -3.223998546600342, - "logps/chosen_friction": -1.2438987493515015, - "logps/rejected": -4.97867488861084, - "logps/rejected_friction": -4.383957862854004, - "loss": 0.0022, - "policy_friction_nll_loss": 1.231858491897583, - "policy_nll_loss": 3.196272850036621, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.26295581459999084, - "rewards/chosen_fricton": -0.0760059729218483, - "rewards/margins": 0.17185159027576447, - "rewards/margins_friction": 0.3093425929546356, - "rewards/rejected": -0.4348073899745941, - "rewards/rejected_friction": -0.3853485882282257, + "epoch": 0.34, + "learning_rate": 3.6470249700150273e-06, + "logits/chosen": -0.31829750537872314, + "logits/rejected": -0.3188309669494629, + "logps/chosen": -440.1014099121094, + "logps/rejected": -448.751220703125, + "loss": 0.4287, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.001333713531494, + "rewards/margins": 1.3394745588302612, + "rewards/rejected": -3.340808153152466, "step": 1110 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.59, - "learning_rate": 4.5917831380788124e-06, - "logits/chosen": -0.37418264150619507, - "logits/chosen_friction": -0.38167405128479004, - "logits/rejected": -0.35861241817474365, - "logits/rejected_friction": -0.35379812121391296, - "logps/chosen": -3.1970608234405518, - "logps/chosen_friction": -1.1855926513671875, - "logps/rejected": -5.1176371574401855, - "logps/rejected_friction": -4.329601764678955, - "loss": 0.0027, - "policy_friction_nll_loss": 1.1714450120925903, - "policy_nll_loss": 3.16166353225708, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2617768347263336, - "rewards/chosen_fricton": -0.07203789055347443, - "rewards/margins": 0.1892777979373932, - "rewards/margins_friction": 0.310890257358551, - "rewards/rejected": -0.4510546326637268, - "rewards/rejected_friction": -0.38292819261550903, - "step": 1115 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.6, - "learning_rate": 4.58818566246445e-06, - "logits/chosen": -0.36457520723342896, - "logits/chosen_friction": -0.3709366023540497, - "logits/rejected": -0.3468713164329529, - "logits/rejected_friction": -0.3424617350101471, - "logps/chosen": -3.153019428253174, - "logps/chosen_friction": -1.2188167572021484, - "logps/rejected": -5.056092739105225, - "logps/rejected_friction": -4.354006767272949, - "loss": 0.0021, - "policy_friction_nll_loss": 1.2032506465911865, - "policy_nll_loss": 3.1168761253356934, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2573917806148529, - "rewards/chosen_fricton": -0.0754016786813736, - "rewards/margins": 0.186543270945549, - "rewards/margins_friction": 0.3093253970146179, - "rewards/rejected": -0.4439350664615631, - "rewards/rejected_friction": -0.3847270607948303, + "epoch": 0.34, + "learning_rate": 3.6228942298146985e-06, + "logits/chosen": -0.31696969270706177, + "logits/rejected": -0.3185669183731079, + "logps/chosen": -436.61090087890625, + "logps/rejected": -446.8427734375, + "loss": 0.4086, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.968629240989685, + "rewards/margins": 1.365235686302185, + "rewards/rejected": -3.33386492729187, "step": 1120 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.6, - "learning_rate": 4.58457382683256e-06, - "logits/chosen": -0.365249365568161, - "logits/chosen_friction": -0.36914950609207153, - "logits/rejected": -0.3551255166530609, - "logits/rejected_friction": -0.34122490882873535, - "logps/chosen": -3.14412522315979, - "logps/chosen_friction": -1.1951981782913208, - "logps/rejected": -4.8656325340271, - "logps/rejected_friction": -4.406551361083984, - "loss": 0.0026, - "policy_friction_nll_loss": 1.1847360134124756, - "policy_nll_loss": 3.1161856651306152, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25356119871139526, - "rewards/chosen_fricton": -0.06966500729322433, - "rewards/margins": 0.16922540962696075, - "rewards/margins_friction": 0.317890465259552, - "rewards/rejected": -0.42278656363487244, - "rewards/rejected_friction": -0.38755548000335693, - "step": 1125 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.6, - "learning_rate": 4.580947656020985e-06, - "logits/chosen": -0.36573195457458496, - "logits/chosen_friction": -0.3712257444858551, - "logits/rejected": -0.35178881883621216, - "logits/rejected_friction": -0.34195175766944885, - "logps/chosen": -3.0683538913726807, - "logps/chosen_friction": -1.1953330039978027, - "logps/rejected": -4.911564826965332, - "logps/rejected_friction": -4.422548294067383, - "loss": 0.0019, - "policy_friction_nll_loss": 1.176607370376587, - "policy_nll_loss": 3.0246059894561768, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24695761501789093, - "rewards/chosen_fricton": -0.07111116498708725, - "rewards/margins": 0.1808309108018875, - "rewards/margins_friction": 0.318707138299942, - "rewards/rejected": -0.42778855562210083, - "rewards/rejected_friction": -0.38981834053993225, + "epoch": 0.34, + "learning_rate": 3.598631712997841e-06, + "logits/chosen": -0.3232669234275818, + "logits/rejected": -0.32362625002861023, + "logps/chosen": -445.9930114746094, + "logps/rejected": -456.0194396972656, + "loss": 0.4797, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.1778838634490967, + "rewards/margins": 1.1961848735809326, + "rewards/rejected": -3.3740687370300293, "step": 1130 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.61, - "learning_rate": 4.5773071749661454e-06, - "logits/chosen": -0.37605005502700806, - "logits/chosen_friction": -0.38618260622024536, - "logits/rejected": -0.35825014114379883, - "logits/rejected_friction": -0.3574860692024231, - "logps/chosen": -3.0847771167755127, - "logps/chosen_friction": -1.1797045469284058, - "logps/rejected": -5.002890586853027, - "logps/rejected_friction": -4.363857746124268, - "loss": 0.0025, - "policy_friction_nll_loss": 1.1664119958877563, - "policy_nll_loss": 3.063457489013672, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25125783681869507, - "rewards/chosen_fricton": -0.07192815840244293, - "rewards/margins": 0.18921436369419098, - "rewards/margins_friction": 0.3149677813053131, - "rewards/rejected": -0.44047221541404724, - "rewards/rejected_friction": -0.38689595460891724, - "step": 1135 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.61, - "learning_rate": 4.5736524087028706e-06, - "logits/chosen": -0.3707112669944763, - "logits/chosen_friction": -0.37499764561653137, - "logits/rejected": -0.3612481951713562, - "logits/rejected_friction": -0.347322940826416, - "logps/chosen": -3.175727367401123, - "logps/chosen_friction": -1.2297520637512207, - "logps/rejected": -4.873656272888184, - "logps/rejected_friction": -4.4646172523498535, - "loss": 0.0018, - "policy_friction_nll_loss": 1.2143731117248535, - "policy_nll_loss": 3.150766134262085, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2565153241157532, - "rewards/chosen_fricton": -0.07294534146785736, - "rewards/margins": 0.16643621027469635, - "rewards/margins_friction": 0.31979691982269287, - "rewards/rejected": -0.42295151948928833, - "rewards/rejected_friction": -0.39274224638938904, + "epoch": 0.35, + "learning_rate": 3.5742402668783797e-06, + "logits/chosen": -0.31457391381263733, + "logits/rejected": -0.31524404883384705, + "logps/chosen": -434.63885498046875, + "logps/rejected": -445.6629943847656, + "loss": 0.4942, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.235532283782959, + "rewards/margins": 1.1538090705871582, + "rewards/rejected": -3.389340877532959, "step": 1140 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.61, - "learning_rate": 4.569983382364226e-06, - "logits/chosen": -0.368129700422287, - "logits/chosen_friction": -0.37041789293289185, - "logits/rejected": -0.35975462198257446, - "logits/rejected_friction": -0.3399975895881653, - "logps/chosen": -3.1987805366516113, - "logps/chosen_friction": -1.2010023593902588, - "logps/rejected": -4.826416969299316, - "logps/rejected_friction": -4.477974891662598, - "loss": 0.0025, - "policy_friction_nll_loss": 1.1823413372039795, - "policy_nll_loss": 3.1626248359680176, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2581733465194702, - "rewards/chosen_fricton": -0.07070403546094894, - "rewards/margins": 0.15903925895690918, - "rewards/margins_friction": 0.3233758807182312, - "rewards/rejected": -0.41721266508102417, - "rewards/rejected_friction": -0.39407995343208313, - "step": 1145 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.61, - "learning_rate": 4.566300121181341e-06, - "logits/chosen": -0.3739219307899475, - "logits/chosen_friction": -0.3790202736854553, - "logits/rejected": -0.36136394739151, - "logits/rejected_friction": -0.34864217042922974, - "logps/chosen": -3.156022787094116, - "logps/chosen_friction": -1.1779096126556396, - "logps/rejected": -4.940117835998535, - "logps/rejected_friction": -4.49023962020874, - "loss": 0.0029, - "policy_friction_nll_loss": 1.1632509231567383, - "policy_nll_loss": 3.125868320465088, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25518786907196045, - "rewards/chosen_fricton": -0.0693497285246849, - "rewards/margins": 0.17523162066936493, - "rewards/margins_friction": 0.32729968428611755, - "rewards/rejected": -0.4304194450378418, - "rewards/rejected_friction": -0.39664942026138306, + "epoch": 0.35, + "learning_rate": 3.549722753900662e-06, + "logits/chosen": -0.3312085270881653, + "logits/rejected": -0.33145731687545776, + "logps/chosen": -451.101806640625, + "logps/rejected": -457.88909912109375, + "loss": 0.5859, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.525597095489502, + "rewards/margins": 0.7067753672599792, + "rewards/rejected": -3.232372283935547, "step": 1150 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.62, - "learning_rate": 4.562602650483234e-06, - "logits/chosen": -0.3770318329334259, - "logits/chosen_friction": -0.3859002888202667, - "logits/rejected": -0.36063095927238464, - "logits/rejected_friction": -0.35425513982772827, - "logps/chosen": -3.1780169010162354, - "logps/chosen_friction": -1.1685822010040283, - "logps/rejected": -5.07248592376709, - "logps/rejected_friction": -4.407637596130371, - "loss": 0.0015, - "policy_friction_nll_loss": 1.1635154485702515, - "policy_nll_loss": 3.1701114177703857, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2580605149269104, - "rewards/chosen_fricton": -0.06927026808261871, - "rewards/margins": 0.18622423708438873, - "rewards/margins_friction": 0.3199447691440582, - "rewards/rejected": -0.44428473711013794, - "rewards/rejected_friction": -0.38921505212783813, - "step": 1155 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.62, - "learning_rate": 4.558890995696642e-06, - "logits/chosen": -0.3810650706291199, - "logits/chosen_friction": -0.38652223348617554, - "logits/rejected": -0.3697448968887329, - "logits/rejected_friction": -0.35558274388313293, - "logps/chosen": -3.0984365940093994, - "logps/chosen_friction": -1.1787409782409668, - "logps/rejected": -4.8024492263793945, - "logps/rejected_friction": -4.377355098724365, - "loss": 0.003, - "policy_friction_nll_loss": 1.1714345216751099, - "policy_nll_loss": 3.0823209285736084, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.251006543636322, - "rewards/chosen_fricton": -0.07023533433675766, - "rewards/margins": 0.16731995344161987, - "rewards/margins_friction": 0.3161315321922302, - "rewards/rejected": -0.4183264672756195, - "rewards/rejected_friction": -0.38636690378189087, + "epoch": 0.35, + "learning_rate": 3.5250820513035403e-06, + "logits/chosen": -0.3225269615650177, + "logits/rejected": -0.3232432007789612, + "logps/chosen": -438.2666931152344, + "logps/rejected": -450.084716796875, + "loss": 0.4502, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.3596291542053223, + "rewards/margins": 1.153564691543579, + "rewards/rejected": -3.5131936073303223, "step": 1160 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.62, - "learning_rate": 4.555165182345841e-06, - "logits/chosen": -0.37553825974464417, - "logits/chosen_friction": -0.3812829554080963, - "logits/rejected": -0.3602459728717804, - "logits/rejected_friction": -0.35146430134773254, - "logps/chosen": -3.0681076049804688, - "logps/chosen_friction": -1.1933012008666992, - "logps/rejected": -4.91904354095459, - "logps/rejected_friction": -4.4234299659729, - "loss": 0.0026, - "policy_friction_nll_loss": 1.1621935367584229, - "policy_nll_loss": 3.0175366401672363, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2482847273349762, - "rewards/chosen_fricton": -0.07229878008365631, - "rewards/margins": 0.18123985826969147, - "rewards/margins_friction": 0.31857961416244507, - "rewards/rejected": -0.42952457070350647, - "rewards/rejected_friction": -0.3908784091472626, - "step": 1165 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.62, - "learning_rate": 4.551425236052473e-06, - "logits/chosen": -0.38269466161727905, - "logits/chosen_friction": -0.3925485610961914, - "logits/rejected": -0.36680012941360474, - "logits/rejected_friction": -0.3630061745643616, - "logps/chosen": -2.9643197059631348, - "logps/chosen_friction": -1.1431258916854858, - "logps/rejected": -4.87282657623291, - "logps/rejected_friction": -4.332942485809326, - "loss": 0.0019, - "policy_friction_nll_loss": 1.1292153596878052, - "policy_nll_loss": 2.937045097351074, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23803088068962097, - "rewards/chosen_fricton": -0.06823913007974625, - "rewards/margins": 0.18779222667217255, - "rewards/margins_friction": 0.3153873085975647, - "rewards/rejected": -0.4258231222629547, - "rewards/rejected_friction": -0.38362643122673035, + "epoch": 0.36, + "learning_rate": 3.500321050782717e-06, + "logits/chosen": -0.3299608826637268, + "logits/rejected": -0.33111685514450073, + "logps/chosen": -435.506103515625, + "logps/rejected": -449.41644287109375, + "loss": 0.4587, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.255194664001465, + "rewards/margins": 1.2196067571640015, + "rewards/rejected": -3.474801540374756, "step": 1170 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.63, - "learning_rate": 4.5476711825353676e-06, - "logits/chosen": -0.3621614873409271, - "logits/chosen_friction": -0.36884158849716187, - "logits/rejected": -0.34815844893455505, - "logits/rejected_friction": -0.34159064292907715, - "logps/chosen": -3.075427532196045, - "logps/chosen_friction": -1.1750457286834717, - "logps/rejected": -4.93013858795166, - "logps/rejected_friction": -4.337807655334473, - "loss": 0.0024, - "policy_friction_nll_loss": 1.156554937362671, - "policy_nll_loss": 3.022251844406128, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24866601824760437, - "rewards/chosen_fricton": -0.06998519599437714, - "rewards/margins": 0.18286970257759094, - "rewards/margins_friction": 0.31303393840789795, - "rewards/rejected": -0.4315357208251953, - "rewards/rejected_friction": -0.3830190896987915, - "step": 1175 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.63, - "learning_rate": 4.54390304761037e-06, - "logits/chosen": -0.3771347105503082, - "logits/chosen_friction": -0.38475775718688965, - "logits/rejected": -0.361184298992157, - "logits/rejected_friction": -0.35538774728775024, - "logps/chosen": -3.2437667846679688, - "logps/chosen_friction": -1.2449829578399658, - "logps/rejected": -5.096177577972412, - "logps/rejected_friction": -4.404180526733398, - "loss": 0.0055, - "policy_friction_nll_loss": 1.2291433811187744, - "policy_nll_loss": 3.217484712600708, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.26375848054885864, - "rewards/chosen_fricton": -0.07596387714147568, - "rewards/margins": 0.18242189288139343, - "rewards/margins_friction": 0.31256377696990967, - "rewards/rejected": -0.44618040323257446, - "rewards/rejected_friction": -0.38852769136428833, + "epoch": 0.36, + "learning_rate": 3.4754426581513866e-06, + "logits/chosen": -0.3299122750759125, + "logits/rejected": -0.33067744970321655, + "logps/chosen": -450.20074462890625, + "logps/rejected": -456.9153747558594, + "loss": 0.4929, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.382310390472412, + "rewards/margins": 1.0834969282150269, + "rewards/rejected": -3.4658074378967285, "step": 1180 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.63, - "learning_rate": 4.540120857190157e-06, - "logits/chosen": -0.37518441677093506, - "logits/chosen_friction": -0.3844323456287384, - "logits/rejected": -0.3600298762321472, - "logits/rejected_friction": -0.3549543023109436, - "logps/chosen": -3.190631866455078, - "logps/chosen_friction": -1.1982018947601318, - "logps/rejected": -5.104979515075684, - "logps/rejected_friction": -4.441027641296387, - "loss": 0.0028, - "policy_friction_nll_loss": 1.185732126235962, - "policy_nll_loss": 3.161162853240967, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25944775342941284, - "rewards/chosen_fricton": -0.07176627963781357, - "rewards/margins": 0.18772082030773163, - "rewards/margins_friction": 0.3201674818992615, - "rewards/rejected": -0.44716858863830566, - "rewards/rejected_friction": -0.39193373918533325, - "step": 1185 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.63, - "learning_rate": 4.536324637284065e-06, - "logits/chosen": -0.3757483959197998, - "logits/chosen_friction": -0.3844298720359802, - "logits/rejected": -0.3575108051300049, - "logits/rejected_friction": -0.3543790280818939, - "logps/chosen": -3.088925838470459, - "logps/chosen_friction": -1.184146761894226, - "logps/rejected": -4.880747318267822, - "logps/rejected_friction": -4.203349590301514, - "loss": 0.003, - "policy_friction_nll_loss": 1.1742477416992188, - "policy_nll_loss": 3.0699517726898193, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25059330463409424, - "rewards/chosen_fricton": -0.07194038480520248, - "rewards/margins": 0.17505332827568054, - "rewards/margins_friction": 0.29712802171707153, - "rewards/rejected": -0.4256466329097748, - "rewards/rejected_friction": -0.3690684139728546, + "epoch": 0.36, + "learning_rate": 3.45044979299923e-06, + "logits/chosen": -0.3264179527759552, + "logits/rejected": -0.32756897807121277, + "logps/chosen": -442.2974548339844, + "logps/rejected": -449.1595764160156, + "loss": 0.4977, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.263164520263672, + "rewards/margins": 1.0983796119689941, + "rewards/rejected": -3.361544370651245, "step": 1190 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.64, - "learning_rate": 4.532514413997906e-06, - "logits/chosen": -0.36112529039382935, - "logits/chosen_friction": -0.36735114455223083, - "logits/rejected": -0.3434043824672699, - "logits/rejected_friction": -0.3358974754810333, - "logps/chosen": -3.461662769317627, - "logps/chosen_friction": -1.390560269355774, - "logps/rejected": -5.201237678527832, - "logps/rejected_friction": -4.486556053161621, - "loss": 0.0044, - "policy_friction_nll_loss": 1.3848074674606323, - "policy_nll_loss": 3.4450249671936035, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.28125208616256714, - "rewards/chosen_fricton": -0.08654996007680893, - "rewards/margins": 0.17058750987052917, - "rewards/margins_friction": 0.3054940104484558, - "rewards/rejected": -0.4518396258354187, - "rewards/rejected_friction": -0.39204397797584534, - "step": 1195 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.64, - "learning_rate": 4.528690213533792e-06, - "logits/chosen": -0.36647117137908936, - "logits/chosen_friction": -0.37390512228012085, - "logits/rejected": -0.34774285554885864, - "logits/rejected_friction": -0.3458898067474365, - "logps/chosen": -3.219317674636841, - "logps/chosen_friction": -1.240813136100769, - "logps/rejected": -5.0573272705078125, - "logps/rejected_friction": -4.277568817138672, - "loss": 0.0019, - "policy_friction_nll_loss": 1.2211558818817139, - "policy_nll_loss": 3.183509111404419, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.26435592770576477, - "rewards/chosen_fricton": -0.0780341625213623, - "rewards/margins": 0.17995008826255798, - "rewards/margins_friction": 0.2991468608379364, - "rewards/rejected": -0.44430598616600037, - "rewards/rejected_friction": -0.3771809935569763, + "epoch": 0.36, + "learning_rate": 3.425345388349787e-06, + "logits/chosen": -0.31463193893432617, + "logits/rejected": -0.31522423028945923, + "logps/chosen": -442.2705078125, + "logps/rejected": -452.19110107421875, + "loss": 0.501, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.2270102500915527, + "rewards/margins": 1.1489759683609009, + "rewards/rejected": -3.3759865760803223, "step": 1200 }, { - "epoch": 0.64, - "eval_directrewards_student/accuracies": 1.0, - "eval_logits/chosen": -0.4044034481048584, - "eval_logits/chosen_friction": -0.41206008195877075, - "eval_logits/rejected": -0.38831180334091187, - "eval_logits/rejected_friction": -0.3788710832595825, - "eval_logps/chosen": -3.174180746078491, - "eval_logps/chosen_friction": -1.226664423942566, - "eval_logps/rejected": -5.0227251052856445, - "eval_logps/rejected_friction": -4.387862682342529, - "eval_loss": 0.0020382937509566545, - "eval_policy_friction_nll_loss": 1.226664423942566, - "eval_policy_nll_loss": 3.174180746078491, - "eval_rewards/accuracies": 1.0, - "eval_rewards/accuracies_friction": 1.0, - "eval_rewards/chosen": -0.25747591257095337, - "eval_rewards/chosen_fricton": -0.07439772039651871, - "eval_rewards/margins": 0.1813766062259674, - "eval_rewards/margins_friction": 0.31197261810302734, - "eval_rewards/rejected": -0.4388525187969208, - "eval_rewards/rejected_friction": -0.38637036085128784, - "eval_runtime": 580.0162, - "eval_samples_per_second": 0.862, - "eval_steps_per_second": 0.431, + "epoch": 0.36, + "eval_logits/chosen": -0.39124199748039246, + "eval_logits/rejected": -0.3919140696525574, + "eval_logps/chosen": -434.10162353515625, + "eval_logps/rejected": -443.24261474609375, + "eval_loss": 0.4862767159938812, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": -2.1210429668426514, + "eval_rewards/margins": 1.1053153276443481, + "eval_rewards/rejected": -3.22635817527771, + "eval_runtime": 375.0192, + "eval_samples_per_second": 1.333, + "eval_steps_per_second": 1.333, "step": 1200 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.64, - "learning_rate": 4.524852062189953e-06, - "logits/chosen": -0.3649195432662964, - "logits/chosen_friction": -0.3690287172794342, - "logits/rejected": -0.35456690192222595, - "logits/rejected_friction": -0.3399697244167328, - "logps/chosen": -3.078075647354126, - "logps/chosen_friction": -1.2266274690628052, - "logps/rejected": -4.857232093811035, - "logps/rejected_friction": -4.547089099884033, - "loss": 0.0025, - "policy_friction_nll_loss": 1.2104920148849487, - "policy_nll_loss": 3.04972767829895, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24481284618377686, - "rewards/chosen_fricton": -0.07203997671604156, - "rewards/margins": 0.1742604523897171, - "rewards/margins_friction": 0.32766619324684143, - "rewards/rejected": -0.41907328367233276, - "rewards/rejected_friction": -0.3997061848640442, - "step": 1205 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.65, - "learning_rate": 4.520999986360555e-06, - "logits/chosen": -0.3629683554172516, - "logits/chosen_friction": -0.3663240373134613, - "logits/rejected": -0.3538133502006531, - "logits/rejected_friction": -0.33665627241134644, - "logps/chosen": -3.1790082454681396, - "logps/chosen_friction": -1.2576878070831299, - "logps/rejected": -4.738929748535156, - "logps/rejected_friction": -4.5163445472717285, - "loss": 0.0028, - "policy_friction_nll_loss": 1.2326719760894775, - "policy_nll_loss": 3.1463024616241455, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25628548860549927, - "rewards/chosen_fricton": -0.07482384145259857, - "rewards/margins": 0.15373192727565765, - "rewards/margins_friction": 0.32304319739341736, - "rewards/rejected": -0.4100174307823181, - "rewards/rejected_friction": -0.3978670537471771, + "epoch": 0.37, + "learning_rate": 3.4001323903162476e-06, + "logits/chosen": -0.32597848773002625, + "logits/rejected": -0.32685333490371704, + "logps/chosen": -435.82135009765625, + "logps/rejected": -446.47552490234375, + "loss": 0.4618, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.0256919860839844, + "rewards/margins": 1.2204935550689697, + "rewards/rejected": -3.246185302734375, "step": 1210 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.65, - "learning_rate": 4.51713401253552e-06, - "logits/chosen": -0.364607036113739, - "logits/chosen_friction": -0.3720659613609314, - "logits/rejected": -0.3535701632499695, - "logits/rejected_friction": -0.3430554270744324, - "logps/chosen": -3.1721272468566895, - "logps/chosen_friction": -1.1858043670654297, - "logps/rejected": -4.976214408874512, - "logps/rejected_friction": -4.501718997955322, - "loss": 0.0019, - "policy_friction_nll_loss": 1.1659623384475708, - "policy_nll_loss": 3.127885341644287, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2581065893173218, - "rewards/chosen_fricton": -0.07152735441923141, - "rewards/margins": 0.17610910534858704, - "rewards/margins_friction": 0.3266790807247162, - "rewards/rejected": -0.4342156946659088, - "rewards/rejected_friction": -0.3982064425945282, - "step": 1215 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.65, - "learning_rate": 4.513254167300345e-06, - "logits/chosen": -0.3754827082157135, - "logits/chosen_friction": -0.3829275071620941, - "logits/rejected": -0.3622964024543762, - "logits/rejected_friction": -0.35332420468330383, - "logps/chosen": -3.1071126461029053, - "logps/chosen_friction": -1.1822060346603394, - "logps/rejected": -4.955196380615234, - "logps/rejected_friction": -4.465958118438721, - "loss": 0.002, - "policy_friction_nll_loss": 1.170851707458496, - "policy_nll_loss": 3.084169864654541, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2515243589878082, - "rewards/chosen_fricton": -0.0704888254404068, - "rewards/margins": 0.18137913942337036, - "rewards/margins_friction": 0.3242649435997009, - "rewards/rejected": -0.4329034686088562, - "rewards/rejected_friction": -0.3947537839412689, + "epoch": 0.37, + "learning_rate": 3.3748137577557216e-06, + "logits/chosen": -0.3275033235549927, + "logits/rejected": -0.3280579149723053, + "logps/chosen": -438.50384521484375, + "logps/rejected": -447.7579040527344, + "loss": 0.4531, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.154592514038086, + "rewards/margins": 1.1412467956542969, + "rewards/rejected": -3.295839309692383, "step": 1220 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.65, - "learning_rate": 4.509360477335916e-06, - "logits/chosen": -0.3609849810600281, - "logits/chosen_friction": -0.3681395649909973, - "logits/rejected": -0.3494080603122711, - "logits/rejected_friction": -0.33840781450271606, - "logps/chosen": -3.060595989227295, - "logps/chosen_friction": -1.1644461154937744, - "logps/rejected": -4.793369293212891, - "logps/rejected_friction": -4.4019293785095215, - "loss": 0.0021, - "policy_friction_nll_loss": 1.1455439329147339, - "policy_nll_loss": 3.0190742015838623, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2460218369960785, - "rewards/chosen_fricton": -0.06791749596595764, - "rewards/margins": 0.17099788784980774, - "rewards/margins_friction": 0.3207145035266876, - "rewards/rejected": -0.4170197546482086, - "rewards/rejected_friction": -0.38863199949264526, - "step": 1225 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.66, - "learning_rate": 4.50545296941833e-06, - "logits/chosen": -0.36147403717041016, - "logits/chosen_friction": -0.36553168296813965, - "logits/rejected": -0.35422682762145996, - "logits/rejected_friction": -0.33761441707611084, - "logps/chosen": -3.128159999847412, - "logps/chosen_friction": -1.1692097187042236, - "logps/rejected": -4.735873222351074, - "logps/rejected_friction": -4.4193525314331055, - "loss": 0.0034, - "policy_friction_nll_loss": 1.1466668844223022, - "policy_nll_loss": 3.08390474319458, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25160372257232666, - "rewards/chosen_fricton": -0.06717989593744278, - "rewards/margins": 0.15684764087200165, - "rewards/margins_friction": 0.32006561756134033, - "rewards/rejected": -0.4084514081478119, - "rewards/rejected_friction": -0.3872455656528473, + "epoch": 0.37, + "learning_rate": 3.3493924619219964e-06, + "logits/chosen": -0.3302023112773895, + "logits/rejected": -0.33196666836738586, + "logps/chosen": -454.8751525878906, + "logps/rejected": -466.814453125, + "loss": 0.4865, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.2140915393829346, + "rewards/margins": 1.0115987062454224, + "rewards/rejected": -3.2256903648376465, "step": 1230 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.66, - "learning_rate": 4.501531670418701e-06, - "logits/chosen": -0.37993377447128296, - "logits/chosen_friction": -0.38777273893356323, - "logits/rejected": -0.3632522225379944, - "logits/rejected_friction": -0.3578053414821625, - "logps/chosen": -3.0776679515838623, - "logps/chosen_friction": -1.1430613994598389, - "logps/rejected": -4.92777681350708, - "logps/rejected_friction": -4.314558982849121, - "loss": 0.0023, - "policy_friction_nll_loss": 1.1357834339141846, - "policy_nll_loss": 3.051799774169922, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24777288734912872, - "rewards/chosen_fricton": -0.0665973648428917, - "rewards/margins": 0.1827271282672882, - "rewards/margins_friction": 0.3140813112258911, - "rewards/rejected": -0.4305000305175781, - "rewards/rejected_friction": -0.3806787133216858, - "step": 1235 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.66, - "learning_rate": 4.497596607302986e-06, - "logits/chosen": -0.36736440658569336, - "logits/chosen_friction": -0.375444233417511, - "logits/rejected": -0.3504078984260559, - "logits/rejected_friction": -0.34538331627845764, - "logps/chosen": -3.219996929168701, - "logps/chosen_friction": -1.193960189819336, - "logps/rejected": -5.171389579772949, - "logps/rejected_friction": -4.514037132263184, - "loss": 0.0023, - "policy_friction_nll_loss": 1.1828157901763916, - "policy_nll_loss": 3.1724331378936768, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2612609565258026, - "rewards/chosen_fricton": -0.07063291221857071, - "rewards/margins": 0.19085738062858582, - "rewards/margins_friction": 0.3270162045955658, - "rewards/rejected": -0.4521183371543884, - "rewards/rejected_friction": -0.3976491391658783, + "epoch": 0.38, + "learning_rate": 3.3238714861168513e-06, + "logits/chosen": -0.3286048173904419, + "logits/rejected": -0.3293796181678772, + "logps/chosen": -436.85308837890625, + "logps/rejected": -445.0462951660156, + "loss": 0.4905, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.0171353816986084, + "rewards/margins": 1.1139663457870483, + "rewards/rejected": -3.1311020851135254, "step": 1240 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.66, - "learning_rate": 4.493647807131795e-06, - "logits/chosen": -0.3581414818763733, - "logits/chosen_friction": -0.3654397428035736, - "logits/rejected": -0.34495753049850464, - "logits/rejected_friction": -0.33794736862182617, - "logps/chosen": -3.1738247871398926, - "logps/chosen_friction": -1.1964550018310547, - "logps/rejected": -4.958683967590332, - "logps/rejected_friction": -4.468103885650635, - "loss": 0.0019, - "policy_friction_nll_loss": 1.1693153381347656, - "policy_nll_loss": 3.123990774154663, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25601932406425476, - "rewards/chosen_fricton": -0.0704459547996521, - "rewards/margins": 0.17523500323295593, - "rewards/margins_friction": 0.3231378495693207, - "rewards/rejected": -0.4312543272972107, - "rewards/rejected_friction": -0.3935838043689728, - "step": 1245 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.67, - "learning_rate": 4.489685297060202e-06, - "logits/chosen": -0.37682753801345825, - "logits/chosen_friction": -0.3841511309146881, - "logits/rejected": -0.36401838064193726, - "logits/rejected_friction": -0.35473817586898804, - "logps/chosen": -3.020150661468506, - "logps/chosen_friction": -1.1446421146392822, - "logps/rejected": -4.8727312088012695, - "logps/rejected_friction": -4.293002128601074, - "loss": 0.0024, - "policy_friction_nll_loss": 1.1329748630523682, - "policy_nll_loss": 3.001068592071533, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24361255764961243, - "rewards/chosen_fricton": -0.06789299845695496, - "rewards/margins": 0.18097494542598724, - "rewards/margins_friction": 0.3097797632217407, - "rewards/rejected": -0.42458754777908325, - "rewards/rejected_friction": -0.3776727616786957, + "epoch": 0.38, + "learning_rate": 3.29825382533995e-06, + "logits/chosen": -0.3311420679092407, + "logits/rejected": -0.3327622711658478, + "logps/chosen": -444.5484924316406, + "logps/rejected": -455.69732666015625, + "loss": 0.5066, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.358177661895752, + "rewards/margins": 0.9705084562301636, + "rewards/rejected": -3.328686237335205, "step": 1250 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.67, - "learning_rate": 4.485709104337563e-06, - "logits/chosen": -0.36721596121788025, - "logits/chosen_friction": -0.37455061078071594, - "logits/rejected": -0.35329797863960266, - "logits/rejected_friction": -0.34628716111183167, - "logps/chosen": -3.025759220123291, - "logps/chosen_friction": -1.12850821018219, - "logps/rejected": -4.854708671569824, - "logps/rejected_friction": -4.26564359664917, - "loss": 0.0012, - "policy_friction_nll_loss": 1.1162093877792358, - "policy_nll_loss": 2.9862611293792725, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2441779375076294, - "rewards/chosen_fricton": -0.06643809378147125, - "rewards/margins": 0.1791677176952362, - "rewards/margins_friction": 0.30914533138275146, - "rewards/rejected": -0.4233456552028656, - "rewards/rejected_friction": -0.3755834102630615, - "step": 1255 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.67, - "learning_rate": 4.481719256307328e-06, - "logits/chosen": -0.3656681776046753, - "logits/chosen_friction": -0.3752906918525696, - "logits/rejected": -0.3486460745334625, - "logits/rejected_friction": -0.34543871879577637, - "logps/chosen": -3.0361125469207764, - "logps/chosen_friction": -1.142987608909607, - "logps/rejected": -4.93657922744751, - "logps/rejected_friction": -4.278153419494629, - "loss": 0.0017, - "policy_friction_nll_loss": 1.1288371086120605, - "policy_nll_loss": 3.005591869354248, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24483713507652283, - "rewards/chosen_fricton": -0.06624660640954971, - "rewards/margins": 0.18687407672405243, - "rewards/margins_friction": 0.30987483263015747, - "rewards/rejected": -0.43171125650405884, - "rewards/rejected_friction": -0.3761214315891266, + "epoch": 0.38, + "learning_rate": 3.272542485937369e-06, + "logits/chosen": -0.33226504921913147, + "logits/rejected": -0.3331693708896637, + "logps/chosen": -434.11248779296875, + "logps/rejected": -441.95428466796875, + "loss": 0.4827, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.988227128982544, + "rewards/margins": 1.1914219856262207, + "rewards/rejected": -3.1796488761901855, "step": 1260 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.67, - "learning_rate": 4.477715780406849e-06, - "logits/chosen": -0.36691197752952576, - "logits/chosen_friction": -0.37591394782066345, - "logits/rejected": -0.34994572401046753, - "logits/rejected_friction": -0.34793633222579956, - "logps/chosen": -3.1078648567199707, - "logps/chosen_friction": -1.1128114461898804, - "logps/rejected": -5.056698799133301, - "logps/rejected_friction": -4.287209510803223, - "loss": 0.002, - "policy_friction_nll_loss": 1.095513105392456, - "policy_nll_loss": 3.06257963180542, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25404173135757446, - "rewards/chosen_fricton": -0.06566942483186722, - "rewards/margins": 0.19070442020893097, - "rewards/margins_friction": 0.3125818371772766, - "rewards/rejected": -0.4447461664676666, - "rewards/rejected_friction": -0.37825125455856323, - "step": 1265 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.68, - "learning_rate": 4.473698704167195e-06, - "logits/chosen": -0.364570677280426, - "logits/chosen_friction": -0.3725453019142151, - "logits/rejected": -0.3512701094150543, - "logits/rejected_friction": -0.3428551256656647, - "logps/chosen": -3.1946780681610107, - "logps/chosen_friction": -1.1773955821990967, - "logps/rejected": -4.99545955657959, - "logps/rejected_friction": -4.355684757232666, - "loss": 0.0018, - "policy_friction_nll_loss": 1.1585566997528076, - "policy_nll_loss": 3.153529167175293, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2585587203502655, - "rewards/chosen_fricton": -0.06882261484861374, - "rewards/margins": 0.17680247128009796, - "rewards/margins_friction": 0.3141136169433594, - "rewards/rejected": -0.43536120653152466, - "rewards/rejected_friction": -0.3829362392425537, + "epoch": 0.39, + "learning_rate": 3.2467404852487846e-06, + "logits/chosen": -0.33789581060409546, + "logits/rejected": -0.33837661147117615, + "logps/chosen": -445.60009765625, + "logps/rejected": -453.21380615234375, + "loss": 0.4935, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.8867080211639404, + "rewards/margins": 1.2431962490081787, + "rewards/rejected": -3.1299045085906982, "step": 1270 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.68, - "learning_rate": 4.469668055212963e-06, - "logits/chosen": -0.35098356008529663, - "logits/chosen_friction": -0.35578304529190063, - "logits/rejected": -0.3377101421356201, - "logits/rejected_friction": -0.32342419028282166, - "logps/chosen": -3.3315742015838623, - "logps/chosen_friction": -1.2204821109771729, - "logps/rejected": -5.043564796447754, - "logps/rejected_friction": -4.509286403656006, - "loss": 0.0015, - "policy_friction_nll_loss": 1.196682333946228, - "policy_nll_loss": 3.275146484375, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2681397795677185, - "rewards/chosen_fricton": -0.06945343315601349, - "rewards/margins": 0.16805806756019592, - "rewards/margins_friction": 0.3249497711658478, - "rewards/rejected": -0.4361979067325592, - "rewards/rejected_friction": -0.3944031894207001, - "step": 1275 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.68, - "learning_rate": 4.465623861262085e-06, - "logits/chosen": -0.3715916872024536, - "logits/chosen_friction": -0.38279253244400024, - "logits/rejected": -0.3552556335926056, - "logits/rejected_friction": -0.34970182180404663, - "logps/chosen": -3.035825729370117, - "logps/chosen_friction": -1.0943177938461304, - "logps/rejected": -4.879923343658447, - "logps/rejected_friction": -4.30183744430542, - "loss": 0.002, - "policy_friction_nll_loss": 1.0787479877471924, - "policy_nll_loss": 2.9979348182678223, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24561366438865662, - "rewards/chosen_fricton": -0.06303177028894424, - "rewards/margins": 0.18114998936653137, - "rewards/margins_friction": 0.31687888503074646, - "rewards/rejected": -0.4267636835575104, - "rewards/rejected_friction": -0.3799106478691101, + "epoch": 0.39, + "learning_rate": 3.2208508512533777e-06, + "logits/chosen": -0.3227623403072357, + "logits/rejected": -0.3246156573295593, + "logps/chosen": -447.2259826660156, + "logps/rejected": -456.6937561035156, + "loss": 0.4514, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.05145001411438, + "rewards/margins": 1.0999400615692139, + "rewards/rejected": -3.151390552520752, "step": 1280 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.69, - "learning_rate": 4.46156615012564e-06, - "logits/chosen": -0.3611815869808197, - "logits/chosen_friction": -0.3671398460865021, - "logits/rejected": -0.3531096875667572, - "logits/rejected_friction": -0.33540037274360657, - "logps/chosen": -3.085827589035034, - "logps/chosen_friction": -1.1310393810272217, - "logps/rejected": -4.7422590255737305, - "logps/rejected_friction": -4.45145845413208, - "loss": 0.002, - "policy_friction_nll_loss": 1.1064541339874268, - "policy_nll_loss": 3.037652015686035, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2468957155942917, - "rewards/chosen_fricton": -0.06265312433242798, - "rewards/margins": 0.16252033412456512, - "rewards/margins_friction": 0.3285321891307831, - "rewards/rejected": -0.4094161093235016, - "rewards/rejected_friction": -0.39118534326553345, - "step": 1285 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.69, - "learning_rate": 4.457494949707662e-06, - "logits/chosen": -0.3724982738494873, - "logits/chosen_friction": -0.3819736838340759, - "logits/rejected": -0.3592456877231598, - "logits/rejected_friction": -0.34899359941482544, - "logps/chosen": -3.072214126586914, - "logps/chosen_friction": -1.0886245965957642, - "logps/rejected": -4.898593902587891, - "logps/rejected_friction": -4.441581726074219, - "loss": 0.0032, - "policy_friction_nll_loss": 1.0784858465194702, - "policy_nll_loss": 3.0438311100006104, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24966582655906677, - "rewards/chosen_fricton": -0.06291397660970688, - "rewards/margins": 0.17838387191295624, - "rewards/margins_friction": 0.3304113745689392, - "rewards/rejected": -0.4280497133731842, - "rewards/rejected_friction": -0.3933253586292267, + "epoch": 0.39, + "learning_rate": 3.1948766222144863e-06, + "logits/chosen": -0.32600507140159607, + "logits/rejected": -0.3266277313232422, + "logps/chosen": -434.4227600097656, + "logps/rejected": -442.1160583496094, + "loss": 0.5228, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.233891010284424, + "rewards/margins": 0.8642382621765137, + "rewards/rejected": -3.0981292724609375, "step": 1290 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.69, - "learning_rate": 4.453410288004947e-06, - "logits/chosen": -0.36455944180488586, - "logits/chosen_friction": -0.3757091760635376, - "logits/rejected": -0.3503013849258423, - "logits/rejected_friction": -0.34639275074005127, - "logps/chosen": -3.166306495666504, - "logps/chosen_friction": -1.1332242488861084, - "logps/rejected": -5.025478363037109, - "logps/rejected_friction": -4.3935933113098145, - "loss": 0.0023, - "policy_friction_nll_loss": 1.1165050268173218, - "policy_nll_loss": 3.11250376701355, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2592574954032898, - "rewards/chosen_fricton": -0.0670386478304863, - "rewards/margins": 0.18320947885513306, - "rewards/margins_friction": 0.3225708603858948, - "rewards/rejected": -0.44246697425842285, - "rewards/rejected_friction": -0.38960951566696167, - "step": 1295 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.69, - "learning_rate": 4.449312193106862e-06, - "logits/chosen": -0.35814595222473145, - "logits/chosen_friction": -0.3663831353187561, - "logits/rejected": -0.34386223554611206, - "logits/rejected_friction": -0.3373435139656067, - "logps/chosen": -3.1169819831848145, - "logps/chosen_friction": -1.1216720342636108, - "logps/rejected": -4.903852462768555, - "logps/rejected_friction": -4.284663200378418, - "loss": 0.0021, - "policy_friction_nll_loss": 1.0993732213974, - "policy_nll_loss": 3.078651189804077, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2529505789279938, - "rewards/chosen_fricton": -0.06542818248271942, - "rewards/margins": 0.17571806907653809, - "rewards/margins_friction": 0.3127683699131012, - "rewards/rejected": -0.42866867780685425, - "rewards/rejected_friction": -0.3781965374946594, + "epoch": 0.39, + "learning_rate": 3.168820846323053e-06, + "logits/chosen": -0.3299737870693207, + "logits/rejected": -0.3313831090927124, + "logps/chosen": -434.67803955078125, + "logps/rejected": -446.71417236328125, + "loss": 0.4392, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.0030479431152344, + "rewards/margins": 1.1440895795822144, + "rewards/rejected": -3.147137403488159, "step": 1300 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.7, - "learning_rate": 4.445200693195152e-06, - "logits/chosen": -0.36816635727882385, - "logits/chosen_friction": -0.3791962265968323, - "logits/rejected": -0.3511037230491638, - "logits/rejected_friction": -0.3469633460044861, - "logps/chosen": -3.2224280834198, - "logps/chosen_friction": -1.157041311264038, - "logps/rejected": -4.972950458526611, - "logps/rejected_friction": -4.240658283233643, - "loss": 0.002, - "policy_friction_nll_loss": 1.1384501457214355, - "policy_nll_loss": 3.187920093536377, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.26312753558158875, - "rewards/chosen_fricton": -0.06839694827795029, - "rewards/margins": 0.17121943831443787, - "rewards/margins_friction": 0.3038845956325531, - "rewards/rejected": -0.4343469738960266, - "rewards/rejected_friction": -0.372281551361084, - "step": 1305 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.7, - "learning_rate": 4.441075816543745e-06, - "logits/chosen": -0.36793622374534607, - "logits/chosen_friction": -0.3755772113800049, - "logits/rejected": -0.350874662399292, - "logits/rejected_friction": -0.3450879156589508, - "logps/chosen": -3.1841049194335938, - "logps/chosen_friction": -1.121605396270752, - "logps/rejected": -5.0496602058410645, - "logps/rejected_friction": -4.377647399902344, - "loss": 0.0017, - "policy_friction_nll_loss": 1.1101489067077637, - "policy_nll_loss": 3.1640350818634033, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25913718342781067, - "rewards/chosen_fricton": -0.06502178311347961, - "rewards/margins": 0.18352201581001282, - "rewards/margins_friction": 0.3217903971672058, - "rewards/rejected": -0.4426591992378235, - "rewards/rejected_friction": -0.3868121802806854, + "epoch": 0.4, + "learning_rate": 3.142686581339902e-06, + "logits/chosen": -0.32545098662376404, + "logits/rejected": -0.32752394676208496, + "logps/chosen": -435.9081115722656, + "logps/rejected": -445.0193786621094, + "loss": 0.5154, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.0516555309295654, + "rewards/margins": 1.0273317098617554, + "rewards/rejected": -3.0789875984191895, "step": 1310 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.7, - "learning_rate": 4.436937591518557e-06, - "logits/chosen": -0.3632461130619049, - "logits/chosen_friction": -0.3712728023529053, - "logits/rejected": -0.34709596633911133, - "logits/rejected_friction": -0.3382473587989807, - "logps/chosen": -3.33642840385437, - "logps/chosen_friction": -1.1716411113739014, - "logps/rejected": -5.133236408233643, - "logps/rejected_friction": -4.453862190246582, - "loss": 0.0018, - "policy_friction_nll_loss": 1.1569833755493164, - "policy_nll_loss": 3.299872636795044, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.27196204662323, - "rewards/chosen_fricton": -0.06767892837524414, - "rewards/margins": 0.1766572892665863, - "rewards/margins_friction": 0.3246728479862213, - "rewards/rejected": -0.44861936569213867, - "rewards/rejected_friction": -0.39235180616378784, - "step": 1315 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.7, - "learning_rate": 4.4327860465773e-06, - "logits/chosen": -0.36476653814315796, - "logits/chosen_friction": -0.37147411704063416, - "logits/rejected": -0.3507649898529053, - "logits/rejected_friction": -0.34015071392059326, - "logps/chosen": -3.219933032989502, - "logps/chosen_friction": -1.2114542722702026, - "logps/rejected": -4.943177223205566, - "logps/rejected_friction": -4.33928918838501, - "loss": 0.0034, - "policy_friction_nll_loss": 1.1816682815551758, - "policy_nll_loss": 3.197089910507202, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2622333765029907, - "rewards/chosen_fricton": -0.07306136190891266, - "rewards/margins": 0.16905155777931213, - "rewards/margins_friction": 0.3086971640586853, - "rewards/rejected": -0.43128490447998047, - "rewards/rejected_friction": -0.38175854086875916, + "epoch": 0.4, + "learning_rate": 3.1164768942369058e-06, + "logits/chosen": -0.33717575669288635, + "logits/rejected": -0.33777323365211487, + "logps/chosen": -439.6886291503906, + "logps/rejected": -450.8135681152344, + "loss": 0.4056, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.7847926616668701, + "rewards/margins": 1.3125979900360107, + "rewards/rejected": -3.097390651702881, "step": 1320 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.71, - "learning_rate": 4.428621210269282e-06, - "logits/chosen": -0.36643534898757935, - "logits/chosen_friction": -0.3767964839935303, - "logits/rejected": -0.3529232144355774, - "logits/rejected_friction": -0.34533199667930603, - "logps/chosen": -3.1094601154327393, - "logps/chosen_friction": -1.1142525672912598, - "logps/rejected": -4.955231666564941, - "logps/rejected_friction": -4.348025798797607, - "loss": 0.0015, - "policy_friction_nll_loss": 1.096393346786499, - "policy_nll_loss": 3.071516275405884, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25275373458862305, - "rewards/chosen_fricton": -0.06526979058980942, - "rewards/margins": 0.18120479583740234, - "rewards/margins_friction": 0.3190153241157532, - "rewards/rejected": -0.4339585304260254, - "rewards/rejected_friction": -0.3842851221561432, - "step": 1325 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.71, - "learning_rate": 4.424443111235215e-06, - "logits/chosen": -0.35435643792152405, - "logits/chosen_friction": -0.3631955683231354, - "logits/rejected": -0.34091195464134216, - "logits/rejected_friction": -0.3326430916786194, - "logps/chosen": -3.281325578689575, - "logps/chosen_friction": -1.1870144605636597, - "logps/rejected": -5.108790397644043, - "logps/rejected_friction": -4.449599266052246, - "loss": 0.0019, - "policy_friction_nll_loss": 1.1686726808547974, - "policy_nll_loss": 3.237253189086914, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2689092755317688, - "rewards/chosen_fricton": -0.07092095911502838, - "rewards/margins": 0.17806978523731232, - "rewards/margins_friction": 0.32076773047447205, - "rewards/rejected": -0.4469790458679199, - "rewards/rejected_friction": -0.39168867468833923, + "epoch": 0.4, + "learning_rate": 3.0901948608370503e-06, + "logits/chosen": -0.3371260166168213, + "logits/rejected": -0.33846548199653625, + "logps/chosen": -436.64190673828125, + "logps/rejected": -450.7660217285156, + "loss": 0.4474, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.9579681158065796, + "rewards/margins": 1.1995084285736084, + "rewards/rejected": -3.1574764251708984, "step": 1330 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.71, - "learning_rate": 4.420251778207016e-06, - "logits/chosen": -0.350531667470932, - "logits/chosen_friction": -0.35614532232284546, - "logits/rejected": -0.3405655026435852, - "logits/rejected_friction": -0.3237277865409851, - "logps/chosen": -3.2451603412628174, - "logps/chosen_friction": -1.169776201248169, - "logps/rejected": -4.883904933929443, - "logps/rejected_friction": -4.485592842102051, - "loss": 0.0017, - "policy_friction_nll_loss": 1.1465356349945068, - "policy_nll_loss": 3.1949665546417236, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2615228593349457, - "rewards/chosen_fricton": -0.06584356725215912, - "rewards/margins": 0.1601373702287674, - "rewards/margins_friction": 0.32715722918510437, - "rewards/rejected": -0.42166024446487427, - "rewards/rejected_friction": -0.3930008113384247, - "step": 1335 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.71, - "learning_rate": 4.4160472400076055e-06, - "logits/chosen": -0.3721831440925598, - "logits/chosen_friction": -0.38154035806655884, - "logits/rejected": -0.36115163564682007, - "logits/rejected_friction": -0.35127657651901245, - "logps/chosen": -3.0787341594696045, - "logps/chosen_friction": -1.0903127193450928, - "logps/rejected": -4.863831996917725, - "logps/rejected_friction": -4.343472480773926, - "loss": 0.0032, - "policy_friction_nll_loss": 1.0762752294540405, - "policy_nll_loss": 3.059924364089966, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25019222497940063, - "rewards/chosen_fricton": -0.06336705386638641, - "rewards/margins": 0.17557606101036072, - "rewards/margins_friction": 0.32184070348739624, - "rewards/rejected": -0.42576828598976135, - "rewards/rejected_friction": -0.3852077126502991, + "epoch": 0.41, + "learning_rate": 3.063843565453486e-06, + "logits/chosen": -0.3233332931995392, + "logits/rejected": -0.3235628008842468, + "logps/chosen": -441.6625061035156, + "logps/rejected": -450.6896057128906, + "loss": 0.4454, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.0109899044036865, + "rewards/margins": 1.2036950588226318, + "rewards/rejected": -3.2146849632263184, "step": 1340 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.72, - "learning_rate": 4.411829525550719e-06, - "logits/chosen": -0.3744856119155884, - "logits/chosen_friction": -0.38401395082473755, - "logits/rejected": -0.36180272698402405, - "logits/rejected_friction": -0.35188260674476624, - "logps/chosen": -3.156811237335205, - "logps/chosen_friction": -1.1071674823760986, - "logps/rejected": -4.9591779708862305, - "logps/rejected_friction": -4.43972110748291, - "loss": 0.0015, - "policy_friction_nll_loss": 1.0907853841781616, - "policy_nll_loss": 3.1289618015289307, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25602245330810547, - "rewards/chosen_fricton": -0.0631440207362175, - "rewards/margins": 0.17740926146507263, - "rewards/margins_friction": 0.32932987809181213, - "rewards/rejected": -0.4334316849708557, - "rewards/rejected_friction": -0.39247390627861023, - "step": 1345 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.72, - "learning_rate": 4.4075986638406995e-06, - "logits/chosen": -0.36165380477905273, - "logits/chosen_friction": -0.3665259778499603, - "logits/rejected": -0.35453301668167114, - "logits/rejected_friction": -0.3337849974632263, - "logps/chosen": -3.2106292247772217, - "logps/chosen_friction": -1.1507233381271362, - "logps/rejected": -4.774526596069336, - "logps/rejected_friction": -4.509537220001221, - "loss": 0.0017, - "policy_friction_nll_loss": 1.1301480531692505, - "policy_nll_loss": 3.176205635070801, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25798115134239197, - "rewards/chosen_fricton": -0.06423866748809814, - "rewards/margins": 0.15362916886806488, - "rewards/margins_friction": 0.33243662118911743, - "rewards/rejected": -0.41161030530929565, - "rewards/rejected_friction": -0.3966752886772156, + "epoch": 0.41, + "learning_rate": 3.0374261005275606e-06, + "logits/chosen": -0.32744866609573364, + "logits/rejected": -0.32873040437698364, + "logps/chosen": -438.97955322265625, + "logps/rejected": -452.37255859375, + "loss": 0.4277, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.850262999534607, + "rewards/margins": 1.45210862159729, + "rewards/rejected": -3.3023715019226074, "step": 1350 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.72, - "learning_rate": 4.4033546839723e-06, - "logits/chosen": -0.3709540367126465, - "logits/chosen_friction": -0.3784843385219574, - "logits/rejected": -0.3599422574043274, - "logits/rejected_friction": -0.3471742272377014, - "logps/chosen": -3.142547130584717, - "logps/chosen_friction": -1.1001412868499756, - "logps/rejected": -4.915012359619141, - "logps/rejected_friction": -4.302671909332275, - "loss": 0.0037, - "policy_friction_nll_loss": 1.0885566473007202, - "policy_nll_loss": 3.1174423694610596, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2536913752555847, - "rewards/chosen_fricton": -0.06154347583651543, - "rewards/margins": 0.17332394421100616, - "rewards/margins_friction": 0.315773069858551, - "rewards/rejected": -0.4270153045654297, - "rewards/rejected_friction": -0.37731659412384033, - "step": 1355 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.73, - "learning_rate": 4.3990976151304866e-06, - "logits/chosen": -0.3608176112174988, - "logits/chosen_friction": -0.37256819009780884, - "logits/rejected": -0.340756893157959, - "logits/rejected_friction": -0.3432336151599884, - "logps/chosen": -3.014756679534912, - "logps/chosen_friction": -1.111917495727539, - "logps/rejected": -5.067824363708496, - "logps/rejected_friction": -4.221665382385254, - "loss": 0.0013, - "policy_friction_nll_loss": 1.084557056427002, - "policy_nll_loss": 2.9444808959960938, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24495497345924377, - "rewards/chosen_fricton": -0.06590845435857773, - "rewards/margins": 0.2021690309047699, - "rewards/margins_friction": 0.307096004486084, - "rewards/rejected": -0.4471239447593689, - "rewards/rejected_friction": -0.3730044960975647, + "epoch": 0.41, + "learning_rate": 3.0109455662659126e-06, + "logits/chosen": -0.33421364426612854, + "logits/rejected": -0.33508172631263733, + "logps/chosen": -438.8184509277344, + "logps/rejected": -447.74267578125, + "loss": 0.469, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.3082900047302246, + "rewards/margins": 1.0502725839614868, + "rewards/rejected": -3.358562469482422, "step": 1360 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.73, - "learning_rate": 4.394827486590233e-06, - "logits/chosen": -0.36093562841415405, - "logits/chosen_friction": -0.3714118003845215, - "logits/rejected": -0.34139329195022583, - "logits/rejected_friction": -0.34035488963127136, - "logps/chosen": -3.1117405891418457, - "logps/chosen_friction": -1.14009690284729, - "logps/rejected": -5.068652153015137, - "logps/rejected_friction": -4.309114456176758, - "loss": 0.0019, - "policy_friction_nll_loss": 1.1225776672363281, - "policy_nll_loss": 3.0728278160095215, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2537683844566345, - "rewards/chosen_fricton": -0.06691966950893402, - "rewards/margins": 0.1917285919189453, - "rewards/margins_friction": 0.3122798502445221, - "rewards/rejected": -0.44549688696861267, - "rewards/rejected_friction": -0.3791995346546173, - "step": 1365 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.73, - "learning_rate": 4.390544327716324e-06, - "logits/chosen": -0.3549707233905792, - "logits/chosen_friction": -0.36412426829338074, - "logits/rejected": -0.33754962682724, - "logits/rejected_friction": -0.3344693183898926, - "logps/chosen": -3.081876277923584, - "logps/chosen_friction": -1.201738715171814, - "logps/rejected": -4.872628688812256, - "logps/rejected_friction": -4.183483123779297, - "loss": 0.0044, - "policy_friction_nll_loss": 1.1849806308746338, - "policy_nll_loss": 3.0372157096862793, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2491125762462616, - "rewards/chosen_fricton": -0.07279146462678909, - "rewards/margins": 0.17653200030326843, - "rewards/margins_friction": 0.29467543959617615, - "rewards/rejected": -0.42564457654953003, - "rewards/rejected_friction": -0.36746692657470703, + "epoch": 0.42, + "learning_rate": 2.984405070276646e-06, + "logits/chosen": -0.3377315402030945, + "logits/rejected": -0.3380245268344879, + "logps/chosen": -440.62689208984375, + "logps/rejected": -448.7757873535156, + "loss": 0.4497, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.239384174346924, + "rewards/margins": 1.1150033473968506, + "rewards/rejected": -3.3543879985809326, "step": 1370 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.73, - "learning_rate": 4.38624816796315e-06, - "logits/chosen": -0.37586167454719543, - "logits/chosen_friction": -0.38199615478515625, - "logits/rejected": -0.35865408182144165, - "logits/rejected_friction": -0.352683961391449, - "logps/chosen": -3.077873945236206, - "logps/chosen_friction": -1.1419681310653687, - "logps/rejected": -4.9083757400512695, - "logps/rejected_friction": -4.261175632476807, - "loss": 0.0019, - "policy_friction_nll_loss": 1.1302069425582886, - "policy_nll_loss": 3.04624605178833, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2495582401752472, - "rewards/chosen_fricton": -0.06776805967092514, - "rewards/margins": 0.17919427156448364, - "rewards/margins_friction": 0.307343989610672, - "rewards/rejected": -0.42875248193740845, - "rewards/rejected_friction": -0.37511202692985535, - "step": 1375 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.74, - "learning_rate": 4.3819390368745065e-06, - "logits/chosen": -0.35907039046287537, - "logits/chosen_friction": -0.3672904372215271, - "logits/rejected": -0.34476301074028015, - "logits/rejected_friction": -0.33797556161880493, - "logps/chosen": -3.220582962036133, - "logps/chosen_friction": -1.1647272109985352, - "logps/rejected": -5.072291374206543, - "logps/rejected_friction": -4.385309219360352, - "loss": 0.002, - "policy_friction_nll_loss": 1.1465812921524048, - "policy_nll_loss": 3.1846282482147217, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2628052234649658, - "rewards/chosen_fricton": -0.06897134333848953, - "rewards/margins": 0.1820625364780426, - "rewards/margins_friction": 0.31817469000816345, - "rewards/rejected": -0.4448677897453308, - "rewards/rejected_friction": -0.38714608550071716, + "epoch": 0.42, + "learning_rate": 2.9578077272046407e-06, + "logits/chosen": -0.3324066698551178, + "logits/rejected": -0.3327699303627014, + "logps/chosen": -445.11651611328125, + "logps/rejected": -452.57135009765625, + "loss": 0.4627, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.3683180809020996, + "rewards/margins": 1.2064650058746338, + "rewards/rejected": -3.5747828483581543, "step": 1380 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.74, - "learning_rate": 4.377616964083389e-06, - "logits/chosen": -0.35345807671546936, - "logits/chosen_friction": -0.3579837679862976, - "logits/rejected": -0.3455270826816559, - "logits/rejected_friction": -0.3284001648426056, - "logps/chosen": -3.143364429473877, - "logps/chosen_friction": -1.1553386449813843, - "logps/rejected": -4.908282279968262, - "logps/rejected_friction": -4.522717475891113, - "loss": 0.0016, - "policy_friction_nll_loss": 1.1387498378753662, - "policy_nll_loss": 3.104769706726074, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25187811255455017, - "rewards/chosen_fricton": -0.06478022038936615, - "rewards/margins": 0.1725928783416748, - "rewards/margins_friction": 0.3320035934448242, - "rewards/rejected": -0.424470990896225, - "rewards/rejected_friction": -0.39678385853767395, - "step": 1385 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.74, - "learning_rate": 4.373281979311792e-06, - "logits/chosen": -0.38017526268959045, - "logits/chosen_friction": -0.3879132866859436, - "logits/rejected": -0.3657204508781433, - "logits/rejected_friction": -0.35879576206207275, - "logps/chosen": -2.9502274990081787, - "logps/chosen_friction": -1.0483644008636475, - "logps/rejected": -4.819767951965332, - "logps/rejected_friction": -4.193985462188721, - "loss": 0.0023, - "policy_friction_nll_loss": 1.0367364883422852, - "policy_nll_loss": 2.924254894256592, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24013550579547882, - "rewards/chosen_fricton": -0.06163973733782768, - "rewards/margins": 0.18348988890647888, - "rewards/margins_friction": 0.3102000057697296, - "rewards/rejected": -0.4236253798007965, - "rewards/rejected_friction": -0.3718397319316864, + "epoch": 0.42, + "learning_rate": 2.931156658366032e-06, + "logits/chosen": -0.33288371562957764, + "logits/rejected": -0.33407607674598694, + "logps/chosen": -438.28363037109375, + "logps/rejected": -449.0726623535156, + "loss": 0.4609, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.4992074966430664, + "rewards/margins": 1.1167179346084595, + "rewards/rejected": -3.6159253120422363, "step": 1390 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.74, - "learning_rate": 4.368934112370501e-06, - "logits/chosen": -0.35775619745254517, - "logits/chosen_friction": -0.3638460040092468, - "logits/rejected": -0.34855058789253235, - "logits/rejected_friction": -0.3324471712112427, - "logps/chosen": -3.197788715362549, - "logps/chosen_friction": -1.1340601444244385, - "logps/rejected": -4.87926721572876, - "logps/rejected_friction": -4.3541717529296875, - "loss": 0.0032, - "policy_friction_nll_loss": 1.114818811416626, - "policy_nll_loss": 3.151995897293091, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2592063248157501, - "rewards/chosen_fricton": -0.06447899341583252, - "rewards/margins": 0.16547521948814392, - "rewards/margins_friction": 0.318691223859787, - "rewards/rejected": -0.42468157410621643, - "rewards/rejected_friction": -0.3831701874732971, - "step": 1395 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.75, - "learning_rate": 4.364573393158893e-06, - "logits/chosen": -0.3591650724411011, - "logits/chosen_friction": -0.3668918013572693, - "logits/rejected": -0.3487336039543152, - "logits/rejected_friction": -0.33742278814315796, - "logps/chosen": -3.195805072784424, - "logps/chosen_friction": -1.156799077987671, - "logps/rejected": -5.064579010009766, - "logps/rejected_friction": -4.445986270904541, - "loss": 0.0018, - "policy_friction_nll_loss": 1.1442265510559082, - "policy_nll_loss": 3.1637511253356934, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2594603896141052, - "rewards/chosen_fricton": -0.06745926290750504, - "rewards/margins": 0.1835389882326126, - "rewards/margins_friction": 0.32497352361679077, - "rewards/rejected": -0.44299936294555664, - "rewards/rejected_friction": -0.3924327790737152, + "epoch": 0.43, + "learning_rate": 2.9044549913819125e-06, + "logits/chosen": -0.33773019909858704, + "logits/rejected": -0.3393145203590393, + "logps/chosen": -441.80511474609375, + "logps/rejected": -450.90997314453125, + "loss": 0.421, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.251107692718506, + "rewards/margins": 1.231013536453247, + "rewards/rejected": -3.482121706008911, "step": 1400 }, { - "epoch": 0.75, - "eval_directrewards_student/accuracies": 1.0, - "eval_logits/chosen": -0.39929088950157166, - "eval_logits/chosen_friction": -0.40743380784988403, - "eval_logits/rejected": -0.38426685333251953, - "eval_logits/rejected_friction": -0.37278762459754944, - "eval_logps/chosen": -3.187434673309326, - "eval_logps/chosen_friction": -1.173846960067749, - "eval_logps/rejected": -5.094576358795166, - "eval_logps/rejected_friction": -4.419308662414551, - "eval_loss": 0.0018378919921815395, - "eval_policy_friction_nll_loss": 1.1738468408584595, - "eval_policy_nll_loss": 3.187434673309326, - "eval_rewards/accuracies": 1.0, - "eval_rewards/accuracies_friction": 1.0, - "eval_rewards/chosen": -0.25880128145217896, - "eval_rewards/chosen_fricton": -0.06911597400903702, - "eval_rewards/margins": 0.18723635375499725, - "eval_rewards/margins_friction": 0.3203990161418915, - "eval_rewards/rejected": -0.446037620306015, - "eval_rewards/rejected_friction": -0.3895149827003479, - "eval_runtime": 558.2303, - "eval_samples_per_second": 0.896, - "eval_steps_per_second": 0.448, + "epoch": 0.43, + "eval_logits/chosen": -0.401915043592453, + "eval_logits/rejected": -0.4025632441043854, + "eval_logps/chosen": -436.2534484863281, + "eval_logps/rejected": -445.6542053222656, + "eval_loss": 0.4834233820438385, + "eval_rewards/accuracies": 0.7580000162124634, + "eval_rewards/chosen": -2.336226463317871, + "eval_rewards/margins": 1.131289005279541, + "eval_rewards/rejected": -3.4675159454345703, + "eval_runtime": 373.3095, + "eval_samples_per_second": 1.339, + "eval_steps_per_second": 1.339, "step": 1400 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.75, - "learning_rate": 4.360199851664722e-06, - "logits/chosen": -0.3485613465309143, - "logits/chosen_friction": -0.35454320907592773, - "logits/rejected": -0.33464667201042175, - "logits/rejected_friction": -0.3243491053581238, - "logps/chosen": -3.1829800605773926, - "logps/chosen_friction": -1.1887924671173096, - "logps/rejected": -5.093494415283203, - "logps/rejected_friction": -4.540672779083252, - "loss": 0.0018, - "policy_friction_nll_loss": 1.1680580377578735, - "policy_nll_loss": 3.1419482231140137, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25713056325912476, - "rewards/chosen_fricton": -0.06897633522748947, - "rewards/margins": 0.1882425844669342, - "rewards/margins_friction": 0.3317468762397766, - "rewards/rejected": -0.44537314772605896, - "rewards/rejected_friction": -0.4007232189178467, - "step": 1405 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.75, - "learning_rate": 4.355813517963924e-06, - "logits/chosen": -0.3621693253517151, - "logits/chosen_friction": -0.3705170750617981, - "logits/rejected": -0.3453361392021179, - "logits/rejected_friction": -0.3412872850894928, - "logps/chosen": -3.254481554031372, - "logps/chosen_friction": -1.1542408466339111, - "logps/rejected": -5.058920860290527, - "logps/rejected_friction": -4.268472671508789, - "loss": 0.0015, - "policy_friction_nll_loss": 1.1319295167922974, - "policy_nll_loss": 3.2096056938171387, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2652929127216339, - "rewards/chosen_fricton": -0.06755994260311127, - "rewards/margins": 0.17670294642448425, - "rewards/margins_friction": 0.307078093290329, - "rewards/rejected": -0.44199585914611816, - "rewards/rejected_friction": -0.37463802099227905, + "epoch": 0.43, + "learning_rate": 2.877705859811292e-06, + "logits/chosen": -0.32963141798973083, + "logits/rejected": -0.32958561182022095, + "logps/chosen": -441.468017578125, + "logps/rejected": -452.41632080078125, + "loss": 0.4867, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.3174846172332764, + "rewards/margins": 1.2222645282745361, + "rewards/rejected": -3.5397496223449707, "step": 1410 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.75, - "learning_rate": 4.351414422220399e-06, - "logits/chosen": -0.3581032454967499, - "logits/chosen_friction": -0.36381450295448303, - "logits/rejected": -0.34490180015563965, - "logits/rejected_friction": -0.33551257848739624, - "logps/chosen": -3.1082212924957275, - "logps/chosen_friction": -1.1386653184890747, - "logps/rejected": -4.870804786682129, - "logps/rejected_friction": -4.311789512634277, - "loss": 0.0015, - "policy_friction_nll_loss": 1.1146214008331299, - "policy_nll_loss": 3.051532745361328, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2503339946269989, - "rewards/chosen_fricton": -0.06555898487567902, - "rewards/margins": 0.17303213477134705, - "rewards/margins_friction": 0.3131570816040039, - "rewards/rejected": -0.42336615920066833, - "rewards/rejected_friction": -0.3787160813808441, - "step": 1415 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.76, - "learning_rate": 4.347002594685814e-06, - "logits/chosen": -0.3593606948852539, - "logits/chosen_friction": -0.36729469895362854, - "logits/rejected": -0.343192994594574, - "logits/rejected_friction": -0.33569663763046265, - "logps/chosen": -3.2004287242889404, - "logps/chosen_friction": -1.2207019329071045, - "logps/rejected": -4.975351333618164, - "logps/rejected_friction": -4.343016147613525, - "loss": 0.0023, - "policy_friction_nll_loss": 1.2023441791534424, - "policy_nll_loss": 3.1545751094818115, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2607436180114746, - "rewards/chosen_fricton": -0.0737910270690918, - "rewards/margins": 0.17407076060771942, - "rewards/margins_friction": 0.30794575810432434, - "rewards/rejected": -0.43481436371803284, - "rewards/rejected_friction": -0.38173675537109375, + "epoch": 0.43, + "learning_rate": 2.850912402783361e-06, + "logits/chosen": -0.33581605553627014, + "logits/rejected": -0.3373740315437317, + "logps/chosen": -443.38507080078125, + "logps/rejected": -455.705810546875, + "loss": 0.4821, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.5340211391448975, + "rewards/margins": 1.059066653251648, + "rewards/rejected": -3.593087673187256, "step": 1420 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.76, - "learning_rate": 4.342578065699385e-06, - "logits/chosen": -0.3597262501716614, - "logits/chosen_friction": -0.3658071458339691, - "logits/rejected": -0.34268176555633545, - "logits/rejected_friction": -0.3378746211528778, - "logps/chosen": -3.2248623371124268, - "logps/chosen_friction": -1.182926058769226, - "logps/rejected": -5.081020355224609, - "logps/rejected_friction": -4.274964332580566, - "loss": 0.0031, - "policy_friction_nll_loss": 1.1636264324188232, - "policy_nll_loss": 3.1907451152801514, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.26137781143188477, - "rewards/chosen_fricton": -0.06913228332996368, - "rewards/margins": 0.1831226497888565, - "rewards/margins_friction": 0.30636531114578247, - "rewards/rejected": -0.4445004463195801, - "rewards/rejected_friction": -0.3754975497722626, - "step": 1425 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.76, - "learning_rate": 4.338140865687678e-06, - "logits/chosen": -0.36186403036117554, - "logits/chosen_friction": -0.36944887042045593, - "logits/rejected": -0.345559686422348, - "logits/rejected_friction": -0.3382258713245392, - "logps/chosen": -3.19853138923645, - "logps/chosen_friction": -1.1588850021362305, - "logps/rejected": -5.034745216369629, - "logps/rejected_friction": -4.335484504699707, - "loss": 0.0015, - "policy_friction_nll_loss": 1.1404191255569458, - "policy_nll_loss": 3.153050422668457, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2599095404148102, - "rewards/chosen_fricton": -0.06769635528326035, - "rewards/margins": 0.1805179864168167, - "rewards/margins_friction": 0.31375187635421753, - "rewards/rejected": -0.4404275417327881, - "rewards/rejected_friction": -0.3814482092857361, + "epoch": 0.43, + "learning_rate": 2.8240777646290973e-06, + "logits/chosen": -0.3432762026786804, + "logits/rejected": -0.3442252576351166, + "logps/chosen": -455.3641662597656, + "logps/rejected": -465.02032470703125, + "loss": 0.4363, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.3768112659454346, + "rewards/margins": 1.3034956455230713, + "rewards/rejected": -3.680307388305664, "step": 1430 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.77, - "learning_rate": 4.333691025164389e-06, - "logits/chosen": -0.35263463854789734, - "logits/chosen_friction": -0.3592016100883484, - "logits/rejected": -0.3380410075187683, - "logits/rejected_friction": -0.3278982937335968, - "logps/chosen": -3.18688702583313, - "logps/chosen_friction": -1.211496114730835, - "logps/rejected": -5.005252838134766, - "logps/rejected_friction": -4.442674160003662, - "loss": 0.0014, - "policy_friction_nll_loss": 1.1894422769546509, - "policy_nll_loss": 3.1504273414611816, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25749292969703674, - "rewards/chosen_fricton": -0.07138528674840927, - "rewards/margins": 0.1783134490251541, - "rewards/margins_friction": 0.31908130645751953, - "rewards/rejected": -0.43580636382102966, - "rewards/rejected_friction": -0.3904666006565094, - "step": 1435 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.77, - "learning_rate": 4.329228574730148e-06, - "logits/chosen": -0.3656327724456787, - "logits/chosen_friction": -0.3759131133556366, - "logits/rejected": -0.35112816095352173, - "logits/rejected_friction": -0.34378159046173096, - "logps/chosen": -3.044854164123535, - "logps/chosen_friction": -1.1196744441986084, - "logps/rejected": -4.901145935058594, - "logps/rejected_friction": -4.367335319519043, - "loss": 0.0014, - "policy_friction_nll_loss": 1.1074873208999634, - "policy_nll_loss": 3.0161983966827393, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.245590478181839, - "rewards/chosen_fricton": -0.0644986480474472, - "rewards/margins": 0.18211157619953156, - "rewards/margins_friction": 0.3208082914352417, - "rewards/rejected": -0.4277020990848541, - "rewards/rejected_friction": -0.3853068947792053, + "epoch": 0.44, + "learning_rate": 2.7972050945122666e-06, + "logits/chosen": -0.3318456709384918, + "logits/rejected": -0.33274808526039124, + "logps/chosen": -442.74029541015625, + "logps/rejected": -453.32745361328125, + "loss": 0.4564, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.556647300720215, + "rewards/margins": 1.2236577272415161, + "rewards/rejected": -3.7803051471710205, "step": 1440 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.77, - "learning_rate": 4.3247535450722934e-06, - "logits/chosen": -0.3674312233924866, - "logits/chosen_friction": -0.37573930621147156, - "logits/rejected": -0.3533594310283661, - "logits/rejected_friction": -0.3438417315483093, - "logps/chosen": -3.1099328994750977, - "logps/chosen_friction": -1.1033694744110107, - "logps/rejected": -4.915585517883301, - "logps/rejected_friction": -4.35053014755249, - "loss": 0.0014, - "policy_friction_nll_loss": 1.0872316360473633, - "policy_nll_loss": 3.072099447250366, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25227269530296326, - "rewards/chosen_fricton": -0.06282766163349152, - "rewards/margins": 0.1776151806116104, - "rewards/margins_friction": 0.3209766447544098, - "rewards/rejected": -0.4298878610134125, - "rewards/rejected_friction": -0.3838043212890625, - "step": 1445 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.77, - "learning_rate": 4.320265966964672e-06, - "logits/chosen": -0.36270594596862793, - "logits/chosen_friction": -0.3707398474216461, - "logits/rejected": -0.3497660756111145, - "logits/rejected_friction": -0.3411349654197693, - "logps/chosen": -3.1043341159820557, - "logps/chosen_friction": -1.1300346851348877, - "logps/rejected": -4.9076080322265625, - "logps/rejected_friction": -4.284505367279053, - "loss": 0.0022, - "policy_friction_nll_loss": 1.1096723079681396, - "policy_nll_loss": 3.066110849380493, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24985957145690918, - "rewards/chosen_fricton": -0.06468230485916138, - "rewards/margins": 0.17787837982177734, - "rewards/margins_friction": 0.3122323155403137, - "rewards/rejected": -0.4277379512786865, - "rewards/rejected_friction": -0.3769146502017975, + "epoch": 0.44, + "learning_rate": 2.7702975460598545e-06, + "logits/chosen": -0.33731141686439514, + "logits/rejected": -0.33812469244003296, + "logps/chosen": -445.42596435546875, + "logps/rejected": -457.1685485839844, + "loss": 0.4487, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.542117118835449, + "rewards/margins": 1.1616876125335693, + "rewards/rejected": -3.7038047313690186, "step": 1450 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.78, - "learning_rate": 4.315765871267424e-06, - "logits/chosen": -0.3628886342048645, - "logits/chosen_friction": -0.3719286024570465, - "logits/rejected": -0.3455713391304016, - "logits/rejected_friction": -0.3424150347709656, - "logps/chosen": -3.078223466873169, - "logps/chosen_friction": -1.1233017444610596, - "logps/rejected": -4.976130962371826, - "logps/rejected_friction": -4.281785011291504, - "loss": 0.0017, - "policy_friction_nll_loss": 1.1043702363967896, - "policy_nll_loss": 3.033684253692627, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24899116158485413, - "rewards/chosen_fricton": -0.06550715863704681, - "rewards/margins": 0.1864529401063919, - "rewards/margins_friction": 0.31167471408843994, - "rewards/rejected": -0.4354441165924072, - "rewards/rejected_friction": -0.37718185782432556, - "step": 1455 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.78, - "learning_rate": 4.311253288926769e-06, - "logits/chosen": -0.37915748357772827, - "logits/chosen_friction": -0.3878866732120514, - "logits/rejected": -0.3617352545261383, - "logits/rejected_friction": -0.3590402603149414, - "logps/chosen": -3.0469412803649902, - "logps/chosen_friction": -1.1166986227035522, - "logps/rejected": -4.995335102081299, - "logps/rejected_friction": -4.1651153564453125, - "loss": 0.0021, - "policy_friction_nll_loss": 1.1092225313186646, - "policy_nll_loss": 3.0287766456604004, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24670736491680145, - "rewards/chosen_fricton": -0.06576039642095566, - "rewards/margins": 0.19206808507442474, - "rewards/margins_friction": 0.30149951577186584, - "rewards/rejected": -0.4387754499912262, - "rewards/rejected_friction": -0.3672599196434021, + "epoch": 0.44, + "learning_rate": 2.7433582769919752e-06, + "logits/chosen": -0.3384588360786438, + "logits/rejected": -0.33992061018943787, + "logps/chosen": -448.994873046875, + "logps/rejected": -456.4767150878906, + "loss": 0.5548, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.7594006061553955, + "rewards/margins": 0.9322620630264282, + "rewards/rejected": -3.691662549972534, "step": 1460 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.78, - "learning_rate": 4.306728250974795e-06, - "logits/chosen": -0.36530381441116333, - "logits/chosen_friction": -0.3703305125236511, - "logits/rejected": -0.35450097918510437, - "logits/rejected_friction": -0.33709174394607544, - "logps/chosen": -3.2839035987854004, - "logps/chosen_friction": -1.1710522174835205, - "logps/rejected": -5.016531944274902, - "logps/rejected_friction": -4.478046894073486, - "loss": 0.001, - "policy_friction_nll_loss": 1.1572840213775635, - "policy_nll_loss": 3.252472400665283, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2625790536403656, - "rewards/chosen_fricton": -0.06455279886722565, - "rewards/margins": 0.1698436439037323, - "rewards/margins_friction": 0.32676413655281067, - "rewards/rejected": -0.4324227273464203, - "rewards/rejected_friction": -0.3913169503211975, - "step": 1465 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.78, - "learning_rate": 4.302190788529243e-06, - "logits/chosen": -0.364012211561203, - "logits/chosen_friction": -0.3725064694881439, - "logits/rejected": -0.34795695543289185, - "logits/rejected_friction": -0.3420436680316925, - "logps/chosen": -3.0897955894470215, - "logps/chosen_friction": -1.1286933422088623, - "logps/rejected": -5.055689334869385, - "logps/rejected_friction": -4.364092826843262, - "loss": 0.0016, - "policy_friction_nll_loss": 1.1158615350723267, - "policy_nll_loss": 3.048125743865967, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24888984858989716, - "rewards/chosen_fricton": -0.06515058875083923, - "rewards/margins": 0.1931978017091751, - "rewards/margins_friction": 0.31934741139411926, - "rewards/rejected": -0.4420875906944275, - "rewards/rejected_friction": -0.3844979703426361, + "epoch": 0.45, + "learning_rate": 2.716390448751294e-06, + "logits/chosen": -0.34274882078170776, + "logits/rejected": -0.34329262375831604, + "logps/chosen": -450.77972412109375, + "logps/rejected": -461.76239013671875, + "loss": 0.4976, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.5453426837921143, + "rewards/margins": 1.080673336982727, + "rewards/rejected": -3.6260154247283936, "step": 1470 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.79, - "learning_rate": 4.2976409327933e-06, - "logits/chosen": -0.36330580711364746, - "logits/chosen_friction": -0.3690491318702698, - "logits/rejected": -0.3514525890350342, - "logits/rejected_friction": -0.34167033433914185, - "logps/chosen": -3.080087184906006, - "logps/chosen_friction": -1.1186349391937256, - "logps/rejected": -4.916838645935059, - "logps/rejected_friction": -4.3207550048828125, - "loss": 0.0032, - "policy_friction_nll_loss": 1.104379653930664, - "policy_nll_loss": 3.0368075370788574, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24838760495185852, - "rewards/chosen_fricton": -0.06440293043851852, - "rewards/margins": 0.17958791553974152, - "rewards/margins_friction": 0.3153039813041687, - "rewards/rejected": -0.42797547578811646, - "rewards/rejected_friction": -0.3797069787979126, - "step": 1475 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.79, - "learning_rate": 4.2930787150553735e-06, - "logits/chosen": -0.35655075311660767, - "logits/chosen_friction": -0.3597807288169861, - "logits/rejected": -0.3455184996128082, - "logits/rejected_friction": -0.3334309458732605, - "logps/chosen": -3.0540237426757812, - "logps/chosen_friction": -1.100396752357483, - "logps/rejected": -4.808653831481934, - "logps/rejected_friction": -4.21375036239624, - "loss": 0.0018, - "policy_friction_nll_loss": 1.0790681838989258, - "policy_nll_loss": 3.0021445751190186, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24540984630584717, - "rewards/chosen_fricton": -0.06210817024111748, - "rewards/margins": 0.17292940616607666, - "rewards/margins_friction": 0.3078908324241638, - "rewards/rejected": -0.4183393120765686, - "rewards/rejected_friction": -0.369998961687088, + "epoch": 0.45, + "learning_rate": 2.6893972261320265e-06, + "logits/chosen": -0.3363896608352661, + "logits/rejected": -0.33778852224349976, + "logps/chosen": -442.7216796875, + "logps/rejected": -453.9684143066406, + "loss": 0.4628, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.584522008895874, + "rewards/margins": 1.2380025386810303, + "rewards/rejected": -3.8225245475769043, "step": 1480 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.79, - "learning_rate": 4.288504166688883e-06, - "logits/chosen": -0.36560946702957153, - "logits/chosen_friction": -0.37307119369506836, - "logits/rejected": -0.35527458786964417, - "logits/rejected_friction": -0.3443114161491394, - "logps/chosen": -3.105792999267578, - "logps/chosen_friction": -1.1035202741622925, - "logps/rejected": -4.912137985229492, - "logps/rejected_friction": -4.302995204925537, - "loss": 0.0013, - "policy_friction_nll_loss": 1.0902080535888672, - "policy_nll_loss": 3.078507900238037, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.251787006855011, - "rewards/chosen_fricton": -0.0632215142250061, - "rewards/margins": 0.17674249410629272, - "rewards/margins_friction": 0.31533318758010864, - "rewards/rejected": -0.4285295009613037, - "rewards/rejected_friction": -0.37855464220046997, - "step": 1485 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.79, - "learning_rate": 4.283917319152045e-06, - "logits/chosen": -0.37478020787239075, - "logits/chosen_friction": -0.3817475438117981, - "logits/rejected": -0.36203843355178833, - "logits/rejected_friction": -0.3542458117008209, - "logps/chosen": -2.875753879547119, - "logps/chosen_friction": -1.069907307624817, - "logps/rejected": -4.8290863037109375, - "logps/rejected_friction": -4.252469062805176, - "loss": 0.0014, - "policy_friction_nll_loss": 1.0586097240447998, - "policy_nll_loss": 2.843186616897583, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.22970156371593475, - "rewards/chosen_fricton": -0.06144840642809868, - "rewards/margins": 0.19221924245357513, - "rewards/margins_friction": 0.3146403729915619, - "rewards/rejected": -0.4219208359718323, - "rewards/rejected_friction": -0.3760887384414673, + "epoch": 0.45, + "learning_rate": 2.6623817769085268e-06, + "logits/chosen": -0.3299495577812195, + "logits/rejected": -0.3310778737068176, + "logps/chosen": -438.0104064941406, + "logps/rejected": -450.68572998046875, + "loss": 0.4308, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4091532230377197, + "rewards/margins": 1.258310079574585, + "rewards/rejected": -3.6674628257751465, "step": 1490 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.8, - "learning_rate": 4.2793182039876556e-06, - "logits/chosen": -0.3714742064476013, - "logits/chosen_friction": -0.37870514392852783, - "logits/rejected": -0.3606312870979309, - "logits/rejected_friction": -0.34830039739608765, - "logps/chosen": -3.0215187072753906, - "logps/chosen_friction": -1.106467366218567, - "logps/rejected": -4.832515716552734, - "logps/rejected_friction": -4.316788673400879, - "loss": 0.0015, - "policy_friction_nll_loss": 1.0939865112304688, - "policy_nll_loss": 2.9981091022491455, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24290159344673157, - "rewards/chosen_fricton": -0.06319155544042587, - "rewards/margins": 0.1773923933506012, - "rewards/margins_friction": 0.31638628244400024, - "rewards/rejected": -0.42029398679733276, - "rewards/rejected_friction": -0.3795778155326843, - "step": 1495 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.8, - "learning_rate": 4.2747068528228686e-06, - "logits/chosen": -0.3732590973377228, - "logits/chosen_friction": -0.38064199686050415, - "logits/rejected": -0.3628058433532715, - "logits/rejected_friction": -0.3486878275871277, - "logps/chosen": -3.1322808265686035, - "logps/chosen_friction": -1.1161530017852783, - "logps/rejected": -4.878325462341309, - "logps/rejected_friction": -4.372431755065918, - "loss": 0.0016, - "policy_friction_nll_loss": 1.1084024906158447, - "policy_nll_loss": 3.1124346256256104, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25275275111198425, - "rewards/chosen_fricton": -0.06371154636144638, - "rewards/margins": 0.17040245234966278, - "rewards/margins_friction": 0.32037240266799927, - "rewards/rejected": -0.4231552183628082, - "rewards/rejected_friction": -0.38408392667770386, + "epoch": 0.46, + "learning_rate": 2.6353472714635443e-06, + "logits/chosen": -0.3383990526199341, + "logits/rejected": -0.33991554379463196, + "logps/chosen": -453.71038818359375, + "logps/rejected": -466.170166015625, + "loss": 0.4603, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.7626547813415527, + "rewards/margins": 1.127687692642212, + "rewards/rejected": -3.8903422355651855, "step": 1500 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.8, - "learning_rate": 4.270083297368985e-06, - "logits/chosen": -0.3622027039527893, - "logits/chosen_friction": -0.3677745461463928, - "logits/rejected": -0.3549235463142395, - "logits/rejected_friction": -0.33912381529808044, - "logps/chosen": -3.115478515625, - "logps/chosen_friction": -1.1479277610778809, - "logps/rejected": -4.894097328186035, - "logps/rejected_friction": -4.4049177169799805, - "loss": 0.0011, - "policy_friction_nll_loss": 1.1260219812393188, - "policy_nll_loss": 3.0718820095062256, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24923697113990784, - "rewards/chosen_fricton": -0.06425130367279053, - "rewards/margins": 0.17459264397621155, - "rewards/margins_friction": 0.32197681069374084, - "rewards/rejected": -0.4238296151161194, - "rewards/rejected_friction": -0.38622814416885376, - "step": 1505 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.81, - "learning_rate": 4.265447569421234e-06, - "logits/chosen": -0.3657798171043396, - "logits/chosen_friction": -0.37425127625465393, - "logits/rejected": -0.3530051112174988, - "logits/rejected_friction": -0.34389057755470276, - "logps/chosen": -3.0498392581939697, - "logps/chosen_friction": -1.0971344709396362, - "logps/rejected": -4.9249162673950195, - "logps/rejected_friction": -4.308377742767334, - "loss": 0.0019, - "policy_friction_nll_loss": 1.082276701927185, - "policy_nll_loss": 3.0132832527160645, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24485746026039124, - "rewards/chosen_fricton": -0.06175720691680908, - "rewards/margins": 0.1840486079454422, - "rewards/margins_friction": 0.31727927923202515, - "rewards/rejected": -0.42890605330467224, - "rewards/rejected_friction": -0.37903645634651184, + "epoch": 0.46, + "learning_rate": 2.6082968824161558e-06, + "logits/chosen": -0.3404627740383148, + "logits/rejected": -0.3412095606327057, + "logps/chosen": -446.44281005859375, + "logps/rejected": -454.9615783691406, + "loss": 0.4887, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.626864194869995, + "rewards/margins": 1.2109944820404053, + "rewards/rejected": -3.8378589153289795, "step": 1510 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.81, - "learning_rate": 4.260799700858548e-06, - "logits/chosen": -0.35508379340171814, - "logits/chosen_friction": -0.36031338572502136, - "logits/rejected": -0.34692201018333435, - "logits/rejected_friction": -0.331345796585083, - "logps/chosen": -3.210543155670166, - "logps/chosen_friction": -1.2061470746994019, - "logps/rejected": -4.950142860412598, - "logps/rejected_friction": -4.4566802978515625, - "loss": 0.004, - "policy_friction_nll_loss": 1.2016443014144897, - "policy_nll_loss": 3.1840009689331055, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2582273781299591, - "rewards/chosen_fricton": -0.06983978301286697, - "rewards/margins": 0.1702277958393097, - "rewards/margins_friction": 0.32048723101615906, - "rewards/rejected": -0.4284552037715912, - "rewards/rejected_friction": -0.3903270363807678, - "step": 1515 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.81, - "learning_rate": 4.256139723643352e-06, - "logits/chosen": -0.3754886984825134, - "logits/chosen_friction": -0.3829841911792755, - "logits/rejected": -0.36241987347602844, - "logits/rejected_friction": -0.35239046812057495, - "logps/chosen": -3.1258397102355957, - "logps/chosen_friction": -1.1472647190093994, - "logps/rejected": -4.886548042297363, - "logps/rejected_friction": -4.239466190338135, - "loss": 0.0024, - "policy_friction_nll_loss": 1.1320788860321045, - "policy_nll_loss": 3.100567102432251, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25496649742126465, - "rewards/chosen_fricton": -0.06831592321395874, - "rewards/margins": 0.17323559522628784, - "rewards/margins_friction": 0.305982768535614, - "rewards/rejected": -0.4282020926475525, - "rewards/rejected_friction": -0.37429869174957275, + "epoch": 0.46, + "learning_rate": 2.5812337842494517e-06, + "logits/chosen": -0.3334888815879822, + "logits/rejected": -0.334361732006073, + "logps/chosen": -437.97979736328125, + "logps/rejected": -449.66864013671875, + "loss": 0.4395, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.6739344596862793, + "rewards/margins": 1.2408138513565063, + "rewards/rejected": -3.914747953414917, "step": 1520 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.81, - "learning_rate": 4.251467669821337e-06, - "logits/chosen": -0.3644895553588867, - "logits/chosen_friction": -0.370709091424942, - "logits/rejected": -0.3541489541530609, - "logits/rejected_friction": -0.3427184522151947, - "logps/chosen": -3.143709659576416, - "logps/chosen_friction": -1.114744782447815, - "logps/rejected": -4.975032806396484, - "logps/rejected_friction": -4.336799621582031, - "loss": 0.0016, - "policy_friction_nll_loss": 1.0984718799591064, - "policy_nll_loss": 3.102635622024536, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25508183240890503, - "rewards/chosen_fricton": -0.0635814443230629, - "rewards/margins": 0.17938530445098877, - "rewards/margins_friction": 0.31796425580978394, - "rewards/rejected": -0.4344671666622162, - "rewards/rejected_friction": -0.38154569268226624, - "step": 1525 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.82, - "learning_rate": 4.2467835715212424e-06, - "logits/chosen": -0.3667684495449066, - "logits/chosen_friction": -0.37213000655174255, - "logits/rejected": -0.3542974591255188, - "logits/rejected_friction": -0.34376630187034607, - "logps/chosen": -3.085909128189087, - "logps/chosen_friction": -1.1257262229919434, - "logps/rejected": -4.957578659057617, - "logps/rejected_friction": -4.284592628479004, - "loss": 0.0011, - "policy_friction_nll_loss": 1.112746000289917, - "policy_nll_loss": 3.060182571411133, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2484138309955597, - "rewards/chosen_fricton": -0.06475050747394562, - "rewards/margins": 0.18428125977516174, - "rewards/margins_friction": 0.31226006150245667, - "rewards/rejected": -0.43269509077072144, - "rewards/rejected_friction": -0.3770105838775635, + "epoch": 0.46, + "learning_rate": 2.554161152937994e-06, + "logits/chosen": -0.34664058685302734, + "logits/rejected": -0.34752577543258667, + "logps/chosen": -452.38983154296875, + "logps/rejected": -458.98046875, + "loss": 0.46, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.5840182304382324, + "rewards/margins": 1.3410053253173828, + "rewards/rejected": -3.9250235557556152, "step": 1530 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.82, - "learning_rate": 4.242087460954638e-06, - "logits/chosen": -0.3618529438972473, - "logits/chosen_friction": -0.3688064515590668, - "logits/rejected": -0.35105833411216736, - "logits/rejected_friction": -0.3395049571990967, - "logps/chosen": -3.056795597076416, - "logps/chosen_friction": -1.0990663766860962, - "logps/rejected": -4.900546550750732, - "logps/rejected_friction": -4.364954948425293, - "loss": 0.0014, - "policy_friction_nll_loss": 1.0819358825683594, - "policy_nll_loss": 3.023958206176758, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24597711861133575, - "rewards/chosen_fricton": -0.06260945647954941, - "rewards/margins": 0.18062427639961243, - "rewards/margins_friction": 0.3217465579509735, - "rewards/rejected": -0.4266014099121094, - "rewards/rejected_friction": -0.3843560218811035, - "step": 1535 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.82, - "learning_rate": 4.237379370415695e-06, - "logits/chosen": -0.3601837754249573, - "logits/chosen_friction": -0.3670174181461334, - "logits/rejected": -0.34589412808418274, - "logits/rejected_friction": -0.3361402750015259, - "logps/chosen": -3.080084800720215, - "logps/chosen_friction": -1.1380376815795898, - "logps/rejected": -4.943946838378906, - "logps/rejected_friction": -4.304479598999023, - "loss": 0.0017, - "policy_friction_nll_loss": 1.1107988357543945, - "policy_nll_loss": 3.021699905395508, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24843356013298035, - "rewards/chosen_fricton": -0.06527705490589142, - "rewards/margins": 0.18361878395080566, - "rewards/margins_friction": 0.31326860189437866, - "rewards/rejected": -0.4320523738861084, - "rewards/rejected_friction": -0.3785456418991089, + "epoch": 0.47, + "learning_rate": 2.5270821655750997e-06, + "logits/chosen": -0.3402210772037506, + "logits/rejected": -0.3408128619194031, + "logps/chosen": -452.06658935546875, + "logps/rejected": -465.1114807128906, + "loss": 0.383, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.483328342437744, + "rewards/margins": 1.4499397277832031, + "rewards/rejected": -3.9332680702209473, "step": 1540 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.82, - "learning_rate": 4.232659332280973e-06, - "logits/chosen": -0.36016029119491577, - "logits/chosen_friction": -0.36641138792037964, - "logits/rejected": -0.34756386280059814, - "logits/rejected_friction": -0.33727389574050903, - "logps/chosen": -3.135504961013794, - "logps/chosen_friction": -1.1671425104141235, - "logps/rejected": -4.987720012664795, - "logps/rejected_friction": -4.375451564788818, - "loss": 0.0014, - "policy_friction_nll_loss": 1.1482102870941162, - "policy_nll_loss": 3.09739351272583, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25158801674842834, - "rewards/chosen_fricton": -0.06639845669269562, - "rewards/margins": 0.18216446042060852, - "rewards/margins_friction": 0.31692737340927124, - "rewards/rejected": -0.43375247716903687, - "rewards/rejected_friction": -0.38332584500312805, - "step": 1545 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.83, - "learning_rate": 4.227927379009189e-06, - "logits/chosen": -0.3622887134552002, - "logits/chosen_friction": -0.36787208914756775, - "logits/rejected": -0.35246723890304565, - "logits/rejected_friction": -0.3384186625480652, - "logps/chosen": -3.09799861907959, - "logps/chosen_friction": -1.1518982648849487, - "logps/rejected": -4.822833061218262, - "logps/rejected_friction": -4.340365409851074, - "loss": 0.0014, - "policy_friction_nll_loss": 1.1370456218719482, - "policy_nll_loss": 3.071188449859619, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24909372627735138, - "rewards/chosen_fricton": -0.06595823913812637, - "rewards/margins": 0.16904596984386444, - "rewards/margins_friction": 0.3145836293697357, - "rewards/rejected": -0.4181397557258606, - "rewards/rejected_friction": -0.3805418312549591, + "epoch": 0.47, + "learning_rate": 2.5e-06, + "logits/chosen": -0.33848652243614197, + "logits/rejected": -0.3391149640083313, + "logps/chosen": -447.24407958984375, + "logps/rejected": -456.50933837890625, + "loss": 0.4384, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.932450294494629, + "rewards/margins": 1.1602147817611694, + "rewards/rejected": -4.09266471862793, "step": 1550 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.83, - "learning_rate": 4.2231835431409986e-06, - "logits/chosen": -0.3620893359184265, - "logits/chosen_friction": -0.3656770586967468, - "logits/rejected": -0.3513573110103607, - "logits/rejected_friction": -0.3372897505760193, - "logps/chosen": -3.2089295387268066, - "logps/chosen_friction": -1.1441643238067627, - "logps/rejected": -4.999948978424072, - "logps/rejected_friction": -4.330763339996338, - "loss": 0.0009, - "policy_friction_nll_loss": 1.128757357597351, - "policy_nll_loss": 3.178452253341675, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25950533151626587, - "rewards/chosen_fricton": -0.06534359604120255, - "rewards/margins": 0.175291046500206, - "rewards/margins_friction": 0.31388750672340393, - "rewards/rejected": -0.43479642271995544, - "rewards/rejected_friction": -0.3792310357093811, - "step": 1555 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.83, - "learning_rate": 4.218427857298774e-06, - "logits/chosen": -0.3638419508934021, - "logits/chosen_friction": -0.3706079125404358, - "logits/rejected": -0.3479618430137634, - "logits/rejected_friction": -0.3419577479362488, - "logps/chosen": -3.0068345069885254, - "logps/chosen_friction": -1.0947661399841309, - "logps/rejected": -4.945204734802246, - "logps/rejected_friction": -4.155577182769775, - "loss": 0.0016, - "policy_friction_nll_loss": 1.0811688899993896, - "policy_nll_loss": 2.976067543029785, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24393919110298157, - "rewards/chosen_fricton": -0.06351888179779053, - "rewards/margins": 0.19004960358142853, - "rewards/margins_friction": 0.3017675280570984, - "rewards/rejected": -0.4339888095855713, - "rewards/rejected_friction": -0.3652864098548889, + "epoch": 0.47, + "learning_rate": 2.4729178344249007e-06, + "logits/chosen": -0.34805721044540405, + "logits/rejected": -0.34990328550338745, + "logps/chosen": -457.77520751953125, + "logps/rejected": -467.7879943847656, + "loss": 0.4306, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.852219343185425, + "rewards/margins": 1.3265211582183838, + "rewards/rejected": -4.178740501403809, "step": 1560 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.83, - "learning_rate": 4.213660354186374e-06, - "logits/chosen": -0.36921435594558716, - "logits/chosen_friction": -0.3777810335159302, - "logits/rejected": -0.3556375205516815, - "logits/rejected_friction": -0.3468298316001892, - "logps/chosen": -3.1933162212371826, - "logps/chosen_friction": -1.1153842210769653, - "logps/rejected": -5.015137672424316, - "logps/rejected_friction": -4.325812339782715, - "loss": 0.0021, - "policy_friction_nll_loss": 1.1058704853057861, - "policy_nll_loss": 3.1674373149871826, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25987380743026733, - "rewards/chosen_fricton": -0.0632733404636383, - "rewards/margins": 0.17867568135261536, - "rewards/margins_friction": 0.3170929551124573, - "rewards/rejected": -0.4385494589805603, - "rewards/rejected_friction": -0.3803662657737732, - "step": 1565 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.84, - "learning_rate": 4.208881066588924e-06, - "logits/chosen": -0.3714197278022766, - "logits/chosen_friction": -0.37880635261535645, - "logits/rejected": -0.3652063012123108, - "logits/rejected_friction": -0.3498445153236389, - "logps/chosen": -3.055506944656372, - "logps/chosen_friction": -1.0721571445465088, - "logps/rejected": -4.791529178619385, - "logps/rejected_friction": -4.328304290771484, - "loss": 0.0014, - "policy_friction_nll_loss": 1.0626124143600464, - "policy_nll_loss": 3.0276618003845215, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24526405334472656, - "rewards/chosen_fricton": -0.05848166346549988, - "rewards/margins": 0.17071110010147095, - "rewards/margins_friction": 0.32187405228614807, - "rewards/rejected": -0.4159751534461975, - "rewards/rejected_friction": -0.38035568594932556, + "epoch": 0.48, + "learning_rate": 2.4458388470620066e-06, + "logits/chosen": -0.34960517287254333, + "logits/rejected": -0.35107699036598206, + "logps/chosen": -457.14569091796875, + "logps/rejected": -467.239990234375, + "loss": 0.4444, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.756500720977783, + "rewards/margins": 1.3199396133422852, + "rewards/rejected": -4.07643985748291, "step": 1570 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.84, - "learning_rate": 4.204090027372591e-06, - "logits/chosen": -0.3556351363658905, - "logits/chosen_friction": -0.3603302240371704, - "logits/rejected": -0.35190314054489136, - "logits/rejected_friction": -0.3329032063484192, - "logps/chosen": -3.120666265487671, - "logps/chosen_friction": -1.1043577194213867, - "logps/rejected": -4.815967082977295, - "logps/rejected_friction": -4.411304950714111, - "loss": 0.0014, - "policy_friction_nll_loss": 1.0875377655029297, - "policy_nll_loss": 3.086719036102295, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25134095549583435, - "rewards/chosen_fricton": -0.06116175651550293, - "rewards/margins": 0.16637368500232697, - "rewards/margins_friction": 0.32635220885276794, - "rewards/rejected": -0.41771459579467773, - "rewards/rejected_friction": -0.38751402497291565, - "step": 1575 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.84, - "learning_rate": 4.199287269484349e-06, - "logits/chosen": -0.3706800043582916, - "logits/chosen_friction": -0.3759602904319763, - "logits/rejected": -0.36248379945755005, - "logits/rejected_friction": -0.3429178297519684, - "logps/chosen": -3.252932071685791, - "logps/chosen_friction": -1.1465551853179932, - "logps/rejected": -5.010706901550293, - "logps/rejected_friction": -4.4736008644104, - "loss": 0.0014, - "policy_friction_nll_loss": 1.1399790048599243, - "policy_nll_loss": 3.2362704277038574, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2601303458213806, - "rewards/chosen_fricton": -0.06247730180621147, - "rewards/margins": 0.17272183299064636, - "rewards/margins_friction": 0.32879838347435, - "rewards/rejected": -0.432852178812027, - "rewards/rejected_friction": -0.39127570390701294, + "epoch": 0.48, + "learning_rate": 2.418766215750549e-06, + "logits/chosen": -0.3384454548358917, + "logits/rejected": -0.3394390642642975, + "logps/chosen": -455.9664001464844, + "logps/rejected": -467.4884338378906, + "loss": 0.4289, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.917130708694458, + "rewards/margins": 1.3165969848632812, + "rewards/rejected": -4.23372745513916, "step": 1580 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.85, - "learning_rate": 4.1944728259517646e-06, - "logits/chosen": -0.37153154611587524, - "logits/chosen_friction": -0.3785211443901062, - "logits/rejected": -0.3631858229637146, - "logits/rejected_friction": -0.35147184133529663, - "logps/chosen": -2.9833621978759766, - "logps/chosen_friction": -1.064235806465149, - "logps/rejected": -4.794966697692871, - "logps/rejected_friction": -4.222023963928223, - "loss": 0.0025, - "policy_friction_nll_loss": 1.0490074157714844, - "policy_nll_loss": 2.9544272422790527, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23924832046031952, - "rewards/chosen_fricton": -0.059706516563892365, - "rewards/margins": 0.1780560314655304, - "rewards/margins_friction": 0.3113541901111603, - "rewards/rejected": -0.4173043668270111, - "rewards/rejected_friction": -0.37106066942214966, - "step": 1585 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.85, - "learning_rate": 4.189646729882763e-06, - "logits/chosen": -0.35305237770080566, - "logits/chosen_friction": -0.3580729365348816, - "logits/rejected": -0.3427947163581848, - "logits/rejected_friction": -0.32901808619499207, - "logps/chosen": -3.128920555114746, - "logps/chosen_friction": -1.1719961166381836, - "logps/rejected": -4.956562042236328, - "logps/rejected_friction": -4.341875076293945, - "loss": 0.0014, - "policy_friction_nll_loss": 1.1452757120132446, - "policy_nll_loss": 3.0812466144561768, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2506740391254425, - "rewards/chosen_fricton": -0.06630445271730423, - "rewards/margins": 0.1810857653617859, - "rewards/margins_friction": 0.31484442949295044, - "rewards/rejected": -0.4317598342895508, - "rewards/rejected_friction": -0.3811488747596741, + "epoch": 0.48, + "learning_rate": 2.3917031175838447e-06, + "logits/chosen": -0.33930128812789917, + "logits/rejected": -0.33957165479660034, + "logps/chosen": -452.30548095703125, + "logps/rejected": -467.23614501953125, + "loss": 0.4339, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.8669447898864746, + "rewards/margins": 1.3872116804122925, + "rewards/rejected": -4.254156589508057, "step": 1590 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.85, - "learning_rate": 4.184809014465401e-06, - "logits/chosen": -0.3614771068096161, - "logits/chosen_friction": -0.3695076107978821, - "logits/rejected": -0.34733736515045166, - "logits/rejected_friction": -0.33934903144836426, - "logps/chosen": -3.071833610534668, - "logps/chosen_friction": -1.079988718032837, - "logps/rejected": -4.98870849609375, - "logps/rejected_friction": -4.311559677124023, - "loss": 0.0015, - "policy_friction_nll_loss": 1.0651602745056152, - "policy_nll_loss": 3.031332492828369, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24791543185710907, - "rewards/chosen_fricton": -0.06037048250436783, - "rewards/margins": 0.18785831332206726, - "rewards/margins_friction": 0.3185027539730072, - "rewards/rejected": -0.4357737600803375, - "rewards/rejected_friction": -0.3788732886314392, - "step": 1595 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.85, - "learning_rate": 4.179959712967638e-06, - "logits/chosen": -0.36768609285354614, - "logits/chosen_friction": -0.37765973806381226, - "logits/rejected": -0.3499194383621216, - "logits/rejected_friction": -0.35039955377578735, - "logps/chosen": -2.8689045906066895, - "logps/chosen_friction": -1.06505286693573, - "logps/rejected": -4.8492937088012695, - "logps/rejected_friction": -4.072605609893799, - "loss": 0.0013, - "policy_friction_nll_loss": 1.0516551733016968, - "policy_nll_loss": 2.8428854942321777, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23137755692005157, - "rewards/chosen_fricton": -0.06236768513917923, - "rewards/margins": 0.19421732425689697, - "rewards/margins_friction": 0.29659199714660645, - "rewards/rejected": -0.42559486627578735, - "rewards/rejected_friction": -0.3589596450328827, + "epoch": 0.49, + "learning_rate": 2.3646527285364565e-06, + "logits/chosen": -0.33700358867645264, + "logits/rejected": -0.33825331926345825, + "logps/chosen": -451.98272705078125, + "logps/rejected": -461.351318359375, + "loss": 0.4821, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.0442519187927246, + "rewards/margins": 1.0982847213745117, + "rewards/rejected": -4.1425371170043945, "step": 1600 }, { - "epoch": 0.85, - "eval_directrewards_student/accuracies": 1.0, - "eval_logits/chosen": -0.39927881956100464, - "eval_logits/chosen_friction": -0.4055052399635315, - "eval_logits/rejected": -0.38735198974609375, - "eval_logits/rejected_friction": -0.37313178181648254, - "eval_logps/chosen": -3.0933306217193604, - "eval_logps/chosen_friction": -1.132167935371399, - "eval_logps/rejected": -4.925455093383789, - "eval_logps/rejected_friction": -4.256489276885986, - "eval_loss": 0.0013795326231047511, - "eval_policy_friction_nll_loss": 1.1321678161621094, - "eval_policy_nll_loss": 3.0933306217193604, - "eval_rewards/accuracies": 1.0, - "eval_rewards/accuracies_friction": 1.0, - "eval_rewards/chosen": -0.24939091503620148, - "eval_rewards/chosen_fricton": -0.06494806706905365, - "eval_rewards/margins": 0.17973467707633972, - "eval_rewards/margins_friction": 0.3082849681377411, - "eval_rewards/rejected": -0.42912557721138, - "eval_rewards/rejected_friction": -0.37323302030563354, - "eval_runtime": 348.996, - "eval_samples_per_second": 1.433, - "eval_steps_per_second": 0.716, + "epoch": 0.49, + "eval_logits/chosen": -0.41185611486434937, + "eval_logits/rejected": -0.41246527433395386, + "eval_logps/chosen": -441.00274658203125, + "eval_logps/rejected": -451.21136474609375, + "eval_loss": 0.48274433612823486, + "eval_rewards/accuracies": 0.7620000243186951, + "eval_rewards/chosen": -2.811156749725342, + "eval_rewards/margins": 1.2120723724365234, + "eval_rewards/rejected": -4.023228645324707, + "eval_runtime": 376.6555, + "eval_samples_per_second": 1.327, + "eval_steps_per_second": 1.327, "step": 1600 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.86, - "learning_rate": 4.17509885873711e-06, - "logits/chosen": -0.35218581557273865, - "logits/chosen_friction": -0.35873299837112427, - "logits/rejected": -0.34093424677848816, - "logits/rejected_friction": -0.33040502667427063, - "logps/chosen": -3.0674142837524414, - "logps/chosen_friction": -1.1169195175170898, - "logps/rejected": -4.907425403594971, - "logps/rejected_friction": -4.240057945251465, - "loss": 0.0013, - "policy_friction_nll_loss": 1.0966265201568604, - "policy_nll_loss": 3.024075984954834, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24692213535308838, - "rewards/chosen_fricton": -0.06314484775066376, - "rewards/margins": 0.1802913248538971, - "rewards/margins_friction": 0.3079231381416321, - "rewards/rejected": -0.4272134304046631, - "rewards/rejected_friction": -0.37106800079345703, - "step": 1605 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.86, - "learning_rate": 4.170226485200899e-06, - "logits/chosen": -0.35474973917007446, - "logits/chosen_friction": -0.36012881994247437, - "logits/rejected": -0.34271955490112305, - "logits/rejected_friction": -0.33110547065734863, - "logps/chosen": -3.1400327682495117, - "logps/chosen_friction": -1.0851269960403442, - "logps/rejected": -4.979005813598633, - "logps/rejected_friction": -4.186001777648926, - "loss": 0.0016, - "policy_friction_nll_loss": 1.0699245929718018, - "policy_nll_loss": 3.106609344482422, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25368160009384155, - "rewards/chosen_fricton": -0.06011684983968735, - "rewards/margins": 0.1808900535106659, - "rewards/margins_friction": 0.3063526749610901, - "rewards/rejected": -0.43457159399986267, - "rewards/rejected_friction": -0.36646953225135803, + "epoch": 0.49, + "learning_rate": 2.3376182230914728e-06, + "logits/chosen": -0.35231637954711914, + "logits/rejected": -0.3524485230445862, + "logps/chosen": -450.71600341796875, + "logps/rejected": -459.95623779296875, + "loss": 0.4562, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.791792869567871, + "rewards/margins": 1.3087048530578613, + "rewards/rejected": -4.100497245788574, "step": 1610 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.86, - "learning_rate": 4.1653426258653014e-06, - "logits/chosen": -0.36368945240974426, - "logits/chosen_friction": -0.3701314926147461, - "logits/rejected": -0.34655705094337463, - "logits/rejected_friction": -0.34324708580970764, - "logps/chosen": -2.96842622756958, - "logps/chosen_friction": -1.068320631980896, - "logps/rejected": -4.996771335601807, - "logps/rejected_friction": -4.151811122894287, - "loss": 0.001, - "policy_friction_nll_loss": 1.0563262701034546, - "policy_nll_loss": 2.929910659790039, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24052317440509796, - "rewards/chosen_fricton": -0.06197073310613632, - "rewards/margins": 0.19908098876476288, - "rewards/margins_friction": 0.30369648337364197, - "rewards/rejected": -0.4396041929721832, - "rewards/rejected_friction": -0.3656672239303589, - "step": 1615 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.86, - "learning_rate": 4.1604473143156e-06, - "logits/chosen": -0.37050729990005493, - "logits/chosen_friction": -0.3785044252872467, - "logits/rejected": -0.3558463156223297, - "logits/rejected_friction": -0.34971708059310913, - "logps/chosen": -3.0413622856140137, - "logps/chosen_friction": -1.084039568901062, - "logps/rejected": -4.908519268035889, - "logps/rejected_friction": -4.228528022766113, - "loss": 0.0021, - "policy_friction_nll_loss": 1.0759756565093994, - "policy_nll_loss": 3.022146224975586, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2455630749464035, - "rewards/chosen_fricton": -0.061205487698316574, - "rewards/margins": 0.18420274555683136, - "rewards/margins_friction": 0.31162744760513306, - "rewards/rejected": -0.4297657907009125, - "rewards/rejected_friction": -0.3728329539299011, + "epoch": 0.49, + "learning_rate": 2.3106027738679743e-06, + "logits/chosen": -0.3403882086277008, + "logits/rejected": -0.34152495861053467, + "logps/chosen": -453.09197998046875, + "logps/rejected": -461.7265625, + "loss": 0.5492, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.994368076324463, + "rewards/margins": 0.9577304124832153, + "rewards/rejected": -3.9520981311798096, "step": 1620 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.87, - "learning_rate": 4.155540584215833e-06, - "logits/chosen": -0.36326247453689575, - "logits/chosen_friction": -0.37090247869491577, - "logits/rejected": -0.34747058153152466, - "logits/rejected_friction": -0.34265196323394775, - "logps/chosen": -3.0395078659057617, - "logps/chosen_friction": -1.1244386434555054, - "logps/rejected": -4.962199687957764, - "logps/rejected_friction": -4.170924186706543, - "loss": 0.0016, - "policy_friction_nll_loss": 1.1189305782318115, - "policy_nll_loss": 3.015183210372925, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24543419480323792, - "rewards/chosen_fricton": -0.06504359841346741, - "rewards/margins": 0.18913596868515015, - "rewards/margins_friction": 0.30066654086112976, - "rewards/rejected": -0.4345701336860657, - "rewards/rejected_friction": -0.36571013927459717, - "step": 1625 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.87, - "learning_rate": 4.150622469308559e-06, - "logits/chosen": -0.35777729749679565, - "logits/chosen_friction": -0.36590319871902466, - "logits/rejected": -0.34113043546676636, - "logits/rejected_friction": -0.3351803719997406, - "logps/chosen": -3.246208667755127, - "logps/chosen_friction": -1.1613587141036987, - "logps/rejected": -5.172602653503418, - "logps/rejected_friction": -4.3197221755981445, - "loss": 0.0016, - "policy_friction_nll_loss": 1.1418975591659546, - "policy_nll_loss": 3.2052102088928223, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2629612684249878, - "rewards/chosen_fricton": -0.06634706258773804, - "rewards/margins": 0.18885968625545502, - "rewards/margins_friction": 0.3112875521183014, - "rewards/rejected": -0.4518209397792816, - "rewards/rejected_friction": -0.37763458490371704, + "epoch": 0.5, + "learning_rate": 2.2836095512487063e-06, + "logits/chosen": -0.34236225485801697, + "logits/rejected": -0.3437976539134979, + "logps/chosen": -448.03765869140625, + "logps/rejected": -458.0298767089844, + "loss": 0.4769, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.8009488582611084, + "rewards/margins": 1.1950900554656982, + "rewards/rejected": -3.9960389137268066, "step": 1630 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.87, - "learning_rate": 4.145693003414628e-06, - "logits/chosen": -0.3604745864868164, - "logits/chosen_friction": -0.36820581555366516, - "logits/rejected": -0.343069463968277, - "logits/rejected_friction": -0.33844882249832153, - "logps/chosen": -3.1566314697265625, - "logps/chosen_friction": -1.1560431718826294, - "logps/rejected": -5.1170654296875, - "logps/rejected_friction": -4.169801235198975, - "loss": 0.0013, - "policy_friction_nll_loss": 1.1462976932525635, - "policy_nll_loss": 3.131516456604004, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2557668685913086, - "rewards/chosen_fricton": -0.06724128872156143, - "rewards/margins": 0.19344282150268555, - "rewards/margins_friction": 0.2979913055896759, - "rewards/rejected": -0.4492097496986389, - "rewards/rejected_friction": -0.3652326166629791, - "step": 1635 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.87, - "learning_rate": 4.1407522204329525e-06, - "logits/chosen": -0.3594406545162201, - "logits/chosen_friction": -0.3671351671218872, - "logits/rejected": -0.34001266956329346, - "logits/rejected_friction": -0.3392466902732849, - "logps/chosen": -3.0918307304382324, - "logps/chosen_friction": -1.1623871326446533, - "logps/rejected": -5.081048965454102, - "logps/rejected_friction": -4.201562404632568, - "loss": 0.0018, - "policy_friction_nll_loss": 1.1333856582641602, - "policy_nll_loss": 3.0439279079437256, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2509385943412781, - "rewards/chosen_fricton": -0.06887766718864441, - "rewards/margins": 0.1957138031721115, - "rewards/margins_friction": 0.2997637987136841, - "rewards/rejected": -0.44665247201919556, - "rewards/rejected_friction": -0.3686414361000061, + "epoch": 0.5, + "learning_rate": 2.256641723008026e-06, + "logits/chosen": -0.3453958332538605, + "logits/rejected": -0.34628570079803467, + "logps/chosen": -452.4602966308594, + "logps/rejected": -464.2635192871094, + "loss": 0.4904, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.8692307472229004, + "rewards/margins": 1.1883941888809204, + "rewards/rejected": -4.057624816894531, "step": 1640 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.88, - "learning_rate": 4.135800154340266e-06, - "logits/chosen": -0.3733990788459778, - "logits/chosen_friction": -0.38302573561668396, - "logits/rejected": -0.35677218437194824, - "logits/rejected_friction": -0.35350674390792847, - "logps/chosen": -3.084031105041504, - "logps/chosen_friction": -1.0925654172897339, - "logps/rejected": -5.001953125, - "logps/rejected_friction": -4.102067470550537, - "loss": 0.0017, - "policy_friction_nll_loss": 1.0794670581817627, - "policy_nll_loss": 3.053007125854492, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25018566846847534, - "rewards/chosen_fricton": -0.06284443289041519, - "rewards/margins": 0.18882641196250916, - "rewards/margins_friction": 0.2972489893436432, - "rewards/rejected": -0.4390120506286621, - "rewards/rejected_friction": -0.3600934147834778, - "step": 1645 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.88, - "learning_rate": 4.130836839190893e-06, - "logits/chosen": -0.345020055770874, - "logits/chosen_friction": -0.3493015468120575, - "logits/rejected": -0.33096957206726074, - "logits/rejected_friction": -0.3214486241340637, - "logps/chosen": -3.178476095199585, - "logps/chosen_friction": -1.1738605499267578, - "logps/rejected": -4.998623847961426, - "logps/rejected_friction": -4.245115280151367, - "loss": 0.0016, - "policy_friction_nll_loss": 1.1519339084625244, - "policy_nll_loss": 3.1279449462890625, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25562018156051636, - "rewards/chosen_fricton": -0.06707366555929184, - "rewards/margins": 0.17885056138038635, - "rewards/margins_friction": 0.3031977713108063, - "rewards/rejected": -0.4344707429409027, - "rewards/rejected_friction": -0.3702714443206787, + "epoch": 0.5, + "learning_rate": 2.2297024539401463e-06, + "logits/chosen": -0.3422110974788666, + "logits/rejected": -0.34265169501304626, + "logps/chosen": -459.0148010253906, + "logps/rejected": -469.46038818359375, + "loss": 0.4726, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.0340380668640137, + "rewards/margins": 1.1149537563323975, + "rewards/rejected": -4.148991584777832, "step": 1650 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.88, - "learning_rate": 4.12586230911652e-06, - "logits/chosen": -0.3688984513282776, - "logits/chosen_friction": -0.3767995238304138, - "logits/rejected": -0.3628339469432831, - "logits/rejected_friction": -0.3471958041191101, - "logps/chosen": -3.1592495441436768, - "logps/chosen_friction": -1.1139028072357178, - "logps/rejected": -4.994107723236084, - "logps/rejected_friction": -4.390771389007568, - "loss": 0.0011, - "policy_friction_nll_loss": 1.1045119762420654, - "policy_nll_loss": 3.1360135078430176, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2572817802429199, - "rewards/chosen_fricton": -0.06377388536930084, - "rewards/margins": 0.1793210655450821, - "rewards/margins_friction": 0.3230139911174774, - "rewards/rejected": -0.4366028904914856, - "rewards/rejected_friction": -0.38678786158561707, - "step": 1655 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.89, - "learning_rate": 4.120876598325952e-06, - "logits/chosen": -0.35460150241851807, - "logits/chosen_friction": -0.358125239610672, - "logits/rejected": -0.3498954176902771, - "logits/rejected_friction": -0.33014366030693054, - "logps/chosen": -3.144922971725464, - "logps/chosen_friction": -1.1553869247436523, - "logps/rejected": -4.888993740081787, - "logps/rejected_friction": -4.470956325531006, - "loss": 0.0016, - "policy_friction_nll_loss": 1.1411765813827515, - "policy_nll_loss": 3.104349374771118, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25206655263900757, - "rewards/chosen_fricton": -0.06513798236846924, - "rewards/margins": 0.17170079052448273, - "rewards/margins_friction": 0.3284170627593994, - "rewards/rejected": -0.4237673878669739, - "rewards/rejected_friction": -0.39355507493019104, + "epoch": 0.5, + "learning_rate": 2.2027949054877342e-06, + "logits/chosen": -0.34315139055252075, + "logits/rejected": -0.3437284529209137, + "logps/chosen": -448.80657958984375, + "logps/rejected": -458.0669860839844, + "loss": 0.5145, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.8165078163146973, + "rewards/margins": 1.120755672454834, + "rewards/rejected": -3.9372634887695312, "step": 1660 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.89, - "learning_rate": 4.115879741104882e-06, - "logits/chosen": -0.35338306427001953, - "logits/chosen_friction": -0.35966944694519043, - "logits/rejected": -0.34344282746315, - "logits/rejected_friction": -0.3328072130680084, - "logps/chosen": -3.0192344188690186, - "logps/chosen_friction": -1.092893362045288, - "logps/rejected": -4.8955888748168945, - "logps/rejected_friction": -4.282710075378418, - "loss": 0.0017, - "policy_friction_nll_loss": 1.075641393661499, - "policy_nll_loss": 2.9750466346740723, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24415996670722961, - "rewards/chosen_fricton": -0.06236233562231064, - "rewards/margins": 0.18522822856903076, - "rewards/margins_friction": 0.3157908022403717, - "rewards/rejected": -0.42938822507858276, - "rewards/rejected_friction": -0.37815314531326294, - "step": 1665 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.89, - "learning_rate": 4.110871771815657e-06, - "logits/chosen": -0.36553460359573364, - "logits/chosen_friction": -0.3726870119571686, - "logits/rejected": -0.35000884532928467, - "logits/rejected_friction": -0.3447745442390442, - "logps/chosen": -3.0467138290405273, - "logps/chosen_friction": -1.1013667583465576, - "logps/rejected": -5.00229549407959, - "logps/rejected_friction": -4.242344856262207, - "loss": 0.001, - "policy_friction_nll_loss": 1.0898946523666382, - "policy_nll_loss": 3.016042947769165, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24746005237102509, - "rewards/chosen_fricton": -0.06547097861766815, - "rewards/margins": 0.19292107224464417, - "rewards/margins_friction": 0.31062445044517517, - "rewards/rejected": -0.44038113951683044, - "rewards/rejected_friction": -0.3760954737663269, + "epoch": 0.51, + "learning_rate": 2.175922235370904e-06, + "logits/chosen": -0.34890785813331604, + "logits/rejected": -0.34955543279647827, + "logps/chosen": -448.3866271972656, + "logps/rejected": -457.5038146972656, + "loss": 0.4845, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5519251823425293, + "rewards/margins": 1.266904592514038, + "rewards/rejected": -3.8188300132751465, "step": 1670 }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.89, - "learning_rate": 4.105852724897037e-06, - "logits/chosen": -0.3643651008605957, - "logits/chosen_friction": -0.3719940781593323, - "logits/rejected": -0.3463948667049408, - "logits/rejected_friction": -0.34309399127960205, - "logps/chosen": -3.135896682739258, - "logps/chosen_friction": -1.107506513595581, - "logps/rejected": -5.079819679260254, - "logps/rejected_friction": -4.136595249176025, - "loss": 0.0011, - "policy_friction_nll_loss": 1.1012170314788818, - "policy_nll_loss": 3.1074721813201904, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25627827644348145, - "rewards/chosen_fricton": -0.06542832404375076, - "rewards/margins": 0.19201159477233887, - "rewards/margins_friction": 0.2997449040412903, - "rewards/rejected": -0.4482899308204651, - "rewards/rejected_friction": -0.36517325043678284, - "step": 1675 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.9, - "learning_rate": 4.100822634863962e-06, - "logits/chosen": -0.35239920020103455, - "logits/chosen_friction": -0.3587798774242401, - "logits/rejected": -0.3403719365596771, - "logits/rejected_friction": -0.32944607734680176, - "logps/chosen": -3.302016496658325, - "logps/chosen_friction": -1.2367980480194092, - "logps/rejected": -5.138609886169434, - "logps/rejected_friction": -4.453995704650879, - "loss": 0.0009, - "policy_friction_nll_loss": 1.2239658832550049, - "policy_nll_loss": 3.2669334411621094, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.26650673151016235, - "rewards/chosen_fricton": -0.07116206735372543, - "rewards/margins": 0.18079625070095062, - "rewards/margins_friction": 0.31838124990463257, - "rewards/rejected": -0.4473029673099518, - "rewards/rejected_friction": -0.389543354511261, + { + "epoch": 0.51, + "learning_rate": 2.1490875972166394e-06, + "logits/chosen": -0.3498338460922241, + "logits/rejected": -0.35048046708106995, + "logps/chosen": -449.01849365234375, + "logps/rejected": -459.8980407714844, + "loss": 0.3836, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -2.5593769550323486, + "rewards/margins": 1.4853286743164062, + "rewards/rejected": -4.044705390930176, "step": 1680 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.9, - "learning_rate": 4.095781536307313e-06, - "logits/chosen": -0.3498537540435791, - "logits/chosen_friction": -0.35620489716529846, - "logits/rejected": -0.3369419276714325, - "logits/rejected_friction": -0.32598617672920227, - "logps/chosen": -3.3273119926452637, - "logps/chosen_friction": -1.207050085067749, - "logps/rejected": -5.151278972625732, - "logps/rejected_friction": -4.432053565979004, - "loss": 0.0011, - "policy_friction_nll_loss": 1.1937633752822876, - "policy_nll_loss": 3.295746326446533, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2675402760505676, - "rewards/chosen_fricton": -0.0680025964975357, - "rewards/margins": 0.17911432683467865, - "rewards/margins_friction": 0.3184574246406555, - "rewards/rejected": -0.44665461778640747, - "rewards/rejected_friction": -0.38646000623703003, - "step": 1685 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.9, - "learning_rate": 4.0907294638936734e-06, - "logits/chosen": -0.3491040766239166, - "logits/chosen_friction": -0.35755112767219543, - "logits/rejected": -0.3359959125518799, - "logits/rejected_friction": -0.3289892375469208, - "logps/chosen": -3.051631212234497, - "logps/chosen_friction": -1.1109354496002197, - "logps/rejected": -4.97200345993042, - "logps/rejected_friction": -4.207494735717773, - "loss": 0.0011, - "policy_friction_nll_loss": 1.08785080909729, - "policy_nll_loss": 3.0048680305480957, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24707968533039093, - "rewards/chosen_fricton": -0.0644863098859787, - "rewards/margins": 0.18889474868774414, - "rewards/margins_friction": 0.30596351623535156, - "rewards/rejected": -0.43597444891929626, - "rewards/rejected_friction": -0.37044981122016907, + "epoch": 0.51, + "learning_rate": 2.1222941401887087e-06, + "logits/chosen": -0.3391914367675781, + "logits/rejected": -0.3401142954826355, + "logps/chosen": -437.19488525390625, + "logps/rejected": -449.09820556640625, + "loss": 0.4638, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.642850875854492, + "rewards/margins": 1.1874374151229858, + "rewards/rejected": -3.8302879333496094, "step": 1690 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.9, - "learning_rate": 4.085666452365091e-06, - "logits/chosen": -0.3551929295063019, - "logits/chosen_friction": -0.36181288957595825, - "logits/rejected": -0.3431848883628845, - "logits/rejected_friction": -0.33434414863586426, - "logps/chosen": -3.2183163166046143, - "logps/chosen_friction": -1.1513160467147827, - "logps/rejected": -5.074718952178955, - "logps/rejected_friction": -4.300723075866699, - "loss": 0.0016, - "policy_friction_nll_loss": 1.1326642036437988, - "policy_nll_loss": 3.1817774772644043, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2622267007827759, - "rewards/chosen_fricton": -0.06726808845996857, - "rewards/margins": 0.18260005116462708, - "rewards/margins_friction": 0.3111986517906189, - "rewards/rejected": -0.4448266923427582, - "rewards/rejected_friction": -0.37846675515174866, - "step": 1695 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.91, - "learning_rate": 4.080592536538843e-06, - "logits/chosen": -0.36041802167892456, - "logits/chosen_friction": -0.3655330240726471, - "logits/rejected": -0.34708017110824585, - "logits/rejected_friction": -0.33903205394744873, - "logps/chosen": -2.9799509048461914, - "logps/chosen_friction": -1.0786969661712646, - "logps/rejected": -4.939871311187744, - "logps/rejected_friction": -4.222973346710205, - "loss": 0.0013, - "policy_friction_nll_loss": 1.0612149238586426, - "policy_nll_loss": 2.9390413761138916, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24130085110664368, - "rewards/chosen_fricton": -0.06214481592178345, - "rewards/margins": 0.1928352266550064, - "rewards/margins_friction": 0.31070369482040405, - "rewards/rejected": -0.4341360628604889, - "rewards/rejected_friction": -0.3728485107421875, + "epoch": 0.52, + "learning_rate": 2.0955450086180883e-06, + "logits/chosen": -0.3401223123073578, + "logits/rejected": -0.3409723937511444, + "logps/chosen": -453.819580078125, + "logps/rejected": -463.77117919921875, + "loss": 0.4747, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.682774066925049, + "rewards/margins": 1.2849785089492798, + "rewards/rejected": -3.9677529335021973, "step": 1700 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.91, - "learning_rate": 4.075507751307189e-06, - "logits/chosen": -0.3664965033531189, - "logits/chosen_friction": -0.37352341413497925, - "logits/rejected": -0.3512231707572937, - "logits/rejected_friction": -0.34330737590789795, - "logps/chosen": -3.1196327209472656, - "logps/chosen_friction": -1.1361314058303833, - "logps/rejected": -4.947991847991943, - "logps/rejected_friction": -4.21637487411499, - "loss": 0.0012, - "policy_friction_nll_loss": 1.1237461566925049, - "policy_nll_loss": 3.09061861038208, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2524990439414978, - "rewards/chosen_fricton": -0.06656280159950256, - "rewards/margins": 0.17988426983356476, - "rewards/margins_friction": 0.30435508489608765, - "rewards/rejected": -0.43238335847854614, - "rewards/rejected_friction": -0.3709179162979126, - "step": 1705 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.91, - "learning_rate": 4.070412131637139e-06, - "logits/chosen": -0.36911624670028687, - "logits/chosen_friction": -0.3770904541015625, - "logits/rejected": -0.3546540141105652, - "logits/rejected_friction": -0.3493059277534485, - "logps/chosen": -3.090266704559326, - "logps/chosen_friction": -1.093062162399292, - "logps/rejected": -4.952712535858154, - "logps/rejected_friction": -4.121860027313232, - "loss": 0.0015, - "policy_friction_nll_loss": 1.077960729598999, - "policy_nll_loss": 3.057447671890259, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25309884548187256, - "rewards/chosen_fricton": -0.06468731164932251, - "rewards/margins": 0.1822192668914795, - "rewards/margins_friction": 0.29790833592414856, - "rewards/rejected": -0.43531814217567444, - "rewards/rejected_friction": -0.36259567737579346, + "epoch": 0.52, + "learning_rate": 2.0688433416339694e-06, + "logits/chosen": -0.3425321877002716, + "logits/rejected": -0.3435406982898712, + "logps/chosen": -441.6337890625, + "logps/rejected": -454.7735290527344, + "loss": 0.4359, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.900296688079834, + "rewards/margins": 1.1836225986480713, + "rewards/rejected": -4.083919525146484, "step": 1710 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.91, - "learning_rate": 4.065305712570206e-06, - "logits/chosen": -0.35777390003204346, - "logits/chosen_friction": -0.3643503785133362, - "logits/rejected": -0.34158462285995483, - "logits/rejected_friction": -0.3340870141983032, - "logps/chosen": -3.194955348968506, - "logps/chosen_friction": -1.15040922164917, - "logps/rejected": -5.186622142791748, - "logps/rejected_friction": -4.295763969421387, - "loss": 0.0015, - "policy_friction_nll_loss": 1.1349960565567017, - "policy_nll_loss": 3.1535983085632324, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2599099576473236, - "rewards/chosen_fricton": -0.06762473285198212, - "rewards/margins": 0.1957230269908905, - "rewards/margins_friction": 0.3099932074546814, - "rewards/rejected": -0.4556330144405365, - "rewards/rejected_friction": -0.3776179850101471, - "step": 1715 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.92, - "learning_rate": 4.060188529222168e-06, - "logits/chosen": -0.3621329963207245, - "logits/chosen_friction": -0.36992427706718445, - "logits/rejected": -0.3476121127605438, - "logits/rejected_friction": -0.33909106254577637, - "logps/chosen": -3.256326675415039, - "logps/chosen_friction": -1.160455584526062, - "logps/rejected": -5.15654182434082, - "logps/rejected_friction": -4.289867401123047, - "loss": 0.0013, - "policy_friction_nll_loss": 1.1490013599395752, - "policy_nll_loss": 3.235875368118286, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.26458609104156494, - "rewards/chosen_fricton": -0.06655276566743851, - "rewards/margins": 0.18753860890865326, - "rewards/margins_friction": 0.30963587760925293, - "rewards/rejected": -0.4521247446537018, - "rewards/rejected_friction": -0.37618860602378845, + "epoch": 0.52, + "learning_rate": 2.0421922727953597e-06, + "logits/chosen": -0.3457149863243103, + "logits/rejected": -0.3468255400657654, + "logps/chosen": -449.11700439453125, + "logps/rejected": -461.40020751953125, + "loss": 0.4626, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.759221315383911, + "rewards/margins": 1.2033522129058838, + "rewards/rejected": -3.962573528289795, "step": 1720 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.92, - "learning_rate": 4.055060616782832e-06, - "logits/chosen": -0.35882601141929626, - "logits/chosen_friction": -0.36687523126602173, - "logits/rejected": -0.34246158599853516, - "logits/rejected_friction": -0.3386932909488678, - "logps/chosen": -3.094312906265259, - "logps/chosen_friction": -1.0640361309051514, - "logps/rejected": -5.017544746398926, - "logps/rejected_friction": -4.095543384552002, - "loss": 0.0012, - "policy_friction_nll_loss": 1.0566647052764893, - "policy_nll_loss": 3.0714783668518066, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25416168570518494, - "rewards/chosen_fricton": -0.06253460049629211, - "rewards/margins": 0.18845239281654358, - "rewards/margins_friction": 0.29852432012557983, - "rewards/rejected": -0.4426140785217285, - "rewards/rejected_friction": -0.36105892062187195, - "step": 1725 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.92, - "learning_rate": 4.0499220105157795e-06, - "logits/chosen": -0.34832122921943665, - "logits/chosen_friction": -0.3575947880744934, - "logits/rejected": -0.3339238464832306, - "logits/rejected_friction": -0.32763463258743286, - "logps/chosen": -3.1701245307922363, - "logps/chosen_friction": -1.1476490497589111, - "logps/rejected": -5.147014617919922, - "logps/rejected_friction": -4.310229301452637, - "loss": 0.0009, - "policy_friction_nll_loss": 1.131258249282837, - "policy_nll_loss": 3.1347999572753906, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2573288679122925, - "rewards/chosen_fricton": -0.06680917739868164, - "rewards/margins": 0.19421084225177765, - "rewards/margins_friction": 0.3119752109050751, - "rewards/rejected": -0.45153969526290894, - "rewards/rejected_friction": -0.3787844181060791, + "epoch": 0.53, + "learning_rate": 2.0155949297233542e-06, + "logits/chosen": -0.3487555980682373, + "logits/rejected": -0.34981435537338257, + "logps/chosen": -461.87481689453125, + "logps/rejected": -473.541015625, + "loss": 0.4555, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.762120008468628, + "rewards/margins": 1.2758208513259888, + "rewards/rejected": -4.037940979003906, "step": 1730 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.93, - "learning_rate": 4.044772745758136e-06, - "logits/chosen": -0.3439483940601349, - "logits/chosen_friction": -0.3503867983818054, - "logits/rejected": -0.32904624938964844, - "logits/rejected_friction": -0.3215058743953705, - "logps/chosen": -3.1261038780212402, - "logps/chosen_friction": -1.127010703086853, - "logps/rejected": -5.05502986907959, - "logps/rejected_friction": -4.282733917236328, - "loss": 0.0012, - "policy_friction_nll_loss": 1.1116702556610107, - "policy_nll_loss": 3.094399929046631, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25127848982810974, - "rewards/chosen_fricton": -0.0634174644947052, - "rewards/margins": 0.18914541602134705, - "rewards/margins_friction": 0.31104037165641785, - "rewards/rejected": -0.4404239058494568, - "rewards/rejected_friction": -0.37445783615112305, - "step": 1735 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.93, - "learning_rate": 4.039612857920323e-06, - "logits/chosen": -0.36433830857276917, - "logits/chosen_friction": -0.37157535552978516, - "logits/rejected": -0.35006242990493774, - "logits/rejected_friction": -0.3411122262477875, - "logps/chosen": -3.1705322265625, - "logps/chosen_friction": -1.1284708976745605, - "logps/rejected": -5.07279109954834, - "logps/rejected_friction": -4.212175369262695, - "loss": 0.0014, - "policy_friction_nll_loss": 1.1197515726089478, - "policy_nll_loss": 3.146000623703003, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2577904462814331, - "rewards/chosen_fricton": -0.06558551639318466, - "rewards/margins": 0.1875130832195282, - "rewards/margins_friction": 0.30523213744163513, - "rewards/rejected": -0.4453034996986389, - "rewards/rejected_friction": -0.3708176612854004, + "epoch": 0.53, + "learning_rate": 1.9890544337340882e-06, + "logits/chosen": -0.3474620282649994, + "logits/rejected": -0.34911760687828064, + "logps/chosen": -446.1351623535156, + "logps/rejected": -461.01678466796875, + "loss": 0.4426, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.9118289947509766, + "rewards/margins": 1.271337866783142, + "rewards/rejected": -4.183166980743408, "step": 1740 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.93, - "learning_rate": 4.034442382485813e-06, - "logits/chosen": -0.36034706234931946, - "logits/chosen_friction": -0.36553090810775757, - "logits/rejected": -0.3487778604030609, - "logits/rejected_friction": -0.3336871564388275, - "logps/chosen": -3.2509524822235107, - "logps/chosen_friction": -1.1877728700637817, - "logps/rejected": -5.108328342437744, - "logps/rejected_friction": -4.36234712600708, - "loss": 0.0012, - "policy_friction_nll_loss": 1.1774475574493408, - "policy_nll_loss": 3.2297301292419434, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.26231783628463745, - "rewards/chosen_fricton": -0.06797986477613449, - "rewards/margins": 0.1821613758802414, - "rewards/margins_friction": 0.3131219148635864, - "rewards/rejected": -0.44447922706604004, - "rewards/rejected_friction": -0.3811018168926239, - "step": 1745 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.93, - "learning_rate": 4.029261355010886e-06, - "logits/chosen": -0.3539872467517853, - "logits/chosen_friction": -0.361166775226593, - "logits/rejected": -0.34263312816619873, - "logits/rejected_friction": -0.33077797293663025, - "logps/chosen": -3.1213386058807373, - "logps/chosen_friction": -1.1505372524261475, - "logps/rejected": -4.997889518737793, - "logps/rejected_friction": -4.354981422424316, - "loss": 0.0016, - "policy_friction_nll_loss": 1.1412289142608643, - "policy_nll_loss": 3.0847485065460205, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2517564594745636, - "rewards/chosen_fricton": -0.06633012741804123, - "rewards/margins": 0.18394127488136292, - "rewards/margins_friction": 0.3157809376716614, - "rewards/rejected": -0.4356977045536041, - "rewards/rejected_friction": -0.3821110129356384, + "epoch": 0.53, + "learning_rate": 1.96257389947244e-06, + "logits/chosen": -0.34583669900894165, + "logits/rejected": -0.3470597565174103, + "logps/chosen": -445.00054931640625, + "logps/rejected": -457.888671875, + "loss": 0.4487, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.823984146118164, + "rewards/margins": 1.3737802505493164, + "rewards/rejected": -4.1977643966674805, "step": 1750 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.94, - "learning_rate": 4.02406981112439e-06, - "logits/chosen": -0.35194867849349976, - "logits/chosen_friction": -0.3572395145893097, - "logits/rejected": -0.3420664668083191, - "logits/rejected_friction": -0.3291206955909729, - "logps/chosen": -3.1660189628601074, - "logps/chosen_friction": -1.1187745332717896, - "logps/rejected": -4.98264217376709, - "logps/rejected_friction": -4.323138236999512, - "loss": 0.0014, - "policy_friction_nll_loss": 1.1022605895996094, - "policy_nll_loss": 3.127141237258911, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2550220191478729, - "rewards/chosen_fricton": -0.06260285526514053, - "rewards/margins": 0.1776399314403534, - "rewards/margins_friction": 0.3154630661010742, - "rewards/rejected": -0.4326620101928711, - "rewards/rejected_friction": -0.37806594371795654, - "step": 1755 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.94, - "learning_rate": 4.018867786527489e-06, - "logits/chosen": -0.3649967610836029, - "logits/chosen_friction": -0.37150049209594727, - "logits/rejected": -0.35444194078445435, - "logits/rejected_friction": -0.3397776782512665, - "logps/chosen": -3.127133369445801, - "logps/chosen_friction": -1.1320279836654663, - "logps/rejected": -4.9719414710998535, - "logps/rejected_friction": -4.341587543487549, - "loss": 0.0019, - "policy_friction_nll_loss": 1.1245417594909668, - "policy_nll_loss": 3.102334976196289, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25136247277259827, - "rewards/chosen_fricton": -0.06346379220485687, - "rewards/margins": 0.18165983259677887, - "rewards/margins_friction": 0.31730887293815613, - "rewards/rejected": -0.43302232027053833, - "rewards/rejected_friction": -0.3807726502418518, + "epoch": 0.53, + "learning_rate": 1.936156434546515e-06, + "logits/chosen": -0.3472025990486145, + "logits/rejected": -0.3478149473667145, + "logps/chosen": -450.0955505371094, + "logps/rejected": -459.27166748046875, + "loss": 0.5015, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.8728580474853516, + "rewards/margins": 1.2571897506713867, + "rewards/rejected": -4.130047798156738, "step": 1760 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.94, - "learning_rate": 4.013655316993423e-06, - "logits/chosen": -0.35328465700149536, - "logits/chosen_friction": -0.36166784167289734, - "logits/rejected": -0.33828097581863403, - "logits/rejected_friction": -0.3327266275882721, - "logps/chosen": -3.044088125228882, - "logps/chosen_friction": -1.0734052658081055, - "logps/rejected": -4.994851112365723, - "logps/rejected_friction": -4.1886396408081055, - "loss": 0.0037, - "policy_friction_nll_loss": 1.0529062747955322, - "policy_nll_loss": 3.001021146774292, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24685624241828918, - "rewards/chosen_fricton": -0.0614028163254261, - "rewards/margins": 0.1914132535457611, - "rewards/margins_friction": 0.3070124685764313, - "rewards/rejected": -0.4382694661617279, - "rewards/rejected_friction": -0.36841529607772827, - "step": 1765 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.94, - "learning_rate": 4.008432438367257e-06, - "logits/chosen": -0.35214799642562866, - "logits/chosen_friction": -0.35920581221580505, - "logits/rejected": -0.33698955178260803, - "logits/rejected_friction": -0.3300001323223114, - "logps/chosen": -3.0287864208221436, - "logps/chosen_friction": -1.0614628791809082, - "logps/rejected": -4.970867156982422, - "logps/rejected_friction": -4.223477363586426, - "loss": 0.0015, - "policy_friction_nll_loss": 1.0441087484359741, - "policy_nll_loss": 2.994965076446533, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2440882921218872, - "rewards/chosen_fricton": -0.05890621989965439, - "rewards/margins": 0.19046741724014282, - "rewards/margins_friction": 0.3118889331817627, - "rewards/rejected": -0.43455570936203003, - "rewards/rejected_friction": -0.3707951307296753, + "epoch": 0.54, + "learning_rate": 1.90980513916295e-06, + "logits/chosen": -0.3443449139595032, + "logits/rejected": -0.3453408479690552, + "logps/chosen": -450.039306640625, + "logps/rejected": -456.46392822265625, + "loss": 0.4463, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.8687210083007812, + "rewards/margins": 1.327781081199646, + "rewards/rejected": -4.196502208709717, "step": 1770 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.95, - "learning_rate": 4.00319918656564e-06, - "logits/chosen": -0.35183098912239075, - "logits/chosen_friction": -0.3575732111930847, - "logits/rejected": -0.3385879397392273, - "logits/rejected_friction": -0.32928982377052307, - "logps/chosen": -3.1481614112854004, - "logps/chosen_friction": -1.1284565925598145, - "logps/rejected": -5.053268909454346, - "logps/rejected_friction": -4.277283668518066, - "loss": 0.0009, - "policy_friction_nll_loss": 1.1160383224487305, - "policy_nll_loss": 3.1179938316345215, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2541174292564392, - "rewards/chosen_fricton": -0.06349185109138489, - "rewards/margins": 0.1873835325241089, - "rewards/margins_friction": 0.3112783432006836, - "rewards/rejected": -0.4415009617805481, - "rewards/rejected_friction": -0.3747701644897461, - "step": 1775 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.95, - "learning_rate": 3.997955597576553e-06, - "logits/chosen": -0.3522377610206604, - "logits/chosen_friction": -0.35862380266189575, - "logits/rejected": -0.3372401297092438, - "logits/rejected_friction": -0.329156756401062, - "logps/chosen": -3.113704204559326, - "logps/chosen_friction": -1.14545738697052, - "logps/rejected": -5.016848564147949, - "logps/rejected_friction": -4.217341423034668, - "loss": 0.0011, - "policy_friction_nll_loss": 1.1358671188354492, - "policy_nll_loss": 3.0848934650421143, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.250765860080719, - "rewards/chosen_fricton": -0.0659661516547203, - "rewards/margins": 0.18665435910224915, - "rewards/margins_friction": 0.30278584361076355, - "rewards/rejected": -0.43742018938064575, - "rewards/rejected_friction": -0.36875200271606445, + "epoch": 0.54, + "learning_rate": 1.8835231057630955e-06, + "logits/chosen": -0.34365350008010864, + "logits/rejected": -0.34461337327957153, + "logps/chosen": -454.1045837402344, + "logps/rejected": -468.08251953125, + "loss": 0.3981, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.681962490081787, + "rewards/margins": 1.4462287425994873, + "rewards/rejected": -4.128190517425537, "step": 1780 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.95, - "learning_rate": 3.992701707459064e-06, - "logits/chosen": -0.3488229811191559, - "logits/chosen_friction": -0.35410910844802856, - "logits/rejected": -0.33439162373542786, - "logits/rejected_friction": -0.3266444802284241, - "logps/chosen": -3.1558008193969727, - "logps/chosen_friction": -1.1347217559814453, - "logps/rejected": -5.014817237854004, - "logps/rejected_friction": -4.21085786819458, - "loss": 0.0018, - "policy_friction_nll_loss": 1.1118122339248657, - "policy_nll_loss": 3.1136131286621094, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2562618851661682, - "rewards/chosen_fricton": -0.06593899428844452, - "rewards/margins": 0.1825026571750641, - "rewards/margins_friction": 0.30406731367111206, - "rewards/rejected": -0.4387645721435547, - "rewards/rejected_friction": -0.3700063228607178, - "step": 1785 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.95, - "learning_rate": 3.987437552343082e-06, - "logits/chosen": -0.3523460626602173, - "logits/chosen_friction": -0.36029526591300964, - "logits/rejected": -0.3387754261493683, - "logits/rejected_friction": -0.33040255308151245, - "logps/chosen": -3.1710689067840576, - "logps/chosen_friction": -1.0996931791305542, - "logps/rejected": -5.098153114318848, - "logps/rejected_friction": -4.283156394958496, - "loss": 0.0011, - "policy_friction_nll_loss": 1.081913709640503, - "policy_nll_loss": 3.12849497795105, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25723880529403687, - "rewards/chosen_fricton": -0.061529528349637985, - "rewards/margins": 0.18984147906303406, - "rewards/margins_friction": 0.3146226406097412, - "rewards/rejected": -0.44708022475242615, - "rewards/rejected_friction": -0.3761521577835083, + "epoch": 0.54, + "learning_rate": 1.8573134186600978e-06, + "logits/chosen": -0.3493928909301758, + "logits/rejected": -0.35027194023132324, + "logps/chosen": -447.32666015625, + "logps/rejected": -458.9419860839844, + "loss": 0.4397, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.6235809326171875, + "rewards/margins": 1.4347044229507446, + "rewards/rejected": -4.058285236358643, "step": 1790 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.96, - "learning_rate": 3.982163168429103e-06, - "logits/chosen": -0.3578431010246277, - "logits/chosen_friction": -0.36468663811683655, - "logits/rejected": -0.34806957840919495, - "logits/rejected_friction": -0.33469006419181824, - "logps/chosen": -3.1707277297973633, - "logps/chosen_friction": -1.1105844974517822, - "logps/rejected": -4.972075939178467, - "logps/rejected_friction": -4.334001064300537, - "loss": 0.0011, - "policy_friction_nll_loss": 1.1009668111801147, - "policy_nll_loss": 3.149998903274536, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2560630440711975, - "rewards/chosen_fricton": -0.060960639268159866, - "rewards/margins": 0.17756937444210052, - "rewards/margins_friction": 0.3193025588989258, - "rewards/rejected": -0.43363243341445923, - "rewards/rejected_friction": -0.38026320934295654, - "step": 1795 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.96, - "learning_rate": 3.976878591987965e-06, - "logits/chosen": -0.35907047986984253, - "logits/chosen_friction": -0.3665209114551544, - "logits/rejected": -0.34819597005844116, - "logits/rejected_friction": -0.335066020488739, - "logps/chosen": -3.1030702590942383, - "logps/chosen_friction": -1.0575300455093384, - "logps/rejected": -4.972933769226074, - "logps/rejected_friction": -4.237673759460449, - "loss": 0.0012, - "policy_friction_nll_loss": 1.0461570024490356, - "policy_nll_loss": 3.08594012260437, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2511691451072693, - "rewards/chosen_fricton": -0.058175861835479736, - "rewards/margins": 0.18381422758102417, - "rewards/margins_friction": 0.31410127878189087, - "rewards/rejected": -0.43498334288597107, - "rewards/rejected_friction": -0.3722771108150482, + "epoch": 0.55, + "learning_rate": 1.8311791536769485e-06, + "logits/chosen": -0.346055805683136, + "logits/rejected": -0.3475271463394165, + "logps/chosen": -442.3778381347656, + "logps/rejected": -458.1031188964844, + "loss": 0.3935, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.6405744552612305, + "rewards/margins": 1.582415223121643, + "rewards/rejected": -4.222989559173584, "step": 1800 }, { - "epoch": 0.96, - "eval_directrewards_student/accuracies": 1.0, - "eval_logits/chosen": -0.3970787525177002, - "eval_logits/chosen_friction": -0.40310609340667725, - "eval_logits/rejected": -0.38581138849258423, - "eval_logits/rejected_friction": -0.36990272998809814, - "eval_logps/chosen": -3.0933332443237305, - "eval_logps/chosen_friction": -1.1042819023132324, - "eval_logps/rejected": -4.9899468421936035, - "eval_logps/rejected_friction": -4.273800849914551, - "eval_loss": 0.0011356892064213753, - "eval_policy_friction_nll_loss": 1.1042819023132324, - "eval_policy_nll_loss": 3.0933332443237305, - "eval_rewards/accuracies": 1.0, - "eval_rewards/accuracies_friction": 1.0, - "eval_rewards/chosen": -0.24939115345478058, - "eval_rewards/chosen_fricton": -0.062159471213817596, - "eval_rewards/margins": 0.18618355691432953, - "eval_rewards/margins_friction": 0.3128046691417694, - "eval_rewards/rejected": -0.4355747103691101, - "eval_rewards/rejected_friction": -0.3749641478061676, - "eval_runtime": 355.4566, - "eval_samples_per_second": 1.407, - "eval_steps_per_second": 0.703, + "epoch": 0.55, + "eval_logits/chosen": -0.41737592220306396, + "eval_logits/rejected": -0.4179980754852295, + "eval_logps/chosen": -440.4134826660156, + "eval_logps/rejected": -450.7027893066406, + "eval_loss": 0.47837841510772705, + "eval_rewards/accuracies": 0.7599999904632568, + "eval_rewards/chosen": -2.7522289752960205, + "eval_rewards/margins": 1.220139503479004, + "eval_rewards/rejected": -3.9723684787750244, + "eval_runtime": 351.6535, + "eval_samples_per_second": 1.422, + "eval_steps_per_second": 1.422, "step": 1800 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.96, - "learning_rate": 3.971583859360601e-06, - "logits/chosen": -0.34931379556655884, - "logits/chosen_friction": -0.3530734181404114, - "logits/rejected": -0.3390863835811615, - "logits/rejected_friction": -0.32496732473373413, - "logps/chosen": -3.1084070205688477, - "logps/chosen_friction": -1.1094306707382202, - "logps/rejected": -4.981678485870361, - "logps/rejected_friction": -4.35053825378418, - "loss": 0.0013, - "policy_friction_nll_loss": 1.088637113571167, - "policy_nll_loss": 3.073550224304199, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24905022978782654, - "rewards/chosen_fricton": -0.06177163124084473, - "rewards/margins": 0.1836744248867035, - "rewards/margins_friction": 0.3191898465156555, - "rewards/rejected": -0.43272462487220764, - "rewards/rejected_friction": -0.380961537361145, - "step": 1805 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.97, - "learning_rate": 3.966279006957781e-06, - "logits/chosen": -0.3668665885925293, - "logits/chosen_friction": -0.37235939502716064, - "logits/rejected": -0.3567797541618347, - "logits/rejected_friction": -0.34238535165786743, - "logps/chosen": -3.0334203243255615, - "logps/chosen_friction": -1.0865634679794312, - "logps/rejected": -4.9152750968933105, - "logps/rejected_friction": -4.287070274353027, - "loss": 0.0011, - "policy_friction_nll_loss": 1.074439287185669, - "policy_nll_loss": 3.005591630935669, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24453827738761902, - "rewards/chosen_fricton": -0.06173349544405937, - "rewards/margins": 0.18464751541614532, - "rewards/margins_friction": 0.3157528042793274, - "rewards/rejected": -0.42918580770492554, - "rewards/rejected_friction": -0.3774862587451935, + "epoch": 0.55, + "learning_rate": 1.805123377785515e-06, + "logits/chosen": -0.3527616858482361, + "logits/rejected": -0.3528694212436676, + "logps/chosen": -444.4476623535156, + "logps/rejected": -453.0213317871094, + "loss": 0.4432, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.709862232208252, + "rewards/margins": 1.3173949718475342, + "rewards/rejected": -4.027257442474365, "step": 1810 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.97, - "learning_rate": 3.960964071259871e-06, - "logits/chosen": -0.35704487562179565, - "logits/chosen_friction": -0.3652270436286926, - "logits/rejected": -0.34390121698379517, - "logits/rejected_friction": -0.3350996673107147, - "logps/chosen": -3.045515537261963, - "logps/chosen_friction": -1.0532338619232178, - "logps/rejected": -4.934488773345947, - "logps/rejected_friction": -4.220075607299805, - "loss": 0.0012, - "policy_friction_nll_loss": 1.0332329273223877, - "policy_nll_loss": 2.9975013732910156, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24595074355602264, - "rewards/chosen_fricton": -0.05867267772555351, - "rewards/margins": 0.18591561913490295, - "rewards/margins_friction": 0.3129701614379883, - "rewards/rejected": -0.4318663477897644, - "rewards/rejected_friction": -0.3716428875923157, - "step": 1815 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.97, - "learning_rate": 3.9556390888165755e-06, - "logits/chosen": -0.36197811365127563, - "logits/chosen_friction": -0.36754634976387024, - "logits/rejected": -0.3498838245868683, - "logits/rejected_friction": -0.3399241864681244, - "logps/chosen": -3.1317238807678223, - "logps/chosen_friction": -1.088293194770813, - "logps/rejected": -5.0190749168396, - "logps/rejected_friction": -4.216869354248047, - "loss": 0.0009, - "policy_friction_nll_loss": 1.080302119255066, - "policy_nll_loss": 3.1128878593444824, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2529526948928833, - "rewards/chosen_fricton": -0.060774464160203934, - "rewards/margins": 0.1859932690858841, - "rewards/margins_friction": 0.3095507323741913, - "rewards/rejected": -0.4389459490776062, - "rewards/rejected_friction": -0.3703251779079437, + "epoch": 0.55, + "learning_rate": 1.7791491487466234e-06, + "logits/chosen": -0.3477206528186798, + "logits/rejected": -0.34793621301651, + "logps/chosen": -444.4949645996094, + "logps/rejected": -456.147705078125, + "loss": 0.4933, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.9446983337402344, + "rewards/margins": 1.1086232662200928, + "rewards/rejected": -4.053321361541748, "step": 1820 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.97, - "learning_rate": 3.950304096246689e-06, - "logits/chosen": -0.35111576318740845, - "logits/chosen_friction": -0.3570344150066376, - "logits/rejected": -0.33757835626602173, - "logits/rejected_friction": -0.32977980375289917, - "logps/chosen": -3.066901922225952, - "logps/chosen_friction": -1.0507056713104248, - "logps/rejected": -5.001862049102783, - "logps/rejected_friction": -4.201172828674316, - "loss": 0.0008, - "policy_friction_nll_loss": 1.0298802852630615, - "policy_nll_loss": 3.0241260528564453, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2503334879875183, - "rewards/chosen_fricton": -0.06009981781244278, - "rewards/margins": 0.1898377239704132, - "rewards/margins_friction": 0.31044650077819824, - "rewards/rejected": -0.44017118215560913, - "rewards/rejected_friction": -0.3705463111400604, - "step": 1825 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.98, - "learning_rate": 3.944959130237843e-06, - "logits/chosen": -0.3527962267398834, - "logits/chosen_friction": -0.35674813389778137, - "logits/rejected": -0.34487560391426086, - "logits/rejected_friction": -0.32931962609291077, - "logps/chosen": -3.0936684608459473, - "logps/chosen_friction": -1.1386569738388062, - "logps/rejected": -4.893092155456543, - "logps/rejected_friction": -4.333435535430908, - "loss": 0.0013, - "policy_friction_nll_loss": 1.1153252124786377, - "policy_nll_loss": 3.046417236328125, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24709415435791016, - "rewards/chosen_fricton": -0.06333817541599274, - "rewards/margins": 0.1775868684053421, - "rewards/margins_friction": 0.3161957263946533, - "rewards/rejected": -0.42468100786209106, - "rewards/rejected_friction": -0.37953391671180725, + "epoch": 0.56, + "learning_rate": 1.7532595147512167e-06, + "logits/chosen": -0.34836429357528687, + "logits/rejected": -0.34931057691574097, + "logps/chosen": -448.5811462402344, + "logps/rejected": -460.9894104003906, + "loss": 0.4243, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.7561140060424805, + "rewards/margins": 1.3816736936569214, + "rewards/rejected": -4.137787818908691, "step": 1830 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.98, - "learning_rate": 3.939604227546255e-06, - "logits/chosen": -0.3585546314716339, - "logits/chosen_friction": -0.3637189269065857, - "logits/rejected": -0.3452376127243042, - "logits/rejected_friction": -0.3372344970703125, - "logps/chosen": -3.0523228645324707, - "logps/chosen_friction": -1.124021291732788, - "logps/rejected": -4.945216178894043, - "logps/rejected_friction": -4.201236248016357, - "loss": 0.0029, - "policy_friction_nll_loss": 1.1047600507736206, - "policy_nll_loss": 3.001453399658203, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2463025599718094, - "rewards/chosen_fricton": -0.06556074321269989, - "rewards/margins": 0.1851997673511505, - "rewards/margins_friction": 0.3026280403137207, - "rewards/rejected": -0.4315022826194763, - "rewards/rejected_friction": -0.368188738822937, - "step": 1835 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.98, - "learning_rate": 3.934239424996472e-06, - "logits/chosen": -0.3570273518562317, - "logits/chosen_friction": -0.3641056418418884, - "logits/rejected": -0.3419456481933594, - "logits/rejected_friction": -0.33656996488571167, - "logps/chosen": -2.98364520072937, - "logps/chosen_friction": -1.085025668144226, - "logps/rejected": -4.972365379333496, - "logps/rejected_friction": -4.134318828582764, - "loss": 0.0013, - "policy_friction_nll_loss": 1.063749074935913, - "policy_nll_loss": 2.940697431564331, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23912513256072998, - "rewards/chosen_fricton": -0.061613548547029495, - "rewards/margins": 0.19683949649333954, - "rewards/margins_friction": 0.30199772119522095, - "rewards/rejected": -0.4359646439552307, - "rewards/rejected_friction": -0.36361128091812134, + "epoch": 0.56, + "learning_rate": 1.7274575140626318e-06, + "logits/chosen": -0.359462171792984, + "logits/rejected": -0.36063042283058167, + "logps/chosen": -448.9486389160156, + "logps/rejected": -458.26776123046875, + "loss": 0.4759, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.6954994201660156, + "rewards/margins": 1.2426683902740479, + "rewards/rejected": -3.9381680488586426, "step": 1840 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.98, - "learning_rate": 3.928864759481127e-06, - "logits/chosen": -0.36898112297058105, - "logits/chosen_friction": -0.3768022954463959, - "logits/rejected": -0.3543735444545746, - "logits/rejected_friction": -0.3465885519981384, - "logps/chosen": -3.1139652729034424, - "logps/chosen_friction": -1.078665852546692, - "logps/rejected": -5.020030975341797, - "logps/rejected_friction": -4.140861511230469, - "loss": 0.0011, - "policy_friction_nll_loss": 1.0628330707550049, - "policy_nll_loss": 3.0770466327667236, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2517441213130951, - "rewards/chosen_fricton": -0.060237735509872437, - "rewards/margins": 0.18770165741443634, - "rewards/margins_friction": 0.30282288789749146, - "rewards/rejected": -0.43944573402404785, - "rewards/rejected_friction": -0.3630605638027191, - "step": 1845 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.99, - "learning_rate": 3.923480267960672e-06, - "logits/chosen": -0.35600632429122925, - "logits/chosen_friction": -0.3635779023170471, - "logits/rejected": -0.3432525098323822, - "logits/rejected_friction": -0.3329480290412903, - "logps/chosen": -3.067863941192627, - "logps/chosen_friction": -1.091168999671936, - "logps/rejected": -4.9362287521362305, - "logps/rejected_friction": -4.254705429077148, - "loss": 0.0015, - "policy_friction_nll_loss": 1.0789653062820435, - "policy_nll_loss": 3.0434622764587402, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24481718242168427, - "rewards/chosen_fricton": -0.059603333473205566, - "rewards/margins": 0.18327976763248444, - "rewards/margins_friction": 0.3119720220565796, - "rewards/rejected": -0.4280969500541687, - "rewards/rejected_friction": -0.37157538533210754, + "epoch": 0.56, + "learning_rate": 1.7017461746600506e-06, + "logits/chosen": -0.3540958762168884, + "logits/rejected": -0.3554149866104126, + "logps/chosen": -442.2723083496094, + "logps/rejected": -452.81951904296875, + "loss": 0.479, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.743239402770996, + "rewards/margins": 1.2077919244766235, + "rewards/rejected": -3.95103120803833, "step": 1850 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.99, - "learning_rate": 3.918085987463134e-06, - "logits/chosen": -0.3618175983428955, - "logits/chosen_friction": -0.3687087893486023, - "logits/rejected": -0.34623628854751587, - "logits/rejected_friction": -0.33983245491981506, - "logps/chosen": -3.0419774055480957, - "logps/chosen_friction": -1.1023472547531128, - "logps/rejected": -4.988726615905762, - "logps/rejected_friction": -4.1270318031311035, - "loss": 0.0017, - "policy_friction_nll_loss": 1.0895864963531494, - "policy_nll_loss": 3.014646053314209, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24551057815551758, - "rewards/chosen_fricton": -0.06325967609882355, - "rewards/margins": 0.19110645353794098, - "rewards/margins_friction": 0.29821258783340454, - "rewards/rejected": -0.436616986989975, - "rewards/rejected_friction": -0.3614722788333893, - "step": 1855 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 0.99, - "learning_rate": 3.912681955083854e-06, - "logits/chosen": -0.35252124071121216, - "logits/chosen_friction": -0.3597642481327057, - "logits/rejected": -0.33827677369117737, - "logits/rejected_friction": -0.3310569226741791, - "logps/chosen": -3.144594192504883, - "logps/chosen_friction": -1.1031588315963745, - "logps/rejected": -5.122199058532715, - "logps/rejected_friction": -4.205641269683838, - "loss": 0.001, - "policy_friction_nll_loss": 1.0898582935333252, - "policy_nll_loss": 3.1115314960479736, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25329238176345825, - "rewards/chosen_fricton": -0.06049058958888054, - "rewards/margins": 0.19434617459774017, - "rewards/margins_friction": 0.30606281757354736, - "rewards/rejected": -0.447638601064682, - "rewards/rejected_friction": -0.3665534257888794, + "epoch": 0.56, + "learning_rate": 1.6761285138831493e-06, + "logits/chosen": -0.3558579981327057, + "logits/rejected": -0.35607069730758667, + "logps/chosen": -448.01458740234375, + "logps/rejected": -458.3499450683594, + "loss": 0.4367, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.706444025039673, + "rewards/margins": 1.3273353576660156, + "rewards/rejected": -4.033779144287109, "step": 1860 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 0.99, - "learning_rate": 3.907268207985238e-06, - "logits/chosen": -0.35693347454071045, - "logits/chosen_friction": -0.36483436822891235, - "logits/rejected": -0.3422977328300476, - "logits/rejected_friction": -0.33601388335227966, - "logps/chosen": -3.0374183654785156, - "logps/chosen_friction": -1.1044843196868896, - "logps/rejected": -5.029804229736328, - "logps/rejected_friction": -4.206171035766602, - "loss": 0.0014, - "policy_friction_nll_loss": 1.0938798189163208, - "policy_nll_loss": 3.0070478916168213, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24305081367492676, - "rewards/chosen_fricton": -0.06194297596812248, - "rewards/margins": 0.1956903636455536, - "rewards/margins_friction": 0.30551812052726746, - "rewards/rejected": -0.43874114751815796, - "rewards/rejected_friction": -0.36746111512184143, - "step": 1865 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.0, - "learning_rate": 3.901844783396494e-06, - "logits/chosen": -0.34600409865379333, - "logits/chosen_friction": -0.35116124153137207, - "logits/rejected": -0.3330352008342743, - "logits/rejected_friction": -0.32177144289016724, - "logps/chosen": -3.1858441829681396, - "logps/chosen_friction": -1.1492975950241089, - "logps/rejected": -5.071053504943848, - "logps/rejected_friction": -4.291268825531006, - "loss": 0.0011, - "policy_friction_nll_loss": 1.1348308324813843, - "policy_nll_loss": 3.1500236988067627, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25510361790657043, - "rewards/chosen_fricton": -0.06348259001970291, - "rewards/margins": 0.18582530319690704, - "rewards/margins_friction": 0.3104936182498932, - "rewards/rejected": -0.44092893600463867, - "rewards/rejected_friction": -0.3739762008190155, + "epoch": 0.57, + "learning_rate": 1.6506075380780043e-06, + "logits/chosen": -0.343932569026947, + "logits/rejected": -0.3449569046497345, + "logps/chosen": -449.41534423828125, + "logps/rejected": -461.0784606933594, + "loss": 0.4612, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.849799633026123, + "rewards/margins": 1.247933030128479, + "rewards/rejected": -4.0977325439453125, "step": 1870 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.0, - "learning_rate": 3.896411718613385e-06, - "logits/chosen": -0.35913437604904175, - "logits/chosen_friction": -0.36687082052230835, - "logits/rejected": -0.3442925810813904, - "logits/rejected_friction": -0.3377821445465088, - "logps/chosen": -3.0683014392852783, - "logps/chosen_friction": -1.0749223232269287, - "logps/rejected": -5.0500688552856445, - "logps/rejected_friction": -4.205936431884766, - "loss": 0.001, - "policy_friction_nll_loss": 1.058093547821045, - "policy_nll_loss": 3.0298428535461426, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24694669246673584, - "rewards/chosen_fricton": -0.05976169556379318, - "rewards/margins": 0.19504967331886292, - "rewards/margins_friction": 0.30916714668273926, - "rewards/rejected": -0.44199639558792114, - "rewards/rejected_friction": -0.36892879009246826, - "step": 1875 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.0, - "learning_rate": 3.890969050997964e-06, - "logits/chosen": -0.35821643471717834, - "logits/chosen_friction": -0.36517956852912903, - "logits/rejected": -0.3448939025402069, - "logits/rejected_friction": -0.33601540327072144, - "logps/chosen": -3.1295270919799805, - "logps/chosen_friction": -1.0987575054168701, - "logps/rejected": -5.077920436859131, - "logps/rejected_friction": -4.199408531188965, - "loss": 0.0013, - "policy_friction_nll_loss": 1.0794763565063477, - "policy_nll_loss": 3.0873780250549316, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2522042989730835, - "rewards/chosen_fricton": -0.061542022973299026, - "rewards/margins": 0.19122788310050964, - "rewards/margins_friction": 0.30571505427360535, - "rewards/rejected": -0.4434322416782379, - "rewards/rejected_friction": -0.36725708842277527, + "epoch": 0.57, + "learning_rate": 1.625186242244279e-06, + "logits/chosen": -0.351362407207489, + "logits/rejected": -0.35285985469818115, + "logps/chosen": -442.25335693359375, + "logps/rejected": -452.58526611328125, + "loss": 0.4487, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.7264816761016846, + "rewards/margins": 1.3034883737564087, + "rewards/rejected": -4.029970169067383, "step": 1880 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.01, - "learning_rate": 3.8855168179783195e-06, - "logits/chosen": -0.35617685317993164, - "logits/chosen_friction": -0.3627377450466156, - "logits/rejected": -0.3425740897655487, - "logits/rejected_friction": -0.33128148317337036, - "logps/chosen": -3.185546875, - "logps/chosen_friction": -1.1407101154327393, - "logps/rejected": -5.055400848388672, - "logps/rejected_friction": -4.241162300109863, - "loss": 0.0011, - "policy_friction_nll_loss": 1.1159226894378662, - "policy_nll_loss": 3.140681505203247, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2565447688102722, - "rewards/chosen_fricton": -0.06346140801906586, - "rewards/margins": 0.18396729230880737, - "rewards/margins_friction": 0.30657580494880676, - "rewards/rejected": -0.440512090921402, - "rewards/rejected_friction": -0.3700372278690338, - "step": 1885 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.01, - "learning_rate": 3.880055057048325e-06, - "logits/chosen": -0.36063772439956665, - "logits/chosen_friction": -0.3702596127986908, - "logits/rejected": -0.3437923789024353, - "logits/rejected_friction": -0.34165769815444946, - "logps/chosen": -3.010429859161377, - "logps/chosen_friction": -1.027812123298645, - "logps/rejected": -5.052890300750732, - "logps/rejected_friction": -4.088536739349365, - "loss": 0.0012, - "policy_friction_nll_loss": 1.0186048746109009, - "policy_nll_loss": 2.9885082244873047, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24578723311424255, - "rewards/chosen_fricton": -0.05922720953822136, - "rewards/margins": 0.2001916617155075, - "rewards/margins_friction": 0.3011823296546936, - "rewards/rejected": -0.44597887992858887, - "rewards/rejected_friction": -0.36040952801704407, + "epoch": 0.57, + "learning_rate": 1.5998676096837534e-06, + "logits/chosen": -0.35466188192367554, + "logits/rejected": -0.35623863339424133, + "logps/chosen": -455.30859375, + "logps/rejected": -466.81817626953125, + "loss": 0.4525, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.76641845703125, + "rewards/margins": 1.3434927463531494, + "rewards/rejected": -4.1099114418029785, "step": 1890 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.01, - "learning_rate": 3.874583805767371e-06, - "logits/chosen": -0.36031031608581543, - "logits/chosen_friction": -0.3680192530155182, - "logits/rejected": -0.3478662073612213, - "logits/rejected_friction": -0.33873817324638367, - "logps/chosen": -3.0355000495910645, - "logps/chosen_friction": -1.0969223976135254, - "logps/rejected": -4.966764450073242, - "logps/rejected_friction": -4.168595314025879, - "loss": 0.001, - "policy_friction_nll_loss": 1.0814182758331299, - "policy_nll_loss": 3.0080251693725586, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2439323365688324, - "rewards/chosen_fricton": -0.061770401895046234, - "rewards/margins": 0.19038985669612885, - "rewards/margins_friction": 0.30369219183921814, - "rewards/rejected": -0.43432220816612244, - "rewards/rejected_friction": -0.36546260118484497, - "step": 1895 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.01, - "learning_rate": 3.869103101760111e-06, - "logits/chosen": -0.35354480147361755, - "logits/chosen_friction": -0.361233651638031, - "logits/rejected": -0.3416203260421753, - "logits/rejected_friction": -0.3334607779979706, - "logps/chosen": -3.069324016571045, - "logps/chosen_friction": -1.0847967863082886, - "logps/rejected": -4.9889326095581055, - "logps/rejected_friction": -4.252261161804199, - "loss": 0.0009, - "policy_friction_nll_loss": 1.061941146850586, - "policy_nll_loss": 3.0152881145477295, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24628940224647522, - "rewards/chosen_fricton": -0.059280991554260254, - "rewards/margins": 0.18861433863639832, - "rewards/margins_friction": 0.3126095235347748, - "rewards/rejected": -0.43490371108055115, - "rewards/rejected_friction": -0.37189051508903503, + "epoch": 0.58, + "learning_rate": 1.574654611650214e-06, + "logits/chosen": -0.353823721408844, + "logits/rejected": -0.3546674847602844, + "logps/chosen": -448.30615234375, + "logps/rejected": -462.4393005371094, + "loss": 0.4049, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.676255226135254, + "rewards/margins": 1.421419382095337, + "rewards/rejected": -4.097674369812012, "step": 1900 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.02, - "learning_rate": 3.863612982716204e-06, - "logits/chosen": -0.3526819348335266, - "logits/chosen_friction": -0.36056962609291077, - "logits/rejected": -0.33878377079963684, - "logits/rejected_friction": -0.3341030478477478, - "logps/chosen": -3.0546886920928955, - "logps/chosen_friction": -1.0813477039337158, - "logps/rejected": -4.987541675567627, - "logps/rejected_friction": -4.106870174407959, - "loss": 0.0012, - "policy_friction_nll_loss": 1.0691879987716675, - "policy_nll_loss": 3.0204641819000244, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24600744247436523, - "rewards/chosen_fricton": -0.061386026442050934, - "rewards/margins": 0.1911228448152542, - "rewards/margins_friction": 0.2995704710483551, - "rewards/rejected": -0.43713027238845825, - "rewards/rejected_friction": -0.36095649003982544, - "step": 1905 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.02, - "learning_rate": 3.858113486390056e-06, - "logits/chosen": -0.34193119406700134, - "logits/chosen_friction": -0.3477042317390442, - "logits/rejected": -0.33076217770576477, - "logits/rejected_friction": -0.31833887100219727, - "logps/chosen": -3.264582395553589, - "logps/chosen_friction": -1.1312448978424072, - "logps/rejected": -5.1487321853637695, - "logps/rejected_friction": -4.343472480773926, - "loss": 0.0011, - "policy_friction_nll_loss": 1.1105397939682007, - "policy_nll_loss": 3.221205234527588, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2631241977214813, - "rewards/chosen_fricton": -0.06157250329852104, - "rewards/margins": 0.18461330235004425, - "rewards/margins_friction": 0.3167691230773926, - "rewards/rejected": -0.44773751497268677, - "rewards/rejected_friction": -0.3783416152000427, + "epoch": 0.58, + "learning_rate": 1.54955020700077e-06, + "logits/chosen": -0.35255804657936096, + "logits/rejected": -0.35378915071487427, + "logps/chosen": -442.2880859375, + "logps/rejected": -454.832763671875, + "loss": 0.4771, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.8402438163757324, + "rewards/margins": 1.2248531579971313, + "rewards/rejected": -4.065096855163574, "step": 1910 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.02, - "learning_rate": 3.852604650600555e-06, - "logits/chosen": -0.3636505603790283, - "logits/chosen_friction": -0.37208038568496704, - "logits/rejected": -0.34797361493110657, - "logits/rejected_friction": -0.3438517451286316, - "logps/chosen": -2.991744041442871, - "logps/chosen_friction": -1.0994846820831299, - "logps/rejected": -5.073159217834473, - "logps/rejected_friction": -4.181719779968262, - "loss": 0.001, - "policy_friction_nll_loss": 1.0833966732025146, - "policy_nll_loss": 2.967758893966675, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24210719764232635, - "rewards/chosen_fricton": -0.06407150626182556, - "rewards/margins": 0.20478174090385437, - "rewards/margins_friction": 0.3041752874851227, - "rewards/rejected": -0.44688892364501953, - "rewards/rejected_friction": -0.36824679374694824, - "step": 1915 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.02, - "learning_rate": 3.847086513230817e-06, - "logits/chosen": -0.3502192497253418, - "logits/chosen_friction": -0.3568991720676422, - "logits/rejected": -0.3394063115119934, - "logits/rejected_friction": -0.32922276854515076, - "logps/chosen": -3.056478500366211, - "logps/chosen_friction": -1.1112574338912964, - "logps/rejected": -4.966493129730225, - "logps/rejected_friction": -4.212573051452637, - "loss": 0.0009, - "policy_friction_nll_loss": 1.0905689001083374, - "policy_nll_loss": 3.0213851928710938, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24488604068756104, - "rewards/chosen_fricton": -0.06214331462979317, - "rewards/margins": 0.18801221251487732, - "rewards/margins_friction": 0.3063402473926544, - "rewards/rejected": -0.43289828300476074, - "rewards/rejected_friction": -0.3684835731983185, + "epoch": 0.58, + "learning_rate": 1.5245573418486136e-06, + "logits/chosen": -0.35058295726776123, + "logits/rejected": -0.3520324230194092, + "logps/chosen": -451.47265625, + "logps/rejected": -462.79791259765625, + "loss": 0.4615, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.648658037185669, + "rewards/margins": 1.3944532871246338, + "rewards/rejected": -4.043111324310303, "step": 1920 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.03, - "learning_rate": 3.841559112227923e-06, - "logits/chosen": -0.3618120551109314, - "logits/chosen_friction": -0.3688984513282776, - "logits/rejected": -0.35107535123825073, - "logits/rejected_friction": -0.3398933410644531, - "logps/chosen": -3.1745638847351074, - "logps/chosen_friction": -1.1108537912368774, - "logps/rejected": -5.065258502960205, - "logps/rejected_friction": -4.243707656860352, - "loss": 0.001, - "policy_friction_nll_loss": 1.0989737510681152, - "policy_nll_loss": 3.145543098449707, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25744470953941345, - "rewards/chosen_fricton": -0.06276442110538483, - "rewards/margins": 0.18509450554847717, - "rewards/margins_friction": 0.30845993757247925, - "rewards/rejected": -0.4425392150878906, - "rewards/rejected_friction": -0.37122437357902527, - "step": 1925 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.03, - "learning_rate": 3.8360224856026575e-06, - "logits/chosen": -0.3533279299736023, - "logits/chosen_friction": -0.3591856062412262, - "logits/rejected": -0.34188002347946167, - "logits/rejected_friction": -0.3313242197036743, - "logps/chosen": -3.084228515625, - "logps/chosen_friction": -1.098388433456421, - "logps/rejected": -5.020941734313965, - "logps/rejected_friction": -4.2193379402160645, - "loss": 0.001, - "policy_friction_nll_loss": 1.0915908813476562, - "policy_nll_loss": 3.0497076511383057, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24808450043201447, - "rewards/chosen_fricton": -0.06079822778701782, - "rewards/margins": 0.1905084103345871, - "rewards/margins_friction": 0.30845361948013306, - "rewards/rejected": -0.43859297037124634, - "rewards/rejected_friction": -0.3692518472671509, + "epoch": 0.59, + "learning_rate": 1.4996789492172836e-06, + "logits/chosen": -0.35444819927215576, + "logits/rejected": -0.35484084486961365, + "logps/chosen": -447.3772888183594, + "logps/rejected": -457.873291015625, + "loss": 0.4392, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.671096086502075, + "rewards/margins": 1.3712053298950195, + "rewards/rejected": -4.042301654815674, "step": 1930 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.03, - "learning_rate": 3.830476671429246e-06, - "logits/chosen": -0.3503647744655609, - "logits/chosen_friction": -0.35658812522888184, - "logits/rejected": -0.3403721749782562, - "logits/rejected_friction": -0.3279593288898468, - "logps/chosen": -3.1019911766052246, - "logps/chosen_friction": -1.0979423522949219, - "logps/rejected": -5.020318031311035, - "logps/rejected_friction": -4.290463447570801, - "loss": 0.0009, - "policy_friction_nll_loss": 1.0798554420471191, - "policy_nll_loss": 3.0568740367889404, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24902191758155823, - "rewards/chosen_fricton": -0.06052514910697937, - "rewards/margins": 0.1883070170879364, - "rewards/margins_friction": 0.31520336866378784, - "rewards/rejected": -0.43732890486717224, - "rewards/rejected_friction": -0.3757285177707672, - "step": 1935 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.03, - "learning_rate": 3.824921707845099e-06, - "logits/chosen": -0.34272536635398865, - "logits/chosen_friction": -0.34799444675445557, - "logits/rejected": -0.3353269696235657, - "logits/rejected_friction": -0.32015442848205566, - "logps/chosen": -3.089827060699463, - "logps/chosen_friction": -1.0671859979629517, - "logps/rejected": -4.962313652038574, - "logps/rejected_friction": -4.274179935455322, - "loss": 0.0012, - "policy_friction_nll_loss": 1.0485620498657227, - "policy_nll_loss": 3.0378451347351074, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24884042143821716, - "rewards/chosen_fricton": -0.05763911083340645, - "rewards/margins": 0.18320603668689728, - "rewards/margins_friction": 0.3157082200050354, - "rewards/rejected": -0.43204647302627563, - "rewards/rejected_friction": -0.37334734201431274, + "epoch": 0.59, + "learning_rate": 1.4749179486964599e-06, + "logits/chosen": -0.3643060028553009, + "logits/rejected": -0.3653911054134369, + "logps/chosen": -452.032470703125, + "logps/rejected": -464.42193603515625, + "loss": 0.4286, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.5244593620300293, + "rewards/margins": 1.475843071937561, + "rewards/rejected": -4.000302314758301, "step": 1940 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.04, - "learning_rate": 3.819357633050542e-06, - "logits/chosen": -0.35569438338279724, - "logits/chosen_friction": -0.3595896065235138, - "logits/rejected": -0.3433033525943756, - "logits/rejected_friction": -0.3335304856300354, - "logps/chosen": -3.024003028869629, - "logps/chosen_friction": -1.0826663970947266, - "logps/rejected": -4.937881946563721, - "logps/rejected_friction": -4.1858110427856445, - "loss": 0.0025, - "policy_friction_nll_loss": 1.0604946613311768, - "policy_nll_loss": 2.9768924713134766, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2426944226026535, - "rewards/chosen_fricton": -0.06014398857951164, - "rewards/margins": 0.1887381672859192, - "rewards/margins_friction": 0.3067602217197418, - "rewards/rejected": -0.4314325749874115, - "rewards/rejected_friction": -0.36690422892570496, - "step": 1945 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.04, - "learning_rate": 3.813784485308557e-06, - "logits/chosen": -0.36211052536964417, - "logits/chosen_friction": -0.3685288727283478, - "logits/rejected": -0.35185134410858154, - "logits/rejected_friction": -0.33740484714508057, - "logps/chosen": -3.2314181327819824, - "logps/chosen_friction": -1.1138066053390503, - "logps/rejected": -5.094208717346191, - "logps/rejected_friction": -4.3044939041137695, - "loss": 0.0011, - "policy_friction_nll_loss": 1.1012948751449585, - "policy_nll_loss": 3.2081425189971924, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.26086220145225525, - "rewards/chosen_fricton": -0.06091637536883354, - "rewards/margins": 0.18321624398231506, - "rewards/margins_friction": 0.31504520773887634, - "rewards/rejected": -0.4440785050392151, - "rewards/rejected_friction": -0.37596163153648376, + "epoch": 0.59, + "learning_rate": 1.4502772460993387e-06, + "logits/chosen": -0.35049787163734436, + "logits/rejected": -0.3510446846485138, + "logps/chosen": -448.87518310546875, + "logps/rejected": -457.3994140625, + "loss": 0.491, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.827812910079956, + "rewards/margins": 1.2428455352783203, + "rewards/rejected": -4.0706586837768555, "step": 1950 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.04, - "learning_rate": 3.8082023029445204e-06, - "logits/chosen": -0.36342957615852356, - "logits/chosen_friction": -0.3703571557998657, - "logits/rejected": -0.35192936658859253, - "logits/rejected_friction": -0.3406491279602051, - "logps/chosen": -3.0994086265563965, - "logps/chosen_friction": -1.0978949069976807, - "logps/rejected": -5.003418445587158, - "logps/rejected_friction": -4.206614017486572, - "loss": 0.0008, - "policy_friction_nll_loss": 1.0915390253067017, - "policy_nll_loss": 3.0819616317749023, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24874010682106018, - "rewards/chosen_fricton": -0.0601508729159832, - "rewards/margins": 0.188583105802536, - "rewards/margins_friction": 0.30810707807540894, - "rewards/rejected": -0.4373231828212738, - "rewards/rejected_friction": -0.36825793981552124, - "step": 1955 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.05, - "learning_rate": 3.8026111243459362e-06, - "logits/chosen": -0.35238713026046753, - "logits/chosen_friction": -0.3602953851222992, - "logits/rejected": -0.33609071373939514, - "logits/rejected_friction": -0.33243221044540405, - "logps/chosen": -3.1369547843933105, - "logps/chosen_friction": -1.1064385175704956, - "logps/rejected": -5.157766819000244, - "logps/rejected_friction": -4.210993766784668, - "loss": 0.0008, - "policy_friction_nll_loss": 1.0916677713394165, - "policy_nll_loss": 3.101828098297119, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2549934983253479, - "rewards/chosen_fricton": -0.06374526768922806, - "rewards/margins": 0.19793197512626648, - "rewards/margins_friction": 0.3050730228424072, - "rewards/rejected": -0.452925443649292, - "rewards/rejected_friction": -0.3688182830810547, + "epoch": 0.6, + "learning_rate": 1.4257597331216211e-06, + "logits/chosen": -0.3531518578529358, + "logits/rejected": -0.3538290858268738, + "logps/chosen": -456.27691650390625, + "logps/rejected": -466.98687744140625, + "loss": 0.4657, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.0542407035827637, + "rewards/margins": 1.164147138595581, + "rewards/rejected": -4.218388080596924, "step": 1960 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.05, - "learning_rate": 3.7970109879621748e-06, - "logits/chosen": -0.35237017273902893, - "logits/chosen_friction": -0.36098381876945496, - "logits/rejected": -0.3344704806804657, - "logits/rejected_friction": -0.3339795172214508, - "logps/chosen": -2.951873302459717, - "logps/chosen_friction": -1.0662591457366943, - "logps/rejected": -4.980239391326904, - "logps/rejected_friction": -4.066709041595459, - "loss": 0.0011, - "policy_friction_nll_loss": 1.0485963821411133, - "policy_nll_loss": 2.904907464981079, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.23766891658306122, - "rewards/chosen_fricton": -0.060905821621418, - "rewards/margins": 0.19875681400299072, - "rewards/margins_friction": 0.2954152226448059, - "rewards/rejected": -0.43642574548721313, - "rewards/rejected_friction": -0.3563210368156433, - "step": 1965 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.05, - "learning_rate": 3.7914019323042056e-06, - "logits/chosen": -0.34765851497650146, - "logits/chosen_friction": -0.3547523617744446, - "logits/rejected": -0.334299236536026, - "logits/rejected_friction": -0.3264591693878174, - "logps/chosen": -3.174102783203125, - "logps/chosen_friction": -1.11860990524292, - "logps/rejected": -5.098637104034424, - "logps/rejected_friction": -4.225217342376709, - "loss": 0.0011, - "policy_friction_nll_loss": 1.1059350967407227, - "policy_nll_loss": 3.1393253803253174, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25610995292663574, - "rewards/chosen_fricton": -0.0625610500574112, - "rewards/margins": 0.18902145326137543, - "rewards/margins_friction": 0.3062191605567932, - "rewards/rejected": -0.44513139128685, - "rewards/rejected_friction": -0.3687801957130432, + "epoch": 0.6, + "learning_rate": 1.4013682870021594e-06, + "logits/chosen": -0.35849729180336, + "logits/rejected": -0.3595832884311676, + "logps/chosen": -447.4246520996094, + "logps/rejected": -460.0209045410156, + "loss": 0.3725, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -2.7747585773468018, + "rewards/margins": 1.452606439590454, + "rewards/rejected": -4.227365016937256, "step": 1970 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.05, - "learning_rate": 3.7857839959443353e-06, - "logits/chosen": -0.36117497086524963, - "logits/chosen_friction": -0.370960533618927, - "logits/rejected": -0.3469727635383606, - "logits/rejected_friction": -0.3430294096469879, - "logps/chosen": -3.05452036857605, - "logps/chosen_friction": -1.0633822679519653, - "logps/rejected": -5.044494152069092, - "logps/rejected_friction": -4.148152828216553, - "loss": 0.0008, - "policy_friction_nll_loss": 1.051626205444336, - "policy_nll_loss": 3.0228562355041504, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2479037046432495, - "rewards/chosen_fricton": -0.06042136624455452, - "rewards/margins": 0.19626213610172272, - "rewards/margins_friction": 0.3049968183040619, - "rewards/rejected": -0.4441658854484558, - "rewards/rejected_friction": -0.3654181659221649, - "step": 1975 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.06, - "learning_rate": 3.780157217515943e-06, - "logits/chosen": -0.3425987958908081, - "logits/chosen_friction": -0.34803327918052673, - "logits/rejected": -0.33437004685401917, - "logits/rejected_friction": -0.3227625787258148, - "logps/chosen": -3.1756911277770996, - "logps/chosen_friction": -1.1276358366012573, - "logps/rejected": -5.006478309631348, - "logps/rejected_friction": -4.198144435882568, - "loss": 0.0015, - "policy_friction_nll_loss": 1.1080461740493774, - "policy_nll_loss": 3.1369402408599854, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2572401463985443, - "rewards/chosen_fricton": -0.06458591669797897, - "rewards/margins": 0.1788850724697113, - "rewards/margins_friction": 0.30211132764816284, - "rewards/rejected": -0.4361252188682556, - "rewards/rejected_friction": -0.3666972219944, + "epoch": 0.6, + "learning_rate": 1.3771057701853034e-06, + "logits/chosen": -0.35135719180107117, + "logits/rejected": -0.3521498739719391, + "logps/chosen": -455.69549560546875, + "logps/rejected": -467.37591552734375, + "loss": 0.4899, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.884308338165283, + "rewards/margins": 1.336925983428955, + "rewards/rejected": -4.221234321594238, "step": 1980 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.06, - "learning_rate": 3.7745216357132088e-06, - "logits/chosen": -0.351408451795578, - "logits/chosen_friction": -0.359707236289978, - "logits/rejected": -0.33474668860435486, - "logits/rejected_friction": -0.3304193913936615, - "logps/chosen": -3.0707180500030518, - "logps/chosen_friction": -1.0779842138290405, - "logps/rejected": -5.054358005523682, - "logps/rejected_friction": -4.239457607269287, - "loss": 0.0008, - "policy_friction_nll_loss": 1.0573190450668335, - "policy_nll_loss": 3.0215630531311035, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24781827628612518, - "rewards/chosen_fricton": -0.060057543218135834, - "rewards/margins": 0.19510555267333984, - "rewards/margins_friction": 0.3121110796928406, - "rewards/rejected": -0.44292378425598145, - "rewards/rejected_friction": -0.37216857075691223, - "step": 1985 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.06, - "learning_rate": 3.7688772892908556e-06, - "logits/chosen": -0.3609718382358551, - "logits/chosen_friction": -0.3700377941131592, - "logits/rejected": -0.34504419565200806, - "logits/rejected_friction": -0.3412388861179352, - "logps/chosen": -3.0781304836273193, - "logps/chosen_friction": -1.0660462379455566, - "logps/rejected": -5.050809383392334, - "logps/rejected_friction": -4.147953987121582, - "loss": 0.0011, - "policy_friction_nll_loss": 1.0555188655853271, - "policy_nll_loss": 3.0512535572052, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25034356117248535, - "rewards/chosen_fricton": -0.06099918484687805, - "rewards/margins": 0.1939208209514618, - "rewards/margins_friction": 0.30405592918395996, - "rewards/rejected": -0.44426441192626953, - "rewards/rejected_friction": -0.3650550842285156, + "epoch": 0.6, + "learning_rate": 1.352975029984974e-06, + "logits/chosen": -0.3514239192008972, + "logits/rejected": -0.35260799527168274, + "logps/chosen": -441.4317932128906, + "logps/rejected": -454.58251953125, + "loss": 0.4829, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.8820691108703613, + "rewards/margins": 1.2067675590515137, + "rewards/rejected": -4.088836669921875, "step": 1990 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.06, - "learning_rate": 3.763224217063876e-06, - "logits/chosen": -0.3423737585544586, - "logits/chosen_friction": -0.34922799468040466, - "logits/rejected": -0.3286157548427582, - "logits/rejected_friction": -0.3198082745075226, - "logps/chosen": -3.086115837097168, - "logps/chosen_friction": -1.0802738666534424, - "logps/rejected": -5.065817356109619, - "logps/rejected_friction": -4.223278999328613, - "loss": 0.0021, - "policy_friction_nll_loss": 1.0617704391479492, - "policy_nll_loss": 3.0338072776794434, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24898485839366913, - "rewards/chosen_fricton": -0.05941743403673172, - "rewards/margins": 0.19473044574260712, - "rewards/margins_friction": 0.31060609221458435, - "rewards/rejected": -0.44371533393859863, - "rewards/rejected_friction": -0.3700234889984131, - "step": 1995 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.07, - "learning_rate": 3.75756245790727e-06, - "logits/chosen": -0.35395509004592896, - "logits/chosen_friction": -0.3622872233390808, - "logits/rejected": -0.33816754817962646, - "logits/rejected_friction": -0.33257240056991577, - "logps/chosen": -3.0820677280426025, - "logps/chosen_friction": -1.0715110301971436, - "logps/rejected": -5.011096000671387, - "logps/rejected_friction": -4.130929470062256, - "loss": 0.0009, - "policy_friction_nll_loss": 1.0582078695297241, - "policy_nll_loss": 3.0403385162353516, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24951860308647156, - "rewards/chosen_fricton": -0.06007007881999016, - "rewards/margins": 0.18945685029029846, - "rewards/margins_friction": 0.30175715684890747, - "rewards/rejected": -0.4389754831790924, - "rewards/rejected_friction": -0.36182719469070435, + "epoch": 0.61, + "learning_rate": 1.328978898250525e-06, + "logits/chosen": -0.3527238070964813, + "logits/rejected": -0.3534066379070282, + "logps/chosen": -452.95989990234375, + "logps/rejected": -464.698486328125, + "loss": 0.4476, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.7672972679138184, + "rewards/margins": 1.3264329433441162, + "rewards/rejected": -4.093730926513672, "step": 2000 }, { - "epoch": 1.07, - "eval_directrewards_student/accuracies": 1.0, - "eval_logits/chosen": -0.391663521528244, - "eval_logits/chosen_friction": -0.3990480899810791, - "eval_logits/rejected": -0.3762083649635315, - "eval_logits/rejected_friction": -0.366459459066391, - "eval_logps/chosen": -3.1534159183502197, - "eval_logps/chosen_friction": -1.1094056367874146, - "eval_logps/rejected": -5.095674991607666, - "eval_logps/rejected_friction": -4.205045700073242, - "eval_loss": 0.0009662566590122879, - "eval_policy_friction_nll_loss": 1.1094056367874146, - "eval_policy_nll_loss": 3.1534159183502197, - "eval_rewards/accuracies": 1.0, - "eval_rewards/accuracies_friction": 1.0, - "eval_rewards/chosen": -0.2553994357585907, - "eval_rewards/chosen_fricton": -0.06267184019088745, - "eval_rewards/margins": 0.19074808061122894, - "eval_rewards/margins_friction": 0.30541688203811646, - "eval_rewards/rejected": -0.44614750146865845, - "eval_rewards/rejected_friction": -0.3680887222290039, - "eval_runtime": 354.4445, - "eval_samples_per_second": 1.411, - "eval_steps_per_second": 0.705, + "epoch": 0.61, + "eval_logits/chosen": -0.42317765951156616, + "eval_logits/rejected": -0.42379918694496155, + "eval_logps/chosen": -441.13116455078125, + "eval_logps/rejected": -451.5594177246094, + "eval_loss": 0.4796808958053589, + "eval_rewards/accuracies": 0.7559999823570251, + "eval_rewards/chosen": -2.823995590209961, + "eval_rewards/margins": 1.234041452407837, + "eval_rewards/rejected": -4.058037281036377, + "eval_runtime": 351.7707, + "eval_samples_per_second": 1.421, + "eval_steps_per_second": 1.421, "step": 2000 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.07, - "learning_rate": 3.7518920507557755e-06, - "logits/chosen": -0.3577654957771301, - "logits/chosen_friction": -0.36554139852523804, - "logits/rejected": -0.34247857332229614, - "logits/rejected_friction": -0.3365681767463684, - "logps/chosen": -3.166908025741577, - "logps/chosen_friction": -1.0889966487884521, - "logps/rejected": -5.156120300292969, - "logps/rejected_friction": -4.237732410430908, - "loss": 0.001, - "policy_friction_nll_loss": 1.0765361785888672, - "policy_nll_loss": 3.138902187347412, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2563958764076233, - "rewards/chosen_fricton": -0.06094469502568245, - "rewards/margins": 0.19476313889026642, - "rewards/margins_friction": 0.3098185956478119, - "rewards/rejected": -0.4511590003967285, - "rewards/rejected_friction": -0.37076330184936523, - "step": 2005 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.07, - "learning_rate": 3.7462130346036e-06, - "logits/chosen": -0.3497234284877777, - "logits/chosen_friction": -0.35699304938316345, - "logits/rejected": -0.3366973400115967, - "logits/rejected_friction": -0.3277578353881836, - "logps/chosen": -3.2123100757598877, - "logps/chosen_friction": -1.0950678586959839, - "logps/rejected": -5.156116962432861, - "logps/rejected_friction": -4.33667516708374, - "loss": 0.0008, - "policy_friction_nll_loss": 1.0807693004608154, - "policy_nll_loss": 3.1781222820281982, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2590380311012268, - "rewards/chosen_fricton": -0.05972331017255783, - "rewards/margins": 0.19054469466209412, - "rewards/margins_friction": 0.3197977840900421, - "rewards/rejected": -0.4495827555656433, - "rewards/rejected_friction": -0.37952107191085815, + "epoch": 0.61, + "learning_rate": 1.305120191034409e-06, + "logits/chosen": -0.34321507811546326, + "logits/rejected": -0.343815416097641, + "logps/chosen": -443.4376525878906, + "logps/rejected": -452.2301330566406, + "loss": 0.4223, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.8989710807800293, + "rewards/margins": 1.322284460067749, + "rewards/rejected": -4.221255302429199, "step": 2010 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.07, - "learning_rate": 3.740525448504154e-06, - "logits/chosen": -0.3587339520454407, - "logits/chosen_friction": -0.36404794454574585, - "logits/rejected": -0.34877970814704895, - "logits/rejected_friction": -0.3341774046421051, - "logps/chosen": -3.2771987915039062, - "logps/chosen_friction": -1.0921905040740967, - "logps/rejected": -5.105399131774902, - "logps/rejected_friction": -4.333439350128174, - "loss": 0.0017, - "policy_friction_nll_loss": 1.0842286348342896, - "policy_nll_loss": 3.2527318000793457, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.263894647359848, - "rewards/chosen_fricton": -0.057329945266246796, - "rewards/margins": 0.17927180230617523, - "rewards/margins_friction": 0.31959354877471924, - "rewards/rejected": -0.44316643476486206, - "rewards/rejected_friction": -0.37692350149154663, - "step": 2015 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.08, - "learning_rate": 3.7348293315697813e-06, - "logits/chosen": -0.3552873432636261, - "logits/chosen_friction": -0.3629855513572693, - "logits/rejected": -0.34304919838905334, - "logits/rejected_friction": -0.3345158100128174, - "logps/chosen": -3.1632046699523926, - "logps/chosen_friction": -1.098980188369751, - "logps/rejected": -5.074326038360596, - "logps/rejected_friction": -4.2218098640441895, - "loss": 0.001, - "policy_friction_nll_loss": 1.091110348701477, - "policy_nll_loss": 3.138714551925659, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2549850344657898, - "rewards/chosen_fricton": -0.060802023857831955, - "rewards/margins": 0.18746179342269897, - "rewards/margins_friction": 0.30781155824661255, - "rewards/rejected": -0.4424467980861664, - "rewards/rejected_friction": -0.368613600730896, + "epoch": 0.61, + "learning_rate": 1.2814017082617025e-06, + "logits/chosen": -0.3508697748184204, + "logits/rejected": -0.35247209668159485, + "logps/chosen": -444.38641357421875, + "logps/rejected": -456.51153564453125, + "loss": 0.4284, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.6340365409851074, + "rewards/margins": 1.4381511211395264, + "rewards/rejected": -4.072187900543213, "step": 2020 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.08, - "learning_rate": 3.729124722971491e-06, - "logits/chosen": -0.35744890570640564, - "logits/chosen_friction": -0.36485859751701355, - "logits/rejected": -0.3384266197681427, - "logits/rejected_friction": -0.33663254976272583, - "logps/chosen": -3.0332818031311035, - "logps/chosen_friction": -1.0540258884429932, - "logps/rejected": -5.05654764175415, - "logps/rejected_friction": -4.031783103942871, - "loss": 0.0009, - "policy_friction_nll_loss": 1.0325716733932495, - "policy_nll_loss": 2.985109806060791, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24676987528800964, - "rewards/chosen_fricton": -0.05962783098220825, - "rewards/margins": 0.1997208297252655, - "rewards/margins_friction": 0.2946571707725525, - "rewards/rejected": -0.44649070501327515, - "rewards/rejected_friction": -0.35428500175476074, - "step": 2025 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.08, - "learning_rate": 3.7234116619386875e-06, - "logits/chosen": -0.35892122983932495, - "logits/chosen_friction": -0.3670521676540375, - "logits/rejected": -0.3393634259700775, - "logits/rejected_friction": -0.33667317032814026, - "logps/chosen": -3.1944611072540283, - "logps/chosen_friction": -1.0599511861801147, - "logps/rejected": -5.20877742767334, - "logps/rejected_friction": -4.144543647766113, - "loss": 0.0007, - "policy_friction_nll_loss": 1.0471951961517334, - "policy_nll_loss": 3.1637825965881348, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2606923282146454, - "rewards/chosen_fricton": -0.06017500162124634, - "rewards/margins": 0.19785478711128235, - "rewards/margins_friction": 0.3039926290512085, - "rewards/rejected": -0.45854711532592773, - "rewards/rejected_friction": -0.36416763067245483, + "epoch": 0.62, + "learning_rate": 1.2578262334015201e-06, + "logits/chosen": -0.34914684295654297, + "logits/rejected": -0.35076671838760376, + "logps/chosen": -441.771728515625, + "logps/rejected": -457.03057861328125, + "loss": 0.4234, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.6355042457580566, + "rewards/margins": 1.4987059831619263, + "rewards/rejected": -4.134210109710693, "step": 2030 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.09, - "learning_rate": 3.7176901877589e-06, - "logits/chosen": -0.3614543080329895, - "logits/chosen_friction": -0.36854279041290283, - "logits/rejected": -0.34584903717041016, - "logits/rejected_friction": -0.3392640948295593, - "logps/chosen": -3.2224297523498535, - "logps/chosen_friction": -1.0981355905532837, - "logps/rejected": -5.15081262588501, - "logps/rejected_friction": -4.242194652557373, - "loss": 0.0009, - "policy_friction_nll_loss": 1.0904111862182617, - "policy_nll_loss": 3.2089405059814453, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.26224514842033386, - "rewards/chosen_fricton": -0.061390649527311325, - "rewards/margins": 0.1889020949602127, - "rewards/margins_friction": 0.30973026156425476, - "rewards/rejected": -0.4511471688747406, - "rewards/rejected_friction": -0.3711208999156952, - "step": 2035 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.09, - "learning_rate": 3.7119603397775144e-06, - "logits/chosen": -0.3532547056674957, - "logits/chosen_friction": -0.3607575297355652, - "logits/rejected": -0.3357606530189514, - "logits/rejected_friction": -0.3322365880012512, - "logps/chosen": -3.0995893478393555, - "logps/chosen_friction": -1.080482006072998, - "logps/rejected": -5.075872898101807, - "logps/rejected_friction": -4.149783134460449, - "loss": 0.0033, - "policy_friction_nll_loss": 1.062032699584961, - "policy_nll_loss": 3.057141065597534, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25025442242622375, - "rewards/chosen_fricton": -0.060344986617565155, - "rewards/margins": 0.19471962749958038, - "rewards/margins_friction": 0.303230881690979, - "rewards/rejected": -0.44497400522232056, - "rewards/rejected_friction": -0.36357584595680237, + "epoch": 0.62, + "learning_rate": 1.234396533140365e-06, + "logits/chosen": -0.3611491024494171, + "logits/rejected": -0.3617832660675049, + "logps/chosen": -454.31951904296875, + "logps/rejected": -467.4290466308594, + "loss": 0.435, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.7407212257385254, + "rewards/margins": 1.4744737148284912, + "rewards/rejected": -4.215195178985596, "step": 2040 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.09, - "learning_rate": 3.7062221573975003e-06, - "logits/chosen": -0.3670308291912079, - "logits/chosen_friction": -0.37422674894332886, - "logits/rejected": -0.34953388571739197, - "logits/rejected_friction": -0.3457208573818207, - "logps/chosen": -3.1311986446380615, - "logps/chosen_friction": -1.0851972103118896, - "logps/rejected": -5.078215599060059, - "logps/rejected_friction": -4.159598350524902, - "loss": 0.0007, - "policy_friction_nll_loss": 1.0773115158081055, - "policy_nll_loss": 3.1148085594177246, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25475379824638367, - "rewards/chosen_fricton": -0.06171612814068794, - "rewards/margins": 0.19117248058319092, - "rewards/margins_friction": 0.3031601011753082, - "rewards/rejected": -0.4459262788295746, - "rewards/rejected_friction": -0.3648762106895447, - "step": 2045 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.09, - "learning_rate": 3.7004756800791424e-06, - "logits/chosen": -0.3629576563835144, - "logits/chosen_friction": -0.36979353427886963, - "logits/rejected": -0.34663599729537964, - "logits/rejected_friction": -0.3422054052352905, - "logps/chosen": -3.2022037506103516, - "logps/chosen_friction": -1.1486176252365112, - "logps/rejected": -5.218056678771973, - "logps/rejected_friction": -4.222769737243652, - "loss": 0.0012, - "policy_friction_nll_loss": 1.1321063041687012, - "policy_nll_loss": 3.174116849899292, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2601475119590759, - "rewards/chosen_fricton": -0.06674204766750336, - "rewards/margins": 0.19881713390350342, - "rewards/margins_friction": 0.30410146713256836, - "rewards/rejected": -0.45896467566490173, - "rewards/rejected_friction": -0.37084347009658813, + "epoch": 0.62, + "learning_rate": 1.2111153570574454e-06, + "logits/chosen": -0.35015982389450073, + "logits/rejected": -0.35119912028312683, + "logps/chosen": -446.706787109375, + "logps/rejected": -461.330810546875, + "loss": 0.4095, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.59765625, + "rewards/margins": 1.5183773040771484, + "rewards/rejected": -4.116034030914307, "step": 2050 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.1, - "learning_rate": 3.694720947339767e-06, - "logits/chosen": -0.35634782910346985, - "logits/chosen_friction": -0.3622613549232483, - "logits/rejected": -0.3436570465564728, - "logits/rejected_friction": -0.33313465118408203, - "logps/chosen": -3.1957404613494873, - "logps/chosen_friction": -1.0874526500701904, - "logps/rejected": -5.121479511260986, - "logps/rejected_friction": -4.244449138641357, - "loss": 0.001, - "policy_friction_nll_loss": 1.077108383178711, - "policy_nll_loss": 3.1650118827819824, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25869446992874146, - "rewards/chosen_fricton": -0.05890227481722832, - "rewards/margins": 0.18887045979499817, - "rewards/margins_friction": 0.311598002910614, - "rewards/rejected": -0.44756489992141724, - "rewards/rejected_friction": -0.3705002963542938, - "step": 2055 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.1, - "learning_rate": 3.6889579987534723e-06, - "logits/chosen": -0.35917600989341736, - "logits/chosen_friction": -0.3621653914451599, - "logits/rejected": -0.3495846390724182, - "logits/rejected_friction": -0.33377566933631897, - "logps/chosen": -3.2238364219665527, - "logps/chosen_friction": -1.137844443321228, - "logps/rejected": -5.014621734619141, - "logps/rejected_friction": -4.273080348968506, - "loss": 0.0007, - "policy_friction_nll_loss": 1.1277639865875244, - "policy_nll_loss": 3.201425552368164, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2592436671257019, - "rewards/chosen_fricton": -0.061976708471775055, - "rewards/margins": 0.1759762018918991, - "rewards/margins_friction": 0.30981454253196716, - "rewards/rejected": -0.4352199137210846, - "rewards/rejected_friction": -0.3717912435531616, + "epoch": 0.63, + "learning_rate": 1.187985437301999e-06, + "logits/chosen": -0.35530123114585876, + "logits/rejected": -0.35578909516334534, + "logps/chosen": -438.29974365234375, + "logps/rejected": -452.89483642578125, + "loss": 0.4416, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.729788303375244, + "rewards/margins": 1.507673978805542, + "rewards/rejected": -4.237462043762207, "step": 2060 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.1, - "learning_rate": 3.683186873950854e-06, - "logits/chosen": -0.3548794686794281, - "logits/chosen_friction": -0.3605293333530426, - "logits/rejected": -0.34377920627593994, - "logits/rejected_friction": -0.33337217569351196, - "logps/chosen": -3.152297019958496, - "logps/chosen_friction": -1.0938844680786133, - "logps/rejected": -5.112492561340332, - "logps/rejected_friction": -4.229168891906738, - "loss": 0.0012, - "policy_friction_nll_loss": 1.0780763626098633, - "policy_nll_loss": 3.122596025466919, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2535160183906555, - "rewards/chosen_fricton": -0.05940442159771919, - "rewards/margins": 0.19352290034294128, - "rewards/margins_friction": 0.3103941082954407, - "rewards/rejected": -0.4470389485359192, - "rewards/rejected_friction": -0.3697985112667084, - "step": 2065 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.1, - "learning_rate": 3.677407612618734e-06, - "logits/chosen": -0.353793203830719, - "logits/chosen_friction": -0.3597516417503357, - "logits/rejected": -0.34103184938430786, - "logits/rejected_friction": -0.3321821093559265, - "logps/chosen": -3.1352479457855225, - "logps/chosen_friction": -1.085289716720581, - "logps/rejected": -5.091660499572754, - "logps/rejected_friction": -4.179171562194824, - "loss": 0.0007, - "policy_friction_nll_loss": 1.0607621669769287, - "policy_nll_loss": 3.0729820728302, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25395509600639343, - "rewards/chosen_fricton": -0.06013410538434982, - "rewards/margins": 0.19234856963157654, - "rewards/margins_friction": 0.30566245317459106, - "rewards/rejected": -0.44630369544029236, - "rewards/rejected_friction": -0.36579659581184387, + "epoch": 0.63, + "learning_rate": 1.1650094882726599e-06, + "logits/chosen": -0.36762434244155884, + "logits/rejected": -0.36925989389419556, + "logps/chosen": -455.1209411621094, + "logps/rejected": -469.49920654296875, + "loss": 0.4061, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.913037061691284, + "rewards/margins": 1.4348491430282593, + "rewards/rejected": -4.347886562347412, "step": 2070 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.11, - "learning_rate": 3.6716202544998885e-06, - "logits/chosen": -0.3699076771736145, - "logits/chosen_friction": -0.3777405619621277, - "logits/rejected": -0.35556119680404663, - "logits/rejected_friction": -0.3493996262550354, - "logps/chosen": -3.0565621852874756, - "logps/chosen_friction": -1.055999994277954, - "logps/rejected": -5.051729679107666, - "logps/rejected_friction": -4.135456085205078, - "loss": 0.0007, - "policy_friction_nll_loss": 1.0448200702667236, - "policy_nll_loss": 3.0257816314697266, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24664607644081116, - "rewards/chosen_fricton": -0.05877421423792839, - "rewards/margins": 0.196357861161232, - "rewards/margins_friction": 0.3040851056575775, - "rewards/rejected": -0.44300398230552673, - "rewards/rejected_friction": -0.3628593385219574, - "step": 2075 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.11, - "learning_rate": 3.6658248393927727e-06, - "logits/chosen": -0.3577764630317688, - "logits/chosen_friction": -0.36322981119155884, - "logits/rejected": -0.34462738037109375, - "logits/rejected_friction": -0.3343071937561035, - "logps/chosen": -3.134507894515991, - "logps/chosen_friction": -1.064667820930481, - "logps/rejected": -5.049853324890137, - "logps/rejected_friction": -4.13385009765625, - "loss": 0.001, - "policy_friction_nll_loss": 1.057470679283142, - "policy_nll_loss": 3.106142044067383, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2555690109729767, - "rewards/chosen_fricton": -0.0596783384680748, - "rewards/margins": 0.18849417567253113, - "rewards/margins_friction": 0.3032028079032898, - "rewards/rejected": -0.4440631866455078, - "rewards/rejected_friction": -0.3628811836242676, + "epoch": 0.63, + "learning_rate": 1.1421902062989178e-06, + "logits/chosen": -0.3690846264362335, + "logits/rejected": -0.3703765869140625, + "logps/chosen": -451.37860107421875, + "logps/rejected": -462.94293212890625, + "loss": 0.4399, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.7418205738067627, + "rewards/margins": 1.415470838546753, + "rewards/rejected": -4.157290935516357, "step": 2080 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.11, - "learning_rate": 3.660021407151248e-06, - "logits/chosen": -0.34077152609825134, - "logits/chosen_friction": -0.34363603591918945, - "logits/rejected": -0.32893332839012146, - "logits/rejected_friction": -0.3170956075191498, - "logps/chosen": -3.259356737136841, - "logps/chosen_friction": -1.1324530839920044, - "logps/rejected": -5.202783107757568, - "logps/rejected_friction": -4.3680949211120605, - "loss": 0.0012, - "policy_friction_nll_loss": 1.111027717590332, - "policy_nll_loss": 3.2206242084503174, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.26276591420173645, - "rewards/chosen_fricton": -0.06201593205332756, - "rewards/margins": 0.19044385850429535, - "rewards/margins_friction": 0.3187122941017151, - "rewards/rejected": -0.4532097280025482, - "rewards/rejected_friction": -0.38072818517684937, - "step": 2085 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.11, - "learning_rate": 3.654209997684308e-06, - "logits/chosen": -0.35799577832221985, - "logits/chosen_friction": -0.36486488580703735, - "logits/rejected": -0.3426213264465332, - "logits/rejected_friction": -0.3364567756652832, - "logps/chosen": -3.145435333251953, - "logps/chosen_friction": -1.1049630641937256, - "logps/rejected": -5.103873252868652, - "logps/rejected_friction": -4.166511058807373, - "loss": 0.0011, - "policy_friction_nll_loss": 1.089216947555542, - "policy_nll_loss": 3.1139161586761475, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25605902075767517, - "rewards/chosen_fricton": -0.06318672001361847, - "rewards/margins": 0.1925140917301178, - "rewards/margins_friction": 0.30240362882614136, - "rewards/rejected": -0.44857311248779297, - "rewards/rejected_friction": -0.3655903935432434, + "epoch": 0.63, + "learning_rate": 1.1195302693246879e-06, + "logits/chosen": -0.34830474853515625, + "logits/rejected": -0.34976112842559814, + "logps/chosen": -447.49261474609375, + "logps/rejected": -460.62994384765625, + "loss": 0.4744, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.9276537895202637, + "rewards/margins": 1.2483450174331665, + "rewards/rejected": -4.175999164581299, "step": 2090 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.12, - "learning_rate": 3.6483906509558054e-06, - "logits/chosen": -0.3472575545310974, - "logits/chosen_friction": -0.3525816798210144, - "logits/rejected": -0.3322084844112396, - "logits/rejected_friction": -0.3238278031349182, - "logps/chosen": -3.20402455329895, - "logps/chosen_friction": -1.087805151939392, - "logps/rejected": -5.126701354980469, - "logps/rejected_friction": -4.19939661026001, - "loss": 0.0007, - "policy_friction_nll_loss": 1.0770318508148193, - "policy_nll_loss": 3.1765899658203125, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25943297147750854, - "rewards/chosen_fricton": -0.05867725610733032, - "rewards/margins": 0.18941886723041534, - "rewards/margins_friction": 0.30777865648269653, - "rewards/rejected": -0.4488518238067627, - "rewards/rejected_friction": -0.36645588278770447, - "step": 2095 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.12, - "learning_rate": 3.642563406984173e-06, - "logits/chosen": -0.35509827733039856, - "logits/chosen_friction": -0.3613711893558502, - "logits/rejected": -0.3419992923736572, - "logits/rejected_friction": -0.33395954966545105, - "logps/chosen": -3.17252779006958, - "logps/chosen_friction": -1.0940622091293335, - "logps/rejected": -5.09922981262207, - "logps/rejected_friction": -4.1111040115356445, - "loss": 0.0012, - "policy_friction_nll_loss": 1.0785280466079712, - "policy_nll_loss": 3.1412365436553955, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25739479064941406, - "rewards/chosen_fricton": -0.061729639768600464, - "rewards/margins": 0.1895264834165573, - "rewards/margins_friction": 0.2978770136833191, - "rewards/rejected": -0.44692128896713257, - "rewards/rejected_friction": -0.35960668325424194, + "epoch": 0.64, + "learning_rate": 1.0970323365940443e-06, + "logits/chosen": -0.358784556388855, + "logits/rejected": -0.35958269238471985, + "logps/chosen": -449.94482421875, + "logps/rejected": -461.4793395996094, + "loss": 0.456, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.9514570236206055, + "rewards/margins": 1.3440725803375244, + "rewards/rejected": -4.295529365539551, "step": 2100 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.12, - "learning_rate": 3.6367283058421544e-06, - "logits/chosen": -0.35402265191078186, - "logits/chosen_friction": -0.361591637134552, - "logits/rejected": -0.34036898612976074, - "logits/rejected_friction": -0.3340233564376831, - "logps/chosen": -3.0984854698181152, - "logps/chosen_friction": -1.0931636095046997, - "logps/rejected": -5.089953422546387, - "logps/rejected_friction": -4.253872871398926, - "loss": 0.0008, - "policy_friction_nll_loss": 1.0714659690856934, - "policy_nll_loss": 3.0504164695739746, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24938654899597168, - "rewards/chosen_fricton": -0.06023000553250313, - "rewards/margins": 0.19663909077644348, - "rewards/margins_friction": 0.3126344084739685, - "rewards/rejected": -0.44602566957473755, - "rewards/rejected_friction": -0.37286442518234253, - "step": 2105 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.13, - "learning_rate": 3.6308853876565232e-06, - "logits/chosen": -0.36828410625457764, - "logits/chosen_friction": -0.3746894896030426, - "logits/rejected": -0.3530122637748718, - "logits/rejected_friction": -0.34568271040916443, - "logps/chosen": -3.10139799118042, - "logps/chosen_friction": -1.0840427875518799, - "logps/rejected": -5.103480339050293, - "logps/rejected_friction": -4.104491233825684, - "loss": 0.0008, - "policy_friction_nll_loss": 1.075092077255249, - "policy_nll_loss": 3.0795340538024902, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2520536184310913, - "rewards/chosen_fricton": -0.06170913577079773, - "rewards/margins": 0.19629046320915222, - "rewards/margins_friction": 0.2974157929420471, - "rewards/rejected": -0.4483441412448883, - "rewards/rejected_friction": -0.35912495851516724, + "epoch": 0.64, + "learning_rate": 1.0746990483391414e-06, + "logits/chosen": -0.3496165871620178, + "logits/rejected": -0.3507440388202667, + "logps/chosen": -453.05755615234375, + "logps/rejected": -464.02978515625, + "loss": 0.429, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.848665237426758, + "rewards/margins": 1.313674807548523, + "rewards/rejected": -4.162339687347412, "step": 2110 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.13, - "learning_rate": 3.6250346926078093e-06, - "logits/chosen": -0.36817753314971924, - "logits/chosen_friction": -0.37549930810928345, - "logits/rejected": -0.34815555810928345, - "logits/rejected_friction": -0.3460944592952728, - "logps/chosen": -2.9412474632263184, - "logps/chosen_friction": -1.024954915046692, - "logps/rejected": -4.943512916564941, - "logps/rejected_friction": -3.9437553882598877, - "loss": 0.0008, - "policy_friction_nll_loss": 1.0196268558502197, - "policy_nll_loss": 2.922393560409546, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2382185161113739, - "rewards/chosen_fricton": -0.05767266824841499, - "rewards/margins": 0.1977483183145523, - "rewards/margins_friction": 0.2889152467250824, - "rewards/rejected": -0.435966819524765, - "rewards/rejected_friction": -0.3465878963470459, - "step": 2115 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.13, - "learning_rate": 3.6191762609300247e-06, - "logits/chosen": -0.35318490862846375, - "logits/chosen_friction": -0.3605826199054718, - "logits/rejected": -0.3405068516731262, - "logits/rejected_friction": -0.332461953163147, - "logps/chosen": -3.169260263442993, - "logps/chosen_friction": -1.0960675477981567, - "logps/rejected": -5.115991115570068, - "logps/rejected_friction": -4.166688442230225, - "loss": 0.0008, - "policy_friction_nll_loss": 1.0855090618133545, - "policy_nll_loss": 3.136274814605713, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2561270594596863, - "rewards/chosen_fricton": -0.06104712933301926, - "rewards/margins": 0.19096747040748596, - "rewards/margins_friction": 0.302639365196228, - "rewards/rejected": -0.4470944404602051, - "rewards/rejected_friction": -0.3636864721775055, + "epoch": 0.64, + "learning_rate": 1.052533025470379e-06, + "logits/chosen": -0.3463028073310852, + "logits/rejected": -0.34690287709236145, + "logps/chosen": -443.31134033203125, + "logps/rejected": -455.8688049316406, + "loss": 0.4229, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.8174338340759277, + "rewards/margins": 1.4426196813583374, + "rewards/rejected": -4.260054111480713, "step": 2120 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.13, - "learning_rate": 3.613310132910381e-06, - "logits/chosen": -0.34383344650268555, - "logits/chosen_friction": -0.3515166640281677, - "logits/rejected": -0.3306904733181, - "logits/rejected_friction": -0.32238301634788513, - "logps/chosen": -3.175543785095215, - "logps/chosen_friction": -1.0961602926254272, - "logps/rejected": -5.14786434173584, - "logps/rejected_friction": -4.219443321228027, - "loss": 0.0007, - "policy_friction_nll_loss": 1.0765284299850464, - "policy_nll_loss": 3.1216843128204346, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2567747235298157, - "rewards/chosen_fricton": -0.05991360545158386, - "rewards/margins": 0.19433745741844177, - "rewards/margins_friction": 0.30897465348243713, - "rewards/rejected": -0.45111218094825745, - "rewards/rejected_friction": -0.368888258934021, - "step": 2125 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.14, - "learning_rate": 3.6074363488890184e-06, - "logits/chosen": -0.3658985495567322, - "logits/chosen_friction": -0.36903446912765503, - "logits/rejected": -0.3504960238933563, - "logits/rejected_friction": -0.3394075632095337, - "logps/chosen": -3.2586617469787598, - "logps/chosen_friction": -1.0906455516815186, - "logps/rejected": -5.156571388244629, - "logps/rejected_friction": -4.198966026306152, - "loss": 0.0008, - "policy_friction_nll_loss": 1.0805646181106567, - "policy_nll_loss": 3.232179641723633, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.26350224018096924, - "rewards/chosen_fricton": -0.05884348601102829, - "rewards/margins": 0.18711228668689728, - "rewards/margins_friction": 0.3076232969760895, - "rewards/rejected": -0.4506145119667053, - "rewards/rejected_friction": -0.36646682024002075, + "epoch": 0.65, + "learning_rate": 1.0305368692688175e-06, + "logits/chosen": -0.3607487082481384, + "logits/rejected": -0.36121565103530884, + "logps/chosen": -459.01824951171875, + "logps/rejected": -472.31884765625, + "loss": 0.4488, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.872823715209961, + "rewards/margins": 1.4165146350860596, + "rewards/rejected": -4.2893385887146, "step": 2130 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.14, - "learning_rate": 3.6015549492587253e-06, - "logits/chosen": -0.35221344232559204, - "logits/chosen_friction": -0.3584277033805847, - "logits/rejected": -0.34183835983276367, - "logits/rejected_friction": -0.3308924436569214, - "logps/chosen": -3.149615526199341, - "logps/chosen_friction": -1.0882593393325806, - "logps/rejected": -5.130424499511719, - "logps/rejected_friction": -4.257571220397949, - "loss": 0.001, - "policy_friction_nll_loss": 1.0688114166259766, - "policy_nll_loss": 3.1019599437713623, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2538009285926819, - "rewards/chosen_fricton": -0.05936942249536514, - "rewards/margins": 0.1946263462305069, - "rewards/margins_friction": 0.3127365708351135, - "rewards/rejected": -0.4484272599220276, - "rewards/rejected_friction": -0.3721059560775757, - "step": 2135 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.14, - "learning_rate": 3.5956659744646617e-06, - "logits/chosen": -0.35169538855552673, - "logits/chosen_friction": -0.35865676403045654, - "logits/rejected": -0.33602604269981384, - "logits/rejected_friction": -0.3317912518978119, - "logps/chosen": -3.0904736518859863, - "logps/chosen_friction": -1.0337055921554565, - "logps/rejected": -5.099551677703857, - "logps/rejected_friction": -4.102230548858643, - "loss": 0.0008, - "policy_friction_nll_loss": 1.021358609199524, - "policy_nll_loss": 3.057163715362549, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2509838342666626, - "rewards/chosen_fricton": -0.057298243045806885, - "rewards/margins": 0.1971920132637024, - "rewards/margins_friction": 0.302435964345932, - "rewards/rejected": -0.448175847530365, - "rewards/rejected_friction": -0.3597341775894165, + "epoch": 0.65, + "learning_rate": 1.0087131610809153e-06, + "logits/chosen": -0.34994029998779297, + "logits/rejected": -0.35072094202041626, + "logps/chosen": -442.97589111328125, + "logps/rejected": -453.756591796875, + "loss": 0.555, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.1791205406188965, + "rewards/margins": 1.0200657844543457, + "rewards/rejected": -4.1991868019104, "step": 2140 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.14, - "learning_rate": 3.589769465004079e-06, - "logits/chosen": -0.3550805449485779, - "logits/chosen_friction": -0.35786861181259155, - "logits/rejected": -0.34607571363449097, - "logits/rejected_friction": -0.3299782872200012, - "logps/chosen": -3.2711620330810547, - "logps/chosen_friction": -1.1650042533874512, - "logps/rejected": -5.144331455230713, - "logps/rejected_friction": -4.279609680175781, - "loss": 0.0025, - "policy_friction_nll_loss": 1.1490894556045532, - "policy_nll_loss": 3.2339141368865967, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.26199787855148315, - "rewards/chosen_fricton": -0.06414501368999481, - "rewards/margins": 0.1839958280324936, - "rewards/margins_friction": 0.3071526885032654, - "rewards/rejected": -0.4459937512874603, - "rewards/rejected_friction": -0.371297687292099, - "step": 2145 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.15, - "learning_rate": 3.5838654614260432e-06, - "logits/chosen": -0.35517385601997375, - "logits/chosen_friction": -0.36168041825294495, - "logits/rejected": -0.3407610356807709, - "logits/rejected_friction": -0.33400100469589233, - "logps/chosen": -3.0537753105163574, - "logps/chosen_friction": -1.0695993900299072, - "logps/rejected": -5.010882377624512, - "logps/rejected_friction": -4.091104507446289, - "loss": 0.0011, - "policy_friction_nll_loss": 1.0503180027008057, - "policy_nll_loss": 3.0108718872070312, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2475840151309967, - "rewards/chosen_fricton": -0.06095578521490097, - "rewards/margins": 0.19193199276924133, - "rewards/margins_friction": 0.297689288854599, - "rewards/rejected": -0.43951597809791565, - "rewards/rejected_friction": -0.35864511132240295, + "epoch": 0.65, + "learning_rate": 9.870644620155878e-07, + "logits/chosen": -0.35871225595474243, + "logits/rejected": -0.35941624641418457, + "logps/chosen": -454.1318359375, + "logps/rejected": -464.7295837402344, + "loss": 0.4462, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.9613711833953857, + "rewards/margins": 1.3269731998443604, + "rewards/rejected": -4.288344383239746, "step": 2150 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.15, - "learning_rate": 3.577954004331158e-06, - "logits/chosen": -0.3538157045841217, - "logits/chosen_friction": -0.35914522409439087, - "logits/rejected": -0.33978086709976196, - "logits/rejected_friction": -0.33178848028182983, - "logps/chosen": -3.1229140758514404, - "logps/chosen_friction": -1.0189430713653564, - "logps/rejected": -5.093026161193848, - "logps/rejected_friction": -4.224425315856934, - "loss": 0.001, - "policy_friction_nll_loss": 1.004822850227356, - "policy_nll_loss": 3.0853395462036133, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2530035674571991, - "rewards/chosen_fricton": -0.05504867434501648, - "rewards/margins": 0.19310741126537323, - "rewards/margins_friction": 0.31540173292160034, - "rewards/rejected": -0.4461110234260559, - "rewards/rejected_friction": -0.3704504072666168, - "step": 2155 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.15, - "learning_rate": 3.57203513437128e-06, - "logits/chosen": -0.3378422260284424, - "logits/chosen_friction": -0.3437091112136841, - "logits/rejected": -0.3239489197731018, - "logits/rejected_friction": -0.31565192341804504, - "logps/chosen": -3.1698741912841797, - "logps/chosen_friction": -1.0920830965042114, - "logps/rejected": -5.072753429412842, - "logps/rejected_friction": -4.2317304611206055, - "loss": 0.001, - "policy_friction_nll_loss": 1.0726616382598877, - "policy_nll_loss": 3.115246057510376, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25602614879608154, - "rewards/chosen_fricton": -0.05980774015188217, - "rewards/margins": 0.18633928894996643, - "rewards/margins_friction": 0.30919796228408813, - "rewards/rejected": -0.44236546754837036, - "rewards/rejected_friction": -0.3690056800842285, + "epoch": 0.66, + "learning_rate": 9.655933126436565e-07, + "logits/chosen": -0.3492319583892822, + "logits/rejected": -0.3505721092224121, + "logps/chosen": -444.28515625, + "logps/rejected": -456.6302185058594, + "loss": 0.4471, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.847562313079834, + "rewards/margins": 1.3418700695037842, + "rewards/rejected": -4.1894330978393555, "step": 2160 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.15, - "learning_rate": 3.5661088922492442e-06, - "logits/chosen": -0.35506629943847656, - "logits/chosen_friction": -0.36114588379859924, - "logits/rejected": -0.3398541808128357, - "logits/rejected_friction": -0.3337363004684448, - "logps/chosen": -3.1278343200683594, - "logps/chosen_friction": -1.0740940570831299, - "logps/rejected": -5.101932525634766, - "logps/rejected_friction": -4.174454689025879, - "loss": 0.001, - "policy_friction_nll_loss": 1.0710369348526, - "policy_nll_loss": 3.1093909740448, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.253683477640152, - "rewards/chosen_fricton": -0.05981310456991196, - "rewards/margins": 0.19411996006965637, - "rewards/margins_friction": 0.3058796525001526, - "rewards/rejected": -0.44780340790748596, - "rewards/rejected_friction": -0.36569273471832275, - "step": 2165 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.16, - "learning_rate": 3.560175318718583e-06, - "logits/chosen": -0.3577633500099182, - "logits/chosen_friction": -0.36628860235214233, - "logits/rejected": -0.3395858108997345, - "logits/rejected_friction": -0.33906227350234985, - "logps/chosen": -3.018399715423584, - "logps/chosen_friction": -1.014209508895874, - "logps/rejected": -5.057510852813721, - "logps/rejected_friction": -4.062264442443848, - "loss": 0.0007, - "policy_friction_nll_loss": 0.9961079359054565, - "policy_nll_loss": 2.962085247039795, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24679729342460632, - "rewards/chosen_fricton": -0.05770678073167801, - "rewards/margins": 0.20101284980773926, - "rewards/margins_friction": 0.3010788857936859, - "rewards/rejected": -0.44781017303466797, - "rewards/rejected_friction": -0.3587856590747833, + "epoch": 0.66, + "learning_rate": 9.443022326996984e-07, + "logits/chosen": -0.354257732629776, + "logits/rejected": -0.35464176535606384, + "logps/chosen": -444.35089111328125, + "logps/rejected": -453.5, + "loss": 0.4514, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.9952127933502197, + "rewards/margins": 1.3154346942901611, + "rewards/rejected": -4.310647487640381, "step": 2170 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.16, - "learning_rate": 3.554234454583244e-06, - "logits/chosen": -0.3600156605243683, - "logits/chosen_friction": -0.36591318249702454, - "logits/rejected": -0.34536975622177124, - "logits/rejected_friction": -0.33691638708114624, - "logps/chosen": -3.182370662689209, - "logps/chosen_friction": -1.069347620010376, - "logps/rejected": -5.092129230499268, - "logps/rejected_friction": -4.139839172363281, - "loss": 0.001, - "policy_friction_nll_loss": 1.0605661869049072, - "policy_nll_loss": 3.1521949768066406, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2590058445930481, - "rewards/chosen_fricton": -0.05849630758166313, - "rewards/margins": 0.18774035573005676, - "rewards/margins_friction": 0.3034437298774719, - "rewards/rejected": -0.44674617052078247, - "rewards/rejected_friction": -0.3619399964809418, - "step": 2175 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.16, - "learning_rate": 3.5482863406973127e-06, - "logits/chosen": -0.35983237624168396, - "logits/chosen_friction": -0.3666148781776428, - "logits/rejected": -0.34351590275764465, - "logits/rejected_friction": -0.3390960693359375, - "logps/chosen": -3.1406524181365967, - "logps/chosen_friction": -1.0624639987945557, - "logps/rejected": -5.0975542068481445, - "logps/rejected_friction": -4.103381156921387, - "loss": 0.0007, - "policy_friction_nll_loss": 1.0514816045761108, - "policy_nll_loss": 3.1125502586364746, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2550669312477112, - "rewards/chosen_fricton": -0.05939073488116264, - "rewards/margins": 0.1931925266981125, - "rewards/margins_friction": 0.3010750710964203, - "rewards/rejected": -0.4482594430446625, - "rewards/rejected_friction": -0.3604658246040344, + "epoch": 0.66, + "learning_rate": 9.231937207863459e-07, + "logits/chosen": -0.35797202587127686, + "logits/rejected": -0.3591151833534241, + "logps/chosen": -446.80487060546875, + "logps/rejected": -460.2659606933594, + "loss": 0.4346, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.878164768218994, + "rewards/margins": 1.2440316677093506, + "rewards/rejected": -4.122197151184082, "step": 2180 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.17, - "learning_rate": 3.5423310179647275e-06, - "logits/chosen": -0.3586091101169586, - "logits/chosen_friction": -0.36536678671836853, - "logits/rejected": -0.34194129705429077, - "logits/rejected_friction": -0.33718109130859375, - "logps/chosen": -3.130409002304077, - "logps/chosen_friction": -1.0626049041748047, - "logps/rejected": -5.080742835998535, - "logps/rejected_friction": -4.157718181610107, - "loss": 0.0008, - "policy_friction_nll_loss": 1.051827311515808, - "policy_nll_loss": 3.1057701110839844, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25406062602996826, - "rewards/chosen_fricton": -0.05894993990659714, - "rewards/margins": 0.19144734740257263, - "rewards/margins_friction": 0.30519938468933105, - "rewards/rejected": -0.4455080032348633, - "rewards/rejected_friction": -0.364149272441864, - "step": 2185 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.17, - "learning_rate": 3.536368527339002e-06, - "logits/chosen": -0.3594784140586853, - "logits/chosen_friction": -0.36441779136657715, - "logits/rejected": -0.3428528904914856, - "logits/rejected_friction": -0.3344501554965973, - "logps/chosen": -3.1455156803131104, - "logps/chosen_friction": -1.0993027687072754, - "logps/rejected": -5.13982629776001, - "logps/rejected_friction": -4.229061603546143, - "loss": 0.0008, - "policy_friction_nll_loss": 1.0870749950408936, - "policy_nll_loss": 3.1108789443969727, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2551157772541046, - "rewards/chosen_fricton": -0.06251100450754166, - "rewards/margins": 0.19602791965007782, - "rewards/margins_friction": 0.3083586096763611, - "rewards/rejected": -0.45114365220069885, - "rewards/rejected_friction": -0.37086957693099976, + "epoch": 0.67, + "learning_rate": 9.022702540810607e-07, + "logits/chosen": -0.3597440719604492, + "logits/rejected": -0.3606324791908264, + "logps/chosen": -450.4046325683594, + "logps/rejected": -460.8863830566406, + "loss": 0.4151, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.835644483566284, + "rewards/margins": 1.3725159168243408, + "rewards/rejected": -4.208160400390625, "step": 2190 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.17, - "learning_rate": 3.5303989098229407e-06, - "logits/chosen": -0.36524102091789246, - "logits/chosen_friction": -0.3704375624656677, - "logits/rejected": -0.34746286273002625, - "logits/rejected_friction": -0.34217768907546997, - "logps/chosen": -3.052935838699341, - "logps/chosen_friction": -1.0293465852737427, - "logps/rejected": -5.052632808685303, - "logps/rejected_friction": -4.08125638961792, - "loss": 0.0008, - "policy_friction_nll_loss": 1.0221877098083496, - "policy_nll_loss": 3.022488832473755, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2488909214735031, - "rewards/chosen_fricton": -0.057765714824199677, - "rewards/margins": 0.19722364842891693, - "rewards/margins_friction": 0.30172133445739746, - "rewards/rejected": -0.44611454010009766, - "rewards/rejected_friction": -0.35948699712753296, - "step": 2195 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.17, - "learning_rate": 3.524422206468359e-06, - "logits/chosen": -0.3541533350944519, - "logits/chosen_friction": -0.3602713644504547, - "logits/rejected": -0.3391427993774414, - "logits/rejected_friction": -0.3328729271888733, - "logps/chosen": -3.1116788387298584, - "logps/chosen_friction": -1.0715510845184326, - "logps/rejected": -5.048295974731445, - "logps/rejected_friction": -4.121532440185547, - "loss": 0.001, - "policy_friction_nll_loss": 1.0621309280395508, - "policy_nll_loss": 3.076078414916992, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25306785106658936, - "rewards/chosen_fricton": -0.060296762734651566, - "rewards/margins": 0.1899951845407486, - "rewards/margins_friction": 0.30072706937789917, - "rewards/rejected": -0.44306302070617676, - "rewards/rejected_friction": -0.36102384328842163, + "epoch": 0.67, + "learning_rate": 8.815342880454312e-07, + "logits/chosen": -0.3541966378688812, + "logits/rejected": -0.35494524240493774, + "logps/chosen": -455.19146728515625, + "logps/rejected": -470.131103515625, + "loss": 0.4702, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.059727430343628, + "rewards/margins": 1.261040449142456, + "rewards/rejected": -4.320767879486084, "step": 2200 }, { - "epoch": 1.17, - "eval_directrewards_student/accuracies": 1.0, - "eval_logits/chosen": -0.3910662531852722, - "eval_logits/chosen_friction": -0.39667990803718567, - "eval_logits/rejected": -0.3756091892719269, - "eval_logits/rejected_friction": -0.3646467626094818, - "eval_logps/chosen": -3.1567203998565674, - "eval_logps/chosen_friction": -1.0925096273422241, - "eval_logps/rejected": -5.12302827835083, - "eval_logps/rejected_friction": -4.20783805847168, - "eval_loss": 0.0008260383037850261, - "eval_policy_friction_nll_loss": 1.0925096273422241, - "eval_policy_nll_loss": 3.1567206382751465, - "eval_rewards/accuracies": 1.0, - "eval_rewards/accuracies_friction": 1.0, - "eval_rewards/chosen": -0.25572991371154785, - "eval_rewards/chosen_fricton": -0.060982245951890945, - "eval_rewards/margins": 0.19315297901630402, - "eval_rewards/margins_friction": 0.3073856830596924, - "eval_rewards/rejected": -0.4488828778266907, - "eval_rewards/rejected_friction": -0.36836791038513184, - "eval_runtime": 359.0213, - "eval_samples_per_second": 1.393, - "eval_steps_per_second": 0.696, + "epoch": 0.67, + "eval_logits/chosen": -0.4240322411060333, + "eval_logits/rejected": -0.4246600270271301, + "eval_logps/chosen": -441.68072509765625, + "eval_logps/rejected": -452.262451171875, + "eval_loss": 0.4791676104068756, + "eval_rewards/accuracies": 0.7599999904632568, + "eval_rewards/chosen": -2.878952980041504, + "eval_rewards/margins": 1.2493829727172852, + "eval_rewards/rejected": -4.128335475921631, + "eval_runtime": 351.6609, + "eval_samples_per_second": 1.422, + "eval_steps_per_second": 1.422, "step": 2200 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.18, - "learning_rate": 3.5184384583758014e-06, - "logits/chosen": -0.34230536222457886, - "logits/chosen_friction": -0.3468330204486847, - "logits/rejected": -0.3311873972415924, - "logits/rejected_friction": -0.3192118704319, - "logps/chosen": -3.1093497276306152, - "logps/chosen_friction": -1.088476300239563, - "logps/rejected": -5.034243583679199, - "logps/rejected_friction": -4.248931884765625, - "loss": 0.0006, - "policy_friction_nll_loss": 1.0674636363983154, - "policy_nll_loss": 3.055842876434326, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2501407265663147, - "rewards/chosen_fricton": -0.05997311323881149, - "rewards/margins": 0.18893814086914062, - "rewards/margins_friction": 0.3114513158798218, - "rewards/rejected": -0.4390788972377777, - "rewards/rejected_friction": -0.37142443656921387, - "step": 2205 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.18, - "learning_rate": 3.512447706694254e-06, - "logits/chosen": -0.3463706374168396, - "logits/chosen_friction": -0.35187381505966187, - "logits/rejected": -0.32909518480300903, - "logits/rejected_friction": -0.3222137987613678, - "logps/chosen": -3.2239813804626465, - "logps/chosen_friction": -1.09125554561615, - "logps/rejected": -5.184380054473877, - "logps/rejected_friction": -4.2401227951049805, - "loss": 0.0006, - "policy_friction_nll_loss": 1.0708640813827515, - "policy_nll_loss": 3.1714611053466797, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2625286877155304, - "rewards/chosen_fricton": -0.06067076325416565, - "rewards/margins": 0.19325773417949677, - "rewards/margins_friction": 0.3112563490867615, - "rewards/rejected": -0.4557863771915436, - "rewards/rejected_friction": -0.37192708253860474, + "epoch": 0.67, + "learning_rate": 8.609882561370101e-07, + "logits/chosen": -0.3556322455406189, + "logits/rejected": -0.35619792342185974, + "logps/chosen": -446.03204345703125, + "logps/rejected": -455.453125, + "loss": 0.4476, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.709972381591797, + "rewards/margins": 1.3839571475982666, + "rewards/rejected": -4.093929290771484, "step": 2210 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.18, - "learning_rate": 3.506449992620869e-06, - "logits/chosen": -0.3511253893375397, - "logits/chosen_friction": -0.357921838760376, - "logits/rejected": -0.33393850922584534, - "logits/rejected_friction": -0.3287371098995209, - "logps/chosen": -3.237710952758789, - "logps/chosen_friction": -1.1132657527923584, - "logps/rejected": -5.203339576721191, - "logps/rejected_friction": -4.194175720214844, - "loss": 0.0007, - "policy_friction_nll_loss": 1.098232626914978, - "policy_nll_loss": 3.2025370597839355, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.26103144884109497, - "rewards/chosen_fricton": -0.06070087477564812, - "rewards/margins": 0.19324883818626404, - "rewards/margins_friction": 0.30388468503952026, - "rewards/rejected": -0.4542803168296814, - "rewards/rejected_friction": -0.3645855784416199, - "step": 2215 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.18, - "learning_rate": 3.500445357400676e-06, - "logits/chosen": -0.33885979652404785, - "logits/chosen_friction": -0.3427988290786743, - "logits/rejected": -0.32523271441459656, - "logits/rejected_friction": -0.3159716725349426, - "logps/chosen": -3.162764549255371, - "logps/chosen_friction": -1.1082240343093872, - "logps/rejected": -5.056168556213379, - "logps/rejected_friction": -4.159665584564209, - "loss": 0.0033, - "policy_friction_nll_loss": 1.0861868858337402, - "policy_nll_loss": 3.113542318344116, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25506865978240967, - "rewards/chosen_fricton": -0.060645557940006256, - "rewards/margins": 0.1854279786348343, - "rewards/margins_friction": 0.3006284236907959, - "rewards/rejected": -0.44049662351608276, - "rewards/rejected_friction": -0.36127394437789917, + "epoch": 0.67, + "learning_rate": 8.406345695237394e-07, + "logits/chosen": -0.3541732430458069, + "logits/rejected": -0.35552269220352173, + "logps/chosen": -444.166015625, + "logps/rejected": -460.39617919921875, + "loss": 0.3845, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.691920757293701, + "rewards/margins": 1.684851884841919, + "rewards/rejected": -4.376772880554199, "step": 2220 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.19, - "learning_rate": 3.494433842326298e-06, - "logits/chosen": -0.3612160384654999, - "logits/chosen_friction": -0.36812537908554077, - "logits/rejected": -0.3425236642360687, - "logits/rejected_friction": -0.3412221670150757, - "logps/chosen": -3.0250792503356934, - "logps/chosen_friction": -1.0270823240280151, - "logps/rejected": -5.03117561340332, - "logps/rejected_friction": -4.010631084442139, - "loss": 0.003, - "policy_friction_nll_loss": 1.0150108337402344, - "policy_nll_loss": 2.9974217414855957, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24621930718421936, - "rewards/chosen_fricton": -0.05783824995160103, - "rewards/margins": 0.19661954045295715, - "rewards/margins_friction": 0.29365167021751404, - "rewards/rejected": -0.4428388476371765, - "rewards/rejected_friction": -0.35148993134498596, - "step": 2225 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.19, - "learning_rate": 3.488415488737673e-06, - "logits/chosen": -0.34973910450935364, - "logits/chosen_friction": -0.35402315855026245, - "logits/rejected": -0.3358108699321747, - "logits/rejected_friction": -0.32677367329597473, - "logps/chosen": -3.198850154876709, - "logps/chosen_friction": -1.1540958881378174, - "logps/rejected": -5.140152931213379, - "logps/rejected_friction": -4.251591682434082, - "loss": 0.0008, - "policy_friction_nll_loss": 1.1433441638946533, - "policy_nll_loss": 3.1654410362243652, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2582940459251404, - "rewards/chosen_fricton": -0.06578586995601654, - "rewards/margins": 0.19003592431545258, - "rewards/margins_friction": 0.30476322770118713, - "rewards/rejected": -0.44832998514175415, - "rewards/rejected_friction": -0.37054911255836487, + "epoch": 0.68, + "learning_rate": 8.20475616800985e-07, + "logits/chosen": -0.35582807660102844, + "logits/rejected": -0.35650044679641724, + "logps/chosen": -449.7290954589844, + "logps/rejected": -458.966064453125, + "loss": 0.498, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.0267415046691895, + "rewards/margins": 1.181206464767456, + "rewards/rejected": -4.207947254180908, "step": 2230 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.19, - "learning_rate": 3.482390338021764e-06, - "logits/chosen": -0.35535570979118347, - "logits/chosen_friction": -0.36097273230552673, - "logits/rejected": -0.339291512966156, - "logits/rejected_friction": -0.3315024971961975, - "logps/chosen": -3.215299129486084, - "logps/chosen_friction": -1.1157004833221436, - "logps/rejected": -5.189469337463379, - "logps/rejected_friction": -4.204017162322998, - "loss": 0.0008, - "policy_friction_nll_loss": 1.099465012550354, - "policy_nll_loss": 3.1796882152557373, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2600664794445038, - "rewards/chosen_fricton": -0.06197778508067131, - "rewards/margins": 0.19467571377754211, - "rewards/margins_friction": 0.305331289768219, - "rewards/rejected": -0.4547421336174011, - "rewards/rejected_friction": -0.3673090636730194, - "step": 2235 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.19, - "learning_rate": 3.4763584316122756e-06, - "logits/chosen": -0.3598187267780304, - "logits/chosen_friction": -0.3674456477165222, - "logits/rejected": -0.344076007604599, - "logits/rejected_friction": -0.3376028835773468, - "logps/chosen": -3.0594565868377686, - "logps/chosen_friction": -1.0804388523101807, - "logps/rejected": -5.017966270446777, - "logps/rejected_friction": -4.146507263183594, - "loss": 0.001, - "policy_friction_nll_loss": 1.070693850517273, - "policy_nll_loss": 3.037463426589966, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24610669910907745, - "rewards/chosen_fricton": -0.0603896789252758, - "rewards/margins": 0.1928800642490387, - "rewards/margins_friction": 0.3032401204109192, - "rewards/rejected": -0.43898677825927734, - "rewards/rejected_friction": -0.3636297583580017, + "epoch": 0.68, + "learning_rate": 8.005137637112303e-07, + "logits/chosen": -0.35746604204177856, + "logits/rejected": -0.35817286372184753, + "logps/chosen": -450.47308349609375, + "logps/rejected": -463.4517517089844, + "loss": 0.4951, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.940412759780884, + "rewards/margins": 1.3010506629943848, + "rewards/rejected": -4.2414631843566895, "step": 2240 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.2, - "learning_rate": 3.470319810989371e-06, - "logits/chosen": -0.35532885789871216, - "logits/chosen_friction": -0.3590385317802429, - "logits/rejected": -0.3396589457988739, - "logits/rejected_friction": -0.32997217774391174, - "logps/chosen": -3.233919143676758, - "logps/chosen_friction": -1.0919967889785767, - "logps/rejected": -5.181818008422852, - "logps/rejected_friction": -4.2259521484375, - "loss": 0.0005, - "policy_friction_nll_loss": 1.0796048641204834, - "policy_nll_loss": 3.2033934593200684, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2619197964668274, - "rewards/chosen_fricton": -0.05929756164550781, - "rewards/margins": 0.19104209542274475, - "rewards/margins_friction": 0.3089081048965454, - "rewards/rejected": -0.45296183228492737, - "rewards/rejected_friction": -0.3682056665420532, - "step": 2245 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.2, - "learning_rate": 3.464274517679386e-06, - "logits/chosen": -0.35243669152259827, - "logits/chosen_friction": -0.35875365138053894, - "logits/rejected": -0.3385311961174011, - "logits/rejected_friction": -0.33107990026474, - "logps/chosen": -3.139094829559326, - "logps/chosen_friction": -1.091201901435852, - "logps/rejected": -5.077291011810303, - "logps/rejected_friction": -4.172491550445557, - "loss": 0.0023, - "policy_friction_nll_loss": 1.0754035711288452, - "policy_nll_loss": 3.102560043334961, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25305312871932983, - "rewards/chosen_fricton": -0.05990481376647949, - "rewards/margins": 0.19086401164531708, - "rewards/margins_friction": 0.30493003129959106, - "rewards/rejected": -0.4439171850681305, - "rewards/rejected_friction": -0.36483481526374817, + "epoch": 0.68, + "learning_rate": 7.807513528664415e-07, + "logits/chosen": -0.3562454581260681, + "logits/rejected": -0.3569663166999817, + "logps/chosen": -449.81768798828125, + "logps/rejected": -462.03125, + "loss": 0.4975, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.0241804122924805, + "rewards/margins": 1.1527836322784424, + "rewards/rejected": -4.176963806152344, "step": 2250 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.2, - "learning_rate": 3.4582225932545418e-06, - "logits/chosen": -0.34207597374916077, - "logits/chosen_friction": -0.3470006585121155, - "logits/rejected": -0.3293619155883789, - "logits/rejected_friction": -0.319511353969574, - "logps/chosen": -3.2089571952819824, - "logps/chosen_friction": -1.097170114517212, - "logps/rejected": -5.096442222595215, - "logps/rejected_friction": -4.220518112182617, - "loss": 0.0006, - "policy_friction_nll_loss": 1.0797182321548462, - "policy_nll_loss": 3.164191722869873, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.26015228033065796, - "rewards/chosen_fricton": -0.06044761463999748, - "rewards/margins": 0.18553802371025085, - "rewards/margins_friction": 0.3084840178489685, - "rewards/rejected": -0.4456903338432312, - "rewards/rejected_friction": -0.36893168091773987, - "step": 2255 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.21, - "learning_rate": 3.4521640793326616e-06, - "logits/chosen": -0.3539137840270996, - "logits/chosen_friction": -0.36051109433174133, - "logits/rejected": -0.3412061333656311, - "logits/rejected_friction": -0.3328555226325989, - "logps/chosen": -3.0362656116485596, - "logps/chosen_friction": -1.0597670078277588, - "logps/rejected": -4.969259262084961, - "logps/rejected_friction": -4.115499019622803, - "loss": 0.0022, - "policy_friction_nll_loss": 1.0498418807983398, - "policy_nll_loss": 3.006465435028076, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24420738220214844, - "rewards/chosen_fricton": -0.058289967477321625, - "rewards/margins": 0.19035129249095917, - "rewards/margins_friction": 0.3020039498806, - "rewards/rejected": -0.434558629989624, - "rewards/rejected_friction": -0.3602939248085022, + "epoch": 0.69, + "learning_rate": 7.611907034731538e-07, + "logits/chosen": -0.35374173521995544, + "logits/rejected": -0.3544319272041321, + "logps/chosen": -452.8089904785156, + "logps/rejected": -466.8789978027344, + "loss": 0.4872, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.0933475494384766, + "rewards/margins": 1.278378963470459, + "rewards/rejected": -4.371726989746094, "step": 2260 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.21, - "learning_rate": 3.4460990175768837e-06, - "logits/chosen": -0.36058151721954346, - "logits/chosen_friction": -0.3653266429901123, - "logits/rejected": -0.3479273319244385, - "logits/rejected_friction": -0.33517366647720337, - "logps/chosen": -3.2692089080810547, - "logps/chosen_friction": -1.1383073329925537, - "logps/rejected": -5.155524253845215, - "logps/rejected_friction": -4.311903953552246, - "loss": 0.0007, - "policy_friction_nll_loss": 1.1317980289459229, - "policy_nll_loss": 3.2488913536071777, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.26301494240760803, - "rewards/chosen_fricton": -0.06159050017595291, - "rewards/margins": 0.18545755743980408, - "rewards/margins_friction": 0.31334713101387024, - "rewards/rejected": -0.4484724998474121, - "rewards/rejected_friction": -0.37493765354156494, - "step": 2265 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.21, - "learning_rate": 3.440027449695372e-06, - "logits/chosen": -0.3574160635471344, - "logits/chosen_friction": -0.36479881405830383, - "logits/rejected": -0.3405574560165405, - "logits/rejected_friction": -0.3389177918434143, - "logps/chosen": -2.9903573989868164, - "logps/chosen_friction": -1.0123355388641357, - "logps/rejected": -5.054336071014404, - "logps/rejected_friction": -4.073055267333984, - "loss": 0.0011, - "policy_friction_nll_loss": 1.0064222812652588, - "policy_nll_loss": 2.96327543258667, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24328270554542542, - "rewards/chosen_fricton": -0.056142665445804596, - "rewards/margins": 0.20327933132648468, - "rewards/margins_friction": 0.3023919463157654, - "rewards/rejected": -0.4465620517730713, - "rewards/rejected_friction": -0.3585346043109894, + "epoch": 0.69, + "learning_rate": 7.418341110603e-07, + "logits/chosen": -0.3625703454017639, + "logits/rejected": -0.363391637802124, + "logps/chosen": -461.72442626953125, + "logps/rejected": -472.3929138183594, + "loss": 0.4361, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.796329975128174, + "rewards/margins": 1.4400124549865723, + "rewards/rejected": -4.236342430114746, "step": 2270 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.21, - "learning_rate": 3.4339494174410342e-06, - "logits/chosen": -0.34802576899528503, - "logits/chosen_friction": -0.35314133763313293, - "logits/rejected": -0.3336936831474304, - "logits/rejected_friction": -0.3256796896457672, - "logps/chosen": -3.099971294403076, - "logps/chosen_friction": -1.121181607246399, - "logps/rejected": -5.031020641326904, - "logps/rejected_friction": -4.112654685974121, - "loss": 0.0017, - "policy_friction_nll_loss": 1.0898048877716064, - "policy_nll_loss": 3.0627825260162354, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2505204677581787, - "rewards/chosen_fricton": -0.06427779793739319, - "rewards/margins": 0.189347505569458, - "rewards/margins_friction": 0.2947852909564972, - "rewards/rejected": -0.4398679733276367, - "rewards/rejected_friction": -0.3590630888938904, - "step": 2275 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.22, - "learning_rate": 3.427864962611231e-06, - "logits/chosen": -0.35273489356040955, - "logits/chosen_friction": -0.3575338125228882, - "logits/rejected": -0.33806973695755005, - "logits/rejected_friction": -0.3293951749801636, - "logps/chosen": -3.221149444580078, - "logps/chosen_friction": -1.0983026027679443, - "logps/rejected": -5.136956214904785, - "logps/rejected_friction": -4.183486461639404, - "loss": 0.0007, - "policy_friction_nll_loss": 1.0908024311065674, - "policy_nll_loss": 3.1958365440368652, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2600838541984558, - "rewards/chosen_fricton": -0.06069673225283623, - "rewards/margins": 0.18889100849628448, - "rewards/margins_friction": 0.3049202561378479, - "rewards/rejected": -0.4489748477935791, - "rewards/rejected_friction": -0.365617036819458, + "epoch": 0.69, + "learning_rate": 7.226838472098239e-07, + "logits/chosen": -0.35118603706359863, + "logits/rejected": -0.35229939222335815, + "logps/chosen": -450.23895263671875, + "logps/rejected": -462.094482421875, + "loss": 0.4608, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.8915536403656006, + "rewards/margins": 1.3143730163574219, + "rewards/rejected": -4.205926418304443, "step": 2280 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.22, - "learning_rate": 3.421774127047489e-06, - "logits/chosen": -0.3589330017566681, - "logits/chosen_friction": -0.3653487265110016, - "logits/rejected": -0.3401429057121277, - "logits/rejected_friction": -0.33684802055358887, - "logps/chosen": -3.120551824569702, - "logps/chosen_friction": -1.0640617609024048, - "logps/rejected": -5.1608099937438965, - "logps/rejected_friction": -4.0876994132995605, - "loss": 0.0008, - "policy_friction_nll_loss": 1.0576415061950684, - "policy_nll_loss": 3.1010329723358154, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25454890727996826, - "rewards/chosen_fricton": -0.060493551194667816, - "rewards/margins": 0.20165736973285675, - "rewards/margins_friction": 0.2993614971637726, - "rewards/rejected": -0.4562062621116638, - "rewards/rejected_friction": -0.359855055809021, - "step": 2285 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.22, - "learning_rate": 3.4156769526352167e-06, - "logits/chosen": -0.34498485922813416, - "logits/chosen_friction": -0.3505428433418274, - "logits/rejected": -0.332183301448822, - "logits/rejected_friction": -0.32424330711364746, - "logps/chosen": -3.130784511566162, - "logps/chosen_friction": -1.07246994972229, - "logps/rejected": -5.107053756713867, - "logps/rejected_friction": -4.211551189422607, - "loss": 0.0008, - "policy_friction_nll_loss": 1.0612552165985107, - "policy_nll_loss": 3.0924487113952637, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25215646624565125, - "rewards/chosen_fricton": -0.058184050023555756, - "rewards/margins": 0.1942053586244583, - "rewards/margins_friction": 0.30969902873039246, - "rewards/rejected": -0.44636183977127075, - "rewards/rejected_friction": -0.3678830862045288, + "epoch": 0.7, + "learning_rate": 7.037421592900942e-07, + "logits/chosen": -0.3532702326774597, + "logits/rejected": -0.3544442057609558, + "logps/chosen": -444.9677734375, + "logps/rejected": -458.2537536621094, + "loss": 0.4259, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.8905608654022217, + "rewards/margins": 1.462416410446167, + "rewards/rejected": -4.352977275848389, "step": 2290 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.22, - "learning_rate": 3.40957348130341e-06, - "logits/chosen": -0.35508328676223755, - "logits/chosen_friction": -0.3629041910171509, - "logits/rejected": -0.3384551703929901, - "logits/rejected_friction": -0.3358921408653259, - "logps/chosen": -3.0569512844085693, - "logps/chosen_friction": -1.0746781826019287, - "logps/rejected": -4.9907732009887695, - "logps/rejected_friction": -4.068641662597656, - "loss": 0.0011, - "policy_friction_nll_loss": 1.0647449493408203, - "policy_nll_loss": 3.0241150856018066, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24923689663410187, - "rewards/chosen_fricton": -0.062082450836896896, - "rewards/margins": 0.19057326018810272, - "rewards/margins_friction": 0.2955321669578552, - "rewards/rejected": -0.439810186624527, - "rewards/rejected_friction": -0.3576146364212036, - "step": 2295 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.23, - "learning_rate": 3.403463755024369e-06, - "logits/chosen": -0.358178049325943, - "logits/chosen_friction": -0.3663245141506195, - "logits/rejected": -0.34326353669166565, - "logits/rejected_friction": -0.3370365500450134, - "logps/chosen": -3.242417573928833, - "logps/chosen_friction": -1.092888593673706, - "logps/rejected": -5.160965919494629, - "logps/rejected_friction": -4.214120864868164, - "loss": 0.0013, - "policy_friction_nll_loss": 1.089258074760437, - "policy_nll_loss": 3.225801467895508, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.26120442152023315, - "rewards/chosen_fricton": -0.05857201665639877, - "rewards/margins": 0.18915900588035583, - "rewards/margins_friction": 0.30894726514816284, - "rewards/rejected": -0.4503633975982666, - "rewards/rejected_friction": -0.3675192892551422, + "epoch": 0.7, + "learning_rate": 6.850112701921735e-07, + "logits/chosen": -0.35222965478897095, + "logits/rejected": -0.3528757095336914, + "logps/chosen": -441.19976806640625, + "logps/rejected": -455.4878845214844, + "loss": 0.4063, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.7645294666290283, + "rewards/margins": 1.4231407642364502, + "rewards/rejected": -4.1876702308654785, "step": 2300 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.23, - "learning_rate": 3.3973478158134095e-06, - "logits/chosen": -0.3432939648628235, - "logits/chosen_friction": -0.34858161211013794, - "logits/rejected": -0.32972854375839233, - "logits/rejected_friction": -0.32151082158088684, - "logps/chosen": -3.2280211448669434, - "logps/chosen_friction": -1.094962239265442, - "logps/rejected": -5.20511531829834, - "logps/rejected_friction": -4.180837154388428, - "loss": 0.0006, - "policy_friction_nll_loss": 1.0786978006362915, - "policy_nll_loss": 3.186988353729248, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25994640588760376, - "rewards/chosen_fricton": -0.05903080850839615, - "rewards/margins": 0.1938796043395996, - "rewards/margins_friction": 0.3040875196456909, - "rewards/rejected": -0.45382601022720337, - "rewards/rejected_friction": -0.36311835050582886, - "step": 2305 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.23, - "learning_rate": 3.3912257057285684e-06, - "logits/chosen": -0.3545707166194916, - "logits/chosen_friction": -0.359928697347641, - "logits/rejected": -0.33798113465309143, - "logits/rejected_friction": -0.33219337463378906, - "logps/chosen": -3.0842843055725098, - "logps/chosen_friction": -1.0557359457015991, - "logps/rejected": -5.08116340637207, - "logps/rejected_friction": -4.149897575378418, - "loss": 0.0006, - "policy_friction_nll_loss": 1.0412309169769287, - "policy_nll_loss": 3.0485472679138184, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2502031624317169, - "rewards/chosen_fricton": -0.05795867368578911, - "rewards/margins": 0.19599005579948425, - "rewards/margins_friction": 0.3051678240299225, - "rewards/rejected": -0.4461931586265564, - "rewards/rejected_friction": -0.3631264865398407, + "epoch": 0.7, + "learning_rate": 6.664933780689445e-07, + "logits/chosen": -0.3582982122898102, + "logits/rejected": -0.3593185842037201, + "logps/chosen": -450.44549560546875, + "logps/rejected": -464.33905029296875, + "loss": 0.4102, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.714564800262451, + "rewards/margins": 1.4921871423721313, + "rewards/rejected": -4.206751823425293, "step": 2310 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.23, - "learning_rate": 3.3850974668703225e-06, - "logits/chosen": -0.3572007715702057, - "logits/chosen_friction": -0.36401695013046265, - "logits/rejected": -0.344165563583374, - "logits/rejected_friction": -0.3361770510673523, - "logps/chosen": -3.1757349967956543, - "logps/chosen_friction": -1.0874249935150146, - "logps/rejected": -5.137618064880371, - "logps/rejected_friction": -4.169728755950928, - "loss": 0.001, - "policy_friction_nll_loss": 1.0771458148956299, - "policy_nll_loss": 3.1470818519592285, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25570598244667053, - "rewards/chosen_fricton": -0.059264302253723145, - "rewards/margins": 0.1931440830230713, - "rewards/margins_friction": 0.30439141392707825, - "rewards/rejected": -0.4488500654697418, - "rewards/rejected_friction": -0.3636557161808014, - "step": 2315 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.24, - "learning_rate": 3.3789631413812917e-06, - "logits/chosen": -0.3642643392086029, - "logits/chosen_friction": -0.3698851466178894, - "logits/rejected": -0.3494442403316498, - "logits/rejected_friction": -0.3426448702812195, - "logps/chosen": -3.1930689811706543, - "logps/chosen_friction": -1.058661699295044, - "logps/rejected": -5.155884742736816, - "logps/rejected_friction": -4.153945446014404, - "loss": 0.0007, - "policy_friction_nll_loss": 1.0483663082122803, - "policy_nll_loss": 3.166134834289551, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2593778371810913, - "rewards/chosen_fricton": -0.05747928470373154, - "rewards/margins": 0.19341450929641724, - "rewards/margins_friction": 0.3060278296470642, - "rewards/rejected": -0.45279231667518616, - "rewards/rejected_friction": -0.36350712180137634, + "epoch": 0.7, + "learning_rate": 6.481906560771525e-07, + "logits/chosen": -0.357990562915802, + "logits/rejected": -0.3587570786476135, + "logps/chosen": -441.71746826171875, + "logps/rejected": -452.60870361328125, + "loss": 0.4988, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.9209952354431152, + "rewards/margins": 1.2236100435256958, + "rewards/rejected": -4.1446051597595215, "step": 2320 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.24, - "learning_rate": 3.3728227714459538e-06, - "logits/chosen": -0.3502333164215088, - "logits/chosen_friction": -0.35713544487953186, - "logits/rejected": -0.33544236421585083, - "logits/rejected_friction": -0.33252158761024475, - "logps/chosen": -3.010103225708008, - "logps/chosen_friction": -1.0383903980255127, - "logps/rejected": -5.021442413330078, - "logps/rejected_friction": -4.111576080322266, - "loss": 0.0009, - "policy_friction_nll_loss": 1.0242499113082886, - "policy_nll_loss": 2.964048385620117, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24379467964172363, - "rewards/chosen_fricton": -0.05795621871948242, - "rewards/margins": 0.1983480006456375, - "rewards/margins_friction": 0.30371302366256714, - "rewards/rejected": -0.44214263558387756, - "rewards/rejected_friction": -0.36166924238204956, - "step": 2325 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.24, - "learning_rate": 3.366676399290354e-06, - "logits/chosen": -0.3627789616584778, - "logits/chosen_friction": -0.37088167667388916, - "logits/rejected": -0.3458250164985657, - "logits/rejected_friction": -0.3426409363746643, - "logps/chosen": -3.0522148609161377, - "logps/chosen_friction": -1.0154746770858765, - "logps/rejected": -5.031703472137451, - "logps/rejected_friction": -4.015703201293945, - "loss": 0.0016, - "policy_friction_nll_loss": 1.0070377588272095, - "policy_nll_loss": 3.0234789848327637, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24737365543842316, - "rewards/chosen_fricton": -0.05585230141878128, - "rewards/margins": 0.19481578469276428, - "rewards/margins_friction": 0.2965497076511383, - "rewards/rejected": -0.44218936562538147, - "rewards/rejected_friction": -0.352402001619339, + "epoch": 0.71, + "learning_rate": 6.301052521223736e-07, + "logits/chosen": -0.3549385070800781, + "logits/rejected": -0.3562348484992981, + "logps/chosen": -450.3941955566406, + "logps/rejected": -462.034912109375, + "loss": 0.4629, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.1530508995056152, + "rewards/margins": 1.2050797939300537, + "rewards/rejected": -4.358130931854248, "step": 2330 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.25, - "learning_rate": 3.3605240671818114e-06, - "logits/chosen": -0.35152044892311096, - "logits/chosen_friction": -0.3584595322608948, - "logits/rejected": -0.33679428696632385, - "logits/rejected_friction": -0.328668475151062, - "logps/chosen": -3.2974140644073486, - "logps/chosen_friction": -1.1307320594787598, - "logps/rejected": -5.2480244636535645, - "logps/rejected_friction": -4.224532604217529, - "loss": 0.0012, - "policy_friction_nll_loss": 1.120583415031433, - "policy_nll_loss": 3.273407459259033, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2677726447582245, - "rewards/chosen_fricton": -0.062442611902952194, - "rewards/margins": 0.19108878076076508, - "rewards/margins_friction": 0.30501121282577515, - "rewards/rejected": -0.45886144042015076, - "rewards/rejected_friction": -0.36745381355285645, - "step": 2335 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.25, - "learning_rate": 3.3543658174286313e-06, - "logits/chosen": -0.3515077233314514, - "logits/chosen_friction": -0.3580477833747864, - "logits/rejected": -0.3336850106716156, - "logits/rejected_friction": -0.3298516869544983, - "logps/chosen": -3.178184986114502, - "logps/chosen_friction": -1.0682238340377808, - "logps/rejected": -5.142664909362793, - "logps/rejected_friction": -4.136876106262207, - "loss": 0.0011, - "policy_friction_nll_loss": 1.0566350221633911, - "policy_nll_loss": 3.136018753051758, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2589319348335266, - "rewards/chosen_fricton": -0.059227168560028076, - "rewards/margins": 0.19301888346672058, - "rewards/margins_friction": 0.30260181427001953, - "rewards/rejected": -0.45195087790489197, - "rewards/rejected_friction": -0.3618289828300476, + "epoch": 0.71, + "learning_rate": 6.122392886069486e-07, + "logits/chosen": -0.3575456738471985, + "logits/rejected": -0.3584723174571991, + "logps/chosen": -456.48797607421875, + "logps/rejected": -469.8323669433594, + "loss": 0.403, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.066274881362915, + "rewards/margins": 1.3775126934051514, + "rewards/rejected": -4.443788051605225, "step": 2340 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.25, - "learning_rate": 3.348201692379815e-06, - "logits/chosen": -0.3444919288158417, - "logits/chosen_friction": -0.35153818130493164, - "logits/rejected": -0.33309563994407654, - "logits/rejected_friction": -0.3255065083503723, - "logps/chosen": -3.077083110809326, - "logps/chosen_friction": -1.0489773750305176, - "logps/rejected": -5.003064155578613, - "logps/rejected_friction": -4.164944648742676, - "loss": 0.0007, - "policy_friction_nll_loss": 1.039749026298523, - "policy_nll_loss": 3.0541975498199463, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24838510155677795, - "rewards/chosen_fricton": -0.05708765238523483, - "rewards/margins": 0.18960563838481903, - "rewards/margins_friction": 0.3077656924724579, - "rewards/rejected": -0.4379907250404358, - "rewards/rejected_friction": -0.3648533523082733, - "step": 2345 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.25, - "learning_rate": 3.342031734424764e-06, - "logits/chosen": -0.34468215703964233, - "logits/chosen_friction": -0.3492412269115448, - "logits/rejected": -0.331560879945755, - "logits/rejected_friction": -0.3231896758079529, - "logps/chosen": -3.1337625980377197, - "logps/chosen_friction": -1.0582391023635864, - "logps/rejected": -5.08650541305542, - "logps/rejected_friction": -4.137609481811523, - "loss": 0.0035, - "policy_friction_nll_loss": 1.044088363647461, - "policy_nll_loss": 3.1079697608947754, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2532617449760437, - "rewards/chosen_fricton": -0.05792003124952316, - "rewards/margins": 0.19140507280826569, - "rewards/margins_friction": 0.3031957745552063, - "rewards/rejected": -0.4446667730808258, - "rewards/rejected_friction": -0.36111578345298767, + "epoch": 0.71, + "learning_rate": 5.945948621809092e-07, + "logits/chosen": -0.34499675035476685, + "logits/rejected": -0.34601226449012756, + "logps/chosen": -444.13818359375, + "logps/rejected": -458.27978515625, + "loss": 0.3784, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.885612726211548, + "rewards/margins": 1.6283729076385498, + "rewards/rejected": -4.513985633850098, "step": 2350 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.26, - "learning_rate": 3.335855985992994e-06, - "logits/chosen": -0.356143057346344, - "logits/chosen_friction": -0.3629095256328583, - "logits/rejected": -0.33759090304374695, - "logits/rejected_friction": -0.3338179588317871, - "logps/chosen": -3.120975971221924, - "logps/chosen_friction": -1.0463148355484009, - "logps/rejected": -5.096127510070801, - "logps/rejected_friction": -4.161252021789551, - "loss": 0.0011, - "policy_friction_nll_loss": 1.0305795669555664, - "policy_nll_loss": 3.0783448219299316, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2529585361480713, - "rewards/chosen_fricton": -0.05723228305578232, - "rewards/margins": 0.1946333348751068, - "rewards/margins_friction": 0.30779388546943665, - "rewards/rejected": -0.44759178161621094, - "rewards/rejected_friction": -0.3650261163711548, - "step": 2355 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.26, - "learning_rate": 3.3296744895538394e-06, - "logits/chosen": -0.35479432344436646, - "logits/chosen_friction": -0.3626072406768799, - "logits/rejected": -0.33539000153541565, - "logits/rejected_friction": -0.3315817415714264, - "logps/chosen": -3.287107467651367, - "logps/chosen_friction": -1.1320899724960327, - "logps/rejected": -5.2761077880859375, - "logps/rejected_friction": -4.251160621643066, - "loss": 0.0007, - "policy_friction_nll_loss": 1.1205588579177856, - "policy_nll_loss": 3.2585537433624268, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2671833634376526, - "rewards/chosen_fricton": -0.06372446566820145, - "rewards/margins": 0.19541135430335999, - "rewards/margins_friction": 0.3076588213443756, - "rewards/rejected": -0.46259474754333496, - "rewards/rejected_friction": -0.3713832497596741, + "epoch": 0.72, + "learning_rate": 5.771740434959278e-07, + "logits/chosen": -0.36106568574905396, + "logits/rejected": -0.3616113066673279, + "logps/chosen": -451.7525329589844, + "logps/rejected": -462.39361572265625, + "loss": 0.4455, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.004162549972534, + "rewards/margins": 1.2321292161941528, + "rewards/rejected": -4.236291408538818, "step": 2360 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.26, - "learning_rate": 3.323487287616163e-06, - "logits/chosen": -0.35550999641418457, - "logits/chosen_friction": -0.35896459221839905, - "logits/rejected": -0.3391522169113159, - "logits/rejected_friction": -0.3325487971305847, - "logps/chosen": -3.2227535247802734, - "logps/chosen_friction": -1.0714749097824097, - "logps/rejected": -5.1736860275268555, - "logps/rejected_friction": -4.153679370880127, - "loss": 0.0007, - "policy_friction_nll_loss": 1.0642403364181519, - "policy_nll_loss": 3.201542377471924, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.26379722356796265, - "rewards/chosen_fricton": -0.06086168438196182, - "rewards/margins": 0.1911802440881729, - "rewards/margins_friction": 0.3036103844642639, - "rewards/rejected": -0.45497745275497437, - "rewards/rejected_friction": -0.36447206139564514, - "step": 2365 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.26, - "learning_rate": 3.317294422728061e-06, - "logits/chosen": -0.3502041697502136, - "logits/chosen_friction": -0.35460954904556274, - "logits/rejected": -0.3380792438983917, - "logits/rejected_friction": -0.32503077387809753, - "logps/chosen": -3.3127524852752686, - "logps/chosen_friction": -1.12654709815979, - "logps/rejected": -5.170108795166016, - "logps/rejected_friction": -4.327702522277832, - "loss": 0.0009, - "policy_friction_nll_loss": 1.1182284355163574, - "policy_nll_loss": 3.2952141761779785, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2664968967437744, - "rewards/chosen_fricton": -0.060410238802433014, - "rewards/margins": 0.1821223795413971, - "rewards/margins_friction": 0.31595689058303833, - "rewards/rejected": -0.4486192762851715, - "rewards/rejected_friction": -0.37636709213256836, + "epoch": 0.72, + "learning_rate": 5.599788769623174e-07, + "logits/chosen": -0.3459396958351135, + "logits/rejected": -0.3463771939277649, + "logps/chosen": -451.74462890625, + "logps/rejected": -460.52532958984375, + "loss": 0.442, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.0312728881835938, + "rewards/margins": 1.2485148906707764, + "rewards/rejected": -4.279788017272949, "step": 2370 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.27, - "learning_rate": 3.311095937476577e-06, - "logits/chosen": -0.3476383090019226, - "logits/chosen_friction": -0.35449424386024475, - "logits/rejected": -0.32951244711875916, - "logits/rejected_friction": -0.3259989619255066, - "logps/chosen": -3.09321665763855, - "logps/chosen_friction": -1.066865086555481, - "logps/rejected": -5.0329179763793945, - "logps/rejected_friction": -4.143814563751221, - "loss": 0.0011, - "policy_friction_nll_loss": 1.048092007637024, - "policy_nll_loss": 3.045152187347412, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25055575370788574, - "rewards/chosen_fricton": -0.059408821165561676, - "rewards/margins": 0.19061583280563354, - "rewards/margins_friction": 0.3038719892501831, - "rewards/rejected": -0.4411715567111969, - "rewards/rejected_friction": -0.3632808029651642, - "step": 2375 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.27, - "learning_rate": 3.304891874487401e-06, - "logits/chosen": -0.3597900867462158, - "logits/chosen_friction": -0.36850738525390625, - "logits/rejected": -0.34595566987991333, - "logits/rejected_friction": -0.3392060101032257, - "logps/chosen": -3.116063117980957, - "logps/chosen_friction": -1.0625778436660767, - "logps/rejected": -5.052853584289551, - "logps/rejected_friction": -4.145756721496582, - "loss": 0.0005, - "policy_friction_nll_loss": 1.0554077625274658, - "policy_nll_loss": 3.0913543701171875, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25297343730926514, - "rewards/chosen_fricton": -0.05945741385221481, - "rewards/margins": 0.19062592089176178, - "rewards/margins_friction": 0.3044060468673706, - "rewards/rejected": -0.4435993731021881, - "rewards/rejected_friction": -0.3638634979724884, + "epoch": 0.72, + "learning_rate": 5.430113805091111e-07, + "logits/chosen": -0.34979885816574097, + "logits/rejected": -0.3506646156311035, + "logps/chosen": -452.90667724609375, + "logps/rejected": -459.8733825683594, + "loss": 0.4529, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.2027690410614014, + "rewards/margins": 1.2168538570404053, + "rewards/rejected": -4.419622898101807, "step": 2380 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.27, - "learning_rate": 3.2986822764245794e-06, - "logits/chosen": -0.36002737283706665, - "logits/chosen_friction": -0.36764344573020935, - "logits/rejected": -0.34288543462753296, - "logits/rejected_friction": -0.33753886818885803, - "logps/chosen": -3.2168726921081543, - "logps/chosen_friction": -1.0995310544967651, - "logps/rejected": -5.184739112854004, - "logps/rejected_friction": -4.23123836517334, - "loss": 0.0007, - "policy_friction_nll_loss": 1.0970799922943115, - "policy_nll_loss": 3.20210599899292, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.26390376687049866, - "rewards/chosen_fricton": -0.06389003992080688, - "rewards/margins": 0.19396907091140747, - "rewards/margins_friction": 0.30945560336112976, - "rewards/rejected": -0.45787280797958374, - "rewards/rejected_friction": -0.37334561347961426, - "step": 2385 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.27, - "learning_rate": 3.292467185990226e-06, - "logits/chosen": -0.3564448952674866, - "logits/chosen_friction": -0.3611581325531006, - "logits/rejected": -0.3425319790840149, - "logits/rejected_friction": -0.33081740140914917, - "logps/chosen": -3.1119513511657715, - "logps/chosen_friction": -1.0640556812286377, - "logps/rejected": -5.027177333831787, - "logps/rejected_friction": -4.22582483291626, - "loss": 0.0005, - "policy_friction_nll_loss": 1.0562493801116943, - "policy_nll_loss": 3.0856335163116455, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25206100940704346, - "rewards/chosen_fricton": -0.05906452611088753, - "rewards/margins": 0.18729014694690704, - "rewards/margins_friction": 0.3113119304180145, - "rewards/rejected": -0.4393511414527893, - "rewards/rejected_friction": -0.37037649750709534, + "epoch": 0.73, + "learning_rate": 5.262735453472459e-07, + "logits/chosen": -0.3504520058631897, + "logits/rejected": -0.3512795567512512, + "logps/chosen": -448.52685546875, + "logps/rejected": -459.87359619140625, + "loss": 0.3957, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.8900694847106934, + "rewards/margins": 1.5135910511016846, + "rewards/rejected": -4.403660774230957, "step": 2390 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.28, - "learning_rate": 3.286246645924222e-06, - "logits/chosen": -0.3473230302333832, - "logits/chosen_friction": -0.35366004705429077, - "logits/rejected": -0.3338302969932556, - "logits/rejected_friction": -0.3264562487602234, - "logps/chosen": -3.1613810062408447, - "logps/chosen_friction": -1.065808653831482, - "logps/rejected": -5.02337121963501, - "logps/rejected_friction": -4.1820783615112305, - "loss": 0.0007, - "policy_friction_nll_loss": 1.0457963943481445, - "policy_nll_loss": 3.111154317855835, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25603777170181274, - "rewards/chosen_fricton": -0.058437369763851166, - "rewards/margins": 0.18376357853412628, - "rewards/margins_friction": 0.3085145950317383, - "rewards/rejected": -0.4398013651371002, - "rewards/rejected_friction": -0.36695194244384766, - "step": 2395 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.28, - "learning_rate": 3.2800206990039236e-06, - "logits/chosen": -0.35404306650161743, - "logits/chosen_friction": -0.3589603304862976, - "logits/rejected": -0.33946624398231506, - "logits/rejected_friction": -0.3321515619754791, - "logps/chosen": -3.1438379287719727, - "logps/chosen_friction": -1.0510872602462769, - "logps/rejected": -5.116480827331543, - "logps/rejected_friction": -4.144482612609863, - "loss": 0.0005, - "policy_friction_nll_loss": 1.0410945415496826, - "policy_nll_loss": 3.122011661529541, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25451043248176575, - "rewards/chosen_fricton": -0.05699266120791435, - "rewards/margins": 0.19413819909095764, - "rewards/margins_friction": 0.3054508566856384, - "rewards/rejected": -0.448648601770401, - "rewards/rejected_friction": -0.3624435067176819, + "epoch": 0.73, + "learning_rate": 5.097673357358906e-07, + "logits/chosen": -0.36047258973121643, + "logits/rejected": -0.36156997084617615, + "logps/chosen": -451.36767578125, + "logps/rejected": -462.75225830078125, + "loss": 0.4152, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.962690830230713, + "rewards/margins": 1.4245904684066772, + "rewards/rejected": -4.3872809410095215, "step": 2400 }, { - "epoch": 1.28, - "eval_directrewards_student/accuracies": 1.0, - "eval_logits/chosen": -0.3883186876773834, - "eval_logits/chosen_friction": -0.3930244445800781, - "eval_logits/rejected": -0.37296971678733826, - "eval_logits/rejected_friction": -0.36149144172668457, - "eval_logps/chosen": -3.1946604251861572, - "eval_logps/chosen_friction": -1.0977063179016113, - "eval_logps/rejected": -5.155197620391846, - "eval_logps/rejected_friction": -4.233913898468018, - "eval_loss": 0.0007374906563200057, - "eval_policy_friction_nll_loss": 1.0977063179016113, - "eval_policy_nll_loss": 3.1946604251861572, - "eval_rewards/accuracies": 1.0, - "eval_rewards/accuracies_friction": 1.0, - "eval_rewards/chosen": -0.259523868560791, - "eval_rewards/chosen_fricton": -0.06150191277265549, - "eval_rewards/margins": 0.19257593154907227, - "eval_rewards/margins_friction": 0.30947357416152954, - "eval_rewards/rejected": -0.4520997703075409, - "eval_rewards/rejected_friction": -0.3709754943847656, - "eval_runtime": 363.6352, - "eval_samples_per_second": 1.375, - "eval_steps_per_second": 0.688, + "epoch": 0.73, + "eval_logits/chosen": -0.4258207082748413, + "eval_logits/rejected": -0.4264317452907562, + "eval_logps/chosen": -443.40802001953125, + "eval_logps/rejected": -454.0955810546875, + "eval_loss": 0.4785875976085663, + "eval_rewards/accuracies": 0.7599999904632568, + "eval_rewards/chosen": -3.0516843795776367, + "eval_rewards/margins": 1.2599674463272095, + "eval_rewards/rejected": -4.311651706695557, + "eval_runtime": 351.6671, + "eval_samples_per_second": 1.422, + "eval_steps_per_second": 1.422, "step": 2400 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.28, - "learning_rate": 3.2737893880438703e-06, - "logits/chosen": -0.3433993458747864, - "logits/chosen_friction": -0.34591928124427795, - "logits/rejected": -0.3320866525173187, - "logits/rejected_friction": -0.3184870183467865, - "logps/chosen": -3.331597089767456, - "logps/chosen_friction": -1.1559491157531738, - "logps/rejected": -5.226788520812988, - "logps/rejected_friction": -4.376966953277588, - "loss": 0.0008, - "policy_friction_nll_loss": 1.135048508644104, - "policy_nll_loss": 3.2871406078338623, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.26917389035224915, - "rewards/chosen_fricton": -0.06335552036762238, - "rewards/margins": 0.18629401922225952, - "rewards/margins_friction": 0.3182716965675354, - "rewards/rejected": -0.45546793937683105, - "rewards/rejected_friction": -0.38162723183631897, - "step": 2405 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.29, - "learning_rate": 3.2675527558954897e-06, - "logits/chosen": -0.3563470244407654, - "logits/chosen_friction": -0.3590940535068512, - "logits/rejected": -0.34838229417800903, - "logits/rejected_friction": -0.3296787440776825, - "logps/chosen": -3.3307394981384277, - "logps/chosen_friction": -1.1604408025741577, - "logps/rejected": -5.163435935974121, - "logps/rejected_friction": -4.425493240356445, - "loss": 0.0006, - "policy_friction_nll_loss": 1.1536277532577515, - "policy_nll_loss": 3.3062214851379395, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2690548300743103, - "rewards/chosen_fricton": -0.06407757848501205, - "rewards/margins": 0.1794404238462448, - "rewards/margins_friction": 0.3217184841632843, - "rewards/rejected": -0.4484952390193939, - "rewards/rejected_friction": -0.38579607009887695, + "epoch": 0.73, + "learning_rate": 4.934946887519279e-07, + "logits/chosen": -0.36616581678390503, + "logits/rejected": -0.36695989966392517, + "logps/chosen": -457.62567138671875, + "logps/rejected": -470.66485595703125, + "loss": 0.4125, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.906996011734009, + "rewards/margins": 1.469089150428772, + "rewards/rejected": -4.37608528137207, "step": 2410 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.29, - "learning_rate": 3.261310845446799e-06, - "logits/chosen": -0.3402024805545807, - "logits/chosen_friction": -0.34477201104164124, - "logits/rejected": -0.3267061114311218, - "logits/rejected_friction": -0.31752631068229675, - "logps/chosen": -3.105581045150757, - "logps/chosen_friction": -1.0650908946990967, - "logps/rejected": -5.083567142486572, - "logps/rejected_friction": -4.167944431304932, - "loss": 0.0005, - "policy_friction_nll_loss": 1.0410373210906982, - "policy_nll_loss": 3.0525963306427, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24998927116394043, - "rewards/chosen_fricton": -0.057560622692108154, - "rewards/margins": 0.19462057948112488, - "rewards/margins_friction": 0.30626314878463745, - "rewards/rejected": -0.4446098804473877, - "rewards/rejected_friction": -0.3638237714767456, - "step": 2415 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.29, - "learning_rate": 3.2550636996221174e-06, - "logits/chosen": -0.35860586166381836, - "logits/chosen_friction": -0.3650890290737152, - "logits/rejected": -0.34294334053993225, - "logits/rejected_friction": -0.3361509442329407, - "logps/chosen": -3.110239028930664, - "logps/chosen_friction": -1.0718337297439575, - "logps/rejected": -5.092462062835693, - "logps/rejected_friction": -4.155723571777344, - "loss": 0.0008, - "policy_friction_nll_loss": 1.0609614849090576, - "policy_nll_loss": 3.0985584259033203, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25191909074783325, - "rewards/chosen_fricton": -0.05983327701687813, - "rewards/margins": 0.19500632584095, - "rewards/margins_friction": 0.3043217360973358, - "rewards/rejected": -0.44692540168762207, - "rewards/rejected_friction": -0.36415499448776245, + "epoch": 0.74, + "learning_rate": 4.774575140626317e-07, + "logits/chosen": -0.35779887437820435, + "logits/rejected": -0.3586946129798889, + "logps/chosen": -451.4176330566406, + "logps/rejected": -464.3094787597656, + "loss": 0.4281, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.945284605026245, + "rewards/margins": 1.3972845077514648, + "rewards/rejected": -4.342568874359131, "step": 2420 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.29, - "learning_rate": 3.2488113613817634e-06, - "logits/chosen": -0.35975438356399536, - "logits/chosen_friction": -0.36715441942214966, - "logits/rejected": -0.3409446179866791, - "logits/rejected_friction": -0.3392896354198456, - "logps/chosen": -3.1004478931427, - "logps/chosen_friction": -1.018484354019165, - "logps/rejected": -5.13165807723999, - "logps/rejected_friction": -4.021988391876221, - "loss": 0.0006, - "policy_friction_nll_loss": 1.0124728679656982, - "policy_nll_loss": 3.0754849910736084, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25436776876449585, - "rewards/chosen_fricton": -0.057475101202726364, - "rewards/margins": 0.2002871036529541, - "rewards/margins_friction": 0.29693466424942017, - "rewards/rejected": -0.45465484261512756, - "rewards/rejected_friction": -0.35440975427627563, - "step": 2425 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.3, - "learning_rate": 3.2425538737217643e-06, - "logits/chosen": -0.339498370885849, - "logits/chosen_friction": -0.3454785943031311, - "logits/rejected": -0.3242797255516052, - "logits/rejected_friction": -0.3171548843383789, - "logps/chosen": -3.1756014823913574, - "logps/chosen_friction": -1.0694844722747803, - "logps/rejected": -5.0912675857543945, - "logps/rejected_friction": -4.176323413848877, - "loss": 0.0008, - "policy_friction_nll_loss": 1.057558298110962, - "policy_nll_loss": 3.137226104736328, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2576334476470947, - "rewards/chosen_fricton": -0.05868919938802719, - "rewards/margins": 0.1878761649131775, - "rewards/margins_friction": 0.3061172068119049, - "rewards/rejected": -0.4455096125602722, - "rewards/rejected_friction": -0.3648063838481903, + "epoch": 0.74, + "learning_rate": 4.6165769370155516e-07, + "logits/chosen": -0.36210596561431885, + "logits/rejected": -0.36279112100601196, + "logps/chosen": -451.7628479003906, + "logps/rejected": -464.893798828125, + "loss": 0.4782, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.2397968769073486, + "rewards/margins": 1.2520904541015625, + "rewards/rejected": -4.49188756942749, "step": 2430 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.3, - "learning_rate": 3.236291279673558e-06, - "logits/chosen": -0.3389986455440521, - "logits/chosen_friction": -0.34640151262283325, - "logits/rejected": -0.3235977292060852, - "logits/rejected_friction": -0.31796130537986755, - "logps/chosen": -3.208653211593628, - "logps/chosen_friction": -1.1344648599624634, - "logps/rejected": -5.217514991760254, - "logps/rejected_friction": -4.271679401397705, - "loss": 0.0008, - "policy_friction_nll_loss": 1.1196436882019043, - "policy_nll_loss": 3.167191982269287, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2604638636112213, - "rewards/chosen_fricton": -0.0650453269481659, - "rewards/margins": 0.19728854298591614, - "rewards/margins_friction": 0.30933353304862976, - "rewards/rejected": -0.45775240659713745, - "rewards/rejected_friction": -0.37437885999679565, - "step": 2435 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.3, - "learning_rate": 3.2300236223036985e-06, - "logits/chosen": -0.35233962535858154, - "logits/chosen_friction": -0.3572812080383301, - "logits/rejected": -0.3377545177936554, - "logits/rejected_friction": -0.3303508162498474, - "logps/chosen": -3.1401000022888184, - "logps/chosen_friction": -1.0652434825897217, - "logps/rejected": -5.1242194175720215, - "logps/rejected_friction": -4.176536560058594, - "loss": 0.0009, - "policy_friction_nll_loss": 1.0576781034469604, - "policy_nll_loss": 3.117417573928833, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25576186180114746, - "rewards/chosen_fricton": -0.0592605285346508, - "rewards/margins": 0.1948094666004181, - "rewards/margins_friction": 0.30673032999038696, - "rewards/rejected": -0.45057135820388794, - "rewards/rejected_friction": -0.36599084734916687, + "epoch": 0.74, + "learning_rate": 4.4609708184767177e-07, + "logits/chosen": -0.3466174006462097, + "logits/rejected": -0.3471986651420593, + "logps/chosen": -448.8627014160156, + "logps/rejected": -458.1527404785156, + "loss": 0.4647, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.91853404045105, + "rewards/margins": 1.3560994863510132, + "rewards/rejected": -4.274633884429932, "step": 2440 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.3, - "learning_rate": 3.2237509447135595e-06, - "logits/chosen": -0.3613296449184418, - "logits/chosen_friction": -0.36532729864120483, - "logits/rejected": -0.34984272718429565, - "logits/rejected_friction": -0.3362743556499481, - "logps/chosen": -3.2004599571228027, - "logps/chosen_friction": -1.0881118774414062, - "logps/rejected": -5.05680513381958, - "logps/rejected_friction": -4.196092128753662, - "loss": 0.0007, - "policy_friction_nll_loss": 1.0811514854431152, - "policy_nll_loss": 3.1878561973571777, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25903284549713135, - "rewards/chosen_fricton": -0.059359002858400345, - "rewards/margins": 0.18131856620311737, - "rewards/margins_friction": 0.30580881237983704, - "rewards/rejected": -0.4403514266014099, - "rewards/rejected_friction": -0.36516788601875305, - "step": 2445 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.31, - "learning_rate": 3.217473290039036e-06, - "logits/chosen": -0.3305946886539459, - "logits/chosen_friction": -0.3359111547470093, - "logits/rejected": -0.32022371888160706, - "logits/rejected_friction": -0.30864906311035156, - "logps/chosen": -3.189223289489746, - "logps/chosen_friction": -1.0911750793457031, - "logps/rejected": -5.128474235534668, - "logps/rejected_friction": -4.240830421447754, - "loss": 0.0006, - "policy_friction_nll_loss": 1.0681804418563843, - "policy_nll_loss": 3.130453109741211, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2571971118450165, - "rewards/chosen_fricton": -0.05933646485209465, - "rewards/margins": 0.1906583607196808, - "rewards/margins_friction": 0.31102055311203003, - "rewards/rejected": -0.4478554129600525, - "rewards/rejected_friction": -0.3703569769859314, + "epoch": 0.74, + "learning_rate": 4.307775046077739e-07, + "logits/chosen": -0.3524012863636017, + "logits/rejected": -0.3537690043449402, + "logps/chosen": -445.4092712402344, + "logps/rejected": -460.19635009765625, + "loss": 0.4618, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.2233848571777344, + "rewards/margins": 1.2467783689498901, + "rewards/rejected": -4.470162391662598, "step": 2450 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.31, - "learning_rate": 3.211190701450252e-06, - "logits/chosen": -0.3515673577785492, - "logits/chosen_friction": -0.3576563000679016, - "logits/rejected": -0.33575305342674255, - "logits/rejected_friction": -0.3288782835006714, - "logps/chosen": -3.106581449508667, - "logps/chosen_friction": -1.0756672620773315, - "logps/rejected": -5.159440040588379, - "logps/rejected_friction": -4.1113433837890625, - "loss": 0.0009, - "policy_friction_nll_loss": 1.062561273574829, - "policy_nll_loss": 3.0717337131500244, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25255125761032104, - "rewards/chosen_fricton": -0.060789406299591064, - "rewards/margins": 0.20263519883155823, - "rewards/margins_friction": 0.300479918718338, - "rewards/rejected": -0.4551864266395569, - "rewards/rejected_friction": -0.3612693250179291, - "step": 2455 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.31, - "learning_rate": 3.2049032221512594e-06, - "logits/chosen": -0.35883718729019165, - "logits/chosen_friction": -0.36347654461860657, - "logits/rejected": -0.3457663953304291, - "logits/rejected_friction": -0.3334144055843353, - "logps/chosen": -3.195237874984741, - "logps/chosen_friction": -1.0913505554199219, - "logps/rejected": -5.127433776855469, - "logps/rejected_friction": -4.190518856048584, - "loss": 0.0011, - "policy_friction_nll_loss": 1.0794315338134766, - "policy_nll_loss": 3.1628100872039795, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25930920243263245, - "rewards/chosen_fricton": -0.060879968106746674, - "rewards/margins": 0.18983998894691467, - "rewards/margins_friction": 0.305417001247406, - "rewards/rejected": -0.4491492211818695, - "rewards/rejected_friction": -0.3662969768047333, + "epoch": 0.75, + "learning_rate": 4.1570075980217503e-07, + "logits/chosen": -0.3559108376502991, + "logits/rejected": -0.35668981075286865, + "logps/chosen": -449.45330810546875, + "logps/rejected": -457.7796325683594, + "loss": 0.4718, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.394498348236084, + "rewards/margins": 1.175183892250061, + "rewards/rejected": -4.5696821212768555, "step": 2460 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.31, - "learning_rate": 3.198610895379742e-06, - "logits/chosen": -0.3593900799751282, - "logits/chosen_friction": -0.3645535409450531, - "logits/rejected": -0.3456033170223236, - "logits/rejected_friction": -0.3379074037075043, - "logps/chosen": -3.0600223541259766, - "logps/chosen_friction": -1.0371733903884888, - "logps/rejected": -5.049943447113037, - "logps/rejected_friction": -4.065425872802734, - "loss": 0.0005, - "policy_friction_nll_loss": 1.0205280780792236, - "policy_nll_loss": 3.0248420238494873, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24736526608467102, - "rewards/chosen_fricton": -0.05703300982713699, - "rewards/margins": 0.1959211230278015, - "rewards/margins_friction": 0.29903697967529297, - "rewards/rejected": -0.44328635931015015, - "rewards/rejected_friction": -0.35606998205184937, - "step": 2465 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.32, - "learning_rate": 3.1923137644067197e-06, - "logits/chosen": -0.3593180775642395, - "logits/chosen_friction": -0.36348339915275574, - "logits/rejected": -0.3490147292613983, - "logits/rejected_friction": -0.3372521996498108, - "logps/chosen": -3.106876850128174, - "logps/chosen_friction": -1.0843693017959595, - "logps/rejected": -5.065398693084717, - "logps/rejected_friction": -4.2639055252075195, - "loss": 0.0007, - "policy_friction_nll_loss": 1.0739977359771729, - "policy_nll_loss": 3.075458288192749, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.24988214671611786, - "rewards/chosen_fricton": -0.05934521555900574, - "rewards/margins": 0.19247405230998993, - "rewards/margins_friction": 0.3135497570037842, - "rewards/rejected": -0.4423561692237854, - "rewards/rejected_friction": -0.3728949725627899, + "epoch": 0.75, + "learning_rate": 4.008686167537243e-07, + "logits/chosen": -0.36145132780075073, + "logits/rejected": -0.362403005361557, + "logps/chosen": -455.7185974121094, + "logps/rejected": -467.92828369140625, + "loss": 0.427, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.0464015007019043, + "rewards/margins": 1.3699265718460083, + "rewards/rejected": -4.416327953338623, "step": 2470 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.32, - "learning_rate": 3.1860118725362497e-06, - "logits/chosen": -0.3635866045951843, - "logits/chosen_friction": -0.3711097836494446, - "logits/rejected": -0.3475392162799835, - "logits/rejected_friction": -0.3430318832397461, - "logps/chosen": -3.112227201461792, - "logps/chosen_friction": -1.0665977001190186, - "logps/rejected": -5.134170055389404, - "logps/rejected_friction": -4.083018779754639, - "loss": 0.0016, - "policy_friction_nll_loss": 1.0652869939804077, - "policy_nll_loss": 3.088334321975708, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2527063488960266, - "rewards/chosen_fricton": -0.05988958477973938, - "rewards/margins": 0.1989477276802063, - "rewards/margins_friction": 0.29784995317459106, - "rewards/rejected": -0.4516540467739105, - "rewards/rejected_friction": -0.35773950815200806, - "step": 2475 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.32, - "learning_rate": 3.1797052631051294e-06, - "logits/chosen": -0.34880760312080383, - "logits/chosen_friction": -0.353074848651886, - "logits/rejected": -0.3336310386657715, - "logits/rejected_friction": -0.3256738781929016, - "logps/chosen": -3.202608108520508, - "logps/chosen_friction": -1.1070842742919922, - "logps/rejected": -5.147311687469482, - "logps/rejected_friction": -4.160330772399902, - "loss": 0.0007, - "policy_friction_nll_loss": 1.0893038511276245, - "policy_nll_loss": 3.158147096633911, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25904223322868347, - "rewards/chosen_fricton": -0.06088440492749214, - "rewards/margins": 0.19146797060966492, - "rewards/margins_friction": 0.30147451162338257, - "rewards/rejected": -0.4505102038383484, - "rewards/rejected_friction": -0.3623588979244232, + "epoch": 0.75, + "learning_rate": 3.862828160801707e-07, + "logits/chosen": -0.3624842166900635, + "logits/rejected": -0.36328238248825073, + "logps/chosen": -455.2030334472656, + "logps/rejected": -468.92608642578125, + "loss": 0.468, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.084407329559326, + "rewards/margins": 1.2791146039962769, + "rewards/rejected": -4.363522529602051, "step": 2480 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.33, - "learning_rate": 3.173393979482597e-06, - "logits/chosen": -0.35376229882240295, - "logits/chosen_friction": -0.35962921380996704, - "logits/rejected": -0.334113746881485, - "logits/rejected_friction": -0.33165720105171204, - "logps/chosen": -3.167695999145508, - "logps/chosen_friction": -1.0890882015228271, - "logps/rejected": -5.225467205047607, - "logps/rejected_friction": -4.104846000671387, - "loss": 0.0009, - "policy_friction_nll_loss": 1.0764219760894775, - "policy_nll_loss": 3.127072334289551, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.258111834526062, - "rewards/chosen_fricton": -0.061178356409072876, - "rewards/margins": 0.20235106348991394, - "rewards/margins_friction": 0.2976469099521637, - "rewards/rejected": -0.4604629576206207, - "rewards/rejected_friction": -0.3588252663612366, - "step": 2485 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.33, - "learning_rate": 3.167078065070034e-06, - "logits/chosen": -0.359779417514801, - "logits/chosen_friction": -0.368461936712265, - "logits/rejected": -0.34057852625846863, - "logits/rejected_friction": -0.3391065001487732, - "logps/chosen": -3.154426097869873, - "logps/chosen_friction": -1.063517451286316, - "logps/rejected": -5.241499423980713, - "logps/rejected_friction": -4.088709831237793, - "loss": 0.0007, - "policy_friction_nll_loss": 1.055070161819458, - "policy_nll_loss": 3.1295084953308105, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.25790834426879883, - "rewards/chosen_fricton": -0.060708481818437576, - "rewards/margins": 0.20491793751716614, - "rewards/margins_friction": 0.2977062165737152, - "rewards/rejected": -0.46282631158828735, - "rewards/rejected_friction": -0.3584147095680237, + "epoch": 0.76, + "learning_rate": 3.7194506948989405e-07, + "logits/chosen": -0.3563145697116852, + "logits/rejected": -0.3578342795372009, + "logps/chosen": -448.5079650878906, + "logps/rejected": -462.24542236328125, + "loss": 0.3916, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.866283655166626, + "rewards/margins": 1.522825002670288, + "rewards/rejected": -4.389109134674072, "step": 2490 }, { - "directrewards_student/accuracies": 1.0, - "epoch": 1.33, - "learning_rate": 3.1607575633006673e-06, - "logits/chosen": -0.35678109526634216, - "logits/chosen_friction": -0.3634141981601715, - "logits/rejected": -0.33565518260002136, - "logits/rejected_friction": -0.3350142538547516, - "logps/chosen": -3.2385106086730957, - "logps/chosen_friction": -1.087593913078308, - "logps/rejected": -5.303256988525391, - "logps/rejected_friction": -4.133756160736084, - "loss": 0.0006, - "policy_friction_nll_loss": 1.077362060546875, - "policy_nll_loss": 3.2089576721191406, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2647870182991028, - "rewards/chosen_fricton": -0.06174128130078316, - "rewards/margins": 0.2030152976512909, - "rewards/margins_friction": 0.3002726137638092, - "rewards/rejected": -0.46780234575271606, - "rewards/rejected_friction": -0.36201390624046326, - "step": 2495 - }, - { - "directrewards_student/accuracies": 1.0, - "epoch": 1.33, - "learning_rate": 3.1544325176392697e-06, - "logits/chosen": -0.3441302180290222, - "logits/chosen_friction": -0.35219284892082214, - "logits/rejected": -0.3248507082462311, - "logits/rejected_friction": -0.32254496216773987, - "logps/chosen": -3.2175769805908203, - "logps/chosen_friction": -1.077408790588379, - "logps/rejected": -5.201781272888184, - "logps/rejected_friction": -4.114433288574219, - "loss": 0.0009, - "policy_friction_nll_loss": 1.0585802793502808, - "policy_nll_loss": 3.165919542312622, - "rewards/accuracies": 1.0, - "rewards/accuracies_friction": 1.0, - "rewards/chosen": -0.2620188295841217, - "rewards/chosen_fricton": -0.05884331464767456, - "rewards/margins": 0.19553261995315552, - "rewards/margins_friction": 0.3003498911857605, - "rewards/rejected": -0.45755141973495483, - "rewards/rejected_friction": -0.35919323563575745, + "epoch": 0.76, + "learning_rate": 3.578570595810274e-07, + "logits/chosen": -0.35796427726745605, + "logits/rejected": -0.3588128089904785, + "logps/chosen": -454.2952575683594, + "logps/rejected": -462.9483337402344, + "loss": 0.4564, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.1886661052703857, + "rewards/margins": 1.3093502521514893, + "rewards/rejected": -4.498016357421875, "step": 2500 + }, + { + "epoch": 0.76, + "learning_rate": 3.4402043964399527e-07, + "logits/chosen": -0.35277941823005676, + "logits/rejected": -0.35380321741104126, + "logps/chosen": -441.46600341796875, + "logps/rejected": -452.00537109375, + "loss": 0.4007, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.036705493927002, + "rewards/margins": 1.4018588066101074, + "rewards/rejected": -4.438564777374268, + "step": 2510 + }, + { + "epoch": 0.77, + "learning_rate": 3.304368334674965e-07, + "logits/chosen": -0.3567604124546051, + "logits/rejected": -0.35805758833885193, + "logps/chosen": -449.0523376464844, + "logps/rejected": -461.69842529296875, + "loss": 0.4191, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.938197612762451, + "rewards/margins": 1.5301718711853027, + "rewards/rejected": -4.468369483947754, + "step": 2520 + }, + { + "epoch": 0.77, + "learning_rate": 3.1710783514794256e-07, + "logits/chosen": -0.35164931416511536, + "logits/rejected": -0.3529738187789917, + "logps/chosen": -449.84173583984375, + "logps/rejected": -464.1533203125, + "loss": 0.5458, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.2846596240997314, + "rewards/margins": 1.069946527481079, + "rewards/rejected": -4.354605674743652, + "step": 2530 + }, + { + "epoch": 0.77, + "learning_rate": 3.040350089023844e-07, + "logits/chosen": -0.3580131232738495, + "logits/rejected": -0.35896363854408264, + "logps/chosen": -460.78790283203125, + "logps/rejected": -474.05987548828125, + "loss": 0.4396, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.1543052196502686, + "rewards/margins": 1.453919768333435, + "rewards/rejected": -4.6082258224487305, + "step": 2540 + }, + { + "epoch": 0.77, + "learning_rate": 2.9121988888494297e-07, + "logits/chosen": -0.35557836294174194, + "logits/rejected": -0.356197327375412, + "logps/chosen": -454.5060119628906, + "logps/rejected": -467.5367736816406, + "loss": 0.3968, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.987016201019287, + "rewards/margins": 1.4960193634033203, + "rewards/rejected": -4.483035564422607, + "step": 2550 + }, + { + "epoch": 0.78, + "learning_rate": 2.786639790067719e-07, + "logits/chosen": -0.35686007142066956, + "logits/rejected": -0.3575289249420166, + "logps/chosen": -457.2361755371094, + "logps/rejected": -470.49212646484375, + "loss": 0.4509, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.249783754348755, + "rewards/margins": 1.2371891736984253, + "rewards/rejected": -4.486972808837891, + "step": 2560 + }, + { + "epoch": 0.78, + "learning_rate": 2.6636875275956567e-07, + "logits/chosen": -0.3554794192314148, + "logits/rejected": -0.35618001222610474, + "logps/chosen": -455.373291015625, + "logps/rejected": -466.55389404296875, + "loss": 0.5174, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.4816794395446777, + "rewards/margins": 1.009413480758667, + "rewards/rejected": -4.491092681884766, + "step": 2570 + }, + { + "epoch": 0.78, + "learning_rate": 2.543356530426394e-07, + "logits/chosen": -0.34936192631721497, + "logits/rejected": -0.3497045040130615, + "logps/chosen": -451.0462951660156, + "logps/rejected": -464.62109375, + "loss": 0.4859, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.2196598052978516, + "rewards/margins": 1.3027180433273315, + "rewards/rejected": -4.522377967834473, + "step": 2580 + }, + { + "epoch": 0.79, + "learning_rate": 2.425660919935954e-07, + "logits/chosen": -0.35678738355636597, + "logits/rejected": -0.35775676369667053, + "logps/chosen": -452.04925537109375, + "logps/rejected": -463.97869873046875, + "loss": 0.4253, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.8534128665924072, + "rewards/margins": 1.3880208730697632, + "rewards/rejected": -4.241434097290039, + "step": 2590 + }, + { + "epoch": 0.79, + "learning_rate": 2.3106145082260777e-07, + "logits/chosen": -0.35490182042121887, + "logits/rejected": -0.35594433546066284, + "logps/chosen": -456.057373046875, + "logps/rejected": -470.40350341796875, + "loss": 0.4502, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.0541415214538574, + "rewards/margins": 1.4110925197601318, + "rewards/rejected": -4.46523380279541, + "step": 2600 + }, + { + "epoch": 0.79, + "eval_logits/chosen": -0.4264547824859619, + "eval_logits/rejected": -0.4270709156990051, + "eval_logps/chosen": -443.8349914550781, + "eval_logps/rejected": -454.5430603027344, + "eval_loss": 0.48084381222724915, + "eval_rewards/accuracies": 0.7620000243186951, + "eval_rewards/chosen": -3.0943799018859863, + "eval_rewards/margins": 1.2620201110839844, + "eval_rewards/rejected": -4.356400489807129, + "eval_runtime": 351.5894, + "eval_samples_per_second": 1.422, + "eval_steps_per_second": 1.422, + "step": 2600 + }, + { + "epoch": 0.79, + "learning_rate": 2.1982307965032563e-07, + "logits/chosen": -0.3585938513278961, + "logits/rejected": -0.3597787618637085, + "logps/chosen": -453.99884033203125, + "logps/rejected": -462.98272705078125, + "loss": 0.5579, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.515160322189331, + "rewards/margins": 0.9192056655883789, + "rewards/rejected": -4.434365749359131, + "step": 2610 + }, + { + "epoch": 0.8, + "learning_rate": 2.0885229734943501e-07, + "logits/chosen": -0.35792115330696106, + "logits/rejected": -0.35949331521987915, + "logps/chosen": -441.6431579589844, + "logps/rejected": -454.21160888671875, + "loss": 0.4968, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.2611217498779297, + "rewards/margins": 1.2790337800979614, + "rewards/rejected": -4.540155410766602, + "step": 2620 + }, + { + "epoch": 0.8, + "learning_rate": 1.9815039138988135e-07, + "logits/chosen": -0.3631977438926697, + "logits/rejected": -0.3638666272163391, + "logps/chosen": -448.5018005371094, + "logps/rejected": -460.1982421875, + "loss": 0.452, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.071455478668213, + "rewards/margins": 1.4410284757614136, + "rewards/rejected": -4.512484073638916, + "step": 2630 + }, + { + "epoch": 0.8, + "learning_rate": 1.8771861768777794e-07, + "logits/chosen": -0.3509594798088074, + "logits/rejected": -0.35208243131637573, + "logps/chosen": -450.60308837890625, + "logps/rejected": -464.2266540527344, + "loss": 0.4278, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.225553512573242, + "rewards/margins": 1.3294174671173096, + "rewards/rejected": -4.554970741271973, + "step": 2640 + }, + { + "epoch": 0.8, + "learning_rate": 1.7755820045802146e-07, + "logits/chosen": -0.35590630769729614, + "logits/rejected": -0.35736554861068726, + "logps/chosen": -455.400390625, + "logps/rejected": -465.2867126464844, + "loss": 0.4158, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.942516803741455, + "rewards/margins": 1.4671036005020142, + "rewards/rejected": -4.409620761871338, + "step": 2650 + }, + { + "epoch": 0.81, + "learning_rate": 1.67670332070623e-07, + "logits/chosen": -0.3521929383277893, + "logits/rejected": -0.3526236116886139, + "logps/chosen": -455.163330078125, + "logps/rejected": -469.2591857910156, + "loss": 0.4457, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.1400699615478516, + "rewards/margins": 1.289953589439392, + "rewards/rejected": -4.430023193359375, + "step": 2660 + }, + { + "epoch": 0.81, + "learning_rate": 1.580561729107777e-07, + "logits/chosen": -0.35622936487197876, + "logits/rejected": -0.356993168592453, + "logps/chosen": -455.1328125, + "logps/rejected": -465.6949157714844, + "loss": 0.4489, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.2420265674591064, + "rewards/margins": 1.3224232196807861, + "rewards/rejected": -4.564449310302734, + "step": 2670 + }, + { + "epoch": 0.81, + "learning_rate": 1.487168512426901e-07, + "logits/chosen": -0.36213189363479614, + "logits/rejected": -0.3628009557723999, + "logps/chosen": -453.6480407714844, + "logps/rejected": -465.2872619628906, + "loss": 0.4185, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.116283416748047, + "rewards/margins": 1.3114349842071533, + "rewards/rejected": -4.427718162536621, + "step": 2680 + }, + { + "epoch": 0.82, + "learning_rate": 1.3965346307716676e-07, + "logits/chosen": -0.3530941605567932, + "logits/rejected": -0.35421401262283325, + "logps/chosen": -451.10894775390625, + "logps/rejected": -465.1979064941406, + "loss": 0.376, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -2.9081177711486816, + "rewards/margins": 1.643689751625061, + "rewards/rejected": -4.551807403564453, + "step": 2690 + }, + { + "epoch": 0.82, + "learning_rate": 1.3086707204299415e-07, + "logits/chosen": -0.36071377992630005, + "logits/rejected": -0.3618861138820648, + "logps/chosen": -448.95355224609375, + "logps/rejected": -460.8838806152344, + "loss": 0.4524, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.284519672393799, + "rewards/margins": 1.33005690574646, + "rewards/rejected": -4.6145758628845215, + "step": 2700 + }, + { + "epoch": 0.82, + "learning_rate": 1.223587092621162e-07, + "logits/chosen": -0.3580467998981476, + "logits/rejected": -0.35923272371292114, + "logps/chosen": -451.82769775390625, + "logps/rejected": -463.8099670410156, + "loss": 0.4238, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.3439979553222656, + "rewards/margins": 1.2987263202667236, + "rewards/rejected": -4.64272403717041, + "step": 2710 + }, + { + "epoch": 0.83, + "learning_rate": 1.1412937322862971e-07, + "logits/chosen": -0.3629991412162781, + "logits/rejected": -0.3639989495277405, + "logps/chosen": -448.5044860839844, + "logps/rejected": -460.817138671875, + "loss": 0.4102, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.0257716178894043, + "rewards/margins": 1.448880910873413, + "rewards/rejected": -4.4746527671813965, + "step": 2720 + }, + { + "epoch": 0.83, + "learning_rate": 1.0618002969160546e-07, + "logits/chosen": -0.3608396053314209, + "logits/rejected": -0.3618479371070862, + "logps/chosen": -453.93499755859375, + "logps/rejected": -466.81640625, + "loss": 0.4187, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.1413166522979736, + "rewards/margins": 1.3508026599884033, + "rewards/rejected": -4.492118835449219, + "step": 2730 + }, + { + "epoch": 0.83, + "learning_rate": 9.851161154175337e-08, + "logits/chosen": -0.3562917113304138, + "logits/rejected": -0.35710564255714417, + "logps/chosen": -451.28076171875, + "logps/rejected": -461.2808532714844, + "loss": 0.5024, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.143902540206909, + "rewards/margins": 1.1941773891448975, + "rewards/rejected": -4.338079929351807, + "step": 2740 + }, + { + "epoch": 0.84, + "learning_rate": 9.112501870194273e-08, + "logits/chosen": -0.3589875102043152, + "logits/rejected": -0.35990768671035767, + "logps/chosen": -452.32000732421875, + "logps/rejected": -461.66033935546875, + "loss": 0.5337, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.512810468673706, + "rewards/margins": 0.9673913717269897, + "rewards/rejected": -4.4802021980285645, + "step": 2750 + }, + { + "epoch": 0.84, + "learning_rate": 8.402111802159413e-08, + "logits/chosen": -0.3585359454154968, + "logits/rejected": -0.35975727438926697, + "logps/chosen": -454.8050842285156, + "logps/rejected": -465.12615966796875, + "loss": 0.4486, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.2963638305664062, + "rewards/margins": 1.2402369976043701, + "rewards/rejected": -4.5366010665893555, + "step": 2760 + }, + { + "epoch": 0.84, + "learning_rate": 7.720074317494913e-08, + "logits/chosen": -0.36562293767929077, + "logits/rejected": -0.3664829134941101, + "logps/chosen": -457.26068115234375, + "logps/rejected": -470.1167907714844, + "loss": 0.4503, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.0565972328186035, + "rewards/margins": 1.4593775272369385, + "rewards/rejected": -4.515974521636963, + "step": 2770 + }, + { + "epoch": 0.84, + "learning_rate": 7.06646945632361e-08, + "logits/chosen": -0.3597029447555542, + "logits/rejected": -0.3601227402687073, + "logps/chosen": -461.0421447753906, + "logps/rejected": -469.86175537109375, + "loss": 0.512, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.246166944503784, + "rewards/margins": 1.1497961282730103, + "rewards/rejected": -4.395963191986084, + "step": 2780 + }, + { + "epoch": 0.85, + "learning_rate": 6.441373922073946e-08, + "logits/chosen": -0.359005331993103, + "logits/rejected": -0.35974326729774475, + "logps/chosen": -455.99908447265625, + "logps/rejected": -466.95440673828125, + "loss": 0.4367, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.224595546722412, + "rewards/margins": 1.361697793006897, + "rewards/rejected": -4.5862932205200195, + "step": 2790 + }, + { + "epoch": 0.85, + "learning_rate": 5.844861072478336e-08, + "logits/chosen": -0.3530232608318329, + "logits/rejected": -0.3545222580432892, + "logps/chosen": -443.6240234375, + "logps/rejected": -458.322998046875, + "loss": 0.4834, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.3614017963409424, + "rewards/margins": 1.2071417570114136, + "rewards/rejected": -4.568543434143066, + "step": 2800 + }, + { + "epoch": 0.85, + "eval_logits/chosen": -0.4272295832633972, + "eval_logits/rejected": -0.42783358693122864, + "eval_logps/chosen": -444.2228088378906, + "eval_logps/rejected": -454.95098876953125, + "eval_loss": 0.48089736700057983, + "eval_rewards/accuracies": 0.7599999904632568, + "eval_rewards/chosen": -3.1331627368927, + "eval_rewards/margins": 1.2640310525894165, + "eval_rewards/rejected": -4.3971943855285645, + "eval_runtime": 351.6656, + "eval_samples_per_second": 1.422, + "eval_steps_per_second": 1.422, + "step": 2800 + }, + { + "epoch": 0.85, + "learning_rate": 5.2770009109645306e-08, + "logits/chosen": -0.36214134097099304, + "logits/rejected": -0.36288636922836304, + "logps/chosen": -454.91839599609375, + "logps/rejected": -466.05224609375, + "loss": 0.4296, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.1463279724121094, + "rewards/margins": 1.363966703414917, + "rewards/rejected": -4.5102949142456055, + "step": 2810 + }, + { + "epoch": 0.86, + "learning_rate": 4.7378600784402095e-08, + "logits/chosen": -0.3552590310573578, + "logits/rejected": -0.35652121901512146, + "logps/chosen": -455.6435546875, + "logps/rejected": -465.54693603515625, + "loss": 0.4669, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.2327640056610107, + "rewards/margins": 1.2310270071029663, + "rewards/rejected": -4.463791370391846, + "step": 2820 + }, + { + "epoch": 0.86, + "learning_rate": 4.22750184547252e-08, + "logits/chosen": -0.3599388301372528, + "logits/rejected": -0.3607821762561798, + "logps/chosen": -456.6576232910156, + "logps/rejected": -469.9142150878906, + "loss": 0.4199, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.1602883338928223, + "rewards/margins": 1.4565389156341553, + "rewards/rejected": -4.616827487945557, + "step": 2830 + }, + { + "epoch": 0.86, + "learning_rate": 3.745986104862903e-08, + "logits/chosen": -0.35964518785476685, + "logits/rejected": -0.360365092754364, + "logps/chosen": -455.8336486816406, + "logps/rejected": -467.90948486328125, + "loss": 0.4152, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.874156951904297, + "rewards/margins": 1.5480505228042603, + "rewards/rejected": -4.422207832336426, + "step": 2840 + }, + { + "epoch": 0.87, + "learning_rate": 3.293369364618465e-08, + "logits/chosen": -0.3647812604904175, + "logits/rejected": -0.3658196032047272, + "logps/chosen": -449.73138427734375, + "logps/rejected": -462.46942138671875, + "loss": 0.4729, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.295436382293701, + "rewards/margins": 1.252638816833496, + "rewards/rejected": -4.5480756759643555, + "step": 2850 + }, + { + "epoch": 0.87, + "learning_rate": 2.869704741320478e-08, + "logits/chosen": -0.35672903060913086, + "logits/rejected": -0.3576185703277588, + "logps/chosen": -449.70294189453125, + "logps/rejected": -459.93096923828125, + "loss": 0.4951, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.523907423019409, + "rewards/margins": 1.1444907188415527, + "rewards/rejected": -4.668398380279541, + "step": 2860 + }, + { + "epoch": 0.87, + "learning_rate": 2.4750419538908667e-08, + "logits/chosen": -0.3534146547317505, + "logits/rejected": -0.35466113686561584, + "logps/chosen": -452.890869140625, + "logps/rejected": -464.16510009765625, + "loss": 0.4477, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.2099146842956543, + "rewards/margins": 1.3813788890838623, + "rewards/rejected": -4.591293811798096, + "step": 2870 + }, + { + "epoch": 0.87, + "learning_rate": 2.1094273177576508e-08, + "logits/chosen": -0.36183369159698486, + "logits/rejected": -0.36180374026298523, + "logps/chosen": -455.7822265625, + "logps/rejected": -465.14801025390625, + "loss": 0.4747, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.090353488922119, + "rewards/margins": 1.3213683366775513, + "rewards/rejected": -4.411721706390381, + "step": 2880 + }, + { + "epoch": 0.88, + "learning_rate": 1.7729037394193792e-08, + "logits/chosen": -0.3579171299934387, + "logits/rejected": -0.35931870341300964, + "logps/chosen": -450.9007263183594, + "logps/rejected": -464.6581115722656, + "loss": 0.4626, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.0921690464019775, + "rewards/margins": 1.4891610145568848, + "rewards/rejected": -4.581330299377441, + "step": 2890 + }, + { + "epoch": 0.88, + "learning_rate": 1.4655107114101008e-08, + "logits/chosen": -0.36245545744895935, + "logits/rejected": -0.36358946561813354, + "logps/chosen": -452.5899353027344, + "logps/rejected": -467.2527770996094, + "loss": 0.464, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.9442224502563477, + "rewards/margins": 1.4334999322891235, + "rewards/rejected": -4.377722263336182, + "step": 2900 + }, + { + "epoch": 0.88, + "learning_rate": 1.1872843076645157e-08, + "logits/chosen": -0.35802754759788513, + "logits/rejected": -0.35865747928619385, + "logps/chosen": -454.47662353515625, + "logps/rejected": -465.0662536621094, + "loss": 0.3877, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.96606707572937, + "rewards/margins": 1.5276943445205688, + "rewards/rejected": -4.4937615394592285, + "step": 2910 + }, + { + "epoch": 0.89, + "learning_rate": 9.382571792846962e-09, + "logits/chosen": -0.3509235084056854, + "logits/rejected": -0.3516360819339752, + "logps/chosen": -443.75469970703125, + "logps/rejected": -453.1048278808594, + "loss": 0.4472, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.1273512840270996, + "rewards/margins": 1.3490197658538818, + "rewards/rejected": -4.476370811462402, + "step": 2920 + }, + { + "epoch": 0.89, + "learning_rate": 7.1845855070828975e-09, + "logits/chosen": -0.3624979555606842, + "logits/rejected": -0.36296314001083374, + "logps/chosen": -450.0325622558594, + "logps/rejected": -459.88916015625, + "loss": 0.4578, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.340954303741455, + "rewards/margins": 1.343732237815857, + "rewards/rejected": -4.684686660766602, + "step": 2930 + }, + { + "epoch": 0.89, + "learning_rate": 5.279142162789019e-09, + "logits/chosen": -0.35505902767181396, + "logits/rejected": -0.35619235038757324, + "logps/chosen": -451.9505920410156, + "logps/rejected": -465.6192321777344, + "loss": 0.4539, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.4748854637145996, + "rewards/margins": 1.2840955257415771, + "rewards/rejected": -4.758981227874756, + "step": 2940 + }, + { + "epoch": 0.9, + "learning_rate": 3.666465372190453e-09, + "logits/chosen": -0.356467604637146, + "logits/rejected": -0.357626736164093, + "logps/chosen": -452.7481384277344, + "logps/rejected": -465.7762145996094, + "loss": 0.472, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.293835401535034, + "rewards/margins": 1.261348009109497, + "rewards/rejected": -4.555183410644531, + "step": 2950 + }, + { + "epoch": 0.9, + "learning_rate": 2.34674439005822e-09, + "logits/chosen": -0.3525004982948303, + "logits/rejected": -0.35348066687583923, + "logps/chosen": -450.6170959472656, + "logps/rejected": -462.77069091796875, + "loss": 0.3976, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.295048952102661, + "rewards/margins": 1.4584187269210815, + "rewards/rejected": -4.753467559814453, + "step": 2960 + }, + { + "epoch": 0.9, + "learning_rate": 1.3201340915011685e-09, + "logits/chosen": -0.35318654775619507, + "logits/rejected": -0.35393238067626953, + "logps/chosen": -453.31707763671875, + "logps/rejected": -463.7138671875, + "loss": 0.4291, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.284226655960083, + "rewards/margins": 1.3613402843475342, + "rewards/rejected": -4.645566463470459, + "step": 2970 + }, + { + "epoch": 0.91, + "learning_rate": 5.86754953789681e-10, + "logits/chosen": -0.35480597615242004, + "logits/rejected": -0.35523343086242676, + "logps/chosen": -449.10235595703125, + "logps/rejected": -461.8785095214844, + "loss": 0.4941, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.270163059234619, + "rewards/margins": 1.221695065498352, + "rewards/rejected": -4.491857528686523, + "step": 2980 + }, + { + "epoch": 0.91, + "learning_rate": 1.4669304221726077e-10, + "logits/chosen": -0.3551548421382904, + "logits/rejected": -0.3556649386882782, + "logps/chosen": -456.6444396972656, + "logps/rejected": -467.2582092285156, + "loss": 0.4289, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.225339412689209, + "rewards/margins": 1.3719263076782227, + "rewards/rejected": -4.597265243530273, + "step": 2990 + }, + { + "epoch": 0.91, + "learning_rate": 0.0, + "logits/chosen": -0.35613125562667847, + "logits/rejected": -0.3575323522090912, + "logps/chosen": -453.228759765625, + "logps/rejected": -467.6603088378906, + "loss": 0.416, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.2920143604278564, + "rewards/margins": 1.416282296180725, + "rewards/rejected": -4.708296775817871, + "step": 3000 + }, + { + "epoch": 0.91, + "eval_logits/chosen": -0.42728757858276367, + "eval_logits/rejected": -0.42789557576179504, + "eval_logps/chosen": -444.3138427734375, + "eval_logps/rejected": -455.0481262207031, + "eval_loss": 0.47960197925567627, + "eval_rewards/accuracies": 0.7599999904632568, + "eval_rewards/chosen": -3.1422641277313232, + "eval_rewards/margins": 1.264641523361206, + "eval_rewards/rejected": -4.406905174255371, + "eval_runtime": 351.5662, + "eval_samples_per_second": 1.422, + "eval_steps_per_second": 1.422, + "step": 3000 } ], - "logging_steps": 5, - "max_steps": 6000, + "logging_steps": 10, + "max_steps": 3000, "num_input_tokens_seen": 0, - "num_train_epochs": 4, + "num_train_epochs": 1, "save_steps": 500, "total_flos": 0.0, - "train_batch_size": 2, + "train_batch_size": 4, "trial_name": null, "trial_params": null }