{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 200, "global_step": 620, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.32454361054766734, "grad_norm": 193.0, "kl": 0.18435561656951904, "learning_rate": 6e-08, "logits/chosen": -60439219.2, "logits/rejected": -88406048.0, "logps/chosen": -197.2506591796875, "logps/rejected": -107.289501953125, "loss": 3.5312, "rewards/chosen": -0.0054339878261089325, "rewards/margins": -0.004661996196955442, "rewards/rejected": -0.00077199162915349, "step": 10 }, { "epoch": 0.6490872210953347, "grad_norm": 184.0, "kl": 0.2278171330690384, "learning_rate": 1.2666666666666666e-07, "logits/chosen": -60431888.30573248, "logits/rejected": -90372359.85276073, "logps/chosen": -227.72788117038218, "logps/rejected": -115.19334739263803, "loss": 3.529, "rewards/chosen": -0.005934715650643512, "rewards/margins": -0.0023042475658752288, "rewards/rejected": -0.0036304680847682835, "step": 20 }, { "epoch": 0.973630831643002, "grad_norm": 212.0, "kl": 0.25923511385917664, "learning_rate": 1.9333333333333332e-07, "logits/chosen": -64896310.01834863, "logits/rejected": -90925383.15654951, "logps/chosen": -202.24849483944953, "logps/rejected": -113.89132388178913, "loss": 3.493, "rewards/chosen": -0.006282405386641849, "rewards/margins": 0.008622396832151831, "rewards/rejected": -0.01490480221879368, "step": 30 }, { "epoch": 1.2920892494929006, "grad_norm": 176.0, "kl": 0.1959868222475052, "learning_rate": 2.6e-07, "logits/chosen": -59720155.54179566, "logits/rejected": -88351780.93114755, "logps/chosen": -197.90833010835914, "logps/rejected": -108.10444415983606, "loss": 3.5435, "rewards/chosen": 0.006978450544847424, "rewards/margins": 0.021960525532540336, "rewards/rejected": -0.014982074987692912, "step": 40 }, { "epoch": 1.616632860040568, "grad_norm": 183.0, "kl": 0.21559596061706543, "learning_rate": 3.2666666666666663e-07, "logits/chosen": -59467023.25827815, "logits/rejected": -90559948.49704142, "logps/chosen": -228.09385347682118, "logps/rejected": -112.74208348742603, "loss": 3.5032, "rewards/chosen": 0.0010771212593609135, "rewards/margins": 0.03369407513736861, "rewards/rejected": -0.0326169538780077, "step": 50 }, { "epoch": 1.9411764705882353, "grad_norm": 166.0, "kl": 0.15380892157554626, "learning_rate": 3.933333333333333e-07, "logits/chosen": -65427847.25970149, "logits/rejected": -90375137.78360656, "logps/chosen": -201.5051072761194, "logps/rejected": -115.42854764344263, "loss": 3.4839, "rewards/chosen": -0.005884787217894597, "rewards/margins": 0.059975266923290735, "rewards/rejected": -0.06586005414118533, "step": 60 }, { "epoch": 2.259634888438134, "grad_norm": 177.0, "kl": 0.14670009911060333, "learning_rate": 4.6e-07, "logits/chosen": -59004054.974358976, "logits/rejected": -88976837.67088607, "logps/chosen": -197.41224709535257, "logps/rejected": -108.29176967958861, "loss": 3.5035, "rewards/chosen": 0.0009808578552343906, "rewards/margins": 0.09745390586015737, "rewards/rejected": -0.09647304800492298, "step": 70 }, { "epoch": 2.584178498985801, "grad_norm": 147.0, "kl": 0.17133259773254395, "learning_rate": 5.266666666666666e-07, "logits/chosen": -60354307.32467532, "logits/rejected": -89784492.72289157, "logps/chosen": -219.9271002435065, "logps/rejected": -113.98176298945783, "loss": 3.4706, "rewards/chosen": -0.01354174180464311, "rewards/margins": 0.1299854844710652, "rewards/rejected": -0.1435272262757083, "step": 80 }, { "epoch": 2.9087221095334685, "grad_norm": 197.0, "kl": 0.18291868269443512, "learning_rate": 5.933333333333334e-07, "logits/chosen": -64280637.686746985, "logits/rejected": -91105393.03896104, "logps/chosen": -209.47286803463857, "logps/rejected": -117.5850497159091, "loss": 3.4451, "rewards/chosen": -0.034842827233923485, "rewards/margins": 0.20260719032359223, "rewards/rejected": -0.2374500175575157, "step": 90 }, { "epoch": 3.227180527383367, "grad_norm": 197.0, "kl": 0.15808846056461334, "learning_rate": 6.6e-07, "logits/chosen": -61499443.52201258, "logits/rejected": -88841982.34838709, "logps/chosen": -191.46870086477986, "logps/rejected": -110.64470766129033, "loss": 3.4412, "rewards/chosen": -0.02490143655980908, "rewards/margins": 0.2772539658390694, "rewards/rejected": -0.3021554023988785, "step": 100 }, { "epoch": 3.5517241379310347, "grad_norm": 187.0, "kl": 0.20476070046424866, "learning_rate": 7.266666666666667e-07, "logits/chosen": -59606039.973244146, "logits/rejected": -90108102.19354838, "logps/chosen": -218.58996132943145, "logps/rejected": -116.83943135997067, "loss": 3.4053, "rewards/chosen": -0.04491897889204249, "rewards/margins": 0.3912458546913978, "rewards/rejected": -0.43616483358344027, "step": 110 }, { "epoch": 3.8762677484787016, "grad_norm": 177.0, "kl": 0.2384704351425171, "learning_rate": 7.933333333333333e-07, "logits/chosen": -63627667.66376811, "logits/rejected": -90490970.25084746, "logps/chosen": -210.72527173913045, "logps/rejected": -119.54580243644068, "loss": 3.3335, "rewards/chosen": -0.08938233195871546, "rewards/margins": 0.5145636426175557, "rewards/rejected": -0.6039459745762712, "step": 120 }, { "epoch": 4.1947261663286, "grad_norm": 180.0, "kl": 0.1893850862979889, "learning_rate": 8.599999999999999e-07, "logits/chosen": -60919781.574193545, "logits/rejected": -90189328.10062893, "logps/chosen": -201.86630544354838, "logps/rejected": -114.74926297169812, "loss": 3.3418, "rewards/chosen": -0.05891021605460874, "rewards/margins": 0.6146469454804984, "rewards/rejected": -0.6735571615351071, "step": 130 }, { "epoch": 4.519269776876268, "grad_norm": 175.0, "kl": 0.2661321759223938, "learning_rate": 9.266666666666665e-07, "logits/chosen": -59107307.58803987, "logits/rejected": -89497455.00884956, "logps/chosen": -215.70617473006644, "logps/rejected": -120.84332365412979, "loss": 3.2899, "rewards/chosen": -0.11127648084266638, "rewards/margins": 0.7554082443082831, "rewards/rejected": -0.8666847251509495, "step": 140 }, { "epoch": 4.8438133874239355, "grad_norm": 177.0, "kl": 0.3608871102333069, "learning_rate": 9.933333333333333e-07, "logits/chosen": -64112037.64705882, "logits/rejected": -91098432.85333334, "logps/chosen": -204.3817325367647, "logps/rejected": -125.74838541666666, "loss": 3.23, "rewards/chosen": -0.1627967497881721, "rewards/margins": 0.8743928864422965, "rewards/rejected": -1.0371896362304687, "step": 150 }, { "epoch": 5.162271805273834, "grad_norm": 137.0, "kl": 0.2991156578063965, "learning_rate": 9.99095521855875e-07, "logits/chosen": -60793345.625396825, "logits/rejected": -90675919.74440895, "logps/chosen": -211.70634920634922, "logps/rejected": -120.8386456669329, "loss": 3.217, "rewards/chosen": -0.10563917614164807, "rewards/margins": 0.9914023347846198, "rewards/rejected": -1.097041510926268, "step": 160 }, { "epoch": 5.486815415821501, "grad_norm": 171.0, "kl": 0.35243138670921326, "learning_rate": 9.959731316773258e-07, "logits/chosen": -57961989.13712375, "logits/rejected": -89004902.85043988, "logps/chosen": -207.07096571906354, "logps/rejected": -122.59219208211144, "loss": 3.15, "rewards/chosen": -0.1434617823980325, "rewards/margins": 1.1325593303963681, "rewards/rejected": -1.2760211127944006, "step": 170 }, { "epoch": 5.811359026369169, "grad_norm": 185.0, "kl": 0.4362719655036926, "learning_rate": 9.906356050933962e-07, "logits/chosen": -64087541.48973607, "logits/rejected": -91216310.36789298, "logps/chosen": -203.97832661290323, "logps/rejected": -131.44156302257525, "loss": 3.0881, "rewards/chosen": -0.23974457234581195, "rewards/margins": 1.1809270756605634, "rewards/rejected": -1.4206716480063755, "step": 180 }, { "epoch": 6.129817444219067, "grad_norm": 209.0, "kl": 0.45517590641975403, "learning_rate": 9.831067807935138e-07, "logits/chosen": -60818541.48427673, "logits/rejected": -91439209.7032258, "logps/chosen": -216.79994595125785, "logps/rejected": -123.5953125, "loss": 3.1009, "rewards/chosen": -0.14091354945920548, "rewards/margins": 1.3181333373540505, "rewards/rejected": -1.459046886813256, "step": 190 }, { "epoch": 6.454361054766734, "grad_norm": 174.0, "kl": 0.3785388171672821, "learning_rate": 9.73420284334652e-07, "logits/chosen": -57674728.34323432, "logits/rejected": -88681900.43916914, "logps/chosen": -205.04843492161717, "logps/rejected": -123.72198395771514, "loss": 3.0149, "rewards/chosen": -0.1368737551245359, "rewards/margins": 1.4349443391007053, "rewards/rejected": -1.571818094225241, "step": 200 }, { "epoch": 6.454361054766734, "eval_kl": 0.035786211490631104, "eval_logits/chosen": -67682705.2972973, "eval_logits/rejected": -106589274.61946903, "eval_logps/chosen": -223.63279490427928, "eval_logps/rejected": -130.98892941095133, "eval_loss": 0.3622306287288666, "eval_rewards/chosen": -0.1184040877196166, "eval_rewards/margins": 1.4728804700617717, "eval_rewards/rejected": -1.5912845577813883, "eval_runtime": 14.432, "eval_samples_per_second": 15.175, "eval_steps_per_second": 0.97, "step": 200 }, { "epoch": 6.778904665314402, "grad_norm": 173.0, "kl": 0.486126571893692, "learning_rate": 9.616193779614293e-07, "logits/chosen": -62086706.13649852, "logits/rejected": -91359813.28052805, "logps/chosen": -207.32766135014836, "logps/rejected": -134.37520627062707, "loss": 2.9831, "rewards/chosen": -0.22745244510095974, "rewards/margins": 1.4696961454905213, "rewards/rejected": -1.697148590591481, "step": 210 }, { "epoch": 7.0973630831643, "grad_norm": 238.0, "kl": 0.4411180913448334, "learning_rate": 9.477567673864215e-07, "logits/chosen": -61770599.064935066, "logits/rejected": -91440755.2, "logps/chosen": -212.82518262987014, "logps/rejected": -127.96240234375, "loss": 2.9719, "rewards/chosen": -0.23503605731121904, "rewards/margins": 1.439167243164855, "rewards/rejected": -1.6742033004760741, "step": 220 }, { "epoch": 7.421906693711968, "grad_norm": 696.0, "kl": 0.43413224816322327, "learning_rate": 9.318943663936569e-07, "logits/chosen": -58800922.256410256, "logits/rejected": -88753464.19512194, "logps/chosen": -204.23775540865384, "logps/rejected": -127.21961937881098, "loss": 2.9524, "rewards/chosen": -0.15640850556202424, "rewards/margins": 1.593839914967225, "rewards/rejected": -1.7502484205292492, "step": 230 }, { "epoch": 7.746450304259635, "grad_norm": 155.0, "kl": 0.6114085912704468, "learning_rate": 9.141030203166256e-07, "logits/chosen": -60832057.65765766, "logits/rejected": -91749279.27035831, "logps/chosen": -210.65941722972974, "logps/rejected": -133.82223381514657, "loss": 2.9045, "rewards/chosen": -0.2403512674051004, "rewards/margins": 1.6371219486983197, "rewards/rejected": -1.8774732161034202, "step": 240 }, { "epoch": 8.064908722109534, "grad_norm": 177.0, "kl": 0.5247067213058472, "learning_rate": 8.944621896258224e-07, "logits/chosen": -61470391.79487179, "logits/rejected": -90531684.4556962, "logps/chosen": -212.98465044070514, "logps/rejected": -129.2169822982595, "loss": 2.9026, "rewards/chosen": -0.2432682330791767, "rewards/margins": 1.5595254387340156, "rewards/rejected": -1.8027936718131923, "step": 250 }, { "epoch": 8.3894523326572, "grad_norm": 144.0, "kl": 0.4949173033237457, "learning_rate": 8.730595950389967e-07, "logits/chosen": -58573346.13333333, "logits/rejected": -89080557.88307692, "logps/chosen": -201.41715029761906, "logps/rejected": -128.4549278846154, "loss": 2.8883, "rewards/chosen": -0.1385447789752294, "rewards/margins": 1.7706690882122706, "rewards/rejected": -1.9092138671875, "step": 260 }, { "epoch": 8.713995943204868, "grad_norm": 185.0, "kl": 0.6798511743545532, "learning_rate": 8.499908257391323e-07, "logits/chosen": -60232947.512195125, "logits/rejected": -91839363.28205128, "logps/chosen": -217.515625, "logps/rejected": -134.24834735576923, "loss": 2.8604, "rewards/chosen": -0.22204266524896388, "rewards/margins": 1.757865030814738, "rewards/rejected": -1.9799076960637019, "step": 270 }, { "epoch": 9.032454361054766, "grad_norm": 240.0, "kl": 0.4586775600910187, "learning_rate": 8.253589124499511e-07, "logits/chosen": -61861802.11612903, "logits/rejected": -91389275.7735849, "logps/chosen": -207.82133316532259, "logps/rejected": -133.32677378144655, "loss": 2.8302, "rewards/chosen": -0.2830538349766885, "rewards/margins": 1.7209184404753555, "rewards/rejected": -2.003972275452044, "step": 280 }, { "epoch": 9.356997971602434, "grad_norm": 164.0, "kl": 0.61592036485672, "learning_rate": 7.992738672756908e-07, "logits/chosen": -58843218.940809965, "logits/rejected": -88515118.54545455, "logps/chosen": -197.16750632788163, "logps/rejected": -129.25817985893417, "loss": 2.8523, "rewards/chosen": -0.1824735837562062, "rewards/margins": 1.795535009381529, "rewards/rejected": -1.9780085931377351, "step": 290 }, { "epoch": 9.681541582150102, "grad_norm": 137.0, "kl": 0.7404313087463379, "learning_rate": 7.718521923603404e-07, "logits/chosen": -59332853.50157729, "logits/rejected": -91459862.98452012, "logps/chosen": -231.09537657728706, "logps/rejected": -134.74608165634675, "loss": 2.8027, "rewards/chosen": -0.2009657730439484, "rewards/margins": 1.85811701405938, "rewards/rejected": -2.0590827871033284, "step": 300 }, { "epoch": 10.0, "grad_norm": 207.0, "kl": 0.44328153133392334, "learning_rate": 7.43216359560785e-07, "logits/chosen": -62495649.72698413, "logits/rejected": -91457778.09584664, "logps/chosen": -200.75829613095237, "logps/rejected": -134.88111521565494, "loss": 2.7823, "rewards/chosen": -0.29915979778955853, "rewards/margins": 1.8331558409384765, "rewards/rejected": -2.132315638728035, "step": 310 }, { "epoch": 10.324543610547668, "grad_norm": 182.0, "kl": 0.5860379338264465, "learning_rate": 7.134942634577615e-07, "logits/chosen": -58812569.6, "logits/rejected": -88819168.0, "logps/chosen": -199.1839599609375, "logps/rejected": -127.58912353515625, "loss": 2.8078, "rewards/chosen": -0.19876351356506347, "rewards/margins": 1.8319713115692138, "rewards/rejected": -2.030734825134277, "step": 320 }, { "epoch": 10.649087221095336, "grad_norm": 177.0, "kl": 0.6635628938674927, "learning_rate": 6.828186501476144e-07, "logits/chosen": -58466127.89808917, "logits/rejected": -90940309.20245399, "logps/chosen": -229.41757066082803, "logps/rejected": -136.31909029907976, "loss": 2.7883, "rewards/chosen": -0.17490250897255671, "rewards/margins": 1.9413016884488392, "rewards/rejected": -2.116204197421396, "step": 330 }, { "epoch": 10.973630831643002, "grad_norm": 176.0, "kl": 0.6288160681724548, "learning_rate": 6.513265243660057e-07, "logits/chosen": -62876155.30275229, "logits/rejected": -91420423.36102237, "logps/chosen": -204.8246129587156, "logps/rejected": -136.0083491413738, "loss": 2.7516, "rewards/chosen": -0.2638938507173404, "rewards/margins": 1.962711744392983, "rewards/rejected": -2.2266055951103234, "step": 340 }, { "epoch": 11.2920892494929, "grad_norm": 165.0, "kl": 0.6786984205245972, "learning_rate": 6.191585375915055e-07, "logits/chosen": -58107764.50773994, "logits/rejected": -88665648.68196721, "logps/chosen": -199.96833881578948, "logps/rejected": -128.3748463114754, "loss": 2.8082, "rewards/chosen": -0.19902199193050987, "rewards/margins": 1.8429992122831171, "rewards/rejected": -2.042021204213627, "step": 350 }, { "epoch": 11.616632860040568, "grad_norm": 167.0, "kl": 0.6609476208686829, "learning_rate": 5.864583598619467e-07, "logits/chosen": -57476970.80794702, "logits/rejected": -91102129.23076923, "logps/chosen": -229.5999586092715, "logps/rejected": -134.39487795857988, "loss": 2.7377, "rewards/chosen": -0.1495321286435159, "rewards/margins": 2.048365336627841, "rewards/rejected": -2.197897465271357, "step": 360 }, { "epoch": 11.941176470588236, "grad_norm": 141.0, "kl": 0.6255931854248047, "learning_rate": 5.533720381091582e-07, "logits/chosen": -63461816.16716418, "logits/rejected": -90867329.2590164, "logps/chosen": -204.81716417910448, "logps/rejected": -137.47254098360656, "loss": 2.7423, "rewards/chosen": -0.3370915427136777, "rewards/margins": 1.9331667260491296, "rewards/rejected": -2.2702582687628072, "step": 370 }, { "epoch": 12.259634888438134, "grad_norm": 160.0, "kl": 0.6641746163368225, "learning_rate": 5.200473438779146e-07, "logits/chosen": -57381835.48717949, "logits/rejected": -89336883.84810127, "logps/chosen": -199.01509915865384, "logps/rejected": -127.97672320015823, "loss": 2.7637, "rewards/chosen": -0.159302613674066, "rewards/margins": 1.9056654471694863, "rewards/rejected": -2.0649680608435523, "step": 380 }, { "epoch": 12.584178498985802, "grad_norm": 158.0, "kl": 0.605785071849823, "learning_rate": 4.866331133423456e-07, "logits/chosen": -58439706.597402595, "logits/rejected": -90199761.73493975, "logps/chosen": -222.0545606737013, "logps/rejected": -134.65968561746988, "loss": 2.7272, "rewards/chosen": -0.22628873354428775, "rewards/margins": 1.985029133137282, "rewards/rejected": -2.21131786668157, "step": 390 }, { "epoch": 12.908722109533468, "grad_norm": 192.0, "kl": 0.7094799280166626, "learning_rate": 4.5327858256745065e-07, "logits/chosen": -62306581.590361446, "logits/rejected": -91545985.66233766, "logps/chosen": -212.20241905120483, "logps/rejected": -138.62365564123377, "loss": 2.7339, "rewards/chosen": -0.30779806389866105, "rewards/margins": 2.033511124349614, "rewards/rejected": -2.341309188248275, "step": 400 }, { "epoch": 12.908722109533468, "eval_kl": 0.035703618079423904, "eval_logits/chosen": -67034790.05405405, "eval_logits/rejected": -106966931.25663717, "eval_logps/chosen": -224.06151463963963, "eval_logps/rejected": -137.08888447179203, "eval_loss": 0.3353511095046997, "eval_rewards/chosen": -0.16127505603137318, "eval_rewards/margins": 2.040005520961506, "eval_rewards/rejected": -2.201280576992879, "eval_runtime": 14.3688, "eval_samples_per_second": 15.241, "eval_steps_per_second": 0.974, "step": 400 }, { "epoch": 13.227180527383368, "grad_norm": 160.0, "kl": 0.6558622121810913, "learning_rate": 4.201327209846065e-07, "logits/chosen": -59866034.716981135, "logits/rejected": -89190664.25806452, "logps/chosen": -193.23755650550314, "logps/rejected": -128.39615675403226, "loss": 2.7624, "rewards/chosen": -0.20178618521060585, "rewards/margins": 1.875514249517217, "rewards/rejected": -2.0773004347278228, "step": 410 }, { "epoch": 13.551724137931034, "grad_norm": 156.0, "kl": 0.5715658664703369, "learning_rate": 3.873435660579217e-07, "logits/chosen": -57941598.18060201, "logits/rejected": -90468988.62170088, "logps/chosen": -220.37727320234114, "logps/rejected": -135.28165093475073, "loss": 2.7148, "rewards/chosen": -0.22364995950041805, "rewards/margins": 2.0567365988756814, "rewards/rejected": -2.2803865583760996, "step": 420 }, { "epoch": 13.876267748478702, "grad_norm": 147.0, "kl": 0.8168804049491882, "learning_rate": 3.5505756211298774e-07, "logits/chosen": -61979449.136231884, "logits/rejected": -90915301.96610169, "logps/chosen": -212.7193161231884, "logps/rejected": -136.42921080508475, "loss": 2.7174, "rewards/chosen": -0.2887859178626019, "rewards/margins": 2.0035011004769108, "rewards/rejected": -2.2922870183395125, "step": 430 }, { "epoch": 14.1947261663286, "grad_norm": 188.0, "kl": 0.6269903779029846, "learning_rate": 3.234189062809695e-07, "logits/chosen": -59424662.29677419, "logits/rejected": -90425550.08805032, "logps/chosen": -203.18240927419356, "logps/rejected": -129.64670548349056, "loss": 2.7431, "rewards/chosen": -0.19051946824596774, "rewards/margins": 1.9727823955143509, "rewards/rejected": -2.1633018637603185, "step": 440 }, { "epoch": 14.519269776876268, "grad_norm": 145.0, "kl": 0.5869894027709961, "learning_rate": 2.9256890447921315e-07, "logits/chosen": -57757369.408637874, "logits/rejected": -89745903.38643068, "logps/chosen": -217.05743874584718, "logps/rejected": -134.99330521755164, "loss": 2.7134, "rewards/chosen": -0.24640435000194663, "rewards/margins": 2.035277397209521, "rewards/rejected": -2.2816817472114677, "step": 450 }, { "epoch": 14.843813387423936, "grad_norm": 160.0, "kl": 0.7495726346969604, "learning_rate": 2.626453403047172e-07, "logits/chosen": -62740118.5882353, "logits/rejected": -91247097.17333333, "logps/chosen": -205.96771599264707, "logps/rejected": -138.3257421875, "loss": 2.7088, "rewards/chosen": -0.32139582914464615, "rewards/margins": 1.973530928667854, "rewards/rejected": -2.2949267578125, "step": 460 }, { "epoch": 15.162271805273834, "grad_norm": 138.0, "kl": 0.6461220979690552, "learning_rate": 2.3378185965914078e-07, "logits/chosen": -59542024.12698413, "logits/rejected": -90673073.48242812, "logps/chosen": -212.47857142857143, "logps/rejected": -131.6886232028754, "loss": 2.7498, "rewards/chosen": -0.1828639923580109, "rewards/margins": 1.9991756036633548, "rewards/rejected": -2.1820395960213657, "step": 470 }, { "epoch": 15.486815415821502, "grad_norm": 153.0, "kl": 0.6038868427276611, "learning_rate": 2.0610737385376348e-07, "logits/chosen": -56933078.04682274, "logits/rejected": -88981978.46334311, "logps/chosen": -207.75867474916387, "logps/rejected": -132.5094391495601, "loss": 2.7032, "rewards/chosen": -0.21223314470272, "rewards/margins": 2.0555130507128774, "rewards/rejected": -2.2677461954155973, "step": 480 }, { "epoch": 15.811359026369168, "grad_norm": 169.0, "kl": 0.7091981172561646, "learning_rate": 1.7974548386027584e-07, "logits/chosen": -62982918.75659824, "logits/rejected": -91177055.89297658, "logps/chosen": -205.08644153225808, "logps/rejected": -140.23990123327758, "loss": 2.6976, "rewards/chosen": -0.3505571203147911, "rewards/margins": 1.9499471239760744, "rewards/rejected": -2.3005042442908654, "step": 490 }, { "epoch": 16.129817444219068, "grad_norm": 170.0, "kl": 0.7773324251174927, "learning_rate": 1.5481392827883488e-07, "logits/chosen": -59927378.11320755, "logits/rejected": -91354719.79354839, "logps/chosen": -217.3079304245283, "logps/rejected": -131.10647681451613, "loss": 2.7533, "rewards/chosen": -0.19171231347809797, "rewards/margins": 2.018452205801136, "rewards/rejected": -2.210164519279234, "step": 500 }, { "epoch": 16.454361054766736, "grad_norm": 159.0, "kl": 0.592677116394043, "learning_rate": 1.3142405748889457e-07, "logits/chosen": -56867931.24752475, "logits/rejected": -88677786.20771514, "logps/chosen": -205.24737004950495, "logps/rejected": -130.59708827893175, "loss": 2.7022, "rewards/chosen": -0.15676989413724088, "rewards/margins": 2.102558342428505, "rewards/rejected": -2.2593282365657457, "step": 510 }, { "epoch": 16.7789046653144, "grad_norm": 176.0, "kl": 0.7139925956726074, "learning_rate": 1.096803363313803e-07, "logits/chosen": -61236336.4272997, "logits/rejected": -91304395.61716172, "logps/chosen": -207.9202290430267, "logps/rejected": -140.73229940181517, "loss": 2.7113, "rewards/chosen": -0.2867103825690838, "rewards/margins": 2.0461471585561224, "rewards/rejected": -2.332857541125206, "step": 520 }, { "epoch": 17.0973630831643, "grad_norm": 197.0, "kl": 0.7058033347129822, "learning_rate": 8.967987754335022e-08, "logits/chosen": -61097139.53246753, "logits/rejected": -91390720.0, "logps/chosen": -213.40300324675326, "logps/rejected": -133.4157958984375, "loss": 2.7195, "rewards/chosen": -0.29281839147790684, "rewards/margins": 1.9267250655533432, "rewards/rejected": -2.21954345703125, "step": 530 }, { "epoch": 17.421906693711968, "grad_norm": 264.0, "kl": 0.5918253064155579, "learning_rate": 7.15120080289368e-08, "logits/chosen": -58138827.48717949, "logits/rejected": -88649209.75609756, "logps/chosen": -204.44078024839743, "logps/rejected": -132.16799256859755, "loss": 2.73, "rewards/chosen": -0.17671105800530848, "rewards/margins": 2.0683754258337728, "rewards/rejected": -2.2450864838390814, "step": 540 }, { "epoch": 17.746450304259636, "grad_norm": 157.0, "kl": 0.7473562359809875, "learning_rate": 5.5257869903709006e-08, "logits/chosen": -60289140.85285285, "logits/rejected": -91669847.55700326, "logps/chosen": -210.96473817567568, "logps/rejected": -138.43526058631923, "loss": 2.7077, "rewards/chosen": -0.27088488329638233, "rewards/margins": 2.0678908508750347, "rewards/rejected": -2.338775734171417, "step": 550 }, { "epoch": 18.064908722109532, "grad_norm": 163.0, "kl": 0.6325186491012573, "learning_rate": 4.099005809428596e-08, "logits/chosen": -60974821.743589744, "logits/rejected": -90441339.1392405, "logps/chosen": -213.37244591346155, "logps/rejected": -133.14664507515823, "loss": 2.7211, "rewards/chosen": -0.2820472228221404, "rewards/margins": 1.9137137696867599, "rewards/rejected": -2.1957609925089003, "step": 560 }, { "epoch": 18.3894523326572, "grad_norm": 138.0, "kl": 0.6061395406723022, "learning_rate": 2.8772296111772677e-08, "logits/chosen": -58149936.76190476, "logits/rejected": -89025810.11692308, "logps/chosen": -201.52972470238095, "logps/rejected": -132.06661057692307, "loss": 2.7258, "rewards/chosen": -0.14980345226469494, "rewards/margins": 2.120578909514151, "rewards/rejected": -2.270382361778846, "step": 570 }, { "epoch": 18.713995943204868, "grad_norm": 191.0, "kl": 0.7928330898284912, "learning_rate": 1.865915144708985e-08, "logits/chosen": -59847180.487804875, "logits/rejected": -91838811.8974359, "logps/chosen": -217.81192835365854, "logps/rejected": -137.56884765625, "loss": 2.7109, "rewards/chosen": -0.25167286105272246, "rewards/margins": 2.060285256310058, "rewards/rejected": -2.3119581173627806, "step": 580 }, { "epoch": 19.032454361054768, "grad_norm": 228.0, "kl": 0.5313221216201782, "learning_rate": 1.0695791859313297e-08, "logits/chosen": -61405038.658064514, "logits/rejected": -91388741.2327044, "logps/chosen": -208.06592741935484, "logps/rejected": -136.2116868121069, "loss": 2.7008, "rewards/chosen": -0.3075132308467742, "rewards/margins": 1.9849507315402146, "rewards/rejected": -2.292463962386989, "step": 590 }, { "epoch": 19.356997971602436, "grad_norm": 159.0, "kl": 0.7078633308410645, "learning_rate": 4.917783645496887e-09, "logits/chosen": -58602562.99065421, "logits/rejected": -88561907.96238245, "logps/chosen": -197.22442075545172, "logps/rejected": -131.6894592476489, "loss": 2.7409, "rewards/chosen": -0.1881643990489924, "rewards/margins": 2.0329730513575512, "rewards/rejected": -2.2211374504065438, "step": 600 }, { "epoch": 19.356997971602436, "eval_kl": 0.060564398765563965, "eval_logits/chosen": -67062447.27927928, "eval_logits/rejected": -106975032.63716814, "eval_logps/chosen": -223.90932925112614, "eval_logps/rejected": -137.44655696902655, "eval_loss": 0.33327072858810425, "eval_rewards/chosen": -0.14605889878831468, "eval_rewards/margins": 2.090988279767064, "eval_rewards/rejected": -2.237047178555379, "eval_runtime": 14.3431, "eval_samples_per_second": 15.269, "eval_steps_per_second": 0.976, "step": 600 }, { "epoch": 19.6815415821501, "grad_norm": 133.0, "kl": 0.8249608278274536, "learning_rate": 1.350932792956394e-09, "logits/chosen": -59016821.90536278, "logits/rejected": -91488921.75851393, "logps/chosen": -231.22170741324922, "logps/rejected": -136.9874467879257, "loss": 2.7014, "rewards/chosen": -0.21359941259919657, "rewards/margins": 2.069619963247359, "rewards/rejected": -2.2832193758465555, "step": 610 }, { "epoch": 20.0, "grad_norm": 176.0, "kl": 0.49784502387046814, "learning_rate": 1.1169723465487279e-11, "logits/chosen": -62227758.32380953, "logits/rejected": -91514088.28115016, "logps/chosen": -201.05374503968255, "logps/rejected": -136.85892571884983, "loss": 2.6991, "rewards/chosen": -0.32870684426928326, "rewards/margins": 2.0013896107776414, "rewards/rejected": -2.3300964550469248, "step": 620 } ], "logging_steps": 10, "max_steps": 620, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }