Qwen-ckpt-100 / trainer_state.json
Delta-Vector's picture
Upload folder using huggingface_hub
d5318e0 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.2,
"eval_steps": 50,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"advantage/absmean": 0.0,
"entropy": 0.49213120341300964,
"epoch": 0.002,
"grad_norm": 0.0,
"importance_ratio": 0.9995924234390259,
"learning_rate": 0.0,
"loss": 0.0,
"mismatch_kl": 0.0013128521386533976,
"reward": 0.009999999776482582,
"reward/refusal_reward_func": 0.009999999776482582,
"reward/std": 0.0,
"step": 1,
"timing/generation_ms": 3254.1263923048973,
"timing/scoring_ms": 25275.689974427223,
"timing/total_ms": 28529.81636673212,
"tokens/completion": 196.125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 37.95693778991699
},
{
"advantage/absmean": 0.17499999701976776,
"entropy": 0.6319499611854553,
"epoch": 0.004,
"grad_norm": 0.21545597111492612,
"importance_ratio": 0.9992015957832336,
"learning_rate": 1e-05,
"loss": -0.0015,
"mismatch_kl": 0.0010142761748284101,
"reward": 0.7100000381469727,
"reward/refusal_reward_func": 0.7100000381469727,
"reward/std": 0.26457512378692627,
"step": 2,
"timing/generation_ms": 5644.344195723534,
"timing/scoring_ms": 31337.847255170345,
"timing/total_ms": 36982.19145089388,
"tokens/completion": 642.9375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 92.69584393501282
},
{
"advantage/absmean": 0.08203125,
"entropy": 0.4765586853027344,
"epoch": 0.006,
"grad_norm": 0.2892202194064109,
"importance_ratio": 1.000748872756958,
"learning_rate": 2e-05,
"loss": 0.0023,
"mismatch_kl": 0.001345986733213067,
"reward": 0.06593750417232513,
"reward/refusal_reward_func": 0.06593750417232513,
"reward/std": 0.15905697643756866,
"step": 3,
"timing/generation_ms": 2269.3659961223602,
"timing/scoring_ms": 26262.165516614914,
"timing/total_ms": 28531.531512737274,
"tokens/completion": 240.15625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 143.94128561019897
},
{
"advantage/absmean": 0.0237890612334013,
"entropy": 0.5391862988471985,
"epoch": 0.008,
"grad_norm": 0.03657768868171977,
"importance_ratio": 1.0003156661987305,
"learning_rate": 3e-05,
"loss": 0.0003,
"mismatch_kl": 0.0014281703624874353,
"reward": 0.02812499925494194,
"reward/refusal_reward_func": 0.02812499925494194,
"reward/std": 0.028986798599362373,
"step": 4,
"timing/generation_ms": 3297.7539896965027,
"timing/scoring_ms": 18404.439702630043,
"timing/total_ms": 21702.193692326546,
"tokens/completion": 328.96875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 30.997375011444092
},
{
"advantage/absmean": 0.37681642174720764,
"entropy": 0.6411616206169128,
"epoch": 0.01,
"grad_norm": 0.3743397229309058,
"importance_ratio": 1.0011004209518433,
"learning_rate": 4e-05,
"loss": -0.0173,
"mismatch_kl": 0.00182775326538831,
"reward": 0.37406250834465027,
"reward/refusal_reward_func": 0.37406250834465027,
"reward/std": 0.3808777630329132,
"step": 5,
"timing/generation_ms": 4316.511310636997,
"timing/scoring_ms": 31669.483192265034,
"timing/total_ms": 35985.99450290203,
"tokens/completion": 485.3125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 72.83462309837341
},
{
"advantage/absmean": 0.007929688319563866,
"entropy": 0.5038114786148071,
"epoch": 0.012,
"grad_norm": 0.02765669861934211,
"importance_ratio": 0.999994158744812,
"learning_rate": 5e-05,
"loss": -0.0013,
"mismatch_kl": 0.0035878715571016073,
"reward": 0.014374999329447746,
"reward/refusal_reward_func": 0.014374999329447746,
"reward/std": 0.015398356132209301,
"step": 6,
"timing/generation_ms": 3028.248645365238,
"timing/scoring_ms": 25548.00620675087,
"timing/total_ms": 28576.254852116108,
"tokens/completion": 334.125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 36.018136739730835
},
{
"advantage/absmean": 0.0,
"entropy": 0.29107579588890076,
"epoch": 0.014,
"grad_norm": 0.0,
"importance_ratio": 1.0008047819137573,
"learning_rate": 6e-05,
"loss": 0.0,
"mismatch_kl": 0.003742748638615012,
"reward": 0.009999999776482582,
"reward/refusal_reward_func": 0.009999999776482582,
"reward/std": 0.0,
"step": 7,
"timing/generation_ms": 1139.9633809924126,
"timing/scoring_ms": 19651.204399764538,
"timing/total_ms": 20791.16778075695,
"tokens/completion": 100.375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 23.77466917037964
},
{
"advantage/absmean": 0.006562499795109034,
"entropy": 0.3522435426712036,
"epoch": 0.016,
"grad_norm": 0.013866793274081895,
"importance_ratio": 1.0039043426513672,
"learning_rate": 7e-05,
"loss": 0.0029,
"mismatch_kl": 0.022596202790737152,
"reward": 0.013749999925494194,
"reward/refusal_reward_func": 0.013749999925494194,
"reward/std": 0.009921567514538765,
"step": 8,
"timing/generation_ms": 784.0555533766747,
"timing/scoring_ms": 19302.494660019875,
"timing/total_ms": 20086.55021339655,
"tokens/completion": 56.875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 24.927189350128174
},
{
"advantage/absmean": 0.17390625178813934,
"entropy": 0.8274978995323181,
"epoch": 0.018,
"grad_norm": 0.11705006346082461,
"importance_ratio": 1.0001081228256226,
"learning_rate": 8e-05,
"loss": 0.0021,
"mismatch_kl": 0.0024926774203777313,
"reward": 0.6775000095367432,
"reward/refusal_reward_func": 0.6775000095367432,
"reward/std": 0.24893523752689362,
"step": 9,
"timing/generation_ms": 9839.449286460876,
"timing/scoring_ms": 40201.816976070404,
"timing/total_ms": 50041.26626253128,
"tokens/completion": 1103.59375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 94.76600408554077
},
{
"advantage/absmean": 0.0035156249068677425,
"entropy": 0.24158422648906708,
"epoch": 0.02,
"grad_norm": 0.005773329550661251,
"importance_ratio": 0.9983059763908386,
"learning_rate": 9e-05,
"loss": 0.0009,
"mismatch_kl": 0.05829961970448494,
"reward": 0.011874999850988388,
"reward/refusal_reward_func": 0.011874999850988388,
"reward/std": 0.007261843420565128,
"step": 10,
"timing/generation_ms": 553.7310987710953,
"timing/scoring_ms": 18607.969902455807,
"timing/total_ms": 19161.701001226902,
"tokens/completion": 26.25,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 23.25968861579895
},
{
"advantage/absmean": 0.027421876788139343,
"entropy": 0.5188111066818237,
"epoch": 0.022,
"grad_norm": 0.030197590581975912,
"importance_ratio": 0.9986535310745239,
"learning_rate": 0.0001,
"loss": -0.0035,
"mismatch_kl": 0.005903073586523533,
"reward": 0.026874996721744537,
"reward/refusal_reward_func": 0.026874996721744537,
"reward/std": 0.046599194407463074,
"step": 11,
"timing/generation_ms": 2835.02546697855,
"timing/scoring_ms": 24443.98508220911,
"timing/total_ms": 27279.01054918766,
"tokens/completion": 308.53125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 145.36076593399048
},
{
"advantage/absmean": 0.12355469167232513,
"entropy": 0.44281068444252014,
"epoch": 0.024,
"grad_norm": 0.42446378552193,
"importance_ratio": 0.9987350106239319,
"learning_rate": 0.0001,
"loss": -0.0423,
"mismatch_kl": 0.006952627561986446,
"reward": 0.09437499195337296,
"reward/refusal_reward_func": 0.09437499195337296,
"reward/std": 0.21499907970428467,
"step": 12,
"timing/generation_ms": 2469.3235754966736,
"timing/scoring_ms": 35024.46338534355,
"timing/total_ms": 37493.786960840225,
"tokens/completion": 273.28125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 154.79029417037964
},
{
"advantage/absmean": 0.09375,
"entropy": 0.5496101379394531,
"epoch": 0.026,
"grad_norm": 0.1257011193040082,
"importance_ratio": 0.9992591738700867,
"learning_rate": 0.0001,
"loss": -0.0035,
"mismatch_kl": 0.0034747051540762186,
"reward": 0.7599999904632568,
"reward/refusal_reward_func": 0.7599999904632568,
"reward/std": 0.19364915788173676,
"step": 13,
"timing/generation_ms": 8158.617563545704,
"timing/scoring_ms": 36215.20960330963,
"timing/total_ms": 44373.827166855335,
"tokens/completion": 930.6875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 122.23734498023987
},
{
"advantage/absmean": 0.1844140589237213,
"entropy": 0.6212608814239502,
"epoch": 0.028,
"grad_norm": 0.26809699141136345,
"importance_ratio": 1.0004856586456299,
"learning_rate": 0.0001,
"loss": -0.0125,
"mismatch_kl": 0.0054090130142867565,
"reward": 0.1978124976158142,
"reward/refusal_reward_func": 0.1978124976158142,
"reward/std": 0.21153803169727325,
"step": 14,
"timing/generation_ms": 3771.018899977207,
"timing/scoring_ms": 32557.9876229167,
"timing/total_ms": 36329.006522893906,
"tokens/completion": 423.1875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 76.63339400291443
},
{
"advantage/absmean": 0.09375,
"entropy": 0.5119910836219788,
"epoch": 0.03,
"grad_norm": 0.2293179923709544,
"importance_ratio": 1.0004829168319702,
"learning_rate": 0.0001,
"loss": 0.0032,
"mismatch_kl": 0.0062421830371022224,
"reward": 0.05999999865889549,
"reward/refusal_reward_func": 0.05999999865889549,
"reward/std": 0.19364915788173676,
"step": 15,
"timing/generation_ms": 3333.340108394623,
"timing/scoring_ms": 27093.5076251626,
"timing/total_ms": 30426.847733557224,
"tokens/completion": 374.25,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 143.25073266029358
},
{
"advantage/absmean": 0.023906249552965164,
"entropy": 0.7846541404724121,
"epoch": 0.032,
"grad_norm": 0.03020597167190001,
"importance_ratio": 0.9995452165603638,
"learning_rate": 0.0001,
"loss": 0.0001,
"mismatch_kl": 0.006196495145559311,
"reward": 0.04218750074505806,
"reward/refusal_reward_func": 0.04218750074505806,
"reward/std": 0.030489176511764526,
"step": 16,
"timing/generation_ms": 6338.90475332737,
"timing/scoring_ms": 35081.99892938137,
"timing/total_ms": 41420.90368270874,
"tokens/completion": 725.5625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 89.63312363624573
},
{
"advantage/absmean": 0.2109375,
"entropy": 0.4431617856025696,
"epoch": 0.034,
"grad_norm": 0.11997986504420295,
"importance_ratio": 1.0006847381591797,
"learning_rate": 0.0001,
"loss": -0.0079,
"mismatch_kl": 0.0046984292566776276,
"reward": 0.6850000023841858,
"reward/refusal_reward_func": 0.6850000023841858,
"reward/std": 0.2904737591743469,
"step": 17,
"timing/generation_ms": 4871.280819177628,
"timing/scoring_ms": 27726.198948919773,
"timing/total_ms": 32597.4797680974,
"tokens/completion": 541.3125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 72.54996109008789
},
{
"advantage/absmean": 0.020624998956918716,
"entropy": 0.005482906475663185,
"epoch": 0.036,
"grad_norm": 2.4822412849126587e-05,
"importance_ratio": 0.9996475577354431,
"learning_rate": 0.0001,
"loss": 0.0,
"mismatch_kl": 3.828452292964357e-07,
"reward": 0.023749999701976776,
"reward/refusal_reward_func": 0.023749999701976776,
"reward/std": 0.026896795257925987,
"step": 18,
"timing/generation_ms": 466.4832055568695,
"timing/scoring_ms": 18893.90940964222,
"timing/total_ms": 19360.39261519909,
"tokens/completion": 13.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 24.94274139404297
},
{
"advantage/absmean": 0.06175781413912773,
"entropy": 0.5270810723304749,
"epoch": 0.038,
"grad_norm": 0.19852274538781398,
"importance_ratio": 1.0000227689743042,
"learning_rate": 0.0001,
"loss": 0.0024,
"mismatch_kl": 0.005923233926296234,
"reward": 0.06562499701976776,
"reward/refusal_reward_func": 0.06562499701976776,
"reward/std": 0.13811086118221283,
"step": 19,
"timing/generation_ms": 4183.5604682564735,
"timing/scoring_ms": 30921.414978802204,
"timing/total_ms": 35104.97544705868,
"tokens/completion": 476.46875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 69.38085293769836
},
{
"advantage/absmean": 0.0018164062639698386,
"entropy": 0.004832141101360321,
"epoch": 0.04,
"grad_norm": 0.00028270733132418016,
"importance_ratio": 0.9995356202125549,
"learning_rate": 0.0001,
"loss": -0.0,
"mismatch_kl": 1.2251906582605443e-06,
"reward": 0.010937499813735485,
"reward/refusal_reward_func": 0.010937499813735485,
"reward/std": 0.005219778511673212,
"step": 20,
"timing/generation_ms": 469.4804549217224,
"timing/scoring_ms": 18075.80190896988,
"timing/total_ms": 18545.2823638916,
"tokens/completion": 13.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 22.75140905380249
},
{
"advantage/absmean": 0.13593749701976776,
"entropy": 0.6113055944442749,
"epoch": 0.042,
"grad_norm": 0.14856988549338257,
"importance_ratio": 0.999966561794281,
"learning_rate": 0.0001,
"loss": -0.0022,
"mismatch_kl": 0.005093984771519899,
"reward": 0.7350000143051147,
"reward/refusal_reward_func": 0.7350000143051147,
"reward/std": 0.23318447172641754,
"step": 21,
"timing/generation_ms": 6330.163478851318,
"timing/scoring_ms": 28551.29039287567,
"timing/total_ms": 34881.45387172699,
"tokens/completion": 715.9375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 84.07915687561035
},
{
"advantage/absmean": 0.01318359375,
"entropy": 0.5851391553878784,
"epoch": 0.044,
"grad_norm": 0.040096615934913975,
"importance_ratio": 1.0004969835281372,
"learning_rate": 0.0001,
"loss": -0.0018,
"mismatch_kl": 0.009555388242006302,
"reward": 0.017812499776482582,
"reward/refusal_reward_func": 0.017812499776482582,
"reward/std": 0.020575225353240967,
"step": 22,
"timing/generation_ms": 2347.38065302372,
"timing/scoring_ms": 26964.800156652927,
"timing/total_ms": 29312.180809676647,
"tokens/completion": 255.15625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 150.09117031097412
},
{
"advantage/absmean": 0.01718750037252903,
"entropy": 0.5837696194648743,
"epoch": 0.046,
"grad_norm": 0.027639162796687714,
"importance_ratio": 1.0036453008651733,
"learning_rate": 0.0001,
"loss": -0.0037,
"mismatch_kl": 0.017472539097070694,
"reward": 0.022499999031424522,
"reward/refusal_reward_func": 0.022499999031424522,
"reward/std": 0.021650634706020355,
"step": 23,
"timing/generation_ms": 2000.5059093236923,
"timing/scoring_ms": 24497.070513665676,
"timing/total_ms": 26497.57642298937,
"tokens/completion": 222.25,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 39.751105070114136
},
{
"advantage/absmean": 0.09375,
"entropy": 0.6268512010574341,
"epoch": 0.048,
"grad_norm": 0.18640619697968583,
"importance_ratio": 0.9990081787109375,
"learning_rate": 0.0001,
"loss": -0.0019,
"mismatch_kl": 0.005103914998471737,
"reward": 0.7599999904632568,
"reward/refusal_reward_func": 0.7599999904632568,
"reward/std": 0.19364915788173676,
"step": 24,
"timing/generation_ms": 5919.098302721977,
"timing/scoring_ms": 32965.33615142107,
"timing/total_ms": 38884.43445414305,
"tokens/completion": 674.9375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 105.22681593894958
},
{
"advantage/absmean": 0.007929687388241291,
"entropy": 0.657754123210907,
"epoch": 0.05,
"grad_norm": 0.025336375406596286,
"importance_ratio": 1.0006119012832642,
"learning_rate": 0.0001,
"loss": -0.0006,
"mismatch_kl": 0.006333181634545326,
"reward": 0.014374999329447746,
"reward/refusal_reward_func": 0.014374999329447746,
"reward/std": 0.015398357063531876,
"step": 25,
"timing/generation_ms": 4377.142012119293,
"timing/scoring_ms": 25255.830891430378,
"timing/total_ms": 29632.97290354967,
"tokens/completion": 501.125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 47.89186096191406
},
{
"advantage/absmean": 0.0,
"entropy": 0.17220205068588257,
"epoch": 0.052,
"grad_norm": 0.0,
"importance_ratio": 1.0002461671829224,
"learning_rate": 0.0001,
"loss": 0.0,
"mismatch_kl": 0.0026462471578270197,
"reward": 0.009999999776482582,
"reward/refusal_reward_func": 0.009999999776482582,
"reward/std": 0.0,
"step": 26,
"timing/generation_ms": 497.48579412698746,
"timing/scoring_ms": 18527.822844684124,
"timing/total_ms": 19025.30863881111,
"tokens/completion": 19.40625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 22.794724941253662
},
{
"advantage/absmean": 0.17499999701976776,
"entropy": 0.546366810798645,
"epoch": 0.054,
"grad_norm": 0.22199732245019957,
"importance_ratio": 0.999916672706604,
"learning_rate": 0.0001,
"loss": -0.0027,
"mismatch_kl": 0.005502623040229082,
"reward": 0.7100000381469727,
"reward/refusal_reward_func": 0.7100000381469727,
"reward/std": 0.26457512378692627,
"step": 27,
"timing/generation_ms": 7701.031573116779,
"timing/scoring_ms": 28537.479266524315,
"timing/total_ms": 36238.510839641094,
"tokens/completion": 881.84375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 68.64078164100647
},
{
"advantage/absmean": 0.06175781041383743,
"entropy": 0.6991814374923706,
"epoch": 0.056,
"grad_norm": 0.04213571392317473,
"importance_ratio": 0.9988569617271423,
"learning_rate": 0.0001,
"loss": -0.0026,
"mismatch_kl": 0.00713223684579134,
"reward": 0.07656250149011612,
"reward/refusal_reward_func": 0.07656250149011612,
"reward/std": 0.1120995506644249,
"step": 28,
"timing/generation_ms": 4908.71948748827,
"timing/scoring_ms": 35273.585848510265,
"timing/total_ms": 40182.305335998535,
"tokens/completion": 557.53125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 92.86418867111206
},
{
"advantage/absmean": 0.2201562523841858,
"entropy": 0.8785261511802673,
"epoch": 0.058,
"grad_norm": 0.3194279303513244,
"importance_ratio": 0.998691737651825,
"learning_rate": 0.0001,
"loss": 0.0079,
"mismatch_kl": 0.006111000664532185,
"reward": 0.5774999856948853,
"reward/refusal_reward_func": 0.5774999856948853,
"reward/std": 0.2741691768169403,
"step": 29,
"timing/generation_ms": 4181.9010972976685,
"timing/scoring_ms": 29512.903429567814,
"timing/total_ms": 33694.80452686548,
"tokens/completion": 478.875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 72.91563320159912
},
{
"advantage/absmean": 0.20125000178813934,
"entropy": 0.6921765208244324,
"epoch": 0.06,
"grad_norm": 0.4406053771041607,
"importance_ratio": 0.9964888095855713,
"learning_rate": 0.0001,
"loss": -0.0386,
"mismatch_kl": 0.01204030029475689,
"reward": 0.17999999225139618,
"reward/refusal_reward_func": 0.17999999225139618,
"reward/std": 0.23286262154579163,
"step": 30,
"timing/generation_ms": 3321.3287368416786,
"timing/scoring_ms": 29757.627181708813,
"timing/total_ms": 33078.95591855049,
"tokens/completion": 382.8125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 53.77872610092163
},
{
"advantage/absmean": 0.0018164062639698386,
"entropy": 0.5210146903991699,
"epoch": 0.062,
"grad_norm": 0.01658749077570669,
"importance_ratio": 0.9975528120994568,
"learning_rate": 0.0001,
"loss": 0.0,
"mismatch_kl": 0.024219391867518425,
"reward": 0.010937499813735485,
"reward/refusal_reward_func": 0.010937499813735485,
"reward/std": 0.005219778511673212,
"step": 31,
"timing/generation_ms": 1940.1119500398636,
"timing/scoring_ms": 23180.305778980255,
"timing/total_ms": 25120.41772902012,
"tokens/completion": 190.5625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 33.192246198654175
},
{
"advantage/absmean": 0.04218750074505806,
"entropy": 0.7747635245323181,
"epoch": 0.064,
"grad_norm": 0.05856219259172721,
"importance_ratio": 1.0011621713638306,
"learning_rate": 0.0001,
"loss": 0.0001,
"mismatch_kl": 0.008931240066885948,
"reward": 0.0456249974668026,
"reward/refusal_reward_func": 0.0456249974668026,
"reward/std": 0.06082441285252571,
"step": 32,
"timing/generation_ms": 6628.670156002045,
"timing/scoring_ms": 29644.853502511978,
"timing/total_ms": 36273.52365851402,
"tokens/completion": 758.0625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 44.213383197784424
},
{
"advantage/absmean": 0.005097656510770321,
"entropy": 0.7032718062400818,
"epoch": 0.066,
"grad_norm": 0.004871385581572528,
"importance_ratio": 0.9980704188346863,
"learning_rate": 0.0001,
"loss": -0.0002,
"mismatch_kl": 0.017929747700691223,
"reward": 0.012812498956918716,
"reward/refusal_reward_func": 0.012812498956918716,
"reward/std": 0.008744417689740658,
"step": 33,
"timing/generation_ms": 2495.2172189950943,
"timing/scoring_ms": 23352.533906698227,
"timing/total_ms": 25847.75112569332,
"tokens/completion": 271.1875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 146.50593042373657
},
{
"advantage/absmean": 0.20359376072883606,
"entropy": 0.7547333240509033,
"epoch": 0.068,
"grad_norm": 0.2936937114262587,
"importance_ratio": 1.0006296634674072,
"learning_rate": 0.0001,
"loss": -0.0116,
"mismatch_kl": 0.011033565737307072,
"reward": 0.47718751430511475,
"reward/refusal_reward_func": 0.47718751430511475,
"reward/std": 0.26260992884635925,
"step": 34,
"timing/generation_ms": 4908.003121614456,
"timing/scoring_ms": 30874.776013195515,
"timing/total_ms": 35782.77913480997,
"tokens/completion": 551.65625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 60.00288248062134
},
{
"advantage/absmean": 0.11302733421325684,
"entropy": 0.7300074696540833,
"epoch": 0.07,
"grad_norm": 0.32493885105188386,
"importance_ratio": 0.9997804164886475,
"learning_rate": 0.0001,
"loss": -0.0011,
"mismatch_kl": 0.016147281974554062,
"reward": 0.10593750327825546,
"reward/refusal_reward_func": 0.10593750327825546,
"reward/std": 0.16747872531414032,
"step": 35,
"timing/generation_ms": 3645.879790186882,
"timing/scoring_ms": 28717.37616509199,
"timing/total_ms": 32363.255955278873,
"tokens/completion": 418.03125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 58.10710334777832
},
{
"advantage/absmean": 0.015136717818677425,
"entropy": 0.6982847452163696,
"epoch": 0.072,
"grad_norm": 0.04051473715939973,
"importance_ratio": 0.9992015957832336,
"learning_rate": 0.0001,
"loss": -0.0006,
"mismatch_kl": 0.01328412164002657,
"reward": 0.019687499850988388,
"reward/refusal_reward_func": 0.019687499850988388,
"reward/std": 0.02113710716366768,
"step": 36,
"timing/generation_ms": 3033.673010766506,
"timing/scoring_ms": 29509.141087532043,
"timing/total_ms": 32542.81409829855,
"tokens/completion": 339.375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 151.12143540382385
},
{
"advantage/absmean": 0.013593749143183231,
"entropy": 0.6699934005737305,
"epoch": 0.074,
"grad_norm": 0.02768782924464237,
"importance_ratio": 1.0014734268188477,
"learning_rate": 0.0001,
"loss": -0.0003,
"mismatch_kl": 0.00942978449165821,
"reward": 0.019062498584389687,
"reward/refusal_reward_func": 0.019062498584389687,
"reward/std": 0.017741085961461067,
"step": 37,
"timing/generation_ms": 4092.4242958426476,
"timing/scoring_ms": 29670.215159654617,
"timing/total_ms": 33762.639455497265,
"tokens/completion": 466.8125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 77.67575216293335
},
{
"advantage/absmean": 0.09351562708616257,
"entropy": 0.6306953430175781,
"epoch": 0.076,
"grad_norm": 0.2799348001151443,
"importance_ratio": 0.9985800981521606,
"learning_rate": 0.0001,
"loss": -0.0037,
"mismatch_kl": 0.009972751140594482,
"reward": 0.06187500059604645,
"reward/refusal_reward_func": 0.06187500059604645,
"reward/std": 0.19330088794231415,
"step": 38,
"timing/generation_ms": 5255.660645663738,
"timing/scoring_ms": 29534.79740768671,
"timing/total_ms": 34790.45805335045,
"tokens/completion": 602.75,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 56.81032872200012
},
{
"advantage/absmean": 0.021894531324505806,
"entropy": 0.6860240697860718,
"epoch": 0.078,
"grad_norm": 0.021804850572780036,
"importance_ratio": 0.999793291091919,
"learning_rate": 0.0001,
"loss": 0.0008,
"mismatch_kl": 0.008842560462653637,
"reward": 0.028437498956918716,
"reward/refusal_reward_func": 0.028437498956918716,
"reward/std": 0.02670549787580967,
"step": 39,
"timing/generation_ms": 5519.4277837872505,
"timing/scoring_ms": 28062.6777485013,
"timing/total_ms": 33582.10553228855,
"tokens/completion": 635.6875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 49.89069700241089
},
{
"advantage/absmean": 0.04843749850988388,
"entropy": 0.7330471873283386,
"epoch": 0.08,
"grad_norm": 0.23560813153480506,
"importance_ratio": 1.0003836154937744,
"learning_rate": 0.0001,
"loss": 0.0014,
"mismatch_kl": 0.008081368170678616,
"reward": 0.03500000014901161,
"reward/refusal_reward_func": 0.03500000014901161,
"reward/std": 0.13919411599636078,
"step": 40,
"timing/generation_ms": 3926.6494438052177,
"timing/scoring_ms": 27580.46282082796,
"timing/total_ms": 31507.11226463318,
"tokens/completion": 445.78125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 70.10888338088989
},
{
"advantage/absmean": 0.12544921040534973,
"entropy": 0.5206725597381592,
"epoch": 0.082,
"grad_norm": 0.5033391726424777,
"importance_ratio": 1.0012822151184082,
"learning_rate": 0.0001,
"loss": -0.0225,
"mismatch_kl": 0.007961519993841648,
"reward": 0.1446875035762787,
"reward/refusal_reward_func": 0.1446875035762787,
"reward/std": 0.1963731199502945,
"step": 41,
"timing/generation_ms": 1945.6753060221672,
"timing/scoring_ms": 24715.371668338776,
"timing/total_ms": 26661.046974360943,
"tokens/completion": 203.96875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 41.8072030544281
},
{
"advantage/absmean": 0.0018164062639698386,
"entropy": 0.82820063829422,
"epoch": 0.084,
"grad_norm": 0.0008894764900618824,
"importance_ratio": 0.9987862706184387,
"learning_rate": 0.0001,
"loss": 0.0002,
"mismatch_kl": 0.008028813637793064,
"reward": 0.010937499813735485,
"reward/refusal_reward_func": 0.010937499813735485,
"reward/std": 0.005219778511673212,
"step": 42,
"timing/generation_ms": 6800.906598567963,
"timing/scoring_ms": 31876.1548101902,
"timing/total_ms": 38677.06140875816,
"tokens/completion": 785.28125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 59.81179714202881
},
{
"advantage/absmean": 0.050859373062849045,
"entropy": 0.829409658908844,
"epoch": 0.086,
"grad_norm": 0.1901939415206169,
"importance_ratio": 1.0000019073486328,
"learning_rate": 0.0001,
"loss": 0.003,
"mismatch_kl": 0.009588975459337234,
"reward": 0.04312499612569809,
"reward/refusal_reward_func": 0.04312499612569809,
"reward/std": 0.1388217806816101,
"step": 43,
"timing/generation_ms": 6013.470813632011,
"timing/scoring_ms": 28884.994342923164,
"timing/total_ms": 34898.465156555176,
"tokens/completion": 691.1875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 51.95971155166626
},
{
"advantage/absmean": 0.01318359375,
"entropy": 0.9210640788078308,
"epoch": 0.088,
"grad_norm": 0.03744765208460247,
"importance_ratio": 0.9966821670532227,
"learning_rate": 0.0001,
"loss": -0.0019,
"mismatch_kl": 0.018160995095968246,
"reward": 0.017812499776482582,
"reward/refusal_reward_func": 0.017812499776482582,
"reward/std": 0.020575225353240967,
"step": 44,
"timing/generation_ms": 2924.579069018364,
"timing/scoring_ms": 20441.766560077667,
"timing/total_ms": 23366.34562909603,
"tokens/completion": 343.75,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 84.02694988250732
},
{
"advantage/absmean": 0.04843749850988388,
"entropy": 0.8891170024871826,
"epoch": 0.09,
"grad_norm": 0.12978747422641476,
"importance_ratio": 0.9995278120040894,
"learning_rate": 0.0001,
"loss": -0.0021,
"mismatch_kl": 0.004716904368251562,
"reward": 0.7849999666213989,
"reward/refusal_reward_func": 0.7849999666213989,
"reward/std": 0.13919411599636078,
"step": 45,
"timing/generation_ms": 12784.472778439522,
"timing/scoring_ms": 41995.29768526554,
"timing/total_ms": 54779.77046370506,
"tokens/completion": 1424.53125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 159.0035264492035
},
{
"advantage/absmean": 0.19917967915534973,
"entropy": 1.1817245483398438,
"epoch": 0.092,
"grad_norm": 0.18725081241592767,
"importance_ratio": 1.0018072128295898,
"learning_rate": 0.0001,
"loss": -0.0256,
"mismatch_kl": 0.008767618797719479,
"reward": 0.5290625095367432,
"reward/refusal_reward_func": 0.5290625095367432,
"reward/std": 0.2553595304489136,
"step": 46,
"timing/generation_ms": 11124.341294169426,
"timing/scoring_ms": 37177.75782942772,
"timing/total_ms": 48302.099123597145,
"tokens/completion": 1282.71875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 67.16122078895569
},
{
"advantage/absmean": 0.02968750149011612,
"entropy": 0.9758523106575012,
"epoch": 0.094,
"grad_norm": 0.038752578543884246,
"importance_ratio": 1.0004377365112305,
"learning_rate": 0.0001,
"loss": 0.0042,
"mismatch_kl": 0.007721519563347101,
"reward": 0.03500000014901161,
"reward/refusal_reward_func": 0.03500000014901161,
"reward/std": 0.04690415784716606,
"step": 47,
"timing/generation_ms": 8879.09684330225,
"timing/scoring_ms": 37631.5980181098,
"timing/total_ms": 46510.69486141205,
"tokens/completion": 1009.46875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 73.34895396232605
},
{
"advantage/absmean": 0.13593749701976776,
"entropy": 0.918021559715271,
"epoch": 0.096,
"grad_norm": 0.19107099447712278,
"importance_ratio": 0.9999390244483948,
"learning_rate": 0.0001,
"loss": -0.0005,
"mismatch_kl": 0.007433234713971615,
"reward": 0.7350000143051147,
"reward/refusal_reward_func": 0.7350000143051147,
"reward/std": 0.23318448662757874,
"step": 48,
"timing/generation_ms": 10426.40034854412,
"timing/scoring_ms": 37421.48996144533,
"timing/total_ms": 47847.89030998945,
"tokens/completion": 1180.71875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 93.57710862159729
},
{
"advantage/absmean": 0.17765624821186066,
"entropy": 1.0576484203338623,
"epoch": 0.098,
"grad_norm": 0.16558979067438845,
"importance_ratio": 1.0013474225997925,
"learning_rate": 0.0001,
"loss": -0.006,
"mismatch_kl": 0.006344192661345005,
"reward": 0.6915624737739563,
"reward/refusal_reward_func": 0.6915624737739563,
"reward/std": 0.24835848808288574,
"step": 49,
"timing/generation_ms": 8644.713327288628,
"timing/scoring_ms": 35074.74631816149,
"timing/total_ms": 43719.459645450115,
"tokens/completion": 984.625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 63.12021732330322
},
{
"advantage/absmean": 0.12312500178813934,
"entropy": 0.8099650144577026,
"epoch": 0.1,
"grad_norm": 0.1253616039663461,
"importance_ratio": 1.0017279386520386,
"learning_rate": 0.0001,
"loss": -0.0052,
"mismatch_kl": 0.008134027011692524,
"reward": 0.14249999821186066,
"reward/refusal_reward_func": 0.14249999821186066,
"reward/std": 0.1628841608762741,
"step": 50,
"timing/generation_ms": 8775.396101176739,
"timing/scoring_ms": 40764.19186592102,
"timing/total_ms": 49539.58796709776,
"tokens/completion": 1000.1875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 70.7229483127594
},
{
"advantage/absmean": 0.04843749850988388,
"entropy": 0.8692086338996887,
"epoch": 0.102,
"grad_norm": 0.016399389830222273,
"importance_ratio": 1.001124620437622,
"learning_rate": 0.0001,
"loss": -0.0013,
"mismatch_kl": 0.010322043672204018,
"reward": 0.7849999666213989,
"reward/refusal_reward_func": 0.7849999666213989,
"reward/std": 0.13919411599636078,
"step": 51,
"timing/generation_ms": 14720.608927309513,
"timing/scoring_ms": 41765.84377884865,
"timing/total_ms": 56486.45270615816,
"tokens/completion": 1616.15625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 89.23512148857117
},
{
"advantage/absmean": 0.2948242425918579,
"entropy": 1.1107399463653564,
"epoch": 0.104,
"grad_norm": 0.18323432452024488,
"importance_ratio": 1.0023913383483887,
"learning_rate": 0.0001,
"loss": 0.0017,
"mismatch_kl": 0.0076919617131352425,
"reward": 0.32218748331069946,
"reward/refusal_reward_func": 0.32218748331069946,
"reward/std": 0.3130169212818146,
"step": 52,
"timing/generation_ms": 15175.773054361343,
"timing/scoring_ms": 57348.27160835266,
"timing/total_ms": 72524.044662714,
"tokens/completion": 1662.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 182.7874722480774
},
{
"advantage/absmean": 0.15343749523162842,
"entropy": 0.9647155404090881,
"epoch": 0.106,
"grad_norm": 0.1470849270360251,
"importance_ratio": 1.0000090599060059,
"learning_rate": 0.0001,
"loss": 0.0102,
"mismatch_kl": 0.00862339697778225,
"reward": 0.6565625071525574,
"reward/refusal_reward_func": 0.6565625071525574,
"reward/std": 0.22976359724998474,
"step": 53,
"timing/generation_ms": 10986.380942165852,
"timing/scoring_ms": 38820.41800022125,
"timing/total_ms": 49806.798942387104,
"tokens/completion": 1229.09375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 157.4189648628235
},
{
"advantage/absmean": 0.04843749850988388,
"entropy": 0.8174250721931458,
"epoch": 0.108,
"grad_norm": 0.017013624591187354,
"importance_ratio": 1.000490427017212,
"learning_rate": 0.0001,
"loss": -0.0008,
"mismatch_kl": 0.004199406132102013,
"reward": 0.7849999666213989,
"reward/refusal_reward_func": 0.7849999666213989,
"reward/std": 0.13919411599636078,
"step": 54,
"timing/generation_ms": 13110.579743981361,
"timing/scoring_ms": 39618.385925889015,
"timing/total_ms": 52728.96566987038,
"tokens/completion": 1453.71875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 159.44240832328796
},
{
"advantage/absmean": 0.07984375208616257,
"entropy": 0.8634753227233887,
"epoch": 0.11,
"grad_norm": 0.06408238305252761,
"importance_ratio": 1.0001016855239868,
"learning_rate": 0.0001,
"loss": -0.0018,
"mismatch_kl": 0.006974226329475641,
"reward": 0.7643749713897705,
"reward/refusal_reward_func": 0.7643749713897705,
"reward/std": 0.15140874683856964,
"step": 55,
"timing/generation_ms": 13839.490927755833,
"timing/scoring_ms": 43094.41144019365,
"timing/total_ms": 56933.902367949486,
"tokens/completion": 1539.5625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 162.71979093551636
},
{
"advantage/absmean": 0.22667968273162842,
"entropy": 0.7791604399681091,
"epoch": 0.112,
"grad_norm": 0.16725001444914697,
"importance_ratio": 1.0006935596466064,
"learning_rate": 0.0001,
"loss": -0.0115,
"mismatch_kl": 0.007046831306070089,
"reward": 0.47468751668930054,
"reward/refusal_reward_func": 0.47468751668930054,
"reward/std": 0.2708203196525574,
"step": 56,
"timing/generation_ms": 16662.443839013577,
"timing/scoring_ms": 45676.41341686249,
"timing/total_ms": 62338.857255876064,
"tokens/completion": 1816.03125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 124.58920621871948
},
{
"advantage/absmean": 0.11601562052965164,
"entropy": 0.7439562678337097,
"epoch": 0.114,
"grad_norm": 0.13019703908354843,
"importance_ratio": 1.0008944272994995,
"learning_rate": 0.0001,
"loss": -0.0026,
"mismatch_kl": 0.009258158504962921,
"reward": 0.7171875238418579,
"reward/refusal_reward_func": 0.7171875238418579,
"reward/std": 0.16097815334796906,
"step": 57,
"timing/generation_ms": 12729.440599679947,
"timing/scoring_ms": 44226.92193090916,
"timing/total_ms": 56956.362530589104,
"tokens/completion": 1423.09375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 158.57960319519043
},
{
"advantage/absmean": 0.15726563334465027,
"entropy": 0.8287293910980225,
"epoch": 0.116,
"grad_norm": 0.14055232526730846,
"importance_ratio": 1.001518726348877,
"learning_rate": 0.0001,
"loss": -0.0132,
"mismatch_kl": 0.008714662864804268,
"reward": 0.6956250071525574,
"reward/refusal_reward_func": 0.6956250071525574,
"reward/std": 0.2274579405784607,
"step": 58,
"timing/generation_ms": 10148.803442716599,
"timing/scoring_ms": 45419.282242655754,
"timing/total_ms": 55568.08568537235,
"tokens/completion": 1146.34375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 88.32079148292542
},
{
"advantage/absmean": 0.18738281726837158,
"entropy": 0.6767469644546509,
"epoch": 0.118,
"grad_norm": 0.1800028526601889,
"importance_ratio": 0.9996235370635986,
"learning_rate": 0.0001,
"loss": -0.0004,
"mismatch_kl": 0.00572241609916091,
"reward": 0.6946874856948853,
"reward/refusal_reward_func": 0.6946874856948853,
"reward/std": 0.26609423756599426,
"step": 59,
"timing/generation_ms": 5857.890740036964,
"timing/scoring_ms": 28364.16070908308,
"timing/total_ms": 34222.051449120045,
"tokens/completion": 674.78125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 153.28042459487915
},
{
"advantage/absmean": 0.04843749850988388,
"entropy": 0.48612505197525024,
"epoch": 0.12,
"grad_norm": 0.017636908815215933,
"importance_ratio": 1.001957893371582,
"learning_rate": 0.0001,
"loss": -0.0003,
"mismatch_kl": 0.0077150240540504456,
"reward": 0.7849999666213989,
"reward/refusal_reward_func": 0.7849999666213989,
"reward/std": 0.13919411599636078,
"step": 60,
"timing/generation_ms": 13039.215676486492,
"timing/scoring_ms": 37324.29302483797,
"timing/total_ms": 50363.50870132446,
"tokens/completion": 1435.25,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 155.66002011299133
},
{
"advantage/absmean": 0.033906251192092896,
"entropy": 0.8923248648643494,
"epoch": 0.122,
"grad_norm": 0.01138046317613093,
"importance_ratio": 1.0010781288146973,
"learning_rate": 0.0001,
"loss": -0.0001,
"mismatch_kl": 0.009470692835748196,
"reward": 0.7925000190734863,
"reward/refusal_reward_func": 0.7925000190734863,
"reward/std": 0.09743587672710419,
"step": 61,
"timing/generation_ms": 20387.244410812855,
"timing/scoring_ms": 49000.582568347454,
"timing/total_ms": 69387.82697916031,
"tokens/completion": 2047.65625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 179.17891383171082
},
{
"advantage/absmean": 0.14195312559604645,
"entropy": 0.6803052425384521,
"epoch": 0.124,
"grad_norm": 0.1396860758416004,
"importance_ratio": 1.0004676580429077,
"learning_rate": 0.0001,
"loss": 0.0005,
"mismatch_kl": 0.009484711103141308,
"reward": 0.7112500071525574,
"reward/refusal_reward_func": 0.7112500071525574,
"reward/std": 0.19915054738521576,
"step": 62,
"timing/generation_ms": 20013.910226523876,
"timing/scoring_ms": 51556.99533224106,
"timing/total_ms": 71570.90555876493,
"tokens/completion": 2029.46875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 281.4015655517578
},
{
"advantage/absmean": 0.14208984375,
"entropy": 0.7867326736450195,
"epoch": 0.126,
"grad_norm": 0.07770627410364793,
"importance_ratio": 1.0014797449111938,
"learning_rate": 0.0001,
"loss": -0.0003,
"mismatch_kl": 0.014644050039350986,
"reward": 0.6584374904632568,
"reward/refusal_reward_func": 0.6584374904632568,
"reward/std": 0.21947535872459412,
"step": 63,
"timing/generation_ms": 19855.214461684227,
"timing/scoring_ms": 52655.76823055744,
"timing/total_ms": 72510.98269224167,
"tokens/completion": 2026.1875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 168.5356569290161
},
{
"advantage/absmean": 0.09375,
"entropy": 0.722605288028717,
"epoch": 0.128,
"grad_norm": 0.10541623752837016,
"importance_ratio": 1.0017896890640259,
"learning_rate": 0.0001,
"loss": 0.0002,
"mismatch_kl": 0.009318462572991848,
"reward": 0.7599999904632568,
"reward/refusal_reward_func": 0.7599999904632568,
"reward/std": 0.19364915788173676,
"step": 64,
"timing/generation_ms": 20329.02915775776,
"timing/scoring_ms": 43974.1270840168,
"timing/total_ms": 64303.15624177456,
"tokens/completion": 2046.09375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 69.69067120552063
},
{
"advantage/absmean": 0.13593751192092896,
"entropy": 0.6431602239608765,
"epoch": 0.13,
"grad_norm": 0.04996864946933639,
"importance_ratio": 1.0007299184799194,
"learning_rate": 0.0001,
"loss": 0.0042,
"mismatch_kl": 0.01128534134477377,
"reward": 0.7350000143051147,
"reward/refusal_reward_func": 0.7350000143051147,
"reward/std": 0.23318447172641754,
"step": 65,
"timing/generation_ms": 17292.446829378605,
"timing/scoring_ms": 48005.3500905633,
"timing/total_ms": 65297.7969199419,
"tokens/completion": 1868.75,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 137.745671749115
},
{
"advantage/absmean": 0.043593745678663254,
"entropy": 0.6788095831871033,
"epoch": 0.132,
"grad_norm": 0.1104672773690437,
"importance_ratio": 1.0008015632629395,
"learning_rate": 0.0001,
"loss": -0.0,
"mismatch_kl": 0.011132912710309029,
"reward": 0.7875000238418579,
"reward/refusal_reward_func": 0.7875000238418579,
"reward/std": 0.1252746880054474,
"step": 66,
"timing/generation_ms": 20402.18196809292,
"timing/scoring_ms": 42341.174609959126,
"timing/total_ms": 62743.356578052044,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 162.58158588409424
},
{
"advantage/absmean": 0.05214843899011612,
"entropy": 0.5562156438827515,
"epoch": 0.134,
"grad_norm": 0.016212117765327168,
"importance_ratio": 0.9997291564941406,
"learning_rate": 0.0001,
"loss": 0.0,
"mismatch_kl": 0.008888973854482174,
"reward": 0.7821874618530273,
"reward/refusal_reward_func": 0.7821874618530273,
"reward/std": 0.1277872771024704,
"step": 67,
"timing/generation_ms": 20475.26439279318,
"timing/scoring_ms": 46698.052957654,
"timing/total_ms": 67173.31735044718,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 151.99999165534973
},
{
"advantage/absmean": 0.08964844048023224,
"entropy": 0.6966589093208313,
"epoch": 0.136,
"grad_norm": 0.1577321206922815,
"importance_ratio": 1.0010697841644287,
"learning_rate": 0.0001,
"loss": -0.0002,
"mismatch_kl": 0.00940707977861166,
"reward": 0.7621874809265137,
"reward/refusal_reward_func": 0.7621874809265137,
"reward/std": 0.18551842868328094,
"step": 68,
"timing/generation_ms": 20498.32931160927,
"timing/scoring_ms": 54109.15730148554,
"timing/total_ms": 74607.4866130948,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 395.203547000885
},
{
"advantage/absmean": 0.27099609375,
"entropy": 0.5135282278060913,
"epoch": 0.138,
"grad_norm": 0.2079259512896872,
"importance_ratio": 1.0020484924316406,
"learning_rate": 0.0001,
"loss": -0.0,
"mismatch_kl": 0.012067537754774094,
"reward": 0.6365625262260437,
"reward/refusal_reward_func": 0.6365625262260437,
"reward/std": 0.32783961296081543,
"step": 69,
"timing/generation_ms": 20592.435374855995,
"timing/scoring_ms": 59319.189973175526,
"timing/total_ms": 79911.62534803152,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 397.43536710739136
},
{
"advantage/absmean": 0.0,
"entropy": 0.4793013036251068,
"epoch": 0.14,
"grad_norm": 0.0,
"importance_ratio": 1.0002073049545288,
"learning_rate": 0.0001,
"loss": 0.0,
"mismatch_kl": 0.01224527694284916,
"reward": 0.8100000023841858,
"reward/refusal_reward_func": 0.8100000023841858,
"reward/std": 0.0,
"step": 70,
"timing/generation_ms": 20546.609550714493,
"timing/scoring_ms": 42320.83362340927,
"timing/total_ms": 62867.443174123764,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 161.7110676765442
},
{
"advantage/absmean": 0.053593751043081284,
"entropy": 0.35620445013046265,
"epoch": 0.142,
"grad_norm": 0.06178490851261073,
"importance_ratio": 1.0008577108383179,
"learning_rate": 0.0001,
"loss": 0.0001,
"mismatch_kl": 0.007034921087324619,
"reward": 0.7793750166893005,
"reward/refusal_reward_func": 0.7793750166893005,
"reward/std": 0.08525467664003372,
"step": 71,
"timing/generation_ms": 20559.83528494835,
"timing/scoring_ms": 50445.407539606094,
"timing/total_ms": 71005.24282455444,
"tokens/completion": 2047.65625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 165.77110528945923
},
{
"advantage/absmean": 0.06621094048023224,
"entropy": 0.34386953711509705,
"epoch": 0.144,
"grad_norm": 0.06068478204359807,
"importance_ratio": 1.0005115270614624,
"learning_rate": 0.0001,
"loss": 0.0001,
"mismatch_kl": 0.006632550619542599,
"reward": 0.7746874690055847,
"reward/refusal_reward_func": 0.7746874690055847,
"reward/std": 0.14985378086566925,
"step": 72,
"timing/generation_ms": 20651.6492664814,
"timing/scoring_ms": 53449.473068118095,
"timing/total_ms": 74101.1223345995,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 396.06008672714233
},
{
"advantage/absmean": 0.08964844048023224,
"entropy": 0.3689025640487671,
"epoch": 0.146,
"grad_norm": 0.028592002076965557,
"importance_ratio": 1.0001220703125,
"learning_rate": 0.0001,
"loss": 0.0,
"mismatch_kl": 0.009937528520822525,
"reward": 0.7621874809265137,
"reward/refusal_reward_func": 0.7621874809265137,
"reward/std": 0.18551842868328094,
"step": 73,
"timing/generation_ms": 20704.90287989378,
"timing/scoring_ms": 56059.35876071453,
"timing/total_ms": 76764.26164060831,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 395.15700674057007
},
{
"advantage/absmean": 0.10025390982627869,
"entropy": 0.3531518578529358,
"epoch": 0.148,
"grad_norm": 0.0966128922725595,
"importance_ratio": 0.9996236562728882,
"learning_rate": 0.0001,
"loss": -0.0001,
"mismatch_kl": 0.00807525310665369,
"reward": 0.754687488079071,
"reward/refusal_reward_func": 0.754687488079071,
"reward/std": 0.19453445076942444,
"step": 74,
"timing/generation_ms": 20194.012761116028,
"timing/scoring_ms": 45799.67290908098,
"timing/total_ms": 65993.68567019701,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 150.22299551963806
},
{
"advantage/absmean": 0.08964844048023224,
"entropy": 0.3910990059375763,
"epoch": 0.15,
"grad_norm": 0.14646197667696922,
"importance_ratio": 0.999146580696106,
"learning_rate": 0.0001,
"loss": -0.0003,
"mismatch_kl": 0.008608575910329819,
"reward": 0.7621874809265137,
"reward/refusal_reward_func": 0.7621874809265137,
"reward/std": 0.18551842868328094,
"step": 75,
"timing/generation_ms": 20237.79760301113,
"timing/scoring_ms": 51105.7443395257,
"timing/total_ms": 71343.54194253683,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 394.9270164966583
},
{
"advantage/absmean": 0.13537108898162842,
"entropy": 0.2564987540245056,
"epoch": 0.152,
"grad_norm": 0.10767344052989877,
"importance_ratio": 1.000230073928833,
"learning_rate": 0.0001,
"loss": 0.0002,
"mismatch_kl": 0.00910898856818676,
"reward": 0.7353124618530273,
"reward/refusal_reward_func": 0.7353124618530273,
"reward/std": 0.23228463530540466,
"step": 76,
"timing/generation_ms": 20231.063432991505,
"timing/scoring_ms": 64505.957297980785,
"timing/total_ms": 84737.02073097229,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 394.6906681060791
},
{
"advantage/absmean": 0.04843749850988388,
"entropy": 0.35127270221710205,
"epoch": 0.154,
"grad_norm": 0.05430162481667564,
"importance_ratio": 1.001720666885376,
"learning_rate": 0.0001,
"loss": -0.0112,
"mismatch_kl": 0.02907688170671463,
"reward": 0.7849999666213989,
"reward/refusal_reward_func": 0.7849999666213989,
"reward/std": 0.13919411599636078,
"step": 77,
"timing/generation_ms": 2299.7111305594444,
"timing/scoring_ms": 24145.363181829453,
"timing/total_ms": 26445.074312388897,
"tokens/completion": 256.90625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 43.695470094680786
},
{
"advantage/absmean": 0.17824219167232513,
"entropy": 0.2108859121799469,
"epoch": 0.156,
"grad_norm": 0.1343157239601839,
"importance_ratio": 1.000138521194458,
"learning_rate": 0.0001,
"loss": -0.0061,
"mismatch_kl": 0.006567842327058315,
"reward": 0.7043750286102295,
"reward/refusal_reward_func": 0.7043750286102295,
"reward/std": 0.26504644751548767,
"step": 78,
"timing/generation_ms": 19559.65828895569,
"timing/scoring_ms": 56106.24121129513,
"timing/total_ms": 75665.89950025082,
"tokens/completion": 2012.8125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 397.83462166786194
},
{
"advantage/absmean": 0.14109376072883606,
"entropy": 0.2118162214756012,
"epoch": 0.158,
"grad_norm": 0.04314766069392161,
"importance_ratio": 0.999754011631012,
"learning_rate": 0.0001,
"loss": 0.0,
"mismatch_kl": 0.005713644903153181,
"reward": 0.7293750047683716,
"reward/refusal_reward_func": 0.7293750047683716,
"reward/std": 0.23431998491287231,
"step": 79,
"timing/generation_ms": 20274.492114782333,
"timing/scoring_ms": 53302.132822573185,
"timing/total_ms": 73576.62493735552,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 396.22464632987976
},
{
"advantage/absmean": 0.13593749701976776,
"entropy": 0.32609474658966064,
"epoch": 0.16,
"grad_norm": 0.10083540436117842,
"importance_ratio": 1.0001963376998901,
"learning_rate": 0.0001,
"loss": -0.0002,
"mismatch_kl": 0.008770663291215897,
"reward": 0.7350000143051147,
"reward/refusal_reward_func": 0.7350000143051147,
"reward/std": 0.23318448662757874,
"step": 80,
"timing/generation_ms": 20281.54794126749,
"timing/scoring_ms": 45572.31470942497,
"timing/total_ms": 65853.86265069246,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 157.70197463035583
},
{
"advantage/absmean": 0.13197265565395355,
"entropy": 0.349658727645874,
"epoch": 0.162,
"grad_norm": 0.15801403288716265,
"importance_ratio": 0.9997016191482544,
"learning_rate": 0.0001,
"loss": 0.0003,
"mismatch_kl": 0.007918323390185833,
"reward": 0.7371875047683716,
"reward/refusal_reward_func": 0.7371875047683716,
"reward/std": 0.22671890258789062,
"step": 81,
"timing/generation_ms": 19729.195773601532,
"timing/scoring_ms": 52965.313747525215,
"timing/total_ms": 72694.50952112675,
"tokens/completion": 2031.71875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 395.87234902381897
},
{
"advantage/absmean": 0.19093748927116394,
"entropy": 0.30307242274284363,
"epoch": 0.164,
"grad_norm": 0.05658978916858572,
"importance_ratio": 0.9997415542602539,
"learning_rate": 0.0001,
"loss": 0.0,
"mismatch_kl": 0.009000571444630623,
"reward": 0.6924999952316284,
"reward/refusal_reward_func": 0.6924999952316284,
"reward/std": 0.2553306818008423,
"step": 82,
"timing/generation_ms": 20152.134649455547,
"timing/scoring_ms": 55147.81706035137,
"timing/total_ms": 75299.95170980692,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 276.00095558166504
},
{
"advantage/absmean": 0.13197265565395355,
"entropy": 0.2418041229248047,
"epoch": 0.166,
"grad_norm": 0.09124961991568047,
"importance_ratio": 0.9993461966514587,
"learning_rate": 0.0001,
"loss": 0.0,
"mismatch_kl": 0.008319162763655186,
"reward": 0.7371875047683716,
"reward/refusal_reward_func": 0.7371875047683716,
"reward/std": 0.22671890258789062,
"step": 83,
"timing/generation_ms": 20235.445871949196,
"timing/scoring_ms": 60683.41539800167,
"timing/total_ms": 80918.86126995087,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 394.98225951194763
},
{
"advantage/absmean": 0.09492187201976776,
"entropy": 0.36278918385505676,
"epoch": 0.168,
"grad_norm": 0.10428837192295014,
"importance_ratio": 1.0009691715240479,
"learning_rate": 0.0001,
"loss": 0.0001,
"mismatch_kl": 0.006821990944445133,
"reward": 0.7593749761581421,
"reward/refusal_reward_func": 0.7593749761581421,
"reward/std": 0.19606979191303253,
"step": 84,
"timing/generation_ms": 20079.659663140774,
"timing/scoring_ms": 62456.88313245773,
"timing/total_ms": 82536.5427955985,
"tokens/completion": 2047.4375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 395.080442905426
},
{
"advantage/absmean": 0.16296875476837158,
"entropy": 0.2741187810897827,
"epoch": 0.17,
"grad_norm": 0.05559178148711944,
"importance_ratio": 1.0001330375671387,
"learning_rate": 0.0001,
"loss": 0.0002,
"mismatch_kl": 0.008196860551834106,
"reward": 0.7168750166893005,
"reward/refusal_reward_func": 0.7168750166893005,
"reward/std": 0.2492668777704239,
"step": 85,
"timing/generation_ms": 20191.432282328606,
"timing/scoring_ms": 66640.07867872715,
"timing/total_ms": 86831.51096105576,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 395.3853757381439
},
{
"advantage/absmean": 0.06513672322034836,
"entropy": 0.3744433522224426,
"epoch": 0.172,
"grad_norm": 0.02027358693373687,
"importance_ratio": 0.9997438192367554,
"learning_rate": 0.0001,
"loss": 0.0001,
"mismatch_kl": 0.0079119261354208,
"reward": 0.7740625143051147,
"reward/refusal_reward_func": 0.7740625143051147,
"reward/std": 0.1449754238128662,
"step": 86,
"timing/generation_ms": 20130.67189604044,
"timing/scoring_ms": 63243.81287395954,
"timing/total_ms": 83374.48476999998,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 394.553094625473
},
{
"advantage/absmean": 0.06621094048023224,
"entropy": 0.2802242636680603,
"epoch": 0.174,
"grad_norm": 0.11007226741752094,
"importance_ratio": 1.0002408027648926,
"learning_rate": 0.0001,
"loss": -0.0,
"mismatch_kl": 0.010727161541581154,
"reward": 0.7746875286102295,
"reward/refusal_reward_func": 0.7746875286102295,
"reward/std": 0.14985376596450806,
"step": 87,
"timing/generation_ms": 20197.582133114338,
"timing/scoring_ms": 60626.82098895311,
"timing/total_ms": 80824.40312206745,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 394.6186418533325
},
{
"advantage/absmean": 0.15476563572883606,
"entropy": 0.15260648727416992,
"epoch": 0.176,
"grad_norm": 0.09113868718318835,
"importance_ratio": 0.9991167187690735,
"learning_rate": 0.0001,
"loss": -0.0,
"mismatch_kl": 0.007635745219886303,
"reward": 0.7215625047683716,
"reward/refusal_reward_func": 0.7215625047683716,
"reward/std": 0.2370404750108719,
"step": 88,
"timing/generation_ms": 20186.022453010082,
"timing/scoring_ms": 65319.87015157938,
"timing/total_ms": 85505.89260458946,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 394.704843044281
},
{
"advantage/absmean": 0.09375,
"entropy": 0.2925874888896942,
"epoch": 0.178,
"grad_norm": 0.10900817571461202,
"importance_ratio": 1.0003485679626465,
"learning_rate": 0.0001,
"loss": 0.0003,
"mismatch_kl": 0.01015115063637495,
"reward": 0.7599999904632568,
"reward/refusal_reward_func": 0.7599999904632568,
"reward/std": 0.19364915788173676,
"step": 89,
"timing/generation_ms": 19985.6186658144,
"timing/scoring_ms": 50727.93058305979,
"timing/total_ms": 70713.54924887419,
"tokens/completion": 2039.15625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 292.40805864334106
},
{
"advantage/absmean": 0.24515625834465027,
"entropy": 0.24920716881752014,
"epoch": 0.18,
"grad_norm": 0.12621044924209393,
"importance_ratio": 1.000746250152588,
"learning_rate": 0.0001,
"loss": 0.0,
"mismatch_kl": 0.008628414012491703,
"reward": 0.6162500381469727,
"reward/refusal_reward_func": 0.6162500381469727,
"reward/std": 0.2840307056903839,
"step": 90,
"timing/generation_ms": 20263.45807313919,
"timing/scoring_ms": 69397.22065627575,
"timing/total_ms": 89660.67872941494,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 394.5488703250885
},
{
"advantage/absmean": 0.10718750208616257,
"entropy": 0.3780635893344879,
"epoch": 0.182,
"grad_norm": 0.041883076718277436,
"importance_ratio": 1.0010097026824951,
"learning_rate": 0.0001,
"loss": 0.0001,
"mismatch_kl": 0.009139418601989746,
"reward": 0.7487499713897705,
"reward/refusal_reward_func": 0.7487499713897705,
"reward/std": 0.1976384073495865,
"step": 91,
"timing/generation_ms": 20657.530024647713,
"timing/scoring_ms": 65376.88625603914,
"timing/total_ms": 86034.41628068686,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 395.1356108188629
},
{
"advantage/absmean": 0.13974609971046448,
"entropy": 0.3699057102203369,
"epoch": 0.184,
"grad_norm": 0.06598040996573111,
"importance_ratio": 0.9998457431793213,
"learning_rate": 0.0001,
"loss": 0.0,
"mismatch_kl": 0.0077532450668513775,
"reward": 0.7271875143051147,
"reward/refusal_reward_func": 0.7271875143051147,
"reward/std": 0.2168990969657898,
"step": 92,
"timing/generation_ms": 20616.587534546852,
"timing/scoring_ms": 47124.867990612984,
"timing/total_ms": 67741.45552515984,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 168.30030918121338
},
{
"advantage/absmean": 0.19189453125,
"entropy": 0.34180349111557007,
"epoch": 0.186,
"grad_norm": 0.14232699624625103,
"importance_ratio": 0.999165952205658,
"learning_rate": 0.0001,
"loss": 0.0003,
"mismatch_kl": 0.008044413290917873,
"reward": 0.6871874928474426,
"reward/refusal_reward_func": 0.6871874928474426,
"reward/std": 0.2622169256210327,
"step": 93,
"timing/generation_ms": 20529.827870428562,
"timing/scoring_ms": 60052.588775753975,
"timing/total_ms": 80582.41664618254,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 396.85295939445496
},
{
"advantage/absmean": 0.20988282561302185,
"entropy": 0.315336674451828,
"epoch": 0.188,
"grad_norm": 0.10320818511601894,
"importance_ratio": 1.000422477722168,
"learning_rate": 0.0001,
"loss": -0.0,
"mismatch_kl": 0.007602104917168617,
"reward": 0.6856250166893005,
"reward/refusal_reward_func": 0.6856250166893005,
"reward/std": 0.28907111287117004,
"step": 94,
"timing/generation_ms": 20579.548463225365,
"timing/scoring_ms": 51010.259330272675,
"timing/total_ms": 71589.80779349804,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 395.95818734169006
},
{
"advantage/absmean": 0.15345704555511475,
"entropy": 0.5262030363082886,
"epoch": 0.19,
"grad_norm": 0.05870104388608926,
"importance_ratio": 0.9999489188194275,
"learning_rate": 0.0001,
"loss": 0.0002,
"mismatch_kl": 0.0076672472059726715,
"reward": 0.7190625071525574,
"reward/refusal_reward_func": 0.7190625071525574,
"reward/std": 0.2384108603000641,
"step": 95,
"timing/generation_ms": 20614.634588360786,
"timing/scoring_ms": 65994.11156028509,
"timing/total_ms": 86608.74614864588,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 395.0603678226471
},
{
"advantage/absmean": 0.23779296875,
"entropy": 0.3904465436935425,
"epoch": 0.192,
"grad_norm": 0.1935352935904031,
"importance_ratio": 1.000299334526062,
"learning_rate": 0.0001,
"loss": 0.0,
"mismatch_kl": 0.008834589272737503,
"reward": 0.6578124761581421,
"reward/refusal_reward_func": 0.6578124761581421,
"reward/std": 0.2928708493709564,
"step": 96,
"timing/generation_ms": 20604.460656642914,
"timing/scoring_ms": 67186.7751404643,
"timing/total_ms": 87791.23579710722,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 394.96749925613403
},
{
"advantage/absmean": 0.2852538824081421,
"entropy": 0.32257264852523804,
"epoch": 0.194,
"grad_norm": 0.11248909611986616,
"importance_ratio": 0.9994723796844482,
"learning_rate": 0.0001,
"loss": -0.0,
"mismatch_kl": 0.008222612552344799,
"reward": 0.6115624904632568,
"reward/refusal_reward_func": 0.6115624904632568,
"reward/std": 0.3214528560638428,
"step": 97,
"timing/generation_ms": 20211.18316054344,
"timing/scoring_ms": 60143.2975307107,
"timing/total_ms": 80354.48069125414,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 395.2164263725281
},
{
"advantage/absmean": 0.09375,
"entropy": 0.45740675926208496,
"epoch": 0.196,
"grad_norm": 0.12144158106340411,
"importance_ratio": 0.9998034834861755,
"learning_rate": 0.0001,
"loss": 0.0001,
"mismatch_kl": 0.008089970797300339,
"reward": 0.7599999904632568,
"reward/refusal_reward_func": 0.7599999904632568,
"reward/std": 0.19364915788173676,
"step": 98,
"timing/generation_ms": 20260.59687882662,
"timing/scoring_ms": 44623.64313751459,
"timing/total_ms": 64884.24001634121,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 131.37106108665466
},
{
"advantage/absmean": 0.04843749850988388,
"entropy": 0.5326197147369385,
"epoch": 0.198,
"grad_norm": 0.12835217845348193,
"importance_ratio": 0.9994455575942993,
"learning_rate": 0.0001,
"loss": -0.0,
"mismatch_kl": 0.009172793477773666,
"reward": 0.7849999666213989,
"reward/refusal_reward_func": 0.7849999666213989,
"reward/std": 0.13919411599636078,
"step": 99,
"timing/generation_ms": 20207.647144794464,
"timing/scoring_ms": 49733.94272476435,
"timing/total_ms": 69941.58986955881,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 161.61080026626587
},
{
"advantage/absmean": 0.10171875357627869,
"entropy": 0.5042125582695007,
"epoch": 0.2,
"grad_norm": 0.08743211269759214,
"importance_ratio": 1.0003973245620728,
"learning_rate": 0.0001,
"loss": -0.0002,
"mismatch_kl": 0.01178868766874075,
"reward": 0.7518749833106995,
"reward/refusal_reward_func": 0.7518749833106995,
"reward/std": 0.17614690959453583,
"step": 100,
"timing/generation_ms": 20365.172304213047,
"timing/scoring_ms": 65265.56546241045,
"timing/total_ms": 85630.7377666235,
"tokens/completion": 2048.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 394.74688720703125
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}