| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.2, |
| "eval_steps": 50, |
| "global_step": 100, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "advantage/absmean": 0.0, |
| "entropy": 0.49213120341300964, |
| "epoch": 0.002, |
| "grad_norm": 0.0, |
| "importance_ratio": 0.9995924234390259, |
| "learning_rate": 0.0, |
| "loss": 0.0, |
| "mismatch_kl": 0.0013128521386533976, |
| "reward": 0.009999999776482582, |
| "reward/refusal_reward_func": 0.009999999776482582, |
| "reward/std": 0.0, |
| "step": 1, |
| "timing/generation_ms": 3254.1263923048973, |
| "timing/scoring_ms": 25275.689974427223, |
| "timing/total_ms": 28529.81636673212, |
| "tokens/completion": 196.125, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 37.95693778991699 |
| }, |
| { |
| "advantage/absmean": 0.17499999701976776, |
| "entropy": 0.6319499611854553, |
| "epoch": 0.004, |
| "grad_norm": 0.21545597111492612, |
| "importance_ratio": 0.9992015957832336, |
| "learning_rate": 1e-05, |
| "loss": -0.0015, |
| "mismatch_kl": 0.0010142761748284101, |
| "reward": 0.7100000381469727, |
| "reward/refusal_reward_func": 0.7100000381469727, |
| "reward/std": 0.26457512378692627, |
| "step": 2, |
| "timing/generation_ms": 5644.344195723534, |
| "timing/scoring_ms": 31337.847255170345, |
| "timing/total_ms": 36982.19145089388, |
| "tokens/completion": 642.9375, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 92.69584393501282 |
| }, |
| { |
| "advantage/absmean": 0.08203125, |
| "entropy": 0.4765586853027344, |
| "epoch": 0.006, |
| "grad_norm": 0.2892202194064109, |
| "importance_ratio": 1.000748872756958, |
| "learning_rate": 2e-05, |
| "loss": 0.0023, |
| "mismatch_kl": 0.001345986733213067, |
| "reward": 0.06593750417232513, |
| "reward/refusal_reward_func": 0.06593750417232513, |
| "reward/std": 0.15905697643756866, |
| "step": 3, |
| "timing/generation_ms": 2269.3659961223602, |
| "timing/scoring_ms": 26262.165516614914, |
| "timing/total_ms": 28531.531512737274, |
| "tokens/completion": 240.15625, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 143.94128561019897 |
| }, |
| { |
| "advantage/absmean": 0.0237890612334013, |
| "entropy": 0.5391862988471985, |
| "epoch": 0.008, |
| "grad_norm": 0.03657768868171977, |
| "importance_ratio": 1.0003156661987305, |
| "learning_rate": 3e-05, |
| "loss": 0.0003, |
| "mismatch_kl": 0.0014281703624874353, |
| "reward": 0.02812499925494194, |
| "reward/refusal_reward_func": 0.02812499925494194, |
| "reward/std": 0.028986798599362373, |
| "step": 4, |
| "timing/generation_ms": 3297.7539896965027, |
| "timing/scoring_ms": 18404.439702630043, |
| "timing/total_ms": 21702.193692326546, |
| "tokens/completion": 328.96875, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 30.997375011444092 |
| }, |
| { |
| "advantage/absmean": 0.37681642174720764, |
| "entropy": 0.6411616206169128, |
| "epoch": 0.01, |
| "grad_norm": 0.3743397229309058, |
| "importance_ratio": 1.0011004209518433, |
| "learning_rate": 4e-05, |
| "loss": -0.0173, |
| "mismatch_kl": 0.00182775326538831, |
| "reward": 0.37406250834465027, |
| "reward/refusal_reward_func": 0.37406250834465027, |
| "reward/std": 0.3808777630329132, |
| "step": 5, |
| "timing/generation_ms": 4316.511310636997, |
| "timing/scoring_ms": 31669.483192265034, |
| "timing/total_ms": 35985.99450290203, |
| "tokens/completion": 485.3125, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 72.83462309837341 |
| }, |
| { |
| "advantage/absmean": 0.007929688319563866, |
| "entropy": 0.5038114786148071, |
| "epoch": 0.012, |
| "grad_norm": 0.02765669861934211, |
| "importance_ratio": 0.999994158744812, |
| "learning_rate": 5e-05, |
| "loss": -0.0013, |
| "mismatch_kl": 0.0035878715571016073, |
| "reward": 0.014374999329447746, |
| "reward/refusal_reward_func": 0.014374999329447746, |
| "reward/std": 0.015398356132209301, |
| "step": 6, |
| "timing/generation_ms": 3028.248645365238, |
| "timing/scoring_ms": 25548.00620675087, |
| "timing/total_ms": 28576.254852116108, |
| "tokens/completion": 334.125, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 36.018136739730835 |
| }, |
| { |
| "advantage/absmean": 0.0, |
| "entropy": 0.29107579588890076, |
| "epoch": 0.014, |
| "grad_norm": 0.0, |
| "importance_ratio": 1.0008047819137573, |
| "learning_rate": 6e-05, |
| "loss": 0.0, |
| "mismatch_kl": 0.003742748638615012, |
| "reward": 0.009999999776482582, |
| "reward/refusal_reward_func": 0.009999999776482582, |
| "reward/std": 0.0, |
| "step": 7, |
| "timing/generation_ms": 1139.9633809924126, |
| "timing/scoring_ms": 19651.204399764538, |
| "timing/total_ms": 20791.16778075695, |
| "tokens/completion": 100.375, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 23.77466917037964 |
| }, |
| { |
| "advantage/absmean": 0.006562499795109034, |
| "entropy": 0.3522435426712036, |
| "epoch": 0.016, |
| "grad_norm": 0.013866793274081895, |
| "importance_ratio": 1.0039043426513672, |
| "learning_rate": 7e-05, |
| "loss": 0.0029, |
| "mismatch_kl": 0.022596202790737152, |
| "reward": 0.013749999925494194, |
| "reward/refusal_reward_func": 0.013749999925494194, |
| "reward/std": 0.009921567514538765, |
| "step": 8, |
| "timing/generation_ms": 784.0555533766747, |
| "timing/scoring_ms": 19302.494660019875, |
| "timing/total_ms": 20086.55021339655, |
| "tokens/completion": 56.875, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 24.927189350128174 |
| }, |
| { |
| "advantage/absmean": 0.17390625178813934, |
| "entropy": 0.8274978995323181, |
| "epoch": 0.018, |
| "grad_norm": 0.11705006346082461, |
| "importance_ratio": 1.0001081228256226, |
| "learning_rate": 8e-05, |
| "loss": 0.0021, |
| "mismatch_kl": 0.0024926774203777313, |
| "reward": 0.6775000095367432, |
| "reward/refusal_reward_func": 0.6775000095367432, |
| "reward/std": 0.24893523752689362, |
| "step": 9, |
| "timing/generation_ms": 9839.449286460876, |
| "timing/scoring_ms": 40201.816976070404, |
| "timing/total_ms": 50041.26626253128, |
| "tokens/completion": 1103.59375, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 94.76600408554077 |
| }, |
| { |
| "advantage/absmean": 0.0035156249068677425, |
| "entropy": 0.24158422648906708, |
| "epoch": 0.02, |
| "grad_norm": 0.005773329550661251, |
| "importance_ratio": 0.9983059763908386, |
| "learning_rate": 9e-05, |
| "loss": 0.0009, |
| "mismatch_kl": 0.05829961970448494, |
| "reward": 0.011874999850988388, |
| "reward/refusal_reward_func": 0.011874999850988388, |
| "reward/std": 0.007261843420565128, |
| "step": 10, |
| "timing/generation_ms": 553.7310987710953, |
| "timing/scoring_ms": 18607.969902455807, |
| "timing/total_ms": 19161.701001226902, |
| "tokens/completion": 26.25, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 23.25968861579895 |
| }, |
| { |
| "advantage/absmean": 0.027421876788139343, |
| "entropy": 0.5188111066818237, |
| "epoch": 0.022, |
| "grad_norm": 0.030197590581975912, |
| "importance_ratio": 0.9986535310745239, |
| "learning_rate": 0.0001, |
| "loss": -0.0035, |
| "mismatch_kl": 0.005903073586523533, |
| "reward": 0.026874996721744537, |
| "reward/refusal_reward_func": 0.026874996721744537, |
| "reward/std": 0.046599194407463074, |
| "step": 11, |
| "timing/generation_ms": 2835.02546697855, |
| "timing/scoring_ms": 24443.98508220911, |
| "timing/total_ms": 27279.01054918766, |
| "tokens/completion": 308.53125, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 145.36076593399048 |
| }, |
| { |
| "advantage/absmean": 0.12355469167232513, |
| "entropy": 0.44281068444252014, |
| "epoch": 0.024, |
| "grad_norm": 0.42446378552193, |
| "importance_ratio": 0.9987350106239319, |
| "learning_rate": 0.0001, |
| "loss": -0.0423, |
| "mismatch_kl": 0.006952627561986446, |
| "reward": 0.09437499195337296, |
| "reward/refusal_reward_func": 0.09437499195337296, |
| "reward/std": 0.21499907970428467, |
| "step": 12, |
| "timing/generation_ms": 2469.3235754966736, |
| "timing/scoring_ms": 35024.46338534355, |
| "timing/total_ms": 37493.786960840225, |
| "tokens/completion": 273.28125, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 154.79029417037964 |
| }, |
| { |
| "advantage/absmean": 0.09375, |
| "entropy": 0.5496101379394531, |
| "epoch": 0.026, |
| "grad_norm": 0.1257011193040082, |
| "importance_ratio": 0.9992591738700867, |
| "learning_rate": 0.0001, |
| "loss": -0.0035, |
| "mismatch_kl": 0.0034747051540762186, |
| "reward": 0.7599999904632568, |
| "reward/refusal_reward_func": 0.7599999904632568, |
| "reward/std": 0.19364915788173676, |
| "step": 13, |
| "timing/generation_ms": 8158.617563545704, |
| "timing/scoring_ms": 36215.20960330963, |
| "timing/total_ms": 44373.827166855335, |
| "tokens/completion": 930.6875, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 122.23734498023987 |
| }, |
| { |
| "advantage/absmean": 0.1844140589237213, |
| "entropy": 0.6212608814239502, |
| "epoch": 0.028, |
| "grad_norm": 0.26809699141136345, |
| "importance_ratio": 1.0004856586456299, |
| "learning_rate": 0.0001, |
| "loss": -0.0125, |
| "mismatch_kl": 0.0054090130142867565, |
| "reward": 0.1978124976158142, |
| "reward/refusal_reward_func": 0.1978124976158142, |
| "reward/std": 0.21153803169727325, |
| "step": 14, |
| "timing/generation_ms": 3771.018899977207, |
| "timing/scoring_ms": 32557.9876229167, |
| "timing/total_ms": 36329.006522893906, |
| "tokens/completion": 423.1875, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 76.63339400291443 |
| }, |
| { |
| "advantage/absmean": 0.09375, |
| "entropy": 0.5119910836219788, |
| "epoch": 0.03, |
| "grad_norm": 0.2293179923709544, |
| "importance_ratio": 1.0004829168319702, |
| "learning_rate": 0.0001, |
| "loss": 0.0032, |
| "mismatch_kl": 0.0062421830371022224, |
| "reward": 0.05999999865889549, |
| "reward/refusal_reward_func": 0.05999999865889549, |
| "reward/std": 0.19364915788173676, |
| "step": 15, |
| "timing/generation_ms": 3333.340108394623, |
| "timing/scoring_ms": 27093.5076251626, |
| "timing/total_ms": 30426.847733557224, |
| "tokens/completion": 374.25, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 143.25073266029358 |
| }, |
| { |
| "advantage/absmean": 0.023906249552965164, |
| "entropy": 0.7846541404724121, |
| "epoch": 0.032, |
| "grad_norm": 0.03020597167190001, |
| "importance_ratio": 0.9995452165603638, |
| "learning_rate": 0.0001, |
| "loss": 0.0001, |
| "mismatch_kl": 0.006196495145559311, |
| "reward": 0.04218750074505806, |
| "reward/refusal_reward_func": 0.04218750074505806, |
| "reward/std": 0.030489176511764526, |
| "step": 16, |
| "timing/generation_ms": 6338.90475332737, |
| "timing/scoring_ms": 35081.99892938137, |
| "timing/total_ms": 41420.90368270874, |
| "tokens/completion": 725.5625, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 89.63312363624573 |
| }, |
| { |
| "advantage/absmean": 0.2109375, |
| "entropy": 0.4431617856025696, |
| "epoch": 0.034, |
| "grad_norm": 0.11997986504420295, |
| "importance_ratio": 1.0006847381591797, |
| "learning_rate": 0.0001, |
| "loss": -0.0079, |
| "mismatch_kl": 0.0046984292566776276, |
| "reward": 0.6850000023841858, |
| "reward/refusal_reward_func": 0.6850000023841858, |
| "reward/std": 0.2904737591743469, |
| "step": 17, |
| "timing/generation_ms": 4871.280819177628, |
| "timing/scoring_ms": 27726.198948919773, |
| "timing/total_ms": 32597.4797680974, |
| "tokens/completion": 541.3125, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 72.54996109008789 |
| }, |
| { |
| "advantage/absmean": 0.020624998956918716, |
| "entropy": 0.005482906475663185, |
| "epoch": 0.036, |
| "grad_norm": 2.4822412849126587e-05, |
| "importance_ratio": 0.9996475577354431, |
| "learning_rate": 0.0001, |
| "loss": 0.0, |
| "mismatch_kl": 3.828452292964357e-07, |
| "reward": 0.023749999701976776, |
| "reward/refusal_reward_func": 0.023749999701976776, |
| "reward/std": 0.026896795257925987, |
| "step": 18, |
| "timing/generation_ms": 466.4832055568695, |
| "timing/scoring_ms": 18893.90940964222, |
| "timing/total_ms": 19360.39261519909, |
| "tokens/completion": 13.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 24.94274139404297 |
| }, |
| { |
| "advantage/absmean": 0.06175781413912773, |
| "entropy": 0.5270810723304749, |
| "epoch": 0.038, |
| "grad_norm": 0.19852274538781398, |
| "importance_ratio": 1.0000227689743042, |
| "learning_rate": 0.0001, |
| "loss": 0.0024, |
| "mismatch_kl": 0.005923233926296234, |
| "reward": 0.06562499701976776, |
| "reward/refusal_reward_func": 0.06562499701976776, |
| "reward/std": 0.13811086118221283, |
| "step": 19, |
| "timing/generation_ms": 4183.5604682564735, |
| "timing/scoring_ms": 30921.414978802204, |
| "timing/total_ms": 35104.97544705868, |
| "tokens/completion": 476.46875, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 69.38085293769836 |
| }, |
| { |
| "advantage/absmean": 0.0018164062639698386, |
| "entropy": 0.004832141101360321, |
| "epoch": 0.04, |
| "grad_norm": 0.00028270733132418016, |
| "importance_ratio": 0.9995356202125549, |
| "learning_rate": 0.0001, |
| "loss": -0.0, |
| "mismatch_kl": 1.2251906582605443e-06, |
| "reward": 0.010937499813735485, |
| "reward/refusal_reward_func": 0.010937499813735485, |
| "reward/std": 0.005219778511673212, |
| "step": 20, |
| "timing/generation_ms": 469.4804549217224, |
| "timing/scoring_ms": 18075.80190896988, |
| "timing/total_ms": 18545.2823638916, |
| "tokens/completion": 13.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 22.75140905380249 |
| }, |
| { |
| "advantage/absmean": 0.13593749701976776, |
| "entropy": 0.6113055944442749, |
| "epoch": 0.042, |
| "grad_norm": 0.14856988549338257, |
| "importance_ratio": 0.999966561794281, |
| "learning_rate": 0.0001, |
| "loss": -0.0022, |
| "mismatch_kl": 0.005093984771519899, |
| "reward": 0.7350000143051147, |
| "reward/refusal_reward_func": 0.7350000143051147, |
| "reward/std": 0.23318447172641754, |
| "step": 21, |
| "timing/generation_ms": 6330.163478851318, |
| "timing/scoring_ms": 28551.29039287567, |
| "timing/total_ms": 34881.45387172699, |
| "tokens/completion": 715.9375, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 84.07915687561035 |
| }, |
| { |
| "advantage/absmean": 0.01318359375, |
| "entropy": 0.5851391553878784, |
| "epoch": 0.044, |
| "grad_norm": 0.040096615934913975, |
| "importance_ratio": 1.0004969835281372, |
| "learning_rate": 0.0001, |
| "loss": -0.0018, |
| "mismatch_kl": 0.009555388242006302, |
| "reward": 0.017812499776482582, |
| "reward/refusal_reward_func": 0.017812499776482582, |
| "reward/std": 0.020575225353240967, |
| "step": 22, |
| "timing/generation_ms": 2347.38065302372, |
| "timing/scoring_ms": 26964.800156652927, |
| "timing/total_ms": 29312.180809676647, |
| "tokens/completion": 255.15625, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 150.09117031097412 |
| }, |
| { |
| "advantage/absmean": 0.01718750037252903, |
| "entropy": 0.5837696194648743, |
| "epoch": 0.046, |
| "grad_norm": 0.027639162796687714, |
| "importance_ratio": 1.0036453008651733, |
| "learning_rate": 0.0001, |
| "loss": -0.0037, |
| "mismatch_kl": 0.017472539097070694, |
| "reward": 0.022499999031424522, |
| "reward/refusal_reward_func": 0.022499999031424522, |
| "reward/std": 0.021650634706020355, |
| "step": 23, |
| "timing/generation_ms": 2000.5059093236923, |
| "timing/scoring_ms": 24497.070513665676, |
| "timing/total_ms": 26497.57642298937, |
| "tokens/completion": 222.25, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 39.751105070114136 |
| }, |
| { |
| "advantage/absmean": 0.09375, |
| "entropy": 0.6268512010574341, |
| "epoch": 0.048, |
| "grad_norm": 0.18640619697968583, |
| "importance_ratio": 0.9990081787109375, |
| "learning_rate": 0.0001, |
| "loss": -0.0019, |
| "mismatch_kl": 0.005103914998471737, |
| "reward": 0.7599999904632568, |
| "reward/refusal_reward_func": 0.7599999904632568, |
| "reward/std": 0.19364915788173676, |
| "step": 24, |
| "timing/generation_ms": 5919.098302721977, |
| "timing/scoring_ms": 32965.33615142107, |
| "timing/total_ms": 38884.43445414305, |
| "tokens/completion": 674.9375, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 105.22681593894958 |
| }, |
| { |
| "advantage/absmean": 0.007929687388241291, |
| "entropy": 0.657754123210907, |
| "epoch": 0.05, |
| "grad_norm": 0.025336375406596286, |
| "importance_ratio": 1.0006119012832642, |
| "learning_rate": 0.0001, |
| "loss": -0.0006, |
| "mismatch_kl": 0.006333181634545326, |
| "reward": 0.014374999329447746, |
| "reward/refusal_reward_func": 0.014374999329447746, |
| "reward/std": 0.015398357063531876, |
| "step": 25, |
| "timing/generation_ms": 4377.142012119293, |
| "timing/scoring_ms": 25255.830891430378, |
| "timing/total_ms": 29632.97290354967, |
| "tokens/completion": 501.125, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 47.89186096191406 |
| }, |
| { |
| "advantage/absmean": 0.0, |
| "entropy": 0.17220205068588257, |
| "epoch": 0.052, |
| "grad_norm": 0.0, |
| "importance_ratio": 1.0002461671829224, |
| "learning_rate": 0.0001, |
| "loss": 0.0, |
| "mismatch_kl": 0.0026462471578270197, |
| "reward": 0.009999999776482582, |
| "reward/refusal_reward_func": 0.009999999776482582, |
| "reward/std": 0.0, |
| "step": 26, |
| "timing/generation_ms": 497.48579412698746, |
| "timing/scoring_ms": 18527.822844684124, |
| "timing/total_ms": 19025.30863881111, |
| "tokens/completion": 19.40625, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 22.794724941253662 |
| }, |
| { |
| "advantage/absmean": 0.17499999701976776, |
| "entropy": 0.546366810798645, |
| "epoch": 0.054, |
| "grad_norm": 0.22199732245019957, |
| "importance_ratio": 0.999916672706604, |
| "learning_rate": 0.0001, |
| "loss": -0.0027, |
| "mismatch_kl": 0.005502623040229082, |
| "reward": 0.7100000381469727, |
| "reward/refusal_reward_func": 0.7100000381469727, |
| "reward/std": 0.26457512378692627, |
| "step": 27, |
| "timing/generation_ms": 7701.031573116779, |
| "timing/scoring_ms": 28537.479266524315, |
| "timing/total_ms": 36238.510839641094, |
| "tokens/completion": 881.84375, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 68.64078164100647 |
| }, |
| { |
| "advantage/absmean": 0.06175781041383743, |
| "entropy": 0.6991814374923706, |
| "epoch": 0.056, |
| "grad_norm": 0.04213571392317473, |
| "importance_ratio": 0.9988569617271423, |
| "learning_rate": 0.0001, |
| "loss": -0.0026, |
| "mismatch_kl": 0.00713223684579134, |
| "reward": 0.07656250149011612, |
| "reward/refusal_reward_func": 0.07656250149011612, |
| "reward/std": 0.1120995506644249, |
| "step": 28, |
| "timing/generation_ms": 4908.71948748827, |
| "timing/scoring_ms": 35273.585848510265, |
| "timing/total_ms": 40182.305335998535, |
| "tokens/completion": 557.53125, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 92.86418867111206 |
| }, |
| { |
| "advantage/absmean": 0.2201562523841858, |
| "entropy": 0.8785261511802673, |
| "epoch": 0.058, |
| "grad_norm": 0.3194279303513244, |
| "importance_ratio": 0.998691737651825, |
| "learning_rate": 0.0001, |
| "loss": 0.0079, |
| "mismatch_kl": 0.006111000664532185, |
| "reward": 0.5774999856948853, |
| "reward/refusal_reward_func": 0.5774999856948853, |
| "reward/std": 0.2741691768169403, |
| "step": 29, |
| "timing/generation_ms": 4181.9010972976685, |
| "timing/scoring_ms": 29512.903429567814, |
| "timing/total_ms": 33694.80452686548, |
| "tokens/completion": 478.875, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 72.91563320159912 |
| }, |
| { |
| "advantage/absmean": 0.20125000178813934, |
| "entropy": 0.6921765208244324, |
| "epoch": 0.06, |
| "grad_norm": 0.4406053771041607, |
| "importance_ratio": 0.9964888095855713, |
| "learning_rate": 0.0001, |
| "loss": -0.0386, |
| "mismatch_kl": 0.01204030029475689, |
| "reward": 0.17999999225139618, |
| "reward/refusal_reward_func": 0.17999999225139618, |
| "reward/std": 0.23286262154579163, |
| "step": 30, |
| "timing/generation_ms": 3321.3287368416786, |
| "timing/scoring_ms": 29757.627181708813, |
| "timing/total_ms": 33078.95591855049, |
| "tokens/completion": 382.8125, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 53.77872610092163 |
| }, |
| { |
| "advantage/absmean": 0.0018164062639698386, |
| "entropy": 0.5210146903991699, |
| "epoch": 0.062, |
| "grad_norm": 0.01658749077570669, |
| "importance_ratio": 0.9975528120994568, |
| "learning_rate": 0.0001, |
| "loss": 0.0, |
| "mismatch_kl": 0.024219391867518425, |
| "reward": 0.010937499813735485, |
| "reward/refusal_reward_func": 0.010937499813735485, |
| "reward/std": 0.005219778511673212, |
| "step": 31, |
| "timing/generation_ms": 1940.1119500398636, |
| "timing/scoring_ms": 23180.305778980255, |
| "timing/total_ms": 25120.41772902012, |
| "tokens/completion": 190.5625, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 33.192246198654175 |
| }, |
| { |
| "advantage/absmean": 0.04218750074505806, |
| "entropy": 0.7747635245323181, |
| "epoch": 0.064, |
| "grad_norm": 0.05856219259172721, |
| "importance_ratio": 1.0011621713638306, |
| "learning_rate": 0.0001, |
| "loss": 0.0001, |
| "mismatch_kl": 0.008931240066885948, |
| "reward": 0.0456249974668026, |
| "reward/refusal_reward_func": 0.0456249974668026, |
| "reward/std": 0.06082441285252571, |
| "step": 32, |
| "timing/generation_ms": 6628.670156002045, |
| "timing/scoring_ms": 29644.853502511978, |
| "timing/total_ms": 36273.52365851402, |
| "tokens/completion": 758.0625, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 44.213383197784424 |
| }, |
| { |
| "advantage/absmean": 0.005097656510770321, |
| "entropy": 0.7032718062400818, |
| "epoch": 0.066, |
| "grad_norm": 0.004871385581572528, |
| "importance_ratio": 0.9980704188346863, |
| "learning_rate": 0.0001, |
| "loss": -0.0002, |
| "mismatch_kl": 0.017929747700691223, |
| "reward": 0.012812498956918716, |
| "reward/refusal_reward_func": 0.012812498956918716, |
| "reward/std": 0.008744417689740658, |
| "step": 33, |
| "timing/generation_ms": 2495.2172189950943, |
| "timing/scoring_ms": 23352.533906698227, |
| "timing/total_ms": 25847.75112569332, |
| "tokens/completion": 271.1875, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 146.50593042373657 |
| }, |
| { |
| "advantage/absmean": 0.20359376072883606, |
| "entropy": 0.7547333240509033, |
| "epoch": 0.068, |
| "grad_norm": 0.2936937114262587, |
| "importance_ratio": 1.0006296634674072, |
| "learning_rate": 0.0001, |
| "loss": -0.0116, |
| "mismatch_kl": 0.011033565737307072, |
| "reward": 0.47718751430511475, |
| "reward/refusal_reward_func": 0.47718751430511475, |
| "reward/std": 0.26260992884635925, |
| "step": 34, |
| "timing/generation_ms": 4908.003121614456, |
| "timing/scoring_ms": 30874.776013195515, |
| "timing/total_ms": 35782.77913480997, |
| "tokens/completion": 551.65625, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 60.00288248062134 |
| }, |
| { |
| "advantage/absmean": 0.11302733421325684, |
| "entropy": 0.7300074696540833, |
| "epoch": 0.07, |
| "grad_norm": 0.32493885105188386, |
| "importance_ratio": 0.9997804164886475, |
| "learning_rate": 0.0001, |
| "loss": -0.0011, |
| "mismatch_kl": 0.016147281974554062, |
| "reward": 0.10593750327825546, |
| "reward/refusal_reward_func": 0.10593750327825546, |
| "reward/std": 0.16747872531414032, |
| "step": 35, |
| "timing/generation_ms": 3645.879790186882, |
| "timing/scoring_ms": 28717.37616509199, |
| "timing/total_ms": 32363.255955278873, |
| "tokens/completion": 418.03125, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 58.10710334777832 |
| }, |
| { |
| "advantage/absmean": 0.015136717818677425, |
| "entropy": 0.6982847452163696, |
| "epoch": 0.072, |
| "grad_norm": 0.04051473715939973, |
| "importance_ratio": 0.9992015957832336, |
| "learning_rate": 0.0001, |
| "loss": -0.0006, |
| "mismatch_kl": 0.01328412164002657, |
| "reward": 0.019687499850988388, |
| "reward/refusal_reward_func": 0.019687499850988388, |
| "reward/std": 0.02113710716366768, |
| "step": 36, |
| "timing/generation_ms": 3033.673010766506, |
| "timing/scoring_ms": 29509.141087532043, |
| "timing/total_ms": 32542.81409829855, |
| "tokens/completion": 339.375, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 151.12143540382385 |
| }, |
| { |
| "advantage/absmean": 0.013593749143183231, |
| "entropy": 0.6699934005737305, |
| "epoch": 0.074, |
| "grad_norm": 0.02768782924464237, |
| "importance_ratio": 1.0014734268188477, |
| "learning_rate": 0.0001, |
| "loss": -0.0003, |
| "mismatch_kl": 0.00942978449165821, |
| "reward": 0.019062498584389687, |
| "reward/refusal_reward_func": 0.019062498584389687, |
| "reward/std": 0.017741085961461067, |
| "step": 37, |
| "timing/generation_ms": 4092.4242958426476, |
| "timing/scoring_ms": 29670.215159654617, |
| "timing/total_ms": 33762.639455497265, |
| "tokens/completion": 466.8125, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 77.67575216293335 |
| }, |
| { |
| "advantage/absmean": 0.09351562708616257, |
| "entropy": 0.6306953430175781, |
| "epoch": 0.076, |
| "grad_norm": 0.2799348001151443, |
| "importance_ratio": 0.9985800981521606, |
| "learning_rate": 0.0001, |
| "loss": -0.0037, |
| "mismatch_kl": 0.009972751140594482, |
| "reward": 0.06187500059604645, |
| "reward/refusal_reward_func": 0.06187500059604645, |
| "reward/std": 0.19330088794231415, |
| "step": 38, |
| "timing/generation_ms": 5255.660645663738, |
| "timing/scoring_ms": 29534.79740768671, |
| "timing/total_ms": 34790.45805335045, |
| "tokens/completion": 602.75, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 56.81032872200012 |
| }, |
| { |
| "advantage/absmean": 0.021894531324505806, |
| "entropy": 0.6860240697860718, |
| "epoch": 0.078, |
| "grad_norm": 0.021804850572780036, |
| "importance_ratio": 0.999793291091919, |
| "learning_rate": 0.0001, |
| "loss": 0.0008, |
| "mismatch_kl": 0.008842560462653637, |
| "reward": 0.028437498956918716, |
| "reward/refusal_reward_func": 0.028437498956918716, |
| "reward/std": 0.02670549787580967, |
| "step": 39, |
| "timing/generation_ms": 5519.4277837872505, |
| "timing/scoring_ms": 28062.6777485013, |
| "timing/total_ms": 33582.10553228855, |
| "tokens/completion": 635.6875, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 49.89069700241089 |
| }, |
| { |
| "advantage/absmean": 0.04843749850988388, |
| "entropy": 0.7330471873283386, |
| "epoch": 0.08, |
| "grad_norm": 0.23560813153480506, |
| "importance_ratio": 1.0003836154937744, |
| "learning_rate": 0.0001, |
| "loss": 0.0014, |
| "mismatch_kl": 0.008081368170678616, |
| "reward": 0.03500000014901161, |
| "reward/refusal_reward_func": 0.03500000014901161, |
| "reward/std": 0.13919411599636078, |
| "step": 40, |
| "timing/generation_ms": 3926.6494438052177, |
| "timing/scoring_ms": 27580.46282082796, |
| "timing/total_ms": 31507.11226463318, |
| "tokens/completion": 445.78125, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 70.10888338088989 |
| }, |
| { |
| "advantage/absmean": 0.12544921040534973, |
| "entropy": 0.5206725597381592, |
| "epoch": 0.082, |
| "grad_norm": 0.5033391726424777, |
| "importance_ratio": 1.0012822151184082, |
| "learning_rate": 0.0001, |
| "loss": -0.0225, |
| "mismatch_kl": 0.007961519993841648, |
| "reward": 0.1446875035762787, |
| "reward/refusal_reward_func": 0.1446875035762787, |
| "reward/std": 0.1963731199502945, |
| "step": 41, |
| "timing/generation_ms": 1945.6753060221672, |
| "timing/scoring_ms": 24715.371668338776, |
| "timing/total_ms": 26661.046974360943, |
| "tokens/completion": 203.96875, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 41.8072030544281 |
| }, |
| { |
| "advantage/absmean": 0.0018164062639698386, |
| "entropy": 0.82820063829422, |
| "epoch": 0.084, |
| "grad_norm": 0.0008894764900618824, |
| "importance_ratio": 0.9987862706184387, |
| "learning_rate": 0.0001, |
| "loss": 0.0002, |
| "mismatch_kl": 0.008028813637793064, |
| "reward": 0.010937499813735485, |
| "reward/refusal_reward_func": 0.010937499813735485, |
| "reward/std": 0.005219778511673212, |
| "step": 42, |
| "timing/generation_ms": 6800.906598567963, |
| "timing/scoring_ms": 31876.1548101902, |
| "timing/total_ms": 38677.06140875816, |
| "tokens/completion": 785.28125, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 59.81179714202881 |
| }, |
| { |
| "advantage/absmean": 0.050859373062849045, |
| "entropy": 0.829409658908844, |
| "epoch": 0.086, |
| "grad_norm": 0.1901939415206169, |
| "importance_ratio": 1.0000019073486328, |
| "learning_rate": 0.0001, |
| "loss": 0.003, |
| "mismatch_kl": 0.009588975459337234, |
| "reward": 0.04312499612569809, |
| "reward/refusal_reward_func": 0.04312499612569809, |
| "reward/std": 0.1388217806816101, |
| "step": 43, |
| "timing/generation_ms": 6013.470813632011, |
| "timing/scoring_ms": 28884.994342923164, |
| "timing/total_ms": 34898.465156555176, |
| "tokens/completion": 691.1875, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 51.95971155166626 |
| }, |
| { |
| "advantage/absmean": 0.01318359375, |
| "entropy": 0.9210640788078308, |
| "epoch": 0.088, |
| "grad_norm": 0.03744765208460247, |
| "importance_ratio": 0.9966821670532227, |
| "learning_rate": 0.0001, |
| "loss": -0.0019, |
| "mismatch_kl": 0.018160995095968246, |
| "reward": 0.017812499776482582, |
| "reward/refusal_reward_func": 0.017812499776482582, |
| "reward/std": 0.020575225353240967, |
| "step": 44, |
| "timing/generation_ms": 2924.579069018364, |
| "timing/scoring_ms": 20441.766560077667, |
| "timing/total_ms": 23366.34562909603, |
| "tokens/completion": 343.75, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 84.02694988250732 |
| }, |
| { |
| "advantage/absmean": 0.04843749850988388, |
| "entropy": 0.8891170024871826, |
| "epoch": 0.09, |
| "grad_norm": 0.12978747422641476, |
| "importance_ratio": 0.9995278120040894, |
| "learning_rate": 0.0001, |
| "loss": -0.0021, |
| "mismatch_kl": 0.004716904368251562, |
| "reward": 0.7849999666213989, |
| "reward/refusal_reward_func": 0.7849999666213989, |
| "reward/std": 0.13919411599636078, |
| "step": 45, |
| "timing/generation_ms": 12784.472778439522, |
| "timing/scoring_ms": 41995.29768526554, |
| "timing/total_ms": 54779.77046370506, |
| "tokens/completion": 1424.53125, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 159.0035264492035 |
| }, |
| { |
| "advantage/absmean": 0.19917967915534973, |
| "entropy": 1.1817245483398438, |
| "epoch": 0.092, |
| "grad_norm": 0.18725081241592767, |
| "importance_ratio": 1.0018072128295898, |
| "learning_rate": 0.0001, |
| "loss": -0.0256, |
| "mismatch_kl": 0.008767618797719479, |
| "reward": 0.5290625095367432, |
| "reward/refusal_reward_func": 0.5290625095367432, |
| "reward/std": 0.2553595304489136, |
| "step": 46, |
| "timing/generation_ms": 11124.341294169426, |
| "timing/scoring_ms": 37177.75782942772, |
| "timing/total_ms": 48302.099123597145, |
| "tokens/completion": 1282.71875, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 67.16122078895569 |
| }, |
| { |
| "advantage/absmean": 0.02968750149011612, |
| "entropy": 0.9758523106575012, |
| "epoch": 0.094, |
| "grad_norm": 0.038752578543884246, |
| "importance_ratio": 1.0004377365112305, |
| "learning_rate": 0.0001, |
| "loss": 0.0042, |
| "mismatch_kl": 0.007721519563347101, |
| "reward": 0.03500000014901161, |
| "reward/refusal_reward_func": 0.03500000014901161, |
| "reward/std": 0.04690415784716606, |
| "step": 47, |
| "timing/generation_ms": 8879.09684330225, |
| "timing/scoring_ms": 37631.5980181098, |
| "timing/total_ms": 46510.69486141205, |
| "tokens/completion": 1009.46875, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 73.34895396232605 |
| }, |
| { |
| "advantage/absmean": 0.13593749701976776, |
| "entropy": 0.918021559715271, |
| "epoch": 0.096, |
| "grad_norm": 0.19107099447712278, |
| "importance_ratio": 0.9999390244483948, |
| "learning_rate": 0.0001, |
| "loss": -0.0005, |
| "mismatch_kl": 0.007433234713971615, |
| "reward": 0.7350000143051147, |
| "reward/refusal_reward_func": 0.7350000143051147, |
| "reward/std": 0.23318448662757874, |
| "step": 48, |
| "timing/generation_ms": 10426.40034854412, |
| "timing/scoring_ms": 37421.48996144533, |
| "timing/total_ms": 47847.89030998945, |
| "tokens/completion": 1180.71875, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 93.57710862159729 |
| }, |
| { |
| "advantage/absmean": 0.17765624821186066, |
| "entropy": 1.0576484203338623, |
| "epoch": 0.098, |
| "grad_norm": 0.16558979067438845, |
| "importance_ratio": 1.0013474225997925, |
| "learning_rate": 0.0001, |
| "loss": -0.006, |
| "mismatch_kl": 0.006344192661345005, |
| "reward": 0.6915624737739563, |
| "reward/refusal_reward_func": 0.6915624737739563, |
| "reward/std": 0.24835848808288574, |
| "step": 49, |
| "timing/generation_ms": 8644.713327288628, |
| "timing/scoring_ms": 35074.74631816149, |
| "timing/total_ms": 43719.459645450115, |
| "tokens/completion": 984.625, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 63.12021732330322 |
| }, |
| { |
| "advantage/absmean": 0.12312500178813934, |
| "entropy": 0.8099650144577026, |
| "epoch": 0.1, |
| "grad_norm": 0.1253616039663461, |
| "importance_ratio": 1.0017279386520386, |
| "learning_rate": 0.0001, |
| "loss": -0.0052, |
| "mismatch_kl": 0.008134027011692524, |
| "reward": 0.14249999821186066, |
| "reward/refusal_reward_func": 0.14249999821186066, |
| "reward/std": 0.1628841608762741, |
| "step": 50, |
| "timing/generation_ms": 8775.396101176739, |
| "timing/scoring_ms": 40764.19186592102, |
| "timing/total_ms": 49539.58796709776, |
| "tokens/completion": 1000.1875, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 70.7229483127594 |
| }, |
| { |
| "advantage/absmean": 0.04843749850988388, |
| "entropy": 0.8692086338996887, |
| "epoch": 0.102, |
| "grad_norm": 0.016399389830222273, |
| "importance_ratio": 1.001124620437622, |
| "learning_rate": 0.0001, |
| "loss": -0.0013, |
| "mismatch_kl": 0.010322043672204018, |
| "reward": 0.7849999666213989, |
| "reward/refusal_reward_func": 0.7849999666213989, |
| "reward/std": 0.13919411599636078, |
| "step": 51, |
| "timing/generation_ms": 14720.608927309513, |
| "timing/scoring_ms": 41765.84377884865, |
| "timing/total_ms": 56486.45270615816, |
| "tokens/completion": 1616.15625, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 89.23512148857117 |
| }, |
| { |
| "advantage/absmean": 0.2948242425918579, |
| "entropy": 1.1107399463653564, |
| "epoch": 0.104, |
| "grad_norm": 0.18323432452024488, |
| "importance_ratio": 1.0023913383483887, |
| "learning_rate": 0.0001, |
| "loss": 0.0017, |
| "mismatch_kl": 0.0076919617131352425, |
| "reward": 0.32218748331069946, |
| "reward/refusal_reward_func": 0.32218748331069946, |
| "reward/std": 0.3130169212818146, |
| "step": 52, |
| "timing/generation_ms": 15175.773054361343, |
| "timing/scoring_ms": 57348.27160835266, |
| "timing/total_ms": 72524.044662714, |
| "tokens/completion": 1662.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 182.7874722480774 |
| }, |
| { |
| "advantage/absmean": 0.15343749523162842, |
| "entropy": 0.9647155404090881, |
| "epoch": 0.106, |
| "grad_norm": 0.1470849270360251, |
| "importance_ratio": 1.0000090599060059, |
| "learning_rate": 0.0001, |
| "loss": 0.0102, |
| "mismatch_kl": 0.00862339697778225, |
| "reward": 0.6565625071525574, |
| "reward/refusal_reward_func": 0.6565625071525574, |
| "reward/std": 0.22976359724998474, |
| "step": 53, |
| "timing/generation_ms": 10986.380942165852, |
| "timing/scoring_ms": 38820.41800022125, |
| "timing/total_ms": 49806.798942387104, |
| "tokens/completion": 1229.09375, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 157.4189648628235 |
| }, |
| { |
| "advantage/absmean": 0.04843749850988388, |
| "entropy": 0.8174250721931458, |
| "epoch": 0.108, |
| "grad_norm": 0.017013624591187354, |
| "importance_ratio": 1.000490427017212, |
| "learning_rate": 0.0001, |
| "loss": -0.0008, |
| "mismatch_kl": 0.004199406132102013, |
| "reward": 0.7849999666213989, |
| "reward/refusal_reward_func": 0.7849999666213989, |
| "reward/std": 0.13919411599636078, |
| "step": 54, |
| "timing/generation_ms": 13110.579743981361, |
| "timing/scoring_ms": 39618.385925889015, |
| "timing/total_ms": 52728.96566987038, |
| "tokens/completion": 1453.71875, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 159.44240832328796 |
| }, |
| { |
| "advantage/absmean": 0.07984375208616257, |
| "entropy": 0.8634753227233887, |
| "epoch": 0.11, |
| "grad_norm": 0.06408238305252761, |
| "importance_ratio": 1.0001016855239868, |
| "learning_rate": 0.0001, |
| "loss": -0.0018, |
| "mismatch_kl": 0.006974226329475641, |
| "reward": 0.7643749713897705, |
| "reward/refusal_reward_func": 0.7643749713897705, |
| "reward/std": 0.15140874683856964, |
| "step": 55, |
| "timing/generation_ms": 13839.490927755833, |
| "timing/scoring_ms": 43094.41144019365, |
| "timing/total_ms": 56933.902367949486, |
| "tokens/completion": 1539.5625, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 162.71979093551636 |
| }, |
| { |
| "advantage/absmean": 0.22667968273162842, |
| "entropy": 0.7791604399681091, |
| "epoch": 0.112, |
| "grad_norm": 0.16725001444914697, |
| "importance_ratio": 1.0006935596466064, |
| "learning_rate": 0.0001, |
| "loss": -0.0115, |
| "mismatch_kl": 0.007046831306070089, |
| "reward": 0.47468751668930054, |
| "reward/refusal_reward_func": 0.47468751668930054, |
| "reward/std": 0.2708203196525574, |
| "step": 56, |
| "timing/generation_ms": 16662.443839013577, |
| "timing/scoring_ms": 45676.41341686249, |
| "timing/total_ms": 62338.857255876064, |
| "tokens/completion": 1816.03125, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 124.58920621871948 |
| }, |
| { |
| "advantage/absmean": 0.11601562052965164, |
| "entropy": 0.7439562678337097, |
| "epoch": 0.114, |
| "grad_norm": 0.13019703908354843, |
| "importance_ratio": 1.0008944272994995, |
| "learning_rate": 0.0001, |
| "loss": -0.0026, |
| "mismatch_kl": 0.009258158504962921, |
| "reward": 0.7171875238418579, |
| "reward/refusal_reward_func": 0.7171875238418579, |
| "reward/std": 0.16097815334796906, |
| "step": 57, |
| "timing/generation_ms": 12729.440599679947, |
| "timing/scoring_ms": 44226.92193090916, |
| "timing/total_ms": 56956.362530589104, |
| "tokens/completion": 1423.09375, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 158.57960319519043 |
| }, |
| { |
| "advantage/absmean": 0.15726563334465027, |
| "entropy": 0.8287293910980225, |
| "epoch": 0.116, |
| "grad_norm": 0.14055232526730846, |
| "importance_ratio": 1.001518726348877, |
| "learning_rate": 0.0001, |
| "loss": -0.0132, |
| "mismatch_kl": 0.008714662864804268, |
| "reward": 0.6956250071525574, |
| "reward/refusal_reward_func": 0.6956250071525574, |
| "reward/std": 0.2274579405784607, |
| "step": 58, |
| "timing/generation_ms": 10148.803442716599, |
| "timing/scoring_ms": 45419.282242655754, |
| "timing/total_ms": 55568.08568537235, |
| "tokens/completion": 1146.34375, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 88.32079148292542 |
| }, |
| { |
| "advantage/absmean": 0.18738281726837158, |
| "entropy": 0.6767469644546509, |
| "epoch": 0.118, |
| "grad_norm": 0.1800028526601889, |
| "importance_ratio": 0.9996235370635986, |
| "learning_rate": 0.0001, |
| "loss": -0.0004, |
| "mismatch_kl": 0.00572241609916091, |
| "reward": 0.6946874856948853, |
| "reward/refusal_reward_func": 0.6946874856948853, |
| "reward/std": 0.26609423756599426, |
| "step": 59, |
| "timing/generation_ms": 5857.890740036964, |
| "timing/scoring_ms": 28364.16070908308, |
| "timing/total_ms": 34222.051449120045, |
| "tokens/completion": 674.78125, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 153.28042459487915 |
| }, |
| { |
| "advantage/absmean": 0.04843749850988388, |
| "entropy": 0.48612505197525024, |
| "epoch": 0.12, |
| "grad_norm": 0.017636908815215933, |
| "importance_ratio": 1.001957893371582, |
| "learning_rate": 0.0001, |
| "loss": -0.0003, |
| "mismatch_kl": 0.0077150240540504456, |
| "reward": 0.7849999666213989, |
| "reward/refusal_reward_func": 0.7849999666213989, |
| "reward/std": 0.13919411599636078, |
| "step": 60, |
| "timing/generation_ms": 13039.215676486492, |
| "timing/scoring_ms": 37324.29302483797, |
| "timing/total_ms": 50363.50870132446, |
| "tokens/completion": 1435.25, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 155.66002011299133 |
| }, |
| { |
| "advantage/absmean": 0.033906251192092896, |
| "entropy": 0.8923248648643494, |
| "epoch": 0.122, |
| "grad_norm": 0.01138046317613093, |
| "importance_ratio": 1.0010781288146973, |
| "learning_rate": 0.0001, |
| "loss": -0.0001, |
| "mismatch_kl": 0.009470692835748196, |
| "reward": 0.7925000190734863, |
| "reward/refusal_reward_func": 0.7925000190734863, |
| "reward/std": 0.09743587672710419, |
| "step": 61, |
| "timing/generation_ms": 20387.244410812855, |
| "timing/scoring_ms": 49000.582568347454, |
| "timing/total_ms": 69387.82697916031, |
| "tokens/completion": 2047.65625, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 179.17891383171082 |
| }, |
| { |
| "advantage/absmean": 0.14195312559604645, |
| "entropy": 0.6803052425384521, |
| "epoch": 0.124, |
| "grad_norm": 0.1396860758416004, |
| "importance_ratio": 1.0004676580429077, |
| "learning_rate": 0.0001, |
| "loss": 0.0005, |
| "mismatch_kl": 0.009484711103141308, |
| "reward": 0.7112500071525574, |
| "reward/refusal_reward_func": 0.7112500071525574, |
| "reward/std": 0.19915054738521576, |
| "step": 62, |
| "timing/generation_ms": 20013.910226523876, |
| "timing/scoring_ms": 51556.99533224106, |
| "timing/total_ms": 71570.90555876493, |
| "tokens/completion": 2029.46875, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 281.4015655517578 |
| }, |
| { |
| "advantage/absmean": 0.14208984375, |
| "entropy": 0.7867326736450195, |
| "epoch": 0.126, |
| "grad_norm": 0.07770627410364793, |
| "importance_ratio": 1.0014797449111938, |
| "learning_rate": 0.0001, |
| "loss": -0.0003, |
| "mismatch_kl": 0.014644050039350986, |
| "reward": 0.6584374904632568, |
| "reward/refusal_reward_func": 0.6584374904632568, |
| "reward/std": 0.21947535872459412, |
| "step": 63, |
| "timing/generation_ms": 19855.214461684227, |
| "timing/scoring_ms": 52655.76823055744, |
| "timing/total_ms": 72510.98269224167, |
| "tokens/completion": 2026.1875, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 168.5356569290161 |
| }, |
| { |
| "advantage/absmean": 0.09375, |
| "entropy": 0.722605288028717, |
| "epoch": 0.128, |
| "grad_norm": 0.10541623752837016, |
| "importance_ratio": 1.0017896890640259, |
| "learning_rate": 0.0001, |
| "loss": 0.0002, |
| "mismatch_kl": 0.009318462572991848, |
| "reward": 0.7599999904632568, |
| "reward/refusal_reward_func": 0.7599999904632568, |
| "reward/std": 0.19364915788173676, |
| "step": 64, |
| "timing/generation_ms": 20329.02915775776, |
| "timing/scoring_ms": 43974.1270840168, |
| "timing/total_ms": 64303.15624177456, |
| "tokens/completion": 2046.09375, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 69.69067120552063 |
| }, |
| { |
| "advantage/absmean": 0.13593751192092896, |
| "entropy": 0.6431602239608765, |
| "epoch": 0.13, |
| "grad_norm": 0.04996864946933639, |
| "importance_ratio": 1.0007299184799194, |
| "learning_rate": 0.0001, |
| "loss": 0.0042, |
| "mismatch_kl": 0.01128534134477377, |
| "reward": 0.7350000143051147, |
| "reward/refusal_reward_func": 0.7350000143051147, |
| "reward/std": 0.23318447172641754, |
| "step": 65, |
| "timing/generation_ms": 17292.446829378605, |
| "timing/scoring_ms": 48005.3500905633, |
| "timing/total_ms": 65297.7969199419, |
| "tokens/completion": 1868.75, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 137.745671749115 |
| }, |
| { |
| "advantage/absmean": 0.043593745678663254, |
| "entropy": 0.6788095831871033, |
| "epoch": 0.132, |
| "grad_norm": 0.1104672773690437, |
| "importance_ratio": 1.0008015632629395, |
| "learning_rate": 0.0001, |
| "loss": -0.0, |
| "mismatch_kl": 0.011132912710309029, |
| "reward": 0.7875000238418579, |
| "reward/refusal_reward_func": 0.7875000238418579, |
| "reward/std": 0.1252746880054474, |
| "step": 66, |
| "timing/generation_ms": 20402.18196809292, |
| "timing/scoring_ms": 42341.174609959126, |
| "timing/total_ms": 62743.356578052044, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 162.58158588409424 |
| }, |
| { |
| "advantage/absmean": 0.05214843899011612, |
| "entropy": 0.5562156438827515, |
| "epoch": 0.134, |
| "grad_norm": 0.016212117765327168, |
| "importance_ratio": 0.9997291564941406, |
| "learning_rate": 0.0001, |
| "loss": 0.0, |
| "mismatch_kl": 0.008888973854482174, |
| "reward": 0.7821874618530273, |
| "reward/refusal_reward_func": 0.7821874618530273, |
| "reward/std": 0.1277872771024704, |
| "step": 67, |
| "timing/generation_ms": 20475.26439279318, |
| "timing/scoring_ms": 46698.052957654, |
| "timing/total_ms": 67173.31735044718, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 151.99999165534973 |
| }, |
| { |
| "advantage/absmean": 0.08964844048023224, |
| "entropy": 0.6966589093208313, |
| "epoch": 0.136, |
| "grad_norm": 0.1577321206922815, |
| "importance_ratio": 1.0010697841644287, |
| "learning_rate": 0.0001, |
| "loss": -0.0002, |
| "mismatch_kl": 0.00940707977861166, |
| "reward": 0.7621874809265137, |
| "reward/refusal_reward_func": 0.7621874809265137, |
| "reward/std": 0.18551842868328094, |
| "step": 68, |
| "timing/generation_ms": 20498.32931160927, |
| "timing/scoring_ms": 54109.15730148554, |
| "timing/total_ms": 74607.4866130948, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 395.203547000885 |
| }, |
| { |
| "advantage/absmean": 0.27099609375, |
| "entropy": 0.5135282278060913, |
| "epoch": 0.138, |
| "grad_norm": 0.2079259512896872, |
| "importance_ratio": 1.0020484924316406, |
| "learning_rate": 0.0001, |
| "loss": -0.0, |
| "mismatch_kl": 0.012067537754774094, |
| "reward": 0.6365625262260437, |
| "reward/refusal_reward_func": 0.6365625262260437, |
| "reward/std": 0.32783961296081543, |
| "step": 69, |
| "timing/generation_ms": 20592.435374855995, |
| "timing/scoring_ms": 59319.189973175526, |
| "timing/total_ms": 79911.62534803152, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 397.43536710739136 |
| }, |
| { |
| "advantage/absmean": 0.0, |
| "entropy": 0.4793013036251068, |
| "epoch": 0.14, |
| "grad_norm": 0.0, |
| "importance_ratio": 1.0002073049545288, |
| "learning_rate": 0.0001, |
| "loss": 0.0, |
| "mismatch_kl": 0.01224527694284916, |
| "reward": 0.8100000023841858, |
| "reward/refusal_reward_func": 0.8100000023841858, |
| "reward/std": 0.0, |
| "step": 70, |
| "timing/generation_ms": 20546.609550714493, |
| "timing/scoring_ms": 42320.83362340927, |
| "timing/total_ms": 62867.443174123764, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 161.7110676765442 |
| }, |
| { |
| "advantage/absmean": 0.053593751043081284, |
| "entropy": 0.35620445013046265, |
| "epoch": 0.142, |
| "grad_norm": 0.06178490851261073, |
| "importance_ratio": 1.0008577108383179, |
| "learning_rate": 0.0001, |
| "loss": 0.0001, |
| "mismatch_kl": 0.007034921087324619, |
| "reward": 0.7793750166893005, |
| "reward/refusal_reward_func": 0.7793750166893005, |
| "reward/std": 0.08525467664003372, |
| "step": 71, |
| "timing/generation_ms": 20559.83528494835, |
| "timing/scoring_ms": 50445.407539606094, |
| "timing/total_ms": 71005.24282455444, |
| "tokens/completion": 2047.65625, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 165.77110528945923 |
| }, |
| { |
| "advantage/absmean": 0.06621094048023224, |
| "entropy": 0.34386953711509705, |
| "epoch": 0.144, |
| "grad_norm": 0.06068478204359807, |
| "importance_ratio": 1.0005115270614624, |
| "learning_rate": 0.0001, |
| "loss": 0.0001, |
| "mismatch_kl": 0.006632550619542599, |
| "reward": 0.7746874690055847, |
| "reward/refusal_reward_func": 0.7746874690055847, |
| "reward/std": 0.14985378086566925, |
| "step": 72, |
| "timing/generation_ms": 20651.6492664814, |
| "timing/scoring_ms": 53449.473068118095, |
| "timing/total_ms": 74101.1223345995, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 396.06008672714233 |
| }, |
| { |
| "advantage/absmean": 0.08964844048023224, |
| "entropy": 0.3689025640487671, |
| "epoch": 0.146, |
| "grad_norm": 0.028592002076965557, |
| "importance_ratio": 1.0001220703125, |
| "learning_rate": 0.0001, |
| "loss": 0.0, |
| "mismatch_kl": 0.009937528520822525, |
| "reward": 0.7621874809265137, |
| "reward/refusal_reward_func": 0.7621874809265137, |
| "reward/std": 0.18551842868328094, |
| "step": 73, |
| "timing/generation_ms": 20704.90287989378, |
| "timing/scoring_ms": 56059.35876071453, |
| "timing/total_ms": 76764.26164060831, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 395.15700674057007 |
| }, |
| { |
| "advantage/absmean": 0.10025390982627869, |
| "entropy": 0.3531518578529358, |
| "epoch": 0.148, |
| "grad_norm": 0.0966128922725595, |
| "importance_ratio": 0.9996236562728882, |
| "learning_rate": 0.0001, |
| "loss": -0.0001, |
| "mismatch_kl": 0.00807525310665369, |
| "reward": 0.754687488079071, |
| "reward/refusal_reward_func": 0.754687488079071, |
| "reward/std": 0.19453445076942444, |
| "step": 74, |
| "timing/generation_ms": 20194.012761116028, |
| "timing/scoring_ms": 45799.67290908098, |
| "timing/total_ms": 65993.68567019701, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 150.22299551963806 |
| }, |
| { |
| "advantage/absmean": 0.08964844048023224, |
| "entropy": 0.3910990059375763, |
| "epoch": 0.15, |
| "grad_norm": 0.14646197667696922, |
| "importance_ratio": 0.999146580696106, |
| "learning_rate": 0.0001, |
| "loss": -0.0003, |
| "mismatch_kl": 0.008608575910329819, |
| "reward": 0.7621874809265137, |
| "reward/refusal_reward_func": 0.7621874809265137, |
| "reward/std": 0.18551842868328094, |
| "step": 75, |
| "timing/generation_ms": 20237.79760301113, |
| "timing/scoring_ms": 51105.7443395257, |
| "timing/total_ms": 71343.54194253683, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 394.9270164966583 |
| }, |
| { |
| "advantage/absmean": 0.13537108898162842, |
| "entropy": 0.2564987540245056, |
| "epoch": 0.152, |
| "grad_norm": 0.10767344052989877, |
| "importance_ratio": 1.000230073928833, |
| "learning_rate": 0.0001, |
| "loss": 0.0002, |
| "mismatch_kl": 0.00910898856818676, |
| "reward": 0.7353124618530273, |
| "reward/refusal_reward_func": 0.7353124618530273, |
| "reward/std": 0.23228463530540466, |
| "step": 76, |
| "timing/generation_ms": 20231.063432991505, |
| "timing/scoring_ms": 64505.957297980785, |
| "timing/total_ms": 84737.02073097229, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 394.6906681060791 |
| }, |
| { |
| "advantage/absmean": 0.04843749850988388, |
| "entropy": 0.35127270221710205, |
| "epoch": 0.154, |
| "grad_norm": 0.05430162481667564, |
| "importance_ratio": 1.001720666885376, |
| "learning_rate": 0.0001, |
| "loss": -0.0112, |
| "mismatch_kl": 0.02907688170671463, |
| "reward": 0.7849999666213989, |
| "reward/refusal_reward_func": 0.7849999666213989, |
| "reward/std": 0.13919411599636078, |
| "step": 77, |
| "timing/generation_ms": 2299.7111305594444, |
| "timing/scoring_ms": 24145.363181829453, |
| "timing/total_ms": 26445.074312388897, |
| "tokens/completion": 256.90625, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 43.695470094680786 |
| }, |
| { |
| "advantage/absmean": 0.17824219167232513, |
| "entropy": 0.2108859121799469, |
| "epoch": 0.156, |
| "grad_norm": 0.1343157239601839, |
| "importance_ratio": 1.000138521194458, |
| "learning_rate": 0.0001, |
| "loss": -0.0061, |
| "mismatch_kl": 0.006567842327058315, |
| "reward": 0.7043750286102295, |
| "reward/refusal_reward_func": 0.7043750286102295, |
| "reward/std": 0.26504644751548767, |
| "step": 78, |
| "timing/generation_ms": 19559.65828895569, |
| "timing/scoring_ms": 56106.24121129513, |
| "timing/total_ms": 75665.89950025082, |
| "tokens/completion": 2012.8125, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 397.83462166786194 |
| }, |
| { |
| "advantage/absmean": 0.14109376072883606, |
| "entropy": 0.2118162214756012, |
| "epoch": 0.158, |
| "grad_norm": 0.04314766069392161, |
| "importance_ratio": 0.999754011631012, |
| "learning_rate": 0.0001, |
| "loss": 0.0, |
| "mismatch_kl": 0.005713644903153181, |
| "reward": 0.7293750047683716, |
| "reward/refusal_reward_func": 0.7293750047683716, |
| "reward/std": 0.23431998491287231, |
| "step": 79, |
| "timing/generation_ms": 20274.492114782333, |
| "timing/scoring_ms": 53302.132822573185, |
| "timing/total_ms": 73576.62493735552, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 396.22464632987976 |
| }, |
| { |
| "advantage/absmean": 0.13593749701976776, |
| "entropy": 0.32609474658966064, |
| "epoch": 0.16, |
| "grad_norm": 0.10083540436117842, |
| "importance_ratio": 1.0001963376998901, |
| "learning_rate": 0.0001, |
| "loss": -0.0002, |
| "mismatch_kl": 0.008770663291215897, |
| "reward": 0.7350000143051147, |
| "reward/refusal_reward_func": 0.7350000143051147, |
| "reward/std": 0.23318448662757874, |
| "step": 80, |
| "timing/generation_ms": 20281.54794126749, |
| "timing/scoring_ms": 45572.31470942497, |
| "timing/total_ms": 65853.86265069246, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 157.70197463035583 |
| }, |
| { |
| "advantage/absmean": 0.13197265565395355, |
| "entropy": 0.349658727645874, |
| "epoch": 0.162, |
| "grad_norm": 0.15801403288716265, |
| "importance_ratio": 0.9997016191482544, |
| "learning_rate": 0.0001, |
| "loss": 0.0003, |
| "mismatch_kl": 0.007918323390185833, |
| "reward": 0.7371875047683716, |
| "reward/refusal_reward_func": 0.7371875047683716, |
| "reward/std": 0.22671890258789062, |
| "step": 81, |
| "timing/generation_ms": 19729.195773601532, |
| "timing/scoring_ms": 52965.313747525215, |
| "timing/total_ms": 72694.50952112675, |
| "tokens/completion": 2031.71875, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 395.87234902381897 |
| }, |
| { |
| "advantage/absmean": 0.19093748927116394, |
| "entropy": 0.30307242274284363, |
| "epoch": 0.164, |
| "grad_norm": 0.05658978916858572, |
| "importance_ratio": 0.9997415542602539, |
| "learning_rate": 0.0001, |
| "loss": 0.0, |
| "mismatch_kl": 0.009000571444630623, |
| "reward": 0.6924999952316284, |
| "reward/refusal_reward_func": 0.6924999952316284, |
| "reward/std": 0.2553306818008423, |
| "step": 82, |
| "timing/generation_ms": 20152.134649455547, |
| "timing/scoring_ms": 55147.81706035137, |
| "timing/total_ms": 75299.95170980692, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 276.00095558166504 |
| }, |
| { |
| "advantage/absmean": 0.13197265565395355, |
| "entropy": 0.2418041229248047, |
| "epoch": 0.166, |
| "grad_norm": 0.09124961991568047, |
| "importance_ratio": 0.9993461966514587, |
| "learning_rate": 0.0001, |
| "loss": 0.0, |
| "mismatch_kl": 0.008319162763655186, |
| "reward": 0.7371875047683716, |
| "reward/refusal_reward_func": 0.7371875047683716, |
| "reward/std": 0.22671890258789062, |
| "step": 83, |
| "timing/generation_ms": 20235.445871949196, |
| "timing/scoring_ms": 60683.41539800167, |
| "timing/total_ms": 80918.86126995087, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 394.98225951194763 |
| }, |
| { |
| "advantage/absmean": 0.09492187201976776, |
| "entropy": 0.36278918385505676, |
| "epoch": 0.168, |
| "grad_norm": 0.10428837192295014, |
| "importance_ratio": 1.0009691715240479, |
| "learning_rate": 0.0001, |
| "loss": 0.0001, |
| "mismatch_kl": 0.006821990944445133, |
| "reward": 0.7593749761581421, |
| "reward/refusal_reward_func": 0.7593749761581421, |
| "reward/std": 0.19606979191303253, |
| "step": 84, |
| "timing/generation_ms": 20079.659663140774, |
| "timing/scoring_ms": 62456.88313245773, |
| "timing/total_ms": 82536.5427955985, |
| "tokens/completion": 2047.4375, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 395.080442905426 |
| }, |
| { |
| "advantage/absmean": 0.16296875476837158, |
| "entropy": 0.2741187810897827, |
| "epoch": 0.17, |
| "grad_norm": 0.05559178148711944, |
| "importance_ratio": 1.0001330375671387, |
| "learning_rate": 0.0001, |
| "loss": 0.0002, |
| "mismatch_kl": 0.008196860551834106, |
| "reward": 0.7168750166893005, |
| "reward/refusal_reward_func": 0.7168750166893005, |
| "reward/std": 0.2492668777704239, |
| "step": 85, |
| "timing/generation_ms": 20191.432282328606, |
| "timing/scoring_ms": 66640.07867872715, |
| "timing/total_ms": 86831.51096105576, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 395.3853757381439 |
| }, |
| { |
| "advantage/absmean": 0.06513672322034836, |
| "entropy": 0.3744433522224426, |
| "epoch": 0.172, |
| "grad_norm": 0.02027358693373687, |
| "importance_ratio": 0.9997438192367554, |
| "learning_rate": 0.0001, |
| "loss": 0.0001, |
| "mismatch_kl": 0.0079119261354208, |
| "reward": 0.7740625143051147, |
| "reward/refusal_reward_func": 0.7740625143051147, |
| "reward/std": 0.1449754238128662, |
| "step": 86, |
| "timing/generation_ms": 20130.67189604044, |
| "timing/scoring_ms": 63243.81287395954, |
| "timing/total_ms": 83374.48476999998, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 394.553094625473 |
| }, |
| { |
| "advantage/absmean": 0.06621094048023224, |
| "entropy": 0.2802242636680603, |
| "epoch": 0.174, |
| "grad_norm": 0.11007226741752094, |
| "importance_ratio": 1.0002408027648926, |
| "learning_rate": 0.0001, |
| "loss": -0.0, |
| "mismatch_kl": 0.010727161541581154, |
| "reward": 0.7746875286102295, |
| "reward/refusal_reward_func": 0.7746875286102295, |
| "reward/std": 0.14985376596450806, |
| "step": 87, |
| "timing/generation_ms": 20197.582133114338, |
| "timing/scoring_ms": 60626.82098895311, |
| "timing/total_ms": 80824.40312206745, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 394.6186418533325 |
| }, |
| { |
| "advantage/absmean": 0.15476563572883606, |
| "entropy": 0.15260648727416992, |
| "epoch": 0.176, |
| "grad_norm": 0.09113868718318835, |
| "importance_ratio": 0.9991167187690735, |
| "learning_rate": 0.0001, |
| "loss": -0.0, |
| "mismatch_kl": 0.007635745219886303, |
| "reward": 0.7215625047683716, |
| "reward/refusal_reward_func": 0.7215625047683716, |
| "reward/std": 0.2370404750108719, |
| "step": 88, |
| "timing/generation_ms": 20186.022453010082, |
| "timing/scoring_ms": 65319.87015157938, |
| "timing/total_ms": 85505.89260458946, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 394.704843044281 |
| }, |
| { |
| "advantage/absmean": 0.09375, |
| "entropy": 0.2925874888896942, |
| "epoch": 0.178, |
| "grad_norm": 0.10900817571461202, |
| "importance_ratio": 1.0003485679626465, |
| "learning_rate": 0.0001, |
| "loss": 0.0003, |
| "mismatch_kl": 0.01015115063637495, |
| "reward": 0.7599999904632568, |
| "reward/refusal_reward_func": 0.7599999904632568, |
| "reward/std": 0.19364915788173676, |
| "step": 89, |
| "timing/generation_ms": 19985.6186658144, |
| "timing/scoring_ms": 50727.93058305979, |
| "timing/total_ms": 70713.54924887419, |
| "tokens/completion": 2039.15625, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 292.40805864334106 |
| }, |
| { |
| "advantage/absmean": 0.24515625834465027, |
| "entropy": 0.24920716881752014, |
| "epoch": 0.18, |
| "grad_norm": 0.12621044924209393, |
| "importance_ratio": 1.000746250152588, |
| "learning_rate": 0.0001, |
| "loss": 0.0, |
| "mismatch_kl": 0.008628414012491703, |
| "reward": 0.6162500381469727, |
| "reward/refusal_reward_func": 0.6162500381469727, |
| "reward/std": 0.2840307056903839, |
| "step": 90, |
| "timing/generation_ms": 20263.45807313919, |
| "timing/scoring_ms": 69397.22065627575, |
| "timing/total_ms": 89660.67872941494, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 394.5488703250885 |
| }, |
| { |
| "advantage/absmean": 0.10718750208616257, |
| "entropy": 0.3780635893344879, |
| "epoch": 0.182, |
| "grad_norm": 0.041883076718277436, |
| "importance_ratio": 1.0010097026824951, |
| "learning_rate": 0.0001, |
| "loss": 0.0001, |
| "mismatch_kl": 0.009139418601989746, |
| "reward": 0.7487499713897705, |
| "reward/refusal_reward_func": 0.7487499713897705, |
| "reward/std": 0.1976384073495865, |
| "step": 91, |
| "timing/generation_ms": 20657.530024647713, |
| "timing/scoring_ms": 65376.88625603914, |
| "timing/total_ms": 86034.41628068686, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 395.1356108188629 |
| }, |
| { |
| "advantage/absmean": 0.13974609971046448, |
| "entropy": 0.3699057102203369, |
| "epoch": 0.184, |
| "grad_norm": 0.06598040996573111, |
| "importance_ratio": 0.9998457431793213, |
| "learning_rate": 0.0001, |
| "loss": 0.0, |
| "mismatch_kl": 0.0077532450668513775, |
| "reward": 0.7271875143051147, |
| "reward/refusal_reward_func": 0.7271875143051147, |
| "reward/std": 0.2168990969657898, |
| "step": 92, |
| "timing/generation_ms": 20616.587534546852, |
| "timing/scoring_ms": 47124.867990612984, |
| "timing/total_ms": 67741.45552515984, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 168.30030918121338 |
| }, |
| { |
| "advantage/absmean": 0.19189453125, |
| "entropy": 0.34180349111557007, |
| "epoch": 0.186, |
| "grad_norm": 0.14232699624625103, |
| "importance_ratio": 0.999165952205658, |
| "learning_rate": 0.0001, |
| "loss": 0.0003, |
| "mismatch_kl": 0.008044413290917873, |
| "reward": 0.6871874928474426, |
| "reward/refusal_reward_func": 0.6871874928474426, |
| "reward/std": 0.2622169256210327, |
| "step": 93, |
| "timing/generation_ms": 20529.827870428562, |
| "timing/scoring_ms": 60052.588775753975, |
| "timing/total_ms": 80582.41664618254, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 396.85295939445496 |
| }, |
| { |
| "advantage/absmean": 0.20988282561302185, |
| "entropy": 0.315336674451828, |
| "epoch": 0.188, |
| "grad_norm": 0.10320818511601894, |
| "importance_ratio": 1.000422477722168, |
| "learning_rate": 0.0001, |
| "loss": -0.0, |
| "mismatch_kl": 0.007602104917168617, |
| "reward": 0.6856250166893005, |
| "reward/refusal_reward_func": 0.6856250166893005, |
| "reward/std": 0.28907111287117004, |
| "step": 94, |
| "timing/generation_ms": 20579.548463225365, |
| "timing/scoring_ms": 51010.259330272675, |
| "timing/total_ms": 71589.80779349804, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 395.95818734169006 |
| }, |
| { |
| "advantage/absmean": 0.15345704555511475, |
| "entropy": 0.5262030363082886, |
| "epoch": 0.19, |
| "grad_norm": 0.05870104388608926, |
| "importance_ratio": 0.9999489188194275, |
| "learning_rate": 0.0001, |
| "loss": 0.0002, |
| "mismatch_kl": 0.0076672472059726715, |
| "reward": 0.7190625071525574, |
| "reward/refusal_reward_func": 0.7190625071525574, |
| "reward/std": 0.2384108603000641, |
| "step": 95, |
| "timing/generation_ms": 20614.634588360786, |
| "timing/scoring_ms": 65994.11156028509, |
| "timing/total_ms": 86608.74614864588, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 395.0603678226471 |
| }, |
| { |
| "advantage/absmean": 0.23779296875, |
| "entropy": 0.3904465436935425, |
| "epoch": 0.192, |
| "grad_norm": 0.1935352935904031, |
| "importance_ratio": 1.000299334526062, |
| "learning_rate": 0.0001, |
| "loss": 0.0, |
| "mismatch_kl": 0.008834589272737503, |
| "reward": 0.6578124761581421, |
| "reward/refusal_reward_func": 0.6578124761581421, |
| "reward/std": 0.2928708493709564, |
| "step": 96, |
| "timing/generation_ms": 20604.460656642914, |
| "timing/scoring_ms": 67186.7751404643, |
| "timing/total_ms": 87791.23579710722, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 394.96749925613403 |
| }, |
| { |
| "advantage/absmean": 0.2852538824081421, |
| "entropy": 0.32257264852523804, |
| "epoch": 0.194, |
| "grad_norm": 0.11248909611986616, |
| "importance_ratio": 0.9994723796844482, |
| "learning_rate": 0.0001, |
| "loss": -0.0, |
| "mismatch_kl": 0.008222612552344799, |
| "reward": 0.6115624904632568, |
| "reward/refusal_reward_func": 0.6115624904632568, |
| "reward/std": 0.3214528560638428, |
| "step": 97, |
| "timing/generation_ms": 20211.18316054344, |
| "timing/scoring_ms": 60143.2975307107, |
| "timing/total_ms": 80354.48069125414, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 395.2164263725281 |
| }, |
| { |
| "advantage/absmean": 0.09375, |
| "entropy": 0.45740675926208496, |
| "epoch": 0.196, |
| "grad_norm": 0.12144158106340411, |
| "importance_ratio": 0.9998034834861755, |
| "learning_rate": 0.0001, |
| "loss": 0.0001, |
| "mismatch_kl": 0.008089970797300339, |
| "reward": 0.7599999904632568, |
| "reward/refusal_reward_func": 0.7599999904632568, |
| "reward/std": 0.19364915788173676, |
| "step": 98, |
| "timing/generation_ms": 20260.59687882662, |
| "timing/scoring_ms": 44623.64313751459, |
| "timing/total_ms": 64884.24001634121, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 131.37106108665466 |
| }, |
| { |
| "advantage/absmean": 0.04843749850988388, |
| "entropy": 0.5326197147369385, |
| "epoch": 0.198, |
| "grad_norm": 0.12835217845348193, |
| "importance_ratio": 0.9994455575942993, |
| "learning_rate": 0.0001, |
| "loss": -0.0, |
| "mismatch_kl": 0.009172793477773666, |
| "reward": 0.7849999666213989, |
| "reward/refusal_reward_func": 0.7849999666213989, |
| "reward/std": 0.13919411599636078, |
| "step": 99, |
| "timing/generation_ms": 20207.647144794464, |
| "timing/scoring_ms": 49733.94272476435, |
| "timing/total_ms": 69941.58986955881, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 161.61080026626587 |
| }, |
| { |
| "advantage/absmean": 0.10171875357627869, |
| "entropy": 0.5042125582695007, |
| "epoch": 0.2, |
| "grad_norm": 0.08743211269759214, |
| "importance_ratio": 1.0003973245620728, |
| "learning_rate": 0.0001, |
| "loss": -0.0002, |
| "mismatch_kl": 0.01178868766874075, |
| "reward": 0.7518749833106995, |
| "reward/refusal_reward_func": 0.7518749833106995, |
| "reward/std": 0.17614690959453583, |
| "step": 100, |
| "timing/generation_ms": 20365.172304213047, |
| "timing/scoring_ms": 65265.56546241045, |
| "timing/total_ms": 85630.7377666235, |
| "tokens/completion": 2048.0, |
| "tokens/masked_fraction": 0.0, |
| "wall_clock/generate_s": 394.74688720703125 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 500, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|