{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2733333333333333, "eval_steps": 50, "global_step": 205, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "advantage/absmean": 0.12451171875, "entropy": 1.3932524919509888, "epoch": 0.0013333333333333333, "grad_norm": 0.016694727116637192, "importance_ratio": 0.9986082315444946, "learning_rate": 0.0, "loss": -0.0189, "mismatch_kl": 0.004300346598029137, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 1, "timing/generation_ms": 11961.050138808787, "timing/scoring_ms": 0.0, "timing/total_ms": 11961.050138808787, "tokens/completion": 551.78125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 248.72634100914001 }, { "advantage/absmean": 0.12451171875, "entropy": 1.0695139169692993, "epoch": 0.0026666666666666666, "grad_norm": 0.008567213424127631, "importance_ratio": 0.9980430603027344, "learning_rate": 1.0000000000000002e-06, "loss": -0.0055, "mismatch_kl": 0.0036789600271731615, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 2, "timing/generation_ms": 11398.794241249561, "timing/scoring_ms": 0.0, "timing/total_ms": 11398.794241249561, "tokens/completion": 647.02734375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 67.39928388595581 }, { "advantage/absmean": 0.12451171875, "entropy": 1.2690891027450562, "epoch": 0.004, "grad_norm": 0.007856590727089238, "importance_ratio": 0.9990478157997131, "learning_rate": 2.0000000000000003e-06, "loss": -0.0147, "mismatch_kl": 0.00404919171705842, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 3, "timing/generation_ms": 13145.053108222783, "timing/scoring_ms": 0.0, "timing/total_ms": 13145.053108222783, "tokens/completion": 695.94140625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 188.99010276794434 }, { "advantage/absmean": 0.12451171875, "entropy": 0.5635457634925842, "epoch": 0.005333333333333333, "grad_norm": 0.008427354641048032, "importance_ratio": 0.9995828866958618, "learning_rate": 3e-06, "loss": -0.0056, "mismatch_kl": 0.0024689023848623037, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 4, "timing/generation_ms": 12098.999472334981, "timing/scoring_ms": 0.0, "timing/total_ms": 12098.999472334981, "tokens/completion": 634.3515625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 52.7923378944397 }, { "advantage/absmean": 0.12451171875, "entropy": 0.8588207364082336, "epoch": 0.006666666666666667, "grad_norm": 0.015271082061520619, "importance_ratio": 0.9998404383659363, "learning_rate": 4.000000000000001e-06, "loss": -0.0201, "mismatch_kl": 0.003175633493810892, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 5, "timing/generation_ms": 9795.204265974462, "timing/scoring_ms": 0.0, "timing/total_ms": 9795.204265974462, "tokens/completion": 595.30078125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 56.867586612701416 }, { "advantage/absmean": 0.12451171875, "entropy": 1.0917283296585083, "epoch": 0.008, "grad_norm": 0.015440441848262498, "importance_ratio": 1.0006937980651855, "learning_rate": 5e-06, "loss": -0.0046, "mismatch_kl": 0.003965948708355427, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 6, "timing/generation_ms": 3492.4034476280212, "timing/scoring_ms": 0.0, "timing/total_ms": 3492.4034476280212, "tokens/completion": 176.77734375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 61.55981087684631 }, { "advantage/absmean": 0.12451171875, "entropy": 0.750698983669281, "epoch": 0.009333333333333334, "grad_norm": 0.008842566430176115, "importance_ratio": 1.0032514333724976, "learning_rate": 5e-06, "loss": 0.0042, "mismatch_kl": 0.0037081094924360514, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 7, "timing/generation_ms": 12012.088196352124, "timing/scoring_ms": 0.0, "timing/total_ms": 12012.088196352124, "tokens/completion": 664.06640625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 80.06084942817688 }, { "advantage/absmean": 0.12451171875, "entropy": 0.6958726644515991, "epoch": 0.010666666666666666, "grad_norm": 0.020865513665125984, "importance_ratio": 0.9998727440834045, "learning_rate": 5e-06, "loss": -0.0015, "mismatch_kl": 0.003091922029852867, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 8, "timing/generation_ms": 7164.519478566945, "timing/scoring_ms": 0.0, "timing/total_ms": 7164.519478566945, "tokens/completion": 376.96484375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 122.57408618927002 }, { "advantage/absmean": 0.12451171875, "entropy": 0.647992730140686, "epoch": 0.012, "grad_norm": 0.010516528439614162, "importance_ratio": 0.9973449110984802, "learning_rate": 5e-06, "loss": 0.0348, "mismatch_kl": 0.002668753731995821, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 9, "timing/generation_ms": 9473.532978445292, "timing/scoring_ms": 0.0, "timing/total_ms": 9473.532978445292, "tokens/completion": 589.9375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 111.60580968856812 }, { "advantage/absmean": 0.12451171875, "entropy": 0.9175997972488403, "epoch": 0.013333333333333334, "grad_norm": 0.017217377658999368, "importance_ratio": 0.9963379502296448, "learning_rate": 5e-06, "loss": -0.0133, "mismatch_kl": 0.003761034458875656, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 10, "timing/generation_ms": 8803.215935826302, "timing/scoring_ms": 0.0, "timing/total_ms": 8803.215935826302, "tokens/completion": 432.890625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 137.27361369132996 }, { "advantage/absmean": 0.12451171875, "entropy": 0.6505714654922485, "epoch": 0.014666666666666666, "grad_norm": 0.0034942507757306364, "importance_ratio": 0.9997450113296509, "learning_rate": 5e-06, "loss": 0.0567, "mismatch_kl": 0.025293370708823204, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 11, "timing/generation_ms": 28037.367599084973, "timing/scoring_ms": 0.0, "timing/total_ms": 28037.367599084973, "tokens/completion": 1677.38671875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 147.27029275894165 }, { "advantage/absmean": 0.12451171875, "entropy": 0.623925507068634, "epoch": 0.016, "grad_norm": 0.004363286027787366, "importance_ratio": 0.9998379349708557, "learning_rate": 5e-06, "loss": 0.037, "mismatch_kl": 0.027607521042227745, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 12, "timing/generation_ms": 30658.961144275963, "timing/scoring_ms": 0.0, "timing/total_ms": 30658.961144275963, "tokens/completion": 1772.48046875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 228.39264035224915 }, { "advantage/absmean": 0.12451171875, "entropy": 1.2309722900390625, "epoch": 0.017333333333333333, "grad_norm": 0.01910079735377139, "importance_ratio": 0.9967860579490662, "learning_rate": 5e-06, "loss": -0.0146, "mismatch_kl": 0.004334039054811001, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 13, "timing/generation_ms": 7481.697678565979, "timing/scoring_ms": 0.0, "timing/total_ms": 7481.697678565979, "tokens/completion": 458.546875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 125.09760117530823 }, { "advantage/absmean": 0.12451171875, "entropy": 0.583360493183136, "epoch": 0.018666666666666668, "grad_norm": 0.006859469099074894, "importance_ratio": 0.9988465905189514, "learning_rate": 5e-06, "loss": -0.0041, "mismatch_kl": 0.0028068351093679667, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 14, "timing/generation_ms": 8050.086663104594, "timing/scoring_ms": 0.0, "timing/total_ms": 8050.086663104594, "tokens/completion": 466.06640625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 117.39565086364746 }, { "advantage/absmean": 0.12451171875, "entropy": 0.7860226035118103, "epoch": 0.02, "grad_norm": 0.011283066327858677, "importance_ratio": 1.002608299255371, "learning_rate": 5e-06, "loss": -0.0035, "mismatch_kl": 0.004051415715366602, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 15, "timing/generation_ms": 9803.531439974904, "timing/scoring_ms": 0.0, "timing/total_ms": 9803.531439974904, "tokens/completion": 522.2109375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 48.61639094352722 }, { "advantage/absmean": 0.12451171875, "entropy": 0.7184260487556458, "epoch": 0.021333333333333333, "grad_norm": 0.010228445907240152, "importance_ratio": 1.000801920890808, "learning_rate": 5e-06, "loss": -0.0066, "mismatch_kl": 0.006085229571908712, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 16, "timing/generation_ms": 8376.314821653068, "timing/scoring_ms": 0.0, "timing/total_ms": 8376.314821653068, "tokens/completion": 458.83984375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 69.11118984222412 }, { "advantage/absmean": 0.12451171875, "entropy": 0.18061073124408722, "epoch": 0.02266666666666667, "grad_norm": 0.0036474713562644418, "importance_ratio": 0.9987739324569702, "learning_rate": 5e-06, "loss": 0.0657, "mismatch_kl": 0.025802385061979294, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 17, "timing/generation_ms": 16682.96501878649, "timing/scoring_ms": 0.0, "timing/total_ms": 16682.96501878649, "tokens/completion": 1178.22265625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 110.8058807849884 }, { "advantage/absmean": 0.12451171875, "entropy": 0.3768082559108734, "epoch": 0.024, "grad_norm": 0.007994642717131743, "importance_ratio": 0.9989356398582458, "learning_rate": 5e-06, "loss": 0.0198, "mismatch_kl": 0.0024773485492914915, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 18, "timing/generation_ms": 45841.41308255494, "timing/scoring_ms": 0.0, "timing/total_ms": 45841.41308255494, "tokens/completion": 2401.60546875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 182.70060086250305 }, { "advantage/absmean": 0.12451171875, "entropy": 0.9849978685379028, "epoch": 0.025333333333333333, "grad_norm": 0.007975010652496835, "importance_ratio": 0.9994485974311829, "learning_rate": 5e-06, "loss": -0.0032, "mismatch_kl": 0.007306213956326246, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 19, "timing/generation_ms": 21281.952754594386, "timing/scoring_ms": 0.0, "timing/total_ms": 21281.952754594386, "tokens/completion": 1127.03515625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 118.257479429245 }, { "advantage/absmean": 0.12451171875, "entropy": 0.783280074596405, "epoch": 0.02666666666666667, "grad_norm": 0.01013309688610727, "importance_ratio": 1.0076329708099365, "learning_rate": 5e-06, "loss": -0.002, "mismatch_kl": 0.008437588810920715, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 20, "timing/generation_ms": 11283.36211759597, "timing/scoring_ms": 0.0, "timing/total_ms": 11283.36211759597, "tokens/completion": 603.92578125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 50.433815717697144 }, { "advantage/absmean": 0.12451171875, "entropy": 0.18535619974136353, "epoch": 0.028, "grad_norm": 0.12592122275182266, "importance_ratio": 0.994857132434845, "learning_rate": 5e-06, "loss": 0.057, "mismatch_kl": 0.004472589120268822, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 21, "timing/generation_ms": 69204.76855803281, "timing/scoring_ms": 0.0, "timing/total_ms": 69204.76855803281, "tokens/completion": 3062.171875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 335.8162593841553 }, { "advantage/absmean": 0.12451171875, "entropy": 0.7172983884811401, "epoch": 0.029333333333333333, "grad_norm": 0.011698600330274578, "importance_ratio": 1.0030107498168945, "learning_rate": 5e-06, "loss": -0.0094, "mismatch_kl": 0.03951645269989967, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 22, "timing/generation_ms": 16505.55451028049, "timing/scoring_ms": 0.0, "timing/total_ms": 16505.55451028049, "tokens/completion": 675.60546875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 61.02479434013367 }, { "advantage/absmean": 0.12451171875, "entropy": 0.8583077192306519, "epoch": 0.030666666666666665, "grad_norm": 0.02332906550498323, "importance_ratio": 1.0737003087997437, "learning_rate": 5e-06, "loss": 0.0468, "mismatch_kl": 0.21222208440303802, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 23, "timing/generation_ms": 47965.167357586324, "timing/scoring_ms": 0.0, "timing/total_ms": 47965.167357586324, "tokens/completion": 2437.57421875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 184.88851642608643 }, { "advantage/absmean": 0.12451171875, "entropy": 0.6403871178627014, "epoch": 0.032, "grad_norm": 0.0064139472738614185, "importance_ratio": 1.0027199983596802, "learning_rate": 5e-06, "loss": 0.0079, "mismatch_kl": 0.029356306418776512, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 24, "timing/generation_ms": 25231.056010350585, "timing/scoring_ms": 0.0, "timing/total_ms": 25231.056010350585, "tokens/completion": 1253.125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 127.16959929466248 }, { "advantage/absmean": 0.12451171875, "entropy": 0.26308295130729675, "epoch": 0.03333333333333333, "grad_norm": 0.004856521131545869, "importance_ratio": 0.99989914894104, "learning_rate": 5e-06, "loss": 0.0162, "mismatch_kl": 0.006057343445718288, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 25, "timing/generation_ms": 44386.24160736799, "timing/scoring_ms": 0.0, "timing/total_ms": 44386.24160736799, "tokens/completion": 2212.2421875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 173.18823885917664 }, { "advantage/absmean": 0.12451171875, "entropy": 0.12470932304859161, "epoch": 0.034666666666666665, "grad_norm": 0.001678447935003649, "importance_ratio": 1.0007412433624268, "learning_rate": 5e-06, "loss": 0.0462, "mismatch_kl": 0.001119845313951373, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 26, "timing/generation_ms": 100999.46400336921, "timing/scoring_ms": 0.0, "timing/total_ms": 100999.46400336921, "tokens/completion": 3716.6796875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 303.84296584129333 }, { "advantage/absmean": 0.12451171875, "entropy": 0.17583802342414856, "epoch": 0.036, "grad_norm": 0.002584350761592735, "importance_ratio": 1.001440405845642, "learning_rate": 5e-06, "loss": 0.0264, "mismatch_kl": 0.0013389256782829762, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 27, "timing/generation_ms": 55200.44738613069, "timing/scoring_ms": 0.0, "timing/total_ms": 55200.44738613069, "tokens/completion": 2656.7265625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 294.736074924469 }, { "advantage/absmean": 0.12451171875, "entropy": 0.19836626946926117, "epoch": 0.037333333333333336, "grad_norm": 0.005548904662699889, "importance_ratio": 1.0022764205932617, "learning_rate": 5e-06, "loss": 0.0251, "mismatch_kl": 0.0019016863079741597, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 28, "timing/generation_ms": 57617.69188474864, "timing/scoring_ms": 0.0, "timing/total_ms": 57617.69188474864, "tokens/completion": 2797.6171875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 228.97359490394592 }, { "advantage/absmean": 0.12451171875, "entropy": 0.37241131067276, "epoch": 0.03866666666666667, "grad_norm": 0.02006388030524017, "importance_ratio": 1.053019404411316, "learning_rate": 5e-06, "loss": 0.0557, "mismatch_kl": 0.9634742736816406, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 29, "timing/generation_ms": 41741.05513561517, "timing/scoring_ms": 0.0, "timing/total_ms": 41741.05513561517, "tokens/completion": 2055.87890625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 207.62974190711975 }, { "advantage/absmean": 0.12451171875, "entropy": 0.25762397050857544, "epoch": 0.04, "grad_norm": 0.006806951429177103, "importance_ratio": 0.983231246471405, "learning_rate": 5e-06, "loss": 0.0364, "mismatch_kl": 0.06448693573474884, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 30, "timing/generation_ms": 29489.30747061968, "timing/scoring_ms": 0.0, "timing/total_ms": 29489.30747061968, "tokens/completion": 1709.59765625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 175.62516474723816 }, { "advantage/absmean": 0.12451171875, "entropy": 0.019520161673426628, "epoch": 0.04133333333333333, "grad_norm": 0.0005178617259035183, "importance_ratio": 0.9998506307601929, "learning_rate": 5e-06, "loss": 0.0014, "mismatch_kl": 0.0017281156033277512, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 31, "timing/generation_ms": 255150.22794622928, "timing/scoring_ms": 0.0, "timing/total_ms": 255150.22794622928, "tokens/completion": 6100.89453125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 537.7091252803802 }, { "advantage/absmean": 0.12451171875, "entropy": 0.005344062577933073, "epoch": 0.042666666666666665, "grad_norm": 0.00042076548606043374, "importance_ratio": 1.0000818967819214, "learning_rate": 5e-06, "loss": 0.0, "mismatch_kl": 0.00012820436677429825, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 32, "timing/generation_ms": 252645.98809182644, "timing/scoring_ms": 0.0, "timing/total_ms": 252645.98809182644, "tokens/completion": 6144.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 564.6809096336365 }, { "advantage/absmean": 0.12451171875, "entropy": 0.0041460455395281315, "epoch": 0.044, "grad_norm": 0.0004905946483254039, "importance_ratio": 1.0000282526016235, "learning_rate": 5e-06, "loss": 0.0, "mismatch_kl": 6.918103463249281e-05, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 33, "timing/generation_ms": 262179.48642838746, "timing/scoring_ms": 0.0, "timing/total_ms": 262179.48642838746, "tokens/completion": 6144.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 574.2838616371155 }, { "advantage/absmean": 0.12451171875, "entropy": 0.005189419258385897, "epoch": 0.04533333333333334, "grad_norm": 0.0003380219234355203, "importance_ratio": 1.0000487565994263, "learning_rate": 5e-06, "loss": 0.0, "mismatch_kl": 7.488115079468116e-05, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 34, "timing/generation_ms": 257649.44062847644, "timing/scoring_ms": 0.0, "timing/total_ms": 257649.44062847644, "tokens/completion": 6144.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 616.5528900623322 }, { "advantage/absmean": 0.12451171875, "entropy": 0.005219260696321726, "epoch": 0.04666666666666667, "grad_norm": 0.0006402287013777213, "importance_ratio": 1.0000388622283936, "learning_rate": 5e-06, "loss": 0.0, "mismatch_kl": 0.00010059373016702011, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 35, "timing/generation_ms": 263093.6838546768, "timing/scoring_ms": 0.0, "timing/total_ms": 263093.6838546768, "tokens/completion": 6144.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 567.3024535179138 }, { "advantage/absmean": 0.12451171875, "entropy": 0.008569693192839622, "epoch": 0.048, "grad_norm": 0.0005047742243801816, "importance_ratio": 1.0000779628753662, "learning_rate": 5e-06, "loss": 0.0004, "mismatch_kl": 0.0001211672934005037, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 36, "timing/generation_ms": 242657.4441930279, "timing/scoring_ms": 0.0, "timing/total_ms": 242657.4441930279, "tokens/completion": 6123.421875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 543.5283715724945 }, { "advantage/absmean": 0.12451171875, "entropy": 0.011535107158124447, "epoch": 0.04933333333333333, "grad_norm": 0.0004641880444433118, "importance_ratio": 1.0000940561294556, "learning_rate": 5e-06, "loss": 0.0, "mismatch_kl": 0.00016296253306791186, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 37, "timing/generation_ms": 253055.44871557504, "timing/scoring_ms": 0.0, "timing/total_ms": 253055.44871557504, "tokens/completion": 6100.4375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 529.3097188472748 }, { "advantage/absmean": 0.12451171875, "entropy": 0.008278747089207172, "epoch": 0.050666666666666665, "grad_norm": 0.0015602978869027017, "importance_ratio": 1.000083565711975, "learning_rate": 5e-06, "loss": 0.0, "mismatch_kl": 0.00012404406152199954, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 38, "timing/generation_ms": 259809.8956849426, "timing/scoring_ms": 0.0, "timing/total_ms": 259809.8956849426, "tokens/completion": 6144.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 572.6026647090912 }, { "advantage/absmean": 0.12451171875, "entropy": 0.0070807370357215405, "epoch": 0.052, "grad_norm": 0.0004621624833577141, "importance_ratio": 1.000075340270996, "learning_rate": 5e-06, "loss": -0.0, "mismatch_kl": 0.00010999527876265347, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 39, "timing/generation_ms": 266124.4311518967, "timing/scoring_ms": 0.0, "timing/total_ms": 266124.4311518967, "tokens/completion": 6144.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 645.3593230247498 }, { "advantage/absmean": 0.12451171875, "entropy": 0.00655187526717782, "epoch": 0.05333333333333334, "grad_norm": 0.00032702966921445734, "importance_ratio": 1.0000351667404175, "learning_rate": 5e-06, "loss": 0.0, "mismatch_kl": 0.00014068085874896497, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 40, "timing/generation_ms": 262011.0893426463, "timing/scoring_ms": 0.0, "timing/total_ms": 262011.0893426463, "tokens/completion": 6144.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 515.61732006073 }, { "advantage/absmean": 0.12451171875, "entropy": 0.0039160363376140594, "epoch": 0.05466666666666667, "grad_norm": 0.0003724535269895079, "importance_ratio": 1.0000481605529785, "learning_rate": 5e-06, "loss": 0.0, "mismatch_kl": 7.484626985387877e-05, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 41, "timing/generation_ms": 255759.41647868603, "timing/scoring_ms": 0.0, "timing/total_ms": 255759.41647868603, "tokens/completion": 6144.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 565.8597645759583 }, { "advantage/absmean": 0.12451171875, "entropy": 0.01127232052385807, "epoch": 0.056, "grad_norm": 0.0008175801103252065, "importance_ratio": 1.0000771284103394, "learning_rate": 5e-06, "loss": 0.0068, "mismatch_kl": 0.00016380040324293077, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 42, "timing/generation_ms": 238812.61033378541, "timing/scoring_ms": 0.0, "timing/total_ms": 238812.61033378541, "tokens/completion": 6073.61328125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 484.4759180545807 }, { "advantage/absmean": 0.12451171875, "entropy": 0.01085229218006134, "epoch": 0.05733333333333333, "grad_norm": 0.0004839828768652627, "importance_ratio": 1.0000557899475098, "learning_rate": 5e-06, "loss": 0.0063, "mismatch_kl": 0.00013297870464157313, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 43, "timing/generation_ms": 256715.18344525248, "timing/scoring_ms": 0.0, "timing/total_ms": 256715.18344525248, "tokens/completion": 6078.20703125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 537.6344306468964 }, { "advantage/absmean": 0.12451171875, "entropy": 0.005195128731429577, "epoch": 0.058666666666666666, "grad_norm": 0.00023276391851811837, "importance_ratio": 1.0000344514846802, "learning_rate": 5e-06, "loss": 0.0023, "mismatch_kl": 8.078882819972932e-05, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 44, "timing/generation_ms": 245682.50108975917, "timing/scoring_ms": 0.0, "timing/total_ms": 245682.50108975917, "tokens/completion": 6098.1015625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 491.3542585372925 }, { "advantage/absmean": 0.12451171875, "entropy": 0.0033533975947648287, "epoch": 0.06, "grad_norm": 0.00016439514868896496, "importance_ratio": 1.00002920627594, "learning_rate": 5e-06, "loss": 0.0, "mismatch_kl": 7.133631879696622e-05, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 45, "timing/generation_ms": 261392.2018893063, "timing/scoring_ms": 0.0, "timing/total_ms": 261392.2018893063, "tokens/completion": 6144.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 638.6866817474365 }, { "advantage/absmean": 0.12451171875, "entropy": 0.009633159264922142, "epoch": 0.06133333333333333, "grad_norm": 0.0005837700251924664, "importance_ratio": 1.000110149383545, "learning_rate": 5e-06, "loss": -0.0005, "mismatch_kl": 0.00014644436305388808, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 46, "timing/generation_ms": 259352.97147464007, "timing/scoring_ms": 0.0, "timing/total_ms": 259352.97147464007, "tokens/completion": 6100.9375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 726.6395015716553 }, { "advantage/absmean": 0.12451171875, "entropy": 0.008214793168008327, "epoch": 0.06266666666666666, "grad_norm": 0.0003491652028248511, "importance_ratio": 1.0000574588775635, "learning_rate": 5e-06, "loss": -0.0005, "mismatch_kl": 0.00012681909720413387, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 47, "timing/generation_ms": 251731.6782604903, "timing/scoring_ms": 0.0, "timing/total_ms": 251731.6782604903, "tokens/completion": 6120.80078125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 504.8533480167389 }, { "advantage/absmean": 0.12451171875, "entropy": 0.012482496909797192, "epoch": 0.064, "grad_norm": 0.0008089181923655795, "importance_ratio": 1.0000419616699219, "learning_rate": 5e-06, "loss": 0.003, "mismatch_kl": 0.00024501114967279136, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 48, "timing/generation_ms": 260055.6455301121, "timing/scoring_ms": 0.0, "timing/total_ms": 260055.6455301121, "tokens/completion": 6038.9921875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 520.350103855133 }, { "advantage/absmean": 0.12451171875, "entropy": 0.008223201148211956, "epoch": 0.06533333333333333, "grad_norm": 0.0005775216775221585, "importance_ratio": 1.0000702142715454, "learning_rate": 5e-06, "loss": -0.0, "mismatch_kl": 0.0001139239757321775, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 49, "timing/generation_ms": 262634.82890836895, "timing/scoring_ms": 0.0, "timing/total_ms": 262634.82890836895, "tokens/completion": 6144.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 694.4226834774017 }, { "advantage/absmean": 0.12451171875, "entropy": 0.006501559168100357, "epoch": 0.06666666666666667, "grad_norm": 0.0004452247469025534, "importance_ratio": 1.000080943107605, "learning_rate": 5e-06, "loss": 0.0, "mismatch_kl": 0.00019989976135548204, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 50, "timing/generation_ms": 252373.39910119772, "timing/scoring_ms": 0.0, "timing/total_ms": 252373.39910119772, "tokens/completion": 6144.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 633.9480290412903 }, { "advantage/absmean": 0.12451171875, "entropy": 0.011557838879525661, "epoch": 0.068, "grad_norm": 0.00040538021426552616, "importance_ratio": 1.0000510215759277, "learning_rate": 5e-06, "loss": 0.0163, "mismatch_kl": 0.00014912446204107255, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 51, "timing/generation_ms": 231235.03853101283, "timing/scoring_ms": 0.0, "timing/total_ms": 231235.03853101283, "tokens/completion": 5880.91015625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 553.8161387443542 }, { "advantage/absmean": 0.12451171875, "entropy": 0.008280275389552116, "epoch": 0.06933333333333333, "grad_norm": 0.0006837160840031847, "importance_ratio": 1.0000361204147339, "learning_rate": 5e-06, "loss": -0.0009, "mismatch_kl": 0.00011032609472749755, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 52, "timing/generation_ms": 268335.500174202, "timing/scoring_ms": 0.0, "timing/total_ms": 268335.500174202, "tokens/completion": 6076.33984375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 532.5728721618652 }, { "advantage/absmean": 0.12451171875, "entropy": 0.008777043782174587, "epoch": 0.07066666666666667, "grad_norm": 0.00047547446087476704, "importance_ratio": 1.0000946521759033, "learning_rate": 5e-06, "loss": -0.0, "mismatch_kl": 0.0001269574131583795, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 53, "timing/generation_ms": 256683.97525977343, "timing/scoring_ms": 0.0, "timing/total_ms": 256683.97525977343, "tokens/completion": 6144.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 506.92905497550964 }, { "advantage/absmean": 0.12451171875, "entropy": 0.011497734114527702, "epoch": 0.072, "grad_norm": 0.00027828097052508087, "importance_ratio": 1.000109076499939, "learning_rate": 5e-06, "loss": 0.0042, "mismatch_kl": 0.00013832931290380657, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 54, "timing/generation_ms": 245946.20873313397, "timing/scoring_ms": 0.0, "timing/total_ms": 245946.20873313397, "tokens/completion": 6032.51953125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 505.11912751197815 }, { "advantage/absmean": 0.12451171875, "entropy": 0.010809739120304585, "epoch": 0.07333333333333333, "grad_norm": 0.0007032954488382401, "importance_ratio": 1.0000889301300049, "learning_rate": 5e-06, "loss": 0.0, "mismatch_kl": 0.00015762390103191137, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 55, "timing/generation_ms": 264072.7631729096, "timing/scoring_ms": 0.0, "timing/total_ms": 264072.7631729096, "tokens/completion": 6144.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 523.6702523231506 }, { "advantage/absmean": 0.12451171875, "entropy": 0.009559578262269497, "epoch": 0.07466666666666667, "grad_norm": 0.0010708393934808242, "importance_ratio": 1.0000908374786377, "learning_rate": 5e-06, "loss": 0.0051, "mismatch_kl": 0.00013747472257819027, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 56, "timing/generation_ms": 250449.08253196627, "timing/scoring_ms": 0.0, "timing/total_ms": 250449.08253196627, "tokens/completion": 6098.72265625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 556.8832399845123 }, { "advantage/absmean": 0.12451171875, "entropy": 0.011213499121367931, "epoch": 0.076, "grad_norm": 0.00044938202555849837, "importance_ratio": 1.0000908374786377, "learning_rate": 5e-06, "loss": 0.0, "mismatch_kl": 0.00015059650468174368, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 57, "timing/generation_ms": 263455.5452140048, "timing/scoring_ms": 0.0, "timing/total_ms": 263455.5452140048, "tokens/completion": 6144.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 632.40900182724 }, { "advantage/absmean": 0.12451171875, "entropy": 0.005081878509372473, "epoch": 0.07733333333333334, "grad_norm": 0.0003246328757380694, "importance_ratio": 1.0000656843185425, "learning_rate": 5e-06, "loss": 0.0, "mismatch_kl": 0.00019094608433078974, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 58, "timing/generation_ms": 256806.45045358688, "timing/scoring_ms": 0.0, "timing/total_ms": 256806.45045358688, "tokens/completion": 6144.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 503.00778365135193 }, { "advantage/absmean": 0.12451171875, "entropy": 0.01805613562464714, "epoch": 0.07866666666666666, "grad_norm": 0.0007634702119519025, "importance_ratio": 1.0001803636550903, "learning_rate": 5e-06, "loss": 0.0025, "mismatch_kl": 0.00021581076725851744, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 59, "timing/generation_ms": 254470.52423935384, "timing/scoring_ms": 0.0, "timing/total_ms": 254470.52423935384, "tokens/completion": 6079.921875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 499.350706577301 }, { "advantage/absmean": 0.12451171875, "entropy": 0.047813381999731064, "epoch": 0.08, "grad_norm": 0.0034811244478141165, "importance_ratio": 1.0005850791931152, "learning_rate": 5e-06, "loss": 0.0385, "mismatch_kl": 0.0006162600475363433, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 60, "timing/generation_ms": 122059.79803204536, "timing/scoring_ms": 0.0, "timing/total_ms": 122059.79803204536, "tokens/completion": 4056.4140625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 406.85777831077576 }, { "advantage/absmean": 0.12451171875, "entropy": 0.006517002824693918, "epoch": 0.08133333333333333, "grad_norm": 0.00045405486723584484, "importance_ratio": 1.0000643730163574, "learning_rate": 5e-06, "loss": 0.0, "mismatch_kl": 8.087344031082466e-05, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 61, "timing/generation_ms": 262080.00864181668, "timing/scoring_ms": 0.0, "timing/total_ms": 262080.00864181668, "tokens/completion": 6144.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 513.6219637393951 }, { "advantage/absmean": 0.12451171875, "entropy": 0.0059960088692605495, "epoch": 0.08266666666666667, "grad_norm": 0.0003004741817689029, "importance_ratio": 1.0000419616699219, "learning_rate": 5e-06, "loss": 0.0, "mismatch_kl": 7.99954796093516e-05, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 62, "timing/generation_ms": 261857.35533758998, "timing/scoring_ms": 0.0, "timing/total_ms": 261857.35533758998, "tokens/completion": 6144.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 537.6526563167572 }, { "advantage/absmean": 0.12451171875, "entropy": 0.012267248705029488, "epoch": 0.084, "grad_norm": 0.0018275298082432536, "importance_ratio": 1.0001516342163086, "learning_rate": 5e-06, "loss": 0.0273, "mismatch_kl": 0.00015860867279116064, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 63, "timing/generation_ms": 223553.63579373807, "timing/scoring_ms": 0.0, "timing/total_ms": 223553.63579373807, "tokens/completion": 5578.8046875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 449.565260887146 }, { "advantage/absmean": 0.12451171875, "entropy": 0.017613664269447327, "epoch": 0.08533333333333333, "grad_norm": 0.0013137454797814432, "importance_ratio": 1.0001808404922485, "learning_rate": 5e-06, "loss": 0.0296, "mismatch_kl": 0.00018238124903291464, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 64, "timing/generation_ms": 197715.4450826347, "timing/scoring_ms": 0.0, "timing/total_ms": 197715.4450826347, "tokens/completion": 5301.74609375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 467.5368883609772 }, { "advantage/absmean": 0.12451171875, "entropy": 0.012099393643438816, "epoch": 0.08666666666666667, "grad_norm": 0.00029163323031709923, "importance_ratio": 1.0000910758972168, "learning_rate": 5e-06, "loss": 0.0101, "mismatch_kl": 0.0001367869263049215, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 65, "timing/generation_ms": 253292.40265209228, "timing/scoring_ms": 0.0, "timing/total_ms": 253292.40265209228, "tokens/completion": 5987.40234375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 504.62310814857483 }, { "advantage/absmean": 0.12451171875, "entropy": 0.016472794115543365, "epoch": 0.088, "grad_norm": 0.000537146473230196, "importance_ratio": 1.0002104043960571, "learning_rate": 5e-06, "loss": 0.0046, "mismatch_kl": 0.00019632629118859768, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 66, "timing/generation_ms": 244626.61108747125, "timing/scoring_ms": 0.0, "timing/total_ms": 244626.61108747125, "tokens/completion": 5880.29296875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 546.9820070266724 }, { "advantage/absmean": 0.12451171875, "entropy": 0.03573580086231232, "epoch": 0.08933333333333333, "grad_norm": 0.0018214337047260279, "importance_ratio": 1.0006996393203735, "learning_rate": 5e-06, "loss": 0.0366, "mismatch_kl": 0.0005711132544092834, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 67, "timing/generation_ms": 171141.10032841563, "timing/scoring_ms": 0.0, "timing/total_ms": 171141.10032841563, "tokens/completion": 4912.99609375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 387.35487270355225 }, { "advantage/absmean": 0.12451171875, "entropy": 0.024245120584964752, "epoch": 0.09066666666666667, "grad_norm": 0.0007171125744050383, "importance_ratio": 1.0004810094833374, "learning_rate": 5e-06, "loss": 0.0327, "mismatch_kl": 0.0003458830469753593, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 68, "timing/generation_ms": 175763.37515283376, "timing/scoring_ms": 0.0, "timing/total_ms": 175763.37515283376, "tokens/completion": 5039.39453125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 399.21359062194824 }, { "advantage/absmean": 0.12451171875, "entropy": 0.025269493460655212, "epoch": 0.092, "grad_norm": 0.0004443143666122359, "importance_ratio": 1.000417947769165, "learning_rate": 5e-06, "loss": 0.0151, "mismatch_kl": 0.000321421044645831, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 69, "timing/generation_ms": 250666.16093274206, "timing/scoring_ms": 0.0, "timing/total_ms": 250666.16093274206, "tokens/completion": 5965.16796875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 632.227735042572 }, { "advantage/absmean": 0.12451171875, "entropy": 0.024551477283239365, "epoch": 0.09333333333333334, "grad_norm": 0.0015252781439401258, "importance_ratio": 1.0006314516067505, "learning_rate": 5e-06, "loss": 0.0348, "mismatch_kl": 0.0005003436817787588, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 70, "timing/generation_ms": 191529.1232522577, "timing/scoring_ms": 0.0, "timing/total_ms": 191529.1232522577, "tokens/completion": 5294.87890625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 407.7219111919403 }, { "advantage/absmean": 0.12451171875, "entropy": 0.02018953487277031, "epoch": 0.09466666666666666, "grad_norm": 0.0011570903491081794, "importance_ratio": 1.0002988576889038, "learning_rate": 5e-06, "loss": 0.0237, "mismatch_kl": 0.00033742599771358073, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 71, "timing/generation_ms": 210619.99121960253, "timing/scoring_ms": 0.0, "timing/total_ms": 210619.99121960253, "tokens/completion": 5332.65625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 494.4582040309906 }, { "advantage/absmean": 0.12451171875, "entropy": 0.08414055407047272, "epoch": 0.096, "grad_norm": 0.005691985408928669, "importance_ratio": 1.002629280090332, "learning_rate": 5e-06, "loss": 0.0631, "mismatch_kl": 0.0030276263132691383, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 72, "timing/generation_ms": 26491.081130690873, "timing/scoring_ms": 0.0, "timing/total_ms": 26491.081130690873, "tokens/completion": 1684.4921875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 175.0816376209259 }, { "advantage/absmean": 0.12451171875, "entropy": 0.08044799417257309, "epoch": 0.09733333333333333, "grad_norm": 0.0067108539111987095, "importance_ratio": 1.0022099018096924, "learning_rate": 5e-06, "loss": 0.0512, "mismatch_kl": 0.0033263727091252804, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 73, "timing/generation_ms": 26663.206906057894, "timing/scoring_ms": 0.0, "timing/total_ms": 26663.206906057894, "tokens/completion": 1624.47265625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 179.0183322429657 }, { "advantage/absmean": 0.12451171875, "entropy": 0.14499743282794952, "epoch": 0.09866666666666667, "grad_norm": 0.010377228969329702, "importance_ratio": 1.0045510530471802, "learning_rate": 5e-06, "loss": 0.0301, "mismatch_kl": 0.03058871254324913, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 74, "timing/generation_ms": 11363.965434022248, "timing/scoring_ms": 0.0, "timing/total_ms": 11363.965434022248, "tokens/completion": 733.40234375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 162.93997645378113 }, { "advantage/absmean": 0.12451171875, "entropy": 0.15485742688179016, "epoch": 0.1, "grad_norm": 0.037501291580980145, "importance_ratio": 1.0262236595153809, "learning_rate": 5e-06, "loss": 0.0478, "mismatch_kl": 0.5780022144317627, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 75, "timing/generation_ms": 31973.80775306374, "timing/scoring_ms": 0.0, "timing/total_ms": 31973.80775306374, "tokens/completion": 1854.69921875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 206.36020827293396 }, { "advantage/absmean": 0.12451171875, "entropy": 0.2712324857711792, "epoch": 0.10133333333333333, "grad_norm": 0.021496155900656944, "importance_ratio": 0.747008204460144, "learning_rate": 5e-06, "loss": -0.001, "mismatch_kl": 4.077150344848633, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 76, "timing/generation_ms": 19520.673436112702, "timing/scoring_ms": 0.0, "timing/total_ms": 19520.673436112702, "tokens/completion": 1019.1015625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 104.34236979484558 }, { "advantage/absmean": 0.12451171875, "entropy": 0.3253353238105774, "epoch": 0.10266666666666667, "grad_norm": 0.014127787785753907, "importance_ratio": 0.5209717154502869, "learning_rate": 5e-06, "loss": 0.0074, "mismatch_kl": 11.41779899597168, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 77, "timing/generation_ms": 33620.65821047872, "timing/scoring_ms": 0.0, "timing/total_ms": 33620.65821047872, "tokens/completion": 1925.72265625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 257.44123911857605 }, { "advantage/absmean": 0.12451171875, "entropy": 0.3993019163608551, "epoch": 0.104, "grad_norm": 0.009151033649610016, "importance_ratio": 0.29432952404022217, "learning_rate": 5e-06, "loss": 0.0157, "mismatch_kl": 11.372162818908691, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 78, "timing/generation_ms": 11082.484270446002, "timing/scoring_ms": 0.0, "timing/total_ms": 11082.484270446002, "tokens/completion": 828.0546875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 115.73264193534851 }, { "advantage/absmean": 0.12451171875, "entropy": 0.1168494001030922, "epoch": 0.10533333333333333, "grad_norm": 0.006117265962728229, "importance_ratio": 0.1935732513666153, "learning_rate": 5e-06, "loss": -0.0017, "mismatch_kl": 21.00209617614746, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 79, "timing/generation_ms": 36345.630533993244, "timing/scoring_ms": 0.0, "timing/total_ms": 36345.630533993244, "tokens/completion": 2084.80859375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 232.0772545337677 }, { "advantage/absmean": 0.12451171875, "entropy": 0.4559866189956665, "epoch": 0.10666666666666667, "grad_norm": 0.02899073922789841, "importance_ratio": 0.9647712111473083, "learning_rate": 5e-06, "loss": -0.0109, "mismatch_kl": 0.1562381535768509, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 80, "timing/generation_ms": 3708.529833704233, "timing/scoring_ms": 0.0, "timing/total_ms": 3708.529833704233, "tokens/completion": 172.21484375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 79.40927720069885 }, { "advantage/absmean": 0.12451171875, "entropy": 0.18381687998771667, "epoch": 0.108, "grad_norm": 0.03870938318729351, "importance_ratio": 0.9867123365402222, "learning_rate": 5e-06, "loss": 0.0003, "mismatch_kl": 0.09630821645259857, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 81, "timing/generation_ms": 7641.556458547711, "timing/scoring_ms": 0.0, "timing/total_ms": 7641.556458547711, "tokens/completion": 342.55078125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 28.48853635787964 }, { "advantage/absmean": 0.12451171875, "entropy": 0.28385868668556213, "epoch": 0.10933333333333334, "grad_norm": 0.024463462093216322, "importance_ratio": 0.9961410760879517, "learning_rate": 5e-06, "loss": -0.0027, "mismatch_kl": 0.046350929886102676, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 82, "timing/generation_ms": 14151.478135958314, "timing/scoring_ms": 0.0, "timing/total_ms": 14151.478135958314, "tokens/completion": 640.5703125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 51.07678151130676 }, { "advantage/absmean": 0.12451171875, "entropy": 0.25215646624565125, "epoch": 0.11066666666666666, "grad_norm": 0.025956305888591907, "importance_ratio": 0.9893953204154968, "learning_rate": 5e-06, "loss": 0.0024, "mismatch_kl": 0.06097816303372383, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 83, "timing/generation_ms": 9802.852495573461, "timing/scoring_ms": 0.0, "timing/total_ms": 9802.852495573461, "tokens/completion": 486.23828125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 135.5597288608551 }, { "advantage/absmean": 0.12451171875, "entropy": 0.18832416832447052, "epoch": 0.112, "grad_norm": 0.05162272724580775, "importance_ratio": 0.9795369505882263, "learning_rate": 5e-06, "loss": -0.0063, "mismatch_kl": 0.09001336991786957, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 84, "timing/generation_ms": 8744.545813649893, "timing/scoring_ms": 0.0, "timing/total_ms": 8744.545813649893, "tokens/completion": 422.9921875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 202.02377605438232 }, { "advantage/absmean": 0.12451171875, "entropy": 0.20757851004600525, "epoch": 0.11333333333333333, "grad_norm": 0.029849605436009424, "importance_ratio": 0.9847032427787781, "learning_rate": 5e-06, "loss": 0.0003, "mismatch_kl": 0.08596009016036987, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 85, "timing/generation_ms": 6916.043497622013, "timing/scoring_ms": 0.0, "timing/total_ms": 6916.043497622013, "tokens/completion": 315.65625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 26.646199941635132 }, { "advantage/absmean": 0.12451171875, "entropy": 0.19341044127941132, "epoch": 0.11466666666666667, "grad_norm": 0.023761811444065736, "importance_ratio": 0.9906992316246033, "learning_rate": 5e-06, "loss": -0.0037, "mismatch_kl": 0.04626338183879852, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 86, "timing/generation_ms": 10513.352582231164, "timing/scoring_ms": 0.0, "timing/total_ms": 10513.352582231164, "tokens/completion": 565.625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 43.092281341552734 }, { "advantage/absmean": 0.12451171875, "entropy": 0.2988993227481842, "epoch": 0.116, "grad_norm": 0.08410779443510906, "importance_ratio": 0.9899005889892578, "learning_rate": 5e-06, "loss": -0.0182, "mismatch_kl": 0.048949241638183594, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 87, "timing/generation_ms": 6337.426606565714, "timing/scoring_ms": 0.0, "timing/total_ms": 6337.426606565714, "tokens/completion": 288.53125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 29.87082028388977 }, { "advantage/absmean": 0.12451171875, "entropy": 0.24379415810108185, "epoch": 0.11733333333333333, "grad_norm": 0.033951546211805725, "importance_ratio": 0.9842061996459961, "learning_rate": 5e-06, "loss": -0.001, "mismatch_kl": 0.05609630420804024, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 88, "timing/generation_ms": 12948.228243738413, "timing/scoring_ms": 0.0, "timing/total_ms": 12948.228243738413, "tokens/completion": 572.8359375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 158.39017939567566 }, { "advantage/absmean": 0.12451171875, "entropy": 0.46492651104927063, "epoch": 0.11866666666666667, "grad_norm": 0.05385249484621595, "importance_ratio": 0.9755511283874512, "learning_rate": 5e-06, "loss": 0.0005, "mismatch_kl": 0.16615039110183716, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 89, "timing/generation_ms": 11146.457904949784, "timing/scoring_ms": 0.0, "timing/total_ms": 11146.457904949784, "tokens/completion": 531.22265625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 125.18756413459778 }, { "advantage/absmean": 0.12451171875, "entropy": 0.26703542470932007, "epoch": 0.12, "grad_norm": 0.02367206113805114, "importance_ratio": 0.9910291433334351, "learning_rate": 5e-06, "loss": -0.0072, "mismatch_kl": 0.041237972676754, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 90, "timing/generation_ms": 12296.578384935856, "timing/scoring_ms": 0.0, "timing/total_ms": 12296.578384935856, "tokens/completion": 619.4375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 123.89916157722473 }, { "advantage/absmean": 0.12451171875, "entropy": 0.5690855979919434, "epoch": 0.12133333333333333, "grad_norm": 0.030434949636985786, "importance_ratio": 0.9436249136924744, "learning_rate": 5e-06, "loss": 0.0044, "mismatch_kl": 0.4027661979198456, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 91, "timing/generation_ms": 17300.37511046976, "timing/scoring_ms": 0.0, "timing/total_ms": 17300.37511046976, "tokens/completion": 803.75, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 68.73723077774048 }, { "advantage/absmean": 0.12451171875, "entropy": 0.2795153260231018, "epoch": 0.12266666666666666, "grad_norm": 0.033606081779905164, "importance_ratio": 0.9910190105438232, "learning_rate": 5e-06, "loss": -0.0021, "mismatch_kl": 0.048360757529735565, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 92, "timing/generation_ms": 10146.174241788685, "timing/scoring_ms": 0.0, "timing/total_ms": 10146.174241788685, "tokens/completion": 409.20703125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 115.50342917442322 }, { "advantage/absmean": 0.12451171875, "entropy": 0.25254565477371216, "epoch": 0.124, "grad_norm": 0.02170917112603325, "importance_ratio": 0.9928799867630005, "learning_rate": 5e-06, "loss": 0.0035, "mismatch_kl": 0.03083646297454834, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 93, "timing/generation_ms": 14904.53880932182, "timing/scoring_ms": 0.0, "timing/total_ms": 14904.53880932182, "tokens/completion": 689.578125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 136.12913012504578 }, { "advantage/absmean": 0.12451171875, "entropy": 0.25149497389793396, "epoch": 0.12533333333333332, "grad_norm": 0.049807356598740776, "importance_ratio": 0.990451455116272, "learning_rate": 5e-06, "loss": -0.0058, "mismatch_kl": 0.03808113560080528, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 94, "timing/generation_ms": 8459.820285439491, "timing/scoring_ms": 0.0, "timing/total_ms": 8459.820285439491, "tokens/completion": 413.421875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 34.11598253250122 }, { "advantage/absmean": 0.12451171875, "entropy": 0.21991755068302155, "epoch": 0.12666666666666668, "grad_norm": 0.02577498970131942, "importance_ratio": 0.9890254139900208, "learning_rate": 5e-06, "loss": -0.0012, "mismatch_kl": 0.05755931884050369, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 95, "timing/generation_ms": 5316.206902265549, "timing/scoring_ms": 0.0, "timing/total_ms": 5316.206902265549, "tokens/completion": 254.72265625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 129.7372589111328 }, { "advantage/absmean": 0.12451171875, "entropy": 0.4472619593143463, "epoch": 0.128, "grad_norm": 0.040975406412791814, "importance_ratio": 0.9842396378517151, "learning_rate": 5e-06, "loss": -0.003, "mismatch_kl": 0.14270469546318054, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 96, "timing/generation_ms": 6448.528000153601, "timing/scoring_ms": 0.0, "timing/total_ms": 6448.528000153601, "tokens/completion": 303.2421875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 152.90578722953796 }, { "advantage/absmean": 0.12451171875, "entropy": 0.24905133247375488, "epoch": 0.12933333333333333, "grad_norm": 0.0336787422018486, "importance_ratio": 0.9942489862442017, "learning_rate": 5e-06, "loss": -0.0073, "mismatch_kl": 0.03845536336302757, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 97, "timing/generation_ms": 10672.863409854472, "timing/scoring_ms": 0.0, "timing/total_ms": 10672.863409854472, "tokens/completion": 522.453125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 58.958009481430054 }, { "advantage/absmean": 0.12451171875, "entropy": 0.37947529554367065, "epoch": 0.13066666666666665, "grad_norm": 0.03256153448253783, "importance_ratio": 0.9943234324455261, "learning_rate": 5e-06, "loss": -0.0033, "mismatch_kl": 0.0457632839679718, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 98, "timing/generation_ms": 7797.16813378036, "timing/scoring_ms": 0.0, "timing/total_ms": 7797.16813378036, "tokens/completion": 321.6484375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 57.01115918159485 }, { "advantage/absmean": 0.12451171875, "entropy": 0.3295568525791168, "epoch": 0.132, "grad_norm": 0.025070691541196687, "importance_ratio": 0.9886187314987183, "learning_rate": 5e-06, "loss": 0.002, "mismatch_kl": 0.055542413145303726, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 99, "timing/generation_ms": 12520.016725175083, "timing/scoring_ms": 0.0, "timing/total_ms": 12520.016725175083, "tokens/completion": 560.515625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 134.89474534988403 }, { "advantage/absmean": 0.12451171875, "entropy": 0.3819415867328644, "epoch": 0.13333333333333333, "grad_norm": 0.029430906337480585, "importance_ratio": 0.9973994493484497, "learning_rate": 5e-06, "loss": 0.0014, "mismatch_kl": 0.03809521347284317, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 100, "timing/generation_ms": 7522.873256355524, "timing/scoring_ms": 0.0, "timing/total_ms": 7522.873256355524, "tokens/completion": 381.24609375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 42.47270226478577 }, { "advantage/absmean": 0.12451171875, "entropy": 0.3258141577243805, "epoch": 0.13466666666666666, "grad_norm": 0.06302493851707891, "importance_ratio": 0.995746374130249, "learning_rate": 5e-06, "loss": -0.0032, "mismatch_kl": 0.05126583203673363, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 101, "timing/generation_ms": 6897.25607726723, "timing/scoring_ms": 0.0, "timing/total_ms": 6897.25607726723, "tokens/completion": 331.53515625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 101.3789484500885 }, { "advantage/absmean": 0.12451171875, "entropy": 0.8970124125480652, "epoch": 0.136, "grad_norm": 0.03515811902568956, "importance_ratio": 0.8364270925521851, "learning_rate": 5e-06, "loss": 0.0067, "mismatch_kl": 1.5947057008743286, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 102, "timing/generation_ms": 12960.892278701067, "timing/scoring_ms": 0.0, "timing/total_ms": 12960.892278701067, "tokens/completion": 679.25390625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 92.91760039329529 }, { "advantage/absmean": 0.12451171875, "entropy": 0.2645859122276306, "epoch": 0.13733333333333334, "grad_norm": 0.03015986556668391, "importance_ratio": 0.9922869205474854, "learning_rate": 5e-06, "loss": -0.0033, "mismatch_kl": 0.032752275466918945, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 103, "timing/generation_ms": 12081.96578361094, "timing/scoring_ms": 0.0, "timing/total_ms": 12081.96578361094, "tokens/completion": 635.26171875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 221.86856937408447 }, { "advantage/absmean": 0.12451171875, "entropy": 0.40493857860565186, "epoch": 0.13866666666666666, "grad_norm": 0.029340951142688608, "importance_ratio": 0.9976834058761597, "learning_rate": 5e-06, "loss": -0.0075, "mismatch_kl": 0.039802681654691696, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 104, "timing/generation_ms": 8452.124254778028, "timing/scoring_ms": 0.0, "timing/total_ms": 8452.124254778028, "tokens/completion": 392.85546875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 78.09920930862427 }, { "advantage/absmean": 0.12451171875, "entropy": 0.42381417751312256, "epoch": 0.14, "grad_norm": 0.03251134797029109, "importance_ratio": 0.9939345121383667, "learning_rate": 5e-06, "loss": -0.0025, "mismatch_kl": 0.045791786164045334, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 105, "timing/generation_ms": 11178.499449044466, "timing/scoring_ms": 0.0, "timing/total_ms": 11178.499449044466, "tokens/completion": 480.08984375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 36.62562108039856 }, { "advantage/absmean": 0.12451171875, "entropy": 0.2739037871360779, "epoch": 0.14133333333333334, "grad_norm": 0.0476499263024248, "importance_ratio": 0.9929625988006592, "learning_rate": 5e-06, "loss": -0.0024, "mismatch_kl": 0.036298882216215134, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 106, "timing/generation_ms": 10698.151003569365, "timing/scoring_ms": 0.0, "timing/total_ms": 10698.151003569365, "tokens/completion": 521.33203125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 130.2317771911621 }, { "advantage/absmean": 0.12451171875, "entropy": 0.2831694781780243, "epoch": 0.14266666666666666, "grad_norm": 0.048559683162439526, "importance_ratio": 0.9895249605178833, "learning_rate": 5e-06, "loss": -0.0018, "mismatch_kl": 0.04853809252381325, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 107, "timing/generation_ms": 10670.390761457384, "timing/scoring_ms": 0.0, "timing/total_ms": 10670.390761457384, "tokens/completion": 504.16015625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 154.62130737304688 }, { "advantage/absmean": 0.12451171875, "entropy": 0.4273696541786194, "epoch": 0.144, "grad_norm": 0.04246003800252577, "importance_ratio": 0.9897579550743103, "learning_rate": 5e-06, "loss": -0.0004, "mismatch_kl": 0.05487997457385063, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 108, "timing/generation_ms": 5255.264617502689, "timing/scoring_ms": 0.0, "timing/total_ms": 5255.264617502689, "tokens/completion": 253.4296875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 62.357131004333496 }, { "advantage/absmean": 0.12451171875, "entropy": 0.3309624195098877, "epoch": 0.14533333333333334, "grad_norm": 0.020612894864024223, "importance_ratio": 0.994171679019928, "learning_rate": 5e-06, "loss": 0.004, "mismatch_kl": 0.028750188648700714, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 109, "timing/generation_ms": 17462.82579470426, "timing/scoring_ms": 0.0, "timing/total_ms": 17462.82579470426, "tokens/completion": 909.28515625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 84.52479147911072 }, { "advantage/absmean": 0.12451171875, "entropy": 0.45720767974853516, "epoch": 0.14666666666666667, "grad_norm": 0.048825755999723545, "importance_ratio": 0.9917762279510498, "learning_rate": 5e-06, "loss": -0.003, "mismatch_kl": 0.03884867951273918, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 110, "timing/generation_ms": 10527.64255553484, "timing/scoring_ms": 0.0, "timing/total_ms": 10527.64255553484, "tokens/completion": 457.21875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 118.98395490646362 }, { "advantage/absmean": 0.12451171875, "entropy": 0.3448692262172699, "epoch": 0.148, "grad_norm": 0.02391536511668303, "importance_ratio": 0.9938703775405884, "learning_rate": 5e-06, "loss": -0.0118, "mismatch_kl": 0.03092486597597599, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 111, "timing/generation_ms": 11426.006315276027, "timing/scoring_ms": 0.0, "timing/total_ms": 11426.006315276027, "tokens/completion": 603.828125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 44.38506889343262 }, { "advantage/absmean": 0.12451171875, "entropy": 0.4210182726383209, "epoch": 0.14933333333333335, "grad_norm": 0.017744426750614804, "importance_ratio": 0.9841266870498657, "learning_rate": 5e-06, "loss": 0.0031, "mismatch_kl": 0.15376684069633484, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 112, "timing/generation_ms": 15345.524672418833, "timing/scoring_ms": 0.0, "timing/total_ms": 15345.524672418833, "tokens/completion": 679.61328125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 77.3697247505188 }, { "advantage/absmean": 0.12451171875, "entropy": 0.3890233039855957, "epoch": 0.15066666666666667, "grad_norm": 0.042319164028374844, "importance_ratio": 0.9905653595924377, "learning_rate": 5e-06, "loss": -0.0067, "mismatch_kl": 0.03776917979121208, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 113, "timing/generation_ms": 8361.73670180142, "timing/scoring_ms": 0.0, "timing/total_ms": 8361.73670180142, "tokens/completion": 386.69921875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 33.98000693321228 }, { "advantage/absmean": 0.12451171875, "entropy": 0.3506433963775635, "epoch": 0.152, "grad_norm": 0.022347419652582003, "importance_ratio": 0.9932938814163208, "learning_rate": 5e-06, "loss": -0.0024, "mismatch_kl": 0.03900053724646568, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 114, "timing/generation_ms": 10107.008518651128, "timing/scoring_ms": 0.0, "timing/total_ms": 10107.008518651128, "tokens/completion": 531.8671875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 156.0705955028534 }, { "advantage/absmean": 0.12451171875, "entropy": 0.28853052854537964, "epoch": 0.15333333333333332, "grad_norm": 0.02467560875646059, "importance_ratio": 0.9956313967704773, "learning_rate": 5e-06, "loss": -0.0077, "mismatch_kl": 0.021128181368112564, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 115, "timing/generation_ms": 13438.352120108902, "timing/scoring_ms": 0.0, "timing/total_ms": 13438.352120108902, "tokens/completion": 638.3359375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 51.55745196342468 }, { "advantage/absmean": 0.12451171875, "entropy": 0.5352842807769775, "epoch": 0.15466666666666667, "grad_norm": 0.0500581678773726, "importance_ratio": 0.9921436905860901, "learning_rate": 5e-06, "loss": -0.0035, "mismatch_kl": 0.0745246633887291, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 116, "timing/generation_ms": 6379.514851607382, "timing/scoring_ms": 0.0, "timing/total_ms": 6379.514851607382, "tokens/completion": 304.5625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 38.366251945495605 }, { "advantage/absmean": 0.12451171875, "entropy": 0.36106666922569275, "epoch": 0.156, "grad_norm": 0.063234851546128, "importance_ratio": 0.9977811574935913, "learning_rate": 5e-06, "loss": -0.0007, "mismatch_kl": 0.029981082305312157, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 117, "timing/generation_ms": 7269.031744450331, "timing/scoring_ms": 0.0, "timing/total_ms": 7269.031744450331, "tokens/completion": 359.06640625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 27.440030097961426 }, { "advantage/absmean": 0.12451171875, "entropy": 0.3367100656032562, "epoch": 0.15733333333333333, "grad_norm": 0.059808565066134974, "importance_ratio": 0.988777220249176, "learning_rate": 5e-06, "loss": -0.0044, "mismatch_kl": 0.044747766107320786, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 118, "timing/generation_ms": 9969.640973955393, "timing/scoring_ms": 0.0, "timing/total_ms": 9969.640973955393, "tokens/completion": 485.625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 82.32884860038757 }, { "advantage/absmean": 0.12451171875, "entropy": 0.35986092686653137, "epoch": 0.15866666666666668, "grad_norm": 0.020285418443392603, "importance_ratio": 0.9924752116203308, "learning_rate": 5e-06, "loss": 0.0042, "mismatch_kl": 0.031399309635162354, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 119, "timing/generation_ms": 15499.55965206027, "timing/scoring_ms": 0.0, "timing/total_ms": 15499.55965206027, "tokens/completion": 796.76171875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 56.515456199645996 }, { "advantage/absmean": 0.12451171875, "entropy": 0.40867432951927185, "epoch": 0.16, "grad_norm": 0.018892048843934344, "importance_ratio": 0.9954840540885925, "learning_rate": 5e-06, "loss": -0.0094, "mismatch_kl": 0.030410781502723694, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 120, "timing/generation_ms": 13046.93166166544, "timing/scoring_ms": 0.0, "timing/total_ms": 13046.93166166544, "tokens/completion": 672.06640625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 51.22301483154297 }, { "advantage/absmean": 0.12451171875, "entropy": 0.25322413444519043, "epoch": 0.16133333333333333, "grad_norm": 0.019402594506856746, "importance_ratio": 0.9968504309654236, "learning_rate": 5e-06, "loss": -0.0018, "mismatch_kl": 0.020855166018009186, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 121, "timing/generation_ms": 33212.274321354926, "timing/scoring_ms": 0.0, "timing/total_ms": 33212.274321354926, "tokens/completion": 1494.39453125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 196.6885223388672 }, { "advantage/absmean": 0.12451171875, "entropy": 0.3203243613243103, "epoch": 0.16266666666666665, "grad_norm": 0.016032102577421704, "importance_ratio": 0.9980469942092896, "learning_rate": 5e-06, "loss": -0.0013, "mismatch_kl": 0.01909617707133293, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 122, "timing/generation_ms": 21461.640139110386, "timing/scoring_ms": 0.0, "timing/total_ms": 21461.640139110386, "tokens/completion": 1059.1953125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 84.59676575660706 }, { "advantage/absmean": 0.12451171875, "entropy": 0.5010811686515808, "epoch": 0.164, "grad_norm": 0.02848759848639813, "importance_ratio": 1.0016131401062012, "learning_rate": 5e-06, "loss": -0.0097, "mismatch_kl": 0.02760869450867176, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 123, "timing/generation_ms": 9319.45723388344, "timing/scoring_ms": 0.0, "timing/total_ms": 9319.45723388344, "tokens/completion": 433.1015625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 36.64540505409241 }, { "advantage/absmean": 0.12451171875, "entropy": 0.4700590968132019, "epoch": 0.16533333333333333, "grad_norm": 0.025031920446653932, "importance_ratio": 0.9973174929618835, "learning_rate": 5e-06, "loss": -0.0072, "mismatch_kl": 0.03977029770612717, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 124, "timing/generation_ms": 9967.066356912255, "timing/scoring_ms": 0.0, "timing/total_ms": 9967.066356912255, "tokens/completion": 478.1328125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 137.7500193119049 }, { "advantage/absmean": 0.12451171875, "entropy": 0.4410494863986969, "epoch": 0.16666666666666666, "grad_norm": 0.02102977498791798, "importance_ratio": 0.9927030801773071, "learning_rate": 5e-06, "loss": -0.0044, "mismatch_kl": 0.05027690902352333, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 125, "timing/generation_ms": 13226.7307927832, "timing/scoring_ms": 0.0, "timing/total_ms": 13226.7307927832, "tokens/completion": 666.65234375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 119.67769002914429 }, { "advantage/absmean": 0.12451171875, "entropy": 0.2941017150878906, "epoch": 0.168, "grad_norm": 0.01764622195762912, "importance_ratio": 0.9970736503601074, "learning_rate": 5e-06, "loss": -0.0039, "mismatch_kl": 0.025975050404667854, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 126, "timing/generation_ms": 30093.59989501536, "timing/scoring_ms": 0.0, "timing/total_ms": 30093.59989501536, "tokens/completion": 1403.23046875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 111.32783484458923 }, { "advantage/absmean": 0.12451171875, "entropy": 0.37822288274765015, "epoch": 0.16933333333333334, "grad_norm": 0.03205413439415866, "importance_ratio": 0.9921689629554749, "learning_rate": 5e-06, "loss": -0.0015, "mismatch_kl": 0.10021175444126129, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 127, "timing/generation_ms": 25918.55046711862, "timing/scoring_ms": 0.0, "timing/total_ms": 25918.55046711862, "tokens/completion": 1132.37890625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 101.07530164718628 }, { "advantage/absmean": 0.12451171875, "entropy": 0.46506795287132263, "epoch": 0.17066666666666666, "grad_norm": 0.026459518059964743, "importance_ratio": 0.995638906955719, "learning_rate": 5e-06, "loss": -0.0065, "mismatch_kl": 0.03533043712377548, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 128, "timing/generation_ms": 8870.356048457325, "timing/scoring_ms": 0.0, "timing/total_ms": 8870.356048457325, "tokens/completion": 477.8046875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 41.62081718444824 }, { "advantage/absmean": 0.12451171875, "entropy": 0.4616319537162781, "epoch": 0.172, "grad_norm": 0.029689428333274717, "importance_ratio": 0.992743194103241, "learning_rate": 5e-06, "loss": -0.0116, "mismatch_kl": 0.043640002608299255, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 129, "timing/generation_ms": 17582.845278084278, "timing/scoring_ms": 0.0, "timing/total_ms": 17582.845278084278, "tokens/completion": 896.60546875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 103.23663401603699 }, { "advantage/absmean": 0.12451171875, "entropy": 0.318230539560318, "epoch": 0.17333333333333334, "grad_norm": 0.021848886677287266, "importance_ratio": 1.0002652406692505, "learning_rate": 5e-06, "loss": -0.0028, "mismatch_kl": 0.032250385731458664, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 130, "timing/generation_ms": 12501.79857108742, "timing/scoring_ms": 0.0, "timing/total_ms": 12501.79857108742, "tokens/completion": 636.82421875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 46.11354732513428 }, { "advantage/absmean": 0.12451171875, "entropy": 0.2867668569087982, "epoch": 0.17466666666666666, "grad_norm": 0.0152850963716213, "importance_ratio": 0.9975439310073853, "learning_rate": 5e-06, "loss": 0.0004, "mismatch_kl": 0.03095307946205139, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 131, "timing/generation_ms": 21872.447106055915, "timing/scoring_ms": 0.0, "timing/total_ms": 21872.447106055915, "tokens/completion": 1016.09765625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 75.5360016822815 }, { "advantage/absmean": 0.12451171875, "entropy": 0.6940531134605408, "epoch": 0.176, "grad_norm": 0.027620607135447624, "importance_ratio": 0.9872549176216125, "learning_rate": 5e-06, "loss": 0.0013, "mismatch_kl": 0.14033383131027222, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 132, "timing/generation_ms": 11405.475388281047, "timing/scoring_ms": 0.0, "timing/total_ms": 11405.475388281047, "tokens/completion": 487.51953125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 55.63127040863037 }, { "advantage/absmean": 0.12451171875, "entropy": 0.36297503113746643, "epoch": 0.17733333333333334, "grad_norm": 0.029171908888413572, "importance_ratio": 0.9953750967979431, "learning_rate": 5e-06, "loss": -0.0051, "mismatch_kl": 0.035398464649915695, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 133, "timing/generation_ms": 17919.221241027117, "timing/scoring_ms": 0.0, "timing/total_ms": 17919.221241027117, "tokens/completion": 900.453125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 170.36363244056702 }, { "advantage/absmean": 0.12451171875, "entropy": 0.2939022481441498, "epoch": 0.17866666666666667, "grad_norm": 0.02565678896444847, "importance_ratio": 0.99770587682724, "learning_rate": 5e-06, "loss": -0.0013, "mismatch_kl": 0.019702836871147156, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 134, "timing/generation_ms": 26027.854280546308, "timing/scoring_ms": 0.0, "timing/total_ms": 26027.854280546308, "tokens/completion": 1189.94921875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 208.00876903533936 }, { "advantage/absmean": 0.12451171875, "entropy": 0.27582186460494995, "epoch": 0.18, "grad_norm": 0.025995432419046362, "importance_ratio": 0.9993173480033875, "learning_rate": 5e-06, "loss": 0.0001, "mismatch_kl": 0.023949675261974335, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 135, "timing/generation_ms": 19027.399071492255, "timing/scoring_ms": 0.0, "timing/total_ms": 19027.399071492255, "tokens/completion": 910.98828125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 69.73441195487976 }, { "advantage/absmean": 0.12451171875, "entropy": 0.31243762373924255, "epoch": 0.18133333333333335, "grad_norm": 0.021978924242567442, "importance_ratio": 0.9992286562919617, "learning_rate": 5e-06, "loss": -0.0016, "mismatch_kl": 0.024040305987000465, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 136, "timing/generation_ms": 14330.211003310978, "timing/scoring_ms": 0.0, "timing/total_ms": 14330.211003310978, "tokens/completion": 671.7265625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 53.44596743583679 }, { "advantage/absmean": 0.12451171875, "entropy": 0.24772067368030548, "epoch": 0.18266666666666667, "grad_norm": 0.022707662268209423, "importance_ratio": 0.9990280866622925, "learning_rate": 5e-06, "loss": -0.0023, "mismatch_kl": 0.022532925009727478, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 137, "timing/generation_ms": 35249.2256751284, "timing/scoring_ms": 0.0, "timing/total_ms": 35249.2256751284, "tokens/completion": 1598.390625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 216.32258987426758 }, { "advantage/absmean": 0.12451171875, "entropy": 0.38041970133781433, "epoch": 0.184, "grad_norm": 0.046110003811864524, "importance_ratio": 0.9846709370613098, "learning_rate": 5e-06, "loss": -0.0024, "mismatch_kl": 0.1807573288679123, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 138, "timing/generation_ms": 10808.89296438545, "timing/scoring_ms": 0.0, "timing/total_ms": 10808.89296438545, "tokens/completion": 505.0625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 72.23299145698547 }, { "advantage/absmean": 0.12451171875, "entropy": 0.3362736403942108, "epoch": 0.18533333333333332, "grad_norm": 0.057037876570506886, "importance_ratio": 0.9923868179321289, "learning_rate": 5e-06, "loss": 0.0033, "mismatch_kl": 0.0626266598701477, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 139, "timing/generation_ms": 8591.852098703384, "timing/scoring_ms": 0.0, "timing/total_ms": 8591.852098703384, "tokens/completion": 445.6875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 83.33036708831787 }, { "advantage/absmean": 0.12451171875, "entropy": 0.2598806321620941, "epoch": 0.18666666666666668, "grad_norm": 0.021433898880701664, "importance_ratio": 0.9913464784622192, "learning_rate": 5e-06, "loss": 0.0022, "mismatch_kl": 0.04193839803338051, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 140, "timing/generation_ms": 22836.472398601472, "timing/scoring_ms": 0.0, "timing/total_ms": 22836.472398601472, "tokens/completion": 1069.79296875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 123.7300488948822 }, { "advantage/absmean": 0.12451171875, "entropy": 0.27669745683670044, "epoch": 0.188, "grad_norm": 0.040287051430344514, "importance_ratio": 0.9890030026435852, "learning_rate": 5e-06, "loss": 0.0006, "mismatch_kl": 0.03683684393763542, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 141, "timing/generation_ms": 22967.52266585827, "timing/scoring_ms": 0.0, "timing/total_ms": 22967.52266585827, "tokens/completion": 1105.08203125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 119.94411706924438 }, { "advantage/absmean": 0.12451171875, "entropy": 0.32473960518836975, "epoch": 0.18933333333333333, "grad_norm": 0.03235428789871377, "importance_ratio": 0.9974983334541321, "learning_rate": 5e-06, "loss": 0.0005, "mismatch_kl": 0.021878903731703758, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 142, "timing/generation_ms": 20083.584303036332, "timing/scoring_ms": 0.0, "timing/total_ms": 20083.584303036332, "tokens/completion": 1026.375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 182.45814514160156 }, { "advantage/absmean": 0.12451171875, "entropy": 0.32302016019821167, "epoch": 0.19066666666666668, "grad_norm": 0.02364068100843913, "importance_ratio": 1.000141978263855, "learning_rate": 5e-06, "loss": 0.0026, "mismatch_kl": 0.027520477771759033, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 143, "timing/generation_ms": 13226.199164055288, "timing/scoring_ms": 0.0, "timing/total_ms": 13226.199164055288, "tokens/completion": 630.8828125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 68.72126913070679 }, { "advantage/absmean": 0.12451171875, "entropy": 0.4063912034034729, "epoch": 0.192, "grad_norm": 0.016855205380348858, "importance_ratio": 0.9972877502441406, "learning_rate": 5e-06, "loss": -0.0044, "mismatch_kl": 0.02402544766664505, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 144, "timing/generation_ms": 18624.562999233603, "timing/scoring_ms": 0.0, "timing/total_ms": 18624.562999233603, "tokens/completion": 916.34765625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 63.37579298019409 }, { "advantage/absmean": 0.12451171875, "entropy": 0.3544447124004364, "epoch": 0.19333333333333333, "grad_norm": 0.03420133721717633, "importance_ratio": 0.9964665174484253, "learning_rate": 5e-06, "loss": -0.0075, "mismatch_kl": 0.020806703716516495, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 145, "timing/generation_ms": 18501.724537461996, "timing/scoring_ms": 0.0, "timing/total_ms": 18501.724537461996, "tokens/completion": 914.03515625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 63.586211919784546 }, { "advantage/absmean": 0.12451171875, "entropy": 0.31913280487060547, "epoch": 0.19466666666666665, "grad_norm": 0.025814291552238237, "importance_ratio": 0.9976394176483154, "learning_rate": 5e-06, "loss": -0.0017, "mismatch_kl": 0.02318250946700573, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 146, "timing/generation_ms": 17320.88227570057, "timing/scoring_ms": 0.0, "timing/total_ms": 17320.88227570057, "tokens/completion": 802.69921875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 161.1075360774994 }, { "advantage/absmean": 0.12451171875, "entropy": 0.3060760796070099, "epoch": 0.196, "grad_norm": 0.024041285955131858, "importance_ratio": 0.9983845353126526, "learning_rate": 5e-06, "loss": -0.0044, "mismatch_kl": 0.021491888910531998, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 147, "timing/generation_ms": 20764.05915338546, "timing/scoring_ms": 0.0, "timing/total_ms": 20764.05915338546, "tokens/completion": 1029.03125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 80.10747575759888 }, { "advantage/absmean": 0.12451171875, "entropy": 0.4907422661781311, "epoch": 0.19733333333333333, "grad_norm": 0.01969056173140591, "importance_ratio": 0.9921115040779114, "learning_rate": 5e-06, "loss": 0.0019, "mismatch_kl": 0.09054939448833466, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 148, "timing/generation_ms": 14571.548252366483, "timing/scoring_ms": 0.0, "timing/total_ms": 14571.548252366483, "tokens/completion": 646.578125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 94.1196072101593 }, { "advantage/absmean": 0.12451171875, "entropy": 0.2544015944004059, "epoch": 0.19866666666666666, "grad_norm": 0.020070961466503938, "importance_ratio": 0.998515784740448, "learning_rate": 5e-06, "loss": -0.0002, "mismatch_kl": 0.019744453951716423, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 149, "timing/generation_ms": 23987.087721936405, "timing/scoring_ms": 0.0, "timing/total_ms": 23987.087721936405, "tokens/completion": 1105.234375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 87.52198696136475 }, { "advantage/absmean": 0.12451171875, "entropy": 0.28248143196105957, "epoch": 0.2, "grad_norm": 0.0191634545508177, "importance_ratio": 0.9957163333892822, "learning_rate": 5e-06, "loss": -0.004, "mismatch_kl": 0.018821164965629578, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 150, "timing/generation_ms": 20559.32307895273, "timing/scoring_ms": 0.0, "timing/total_ms": 20559.32307895273, "tokens/completion": 1016.2265625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 131.24922895431519 }, { "advantage/absmean": 0.12451171875, "entropy": 0.4225759208202362, "epoch": 0.20133333333333334, "grad_norm": 0.01854881603951969, "importance_ratio": 0.9962813854217529, "learning_rate": 5e-06, "loss": -0.0017, "mismatch_kl": 0.025664212182164192, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 151, "timing/generation_ms": 22859.651166945696, "timing/scoring_ms": 0.0, "timing/total_ms": 22859.651166945696, "tokens/completion": 1112.96484375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 170.4989137649536 }, { "advantage/absmean": 0.12451171875, "entropy": 0.2856869399547577, "epoch": 0.20266666666666666, "grad_norm": 0.018394448039889547, "importance_ratio": 0.9985631704330444, "learning_rate": 5e-06, "loss": -0.0018, "mismatch_kl": 0.024066420271992683, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 152, "timing/generation_ms": 37744.059775955975, "timing/scoring_ms": 0.0, "timing/total_ms": 37744.059775955975, "tokens/completion": 1768.79296875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 248.44115471839905 }, { "advantage/absmean": 0.12451171875, "entropy": 0.2514509856700897, "epoch": 0.204, "grad_norm": 0.023912470711877663, "importance_ratio": 0.9981127381324768, "learning_rate": 5e-06, "loss": -0.0029, "mismatch_kl": 0.020759448409080505, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 153, "timing/generation_ms": 25985.86314264685, "timing/scoring_ms": 0.0, "timing/total_ms": 25985.86314264685, "tokens/completion": 1309.546875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 143.50284838676453 }, { "advantage/absmean": 0.12451171875, "entropy": 0.30002838373184204, "epoch": 0.20533333333333334, "grad_norm": 0.018497092206319014, "importance_ratio": 0.9994171857833862, "learning_rate": 5e-06, "loss": -0.0022, "mismatch_kl": 0.015115631744265556, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 154, "timing/generation_ms": 20836.18642948568, "timing/scoring_ms": 0.0, "timing/total_ms": 20836.18642948568, "tokens/completion": 972.66796875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 112.54808211326599 }, { "advantage/absmean": 0.12451171875, "entropy": 0.3808918297290802, "epoch": 0.20666666666666667, "grad_norm": 0.014750747901418159, "importance_ratio": 0.9998784065246582, "learning_rate": 5e-06, "loss": -0.0023, "mismatch_kl": 0.0203760527074337, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 155, "timing/generation_ms": 28712.269487790763, "timing/scoring_ms": 0.0, "timing/total_ms": 28712.269487790763, "tokens/completion": 1384.42578125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 116.96515583992004 }, { "advantage/absmean": 0.12451171875, "entropy": 0.404234915971756, "epoch": 0.208, "grad_norm": 0.02774018143964054, "importance_ratio": 0.9903627038002014, "learning_rate": 5e-06, "loss": 0.0022, "mismatch_kl": 0.09949617087841034, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 156, "timing/generation_ms": 15220.996337942779, "timing/scoring_ms": 0.0, "timing/total_ms": 15220.996337942779, "tokens/completion": 733.44921875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 80.95505475997925 }, { "advantage/absmean": 0.12451171875, "entropy": 0.2150656282901764, "epoch": 0.20933333333333334, "grad_norm": 0.012574265789504322, "importance_ratio": 0.9968655109405518, "learning_rate": 5e-06, "loss": -0.0043, "mismatch_kl": 0.01895724982023239, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 157, "timing/generation_ms": 46771.82784862816, "timing/scoring_ms": 0.0, "timing/total_ms": 46771.82784862816, "tokens/completion": 2055.46875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 183.42079520225525 }, { "advantage/absmean": 0.12451171875, "entropy": 0.2713158428668976, "epoch": 0.21066666666666667, "grad_norm": 0.03512934826143982, "importance_ratio": 0.9985222220420837, "learning_rate": 5e-06, "loss": -0.0028, "mismatch_kl": 0.01624884642660618, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 158, "timing/generation_ms": 20947.266034781933, "timing/scoring_ms": 0.0, "timing/total_ms": 20947.266034781933, "tokens/completion": 1009.90234375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 87.24977517127991 }, { "advantage/absmean": 0.12451171875, "entropy": 0.32832008600234985, "epoch": 0.212, "grad_norm": 0.02405397079489038, "importance_ratio": 0.9991105198860168, "learning_rate": 5e-06, "loss": -0.0056, "mismatch_kl": 0.016867484897375107, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 159, "timing/generation_ms": 21430.58088142425, "timing/scoring_ms": 0.0, "timing/total_ms": 21430.58088142425, "tokens/completion": 1012.43359375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 87.2035722732544 }, { "advantage/absmean": 0.12451171875, "entropy": 0.32067254185676575, "epoch": 0.21333333333333335, "grad_norm": 0.030583585605830663, "importance_ratio": 1.0010290145874023, "learning_rate": 5e-06, "loss": 0.0029, "mismatch_kl": 0.01957845501601696, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 160, "timing/generation_ms": 12068.631175905466, "timing/scoring_ms": 0.0, "timing/total_ms": 12068.631175905466, "tokens/completion": 585.69921875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 46.4997832775116 }, { "advantage/absmean": 0.12451171875, "entropy": 0.20440350472927094, "epoch": 0.21466666666666667, "grad_norm": 0.009198384471964699, "importance_ratio": 0.9953656196594238, "learning_rate": 5e-06, "loss": -0.0052, "mismatch_kl": 0.024851609021425247, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 161, "timing/generation_ms": 64061.363669112325, "timing/scoring_ms": 0.0, "timing/total_ms": 64061.363669112325, "tokens/completion": 2746.5390625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 252.9020836353302 }, { "advantage/absmean": 0.12451171875, "entropy": 0.2289305031299591, "epoch": 0.216, "grad_norm": 0.017027620442399836, "importance_ratio": 0.9964645504951477, "learning_rate": 5e-06, "loss": 0.0005, "mismatch_kl": 0.02016555331647396, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 162, "timing/generation_ms": 29072.1739763394, "timing/scoring_ms": 0.0, "timing/total_ms": 29072.1739763394, "tokens/completion": 1294.0546875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 187.8606402873993 }, { "advantage/absmean": 0.12451171875, "entropy": 0.23871932923793793, "epoch": 0.21733333333333332, "grad_norm": 0.026046585403665903, "importance_ratio": 0.998152494430542, "learning_rate": 5e-06, "loss": 0.0052, "mismatch_kl": 0.016869615763425827, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 163, "timing/generation_ms": 33103.609337471426, "timing/scoring_ms": 0.0, "timing/total_ms": 33103.609337471426, "tokens/completion": 1545.50390625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 139.85770416259766 }, { "advantage/absmean": 0.12451171875, "entropy": 0.28158116340637207, "epoch": 0.21866666666666668, "grad_norm": 0.015259806348832568, "importance_ratio": 0.9982590079307556, "learning_rate": 5e-06, "loss": -0.0053, "mismatch_kl": 0.022746765986084938, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 164, "timing/generation_ms": 26944.41274832934, "timing/scoring_ms": 0.0, "timing/total_ms": 26944.41274832934, "tokens/completion": 1337.65625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 109.10997653007507 }, { "advantage/absmean": 0.12451171875, "entropy": 0.3031062185764313, "epoch": 0.22, "grad_norm": 0.016960115464425836, "importance_ratio": 0.9974260926246643, "learning_rate": 5e-06, "loss": -0.0023, "mismatch_kl": 0.02418132871389389, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 165, "timing/generation_ms": 26665.55192042142, "timing/scoring_ms": 0.0, "timing/total_ms": 26665.55192042142, "tokens/completion": 1298.09765625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 233.19409203529358 }, { "advantage/absmean": 0.12451171875, "entropy": 0.30360692739486694, "epoch": 0.22133333333333333, "grad_norm": 0.03976443826488329, "importance_ratio": 0.9983341097831726, "learning_rate": 5e-06, "loss": -0.0064, "mismatch_kl": 0.02314077690243721, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 166, "timing/generation_ms": 14128.881074488163, "timing/scoring_ms": 0.0, "timing/total_ms": 14128.881074488163, "tokens/completion": 701.61328125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 55.524725914001465 }, { "advantage/absmean": 0.12451171875, "entropy": 0.2678433656692505, "epoch": 0.22266666666666668, "grad_norm": 0.03342438517457818, "importance_ratio": 0.9922596216201782, "learning_rate": 5e-06, "loss": -0.0023, "mismatch_kl": 0.035250429064035416, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 167, "timing/generation_ms": 21135.669719427824, "timing/scoring_ms": 0.0, "timing/total_ms": 21135.669719427824, "tokens/completion": 1019.171875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 149.8279891014099 }, { "advantage/absmean": 0.12451171875, "entropy": 0.20458683371543884, "epoch": 0.224, "grad_norm": 0.022088093083212943, "importance_ratio": 0.9954257011413574, "learning_rate": 5e-06, "loss": -0.0018, "mismatch_kl": 0.023710263893008232, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 168, "timing/generation_ms": 59294.02190912515, "timing/scoring_ms": 0.0, "timing/total_ms": 59294.02190912515, "tokens/completion": 2536.8828125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 207.61119556427002 }, { "advantage/absmean": 0.12451171875, "entropy": 0.4547651410102844, "epoch": 0.22533333333333333, "grad_norm": 0.03804278639742813, "importance_ratio": 0.9720731973648071, "learning_rate": 5e-06, "loss": 0.0026, "mismatch_kl": 0.2540355324745178, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 169, "timing/generation_ms": 14632.340895012021, "timing/scoring_ms": 0.0, "timing/total_ms": 14632.340895012021, "tokens/completion": 634.8203125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 66.74064421653748 }, { "advantage/absmean": 0.12451171875, "entropy": 0.26701289415359497, "epoch": 0.22666666666666666, "grad_norm": 0.03041084967586165, "importance_ratio": 0.9971191883087158, "learning_rate": 5e-06, "loss": -0.0024, "mismatch_kl": 0.02894790843129158, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 170, "timing/generation_ms": 21908.162399195135, "timing/scoring_ms": 0.0, "timing/total_ms": 21908.162399195135, "tokens/completion": 1060.19140625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 285.11374616622925 }, { "advantage/absmean": 0.12451171875, "entropy": 0.3831964433193207, "epoch": 0.228, "grad_norm": 0.020277373003486452, "importance_ratio": 0.9703661799430847, "learning_rate": 5e-06, "loss": -0.0013, "mismatch_kl": 0.288127064704895, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 171, "timing/generation_ms": 21739.85463846475, "timing/scoring_ms": 0.0, "timing/total_ms": 21739.85463846475, "tokens/completion": 1042.390625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 126.53577995300293 }, { "advantage/absmean": 0.12451171875, "entropy": 0.3104299008846283, "epoch": 0.22933333333333333, "grad_norm": 0.05268300034795112, "importance_ratio": 0.9946843981742859, "learning_rate": 5e-06, "loss": -0.0045, "mismatch_kl": 0.028223995119333267, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 172, "timing/generation_ms": 18181.49754870683, "timing/scoring_ms": 0.0, "timing/total_ms": 18181.49754870683, "tokens/completion": 876.87890625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 102.08800101280212 }, { "advantage/absmean": 0.12451171875, "entropy": 0.32149240374565125, "epoch": 0.23066666666666666, "grad_norm": 0.019198595379338976, "importance_ratio": 0.9882834553718567, "learning_rate": 5e-06, "loss": 0.0031, "mismatch_kl": 0.09531966596841812, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 173, "timing/generation_ms": 26753.23315896094, "timing/scoring_ms": 0.0, "timing/total_ms": 26753.23315896094, "tokens/completion": 1199.828125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 243.50505256652832 }, { "advantage/absmean": 0.12451171875, "entropy": 0.2507164180278778, "epoch": 0.232, "grad_norm": 0.0248134202199756, "importance_ratio": 0.9970893263816833, "learning_rate": 5e-06, "loss": -0.0063, "mismatch_kl": 0.033440057188272476, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 174, "timing/generation_ms": 32734.658079221845, "timing/scoring_ms": 0.0, "timing/total_ms": 32734.658079221845, "tokens/completion": 1582.765625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 236.81393718719482 }, { "advantage/absmean": 0.12451171875, "entropy": 0.2938965857028961, "epoch": 0.23333333333333334, "grad_norm": 0.023295024031541062, "importance_ratio": 0.9996641874313354, "learning_rate": 5e-06, "loss": -0.0014, "mismatch_kl": 0.030382564291357994, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 175, "timing/generation_ms": 18484.799866564572, "timing/scoring_ms": 0.0, "timing/total_ms": 18484.799866564572, "tokens/completion": 869.8203125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 89.94726347923279 }, { "advantage/absmean": 0.12451171875, "entropy": 0.24128344655036926, "epoch": 0.23466666666666666, "grad_norm": 0.021681137287839845, "importance_ratio": 0.995689868927002, "learning_rate": 5e-06, "loss": -0.0024, "mismatch_kl": 0.025076182559132576, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 176, "timing/generation_ms": 16699.054242111742, "timing/scoring_ms": 0.0, "timing/total_ms": 16699.054242111742, "tokens/completion": 831.890625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 76.11790347099304 }, { "advantage/absmean": 0.12451171875, "entropy": 0.26724985241889954, "epoch": 0.236, "grad_norm": 0.015254325506305103, "importance_ratio": 0.992223858833313, "learning_rate": 5e-06, "loss": -0.0003, "mismatch_kl": 0.02879425697028637, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 177, "timing/generation_ms": 30596.904239617288, "timing/scoring_ms": 0.0, "timing/total_ms": 30596.904239617288, "tokens/completion": 1407.20703125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 199.58447432518005 }, { "advantage/absmean": 0.12451171875, "entropy": 0.28972604870796204, "epoch": 0.23733333333333334, "grad_norm": 0.01945907676336341, "importance_ratio": 0.9937379956245422, "learning_rate": 5e-06, "loss": -0.0002, "mismatch_kl": 0.026391636580228806, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 178, "timing/generation_ms": 22168.457314372063, "timing/scoring_ms": 0.0, "timing/total_ms": 22168.457314372063, "tokens/completion": 1017.8515625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 198.82207107543945 }, { "advantage/absmean": 0.12451171875, "entropy": 0.2955513298511505, "epoch": 0.23866666666666667, "grad_norm": 0.034061359790196394, "importance_ratio": 0.9955794811248779, "learning_rate": 5e-06, "loss": -0.0017, "mismatch_kl": 0.026111198589205742, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 179, "timing/generation_ms": 17585.104428231716, "timing/scoring_ms": 0.0, "timing/total_ms": 17585.104428231716, "tokens/completion": 836.7421875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 98.93776655197144 }, { "advantage/absmean": 0.12451171875, "entropy": 0.33897051215171814, "epoch": 0.24, "grad_norm": 0.026732099750916328, "importance_ratio": 0.9968024492263794, "learning_rate": 5e-06, "loss": -0.0016, "mismatch_kl": 0.03142106905579567, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 180, "timing/generation_ms": 14579.319617711008, "timing/scoring_ms": 0.0, "timing/total_ms": 14579.319617711008, "tokens/completion": 657.60546875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 48.83777070045471 }, { "advantage/absmean": 0.12451171875, "entropy": 0.27722474932670593, "epoch": 0.24133333333333334, "grad_norm": 0.02190113915349276, "importance_ratio": 0.9932956099510193, "learning_rate": 5e-06, "loss": -0.0039, "mismatch_kl": 0.039353836327791214, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 181, "timing/generation_ms": 16838.846164755523, "timing/scoring_ms": 0.0, "timing/total_ms": 16838.846164755523, "tokens/completion": 837.53125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 90.39262366294861 }, { "advantage/absmean": 0.12451171875, "entropy": 0.21952733397483826, "epoch": 0.24266666666666667, "grad_norm": 0.019030162680243098, "importance_ratio": 0.9920942783355713, "learning_rate": 5e-06, "loss": 0.0007, "mismatch_kl": 0.03863741457462311, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 182, "timing/generation_ms": 19943.43529921025, "timing/scoring_ms": 0.0, "timing/total_ms": 19943.43529921025, "tokens/completion": 959.51953125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 68.7491762638092 }, { "advantage/absmean": 0.12451171875, "entropy": 0.37819504737854004, "epoch": 0.244, "grad_norm": 0.030600275992650774, "importance_ratio": 0.9981564879417419, "learning_rate": 5e-06, "loss": -0.0061, "mismatch_kl": 0.0258224718272686, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 183, "timing/generation_ms": 19337.73651625961, "timing/scoring_ms": 0.0, "timing/total_ms": 19337.73651625961, "tokens/completion": 909.80078125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 67.45709013938904 }, { "advantage/absmean": 0.12451171875, "entropy": 0.24391266703605652, "epoch": 0.24533333333333332, "grad_norm": 0.020045952746227204, "importance_ratio": 0.9952253103256226, "learning_rate": 5e-06, "loss": -0.0035, "mismatch_kl": 0.022540580481290817, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 184, "timing/generation_ms": 29042.017024941742, "timing/scoring_ms": 0.0, "timing/total_ms": 29042.017024941742, "tokens/completion": 1416.3046875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 224.1438853740692 }, { "advantage/absmean": 0.12451171875, "entropy": 0.2780689597129822, "epoch": 0.24666666666666667, "grad_norm": 0.0286906981880458, "importance_ratio": 0.9939864277839661, "learning_rate": 5e-06, "loss": 0.0002, "mismatch_kl": 0.028331460431218147, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 185, "timing/generation_ms": 13990.399835631251, "timing/scoring_ms": 0.0, "timing/total_ms": 13990.399835631251, "tokens/completion": 712.27734375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 65.08906888961792 }, { "advantage/absmean": 0.12451171875, "entropy": 0.2785170078277588, "epoch": 0.248, "grad_norm": 0.019455372327007777, "importance_ratio": 0.9962543249130249, "learning_rate": 5e-06, "loss": 0.0021, "mismatch_kl": 0.030258335173130035, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 186, "timing/generation_ms": 29046.93407472223, "timing/scoring_ms": 0.0, "timing/total_ms": 29046.93407472223, "tokens/completion": 1342.078125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 117.269207239151 }, { "advantage/absmean": 0.12451171875, "entropy": 0.29877498745918274, "epoch": 0.24933333333333332, "grad_norm": 0.041522981103745076, "importance_ratio": 0.9973271489143372, "learning_rate": 5e-06, "loss": 0.0005, "mismatch_kl": 0.027791054919362068, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 187, "timing/generation_ms": 27519.34172678739, "timing/scoring_ms": 0.0, "timing/total_ms": 27519.34172678739, "tokens/completion": 1335.86328125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 109.74448680877686 }, { "advantage/absmean": 0.12451171875, "entropy": 0.2548399567604065, "epoch": 0.25066666666666665, "grad_norm": 0.01914209458227723, "importance_ratio": 0.9980031251907349, "learning_rate": 5e-06, "loss": -0.0056, "mismatch_kl": 0.023154988884925842, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 188, "timing/generation_ms": 18434.748891741037, "timing/scoring_ms": 0.0, "timing/total_ms": 18434.748891741037, "tokens/completion": 841.21484375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 100.93693470954895 }, { "advantage/absmean": 0.12451171875, "entropy": 0.36281952261924744, "epoch": 0.252, "grad_norm": 0.04366345528631447, "importance_ratio": 0.997806966304779, "learning_rate": 5e-06, "loss": -0.0104, "mismatch_kl": 0.0235320795327425, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 189, "timing/generation_ms": 25268.099238164723, "timing/scoring_ms": 0.0, "timing/total_ms": 25268.099238164723, "tokens/completion": 1256.1484375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 102.91489505767822 }, { "advantage/absmean": 0.12451171875, "entropy": 0.22508475184440613, "epoch": 0.25333333333333335, "grad_norm": 0.01385345071504184, "importance_ratio": 0.9968878626823425, "learning_rate": 5e-06, "loss": -0.0107, "mismatch_kl": 0.02765449695289135, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 190, "timing/generation_ms": 37916.601489298046, "timing/scoring_ms": 0.0, "timing/total_ms": 37916.601489298046, "tokens/completion": 1717.34765625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 139.42678880691528 }, { "advantage/absmean": 0.12451171875, "entropy": 0.40229278802871704, "epoch": 0.25466666666666665, "grad_norm": 0.02875613000959139, "importance_ratio": 0.9828155040740967, "learning_rate": 5e-06, "loss": 0.0055, "mismatch_kl": 0.19772163033485413, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 191, "timing/generation_ms": 32680.235791951418, "timing/scoring_ms": 0.0, "timing/total_ms": 32680.235791951418, "tokens/completion": 1459.58203125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 144.90490436553955 }, { "advantage/absmean": 0.12451171875, "entropy": 0.2181045562028885, "epoch": 0.256, "grad_norm": 0.019693707478772454, "importance_ratio": 0.9942646026611328, "learning_rate": 5e-06, "loss": 0.0029, "mismatch_kl": 0.03511533513665199, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 192, "timing/generation_ms": 36065.32556284219, "timing/scoring_ms": 0.0, "timing/total_ms": 36065.32556284219, "tokens/completion": 1708.7734375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 126.33067202568054 }, { "advantage/absmean": 0.12451171875, "entropy": 0.2962771952152252, "epoch": 0.25733333333333336, "grad_norm": 0.02416381381264868, "importance_ratio": 0.9941651821136475, "learning_rate": 5e-06, "loss": 0.0024, "mismatch_kl": 0.0343640111386776, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 193, "timing/generation_ms": 36326.69063284993, "timing/scoring_ms": 0.0, "timing/total_ms": 36326.69063284993, "tokens/completion": 1645.30859375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 146.5855736732483 }, { "advantage/absmean": 0.12451171875, "entropy": 0.22655896842479706, "epoch": 0.25866666666666666, "grad_norm": 0.024160165001251035, "importance_ratio": 0.995488166809082, "learning_rate": 5e-06, "loss": 0.0023, "mismatch_kl": 0.023622261360287666, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 194, "timing/generation_ms": 40274.337109178305, "timing/scoring_ms": 0.0, "timing/total_ms": 40274.337109178305, "tokens/completion": 1910.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 137.63950419425964 }, { "advantage/absmean": 0.12451171875, "entropy": 0.24619098007678986, "epoch": 0.26, "grad_norm": 0.008997397579246655, "importance_ratio": 0.9905009865760803, "learning_rate": 5e-06, "loss": 0.0047, "mismatch_kl": 0.06482454389333725, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 195, "timing/generation_ms": 107369.31251455098, "timing/scoring_ms": 0.0, "timing/total_ms": 107369.31251455098, "tokens/completion": 3881.7421875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 291.5552787780762 }, { "advantage/absmean": 0.12451171875, "entropy": 0.24800750613212585, "epoch": 0.2613333333333333, "grad_norm": 0.041355633656673725, "importance_ratio": 0.996856689453125, "learning_rate": 5e-06, "loss": 0.0027, "mismatch_kl": 0.023481056094169617, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 196, "timing/generation_ms": 23556.342590600252, "timing/scoring_ms": 0.0, "timing/total_ms": 23556.342590600252, "tokens/completion": 801.36328125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 66.23490047454834 }, { "advantage/absmean": 0.12451171875, "entropy": 0.20097197592258453, "epoch": 0.26266666666666666, "grad_norm": 0.01639665709788699, "importance_ratio": 0.995540201663971, "learning_rate": 5e-06, "loss": -0.0009, "mismatch_kl": 0.02512766607105732, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 197, "timing/generation_ms": 54791.293187998235, "timing/scoring_ms": 0.0, "timing/total_ms": 54791.293187998235, "tokens/completion": 2467.2578125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 184.51049184799194 }, { "advantage/absmean": 0.12451171875, "entropy": 0.24079304933547974, "epoch": 0.264, "grad_norm": 0.033558115100562454, "importance_ratio": 0.9966259002685547, "learning_rate": 5e-06, "loss": -0.0129, "mismatch_kl": 0.02248232252895832, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 198, "timing/generation_ms": 38877.40421388298, "timing/scoring_ms": 0.0, "timing/total_ms": 38877.40421388298, "tokens/completion": 1947.15625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 256.89259123802185 }, { "advantage/absmean": 0.12451171875, "entropy": 0.22992920875549316, "epoch": 0.2653333333333333, "grad_norm": 0.019833326998120116, "importance_ratio": 0.996269166469574, "learning_rate": 5e-06, "loss": -0.0002, "mismatch_kl": 0.02254408784210682, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 199, "timing/generation_ms": 22910.992676392198, "timing/scoring_ms": 0.0, "timing/total_ms": 22910.992676392198, "tokens/completion": 1146.32421875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 153.08721899986267 }, { "advantage/absmean": 0.12451171875, "entropy": 0.21609917283058167, "epoch": 0.26666666666666666, "grad_norm": 0.017782941960253474, "importance_ratio": 0.9933099746704102, "learning_rate": 5e-06, "loss": -0.0047, "mismatch_kl": 0.028513798490166664, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 200, "timing/generation_ms": 28995.982899330556, "timing/scoring_ms": 0.0, "timing/total_ms": 28995.982899330556, "tokens/completion": 1354.24609375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 139.1398515701294 }, { "advantage/absmean": 0.12451171875, "entropy": 0.3927169740200043, "epoch": 0.268, "grad_norm": 0.08540874966055562, "importance_ratio": 0.9711376428604126, "learning_rate": 5e-06, "loss": 0.0081, "mismatch_kl": 0.2314944714307785, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 201, "timing/generation_ms": 31200.909822247922, "timing/scoring_ms": 0.0, "timing/total_ms": 31200.909822247922, "tokens/completion": 1405.9765625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 178.80973744392395 }, { "advantage/absmean": 0.12451171875, "entropy": 0.235797718167305, "epoch": 0.2693333333333333, "grad_norm": 0.01568085371274426, "importance_ratio": 0.9909575581550598, "learning_rate": 5e-06, "loss": -0.0079, "mismatch_kl": 0.039374206215143204, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 202, "timing/generation_ms": 42998.49198944867, "timing/scoring_ms": 0.0, "timing/total_ms": 42998.49198944867, "tokens/completion": 1907.31640625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 141.76219058036804 }, { "advantage/absmean": 0.12451171875, "entropy": 0.23127324879169464, "epoch": 0.27066666666666667, "grad_norm": 0.02007459981352103, "importance_ratio": 0.9912987947463989, "learning_rate": 5e-06, "loss": -0.001, "mismatch_kl": 0.03943263366818428, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 203, "timing/generation_ms": 37774.500319734216, "timing/scoring_ms": 0.0, "timing/total_ms": 37774.500319734216, "tokens/completion": 1693.734375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 302.7908329963684 }, { "advantage/absmean": 0.12451171875, "entropy": 0.22054153680801392, "epoch": 0.272, "grad_norm": 0.021761300841866088, "importance_ratio": 0.9904981851577759, "learning_rate": 5e-06, "loss": -0.0026, "mismatch_kl": 0.037401266396045685, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 204, "timing/generation_ms": 42541.27501603216, "timing/scoring_ms": 0.0, "timing/total_ms": 42541.27501603216, "tokens/completion": 1937.69140625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 233.74011135101318 }, { "advantage/absmean": 0.12451171875, "entropy": 0.22628618776798248, "epoch": 0.2733333333333333, "grad_norm": 0.011121419921268808, "importance_ratio": 0.9924519658088684, "learning_rate": 5e-06, "loss": 0.0013, "mismatch_kl": 0.03573086857795715, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 205, "timing/generation_ms": 35010.2855078876, "timing/scoring_ms": 0.0, "timing/total_ms": 35010.2855078876, "tokens/completion": 1629.62890625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 137.56320452690125 } ], "logging_steps": 1, "max_steps": 750, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }