CT-Apertus-Step205 / trainer_state.json
Delta-Vector's picture
Upload folder using huggingface_hub
27d208e verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.2733333333333333,
"eval_steps": 50,
"global_step": 205,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"advantage/absmean": 0.12451171875,
"entropy": 1.3932524919509888,
"epoch": 0.0013333333333333333,
"grad_norm": 0.016694727116637192,
"importance_ratio": 0.9986082315444946,
"learning_rate": 0.0,
"loss": -0.0189,
"mismatch_kl": 0.004300346598029137,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 1,
"timing/generation_ms": 11961.050138808787,
"timing/scoring_ms": 0.0,
"timing/total_ms": 11961.050138808787,
"tokens/completion": 551.78125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 248.72634100914001
},
{
"advantage/absmean": 0.12451171875,
"entropy": 1.0695139169692993,
"epoch": 0.0026666666666666666,
"grad_norm": 0.008567213424127631,
"importance_ratio": 0.9980430603027344,
"learning_rate": 1.0000000000000002e-06,
"loss": -0.0055,
"mismatch_kl": 0.0036789600271731615,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 2,
"timing/generation_ms": 11398.794241249561,
"timing/scoring_ms": 0.0,
"timing/total_ms": 11398.794241249561,
"tokens/completion": 647.02734375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 67.39928388595581
},
{
"advantage/absmean": 0.12451171875,
"entropy": 1.2690891027450562,
"epoch": 0.004,
"grad_norm": 0.007856590727089238,
"importance_ratio": 0.9990478157997131,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.0147,
"mismatch_kl": 0.00404919171705842,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 3,
"timing/generation_ms": 13145.053108222783,
"timing/scoring_ms": 0.0,
"timing/total_ms": 13145.053108222783,
"tokens/completion": 695.94140625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 188.99010276794434
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.5635457634925842,
"epoch": 0.005333333333333333,
"grad_norm": 0.008427354641048032,
"importance_ratio": 0.9995828866958618,
"learning_rate": 3e-06,
"loss": -0.0056,
"mismatch_kl": 0.0024689023848623037,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 4,
"timing/generation_ms": 12098.999472334981,
"timing/scoring_ms": 0.0,
"timing/total_ms": 12098.999472334981,
"tokens/completion": 634.3515625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 52.7923378944397
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.8588207364082336,
"epoch": 0.006666666666666667,
"grad_norm": 0.015271082061520619,
"importance_ratio": 0.9998404383659363,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0201,
"mismatch_kl": 0.003175633493810892,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 5,
"timing/generation_ms": 9795.204265974462,
"timing/scoring_ms": 0.0,
"timing/total_ms": 9795.204265974462,
"tokens/completion": 595.30078125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 56.867586612701416
},
{
"advantage/absmean": 0.12451171875,
"entropy": 1.0917283296585083,
"epoch": 0.008,
"grad_norm": 0.015440441848262498,
"importance_ratio": 1.0006937980651855,
"learning_rate": 5e-06,
"loss": -0.0046,
"mismatch_kl": 0.003965948708355427,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 6,
"timing/generation_ms": 3492.4034476280212,
"timing/scoring_ms": 0.0,
"timing/total_ms": 3492.4034476280212,
"tokens/completion": 176.77734375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 61.55981087684631
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.750698983669281,
"epoch": 0.009333333333333334,
"grad_norm": 0.008842566430176115,
"importance_ratio": 1.0032514333724976,
"learning_rate": 5e-06,
"loss": 0.0042,
"mismatch_kl": 0.0037081094924360514,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 7,
"timing/generation_ms": 12012.088196352124,
"timing/scoring_ms": 0.0,
"timing/total_ms": 12012.088196352124,
"tokens/completion": 664.06640625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 80.06084942817688
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.6958726644515991,
"epoch": 0.010666666666666666,
"grad_norm": 0.020865513665125984,
"importance_ratio": 0.9998727440834045,
"learning_rate": 5e-06,
"loss": -0.0015,
"mismatch_kl": 0.003091922029852867,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 8,
"timing/generation_ms": 7164.519478566945,
"timing/scoring_ms": 0.0,
"timing/total_ms": 7164.519478566945,
"tokens/completion": 376.96484375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 122.57408618927002
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.647992730140686,
"epoch": 0.012,
"grad_norm": 0.010516528439614162,
"importance_ratio": 0.9973449110984802,
"learning_rate": 5e-06,
"loss": 0.0348,
"mismatch_kl": 0.002668753731995821,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 9,
"timing/generation_ms": 9473.532978445292,
"timing/scoring_ms": 0.0,
"timing/total_ms": 9473.532978445292,
"tokens/completion": 589.9375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 111.60580968856812
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.9175997972488403,
"epoch": 0.013333333333333334,
"grad_norm": 0.017217377658999368,
"importance_ratio": 0.9963379502296448,
"learning_rate": 5e-06,
"loss": -0.0133,
"mismatch_kl": 0.003761034458875656,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 10,
"timing/generation_ms": 8803.215935826302,
"timing/scoring_ms": 0.0,
"timing/total_ms": 8803.215935826302,
"tokens/completion": 432.890625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 137.27361369132996
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.6505714654922485,
"epoch": 0.014666666666666666,
"grad_norm": 0.0034942507757306364,
"importance_ratio": 0.9997450113296509,
"learning_rate": 5e-06,
"loss": 0.0567,
"mismatch_kl": 0.025293370708823204,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 11,
"timing/generation_ms": 28037.367599084973,
"timing/scoring_ms": 0.0,
"timing/total_ms": 28037.367599084973,
"tokens/completion": 1677.38671875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 147.27029275894165
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.623925507068634,
"epoch": 0.016,
"grad_norm": 0.004363286027787366,
"importance_ratio": 0.9998379349708557,
"learning_rate": 5e-06,
"loss": 0.037,
"mismatch_kl": 0.027607521042227745,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 12,
"timing/generation_ms": 30658.961144275963,
"timing/scoring_ms": 0.0,
"timing/total_ms": 30658.961144275963,
"tokens/completion": 1772.48046875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 228.39264035224915
},
{
"advantage/absmean": 0.12451171875,
"entropy": 1.2309722900390625,
"epoch": 0.017333333333333333,
"grad_norm": 0.01910079735377139,
"importance_ratio": 0.9967860579490662,
"learning_rate": 5e-06,
"loss": -0.0146,
"mismatch_kl": 0.004334039054811001,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 13,
"timing/generation_ms": 7481.697678565979,
"timing/scoring_ms": 0.0,
"timing/total_ms": 7481.697678565979,
"tokens/completion": 458.546875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 125.09760117530823
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.583360493183136,
"epoch": 0.018666666666666668,
"grad_norm": 0.006859469099074894,
"importance_ratio": 0.9988465905189514,
"learning_rate": 5e-06,
"loss": -0.0041,
"mismatch_kl": 0.0028068351093679667,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 14,
"timing/generation_ms": 8050.086663104594,
"timing/scoring_ms": 0.0,
"timing/total_ms": 8050.086663104594,
"tokens/completion": 466.06640625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 117.39565086364746
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.7860226035118103,
"epoch": 0.02,
"grad_norm": 0.011283066327858677,
"importance_ratio": 1.002608299255371,
"learning_rate": 5e-06,
"loss": -0.0035,
"mismatch_kl": 0.004051415715366602,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 15,
"timing/generation_ms": 9803.531439974904,
"timing/scoring_ms": 0.0,
"timing/total_ms": 9803.531439974904,
"tokens/completion": 522.2109375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 48.61639094352722
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.7184260487556458,
"epoch": 0.021333333333333333,
"grad_norm": 0.010228445907240152,
"importance_ratio": 1.000801920890808,
"learning_rate": 5e-06,
"loss": -0.0066,
"mismatch_kl": 0.006085229571908712,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 16,
"timing/generation_ms": 8376.314821653068,
"timing/scoring_ms": 0.0,
"timing/total_ms": 8376.314821653068,
"tokens/completion": 458.83984375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 69.11118984222412
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.18061073124408722,
"epoch": 0.02266666666666667,
"grad_norm": 0.0036474713562644418,
"importance_ratio": 0.9987739324569702,
"learning_rate": 5e-06,
"loss": 0.0657,
"mismatch_kl": 0.025802385061979294,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 17,
"timing/generation_ms": 16682.96501878649,
"timing/scoring_ms": 0.0,
"timing/total_ms": 16682.96501878649,
"tokens/completion": 1178.22265625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 110.8058807849884
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.3768082559108734,
"epoch": 0.024,
"grad_norm": 0.007994642717131743,
"importance_ratio": 0.9989356398582458,
"learning_rate": 5e-06,
"loss": 0.0198,
"mismatch_kl": 0.0024773485492914915,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 18,
"timing/generation_ms": 45841.41308255494,
"timing/scoring_ms": 0.0,
"timing/total_ms": 45841.41308255494,
"tokens/completion": 2401.60546875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 182.70060086250305
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.9849978685379028,
"epoch": 0.025333333333333333,
"grad_norm": 0.007975010652496835,
"importance_ratio": 0.9994485974311829,
"learning_rate": 5e-06,
"loss": -0.0032,
"mismatch_kl": 0.007306213956326246,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 19,
"timing/generation_ms": 21281.952754594386,
"timing/scoring_ms": 0.0,
"timing/total_ms": 21281.952754594386,
"tokens/completion": 1127.03515625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 118.257479429245
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.783280074596405,
"epoch": 0.02666666666666667,
"grad_norm": 0.01013309688610727,
"importance_ratio": 1.0076329708099365,
"learning_rate": 5e-06,
"loss": -0.002,
"mismatch_kl": 0.008437588810920715,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 20,
"timing/generation_ms": 11283.36211759597,
"timing/scoring_ms": 0.0,
"timing/total_ms": 11283.36211759597,
"tokens/completion": 603.92578125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 50.433815717697144
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.18535619974136353,
"epoch": 0.028,
"grad_norm": 0.12592122275182266,
"importance_ratio": 0.994857132434845,
"learning_rate": 5e-06,
"loss": 0.057,
"mismatch_kl": 0.004472589120268822,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 21,
"timing/generation_ms": 69204.76855803281,
"timing/scoring_ms": 0.0,
"timing/total_ms": 69204.76855803281,
"tokens/completion": 3062.171875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 335.8162593841553
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.7172983884811401,
"epoch": 0.029333333333333333,
"grad_norm": 0.011698600330274578,
"importance_ratio": 1.0030107498168945,
"learning_rate": 5e-06,
"loss": -0.0094,
"mismatch_kl": 0.03951645269989967,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 22,
"timing/generation_ms": 16505.55451028049,
"timing/scoring_ms": 0.0,
"timing/total_ms": 16505.55451028049,
"tokens/completion": 675.60546875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 61.02479434013367
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.8583077192306519,
"epoch": 0.030666666666666665,
"grad_norm": 0.02332906550498323,
"importance_ratio": 1.0737003087997437,
"learning_rate": 5e-06,
"loss": 0.0468,
"mismatch_kl": 0.21222208440303802,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 23,
"timing/generation_ms": 47965.167357586324,
"timing/scoring_ms": 0.0,
"timing/total_ms": 47965.167357586324,
"tokens/completion": 2437.57421875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 184.88851642608643
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.6403871178627014,
"epoch": 0.032,
"grad_norm": 0.0064139472738614185,
"importance_ratio": 1.0027199983596802,
"learning_rate": 5e-06,
"loss": 0.0079,
"mismatch_kl": 0.029356306418776512,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 24,
"timing/generation_ms": 25231.056010350585,
"timing/scoring_ms": 0.0,
"timing/total_ms": 25231.056010350585,
"tokens/completion": 1253.125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 127.16959929466248
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.26308295130729675,
"epoch": 0.03333333333333333,
"grad_norm": 0.004856521131545869,
"importance_ratio": 0.99989914894104,
"learning_rate": 5e-06,
"loss": 0.0162,
"mismatch_kl": 0.006057343445718288,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 25,
"timing/generation_ms": 44386.24160736799,
"timing/scoring_ms": 0.0,
"timing/total_ms": 44386.24160736799,
"tokens/completion": 2212.2421875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 173.18823885917664
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.12470932304859161,
"epoch": 0.034666666666666665,
"grad_norm": 0.001678447935003649,
"importance_ratio": 1.0007412433624268,
"learning_rate": 5e-06,
"loss": 0.0462,
"mismatch_kl": 0.001119845313951373,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 26,
"timing/generation_ms": 100999.46400336921,
"timing/scoring_ms": 0.0,
"timing/total_ms": 100999.46400336921,
"tokens/completion": 3716.6796875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 303.84296584129333
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.17583802342414856,
"epoch": 0.036,
"grad_norm": 0.002584350761592735,
"importance_ratio": 1.001440405845642,
"learning_rate": 5e-06,
"loss": 0.0264,
"mismatch_kl": 0.0013389256782829762,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 27,
"timing/generation_ms": 55200.44738613069,
"timing/scoring_ms": 0.0,
"timing/total_ms": 55200.44738613069,
"tokens/completion": 2656.7265625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 294.736074924469
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.19836626946926117,
"epoch": 0.037333333333333336,
"grad_norm": 0.005548904662699889,
"importance_ratio": 1.0022764205932617,
"learning_rate": 5e-06,
"loss": 0.0251,
"mismatch_kl": 0.0019016863079741597,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 28,
"timing/generation_ms": 57617.69188474864,
"timing/scoring_ms": 0.0,
"timing/total_ms": 57617.69188474864,
"tokens/completion": 2797.6171875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 228.97359490394592
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.37241131067276,
"epoch": 0.03866666666666667,
"grad_norm": 0.02006388030524017,
"importance_ratio": 1.053019404411316,
"learning_rate": 5e-06,
"loss": 0.0557,
"mismatch_kl": 0.9634742736816406,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 29,
"timing/generation_ms": 41741.05513561517,
"timing/scoring_ms": 0.0,
"timing/total_ms": 41741.05513561517,
"tokens/completion": 2055.87890625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 207.62974190711975
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.25762397050857544,
"epoch": 0.04,
"grad_norm": 0.006806951429177103,
"importance_ratio": 0.983231246471405,
"learning_rate": 5e-06,
"loss": 0.0364,
"mismatch_kl": 0.06448693573474884,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 30,
"timing/generation_ms": 29489.30747061968,
"timing/scoring_ms": 0.0,
"timing/total_ms": 29489.30747061968,
"tokens/completion": 1709.59765625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 175.62516474723816
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.019520161673426628,
"epoch": 0.04133333333333333,
"grad_norm": 0.0005178617259035183,
"importance_ratio": 0.9998506307601929,
"learning_rate": 5e-06,
"loss": 0.0014,
"mismatch_kl": 0.0017281156033277512,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 31,
"timing/generation_ms": 255150.22794622928,
"timing/scoring_ms": 0.0,
"timing/total_ms": 255150.22794622928,
"tokens/completion": 6100.89453125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 537.7091252803802
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.005344062577933073,
"epoch": 0.042666666666666665,
"grad_norm": 0.00042076548606043374,
"importance_ratio": 1.0000818967819214,
"learning_rate": 5e-06,
"loss": 0.0,
"mismatch_kl": 0.00012820436677429825,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 32,
"timing/generation_ms": 252645.98809182644,
"timing/scoring_ms": 0.0,
"timing/total_ms": 252645.98809182644,
"tokens/completion": 6144.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 564.6809096336365
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.0041460455395281315,
"epoch": 0.044,
"grad_norm": 0.0004905946483254039,
"importance_ratio": 1.0000282526016235,
"learning_rate": 5e-06,
"loss": 0.0,
"mismatch_kl": 6.918103463249281e-05,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 33,
"timing/generation_ms": 262179.48642838746,
"timing/scoring_ms": 0.0,
"timing/total_ms": 262179.48642838746,
"tokens/completion": 6144.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 574.2838616371155
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.005189419258385897,
"epoch": 0.04533333333333334,
"grad_norm": 0.0003380219234355203,
"importance_ratio": 1.0000487565994263,
"learning_rate": 5e-06,
"loss": 0.0,
"mismatch_kl": 7.488115079468116e-05,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 34,
"timing/generation_ms": 257649.44062847644,
"timing/scoring_ms": 0.0,
"timing/total_ms": 257649.44062847644,
"tokens/completion": 6144.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 616.5528900623322
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.005219260696321726,
"epoch": 0.04666666666666667,
"grad_norm": 0.0006402287013777213,
"importance_ratio": 1.0000388622283936,
"learning_rate": 5e-06,
"loss": 0.0,
"mismatch_kl": 0.00010059373016702011,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 35,
"timing/generation_ms": 263093.6838546768,
"timing/scoring_ms": 0.0,
"timing/total_ms": 263093.6838546768,
"tokens/completion": 6144.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 567.3024535179138
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.008569693192839622,
"epoch": 0.048,
"grad_norm": 0.0005047742243801816,
"importance_ratio": 1.0000779628753662,
"learning_rate": 5e-06,
"loss": 0.0004,
"mismatch_kl": 0.0001211672934005037,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 36,
"timing/generation_ms": 242657.4441930279,
"timing/scoring_ms": 0.0,
"timing/total_ms": 242657.4441930279,
"tokens/completion": 6123.421875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 543.5283715724945
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.011535107158124447,
"epoch": 0.04933333333333333,
"grad_norm": 0.0004641880444433118,
"importance_ratio": 1.0000940561294556,
"learning_rate": 5e-06,
"loss": 0.0,
"mismatch_kl": 0.00016296253306791186,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 37,
"timing/generation_ms": 253055.44871557504,
"timing/scoring_ms": 0.0,
"timing/total_ms": 253055.44871557504,
"tokens/completion": 6100.4375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 529.3097188472748
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.008278747089207172,
"epoch": 0.050666666666666665,
"grad_norm": 0.0015602978869027017,
"importance_ratio": 1.000083565711975,
"learning_rate": 5e-06,
"loss": 0.0,
"mismatch_kl": 0.00012404406152199954,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 38,
"timing/generation_ms": 259809.8956849426,
"timing/scoring_ms": 0.0,
"timing/total_ms": 259809.8956849426,
"tokens/completion": 6144.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 572.6026647090912
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.0070807370357215405,
"epoch": 0.052,
"grad_norm": 0.0004621624833577141,
"importance_ratio": 1.000075340270996,
"learning_rate": 5e-06,
"loss": -0.0,
"mismatch_kl": 0.00010999527876265347,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 39,
"timing/generation_ms": 266124.4311518967,
"timing/scoring_ms": 0.0,
"timing/total_ms": 266124.4311518967,
"tokens/completion": 6144.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 645.3593230247498
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.00655187526717782,
"epoch": 0.05333333333333334,
"grad_norm": 0.00032702966921445734,
"importance_ratio": 1.0000351667404175,
"learning_rate": 5e-06,
"loss": 0.0,
"mismatch_kl": 0.00014068085874896497,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 40,
"timing/generation_ms": 262011.0893426463,
"timing/scoring_ms": 0.0,
"timing/total_ms": 262011.0893426463,
"tokens/completion": 6144.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 515.61732006073
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.0039160363376140594,
"epoch": 0.05466666666666667,
"grad_norm": 0.0003724535269895079,
"importance_ratio": 1.0000481605529785,
"learning_rate": 5e-06,
"loss": 0.0,
"mismatch_kl": 7.484626985387877e-05,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 41,
"timing/generation_ms": 255759.41647868603,
"timing/scoring_ms": 0.0,
"timing/total_ms": 255759.41647868603,
"tokens/completion": 6144.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 565.8597645759583
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.01127232052385807,
"epoch": 0.056,
"grad_norm": 0.0008175801103252065,
"importance_ratio": 1.0000771284103394,
"learning_rate": 5e-06,
"loss": 0.0068,
"mismatch_kl": 0.00016380040324293077,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 42,
"timing/generation_ms": 238812.61033378541,
"timing/scoring_ms": 0.0,
"timing/total_ms": 238812.61033378541,
"tokens/completion": 6073.61328125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 484.4759180545807
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.01085229218006134,
"epoch": 0.05733333333333333,
"grad_norm": 0.0004839828768652627,
"importance_ratio": 1.0000557899475098,
"learning_rate": 5e-06,
"loss": 0.0063,
"mismatch_kl": 0.00013297870464157313,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 43,
"timing/generation_ms": 256715.18344525248,
"timing/scoring_ms": 0.0,
"timing/total_ms": 256715.18344525248,
"tokens/completion": 6078.20703125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 537.6344306468964
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.005195128731429577,
"epoch": 0.058666666666666666,
"grad_norm": 0.00023276391851811837,
"importance_ratio": 1.0000344514846802,
"learning_rate": 5e-06,
"loss": 0.0023,
"mismatch_kl": 8.078882819972932e-05,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 44,
"timing/generation_ms": 245682.50108975917,
"timing/scoring_ms": 0.0,
"timing/total_ms": 245682.50108975917,
"tokens/completion": 6098.1015625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 491.3542585372925
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.0033533975947648287,
"epoch": 0.06,
"grad_norm": 0.00016439514868896496,
"importance_ratio": 1.00002920627594,
"learning_rate": 5e-06,
"loss": 0.0,
"mismatch_kl": 7.133631879696622e-05,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 45,
"timing/generation_ms": 261392.2018893063,
"timing/scoring_ms": 0.0,
"timing/total_ms": 261392.2018893063,
"tokens/completion": 6144.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 638.6866817474365
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.009633159264922142,
"epoch": 0.06133333333333333,
"grad_norm": 0.0005837700251924664,
"importance_ratio": 1.000110149383545,
"learning_rate": 5e-06,
"loss": -0.0005,
"mismatch_kl": 0.00014644436305388808,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 46,
"timing/generation_ms": 259352.97147464007,
"timing/scoring_ms": 0.0,
"timing/total_ms": 259352.97147464007,
"tokens/completion": 6100.9375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 726.6395015716553
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.008214793168008327,
"epoch": 0.06266666666666666,
"grad_norm": 0.0003491652028248511,
"importance_ratio": 1.0000574588775635,
"learning_rate": 5e-06,
"loss": -0.0005,
"mismatch_kl": 0.00012681909720413387,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 47,
"timing/generation_ms": 251731.6782604903,
"timing/scoring_ms": 0.0,
"timing/total_ms": 251731.6782604903,
"tokens/completion": 6120.80078125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 504.8533480167389
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.012482496909797192,
"epoch": 0.064,
"grad_norm": 0.0008089181923655795,
"importance_ratio": 1.0000419616699219,
"learning_rate": 5e-06,
"loss": 0.003,
"mismatch_kl": 0.00024501114967279136,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 48,
"timing/generation_ms": 260055.6455301121,
"timing/scoring_ms": 0.0,
"timing/total_ms": 260055.6455301121,
"tokens/completion": 6038.9921875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 520.350103855133
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.008223201148211956,
"epoch": 0.06533333333333333,
"grad_norm": 0.0005775216775221585,
"importance_ratio": 1.0000702142715454,
"learning_rate": 5e-06,
"loss": -0.0,
"mismatch_kl": 0.0001139239757321775,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 49,
"timing/generation_ms": 262634.82890836895,
"timing/scoring_ms": 0.0,
"timing/total_ms": 262634.82890836895,
"tokens/completion": 6144.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 694.4226834774017
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.006501559168100357,
"epoch": 0.06666666666666667,
"grad_norm": 0.0004452247469025534,
"importance_ratio": 1.000080943107605,
"learning_rate": 5e-06,
"loss": 0.0,
"mismatch_kl": 0.00019989976135548204,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 50,
"timing/generation_ms": 252373.39910119772,
"timing/scoring_ms": 0.0,
"timing/total_ms": 252373.39910119772,
"tokens/completion": 6144.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 633.9480290412903
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.011557838879525661,
"epoch": 0.068,
"grad_norm": 0.00040538021426552616,
"importance_ratio": 1.0000510215759277,
"learning_rate": 5e-06,
"loss": 0.0163,
"mismatch_kl": 0.00014912446204107255,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 51,
"timing/generation_ms": 231235.03853101283,
"timing/scoring_ms": 0.0,
"timing/total_ms": 231235.03853101283,
"tokens/completion": 5880.91015625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 553.8161387443542
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.008280275389552116,
"epoch": 0.06933333333333333,
"grad_norm": 0.0006837160840031847,
"importance_ratio": 1.0000361204147339,
"learning_rate": 5e-06,
"loss": -0.0009,
"mismatch_kl": 0.00011032609472749755,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 52,
"timing/generation_ms": 268335.500174202,
"timing/scoring_ms": 0.0,
"timing/total_ms": 268335.500174202,
"tokens/completion": 6076.33984375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 532.5728721618652
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.008777043782174587,
"epoch": 0.07066666666666667,
"grad_norm": 0.00047547446087476704,
"importance_ratio": 1.0000946521759033,
"learning_rate": 5e-06,
"loss": -0.0,
"mismatch_kl": 0.0001269574131583795,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 53,
"timing/generation_ms": 256683.97525977343,
"timing/scoring_ms": 0.0,
"timing/total_ms": 256683.97525977343,
"tokens/completion": 6144.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 506.92905497550964
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.011497734114527702,
"epoch": 0.072,
"grad_norm": 0.00027828097052508087,
"importance_ratio": 1.000109076499939,
"learning_rate": 5e-06,
"loss": 0.0042,
"mismatch_kl": 0.00013832931290380657,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 54,
"timing/generation_ms": 245946.20873313397,
"timing/scoring_ms": 0.0,
"timing/total_ms": 245946.20873313397,
"tokens/completion": 6032.51953125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 505.11912751197815
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.010809739120304585,
"epoch": 0.07333333333333333,
"grad_norm": 0.0007032954488382401,
"importance_ratio": 1.0000889301300049,
"learning_rate": 5e-06,
"loss": 0.0,
"mismatch_kl": 0.00015762390103191137,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 55,
"timing/generation_ms": 264072.7631729096,
"timing/scoring_ms": 0.0,
"timing/total_ms": 264072.7631729096,
"tokens/completion": 6144.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 523.6702523231506
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.009559578262269497,
"epoch": 0.07466666666666667,
"grad_norm": 0.0010708393934808242,
"importance_ratio": 1.0000908374786377,
"learning_rate": 5e-06,
"loss": 0.0051,
"mismatch_kl": 0.00013747472257819027,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 56,
"timing/generation_ms": 250449.08253196627,
"timing/scoring_ms": 0.0,
"timing/total_ms": 250449.08253196627,
"tokens/completion": 6098.72265625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 556.8832399845123
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.011213499121367931,
"epoch": 0.076,
"grad_norm": 0.00044938202555849837,
"importance_ratio": 1.0000908374786377,
"learning_rate": 5e-06,
"loss": 0.0,
"mismatch_kl": 0.00015059650468174368,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 57,
"timing/generation_ms": 263455.5452140048,
"timing/scoring_ms": 0.0,
"timing/total_ms": 263455.5452140048,
"tokens/completion": 6144.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 632.40900182724
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.005081878509372473,
"epoch": 0.07733333333333334,
"grad_norm": 0.0003246328757380694,
"importance_ratio": 1.0000656843185425,
"learning_rate": 5e-06,
"loss": 0.0,
"mismatch_kl": 0.00019094608433078974,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 58,
"timing/generation_ms": 256806.45045358688,
"timing/scoring_ms": 0.0,
"timing/total_ms": 256806.45045358688,
"tokens/completion": 6144.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 503.00778365135193
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.01805613562464714,
"epoch": 0.07866666666666666,
"grad_norm": 0.0007634702119519025,
"importance_ratio": 1.0001803636550903,
"learning_rate": 5e-06,
"loss": 0.0025,
"mismatch_kl": 0.00021581076725851744,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 59,
"timing/generation_ms": 254470.52423935384,
"timing/scoring_ms": 0.0,
"timing/total_ms": 254470.52423935384,
"tokens/completion": 6079.921875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 499.350706577301
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.047813381999731064,
"epoch": 0.08,
"grad_norm": 0.0034811244478141165,
"importance_ratio": 1.0005850791931152,
"learning_rate": 5e-06,
"loss": 0.0385,
"mismatch_kl": 0.0006162600475363433,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 60,
"timing/generation_ms": 122059.79803204536,
"timing/scoring_ms": 0.0,
"timing/total_ms": 122059.79803204536,
"tokens/completion": 4056.4140625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 406.85777831077576
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.006517002824693918,
"epoch": 0.08133333333333333,
"grad_norm": 0.00045405486723584484,
"importance_ratio": 1.0000643730163574,
"learning_rate": 5e-06,
"loss": 0.0,
"mismatch_kl": 8.087344031082466e-05,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 61,
"timing/generation_ms": 262080.00864181668,
"timing/scoring_ms": 0.0,
"timing/total_ms": 262080.00864181668,
"tokens/completion": 6144.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 513.6219637393951
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.0059960088692605495,
"epoch": 0.08266666666666667,
"grad_norm": 0.0003004741817689029,
"importance_ratio": 1.0000419616699219,
"learning_rate": 5e-06,
"loss": 0.0,
"mismatch_kl": 7.99954796093516e-05,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 62,
"timing/generation_ms": 261857.35533758998,
"timing/scoring_ms": 0.0,
"timing/total_ms": 261857.35533758998,
"tokens/completion": 6144.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 537.6526563167572
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.012267248705029488,
"epoch": 0.084,
"grad_norm": 0.0018275298082432536,
"importance_ratio": 1.0001516342163086,
"learning_rate": 5e-06,
"loss": 0.0273,
"mismatch_kl": 0.00015860867279116064,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 63,
"timing/generation_ms": 223553.63579373807,
"timing/scoring_ms": 0.0,
"timing/total_ms": 223553.63579373807,
"tokens/completion": 5578.8046875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 449.565260887146
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.017613664269447327,
"epoch": 0.08533333333333333,
"grad_norm": 0.0013137454797814432,
"importance_ratio": 1.0001808404922485,
"learning_rate": 5e-06,
"loss": 0.0296,
"mismatch_kl": 0.00018238124903291464,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 64,
"timing/generation_ms": 197715.4450826347,
"timing/scoring_ms": 0.0,
"timing/total_ms": 197715.4450826347,
"tokens/completion": 5301.74609375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 467.5368883609772
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.012099393643438816,
"epoch": 0.08666666666666667,
"grad_norm": 0.00029163323031709923,
"importance_ratio": 1.0000910758972168,
"learning_rate": 5e-06,
"loss": 0.0101,
"mismatch_kl": 0.0001367869263049215,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 65,
"timing/generation_ms": 253292.40265209228,
"timing/scoring_ms": 0.0,
"timing/total_ms": 253292.40265209228,
"tokens/completion": 5987.40234375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 504.62310814857483
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.016472794115543365,
"epoch": 0.088,
"grad_norm": 0.000537146473230196,
"importance_ratio": 1.0002104043960571,
"learning_rate": 5e-06,
"loss": 0.0046,
"mismatch_kl": 0.00019632629118859768,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 66,
"timing/generation_ms": 244626.61108747125,
"timing/scoring_ms": 0.0,
"timing/total_ms": 244626.61108747125,
"tokens/completion": 5880.29296875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 546.9820070266724
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.03573580086231232,
"epoch": 0.08933333333333333,
"grad_norm": 0.0018214337047260279,
"importance_ratio": 1.0006996393203735,
"learning_rate": 5e-06,
"loss": 0.0366,
"mismatch_kl": 0.0005711132544092834,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 67,
"timing/generation_ms": 171141.10032841563,
"timing/scoring_ms": 0.0,
"timing/total_ms": 171141.10032841563,
"tokens/completion": 4912.99609375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 387.35487270355225
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.024245120584964752,
"epoch": 0.09066666666666667,
"grad_norm": 0.0007171125744050383,
"importance_ratio": 1.0004810094833374,
"learning_rate": 5e-06,
"loss": 0.0327,
"mismatch_kl": 0.0003458830469753593,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 68,
"timing/generation_ms": 175763.37515283376,
"timing/scoring_ms": 0.0,
"timing/total_ms": 175763.37515283376,
"tokens/completion": 5039.39453125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 399.21359062194824
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.025269493460655212,
"epoch": 0.092,
"grad_norm": 0.0004443143666122359,
"importance_ratio": 1.000417947769165,
"learning_rate": 5e-06,
"loss": 0.0151,
"mismatch_kl": 0.000321421044645831,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 69,
"timing/generation_ms": 250666.16093274206,
"timing/scoring_ms": 0.0,
"timing/total_ms": 250666.16093274206,
"tokens/completion": 5965.16796875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 632.227735042572
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.024551477283239365,
"epoch": 0.09333333333333334,
"grad_norm": 0.0015252781439401258,
"importance_ratio": 1.0006314516067505,
"learning_rate": 5e-06,
"loss": 0.0348,
"mismatch_kl": 0.0005003436817787588,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 70,
"timing/generation_ms": 191529.1232522577,
"timing/scoring_ms": 0.0,
"timing/total_ms": 191529.1232522577,
"tokens/completion": 5294.87890625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 407.7219111919403
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.02018953487277031,
"epoch": 0.09466666666666666,
"grad_norm": 0.0011570903491081794,
"importance_ratio": 1.0002988576889038,
"learning_rate": 5e-06,
"loss": 0.0237,
"mismatch_kl": 0.00033742599771358073,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 71,
"timing/generation_ms": 210619.99121960253,
"timing/scoring_ms": 0.0,
"timing/total_ms": 210619.99121960253,
"tokens/completion": 5332.65625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 494.4582040309906
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.08414055407047272,
"epoch": 0.096,
"grad_norm": 0.005691985408928669,
"importance_ratio": 1.002629280090332,
"learning_rate": 5e-06,
"loss": 0.0631,
"mismatch_kl": 0.0030276263132691383,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 72,
"timing/generation_ms": 26491.081130690873,
"timing/scoring_ms": 0.0,
"timing/total_ms": 26491.081130690873,
"tokens/completion": 1684.4921875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 175.0816376209259
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.08044799417257309,
"epoch": 0.09733333333333333,
"grad_norm": 0.0067108539111987095,
"importance_ratio": 1.0022099018096924,
"learning_rate": 5e-06,
"loss": 0.0512,
"mismatch_kl": 0.0033263727091252804,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 73,
"timing/generation_ms": 26663.206906057894,
"timing/scoring_ms": 0.0,
"timing/total_ms": 26663.206906057894,
"tokens/completion": 1624.47265625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 179.0183322429657
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.14499743282794952,
"epoch": 0.09866666666666667,
"grad_norm": 0.010377228969329702,
"importance_ratio": 1.0045510530471802,
"learning_rate": 5e-06,
"loss": 0.0301,
"mismatch_kl": 0.03058871254324913,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 74,
"timing/generation_ms": 11363.965434022248,
"timing/scoring_ms": 0.0,
"timing/total_ms": 11363.965434022248,
"tokens/completion": 733.40234375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 162.93997645378113
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.15485742688179016,
"epoch": 0.1,
"grad_norm": 0.037501291580980145,
"importance_ratio": 1.0262236595153809,
"learning_rate": 5e-06,
"loss": 0.0478,
"mismatch_kl": 0.5780022144317627,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 75,
"timing/generation_ms": 31973.80775306374,
"timing/scoring_ms": 0.0,
"timing/total_ms": 31973.80775306374,
"tokens/completion": 1854.69921875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 206.36020827293396
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.2712324857711792,
"epoch": 0.10133333333333333,
"grad_norm": 0.021496155900656944,
"importance_ratio": 0.747008204460144,
"learning_rate": 5e-06,
"loss": -0.001,
"mismatch_kl": 4.077150344848633,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 76,
"timing/generation_ms": 19520.673436112702,
"timing/scoring_ms": 0.0,
"timing/total_ms": 19520.673436112702,
"tokens/completion": 1019.1015625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 104.34236979484558
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.3253353238105774,
"epoch": 0.10266666666666667,
"grad_norm": 0.014127787785753907,
"importance_ratio": 0.5209717154502869,
"learning_rate": 5e-06,
"loss": 0.0074,
"mismatch_kl": 11.41779899597168,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 77,
"timing/generation_ms": 33620.65821047872,
"timing/scoring_ms": 0.0,
"timing/total_ms": 33620.65821047872,
"tokens/completion": 1925.72265625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 257.44123911857605
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.3993019163608551,
"epoch": 0.104,
"grad_norm": 0.009151033649610016,
"importance_ratio": 0.29432952404022217,
"learning_rate": 5e-06,
"loss": 0.0157,
"mismatch_kl": 11.372162818908691,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 78,
"timing/generation_ms": 11082.484270446002,
"timing/scoring_ms": 0.0,
"timing/total_ms": 11082.484270446002,
"tokens/completion": 828.0546875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 115.73264193534851
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.1168494001030922,
"epoch": 0.10533333333333333,
"grad_norm": 0.006117265962728229,
"importance_ratio": 0.1935732513666153,
"learning_rate": 5e-06,
"loss": -0.0017,
"mismatch_kl": 21.00209617614746,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 79,
"timing/generation_ms": 36345.630533993244,
"timing/scoring_ms": 0.0,
"timing/total_ms": 36345.630533993244,
"tokens/completion": 2084.80859375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 232.0772545337677
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.4559866189956665,
"epoch": 0.10666666666666667,
"grad_norm": 0.02899073922789841,
"importance_ratio": 0.9647712111473083,
"learning_rate": 5e-06,
"loss": -0.0109,
"mismatch_kl": 0.1562381535768509,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 80,
"timing/generation_ms": 3708.529833704233,
"timing/scoring_ms": 0.0,
"timing/total_ms": 3708.529833704233,
"tokens/completion": 172.21484375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 79.40927720069885
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.18381687998771667,
"epoch": 0.108,
"grad_norm": 0.03870938318729351,
"importance_ratio": 0.9867123365402222,
"learning_rate": 5e-06,
"loss": 0.0003,
"mismatch_kl": 0.09630821645259857,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 81,
"timing/generation_ms": 7641.556458547711,
"timing/scoring_ms": 0.0,
"timing/total_ms": 7641.556458547711,
"tokens/completion": 342.55078125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 28.48853635787964
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.28385868668556213,
"epoch": 0.10933333333333334,
"grad_norm": 0.024463462093216322,
"importance_ratio": 0.9961410760879517,
"learning_rate": 5e-06,
"loss": -0.0027,
"mismatch_kl": 0.046350929886102676,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 82,
"timing/generation_ms": 14151.478135958314,
"timing/scoring_ms": 0.0,
"timing/total_ms": 14151.478135958314,
"tokens/completion": 640.5703125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 51.07678151130676
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.25215646624565125,
"epoch": 0.11066666666666666,
"grad_norm": 0.025956305888591907,
"importance_ratio": 0.9893953204154968,
"learning_rate": 5e-06,
"loss": 0.0024,
"mismatch_kl": 0.06097816303372383,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 83,
"timing/generation_ms": 9802.852495573461,
"timing/scoring_ms": 0.0,
"timing/total_ms": 9802.852495573461,
"tokens/completion": 486.23828125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 135.5597288608551
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.18832416832447052,
"epoch": 0.112,
"grad_norm": 0.05162272724580775,
"importance_ratio": 0.9795369505882263,
"learning_rate": 5e-06,
"loss": -0.0063,
"mismatch_kl": 0.09001336991786957,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 84,
"timing/generation_ms": 8744.545813649893,
"timing/scoring_ms": 0.0,
"timing/total_ms": 8744.545813649893,
"tokens/completion": 422.9921875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 202.02377605438232
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.20757851004600525,
"epoch": 0.11333333333333333,
"grad_norm": 0.029849605436009424,
"importance_ratio": 0.9847032427787781,
"learning_rate": 5e-06,
"loss": 0.0003,
"mismatch_kl": 0.08596009016036987,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 85,
"timing/generation_ms": 6916.043497622013,
"timing/scoring_ms": 0.0,
"timing/total_ms": 6916.043497622013,
"tokens/completion": 315.65625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 26.646199941635132
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.19341044127941132,
"epoch": 0.11466666666666667,
"grad_norm": 0.023761811444065736,
"importance_ratio": 0.9906992316246033,
"learning_rate": 5e-06,
"loss": -0.0037,
"mismatch_kl": 0.04626338183879852,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 86,
"timing/generation_ms": 10513.352582231164,
"timing/scoring_ms": 0.0,
"timing/total_ms": 10513.352582231164,
"tokens/completion": 565.625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 43.092281341552734
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.2988993227481842,
"epoch": 0.116,
"grad_norm": 0.08410779443510906,
"importance_ratio": 0.9899005889892578,
"learning_rate": 5e-06,
"loss": -0.0182,
"mismatch_kl": 0.048949241638183594,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 87,
"timing/generation_ms": 6337.426606565714,
"timing/scoring_ms": 0.0,
"timing/total_ms": 6337.426606565714,
"tokens/completion": 288.53125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 29.87082028388977
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.24379415810108185,
"epoch": 0.11733333333333333,
"grad_norm": 0.033951546211805725,
"importance_ratio": 0.9842061996459961,
"learning_rate": 5e-06,
"loss": -0.001,
"mismatch_kl": 0.05609630420804024,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 88,
"timing/generation_ms": 12948.228243738413,
"timing/scoring_ms": 0.0,
"timing/total_ms": 12948.228243738413,
"tokens/completion": 572.8359375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 158.39017939567566
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.46492651104927063,
"epoch": 0.11866666666666667,
"grad_norm": 0.05385249484621595,
"importance_ratio": 0.9755511283874512,
"learning_rate": 5e-06,
"loss": 0.0005,
"mismatch_kl": 0.16615039110183716,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 89,
"timing/generation_ms": 11146.457904949784,
"timing/scoring_ms": 0.0,
"timing/total_ms": 11146.457904949784,
"tokens/completion": 531.22265625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 125.18756413459778
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.26703542470932007,
"epoch": 0.12,
"grad_norm": 0.02367206113805114,
"importance_ratio": 0.9910291433334351,
"learning_rate": 5e-06,
"loss": -0.0072,
"mismatch_kl": 0.041237972676754,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 90,
"timing/generation_ms": 12296.578384935856,
"timing/scoring_ms": 0.0,
"timing/total_ms": 12296.578384935856,
"tokens/completion": 619.4375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 123.89916157722473
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.5690855979919434,
"epoch": 0.12133333333333333,
"grad_norm": 0.030434949636985786,
"importance_ratio": 0.9436249136924744,
"learning_rate": 5e-06,
"loss": 0.0044,
"mismatch_kl": 0.4027661979198456,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 91,
"timing/generation_ms": 17300.37511046976,
"timing/scoring_ms": 0.0,
"timing/total_ms": 17300.37511046976,
"tokens/completion": 803.75,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 68.73723077774048
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.2795153260231018,
"epoch": 0.12266666666666666,
"grad_norm": 0.033606081779905164,
"importance_ratio": 0.9910190105438232,
"learning_rate": 5e-06,
"loss": -0.0021,
"mismatch_kl": 0.048360757529735565,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 92,
"timing/generation_ms": 10146.174241788685,
"timing/scoring_ms": 0.0,
"timing/total_ms": 10146.174241788685,
"tokens/completion": 409.20703125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 115.50342917442322
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.25254565477371216,
"epoch": 0.124,
"grad_norm": 0.02170917112603325,
"importance_ratio": 0.9928799867630005,
"learning_rate": 5e-06,
"loss": 0.0035,
"mismatch_kl": 0.03083646297454834,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 93,
"timing/generation_ms": 14904.53880932182,
"timing/scoring_ms": 0.0,
"timing/total_ms": 14904.53880932182,
"tokens/completion": 689.578125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 136.12913012504578
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.25149497389793396,
"epoch": 0.12533333333333332,
"grad_norm": 0.049807356598740776,
"importance_ratio": 0.990451455116272,
"learning_rate": 5e-06,
"loss": -0.0058,
"mismatch_kl": 0.03808113560080528,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 94,
"timing/generation_ms": 8459.820285439491,
"timing/scoring_ms": 0.0,
"timing/total_ms": 8459.820285439491,
"tokens/completion": 413.421875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 34.11598253250122
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.21991755068302155,
"epoch": 0.12666666666666668,
"grad_norm": 0.02577498970131942,
"importance_ratio": 0.9890254139900208,
"learning_rate": 5e-06,
"loss": -0.0012,
"mismatch_kl": 0.05755931884050369,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 95,
"timing/generation_ms": 5316.206902265549,
"timing/scoring_ms": 0.0,
"timing/total_ms": 5316.206902265549,
"tokens/completion": 254.72265625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 129.7372589111328
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.4472619593143463,
"epoch": 0.128,
"grad_norm": 0.040975406412791814,
"importance_ratio": 0.9842396378517151,
"learning_rate": 5e-06,
"loss": -0.003,
"mismatch_kl": 0.14270469546318054,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 96,
"timing/generation_ms": 6448.528000153601,
"timing/scoring_ms": 0.0,
"timing/total_ms": 6448.528000153601,
"tokens/completion": 303.2421875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 152.90578722953796
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.24905133247375488,
"epoch": 0.12933333333333333,
"grad_norm": 0.0336787422018486,
"importance_ratio": 0.9942489862442017,
"learning_rate": 5e-06,
"loss": -0.0073,
"mismatch_kl": 0.03845536336302757,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 97,
"timing/generation_ms": 10672.863409854472,
"timing/scoring_ms": 0.0,
"timing/total_ms": 10672.863409854472,
"tokens/completion": 522.453125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 58.958009481430054
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.37947529554367065,
"epoch": 0.13066666666666665,
"grad_norm": 0.03256153448253783,
"importance_ratio": 0.9943234324455261,
"learning_rate": 5e-06,
"loss": -0.0033,
"mismatch_kl": 0.0457632839679718,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 98,
"timing/generation_ms": 7797.16813378036,
"timing/scoring_ms": 0.0,
"timing/total_ms": 7797.16813378036,
"tokens/completion": 321.6484375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 57.01115918159485
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.3295568525791168,
"epoch": 0.132,
"grad_norm": 0.025070691541196687,
"importance_ratio": 0.9886187314987183,
"learning_rate": 5e-06,
"loss": 0.002,
"mismatch_kl": 0.055542413145303726,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 99,
"timing/generation_ms": 12520.016725175083,
"timing/scoring_ms": 0.0,
"timing/total_ms": 12520.016725175083,
"tokens/completion": 560.515625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 134.89474534988403
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.3819415867328644,
"epoch": 0.13333333333333333,
"grad_norm": 0.029430906337480585,
"importance_ratio": 0.9973994493484497,
"learning_rate": 5e-06,
"loss": 0.0014,
"mismatch_kl": 0.03809521347284317,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 100,
"timing/generation_ms": 7522.873256355524,
"timing/scoring_ms": 0.0,
"timing/total_ms": 7522.873256355524,
"tokens/completion": 381.24609375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 42.47270226478577
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.3258141577243805,
"epoch": 0.13466666666666666,
"grad_norm": 0.06302493851707891,
"importance_ratio": 0.995746374130249,
"learning_rate": 5e-06,
"loss": -0.0032,
"mismatch_kl": 0.05126583203673363,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 101,
"timing/generation_ms": 6897.25607726723,
"timing/scoring_ms": 0.0,
"timing/total_ms": 6897.25607726723,
"tokens/completion": 331.53515625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 101.3789484500885
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.8970124125480652,
"epoch": 0.136,
"grad_norm": 0.03515811902568956,
"importance_ratio": 0.8364270925521851,
"learning_rate": 5e-06,
"loss": 0.0067,
"mismatch_kl": 1.5947057008743286,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 102,
"timing/generation_ms": 12960.892278701067,
"timing/scoring_ms": 0.0,
"timing/total_ms": 12960.892278701067,
"tokens/completion": 679.25390625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 92.91760039329529
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.2645859122276306,
"epoch": 0.13733333333333334,
"grad_norm": 0.03015986556668391,
"importance_ratio": 0.9922869205474854,
"learning_rate": 5e-06,
"loss": -0.0033,
"mismatch_kl": 0.032752275466918945,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 103,
"timing/generation_ms": 12081.96578361094,
"timing/scoring_ms": 0.0,
"timing/total_ms": 12081.96578361094,
"tokens/completion": 635.26171875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 221.86856937408447
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.40493857860565186,
"epoch": 0.13866666666666666,
"grad_norm": 0.029340951142688608,
"importance_ratio": 0.9976834058761597,
"learning_rate": 5e-06,
"loss": -0.0075,
"mismatch_kl": 0.039802681654691696,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 104,
"timing/generation_ms": 8452.124254778028,
"timing/scoring_ms": 0.0,
"timing/total_ms": 8452.124254778028,
"tokens/completion": 392.85546875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 78.09920930862427
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.42381417751312256,
"epoch": 0.14,
"grad_norm": 0.03251134797029109,
"importance_ratio": 0.9939345121383667,
"learning_rate": 5e-06,
"loss": -0.0025,
"mismatch_kl": 0.045791786164045334,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 105,
"timing/generation_ms": 11178.499449044466,
"timing/scoring_ms": 0.0,
"timing/total_ms": 11178.499449044466,
"tokens/completion": 480.08984375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 36.62562108039856
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.2739037871360779,
"epoch": 0.14133333333333334,
"grad_norm": 0.0476499263024248,
"importance_ratio": 0.9929625988006592,
"learning_rate": 5e-06,
"loss": -0.0024,
"mismatch_kl": 0.036298882216215134,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 106,
"timing/generation_ms": 10698.151003569365,
"timing/scoring_ms": 0.0,
"timing/total_ms": 10698.151003569365,
"tokens/completion": 521.33203125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 130.2317771911621
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.2831694781780243,
"epoch": 0.14266666666666666,
"grad_norm": 0.048559683162439526,
"importance_ratio": 0.9895249605178833,
"learning_rate": 5e-06,
"loss": -0.0018,
"mismatch_kl": 0.04853809252381325,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 107,
"timing/generation_ms": 10670.390761457384,
"timing/scoring_ms": 0.0,
"timing/total_ms": 10670.390761457384,
"tokens/completion": 504.16015625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 154.62130737304688
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.4273696541786194,
"epoch": 0.144,
"grad_norm": 0.04246003800252577,
"importance_ratio": 0.9897579550743103,
"learning_rate": 5e-06,
"loss": -0.0004,
"mismatch_kl": 0.05487997457385063,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 108,
"timing/generation_ms": 5255.264617502689,
"timing/scoring_ms": 0.0,
"timing/total_ms": 5255.264617502689,
"tokens/completion": 253.4296875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 62.357131004333496
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.3309624195098877,
"epoch": 0.14533333333333334,
"grad_norm": 0.020612894864024223,
"importance_ratio": 0.994171679019928,
"learning_rate": 5e-06,
"loss": 0.004,
"mismatch_kl": 0.028750188648700714,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 109,
"timing/generation_ms": 17462.82579470426,
"timing/scoring_ms": 0.0,
"timing/total_ms": 17462.82579470426,
"tokens/completion": 909.28515625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 84.52479147911072
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.45720767974853516,
"epoch": 0.14666666666666667,
"grad_norm": 0.048825755999723545,
"importance_ratio": 0.9917762279510498,
"learning_rate": 5e-06,
"loss": -0.003,
"mismatch_kl": 0.03884867951273918,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 110,
"timing/generation_ms": 10527.64255553484,
"timing/scoring_ms": 0.0,
"timing/total_ms": 10527.64255553484,
"tokens/completion": 457.21875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 118.98395490646362
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.3448692262172699,
"epoch": 0.148,
"grad_norm": 0.02391536511668303,
"importance_ratio": 0.9938703775405884,
"learning_rate": 5e-06,
"loss": -0.0118,
"mismatch_kl": 0.03092486597597599,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 111,
"timing/generation_ms": 11426.006315276027,
"timing/scoring_ms": 0.0,
"timing/total_ms": 11426.006315276027,
"tokens/completion": 603.828125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 44.38506889343262
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.4210182726383209,
"epoch": 0.14933333333333335,
"grad_norm": 0.017744426750614804,
"importance_ratio": 0.9841266870498657,
"learning_rate": 5e-06,
"loss": 0.0031,
"mismatch_kl": 0.15376684069633484,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 112,
"timing/generation_ms": 15345.524672418833,
"timing/scoring_ms": 0.0,
"timing/total_ms": 15345.524672418833,
"tokens/completion": 679.61328125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 77.3697247505188
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.3890233039855957,
"epoch": 0.15066666666666667,
"grad_norm": 0.042319164028374844,
"importance_ratio": 0.9905653595924377,
"learning_rate": 5e-06,
"loss": -0.0067,
"mismatch_kl": 0.03776917979121208,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 113,
"timing/generation_ms": 8361.73670180142,
"timing/scoring_ms": 0.0,
"timing/total_ms": 8361.73670180142,
"tokens/completion": 386.69921875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 33.98000693321228
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.3506433963775635,
"epoch": 0.152,
"grad_norm": 0.022347419652582003,
"importance_ratio": 0.9932938814163208,
"learning_rate": 5e-06,
"loss": -0.0024,
"mismatch_kl": 0.03900053724646568,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 114,
"timing/generation_ms": 10107.008518651128,
"timing/scoring_ms": 0.0,
"timing/total_ms": 10107.008518651128,
"tokens/completion": 531.8671875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 156.0705955028534
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.28853052854537964,
"epoch": 0.15333333333333332,
"grad_norm": 0.02467560875646059,
"importance_ratio": 0.9956313967704773,
"learning_rate": 5e-06,
"loss": -0.0077,
"mismatch_kl": 0.021128181368112564,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 115,
"timing/generation_ms": 13438.352120108902,
"timing/scoring_ms": 0.0,
"timing/total_ms": 13438.352120108902,
"tokens/completion": 638.3359375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 51.55745196342468
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.5352842807769775,
"epoch": 0.15466666666666667,
"grad_norm": 0.0500581678773726,
"importance_ratio": 0.9921436905860901,
"learning_rate": 5e-06,
"loss": -0.0035,
"mismatch_kl": 0.0745246633887291,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 116,
"timing/generation_ms": 6379.514851607382,
"timing/scoring_ms": 0.0,
"timing/total_ms": 6379.514851607382,
"tokens/completion": 304.5625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 38.366251945495605
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.36106666922569275,
"epoch": 0.156,
"grad_norm": 0.063234851546128,
"importance_ratio": 0.9977811574935913,
"learning_rate": 5e-06,
"loss": -0.0007,
"mismatch_kl": 0.029981082305312157,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 117,
"timing/generation_ms": 7269.031744450331,
"timing/scoring_ms": 0.0,
"timing/total_ms": 7269.031744450331,
"tokens/completion": 359.06640625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 27.440030097961426
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.3367100656032562,
"epoch": 0.15733333333333333,
"grad_norm": 0.059808565066134974,
"importance_ratio": 0.988777220249176,
"learning_rate": 5e-06,
"loss": -0.0044,
"mismatch_kl": 0.044747766107320786,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 118,
"timing/generation_ms": 9969.640973955393,
"timing/scoring_ms": 0.0,
"timing/total_ms": 9969.640973955393,
"tokens/completion": 485.625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 82.32884860038757
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.35986092686653137,
"epoch": 0.15866666666666668,
"grad_norm": 0.020285418443392603,
"importance_ratio": 0.9924752116203308,
"learning_rate": 5e-06,
"loss": 0.0042,
"mismatch_kl": 0.031399309635162354,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 119,
"timing/generation_ms": 15499.55965206027,
"timing/scoring_ms": 0.0,
"timing/total_ms": 15499.55965206027,
"tokens/completion": 796.76171875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 56.515456199645996
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.40867432951927185,
"epoch": 0.16,
"grad_norm": 0.018892048843934344,
"importance_ratio": 0.9954840540885925,
"learning_rate": 5e-06,
"loss": -0.0094,
"mismatch_kl": 0.030410781502723694,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 120,
"timing/generation_ms": 13046.93166166544,
"timing/scoring_ms": 0.0,
"timing/total_ms": 13046.93166166544,
"tokens/completion": 672.06640625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 51.22301483154297
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.25322413444519043,
"epoch": 0.16133333333333333,
"grad_norm": 0.019402594506856746,
"importance_ratio": 0.9968504309654236,
"learning_rate": 5e-06,
"loss": -0.0018,
"mismatch_kl": 0.020855166018009186,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 121,
"timing/generation_ms": 33212.274321354926,
"timing/scoring_ms": 0.0,
"timing/total_ms": 33212.274321354926,
"tokens/completion": 1494.39453125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 196.6885223388672
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.3203243613243103,
"epoch": 0.16266666666666665,
"grad_norm": 0.016032102577421704,
"importance_ratio": 0.9980469942092896,
"learning_rate": 5e-06,
"loss": -0.0013,
"mismatch_kl": 0.01909617707133293,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 122,
"timing/generation_ms": 21461.640139110386,
"timing/scoring_ms": 0.0,
"timing/total_ms": 21461.640139110386,
"tokens/completion": 1059.1953125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 84.59676575660706
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.5010811686515808,
"epoch": 0.164,
"grad_norm": 0.02848759848639813,
"importance_ratio": 1.0016131401062012,
"learning_rate": 5e-06,
"loss": -0.0097,
"mismatch_kl": 0.02760869450867176,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 123,
"timing/generation_ms": 9319.45723388344,
"timing/scoring_ms": 0.0,
"timing/total_ms": 9319.45723388344,
"tokens/completion": 433.1015625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 36.64540505409241
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.4700590968132019,
"epoch": 0.16533333333333333,
"grad_norm": 0.025031920446653932,
"importance_ratio": 0.9973174929618835,
"learning_rate": 5e-06,
"loss": -0.0072,
"mismatch_kl": 0.03977029770612717,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 124,
"timing/generation_ms": 9967.066356912255,
"timing/scoring_ms": 0.0,
"timing/total_ms": 9967.066356912255,
"tokens/completion": 478.1328125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 137.7500193119049
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.4410494863986969,
"epoch": 0.16666666666666666,
"grad_norm": 0.02102977498791798,
"importance_ratio": 0.9927030801773071,
"learning_rate": 5e-06,
"loss": -0.0044,
"mismatch_kl": 0.05027690902352333,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 125,
"timing/generation_ms": 13226.7307927832,
"timing/scoring_ms": 0.0,
"timing/total_ms": 13226.7307927832,
"tokens/completion": 666.65234375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 119.67769002914429
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.2941017150878906,
"epoch": 0.168,
"grad_norm": 0.01764622195762912,
"importance_ratio": 0.9970736503601074,
"learning_rate": 5e-06,
"loss": -0.0039,
"mismatch_kl": 0.025975050404667854,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 126,
"timing/generation_ms": 30093.59989501536,
"timing/scoring_ms": 0.0,
"timing/total_ms": 30093.59989501536,
"tokens/completion": 1403.23046875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 111.32783484458923
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.37822288274765015,
"epoch": 0.16933333333333334,
"grad_norm": 0.03205413439415866,
"importance_ratio": 0.9921689629554749,
"learning_rate": 5e-06,
"loss": -0.0015,
"mismatch_kl": 0.10021175444126129,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 127,
"timing/generation_ms": 25918.55046711862,
"timing/scoring_ms": 0.0,
"timing/total_ms": 25918.55046711862,
"tokens/completion": 1132.37890625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 101.07530164718628
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.46506795287132263,
"epoch": 0.17066666666666666,
"grad_norm": 0.026459518059964743,
"importance_ratio": 0.995638906955719,
"learning_rate": 5e-06,
"loss": -0.0065,
"mismatch_kl": 0.03533043712377548,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 128,
"timing/generation_ms": 8870.356048457325,
"timing/scoring_ms": 0.0,
"timing/total_ms": 8870.356048457325,
"tokens/completion": 477.8046875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 41.62081718444824
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.4616319537162781,
"epoch": 0.172,
"grad_norm": 0.029689428333274717,
"importance_ratio": 0.992743194103241,
"learning_rate": 5e-06,
"loss": -0.0116,
"mismatch_kl": 0.043640002608299255,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 129,
"timing/generation_ms": 17582.845278084278,
"timing/scoring_ms": 0.0,
"timing/total_ms": 17582.845278084278,
"tokens/completion": 896.60546875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 103.23663401603699
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.318230539560318,
"epoch": 0.17333333333333334,
"grad_norm": 0.021848886677287266,
"importance_ratio": 1.0002652406692505,
"learning_rate": 5e-06,
"loss": -0.0028,
"mismatch_kl": 0.032250385731458664,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 130,
"timing/generation_ms": 12501.79857108742,
"timing/scoring_ms": 0.0,
"timing/total_ms": 12501.79857108742,
"tokens/completion": 636.82421875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 46.11354732513428
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.2867668569087982,
"epoch": 0.17466666666666666,
"grad_norm": 0.0152850963716213,
"importance_ratio": 0.9975439310073853,
"learning_rate": 5e-06,
"loss": 0.0004,
"mismatch_kl": 0.03095307946205139,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 131,
"timing/generation_ms": 21872.447106055915,
"timing/scoring_ms": 0.0,
"timing/total_ms": 21872.447106055915,
"tokens/completion": 1016.09765625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 75.5360016822815
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.6940531134605408,
"epoch": 0.176,
"grad_norm": 0.027620607135447624,
"importance_ratio": 0.9872549176216125,
"learning_rate": 5e-06,
"loss": 0.0013,
"mismatch_kl": 0.14033383131027222,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 132,
"timing/generation_ms": 11405.475388281047,
"timing/scoring_ms": 0.0,
"timing/total_ms": 11405.475388281047,
"tokens/completion": 487.51953125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 55.63127040863037
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.36297503113746643,
"epoch": 0.17733333333333334,
"grad_norm": 0.029171908888413572,
"importance_ratio": 0.9953750967979431,
"learning_rate": 5e-06,
"loss": -0.0051,
"mismatch_kl": 0.035398464649915695,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 133,
"timing/generation_ms": 17919.221241027117,
"timing/scoring_ms": 0.0,
"timing/total_ms": 17919.221241027117,
"tokens/completion": 900.453125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 170.36363244056702
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.2939022481441498,
"epoch": 0.17866666666666667,
"grad_norm": 0.02565678896444847,
"importance_ratio": 0.99770587682724,
"learning_rate": 5e-06,
"loss": -0.0013,
"mismatch_kl": 0.019702836871147156,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 134,
"timing/generation_ms": 26027.854280546308,
"timing/scoring_ms": 0.0,
"timing/total_ms": 26027.854280546308,
"tokens/completion": 1189.94921875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 208.00876903533936
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.27582186460494995,
"epoch": 0.18,
"grad_norm": 0.025995432419046362,
"importance_ratio": 0.9993173480033875,
"learning_rate": 5e-06,
"loss": 0.0001,
"mismatch_kl": 0.023949675261974335,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 135,
"timing/generation_ms": 19027.399071492255,
"timing/scoring_ms": 0.0,
"timing/total_ms": 19027.399071492255,
"tokens/completion": 910.98828125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 69.73441195487976
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.31243762373924255,
"epoch": 0.18133333333333335,
"grad_norm": 0.021978924242567442,
"importance_ratio": 0.9992286562919617,
"learning_rate": 5e-06,
"loss": -0.0016,
"mismatch_kl": 0.024040305987000465,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 136,
"timing/generation_ms": 14330.211003310978,
"timing/scoring_ms": 0.0,
"timing/total_ms": 14330.211003310978,
"tokens/completion": 671.7265625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 53.44596743583679
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.24772067368030548,
"epoch": 0.18266666666666667,
"grad_norm": 0.022707662268209423,
"importance_ratio": 0.9990280866622925,
"learning_rate": 5e-06,
"loss": -0.0023,
"mismatch_kl": 0.022532925009727478,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 137,
"timing/generation_ms": 35249.2256751284,
"timing/scoring_ms": 0.0,
"timing/total_ms": 35249.2256751284,
"tokens/completion": 1598.390625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 216.32258987426758
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.38041970133781433,
"epoch": 0.184,
"grad_norm": 0.046110003811864524,
"importance_ratio": 0.9846709370613098,
"learning_rate": 5e-06,
"loss": -0.0024,
"mismatch_kl": 0.1807573288679123,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 138,
"timing/generation_ms": 10808.89296438545,
"timing/scoring_ms": 0.0,
"timing/total_ms": 10808.89296438545,
"tokens/completion": 505.0625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 72.23299145698547
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.3362736403942108,
"epoch": 0.18533333333333332,
"grad_norm": 0.057037876570506886,
"importance_ratio": 0.9923868179321289,
"learning_rate": 5e-06,
"loss": 0.0033,
"mismatch_kl": 0.0626266598701477,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 139,
"timing/generation_ms": 8591.852098703384,
"timing/scoring_ms": 0.0,
"timing/total_ms": 8591.852098703384,
"tokens/completion": 445.6875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 83.33036708831787
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.2598806321620941,
"epoch": 0.18666666666666668,
"grad_norm": 0.021433898880701664,
"importance_ratio": 0.9913464784622192,
"learning_rate": 5e-06,
"loss": 0.0022,
"mismatch_kl": 0.04193839803338051,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 140,
"timing/generation_ms": 22836.472398601472,
"timing/scoring_ms": 0.0,
"timing/total_ms": 22836.472398601472,
"tokens/completion": 1069.79296875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 123.7300488948822
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.27669745683670044,
"epoch": 0.188,
"grad_norm": 0.040287051430344514,
"importance_ratio": 0.9890030026435852,
"learning_rate": 5e-06,
"loss": 0.0006,
"mismatch_kl": 0.03683684393763542,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 141,
"timing/generation_ms": 22967.52266585827,
"timing/scoring_ms": 0.0,
"timing/total_ms": 22967.52266585827,
"tokens/completion": 1105.08203125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 119.94411706924438
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.32473960518836975,
"epoch": 0.18933333333333333,
"grad_norm": 0.03235428789871377,
"importance_ratio": 0.9974983334541321,
"learning_rate": 5e-06,
"loss": 0.0005,
"mismatch_kl": 0.021878903731703758,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 142,
"timing/generation_ms": 20083.584303036332,
"timing/scoring_ms": 0.0,
"timing/total_ms": 20083.584303036332,
"tokens/completion": 1026.375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 182.45814514160156
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.32302016019821167,
"epoch": 0.19066666666666668,
"grad_norm": 0.02364068100843913,
"importance_ratio": 1.000141978263855,
"learning_rate": 5e-06,
"loss": 0.0026,
"mismatch_kl": 0.027520477771759033,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 143,
"timing/generation_ms": 13226.199164055288,
"timing/scoring_ms": 0.0,
"timing/total_ms": 13226.199164055288,
"tokens/completion": 630.8828125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 68.72126913070679
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.4063912034034729,
"epoch": 0.192,
"grad_norm": 0.016855205380348858,
"importance_ratio": 0.9972877502441406,
"learning_rate": 5e-06,
"loss": -0.0044,
"mismatch_kl": 0.02402544766664505,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 144,
"timing/generation_ms": 18624.562999233603,
"timing/scoring_ms": 0.0,
"timing/total_ms": 18624.562999233603,
"tokens/completion": 916.34765625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 63.37579298019409
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.3544447124004364,
"epoch": 0.19333333333333333,
"grad_norm": 0.03420133721717633,
"importance_ratio": 0.9964665174484253,
"learning_rate": 5e-06,
"loss": -0.0075,
"mismatch_kl": 0.020806703716516495,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 145,
"timing/generation_ms": 18501.724537461996,
"timing/scoring_ms": 0.0,
"timing/total_ms": 18501.724537461996,
"tokens/completion": 914.03515625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 63.586211919784546
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.31913280487060547,
"epoch": 0.19466666666666665,
"grad_norm": 0.025814291552238237,
"importance_ratio": 0.9976394176483154,
"learning_rate": 5e-06,
"loss": -0.0017,
"mismatch_kl": 0.02318250946700573,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 146,
"timing/generation_ms": 17320.88227570057,
"timing/scoring_ms": 0.0,
"timing/total_ms": 17320.88227570057,
"tokens/completion": 802.69921875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 161.1075360774994
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.3060760796070099,
"epoch": 0.196,
"grad_norm": 0.024041285955131858,
"importance_ratio": 0.9983845353126526,
"learning_rate": 5e-06,
"loss": -0.0044,
"mismatch_kl": 0.021491888910531998,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 147,
"timing/generation_ms": 20764.05915338546,
"timing/scoring_ms": 0.0,
"timing/total_ms": 20764.05915338546,
"tokens/completion": 1029.03125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 80.10747575759888
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.4907422661781311,
"epoch": 0.19733333333333333,
"grad_norm": 0.01969056173140591,
"importance_ratio": 0.9921115040779114,
"learning_rate": 5e-06,
"loss": 0.0019,
"mismatch_kl": 0.09054939448833466,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 148,
"timing/generation_ms": 14571.548252366483,
"timing/scoring_ms": 0.0,
"timing/total_ms": 14571.548252366483,
"tokens/completion": 646.578125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 94.1196072101593
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.2544015944004059,
"epoch": 0.19866666666666666,
"grad_norm": 0.020070961466503938,
"importance_ratio": 0.998515784740448,
"learning_rate": 5e-06,
"loss": -0.0002,
"mismatch_kl": 0.019744453951716423,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 149,
"timing/generation_ms": 23987.087721936405,
"timing/scoring_ms": 0.0,
"timing/total_ms": 23987.087721936405,
"tokens/completion": 1105.234375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 87.52198696136475
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.28248143196105957,
"epoch": 0.2,
"grad_norm": 0.0191634545508177,
"importance_ratio": 0.9957163333892822,
"learning_rate": 5e-06,
"loss": -0.004,
"mismatch_kl": 0.018821164965629578,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 150,
"timing/generation_ms": 20559.32307895273,
"timing/scoring_ms": 0.0,
"timing/total_ms": 20559.32307895273,
"tokens/completion": 1016.2265625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 131.24922895431519
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.4225759208202362,
"epoch": 0.20133333333333334,
"grad_norm": 0.01854881603951969,
"importance_ratio": 0.9962813854217529,
"learning_rate": 5e-06,
"loss": -0.0017,
"mismatch_kl": 0.025664212182164192,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 151,
"timing/generation_ms": 22859.651166945696,
"timing/scoring_ms": 0.0,
"timing/total_ms": 22859.651166945696,
"tokens/completion": 1112.96484375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 170.4989137649536
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.2856869399547577,
"epoch": 0.20266666666666666,
"grad_norm": 0.018394448039889547,
"importance_ratio": 0.9985631704330444,
"learning_rate": 5e-06,
"loss": -0.0018,
"mismatch_kl": 0.024066420271992683,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 152,
"timing/generation_ms": 37744.059775955975,
"timing/scoring_ms": 0.0,
"timing/total_ms": 37744.059775955975,
"tokens/completion": 1768.79296875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 248.44115471839905
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.2514509856700897,
"epoch": 0.204,
"grad_norm": 0.023912470711877663,
"importance_ratio": 0.9981127381324768,
"learning_rate": 5e-06,
"loss": -0.0029,
"mismatch_kl": 0.020759448409080505,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 153,
"timing/generation_ms": 25985.86314264685,
"timing/scoring_ms": 0.0,
"timing/total_ms": 25985.86314264685,
"tokens/completion": 1309.546875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 143.50284838676453
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.30002838373184204,
"epoch": 0.20533333333333334,
"grad_norm": 0.018497092206319014,
"importance_ratio": 0.9994171857833862,
"learning_rate": 5e-06,
"loss": -0.0022,
"mismatch_kl": 0.015115631744265556,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 154,
"timing/generation_ms": 20836.18642948568,
"timing/scoring_ms": 0.0,
"timing/total_ms": 20836.18642948568,
"tokens/completion": 972.66796875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 112.54808211326599
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.3808918297290802,
"epoch": 0.20666666666666667,
"grad_norm": 0.014750747901418159,
"importance_ratio": 0.9998784065246582,
"learning_rate": 5e-06,
"loss": -0.0023,
"mismatch_kl": 0.0203760527074337,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 155,
"timing/generation_ms": 28712.269487790763,
"timing/scoring_ms": 0.0,
"timing/total_ms": 28712.269487790763,
"tokens/completion": 1384.42578125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 116.96515583992004
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.404234915971756,
"epoch": 0.208,
"grad_norm": 0.02774018143964054,
"importance_ratio": 0.9903627038002014,
"learning_rate": 5e-06,
"loss": 0.0022,
"mismatch_kl": 0.09949617087841034,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 156,
"timing/generation_ms": 15220.996337942779,
"timing/scoring_ms": 0.0,
"timing/total_ms": 15220.996337942779,
"tokens/completion": 733.44921875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 80.95505475997925
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.2150656282901764,
"epoch": 0.20933333333333334,
"grad_norm": 0.012574265789504322,
"importance_ratio": 0.9968655109405518,
"learning_rate": 5e-06,
"loss": -0.0043,
"mismatch_kl": 0.01895724982023239,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 157,
"timing/generation_ms": 46771.82784862816,
"timing/scoring_ms": 0.0,
"timing/total_ms": 46771.82784862816,
"tokens/completion": 2055.46875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 183.42079520225525
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.2713158428668976,
"epoch": 0.21066666666666667,
"grad_norm": 0.03512934826143982,
"importance_ratio": 0.9985222220420837,
"learning_rate": 5e-06,
"loss": -0.0028,
"mismatch_kl": 0.01624884642660618,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 158,
"timing/generation_ms": 20947.266034781933,
"timing/scoring_ms": 0.0,
"timing/total_ms": 20947.266034781933,
"tokens/completion": 1009.90234375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 87.24977517127991
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.32832008600234985,
"epoch": 0.212,
"grad_norm": 0.02405397079489038,
"importance_ratio": 0.9991105198860168,
"learning_rate": 5e-06,
"loss": -0.0056,
"mismatch_kl": 0.016867484897375107,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 159,
"timing/generation_ms": 21430.58088142425,
"timing/scoring_ms": 0.0,
"timing/total_ms": 21430.58088142425,
"tokens/completion": 1012.43359375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 87.2035722732544
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.32067254185676575,
"epoch": 0.21333333333333335,
"grad_norm": 0.030583585605830663,
"importance_ratio": 1.0010290145874023,
"learning_rate": 5e-06,
"loss": 0.0029,
"mismatch_kl": 0.01957845501601696,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 160,
"timing/generation_ms": 12068.631175905466,
"timing/scoring_ms": 0.0,
"timing/total_ms": 12068.631175905466,
"tokens/completion": 585.69921875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 46.4997832775116
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.20440350472927094,
"epoch": 0.21466666666666667,
"grad_norm": 0.009198384471964699,
"importance_ratio": 0.9953656196594238,
"learning_rate": 5e-06,
"loss": -0.0052,
"mismatch_kl": 0.024851609021425247,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 161,
"timing/generation_ms": 64061.363669112325,
"timing/scoring_ms": 0.0,
"timing/total_ms": 64061.363669112325,
"tokens/completion": 2746.5390625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 252.9020836353302
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.2289305031299591,
"epoch": 0.216,
"grad_norm": 0.017027620442399836,
"importance_ratio": 0.9964645504951477,
"learning_rate": 5e-06,
"loss": 0.0005,
"mismatch_kl": 0.02016555331647396,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 162,
"timing/generation_ms": 29072.1739763394,
"timing/scoring_ms": 0.0,
"timing/total_ms": 29072.1739763394,
"tokens/completion": 1294.0546875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 187.8606402873993
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.23871932923793793,
"epoch": 0.21733333333333332,
"grad_norm": 0.026046585403665903,
"importance_ratio": 0.998152494430542,
"learning_rate": 5e-06,
"loss": 0.0052,
"mismatch_kl": 0.016869615763425827,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 163,
"timing/generation_ms": 33103.609337471426,
"timing/scoring_ms": 0.0,
"timing/total_ms": 33103.609337471426,
"tokens/completion": 1545.50390625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 139.85770416259766
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.28158116340637207,
"epoch": 0.21866666666666668,
"grad_norm": 0.015259806348832568,
"importance_ratio": 0.9982590079307556,
"learning_rate": 5e-06,
"loss": -0.0053,
"mismatch_kl": 0.022746765986084938,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 164,
"timing/generation_ms": 26944.41274832934,
"timing/scoring_ms": 0.0,
"timing/total_ms": 26944.41274832934,
"tokens/completion": 1337.65625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 109.10997653007507
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.3031062185764313,
"epoch": 0.22,
"grad_norm": 0.016960115464425836,
"importance_ratio": 0.9974260926246643,
"learning_rate": 5e-06,
"loss": -0.0023,
"mismatch_kl": 0.02418132871389389,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 165,
"timing/generation_ms": 26665.55192042142,
"timing/scoring_ms": 0.0,
"timing/total_ms": 26665.55192042142,
"tokens/completion": 1298.09765625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 233.19409203529358
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.30360692739486694,
"epoch": 0.22133333333333333,
"grad_norm": 0.03976443826488329,
"importance_ratio": 0.9983341097831726,
"learning_rate": 5e-06,
"loss": -0.0064,
"mismatch_kl": 0.02314077690243721,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 166,
"timing/generation_ms": 14128.881074488163,
"timing/scoring_ms": 0.0,
"timing/total_ms": 14128.881074488163,
"tokens/completion": 701.61328125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 55.524725914001465
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.2678433656692505,
"epoch": 0.22266666666666668,
"grad_norm": 0.03342438517457818,
"importance_ratio": 0.9922596216201782,
"learning_rate": 5e-06,
"loss": -0.0023,
"mismatch_kl": 0.035250429064035416,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 167,
"timing/generation_ms": 21135.669719427824,
"timing/scoring_ms": 0.0,
"timing/total_ms": 21135.669719427824,
"tokens/completion": 1019.171875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 149.8279891014099
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.20458683371543884,
"epoch": 0.224,
"grad_norm": 0.022088093083212943,
"importance_ratio": 0.9954257011413574,
"learning_rate": 5e-06,
"loss": -0.0018,
"mismatch_kl": 0.023710263893008232,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 168,
"timing/generation_ms": 59294.02190912515,
"timing/scoring_ms": 0.0,
"timing/total_ms": 59294.02190912515,
"tokens/completion": 2536.8828125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 207.61119556427002
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.4547651410102844,
"epoch": 0.22533333333333333,
"grad_norm": 0.03804278639742813,
"importance_ratio": 0.9720731973648071,
"learning_rate": 5e-06,
"loss": 0.0026,
"mismatch_kl": 0.2540355324745178,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 169,
"timing/generation_ms": 14632.340895012021,
"timing/scoring_ms": 0.0,
"timing/total_ms": 14632.340895012021,
"tokens/completion": 634.8203125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 66.74064421653748
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.26701289415359497,
"epoch": 0.22666666666666666,
"grad_norm": 0.03041084967586165,
"importance_ratio": 0.9971191883087158,
"learning_rate": 5e-06,
"loss": -0.0024,
"mismatch_kl": 0.02894790843129158,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 170,
"timing/generation_ms": 21908.162399195135,
"timing/scoring_ms": 0.0,
"timing/total_ms": 21908.162399195135,
"tokens/completion": 1060.19140625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 285.11374616622925
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.3831964433193207,
"epoch": 0.228,
"grad_norm": 0.020277373003486452,
"importance_ratio": 0.9703661799430847,
"learning_rate": 5e-06,
"loss": -0.0013,
"mismatch_kl": 0.288127064704895,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 171,
"timing/generation_ms": 21739.85463846475,
"timing/scoring_ms": 0.0,
"timing/total_ms": 21739.85463846475,
"tokens/completion": 1042.390625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 126.53577995300293
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.3104299008846283,
"epoch": 0.22933333333333333,
"grad_norm": 0.05268300034795112,
"importance_ratio": 0.9946843981742859,
"learning_rate": 5e-06,
"loss": -0.0045,
"mismatch_kl": 0.028223995119333267,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 172,
"timing/generation_ms": 18181.49754870683,
"timing/scoring_ms": 0.0,
"timing/total_ms": 18181.49754870683,
"tokens/completion": 876.87890625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 102.08800101280212
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.32149240374565125,
"epoch": 0.23066666666666666,
"grad_norm": 0.019198595379338976,
"importance_ratio": 0.9882834553718567,
"learning_rate": 5e-06,
"loss": 0.0031,
"mismatch_kl": 0.09531966596841812,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 173,
"timing/generation_ms": 26753.23315896094,
"timing/scoring_ms": 0.0,
"timing/total_ms": 26753.23315896094,
"tokens/completion": 1199.828125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 243.50505256652832
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.2507164180278778,
"epoch": 0.232,
"grad_norm": 0.0248134202199756,
"importance_ratio": 0.9970893263816833,
"learning_rate": 5e-06,
"loss": -0.0063,
"mismatch_kl": 0.033440057188272476,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 174,
"timing/generation_ms": 32734.658079221845,
"timing/scoring_ms": 0.0,
"timing/total_ms": 32734.658079221845,
"tokens/completion": 1582.765625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 236.81393718719482
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.2938965857028961,
"epoch": 0.23333333333333334,
"grad_norm": 0.023295024031541062,
"importance_ratio": 0.9996641874313354,
"learning_rate": 5e-06,
"loss": -0.0014,
"mismatch_kl": 0.030382564291357994,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 175,
"timing/generation_ms": 18484.799866564572,
"timing/scoring_ms": 0.0,
"timing/total_ms": 18484.799866564572,
"tokens/completion": 869.8203125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 89.94726347923279
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.24128344655036926,
"epoch": 0.23466666666666666,
"grad_norm": 0.021681137287839845,
"importance_ratio": 0.995689868927002,
"learning_rate": 5e-06,
"loss": -0.0024,
"mismatch_kl": 0.025076182559132576,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 176,
"timing/generation_ms": 16699.054242111742,
"timing/scoring_ms": 0.0,
"timing/total_ms": 16699.054242111742,
"tokens/completion": 831.890625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 76.11790347099304
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.26724985241889954,
"epoch": 0.236,
"grad_norm": 0.015254325506305103,
"importance_ratio": 0.992223858833313,
"learning_rate": 5e-06,
"loss": -0.0003,
"mismatch_kl": 0.02879425697028637,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 177,
"timing/generation_ms": 30596.904239617288,
"timing/scoring_ms": 0.0,
"timing/total_ms": 30596.904239617288,
"tokens/completion": 1407.20703125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 199.58447432518005
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.28972604870796204,
"epoch": 0.23733333333333334,
"grad_norm": 0.01945907676336341,
"importance_ratio": 0.9937379956245422,
"learning_rate": 5e-06,
"loss": -0.0002,
"mismatch_kl": 0.026391636580228806,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 178,
"timing/generation_ms": 22168.457314372063,
"timing/scoring_ms": 0.0,
"timing/total_ms": 22168.457314372063,
"tokens/completion": 1017.8515625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 198.82207107543945
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.2955513298511505,
"epoch": 0.23866666666666667,
"grad_norm": 0.034061359790196394,
"importance_ratio": 0.9955794811248779,
"learning_rate": 5e-06,
"loss": -0.0017,
"mismatch_kl": 0.026111198589205742,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 179,
"timing/generation_ms": 17585.104428231716,
"timing/scoring_ms": 0.0,
"timing/total_ms": 17585.104428231716,
"tokens/completion": 836.7421875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 98.93776655197144
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.33897051215171814,
"epoch": 0.24,
"grad_norm": 0.026732099750916328,
"importance_ratio": 0.9968024492263794,
"learning_rate": 5e-06,
"loss": -0.0016,
"mismatch_kl": 0.03142106905579567,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 180,
"timing/generation_ms": 14579.319617711008,
"timing/scoring_ms": 0.0,
"timing/total_ms": 14579.319617711008,
"tokens/completion": 657.60546875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 48.83777070045471
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.27722474932670593,
"epoch": 0.24133333333333334,
"grad_norm": 0.02190113915349276,
"importance_ratio": 0.9932956099510193,
"learning_rate": 5e-06,
"loss": -0.0039,
"mismatch_kl": 0.039353836327791214,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 181,
"timing/generation_ms": 16838.846164755523,
"timing/scoring_ms": 0.0,
"timing/total_ms": 16838.846164755523,
"tokens/completion": 837.53125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 90.39262366294861
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.21952733397483826,
"epoch": 0.24266666666666667,
"grad_norm": 0.019030162680243098,
"importance_ratio": 0.9920942783355713,
"learning_rate": 5e-06,
"loss": 0.0007,
"mismatch_kl": 0.03863741457462311,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 182,
"timing/generation_ms": 19943.43529921025,
"timing/scoring_ms": 0.0,
"timing/total_ms": 19943.43529921025,
"tokens/completion": 959.51953125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 68.7491762638092
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.37819504737854004,
"epoch": 0.244,
"grad_norm": 0.030600275992650774,
"importance_ratio": 0.9981564879417419,
"learning_rate": 5e-06,
"loss": -0.0061,
"mismatch_kl": 0.0258224718272686,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 183,
"timing/generation_ms": 19337.73651625961,
"timing/scoring_ms": 0.0,
"timing/total_ms": 19337.73651625961,
"tokens/completion": 909.80078125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 67.45709013938904
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.24391266703605652,
"epoch": 0.24533333333333332,
"grad_norm": 0.020045952746227204,
"importance_ratio": 0.9952253103256226,
"learning_rate": 5e-06,
"loss": -0.0035,
"mismatch_kl": 0.022540580481290817,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 184,
"timing/generation_ms": 29042.017024941742,
"timing/scoring_ms": 0.0,
"timing/total_ms": 29042.017024941742,
"tokens/completion": 1416.3046875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 224.1438853740692
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.2780689597129822,
"epoch": 0.24666666666666667,
"grad_norm": 0.0286906981880458,
"importance_ratio": 0.9939864277839661,
"learning_rate": 5e-06,
"loss": 0.0002,
"mismatch_kl": 0.028331460431218147,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 185,
"timing/generation_ms": 13990.399835631251,
"timing/scoring_ms": 0.0,
"timing/total_ms": 13990.399835631251,
"tokens/completion": 712.27734375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 65.08906888961792
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.2785170078277588,
"epoch": 0.248,
"grad_norm": 0.019455372327007777,
"importance_ratio": 0.9962543249130249,
"learning_rate": 5e-06,
"loss": 0.0021,
"mismatch_kl": 0.030258335173130035,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 186,
"timing/generation_ms": 29046.93407472223,
"timing/scoring_ms": 0.0,
"timing/total_ms": 29046.93407472223,
"tokens/completion": 1342.078125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 117.269207239151
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.29877498745918274,
"epoch": 0.24933333333333332,
"grad_norm": 0.041522981103745076,
"importance_ratio": 0.9973271489143372,
"learning_rate": 5e-06,
"loss": 0.0005,
"mismatch_kl": 0.027791054919362068,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 187,
"timing/generation_ms": 27519.34172678739,
"timing/scoring_ms": 0.0,
"timing/total_ms": 27519.34172678739,
"tokens/completion": 1335.86328125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 109.74448680877686
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.2548399567604065,
"epoch": 0.25066666666666665,
"grad_norm": 0.01914209458227723,
"importance_ratio": 0.9980031251907349,
"learning_rate": 5e-06,
"loss": -0.0056,
"mismatch_kl": 0.023154988884925842,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 188,
"timing/generation_ms": 18434.748891741037,
"timing/scoring_ms": 0.0,
"timing/total_ms": 18434.748891741037,
"tokens/completion": 841.21484375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 100.93693470954895
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.36281952261924744,
"epoch": 0.252,
"grad_norm": 0.04366345528631447,
"importance_ratio": 0.997806966304779,
"learning_rate": 5e-06,
"loss": -0.0104,
"mismatch_kl": 0.0235320795327425,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 189,
"timing/generation_ms": 25268.099238164723,
"timing/scoring_ms": 0.0,
"timing/total_ms": 25268.099238164723,
"tokens/completion": 1256.1484375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 102.91489505767822
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.22508475184440613,
"epoch": 0.25333333333333335,
"grad_norm": 0.01385345071504184,
"importance_ratio": 0.9968878626823425,
"learning_rate": 5e-06,
"loss": -0.0107,
"mismatch_kl": 0.02765449695289135,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 190,
"timing/generation_ms": 37916.601489298046,
"timing/scoring_ms": 0.0,
"timing/total_ms": 37916.601489298046,
"tokens/completion": 1717.34765625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 139.42678880691528
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.40229278802871704,
"epoch": 0.25466666666666665,
"grad_norm": 0.02875613000959139,
"importance_ratio": 0.9828155040740967,
"learning_rate": 5e-06,
"loss": 0.0055,
"mismatch_kl": 0.19772163033485413,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 191,
"timing/generation_ms": 32680.235791951418,
"timing/scoring_ms": 0.0,
"timing/total_ms": 32680.235791951418,
"tokens/completion": 1459.58203125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 144.90490436553955
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.2181045562028885,
"epoch": 0.256,
"grad_norm": 0.019693707478772454,
"importance_ratio": 0.9942646026611328,
"learning_rate": 5e-06,
"loss": 0.0029,
"mismatch_kl": 0.03511533513665199,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 192,
"timing/generation_ms": 36065.32556284219,
"timing/scoring_ms": 0.0,
"timing/total_ms": 36065.32556284219,
"tokens/completion": 1708.7734375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 126.33067202568054
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.2962771952152252,
"epoch": 0.25733333333333336,
"grad_norm": 0.02416381381264868,
"importance_ratio": 0.9941651821136475,
"learning_rate": 5e-06,
"loss": 0.0024,
"mismatch_kl": 0.0343640111386776,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 193,
"timing/generation_ms": 36326.69063284993,
"timing/scoring_ms": 0.0,
"timing/total_ms": 36326.69063284993,
"tokens/completion": 1645.30859375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 146.5855736732483
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.22655896842479706,
"epoch": 0.25866666666666666,
"grad_norm": 0.024160165001251035,
"importance_ratio": 0.995488166809082,
"learning_rate": 5e-06,
"loss": 0.0023,
"mismatch_kl": 0.023622261360287666,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 194,
"timing/generation_ms": 40274.337109178305,
"timing/scoring_ms": 0.0,
"timing/total_ms": 40274.337109178305,
"tokens/completion": 1910.0,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 137.63950419425964
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.24619098007678986,
"epoch": 0.26,
"grad_norm": 0.008997397579246655,
"importance_ratio": 0.9905009865760803,
"learning_rate": 5e-06,
"loss": 0.0047,
"mismatch_kl": 0.06482454389333725,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 195,
"timing/generation_ms": 107369.31251455098,
"timing/scoring_ms": 0.0,
"timing/total_ms": 107369.31251455098,
"tokens/completion": 3881.7421875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 291.5552787780762
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.24800750613212585,
"epoch": 0.2613333333333333,
"grad_norm": 0.041355633656673725,
"importance_ratio": 0.996856689453125,
"learning_rate": 5e-06,
"loss": 0.0027,
"mismatch_kl": 0.023481056094169617,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 196,
"timing/generation_ms": 23556.342590600252,
"timing/scoring_ms": 0.0,
"timing/total_ms": 23556.342590600252,
"tokens/completion": 801.36328125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 66.23490047454834
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.20097197592258453,
"epoch": 0.26266666666666666,
"grad_norm": 0.01639665709788699,
"importance_ratio": 0.995540201663971,
"learning_rate": 5e-06,
"loss": -0.0009,
"mismatch_kl": 0.02512766607105732,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 197,
"timing/generation_ms": 54791.293187998235,
"timing/scoring_ms": 0.0,
"timing/total_ms": 54791.293187998235,
"tokens/completion": 2467.2578125,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 184.51049184799194
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.24079304933547974,
"epoch": 0.264,
"grad_norm": 0.033558115100562454,
"importance_ratio": 0.9966259002685547,
"learning_rate": 5e-06,
"loss": -0.0129,
"mismatch_kl": 0.02248232252895832,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 198,
"timing/generation_ms": 38877.40421388298,
"timing/scoring_ms": 0.0,
"timing/total_ms": 38877.40421388298,
"tokens/completion": 1947.15625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 256.89259123802185
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.22992920875549316,
"epoch": 0.2653333333333333,
"grad_norm": 0.019833326998120116,
"importance_ratio": 0.996269166469574,
"learning_rate": 5e-06,
"loss": -0.0002,
"mismatch_kl": 0.02254408784210682,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 199,
"timing/generation_ms": 22910.992676392198,
"timing/scoring_ms": 0.0,
"timing/total_ms": 22910.992676392198,
"tokens/completion": 1146.32421875,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 153.08721899986267
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.21609917283058167,
"epoch": 0.26666666666666666,
"grad_norm": 0.017782941960253474,
"importance_ratio": 0.9933099746704102,
"learning_rate": 5e-06,
"loss": -0.0047,
"mismatch_kl": 0.028513798490166664,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 200,
"timing/generation_ms": 28995.982899330556,
"timing/scoring_ms": 0.0,
"timing/total_ms": 28995.982899330556,
"tokens/completion": 1354.24609375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 139.1398515701294
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.3927169740200043,
"epoch": 0.268,
"grad_norm": 0.08540874966055562,
"importance_ratio": 0.9711376428604126,
"learning_rate": 5e-06,
"loss": 0.0081,
"mismatch_kl": 0.2314944714307785,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 201,
"timing/generation_ms": 31200.909822247922,
"timing/scoring_ms": 0.0,
"timing/total_ms": 31200.909822247922,
"tokens/completion": 1405.9765625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 178.80973744392395
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.235797718167305,
"epoch": 0.2693333333333333,
"grad_norm": 0.01568085371274426,
"importance_ratio": 0.9909575581550598,
"learning_rate": 5e-06,
"loss": -0.0079,
"mismatch_kl": 0.039374206215143204,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 202,
"timing/generation_ms": 42998.49198944867,
"timing/scoring_ms": 0.0,
"timing/total_ms": 42998.49198944867,
"tokens/completion": 1907.31640625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 141.76219058036804
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.23127324879169464,
"epoch": 0.27066666666666667,
"grad_norm": 0.02007459981352103,
"importance_ratio": 0.9912987947463989,
"learning_rate": 5e-06,
"loss": -0.001,
"mismatch_kl": 0.03943263366818428,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 203,
"timing/generation_ms": 37774.500319734216,
"timing/scoring_ms": 0.0,
"timing/total_ms": 37774.500319734216,
"tokens/completion": 1693.734375,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 302.7908329963684
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.22054153680801392,
"epoch": 0.272,
"grad_norm": 0.021761300841866088,
"importance_ratio": 0.9904981851577759,
"learning_rate": 5e-06,
"loss": -0.0026,
"mismatch_kl": 0.037401266396045685,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 204,
"timing/generation_ms": 42541.27501603216,
"timing/scoring_ms": 0.0,
"timing/total_ms": 42541.27501603216,
"tokens/completion": 1937.69140625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 233.74011135101318
},
{
"advantage/absmean": 0.12451171875,
"entropy": 0.22628618776798248,
"epoch": 0.2733333333333333,
"grad_norm": 0.011121419921268808,
"importance_ratio": 0.9924519658088684,
"learning_rate": 5e-06,
"loss": 0.0013,
"mismatch_kl": 0.03573086857795715,
"reward": 0.12451171875,
"reward/std": 0.1738164722919464,
"step": 205,
"timing/generation_ms": 35010.2855078876,
"timing/scoring_ms": 0.0,
"timing/total_ms": 35010.2855078876,
"tokens/completion": 1629.62890625,
"tokens/masked_fraction": 0.0,
"wall_clock/generate_s": 137.56320452690125
}
],
"logging_steps": 1,
"max_steps": 750,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 5,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}