{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.06666666666666667, "eval_steps": 50, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "advantage/absmean": 0.12451171875, "entropy": 1.3988752365112305, "epoch": 0.0013333333333333333, "grad_norm": 0.01450820083840917, "importance_ratio": 0.9983458518981934, "learning_rate": 0.0, "loss": -0.0028, "mismatch_kl": 0.004329901188611984, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 1, "timing/generation_ms": 12196.653502061963, "timing/scoring_ms": 0.0, "timing/total_ms": 12196.653502061963, "tokens/completion": 562.04296875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 163.39810061454773 }, { "advantage/absmean": 0.12451171875, "entropy": 1.0297880172729492, "epoch": 0.0026666666666666666, "grad_norm": 0.006125098422428371, "importance_ratio": 0.9977808594703674, "learning_rate": 1.0000000000000002e-06, "loss": 0.0118, "mismatch_kl": 0.0036596579011529684, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 2, "timing/generation_ms": 10855.522208847106, "timing/scoring_ms": 0.0, "timing/total_ms": 10855.522208847106, "tokens/completion": 652.203125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 98.15957498550415 }, { "advantage/absmean": 0.12451171875, "entropy": 1.2343848943710327, "epoch": 0.004, "grad_norm": 0.0093110934895908, "importance_ratio": 0.9983258843421936, "learning_rate": 2.0000000000000003e-06, "loss": -0.0068, "mismatch_kl": 0.00391958886757493, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 3, "timing/generation_ms": 14581.869984045625, "timing/scoring_ms": 0.0, "timing/total_ms": 14581.869984045625, "tokens/completion": 722.37109375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 171.60404181480408 }, { "advantage/absmean": 0.12451171875, "entropy": 0.581649661064148, "epoch": 0.005333333333333333, "grad_norm": 0.007696628408420481, "importance_ratio": 0.9986447095870972, "learning_rate": 3e-06, "loss": -0.0043, "mismatch_kl": 0.0024762798566371202, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 4, "timing/generation_ms": 11191.347393207252, "timing/scoring_ms": 0.0, "timing/total_ms": 11191.347393207252, "tokens/completion": 595.73046875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 186.33580946922302 }, { "advantage/absmean": 0.12451171875, "entropy": 0.8588891625404358, "epoch": 0.006666666666666667, "grad_norm": 0.0055080213738955075, "importance_ratio": 0.9988943934440613, "learning_rate": 4.000000000000001e-06, "loss": -0.0033, "mismatch_kl": 0.0031517043244093657, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 5, "timing/generation_ms": 10668.582463636994, "timing/scoring_ms": 0.0, "timing/total_ms": 10668.582463636994, "tokens/completion": 636.53125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 82.27488708496094 }, { "advantage/absmean": 0.12451171875, "entropy": 1.071407675743103, "epoch": 0.008, "grad_norm": 0.02271832942623967, "importance_ratio": 0.998067319393158, "learning_rate": 5e-06, "loss": 0.0019, "mismatch_kl": 0.003643231000751257, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 6, "timing/generation_ms": 3378.591795451939, "timing/scoring_ms": 0.0, "timing/total_ms": 3378.591795451939, "tokens/completion": 178.73828125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 105.60203862190247 }, { "advantage/absmean": 0.12451171875, "entropy": 0.8531922698020935, "epoch": 0.009333333333333334, "grad_norm": 0.018354067998903482, "importance_ratio": 0.9980432391166687, "learning_rate": 5e-06, "loss": -0.0002, "mismatch_kl": 0.003655636915937066, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 7, "timing/generation_ms": 12279.695899225771, "timing/scoring_ms": 0.0, "timing/total_ms": 12279.695899225771, "tokens/completion": 631.35546875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 104.63563537597656 }, { "advantage/absmean": 0.12451171875, "entropy": 0.7072162628173828, "epoch": 0.010666666666666666, "grad_norm": 0.005552532749027135, "importance_ratio": 0.9982293844223022, "learning_rate": 5e-06, "loss": -0.0014, "mismatch_kl": 0.0029395928140729666, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 8, "timing/generation_ms": 6614.743183366954, "timing/scoring_ms": 0.0, "timing/total_ms": 6614.743183366954, "tokens/completion": 339.1640625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 132.164165019989 }, { "advantage/absmean": 0.12451171875, "entropy": 0.8156145215034485, "epoch": 0.012, "grad_norm": 0.008176505226750404, "importance_ratio": 0.9981797933578491, "learning_rate": 5e-06, "loss": 0.0027, "mismatch_kl": 0.0031279518734663725, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 9, "timing/generation_ms": 8826.908372342587, "timing/scoring_ms": 0.0, "timing/total_ms": 8826.908372342587, "tokens/completion": 444.53515625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 144.61542773246765 }, { "advantage/absmean": 0.12451171875, "entropy": 0.8708666563034058, "epoch": 0.013333333333333334, "grad_norm": 0.009382372847258274, "importance_ratio": 0.9981642961502075, "learning_rate": 5e-06, "loss": 0.0126, "mismatch_kl": 0.0030885515734553337, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 10, "timing/generation_ms": 7367.4805322662, "timing/scoring_ms": 0.0, "timing/total_ms": 7367.4805322662, "tokens/completion": 400.74609375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 176.57727003097534 }, { "advantage/absmean": 0.12451171875, "entropy": 0.6906348466873169, "epoch": 0.014666666666666666, "grad_norm": 0.007616251351947248, "importance_ratio": 1.0045424699783325, "learning_rate": 5e-06, "loss": 0.0542, "mismatch_kl": 0.03194786608219147, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 11, "timing/generation_ms": 26879.562875255942, "timing/scoring_ms": 0.0, "timing/total_ms": 26879.562875255942, "tokens/completion": 1682.0546875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 190.3386266231537 }, { "advantage/absmean": 0.12451171875, "entropy": 0.6506091356277466, "epoch": 0.016, "grad_norm": 0.004382353798954015, "importance_ratio": 0.9982648491859436, "learning_rate": 5e-06, "loss": 0.043, "mismatch_kl": 0.02482638508081436, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 12, "timing/generation_ms": 22301.60311050713, "timing/scoring_ms": 0.0, "timing/total_ms": 22301.60311050713, "tokens/completion": 1387.734375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 177.6784646511078 }, { "advantage/absmean": 0.12451171875, "entropy": 1.143129825592041, "epoch": 0.017333333333333333, "grad_norm": 0.009321138085104996, "importance_ratio": 1.001217007637024, "learning_rate": 5e-06, "loss": -0.0139, "mismatch_kl": 0.0036374337505549192, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 13, "timing/generation_ms": 6277.724616229534, "timing/scoring_ms": 0.0, "timing/total_ms": 6277.724616229534, "tokens/completion": 432.66796875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 129.69259929656982 }, { "advantage/absmean": 0.12451171875, "entropy": 0.650863766670227, "epoch": 0.018666666666666668, "grad_norm": 0.0076614251264825245, "importance_ratio": 0.9983827471733093, "learning_rate": 5e-06, "loss": 0.0049, "mismatch_kl": 0.0027237425092607737, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 14, "timing/generation_ms": 7103.812717832625, "timing/scoring_ms": 0.0, "timing/total_ms": 7103.812717832625, "tokens/completion": 404.96484375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 57.65379452705383 }, { "advantage/absmean": 0.12451171875, "entropy": 0.7439635992050171, "epoch": 0.02, "grad_norm": 0.009401568464987338, "importance_ratio": 0.9981654286384583, "learning_rate": 5e-06, "loss": 0.0109, "mismatch_kl": 0.002909082220867276, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 15, "timing/generation_ms": 8292.532542720437, "timing/scoring_ms": 0.0, "timing/total_ms": 8292.532542720437, "tokens/completion": 459.85546875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 92.21157336235046 }, { "advantage/absmean": 0.12451171875, "entropy": 0.7638830542564392, "epoch": 0.021333333333333333, "grad_norm": 0.010374572910211358, "importance_ratio": 0.9969711899757385, "learning_rate": 5e-06, "loss": -0.005, "mismatch_kl": 0.0034673516638576984, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 16, "timing/generation_ms": 5712.000676430762, "timing/scoring_ms": 0.0, "timing/total_ms": 5712.000676430762, "tokens/completion": 308.4296875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 52.6866238117218 }, { "advantage/absmean": 0.12451171875, "entropy": 0.18189160525798798, "epoch": 0.02266666666666667, "grad_norm": 0.00257455457059234, "importance_ratio": 0.9984971880912781, "learning_rate": 5e-06, "loss": 0.0681, "mismatch_kl": 0.018514186143875122, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 17, "timing/generation_ms": 14281.423358246684, "timing/scoring_ms": 0.0, "timing/total_ms": 14281.423358246684, "tokens/completion": 1101.546875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 129.24327325820923 }, { "advantage/absmean": 0.12451171875, "entropy": 0.5917271375656128, "epoch": 0.024, "grad_norm": 0.005668903472483887, "importance_ratio": 0.999828577041626, "learning_rate": 5e-06, "loss": 0.027, "mismatch_kl": 0.002100760815665126, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 18, "timing/generation_ms": 24175.399120897055, "timing/scoring_ms": 0.0, "timing/total_ms": 24175.399120897055, "tokens/completion": 1504.0390625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 158.57504653930664 }, { "advantage/absmean": 0.12451171875, "entropy": 1.3282008171081543, "epoch": 0.025333333333333333, "grad_norm": 0.006636047431888786, "importance_ratio": 1.0022344589233398, "learning_rate": 5e-06, "loss": -0.0015, "mismatch_kl": 0.004634195473045111, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 19, "timing/generation_ms": 15713.139976374805, "timing/scoring_ms": 0.0, "timing/total_ms": 15713.139976374805, "tokens/completion": 764.3203125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 78.56244468688965 }, { "advantage/absmean": 0.12451171875, "entropy": 1.02470862865448, "epoch": 0.02666666666666667, "grad_norm": 0.00833481021786943, "importance_ratio": 1.0026451349258423, "learning_rate": 5e-06, "loss": -0.0052, "mismatch_kl": 0.004158638883382082, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 20, "timing/generation_ms": 6632.851202040911, "timing/scoring_ms": 0.0, "timing/total_ms": 6632.851202040911, "tokens/completion": 382.6171875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 199.43552422523499 }, { "advantage/absmean": 0.12451171875, "entropy": 0.07275530695915222, "epoch": 0.028, "grad_norm": 0.005944388738403685, "importance_ratio": 0.9988561868667603, "learning_rate": 5e-06, "loss": 0.0452, "mismatch_kl": 0.00023643655003979802, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 21, "timing/generation_ms": 119174.6030151844, "timing/scoring_ms": 0.0, "timing/total_ms": 119174.6030151844, "tokens/completion": 4008.38671875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 685.4423098564148 }, { "advantage/absmean": 0.12451171875, "entropy": 0.83598792552948, "epoch": 0.029333333333333333, "grad_norm": 0.00977617477475085, "importance_ratio": 0.995696485042572, "learning_rate": 5e-06, "loss": 0.0066, "mismatch_kl": 0.003957619424909353, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 22, "timing/generation_ms": 12322.44247943163, "timing/scoring_ms": 0.0, "timing/total_ms": 12322.44247943163, "tokens/completion": 442.85546875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 94.37256598472595 }, { "advantage/absmean": 0.12451171875, "entropy": 0.8219886422157288, "epoch": 0.030666666666666665, "grad_norm": 0.0057449599218849946, "importance_ratio": 0.9990558624267578, "learning_rate": 5e-06, "loss": 0.0388, "mismatch_kl": 0.031180420890450478, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 23, "timing/generation_ms": 29090.628595091403, "timing/scoring_ms": 0.0, "timing/total_ms": 29090.628595091403, "tokens/completion": 1716.3515625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 169.45334482192993 }, { "advantage/absmean": 0.12451171875, "entropy": 1.0089167356491089, "epoch": 0.032, "grad_norm": 0.009762837519367, "importance_ratio": 0.9979202151298523, "learning_rate": 5e-06, "loss": 0.0012, "mismatch_kl": 0.00405939482152462, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 24, "timing/generation_ms": 17154.327374882996, "timing/scoring_ms": 0.0, "timing/total_ms": 17154.327374882996, "tokens/completion": 883.390625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 130.7891206741333 }, { "advantage/absmean": 0.12451171875, "entropy": 0.5053093433380127, "epoch": 0.03333333333333333, "grad_norm": 0.007416974683241316, "importance_ratio": 0.9982149600982666, "learning_rate": 5e-06, "loss": -0.0068, "mismatch_kl": 0.0024536694400012493, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 25, "timing/generation_ms": 28463.361867703497, "timing/scoring_ms": 0.0, "timing/total_ms": 28463.361867703497, "tokens/completion": 1409.54296875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 178.60342526435852 }, { "advantage/absmean": 0.12451171875, "entropy": 0.4973055422306061, "epoch": 0.034666666666666665, "grad_norm": 0.004048717808220336, "importance_ratio": 1.0012173652648926, "learning_rate": 5e-06, "loss": 0.0547, "mismatch_kl": 0.03473234549164772, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 26, "timing/generation_ms": 18848.746892996132, "timing/scoring_ms": 0.0, "timing/total_ms": 18848.746892996132, "tokens/completion": 1286.6875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 181.75563287734985 }, { "advantage/absmean": 0.12451171875, "entropy": 0.5914682149887085, "epoch": 0.036, "grad_norm": 0.010568088931367656, "importance_ratio": 0.9986244440078735, "learning_rate": 5e-06, "loss": -0.0214, "mismatch_kl": 0.002536088228225708, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 27, "timing/generation_ms": 11602.461927570403, "timing/scoring_ms": 0.0, "timing/total_ms": 11602.461927570403, "tokens/completion": 734.80078125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 188.88015818595886 }, { "advantage/absmean": 0.12451171875, "entropy": 0.4526905119419098, "epoch": 0.037333333333333336, "grad_norm": 0.0035728175606856527, "importance_ratio": 0.9999799728393555, "learning_rate": 5e-06, "loss": 0.0026, "mismatch_kl": 0.0024842985440045595, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 28, "timing/generation_ms": 30549.59301650524, "timing/scoring_ms": 0.0, "timing/total_ms": 30549.59301650524, "tokens/completion": 1536.96875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 328.0478210449219 }, { "advantage/absmean": 0.12451171875, "entropy": 0.32794511318206787, "epoch": 0.03866666666666667, "grad_norm": 0.003333518820406266, "importance_ratio": 0.9995192885398865, "learning_rate": 5e-06, "loss": 0.056, "mismatch_kl": 0.028769802302122116, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 29, "timing/generation_ms": 18838.333567604423, "timing/scoring_ms": 0.0, "timing/total_ms": 18838.333567604423, "tokens/completion": 1263.640625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 290.5948350429535 }, { "advantage/absmean": 0.12451171875, "entropy": 0.9063822031021118, "epoch": 0.04, "grad_norm": 0.007342388496075293, "importance_ratio": 0.9953157901763916, "learning_rate": 5e-06, "loss": 0.0025, "mismatch_kl": 0.004266439005732536, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 30, "timing/generation_ms": 9477.213966660202, "timing/scoring_ms": 0.0, "timing/total_ms": 9477.213966660202, "tokens/completion": 473.26953125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 62.30127143859863 }, { "advantage/absmean": 0.12451171875, "entropy": 0.7977282404899597, "epoch": 0.04133333333333333, "grad_norm": 0.00884043332375607, "importance_ratio": 0.9971498847007751, "learning_rate": 5e-06, "loss": -0.0029, "mismatch_kl": 0.004033135715872049, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 31, "timing/generation_ms": 18995.201839134097, "timing/scoring_ms": 0.0, "timing/total_ms": 18995.201839134097, "tokens/completion": 958.625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 88.6347918510437 }, { "advantage/absmean": 0.12451171875, "entropy": 0.8451470732688904, "epoch": 0.042666666666666665, "grad_norm": 0.018842389370386323, "importance_ratio": 0.9982671141624451, "learning_rate": 5e-06, "loss": 0.0369, "mismatch_kl": 0.003600390162318945, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 32, "timing/generation_ms": 5587.277088314295, "timing/scoring_ms": 0.0, "timing/total_ms": 5587.277088314295, "tokens/completion": 407.12109375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 140.4788475036621 }, { "advantage/absmean": 0.12451171875, "entropy": 1.1521912813186646, "epoch": 0.044, "grad_norm": 0.006379742039913797, "importance_ratio": 0.997858464717865, "learning_rate": 5e-06, "loss": -0.0065, "mismatch_kl": 0.005035887472331524, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 33, "timing/generation_ms": 18916.152058169246, "timing/scoring_ms": 0.0, "timing/total_ms": 18916.152058169246, "tokens/completion": 966.55859375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 90.41954302787781 }, { "advantage/absmean": 0.12451171875, "entropy": 1.1553761959075928, "epoch": 0.04533333333333334, "grad_norm": 0.010733713274389883, "importance_ratio": 1.0111567974090576, "learning_rate": 5e-06, "loss": 0.0014, "mismatch_kl": 0.006704343948513269, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 34, "timing/generation_ms": 17302.85968258977, "timing/scoring_ms": 0.0, "timing/total_ms": 17302.85968258977, "tokens/completion": 864.44140625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 143.8659963607788 }, { "advantage/absmean": 0.12451171875, "entropy": 0.3105199635028839, "epoch": 0.04666666666666667, "grad_norm": 0.003940130100379767, "importance_ratio": 1.0006911754608154, "learning_rate": 5e-06, "loss": 0.0315, "mismatch_kl": 0.022524980828166008, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 35, "timing/generation_ms": 29806.298807263374, "timing/scoring_ms": 0.0, "timing/total_ms": 29806.298807263374, "tokens/completion": 1672.48828125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 164.04821372032166 }, { "advantage/absmean": 0.12451171875, "entropy": 0.7972971200942993, "epoch": 0.048, "grad_norm": 0.008409173142054645, "importance_ratio": 0.9948906898498535, "learning_rate": 5e-06, "loss": 0.004, "mismatch_kl": 0.004282351583242416, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 36, "timing/generation_ms": 14936.399303376675, "timing/scoring_ms": 0.0, "timing/total_ms": 14936.399303376675, "tokens/completion": 787.78125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 196.79586815834045 }, { "advantage/absmean": 0.12451171875, "entropy": 0.5769950747489929, "epoch": 0.04933333333333333, "grad_norm": 0.009636377939254703, "importance_ratio": 0.9972301721572876, "learning_rate": 5e-06, "loss": -0.0011, "mismatch_kl": 0.003603809280321002, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 37, "timing/generation_ms": 13729.571803472936, "timing/scoring_ms": 0.0, "timing/total_ms": 13729.571803472936, "tokens/completion": 697.64453125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 76.77378511428833 }, { "advantage/absmean": 0.12451171875, "entropy": 0.715777575969696, "epoch": 0.050666666666666665, "grad_norm": 0.005305945077729364, "importance_ratio": 0.9969701766967773, "learning_rate": 5e-06, "loss": 0.0093, "mismatch_kl": 0.004232620354741812, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 38, "timing/generation_ms": 26689.202761277556, "timing/scoring_ms": 0.0, "timing/total_ms": 26689.202761277556, "tokens/completion": 1302.75390625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 119.53459739685059 }, { "advantage/absmean": 0.12451171875, "entropy": 1.108477234840393, "epoch": 0.052, "grad_norm": 0.01158876392732835, "importance_ratio": 0.9918505549430847, "learning_rate": 5e-06, "loss": 0.0069, "mismatch_kl": 0.0055715711787343025, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 39, "timing/generation_ms": 9316.26115180552, "timing/scoring_ms": 0.0, "timing/total_ms": 9316.26115180552, "tokens/completion": 510.88671875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 134.0537760257721 }, { "advantage/absmean": 0.12451171875, "entropy": 1.0468562841415405, "epoch": 0.05333333333333334, "grad_norm": 0.006250915142780056, "importance_ratio": 0.993874192237854, "learning_rate": 5e-06, "loss": -0.004, "mismatch_kl": 0.00569565873593092, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 40, "timing/generation_ms": 22657.30178449303, "timing/scoring_ms": 0.0, "timing/total_ms": 22657.30178449303, "tokens/completion": 1117.0390625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 104.31990480422974 }, { "advantage/absmean": 0.12451171875, "entropy": 1.0242066383361816, "epoch": 0.05466666666666667, "grad_norm": 0.009730238448609988, "importance_ratio": 1.0014866590499878, "learning_rate": 5e-06, "loss": 0.0029, "mismatch_kl": 0.006813807878643274, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 41, "timing/generation_ms": 15266.15516282618, "timing/scoring_ms": 0.0, "timing/total_ms": 15266.15516282618, "tokens/completion": 789.296875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 137.6475269794464 }, { "advantage/absmean": 0.12451171875, "entropy": 0.9917812943458557, "epoch": 0.056, "grad_norm": 0.015130940878153589, "importance_ratio": 0.9915910959243774, "learning_rate": 5e-06, "loss": -0.0016, "mismatch_kl": 0.006494010798633099, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 42, "timing/generation_ms": 8552.51188017428, "timing/scoring_ms": 0.0, "timing/total_ms": 8552.51188017428, "tokens/completion": 427.00390625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 58.246270418167114 }, { "advantage/absmean": 0.12451171875, "entropy": 0.7529230117797852, "epoch": 0.05733333333333333, "grad_norm": 0.017225340266775354, "importance_ratio": 0.9983583092689514, "learning_rate": 5e-06, "loss": 0.0017, "mismatch_kl": 0.005849814508110285, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 43, "timing/generation_ms": 5776.03021170944, "timing/scoring_ms": 0.0, "timing/total_ms": 5776.03021170944, "tokens/completion": 292.6484375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 138.62879586219788 }, { "advantage/absmean": 0.12451171875, "entropy": 0.9116057753562927, "epoch": 0.058666666666666666, "grad_norm": 0.013240792131345649, "importance_ratio": 0.993713915348053, "learning_rate": 5e-06, "loss": -0.006, "mismatch_kl": 0.00599726801738143, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 44, "timing/generation_ms": 4909.729053266346, "timing/scoring_ms": 0.0, "timing/total_ms": 4909.729053266346, "tokens/completion": 252.2890625, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 53.461458683013916 }, { "advantage/absmean": 0.12451171875, "entropy": 0.6952740550041199, "epoch": 0.06, "grad_norm": 0.007271643900369788, "importance_ratio": 0.9978048205375671, "learning_rate": 5e-06, "loss": -0.0094, "mismatch_kl": 0.004028764553368092, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 45, "timing/generation_ms": 12042.251928709447, "timing/scoring_ms": 0.0, "timing/total_ms": 12042.251928709447, "tokens/completion": 668.03125, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 77.72424340248108 }, { "advantage/absmean": 0.12451171875, "entropy": 0.8735002279281616, "epoch": 0.06133333333333333, "grad_norm": 0.00817327643152143, "importance_ratio": 1.0016076564788818, "learning_rate": 5e-06, "loss": 0.0002, "mismatch_kl": 0.004535754211246967, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 46, "timing/generation_ms": 8553.523855283856, "timing/scoring_ms": 0.0, "timing/total_ms": 8553.523855283856, "tokens/completion": 459.87109375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 104.68091750144958 }, { "advantage/absmean": 0.12451171875, "entropy": 0.7288662195205688, "epoch": 0.06266666666666666, "grad_norm": 0.016180435920793518, "importance_ratio": 1.0001567602157593, "learning_rate": 5e-06, "loss": 0.0002, "mismatch_kl": 0.006666674744337797, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 47, "timing/generation_ms": 7466.575676575303, "timing/scoring_ms": 0.0, "timing/total_ms": 7466.575676575303, "tokens/completion": 361.484375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 59.11225175857544 }, { "advantage/absmean": 0.12451171875, "entropy": 0.6449630856513977, "epoch": 0.064, "grad_norm": 0.004581873635760183, "importance_ratio": 1.0026441812515259, "learning_rate": 5e-06, "loss": 0.0588, "mismatch_kl": 0.059744831174612045, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 48, "timing/generation_ms": 14945.35976741463, "timing/scoring_ms": 0.0, "timing/total_ms": 14945.35976741463, "tokens/completion": 1044.0, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 182.3247947692871 }, { "advantage/absmean": 0.12451171875, "entropy": 0.8048098683357239, "epoch": 0.06533333333333333, "grad_norm": 0.0052364810032066635, "importance_ratio": 0.9973055720329285, "learning_rate": 5e-06, "loss": 0.0347, "mismatch_kl": 0.0451083704829216, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 49, "timing/generation_ms": 28440.53523708135, "timing/scoring_ms": 0.0, "timing/total_ms": 28440.53523708135, "tokens/completion": 1630.4296875, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 221.0147523880005 }, { "advantage/absmean": 0.12451171875, "entropy": 0.7735000252723694, "epoch": 0.06666666666666667, "grad_norm": 0.015103596816141955, "importance_ratio": 0.9899436831474304, "learning_rate": 5e-06, "loss": -0.0022, "mismatch_kl": 0.008240272291004658, "reward": 0.12451171875, "reward/std": 0.1738164722919464, "step": 50, "timing/generation_ms": 6061.147706583142, "timing/scoring_ms": 0.0, "timing/total_ms": 6061.147706583142, "tokens/completion": 331.859375, "tokens/masked_fraction": 0.0, "wall_clock/generate_s": 108.05560183525085 } ], "logging_steps": 1, "max_steps": 750, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }