| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.39603960396039606, | |
| "eval_steps": 500, | |
| "global_step": 40, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 9300.9375, | |
| "epoch": 0.009900990099009901, | |
| "grad_norm": 0.2749840021133423, | |
| "kl": 0.0, | |
| "learning_rate": 3.999032564583976e-06, | |
| "loss": 0.009333692491054535, | |
| "max_completion_length": 14084.125, | |
| "min_completion_length": 5729.875, | |
| "num_updates": 1, | |
| "rewards": 1.173762883991003, | |
| "rewards/cosine_scaled_reward": 0.27115931920707226, | |
| "rewards/format_reward2": 0.8515625, | |
| "rewards/len_reward": 0.051041055703535676, | |
| "rewards_std": 0.5518537946045399, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 9752.296875, | |
| "epoch": 0.019801980198019802, | |
| "grad_norm": 0.23711691796779633, | |
| "kl": 0.0007762908935546875, | |
| "learning_rate": 3.996131194267188e-06, | |
| "loss": 0.016636773943901062, | |
| "max_completion_length": 14506.25, | |
| "min_completion_length": 3615.875, | |
| "num_updates": 2, | |
| "rewards": 1.011244721710682, | |
| "rewards/cosine_scaled_reward": 0.1618131911382079, | |
| "rewards/format_reward2": 0.8203125, | |
| "rewards/len_reward": 0.02911903988569975, | |
| "rewards_std": 0.6834513954818249, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 10513.359375, | |
| "epoch": 0.0297029702970297, | |
| "grad_norm": 0.26973867416381836, | |
| "kl": 0.0009961128234863281, | |
| "learning_rate": 3.9912986959380376e-06, | |
| "loss": -0.002310425043106079, | |
| "max_completion_length": 14084.875, | |
| "min_completion_length": 5952.75, | |
| "num_updates": 3, | |
| "rewards": 0.8836403228342533, | |
| "rewards/cosine_scaled_reward": 0.06623293040320277, | |
| "rewards/format_reward2": 0.84375, | |
| "rewards/len_reward": -0.026342609897255898, | |
| "rewards_std": 0.590987540781498, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 10638.6484375, | |
| "epoch": 0.039603960396039604, | |
| "grad_norm": 0.3450890779495239, | |
| "kl": 0.0011034011840820312, | |
| "learning_rate": 3.9845397447265526e-06, | |
| "loss": 2.3186206817626953e-05, | |
| "max_completion_length": 15636.125, | |
| "min_completion_length": 6815.75, | |
| "num_updates": 4, | |
| "rewards": 0.8896834207698703, | |
| "rewards/cosine_scaled_reward": 0.17461357091087848, | |
| "rewards/format_reward2": 0.7109375, | |
| "rewards/len_reward": 0.004132358357310295, | |
| "rewards_std": 0.6534126400947571, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 9484.03125, | |
| "epoch": 0.04950495049504951, | |
| "grad_norm": 0.245803564786911, | |
| "kl": 0.0010962486267089844, | |
| "learning_rate": 3.975860879481513e-06, | |
| "loss": -0.025934472680091858, | |
| "max_completion_length": 14890.25, | |
| "min_completion_length": 5349.375, | |
| "num_updates": 5, | |
| "rewards": 0.9708382207900286, | |
| "rewards/cosine_scaled_reward": 0.10498641454614699, | |
| "rewards/format_reward2": 0.859375, | |
| "rewards/len_reward": 0.0064767999574542046, | |
| "rewards_std": 0.655558355152607, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 9448.5859375, | |
| "epoch": 0.0594059405940594, | |
| "grad_norm": 0.24996301531791687, | |
| "kl": 0.0013685226440429688, | |
| "learning_rate": 3.965270496444528e-06, | |
| "loss": 0.005861759185791016, | |
| "max_completion_length": 15323.625, | |
| "min_completion_length": 3290.125, | |
| "num_updates": 6, | |
| "rewards": 0.9613782716915011, | |
| "rewards/cosine_scaled_reward": 0.2207801272161305, | |
| "rewards/format_reward2": 0.7734375, | |
| "rewards/len_reward": -0.03283937182277441, | |
| "rewards_std": 0.8270582258701324, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 9011.703125, | |
| "epoch": 0.06930693069306931, | |
| "grad_norm": 0.24825839698314667, | |
| "kl": 0.0017271041870117188, | |
| "learning_rate": 3.952778841127214e-06, | |
| "loss": -0.010295629501342773, | |
| "max_completion_length": 12310.875, | |
| "min_completion_length": 4909.25, | |
| "num_updates": 7, | |
| "rewards": 1.1428990792483091, | |
| "rewards/cosine_scaled_reward": 0.24160153639968485, | |
| "rewards/format_reward2": 0.8515625, | |
| "rewards/len_reward": 0.04973505577072501, | |
| "rewards_std": 0.5509255714714527, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 11187.453125, | |
| "epoch": 0.07920792079207921, | |
| "grad_norm": 0.2318185567855835, | |
| "kl": 0.0022907257080078125, | |
| "learning_rate": 3.938397998399332e-06, | |
| "loss": 0.007296696305274963, | |
| "max_completion_length": 14553.875, | |
| "min_completion_length": 4561.375, | |
| "num_updates": 8, | |
| "rewards": 0.8727323254570365, | |
| "rewards/cosine_scaled_reward": 0.09352816140744835, | |
| "rewards/format_reward2": 0.796875, | |
| "rewards/len_reward": -0.01767082791775465, | |
| "rewards_std": 0.6038715615868568, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 10907.1484375, | |
| "epoch": 0.0891089108910891, | |
| "grad_norm": 0.23230598866939545, | |
| "kl": 0.002445220947265625, | |
| "learning_rate": 3.922141880797449e-06, | |
| "loss": 0.016454651951789856, | |
| "max_completion_length": 15823.125, | |
| "min_completion_length": 4670.375, | |
| "num_updates": 9, | |
| "rewards": 0.8584917988628149, | |
| "rewards/cosine_scaled_reward": 0.12224693153984845, | |
| "rewards/format_reward2": 0.7421875, | |
| "rewards/len_reward": -0.0059426589868962765, | |
| "rewards_std": 0.7407274544239044, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 10617.234375, | |
| "epoch": 0.09900990099009901, | |
| "grad_norm": 0.3195263147354126, | |
| "kl": 0.0034656524658203125, | |
| "learning_rate": 3.90402621506546e-06, | |
| "loss": 0.022236675024032593, | |
| "max_completion_length": 14234.125, | |
| "min_completion_length": 6616.625, | |
| "num_updates": 10, | |
| "rewards": 0.9227555003017187, | |
| "rewards/cosine_scaled_reward": 0.16829395852982998, | |
| "rewards/format_reward2": 0.765625, | |
| "rewards/len_reward": -0.011163473129272461, | |
| "rewards_std": 0.5515045262873173, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 11003.5703125, | |
| "epoch": 0.10891089108910891, | |
| "grad_norm": 0.22265592217445374, | |
| "kl": 0.0044879913330078125, | |
| "learning_rate": 3.884068526939978e-06, | |
| "loss": -0.013431079685688019, | |
| "max_completion_length": 14716.25, | |
| "min_completion_length": 5951.375, | |
| "num_updates": 11, | |
| "rewards": 0.8861873494461179, | |
| "rewards/cosine_scaled_reward": 0.16878368379548192, | |
| "rewards/format_reward2": 0.765625, | |
| "rewards/len_reward": -0.04822135902941227, | |
| "rewards_std": 0.5935308411717415, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 7835.3984375, | |
| "epoch": 0.1188118811881188, | |
| "grad_norm": 0.3398093581199646, | |
| "kl": 0.0051021575927734375, | |
| "learning_rate": 3.862288124195319e-06, | |
| "loss": -0.013615414500236511, | |
| "max_completion_length": 13709.5, | |
| "min_completion_length": 2604.5, | |
| "num_updates": 12, | |
| "rewards": 1.274961642920971, | |
| "rewards/cosine_scaled_reward": 0.32682749163359404, | |
| "rewards/format_reward2": 0.890625, | |
| "rewards/len_reward": 0.0575091321952641, | |
| "rewards_std": 0.7341729030013084, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 9435.609375, | |
| "epoch": 0.12871287128712872, | |
| "grad_norm": 0.2820684313774109, | |
| "kl": 0.006267547607421875, | |
| "learning_rate": 3.8387060779644725e-06, | |
| "loss": 0.015070796012878418, | |
| "max_completion_length": 13926.75, | |
| "min_completion_length": 3093.375, | |
| "num_updates": 13, | |
| "rewards": 0.9852710571140051, | |
| "rewards/cosine_scaled_reward": 0.22535304143093526, | |
| "rewards/format_reward2": 0.7578125, | |
| "rewards/len_reward": 0.0021055126562714577, | |
| "rewards_std": 0.7060699462890625, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 6023.7421875, | |
| "epoch": 0.13861386138613863, | |
| "grad_norm": 0.3811286985874176, | |
| "kl": 0.006252288818359375, | |
| "learning_rate": 3.8133452023541447e-06, | |
| "loss": 0.032392144203186035, | |
| "max_completion_length": 15730.75, | |
| "min_completion_length": 1983.375, | |
| "num_updates": 14, | |
| "rewards": 1.5124556943774223, | |
| "rewards/cosine_scaled_reward": 0.5594989098608494, | |
| "rewards/format_reward2": 0.875, | |
| "rewards/len_reward": 0.07795678498223424, | |
| "rewards_std": 0.7117869555950165, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 9897.4609375, | |
| "epoch": 0.1485148514851485, | |
| "grad_norm": 0.2323319911956787, | |
| "kl": 0.006938934326171875, | |
| "learning_rate": 3.786230032373583e-06, | |
| "loss": -0.02542346715927124, | |
| "max_completion_length": 14731.125, | |
| "min_completion_length": 4343.625, | |
| "num_updates": 15, | |
| "rewards": 1.046268306672573, | |
| "rewards/cosine_scaled_reward": 0.24649553978815675, | |
| "rewards/format_reward2": 0.7890625, | |
| "rewards/len_reward": 0.010710292495787144, | |
| "rewards_std": 0.6413916498422623, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 8641.328125, | |
| "epoch": 0.15841584158415842, | |
| "grad_norm": 0.2797869145870209, | |
| "kl": 0.009868621826171875, | |
| "learning_rate": 3.7573868001985375e-06, | |
| "loss": 0.00245087593793869, | |
| "max_completion_length": 14046.0, | |
| "min_completion_length": 2590.0, | |
| "num_updates": 16, | |
| "rewards": 1.0253378190100193, | |
| "rewards/cosine_scaled_reward": 0.18167185690253973, | |
| "rewards/format_reward2": 0.8515625, | |
| "rewards/len_reward": -0.007896540686488152, | |
| "rewards_std": 0.7439497336745262, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 9286.7734375, | |
| "epoch": 0.16831683168316833, | |
| "grad_norm": 0.2538006007671356, | |
| "kl": 0.009235382080078125, | |
| "learning_rate": 3.7268434097933267e-06, | |
| "loss": 0.012023478746414185, | |
| "max_completion_length": 14357.0, | |
| "min_completion_length": 4187.625, | |
| "num_updates": 17, | |
| "rewards": 1.116831500083208, | |
| "rewards/cosine_scaled_reward": 0.26869785273447633, | |
| "rewards/format_reward2": 0.8046875, | |
| "rewards/len_reward": 0.04344612918794155, | |
| "rewards_std": 0.6232936978340149, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 7877.734375, | |
| "epoch": 0.1782178217821782, | |
| "grad_norm": 0.27215102314949036, | |
| "kl": 0.01183319091796875, | |
| "learning_rate": 3.6946294099155545e-06, | |
| "loss": 0.00474470853805542, | |
| "max_completion_length": 14090.375, | |
| "min_completion_length": 2752.125, | |
| "num_updates": 18, | |
| "rewards": 1.222687341272831, | |
| "rewards/cosine_scaled_reward": 0.3153993431478739, | |
| "rewards/format_reward2": 0.875, | |
| "rewards/len_reward": 0.032287961803376675, | |
| "rewards_std": 0.7518719509243965, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 7099.703125, | |
| "epoch": 0.18811881188118812, | |
| "grad_norm": 0.3579545319080353, | |
| "kl": 0.013427734375, | |
| "learning_rate": 3.6607759655295948e-06, | |
| "loss": 0.01689109206199646, | |
| "max_completion_length": 14201.875, | |
| "min_completion_length": 1985.625, | |
| "num_updates": 19, | |
| "rewards": 1.2932276129722595, | |
| "rewards/cosine_scaled_reward": 0.3619839735329151, | |
| "rewards/format_reward2": 0.8515625, | |
| "rewards/len_reward": 0.07968113431707025, | |
| "rewards_std": 0.7946057394146919, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 7809.1328125, | |
| "epoch": 0.19801980198019803, | |
| "grad_norm": 0.3970443904399872, | |
| "kl": 0.0154876708984375, | |
| "learning_rate": 3.6253158276565003e-06, | |
| "loss": 0.013616234064102173, | |
| "max_completion_length": 13511.875, | |
| "min_completion_length": 1854.125, | |
| "num_updates": 20, | |
| "rewards": 1.3438544012606144, | |
| "rewards/cosine_scaled_reward": 0.41305189533159137, | |
| "rewards/format_reward2": 0.875, | |
| "rewards/len_reward": 0.055802563671022654, | |
| "rewards_std": 0.5804904215037823, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 9812.9296875, | |
| "epoch": 0.2079207920792079, | |
| "grad_norm": 0.5228389501571655, | |
| "kl": 0.01609039306640625, | |
| "learning_rate": 3.5882833016895067e-06, | |
| "loss": -0.00042431801557540894, | |
| "max_completion_length": 12778.25, | |
| "min_completion_length": 4778.0, | |
| "num_updates": 21, | |
| "rewards": 1.136468593031168, | |
| "rewards/cosine_scaled_reward": 0.18093573104124516, | |
| "rewards/format_reward2": 0.875, | |
| "rewards/len_reward": 0.08053285209462047, | |
| "rewards_std": 0.5555343925952911, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 10106.0, | |
| "epoch": 0.21782178217821782, | |
| "grad_norm": 0.3004865050315857, | |
| "kl": 0.01863861083984375, | |
| "learning_rate": 3.5497142142057796e-06, | |
| "loss": 0.0011682212352752686, | |
| "max_completion_length": 13495.75, | |
| "min_completion_length": 5747.125, | |
| "num_updates": 22, | |
| "rewards": 1.1095520546659827, | |
| "rewards/cosine_scaled_reward": 0.20947218214860186, | |
| "rewards/format_reward2": 0.875, | |
| "rewards/len_reward": 0.025079891085624695, | |
| "rewards_std": 0.4830879457294941, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 8428.5078125, | |
| "epoch": 0.22772277227722773, | |
| "grad_norm": 0.36966538429260254, | |
| "kl": 0.01552581787109375, | |
| "learning_rate": 3.509645878306514e-06, | |
| "loss": 0.0047097280621528625, | |
| "max_completion_length": 14159.0, | |
| "min_completion_length": 1964.5, | |
| "num_updates": 23, | |
| "rewards": 1.1670421473681927, | |
| "rewards/cosine_scaled_reward": 0.29000907950103283, | |
| "rewards/format_reward2": 0.8515625, | |
| "rewards/len_reward": 0.025470565538853407, | |
| "rewards_std": 0.615565050393343, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 5892.2109375, | |
| "epoch": 0.2376237623762376, | |
| "grad_norm": 125.0189208984375, | |
| "kl": 0.43308258056640625, | |
| "learning_rate": 3.4681170575189206e-06, | |
| "loss": 0.00223734974861145, | |
| "max_completion_length": 11146.375, | |
| "min_completion_length": 1771.125, | |
| "num_updates": 24, | |
| "rewards": 1.4564557410776615, | |
| "rewards/cosine_scaled_reward": 0.44786818977445364, | |
| "rewards/format_reward2": 0.90625, | |
| "rewards/len_reward": 0.10233754548244178, | |
| "rewards_std": 0.6475037336349487, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 6293.140625, | |
| "epoch": 0.24752475247524752, | |
| "grad_norm": 0.4965859651565552, | |
| "kl": 0.0188446044921875, | |
| "learning_rate": 3.425167928295014e-06, | |
| "loss": 0.019756004214286804, | |
| "max_completion_length": 11885.375, | |
| "min_completion_length": 2043.5, | |
| "num_updates": 25, | |
| "rewards": 1.2420116439461708, | |
| "rewards/cosine_scaled_reward": 0.24037119653075933, | |
| "rewards/format_reward2": 0.921875, | |
| "rewards/len_reward": 0.07976543391123414, | |
| "rewards_std": 0.768707849085331, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 7320.9765625, | |
| "epoch": 0.25742574257425743, | |
| "grad_norm": 0.32264769077301025, | |
| "kl": 0.0201263427734375, | |
| "learning_rate": 3.3808400411434935e-06, | |
| "loss": 0.007990241050720215, | |
| "max_completion_length": 14976.25, | |
| "min_completion_length": 1978.0, | |
| "num_updates": 26, | |
| "rewards": 1.2049608379602432, | |
| "rewards/cosine_scaled_reward": 0.328420914709568, | |
| "rewards/format_reward2": 0.8671875, | |
| "rewards/len_reward": 0.009352410677820444, | |
| "rewards_std": 0.7654620930552483, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 5956.828125, | |
| "epoch": 0.26732673267326734, | |
| "grad_norm": 0.3196789026260376, | |
| "kl": 0.0207061767578125, | |
| "learning_rate": 3.335176280432307e-06, | |
| "loss": -0.00398920476436615, | |
| "max_completion_length": 10882.125, | |
| "min_completion_length": 2554.375, | |
| "num_updates": 27, | |
| "rewards": 1.3529352433979511, | |
| "rewards/cosine_scaled_reward": 0.30559817608445883, | |
| "rewards/format_reward2": 0.9609375, | |
| "rewards/len_reward": 0.08639959944412112, | |
| "rewards_std": 0.7427709549665451, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 6028.84375, | |
| "epoch": 0.27722772277227725, | |
| "grad_norm": 0.3432248532772064, | |
| "kl": 0.02164459228515625, | |
| "learning_rate": 3.2882208229007955e-06, | |
| "loss": -0.015418417751789093, | |
| "max_completion_length": 11848.0, | |
| "min_completion_length": 2110.75, | |
| "num_updates": 28, | |
| "rewards": 1.3594568185508251, | |
| "rewards/cosine_scaled_reward": 0.35286577604711056, | |
| "rewards/format_reward2": 0.90625, | |
| "rewards/len_reward": 0.10034103039652109, | |
| "rewards_std": 0.7156434431672096, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 8682.453125, | |
| "epoch": 0.2871287128712871, | |
| "grad_norm": 0.2364882528781891, | |
| "kl": 0.024139404296875, | |
| "learning_rate": 3.24001909492155e-06, | |
| "loss": 0.0035642534494400024, | |
| "max_completion_length": 13773.375, | |
| "min_completion_length": 3882.25, | |
| "num_updates": 29, | |
| "rewards": 1.13150573708117, | |
| "rewards/cosine_scaled_reward": 0.195555618731305, | |
| "rewards/format_reward2": 0.9140625, | |
| "rewards/len_reward": 0.021887621260248125, | |
| "rewards_std": 0.6196031682193279, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 9472.0859375, | |
| "epoch": 0.297029702970297, | |
| "grad_norm": 0.23454353213310242, | |
| "kl": 0.029876708984375, | |
| "learning_rate": 3.190617728553332e-06, | |
| "loss": 0.0017639100551605225, | |
| "max_completion_length": 14098.75, | |
| "min_completion_length": 4757.875, | |
| "num_updates": 30, | |
| "rewards": 1.0655029881745577, | |
| "rewards/cosine_scaled_reward": 0.137826404068619, | |
| "rewards/format_reward2": 0.8984375, | |
| "rewards/len_reward": 0.029239090159535408, | |
| "rewards_std": 0.5960428677499294, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 6721.03125, | |
| "epoch": 0.3069306930693069, | |
| "grad_norm": 0.36463433504104614, | |
| "kl": 0.02364349365234375, | |
| "learning_rate": 3.140064516427565e-06, | |
| "loss": 0.02541273832321167, | |
| "max_completion_length": 11538.875, | |
| "min_completion_length": 3436.5, | |
| "num_updates": 31, | |
| "rewards": 1.2696323096752167, | |
| "rewards/cosine_scaled_reward": 0.264737417222932, | |
| "rewards/format_reward2": 0.921875, | |
| "rewards/len_reward": 0.08301988849416375, | |
| "rewards_std": 0.7188399098813534, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 7162.3828125, | |
| "epoch": 0.31683168316831684, | |
| "grad_norm": 0.5464898347854614, | |
| "kl": 0.02471923828125, | |
| "learning_rate": 3.0884083655120544e-06, | |
| "loss": 0.01196742057800293, | |
| "max_completion_length": 10615.375, | |
| "min_completion_length": 4111.875, | |
| "num_updates": 32, | |
| "rewards": 1.05987061932683, | |
| "rewards/cosine_scaled_reward": 0.07295701105613261, | |
| "rewards/format_reward2": 0.96875, | |
| "rewards/len_reward": 0.018163591157644987, | |
| "rewards_std": 0.6865731440484524, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 8008.421875, | |
| "epoch": 0.32673267326732675, | |
| "grad_norm": 67.73995208740234, | |
| "kl": 0.0318756103515625, | |
| "learning_rate": 3.0356992497966503e-06, | |
| "loss": -0.01266103982925415, | |
| "max_completion_length": 12517.5, | |
| "min_completion_length": 2955.875, | |
| "num_updates": 33, | |
| "rewards": 1.1564012691378593, | |
| "rewards/cosine_scaled_reward": 0.18674227688461542, | |
| "rewards/format_reward2": 0.9453125, | |
| "rewards/len_reward": 0.024346530437469482, | |
| "rewards_std": 0.5944486074149609, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 5801.1328125, | |
| "epoch": 0.33663366336633666, | |
| "grad_norm": 0.44804856181144714, | |
| "kl": 0.0241241455078125, | |
| "learning_rate": 2.981988161946644e-06, | |
| "loss": -0.0008684098720550537, | |
| "max_completion_length": 13239.625, | |
| "min_completion_length": 1896.5, | |
| "num_updates": 34, | |
| "rewards": 1.5373370498418808, | |
| "rewards/cosine_scaled_reward": 0.4724404886364937, | |
| "rewards/format_reward2": 0.9609375, | |
| "rewards/len_reward": 0.10395912081003189, | |
| "rewards_std": 0.5325119644403458, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 7891.390625, | |
| "epoch": 0.3465346534653465, | |
| "grad_norm": 0.395713746547699, | |
| "kl": 0.0246734619140625, | |
| "learning_rate": 2.9273270639706544e-06, | |
| "loss": 0.009494274854660034, | |
| "max_completion_length": 13065.75, | |
| "min_completion_length": 3577.25, | |
| "num_updates": 35, | |
| "rewards": 1.2487693056464195, | |
| "rewards/cosine_scaled_reward": 0.21439548954367638, | |
| "rewards/format_reward2": 0.9609375, | |
| "rewards/len_reward": 0.0734363030642271, | |
| "rewards_std": 0.6400604620575905, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 8915.25, | |
| "epoch": 0.3564356435643564, | |
| "grad_norm": 0.33517077565193176, | |
| "kl": 0.0334320068359375, | |
| "learning_rate": 2.871768836950742e-06, | |
| "loss": -0.018658161163330078, | |
| "max_completion_length": 12554.875, | |
| "min_completion_length": 5071.375, | |
| "num_updates": 36, | |
| "rewards": 1.1192209478467703, | |
| "rewards/cosine_scaled_reward": 0.15578080737031996, | |
| "rewards/format_reward2": 0.9296875, | |
| "rewards/len_reward": 0.03375265281647444, | |
| "rewards_std": 0.5714416801929474, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 6871.40625, | |
| "epoch": 0.36633663366336633, | |
| "grad_norm": 0.3194531202316284, | |
| "kl": 0.0283966064453125, | |
| "learning_rate": 2.8153672298833772e-06, | |
| "loss": 0.027765318751335144, | |
| "max_completion_length": 11280.625, | |
| "min_completion_length": 2181.75, | |
| "num_updates": 37, | |
| "rewards": 1.2906805723905563, | |
| "rewards/cosine_scaled_reward": 0.289469544775784, | |
| "rewards/format_reward2": 0.9609375, | |
| "rewards/len_reward": 0.04027354822028428, | |
| "rewards_std": 0.6187824495136738, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 7555.359375, | |
| "epoch": 0.37623762376237624, | |
| "grad_norm": 0.329673707485199, | |
| "kl": 0.03289794921875, | |
| "learning_rate": 2.7581768076807586e-06, | |
| "loss": 0.00029387325048446655, | |
| "max_completion_length": 13576.25, | |
| "min_completion_length": 3269.75, | |
| "num_updates": 38, | |
| "rewards": 0.9393773451447487, | |
| "rewards/cosine_scaled_reward": 0.02606131136417389, | |
| "rewards/format_reward2": 0.9375, | |
| "rewards/len_reward": -0.024183956440538168, | |
| "rewards_std": 0.7055792585015297, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 7003.8125, | |
| "epoch": 0.38613861386138615, | |
| "grad_norm": 0.3149705231189728, | |
| "kl": 0.03338623046875, | |
| "learning_rate": 2.700252898382781e-06, | |
| "loss": 0.00039067864418029785, | |
| "max_completion_length": 12233.875, | |
| "min_completion_length": 2225.0, | |
| "num_updates": 39, | |
| "rewards": 1.323565311729908, | |
| "rewards/cosine_scaled_reward": 0.23708410863764584, | |
| "rewards/format_reward2": 0.9609375, | |
| "rewards/len_reward": 0.12554369773715734, | |
| "rewards_std": 0.639260545372963, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 5583.1171875, | |
| "epoch": 0.39603960396039606, | |
| "grad_norm": 0.4048033356666565, | |
| "kl": 0.027130126953125, | |
| "learning_rate": 2.641651539630735e-06, | |
| "loss": 0.015950188040733337, | |
| "max_completion_length": 9941.625, | |
| "min_completion_length": 1826.0, | |
| "num_updates": 40, | |
| "rewards": 1.4943003356456757, | |
| "rewards/cosine_scaled_reward": 0.4848987963050604, | |
| "rewards/format_reward2": 0.921875, | |
| "rewards/len_reward": 0.08752657752484083, | |
| "rewards_std": 0.636099562048912, | |
| "step": 40 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 101, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 5, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": true, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0, | |
| "train_batch_size": null, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |