| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9998343548119927, | |
| "eval_steps": 50, | |
| "global_step": 1509, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 384.27969818115236, | |
| "epoch": 0.006625807520291536, | |
| "grad_norm": 0.9638224244117737, | |
| "kl": 0.000667405128479004, | |
| "learning_rate": 1.3245033112582784e-06, | |
| "loss": 0.0, | |
| "reward": 0.44505209624767306, | |
| "reward_std": 0.42465767413377764, | |
| "rewards/accuracy_reward": 0.14088542019017042, | |
| "rewards/format_reward": 0.3041666761040688, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 186.6330778121948, | |
| "epoch": 0.013251615040583071, | |
| "grad_norm": 0.6953328847885132, | |
| "kl": 0.033650970458984374, | |
| "learning_rate": 2.6490066225165567e-06, | |
| "loss": 0.0013, | |
| "reward": 0.9434896126389504, | |
| "reward_std": 0.258835174748674, | |
| "rewards/accuracy_reward": 0.06432291923556477, | |
| "rewards/format_reward": 0.8791666895151138, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 147.85651473999025, | |
| "epoch": 0.019877422560874606, | |
| "grad_norm": 0.45314687490463257, | |
| "kl": 0.04227294921875, | |
| "learning_rate": 3.973509933774835e-06, | |
| "loss": 0.0017, | |
| "reward": 1.0822917133569718, | |
| "reward_std": 0.1591762812808156, | |
| "rewards/accuracy_reward": 0.09479166874662041, | |
| "rewards/format_reward": 0.9875000178813934, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 210.63177680969238, | |
| "epoch": 0.026503230081166142, | |
| "grad_norm": 0.4808696508407593, | |
| "kl": 0.0356292724609375, | |
| "learning_rate": 5.2980132450331135e-06, | |
| "loss": 0.0014, | |
| "reward": 1.1015625283122064, | |
| "reward_std": 0.22788139712065458, | |
| "rewards/accuracy_reward": 0.1317708377726376, | |
| "rewards/format_reward": 0.9697916865348816, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 262.94844551086425, | |
| "epoch": 0.033129037601457675, | |
| "grad_norm": 0.42699357867240906, | |
| "kl": 0.048345947265625, | |
| "learning_rate": 6.622516556291392e-06, | |
| "loss": 0.0019, | |
| "reward": 1.179687538743019, | |
| "reward_std": 0.28927393443882465, | |
| "rewards/accuracy_reward": 0.21250000605359673, | |
| "rewards/format_reward": 0.9671875178813935, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.033129037601457675, | |
| "eval_completion_length": 312.76158820258246, | |
| "eval_kl": 0.046305338541666664, | |
| "eval_loss": 0.001854513306170702, | |
| "eval_reward": 1.339120414521959, | |
| "eval_reward_std": 0.2885145727131102, | |
| "eval_rewards/accuracy_reward": 0.37615741623772514, | |
| "eval_rewards/format_reward": 0.962962978416019, | |
| "eval_runtime": 54.3599, | |
| "eval_samples_per_second": 1.821, | |
| "eval_steps_per_second": 0.166, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 367.77709159851076, | |
| "epoch": 0.03975484512174921, | |
| "grad_norm": 0.30974721908569336, | |
| "kl": 0.0423248291015625, | |
| "learning_rate": 7.94701986754967e-06, | |
| "loss": 0.0017, | |
| "reward": 1.28098963201046, | |
| "reward_std": 0.3665796037763357, | |
| "rewards/accuracy_reward": 0.33697917461395266, | |
| "rewards/format_reward": 0.9440104380249977, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 389.0981872558594, | |
| "epoch": 0.04638065264204075, | |
| "grad_norm": 0.46462583541870117, | |
| "kl": 0.054632568359375, | |
| "learning_rate": 9.271523178807948e-06, | |
| "loss": 0.0022, | |
| "reward": 1.3294271290302277, | |
| "reward_std": 0.3420734729617834, | |
| "rewards/accuracy_reward": 0.36953126192092894, | |
| "rewards/format_reward": 0.9598958507180214, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 371.92214546203616, | |
| "epoch": 0.053006460162332285, | |
| "grad_norm": 0.3880268633365631, | |
| "kl": 0.068634033203125, | |
| "learning_rate": 1.0596026490066227e-05, | |
| "loss": 0.0027, | |
| "reward": 1.3507812947034836, | |
| "reward_std": 0.3280396033078432, | |
| "rewards/accuracy_reward": 0.3901041779667139, | |
| "rewards/format_reward": 0.9606771051883698, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 326.34297904968264, | |
| "epoch": 0.05963226768262382, | |
| "grad_norm": 0.3763599693775177, | |
| "kl": 1.29837646484375, | |
| "learning_rate": 1.1920529801324505e-05, | |
| "loss": 0.0519, | |
| "reward": 1.323177123069763, | |
| "reward_std": 0.32737944051623347, | |
| "rewards/accuracy_reward": 0.36510417610406876, | |
| "rewards/format_reward": 0.9580729365348816, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 358.67058486938475, | |
| "epoch": 0.06625807520291535, | |
| "grad_norm": 0.9632219672203064, | |
| "kl": 6.1518310546875, | |
| "learning_rate": 1.3245033112582784e-05, | |
| "loss": 0.2462, | |
| "reward": 1.2625000327825546, | |
| "reward_std": 0.4080970410257578, | |
| "rewards/accuracy_reward": 0.369270845875144, | |
| "rewards/format_reward": 0.8932291895151139, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.06625807520291535, | |
| "eval_completion_length": 325.4479259914822, | |
| "eval_kl": 0.2577582465277778, | |
| "eval_loss": 0.01030731201171875, | |
| "eval_reward": 1.400462998284234, | |
| "eval_reward_std": 0.3469897309939067, | |
| "eval_rewards/accuracy_reward": 0.49768519401550293, | |
| "eval_rewards/format_reward": 0.902777804268731, | |
| "eval_runtime": 51.973, | |
| "eval_samples_per_second": 1.905, | |
| "eval_steps_per_second": 0.173, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 340.5109470367432, | |
| "epoch": 0.0728838827232069, | |
| "grad_norm": 0.47404468059539795, | |
| "kl": 0.2045654296875, | |
| "learning_rate": 1.456953642384106e-05, | |
| "loss": 0.0082, | |
| "reward": 1.336718785762787, | |
| "reward_std": 0.4285093888640404, | |
| "rewards/accuracy_reward": 0.4596354283392429, | |
| "rewards/format_reward": 0.8770833566784859, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 255.94688339233397, | |
| "epoch": 0.07950969024349842, | |
| "grad_norm": 0.4434707462787628, | |
| "kl": 0.19765625, | |
| "learning_rate": 1.589403973509934e-05, | |
| "loss": 0.0079, | |
| "reward": 1.2223958760499953, | |
| "reward_std": 0.3857662923634052, | |
| "rewards/accuracy_reward": 0.3195312611758709, | |
| "rewards/format_reward": 0.9028646096587181, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 277.33933143615724, | |
| "epoch": 0.08613549776378997, | |
| "grad_norm": 0.3981403410434723, | |
| "kl": 1.035498046875, | |
| "learning_rate": 1.7218543046357617e-05, | |
| "loss": 0.0415, | |
| "reward": 1.249479204416275, | |
| "reward_std": 0.38627928495407104, | |
| "rewards/accuracy_reward": 0.3450520932674408, | |
| "rewards/format_reward": 0.9044271036982536, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 257.51510963439944, | |
| "epoch": 0.0927613052840815, | |
| "grad_norm": 0.6040926575660706, | |
| "kl": 0.2771484375, | |
| "learning_rate": 1.8543046357615895e-05, | |
| "loss": 0.0111, | |
| "reward": 1.186458373069763, | |
| "reward_std": 0.3947778932750225, | |
| "rewards/accuracy_reward": 0.28750000707805157, | |
| "rewards/format_reward": 0.8989583507180214, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 270.6711009979248, | |
| "epoch": 0.09938711280437303, | |
| "grad_norm": 0.3564830422401428, | |
| "kl": 0.252197265625, | |
| "learning_rate": 1.9867549668874173e-05, | |
| "loss": 0.0101, | |
| "reward": 1.2130208671092988, | |
| "reward_std": 0.4041451971977949, | |
| "rewards/accuracy_reward": 0.33229167833924295, | |
| "rewards/format_reward": 0.8807291880249977, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.09938711280437303, | |
| "eval_completion_length": 208.04398727416992, | |
| "eval_kl": 0.3232421875, | |
| "eval_loss": 0.013497698120772839, | |
| "eval_reward": 1.3125000264909532, | |
| "eval_reward_std": 0.30849772029452854, | |
| "eval_rewards/accuracy_reward": 0.3773148192299737, | |
| "eval_rewards/format_reward": 0.9351852072609795, | |
| "eval_runtime": 58.9942, | |
| "eval_samples_per_second": 1.678, | |
| "eval_steps_per_second": 0.153, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 186.75338973999024, | |
| "epoch": 0.10601292032466457, | |
| "grad_norm": 0.7736382484436035, | |
| "kl": 0.6962646484375, | |
| "learning_rate": 1.999783259765003e-05, | |
| "loss": 0.0279, | |
| "reward": 1.183854204416275, | |
| "reward_std": 0.3419460911303759, | |
| "rewards/accuracy_reward": 0.2510416740551591, | |
| "rewards/format_reward": 0.9328125193715096, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 169.17083854675292, | |
| "epoch": 0.1126387278449561, | |
| "grad_norm": 0.3634221851825714, | |
| "kl": 0.459130859375, | |
| "learning_rate": 1.99903415488154e-05, | |
| "loss": 0.0183, | |
| "reward": 1.1283854573965073, | |
| "reward_std": 0.3242658941075206, | |
| "rewards/accuracy_reward": 0.20260417312383652, | |
| "rewards/format_reward": 0.9257812693715095, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 275.55860290527346, | |
| "epoch": 0.11926453536524764, | |
| "grad_norm": 0.38542240858078003, | |
| "kl": 0.3050048828125, | |
| "learning_rate": 1.997750410337147e-05, | |
| "loss": 0.0122, | |
| "reward": 1.182031288743019, | |
| "reward_std": 0.32699903920292855, | |
| "rewards/accuracy_reward": 0.24817709140479566, | |
| "rewards/format_reward": 0.9338541835546493, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 373.55287437438966, | |
| "epoch": 0.12589034288553919, | |
| "grad_norm": 1.1466913223266602, | |
| "kl": 0.5844970703125, | |
| "learning_rate": 1.995932713136112e-05, | |
| "loss": 0.0234, | |
| "reward": 1.1554687857627868, | |
| "reward_std": 0.3822615996003151, | |
| "rewards/accuracy_reward": 0.2549479236826301, | |
| "rewards/format_reward": 0.9005208566784859, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 239.1380271911621, | |
| "epoch": 0.1325161504058307, | |
| "grad_norm": 2.0522491931915283, | |
| "kl": 0.338720703125, | |
| "learning_rate": 1.993582036030978e-05, | |
| "loss": 0.0135, | |
| "reward": 1.128385452926159, | |
| "reward_std": 0.33877944238483904, | |
| "rewards/accuracy_reward": 0.2083333382382989, | |
| "rewards/format_reward": 0.9200521066784859, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.1325161504058307, | |
| "eval_completion_length": 243.033571879069, | |
| "eval_kl": 0.3569878472222222, | |
| "eval_loss": 0.013369406573474407, | |
| "eval_reward": 1.207175976700253, | |
| "eval_reward_std": 0.37689801057179767, | |
| "eval_rewards/accuracy_reward": 0.32175926284657586, | |
| "eval_rewards/format_reward": 0.8854166865348816, | |
| "eval_runtime": 50.2709, | |
| "eval_samples_per_second": 1.969, | |
| "eval_steps_per_second": 0.179, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 236.56641330718995, | |
| "epoch": 0.13914195792612225, | |
| "grad_norm": 0.3385501503944397, | |
| "kl": 0.2685546875, | |
| "learning_rate": 1.9906996370019692e-05, | |
| "loss": 0.0107, | |
| "reward": 1.1578125387430191, | |
| "reward_std": 0.3522339530289173, | |
| "rewards/accuracy_reward": 0.24557292349636556, | |
| "rewards/format_reward": 0.912239608168602, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 201.9429744720459, | |
| "epoch": 0.1457677654464138, | |
| "grad_norm": 0.7260228395462036, | |
| "kl": 0.4362060546875, | |
| "learning_rate": 1.9872870585837757e-05, | |
| "loss": 0.0174, | |
| "reward": 1.1442708656191827, | |
| "reward_std": 0.3241072274744511, | |
| "rewards/accuracy_reward": 0.22395833912305535, | |
| "rewards/format_reward": 0.9203125178813935, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 171.1460983276367, | |
| "epoch": 0.1523935729667053, | |
| "grad_norm": 0.9234119057655334, | |
| "kl": 0.371435546875, | |
| "learning_rate": 1.983346127040053e-05, | |
| "loss": 0.0149, | |
| "reward": 1.1585937857627868, | |
| "reward_std": 0.3430036876350641, | |
| "rewards/accuracy_reward": 0.2291666740551591, | |
| "rewards/format_reward": 0.9294271036982537, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 184.18906688690186, | |
| "epoch": 0.15901938048699685, | |
| "grad_norm": 0.41350895166397095, | |
| "kl": 0.4388671875, | |
| "learning_rate": 1.9788789513860875e-05, | |
| "loss": 0.0176, | |
| "reward": 1.1466146260499954, | |
| "reward_std": 0.3448056776076555, | |
| "rewards/accuracy_reward": 0.22135417200624943, | |
| "rewards/format_reward": 0.9252604350447655, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 170.5622449874878, | |
| "epoch": 0.1656451880072884, | |
| "grad_norm": 0.4960261881351471, | |
| "kl": 0.430224609375, | |
| "learning_rate": 1.9738879222601425e-05, | |
| "loss": 0.0172, | |
| "reward": 1.1361979573965073, | |
| "reward_std": 0.3519858349114656, | |
| "rewards/accuracy_reward": 0.21927083879709244, | |
| "rewards/format_reward": 0.9169271022081376, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.1656451880072884, | |
| "eval_completion_length": 177.629635281033, | |
| "eval_kl": 0.4537760416666667, | |
| "eval_loss": 0.018489297479391098, | |
| "eval_reward": 1.1678241226408217, | |
| "eval_reward_std": 0.40494963857862687, | |
| "eval_rewards/accuracy_reward": 0.2627314892080095, | |
| "eval_rewards/format_reward": 0.9050926036304898, | |
| "eval_runtime": 49.3339, | |
| "eval_samples_per_second": 2.007, | |
| "eval_steps_per_second": 0.182, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 165.07448387145996, | |
| "epoch": 0.17227099552757993, | |
| "grad_norm": 0.4660806655883789, | |
| "kl": 0.3287353515625, | |
| "learning_rate": 1.968375710644093e-05, | |
| "loss": 0.0132, | |
| "reward": 1.1317708671092988, | |
| "reward_std": 0.33136086612939836, | |
| "rewards/accuracy_reward": 0.20364583972841502, | |
| "rewards/format_reward": 0.9281250208616256, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 153.50963916778565, | |
| "epoch": 0.17889680304787145, | |
| "grad_norm": 0.46810558438301086, | |
| "kl": 0.4310546875, | |
| "learning_rate": 1.9623452664340305e-05, | |
| "loss": 0.0173, | |
| "reward": 1.1289062976837159, | |
| "reward_std": 0.3033852633088827, | |
| "rewards/accuracy_reward": 0.19479167063254862, | |
| "rewards/format_reward": 0.9341146022081375, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 127.1726598739624, | |
| "epoch": 0.185522610568163, | |
| "grad_norm": 0.36899781227111816, | |
| "kl": 0.368310546875, | |
| "learning_rate": 1.9557998168616087e-05, | |
| "loss": 0.0147, | |
| "reward": 1.1588542029261588, | |
| "reward_std": 0.2841993160545826, | |
| "rewards/accuracy_reward": 0.20598958879709245, | |
| "rewards/format_reward": 0.9528646022081375, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 126.30937976837158, | |
| "epoch": 0.19214841808845454, | |
| "grad_norm": 0.38908788561820984, | |
| "kl": 0.504345703125, | |
| "learning_rate": 1.9487428647669688e-05, | |
| "loss": 0.0202, | |
| "reward": 1.1286458730697633, | |
| "reward_std": 0.2689166348427534, | |
| "rewards/accuracy_reward": 0.18385417251847685, | |
| "rewards/format_reward": 0.9447916865348815, | |
| "step": 290 | |
| }, | |
| { | |
| "completion_length": 164.8523473739624, | |
| "epoch": 0.19877422560874605, | |
| "grad_norm": 0.580731213092804, | |
| "kl": 0.39619140625, | |
| "learning_rate": 1.9411781867241718e-05, | |
| "loss": 0.0159, | |
| "reward": 1.1351562827825545, | |
| "reward_std": 0.2917447902262211, | |
| "rewards/accuracy_reward": 0.19661458898335696, | |
| "rewards/format_reward": 0.9385416880249977, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.19877422560874605, | |
| "eval_completion_length": 169.12963443332248, | |
| "eval_kl": 0.7450086805555556, | |
| "eval_loss": 0.031219787895679474, | |
| "eval_reward": 1.1620370944341023, | |
| "eval_reward_std": 0.3717506031195323, | |
| "eval_rewards/accuracy_reward": 0.22916667411724725, | |
| "eval_rewards/format_reward": 0.9328703946537442, | |
| "eval_runtime": 48.3046, | |
| "eval_samples_per_second": 2.049, | |
| "eval_steps_per_second": 0.186, | |
| "step": 300 | |
| }, | |
| { | |
| "completion_length": 137.1604206085205, | |
| "epoch": 0.2054000331290376, | |
| "grad_norm": 0.38375967741012573, | |
| "kl": 0.378857421875, | |
| "learning_rate": 1.9331098310201392e-05, | |
| "loss": 0.0152, | |
| "reward": 1.1098958775401115, | |
| "reward_std": 0.265699202939868, | |
| "rewards/accuracy_reward": 0.16588542177341878, | |
| "rewards/format_reward": 0.9440104380249977, | |
| "step": 310 | |
| }, | |
| { | |
| "completion_length": 188.78802585601807, | |
| "epoch": 0.21202584064932914, | |
| "grad_norm": 0.33487841486930847, | |
| "kl": 0.496142578125, | |
| "learning_rate": 1.9245421154881873e-05, | |
| "loss": 0.0199, | |
| "reward": 1.1182292073965072, | |
| "reward_std": 0.31800296930596234, | |
| "rewards/accuracy_reward": 0.18828125610016286, | |
| "rewards/format_reward": 0.9299479365348816, | |
| "step": 320 | |
| }, | |
| { | |
| "completion_length": 186.27344207763673, | |
| "epoch": 0.21865164816962068, | |
| "grad_norm": 1.0283613204956055, | |
| "kl": 0.3497314453125, | |
| "learning_rate": 1.9154796251973092e-05, | |
| "loss": 0.014, | |
| "reward": 1.1596354573965073, | |
| "reward_std": 0.3103053130209446, | |
| "rewards/accuracy_reward": 0.217968756519258, | |
| "rewards/format_reward": 0.9416666910052299, | |
| "step": 330 | |
| }, | |
| { | |
| "completion_length": 204.39453773498536, | |
| "epoch": 0.2252774556899122, | |
| "grad_norm": 0.43050748109817505, | |
| "kl": 0.498681640625, | |
| "learning_rate": 1.905927209998447e-05, | |
| "loss": 0.0199, | |
| "reward": 1.1023437835276126, | |
| "reward_std": 0.30628957897424697, | |
| "rewards/accuracy_reward": 0.212239589355886, | |
| "rewards/format_reward": 0.8901041835546494, | |
| "step": 340 | |
| }, | |
| { | |
| "completion_length": 211.14583930969238, | |
| "epoch": 0.23190326321020374, | |
| "grad_norm": 0.9810725450515747, | |
| "kl": 0.328759765625, | |
| "learning_rate": 1.8958899819290592e-05, | |
| "loss": 0.0132, | |
| "reward": 1.0622396171092987, | |
| "reward_std": 0.2948887083679438, | |
| "rewards/accuracy_reward": 0.15156250395812093, | |
| "rewards/format_reward": 0.9106771036982536, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.23190326321020374, | |
| "eval_completion_length": 187.81829155815973, | |
| "eval_kl": 0.3449435763888889, | |
| "eval_loss": 0.014129959046840668, | |
| "eval_reward": 1.2037037346098158, | |
| "eval_reward_std": 0.3060726622740428, | |
| "eval_rewards/accuracy_reward": 0.27893519235981834, | |
| "eval_rewards/format_reward": 0.9247685339715745, | |
| "eval_runtime": 46.4056, | |
| "eval_samples_per_second": 2.133, | |
| "eval_steps_per_second": 0.194, | |
| "step": 350 | |
| }, | |
| { | |
| "completion_length": 170.20156650543214, | |
| "epoch": 0.23852907073049529, | |
| "grad_norm": 0.31330692768096924, | |
| "kl": 0.476318359375, | |
| "learning_rate": 1.8853733124773837e-05, | |
| "loss": 0.019, | |
| "reward": 1.0606771275401115, | |
| "reward_std": 0.2639134880155325, | |
| "rewards/accuracy_reward": 0.12161458658520133, | |
| "rewards/format_reward": 0.9390625208616257, | |
| "step": 360 | |
| }, | |
| { | |
| "completion_length": 125.54062900543212, | |
| "epoch": 0.24515487825078683, | |
| "grad_norm": 0.34919577836990356, | |
| "kl": 0.339111328125, | |
| "learning_rate": 1.8743828297078485e-05, | |
| "loss": 0.0136, | |
| "reward": 1.1145833671092986, | |
| "reward_std": 0.24877706002444028, | |
| "rewards/accuracy_reward": 0.15442708847112954, | |
| "rewards/format_reward": 0.9601562678813934, | |
| "step": 370 | |
| }, | |
| { | |
| "completion_length": 124.51458683013917, | |
| "epoch": 0.25178068577107837, | |
| "grad_norm": 0.6795300841331482, | |
| "kl": 0.314599609375, | |
| "learning_rate": 1.8629244152491773e-05, | |
| "loss": 0.0126, | |
| "reward": 1.1388021245598794, | |
| "reward_std": 0.25418675877153873, | |
| "rewards/accuracy_reward": 0.17187500512227416, | |
| "rewards/format_reward": 0.9669271022081375, | |
| "step": 380 | |
| }, | |
| { | |
| "completion_length": 185.66432704925538, | |
| "epoch": 0.2584064932913699, | |
| "grad_norm": 0.29387614130973816, | |
| "kl": 0.362744140625, | |
| "learning_rate": 1.8510042011467978e-05, | |
| "loss": 0.0145, | |
| "reward": 1.0606771200895309, | |
| "reward_std": 0.30348861529491844, | |
| "rewards/accuracy_reward": 0.1322916704695672, | |
| "rewards/format_reward": 0.9283854335546493, | |
| "step": 390 | |
| }, | |
| { | |
| "completion_length": 110.68646202087402, | |
| "epoch": 0.2650323008116614, | |
| "grad_norm": 0.3515387177467346, | |
| "kl": 0.326708984375, | |
| "learning_rate": 1.838628566581236e-05, | |
| "loss": 0.0131, | |
| "reward": 1.1135417073965073, | |
| "reward_std": 0.20253405962139368, | |
| "rewards/accuracy_reward": 0.13723958749324083, | |
| "rewards/format_reward": 0.9763021036982537, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.2650323008116614, | |
| "eval_completion_length": 116.7835676405165, | |
| "eval_kl": 0.3328993055555556, | |
| "eval_loss": 0.013369088061153889, | |
| "eval_reward": 1.2650463183720906, | |
| "eval_reward_std": 0.2229729178878996, | |
| "eval_rewards/accuracy_reward": 0.2812500033113692, | |
| "eval_rewards/format_reward": 0.9837963117493523, | |
| "eval_runtime": 40.8337, | |
| "eval_samples_per_second": 2.424, | |
| "eval_steps_per_second": 0.22, | |
| "step": 400 | |
| }, | |
| { | |
| "completion_length": 166.33490180969238, | |
| "epoch": 0.271658108331953, | |
| "grad_norm": 0.3290473520755768, | |
| "kl": 0.31640625, | |
| "learning_rate": 1.8258041344542567e-05, | |
| "loss": 0.0126, | |
| "reward": 1.115625037252903, | |
| "reward_std": 0.24871433693915607, | |
| "rewards/accuracy_reward": 0.15520833851769567, | |
| "rewards/format_reward": 0.9604166850447655, | |
| "step": 410 | |
| }, | |
| { | |
| "completion_length": 162.28073406219482, | |
| "epoch": 0.2782839158522445, | |
| "grad_norm": 1.5391435623168945, | |
| "kl": 0.419970703125, | |
| "learning_rate": 1.8125377678445755e-05, | |
| "loss": 0.0168, | |
| "reward": 1.1242187842726707, | |
| "reward_std": 0.2836728408932686, | |
| "rewards/accuracy_reward": 0.17578125637955963, | |
| "rewards/format_reward": 0.9484375208616257, | |
| "step": 420 | |
| }, | |
| { | |
| "completion_length": 84.54088821411133, | |
| "epoch": 0.284909723372536, | |
| "grad_norm": 0.4486382305622101, | |
| "kl": 0.465869140625, | |
| "learning_rate": 1.7988365663350352e-05, | |
| "loss": 0.0186, | |
| "reward": 1.1148437857627869, | |
| "reward_std": 0.25627183392643926, | |
| "rewards/accuracy_reward": 0.16406250447034837, | |
| "rewards/format_reward": 0.9507812678813934, | |
| "step": 430 | |
| }, | |
| { | |
| "completion_length": 111.79870109558105, | |
| "epoch": 0.2915355308928276, | |
| "grad_norm": 0.47133392095565796, | |
| "kl": 0.50244140625, | |
| "learning_rate": 1.7847078622132202e-05, | |
| "loss": 0.0201, | |
| "reward": 1.1361979499459267, | |
| "reward_std": 0.25920494105666875, | |
| "rewards/accuracy_reward": 0.19583333916962148, | |
| "rewards/format_reward": 0.9403646036982536, | |
| "step": 440 | |
| }, | |
| { | |
| "completion_length": 95.4588568687439, | |
| "epoch": 0.2981613384131191, | |
| "grad_norm": 0.4222451150417328, | |
| "kl": 0.48828125, | |
| "learning_rate": 1.770159216547532e-05, | |
| "loss": 0.0195, | |
| "reward": 1.1565104603767395, | |
| "reward_std": 0.2345227889716625, | |
| "rewards/accuracy_reward": 0.1872395884245634, | |
| "rewards/format_reward": 0.9692708507180214, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.2981613384131191, | |
| "eval_completion_length": 139.7500059339735, | |
| "eval_kl": 0.4025607638888889, | |
| "eval_loss": 0.016312483698129654, | |
| "eval_reward": 1.1759259833229914, | |
| "eval_reward_std": 0.3269110951158736, | |
| "eval_rewards/accuracy_reward": 0.23148148589664036, | |
| "eval_rewards/format_reward": 0.9444444643126594, | |
| "eval_runtime": 45.7275, | |
| "eval_samples_per_second": 2.165, | |
| "eval_steps_per_second": 0.197, | |
| "step": 450 | |
| }, | |
| { | |
| "completion_length": 169.73099479675292, | |
| "epoch": 0.3047871459334106, | |
| "grad_norm": 0.41585874557495117, | |
| "kl": 0.4705078125, | |
| "learning_rate": 1.7551984151408363e-05, | |
| "loss": 0.0188, | |
| "reward": 1.0976562932133676, | |
| "reward_std": 0.3228786814957857, | |
| "rewards/accuracy_reward": 0.17473958879709245, | |
| "rewards/format_reward": 0.9229166880249977, | |
| "step": 460 | |
| }, | |
| { | |
| "completion_length": 94.51797103881836, | |
| "epoch": 0.3114129534537022, | |
| "grad_norm": 0.40987807512283325, | |
| "kl": 0.56259765625, | |
| "learning_rate": 1.739833464363838e-05, | |
| "loss": 0.0225, | |
| "reward": 1.1330729693174362, | |
| "reward_std": 0.22887265272438526, | |
| "rewards/accuracy_reward": 0.16380208819173275, | |
| "rewards/format_reward": 0.9692708566784859, | |
| "step": 470 | |
| }, | |
| { | |
| "completion_length": 124.16875343322754, | |
| "epoch": 0.3180387609739937, | |
| "grad_norm": 0.4068983495235443, | |
| "kl": 0.44365234375, | |
| "learning_rate": 1.7240725868704218e-05, | |
| "loss": 0.0177, | |
| "reward": 1.0838542029261589, | |
| "reward_std": 0.25959088616073134, | |
| "rewards/accuracy_reward": 0.13828125388827175, | |
| "rewards/format_reward": 0.9455729335546493, | |
| "step": 480 | |
| }, | |
| { | |
| "completion_length": 167.1687551498413, | |
| "epoch": 0.32466456849428527, | |
| "grad_norm": 0.34697845578193665, | |
| "kl": 0.405126953125, | |
| "learning_rate": 1.7079242171972417e-05, | |
| "loss": 0.0162, | |
| "reward": 1.1450521230697632, | |
| "reward_std": 0.280022681877017, | |
| "rewards/accuracy_reward": 0.19843750591389836, | |
| "rewards/format_reward": 0.946614608168602, | |
| "step": 490 | |
| }, | |
| { | |
| "completion_length": 148.58568172454835, | |
| "epoch": 0.3312903760145768, | |
| "grad_norm": 0.3142317235469818, | |
| "kl": 0.427001953125, | |
| "learning_rate": 1.6913969972499272e-05, | |
| "loss": 0.0171, | |
| "reward": 1.1440104603767396, | |
| "reward_std": 0.30612033531069754, | |
| "rewards/accuracy_reward": 0.20625000540167093, | |
| "rewards/format_reward": 0.9377604395151138, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.3312903760145768, | |
| "eval_completion_length": 123.2951431274414, | |
| "eval_kl": 0.4292534722222222, | |
| "eval_loss": 0.017204057425260544, | |
| "eval_reward": 1.2835648589664035, | |
| "eval_reward_std": 0.30847717821598053, | |
| "eval_rewards/accuracy_reward": 0.32060185737080044, | |
| "eval_rewards/format_reward": 0.9629629850387573, | |
| "eval_runtime": 44.5786, | |
| "eval_samples_per_second": 2.221, | |
| "eval_steps_per_second": 0.202, | |
| "step": 500 | |
| }, | |
| { | |
| "completion_length": 114.76849279403686, | |
| "epoch": 0.3379161835348683, | |
| "grad_norm": 0.4374355673789978, | |
| "kl": 0.5587890625, | |
| "learning_rate": 1.674499771678309e-05, | |
| "loss": 0.0224, | |
| "reward": 1.1583333671092988, | |
| "reward_std": 0.2752871666103601, | |
| "rewards/accuracy_reward": 0.20286458814516664, | |
| "rewards/format_reward": 0.9554687708616256, | |
| "step": 510 | |
| }, | |
| { | |
| "completion_length": 118.95989866256714, | |
| "epoch": 0.34454199105515987, | |
| "grad_norm": 0.3690480887889862, | |
| "kl": 0.47080078125, | |
| "learning_rate": 1.6572415831431466e-05, | |
| "loss": 0.0188, | |
| "reward": 1.1442708760499953, | |
| "reward_std": 0.266701377555728, | |
| "rewards/accuracy_reward": 0.188541672937572, | |
| "rewards/format_reward": 0.9557291850447655, | |
| "step": 520 | |
| }, | |
| { | |
| "completion_length": 154.7580778121948, | |
| "epoch": 0.3511677985754514, | |
| "grad_norm": 0.33155742287635803, | |
| "kl": 0.51572265625, | |
| "learning_rate": 1.6396316674768914e-05, | |
| "loss": 0.0206, | |
| "reward": 1.108593787252903, | |
| "reward_std": 0.3096882740035653, | |
| "rewards/accuracy_reward": 0.18203125561121852, | |
| "rewards/format_reward": 0.9265625208616257, | |
| "step": 530 | |
| }, | |
| { | |
| "completion_length": 76.65052275657654, | |
| "epoch": 0.3577936060957429, | |
| "grad_norm": 0.534902036190033, | |
| "kl": 0.69091796875, | |
| "learning_rate": 1.621679448741067e-05, | |
| "loss": 0.0276, | |
| "reward": 1.1270833760499954, | |
| "reward_std": 0.25888577867299317, | |
| "rewards/accuracy_reward": 0.17552083907648922, | |
| "rewards/format_reward": 0.9515625208616256, | |
| "step": 540 | |
| }, | |
| { | |
| "completion_length": 140.45651397705078, | |
| "epoch": 0.36441941361603447, | |
| "grad_norm": 0.7749128937721252, | |
| "kl": 0.670166015625, | |
| "learning_rate": 1.603394534182925e-05, | |
| "loss": 0.0268, | |
| "reward": 1.0437500402331352, | |
| "reward_std": 0.2872411595657468, | |
| "rewards/accuracy_reward": 0.12812500363215804, | |
| "rewards/format_reward": 0.9156250163912774, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.36441941361603447, | |
| "eval_completion_length": 162.37384711371527, | |
| "eval_kl": 0.5069444444444444, | |
| "eval_loss": 0.020352717489004135, | |
| "eval_reward": 1.1203704012764826, | |
| "eval_reward_std": 0.34640828768412274, | |
| "eval_rewards/accuracy_reward": 0.20717593530813852, | |
| "eval_rewards/format_reward": 0.9131944643126594, | |
| "eval_runtime": 48.3205, | |
| "eval_samples_per_second": 2.049, | |
| "eval_steps_per_second": 0.186, | |
| "step": 550 | |
| }, | |
| { | |
| "completion_length": 134.0356803894043, | |
| "epoch": 0.371045221136326, | |
| "grad_norm": 0.3628806173801422, | |
| "kl": 0.53544921875, | |
| "learning_rate": 1.5847867090940602e-05, | |
| "loss": 0.0214, | |
| "reward": 1.1044271185994148, | |
| "reward_std": 0.2886835677549243, | |
| "rewards/accuracy_reward": 0.16770833884365857, | |
| "rewards/format_reward": 0.9367187678813934, | |
| "step": 560 | |
| }, | |
| { | |
| "completion_length": 130.9809928894043, | |
| "epoch": 0.3776710286566175, | |
| "grad_norm": 0.4081813395023346, | |
| "kl": 0.500341796875, | |
| "learning_rate": 1.5658659315737505e-05, | |
| "loss": 0.02, | |
| "reward": 1.13567713201046, | |
| "reward_std": 0.25305410884320734, | |
| "rewards/accuracy_reward": 0.18567708935588598, | |
| "rewards/format_reward": 0.9500000193715096, | |
| "step": 570 | |
| }, | |
| { | |
| "completion_length": 114.95755519866944, | |
| "epoch": 0.3842968361769091, | |
| "grad_norm": 0.37676241993904114, | |
| "kl": 0.564794921875, | |
| "learning_rate": 1.5466423271998144e-05, | |
| "loss": 0.0226, | |
| "reward": 1.135416714847088, | |
| "reward_std": 0.2524235276505351, | |
| "rewards/accuracy_reward": 0.18697917149402202, | |
| "rewards/format_reward": 0.9484375223517418, | |
| "step": 580 | |
| }, | |
| { | |
| "completion_length": 115.24349269866943, | |
| "epoch": 0.3909226436972006, | |
| "grad_norm": 0.4308485686779022, | |
| "kl": 0.5263671875, | |
| "learning_rate": 1.5271261836098403e-05, | |
| "loss": 0.0211, | |
| "reward": 1.1546875417232514, | |
| "reward_std": 0.255348096229136, | |
| "rewards/accuracy_reward": 0.19895834159106016, | |
| "rewards/format_reward": 0.9557291865348816, | |
| "step": 590 | |
| }, | |
| { | |
| "completion_length": 128.86224336624144, | |
| "epoch": 0.3975484512174921, | |
| "grad_norm": 0.42378589510917664, | |
| "kl": 0.762548828125, | |
| "learning_rate": 1.5073279449956916e-05, | |
| "loss": 0.0305, | |
| "reward": 1.1497396290302277, | |
| "reward_std": 0.27525530084967614, | |
| "rewards/accuracy_reward": 0.2031250052154064, | |
| "rewards/format_reward": 0.9466146022081375, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.3975484512174921, | |
| "eval_completion_length": 103.54282718234592, | |
| "eval_kl": 0.5060763888888888, | |
| "eval_loss": 0.020621497184038162, | |
| "eval_reward": 1.269675956832038, | |
| "eval_reward_std": 0.27875811523861355, | |
| "eval_rewards/accuracy_reward": 0.29398149251937866, | |
| "eval_rewards/format_reward": 0.9756944643126594, | |
| "eval_runtime": 40.4616, | |
| "eval_samples_per_second": 2.447, | |
| "eval_steps_per_second": 0.222, | |
| "step": 600 | |
| }, | |
| { | |
| "completion_length": 119.92604503631591, | |
| "epoch": 0.4041742587377837, | |
| "grad_norm": 0.346579909324646, | |
| "kl": 0.48173828125, | |
| "learning_rate": 1.4872582065142285e-05, | |
| "loss": 0.0193, | |
| "reward": 1.184895870089531, | |
| "reward_std": 0.25066950926557185, | |
| "rewards/accuracy_reward": 0.22057292337995021, | |
| "rewards/format_reward": 0.9643229380249977, | |
| "step": 610 | |
| }, | |
| { | |
| "completion_length": 166.857816696167, | |
| "epoch": 0.4108000662580752, | |
| "grad_norm": 0.3131767809391022, | |
| "kl": 0.425244140625, | |
| "learning_rate": 1.4669277086172406e-05, | |
| "loss": 0.017, | |
| "reward": 1.1731771290302277, | |
| "reward_std": 0.2866227850317955, | |
| "rewards/accuracy_reward": 0.22500000558793545, | |
| "rewards/format_reward": 0.9481771036982536, | |
| "step": 620 | |
| }, | |
| { | |
| "completion_length": 177.0984432220459, | |
| "epoch": 0.41742587377836676, | |
| "grad_norm": 0.3359943926334381, | |
| "kl": 0.46962890625, | |
| "learning_rate": 1.4463473313036241e-05, | |
| "loss": 0.0188, | |
| "reward": 1.171875038743019, | |
| "reward_std": 0.27172743044793607, | |
| "rewards/accuracy_reward": 0.22317709103226663, | |
| "rewards/format_reward": 0.9486979335546494, | |
| "step": 630 | |
| }, | |
| { | |
| "completion_length": 160.54766025543213, | |
| "epoch": 0.4240516812986583, | |
| "grad_norm": 0.3603960871696472, | |
| "kl": 0.39130859375, | |
| "learning_rate": 1.4255280882968787e-05, | |
| "loss": 0.0157, | |
| "reward": 1.1869792103767396, | |
| "reward_std": 0.2906502477824688, | |
| "rewards/accuracy_reward": 0.23411459047347308, | |
| "rewards/format_reward": 0.9528646036982537, | |
| "step": 640 | |
| }, | |
| { | |
| "completion_length": 164.8755266189575, | |
| "epoch": 0.4306774888189498, | |
| "grad_norm": 0.336618572473526, | |
| "kl": 0.445849609375, | |
| "learning_rate": 1.4044811211510419e-05, | |
| "loss": 0.0178, | |
| "reward": 1.1713542014360427, | |
| "reward_std": 0.30316176153719426, | |
| "rewards/accuracy_reward": 0.23437500689178706, | |
| "rewards/format_reward": 0.9369791880249977, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.4306774888189498, | |
| "eval_completion_length": 113.57755109998915, | |
| "eval_kl": 0.4815538194444444, | |
| "eval_loss": 0.01964624412357807, | |
| "eval_reward": 1.2708333730697632, | |
| "eval_reward_std": 0.3395750116970804, | |
| "eval_rewards/accuracy_reward": 0.3090277901954121, | |
| "eval_rewards/format_reward": 0.9618055688010322, | |
| "eval_runtime": 43.035, | |
| "eval_samples_per_second": 2.3, | |
| "eval_steps_per_second": 0.209, | |
| "step": 650 | |
| }, | |
| { | |
| "completion_length": 125.93802452087402, | |
| "epoch": 0.43730329633924137, | |
| "grad_norm": 0.4743092656135559, | |
| "kl": 0.461376953125, | |
| "learning_rate": 1.3832176932882136e-05, | |
| "loss": 0.0184, | |
| "reward": 1.1882812917232513, | |
| "reward_std": 0.2539562493562698, | |
| "rewards/accuracy_reward": 0.22656250707805156, | |
| "rewards/format_reward": 0.9617187693715096, | |
| "step": 660 | |
| }, | |
| { | |
| "completion_length": 139.05703525543214, | |
| "epoch": 0.4439291038595329, | |
| "grad_norm": 0.4012630581855774, | |
| "kl": 0.507470703125, | |
| "learning_rate": 1.3617491839708614e-05, | |
| "loss": 0.0203, | |
| "reward": 1.171093788743019, | |
| "reward_std": 0.2858675643801689, | |
| "rewards/accuracy_reward": 0.22265625689178706, | |
| "rewards/format_reward": 0.9484375208616257, | |
| "step": 670 | |
| }, | |
| { | |
| "completion_length": 117.47630653381347, | |
| "epoch": 0.4505549113798244, | |
| "grad_norm": 0.3696196675300598, | |
| "kl": 0.542919921875, | |
| "learning_rate": 1.3400870822121348e-05, | |
| "loss": 0.0217, | |
| "reward": 1.1838542059063912, | |
| "reward_std": 0.2748897645622492, | |
| "rewards/accuracy_reward": 0.2341145918238908, | |
| "rewards/format_reward": 0.9497396051883698, | |
| "step": 680 | |
| }, | |
| { | |
| "completion_length": 145.46432704925536, | |
| "epoch": 0.45718071890011597, | |
| "grad_norm": 5.13590145111084, | |
| "kl": 0.692041015625, | |
| "learning_rate": 1.3182429806274442e-05, | |
| "loss": 0.0277, | |
| "reward": 1.1864583760499954, | |
| "reward_std": 0.270385118573904, | |
| "rewards/accuracy_reward": 0.23776042237877845, | |
| "rewards/format_reward": 0.9486979350447655, | |
| "step": 690 | |
| }, | |
| { | |
| "completion_length": 135.6513063430786, | |
| "epoch": 0.4638065264204075, | |
| "grad_norm": 0.39522725343704224, | |
| "kl": 0.557568359375, | |
| "learning_rate": 1.2962285692305964e-05, | |
| "loss": 0.0223, | |
| "reward": 1.1851562783122063, | |
| "reward_std": 0.2556427549570799, | |
| "rewards/accuracy_reward": 0.23046875661239027, | |
| "rewards/format_reward": 0.9546875208616257, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.4638065264204075, | |
| "eval_completion_length": 148.33218044704861, | |
| "eval_kl": 0.4103732638888889, | |
| "eval_loss": 0.016540652140975, | |
| "eval_reward": 1.2708333730697632, | |
| "eval_reward_std": 0.33350396156311035, | |
| "eval_rewards/accuracy_reward": 0.3240740845600764, | |
| "eval_rewards/format_reward": 0.9467592835426331, | |
| "eval_runtime": 46.6863, | |
| "eval_samples_per_second": 2.121, | |
| "eval_steps_per_second": 0.193, | |
| "step": 700 | |
| }, | |
| { | |
| "completion_length": 148.985941696167, | |
| "epoch": 0.470432333940699, | |
| "grad_norm": 0.41000673174858093, | |
| "kl": 0.51171875, | |
| "learning_rate": 1.2740556291778096e-05, | |
| "loss": 0.0205, | |
| "reward": 1.1614583790302277, | |
| "reward_std": 0.3124166313558817, | |
| "rewards/accuracy_reward": 0.23463542368263007, | |
| "rewards/format_reward": 0.9268229380249977, | |
| "step": 710 | |
| }, | |
| { | |
| "completion_length": 69.19583520889282, | |
| "epoch": 0.47705814146099057, | |
| "grad_norm": 0.3913906216621399, | |
| "kl": 0.63359375, | |
| "learning_rate": 1.2517360264629463e-05, | |
| "loss": 0.0254, | |
| "reward": 1.1914062932133676, | |
| "reward_std": 0.2280671002343297, | |
| "rewards/accuracy_reward": 0.2223958398681134, | |
| "rewards/format_reward": 0.9690104380249978, | |
| "step": 720 | |
| }, | |
| { | |
| "completion_length": 108.44713830947876, | |
| "epoch": 0.4836839489812821, | |
| "grad_norm": 0.6712866425514221, | |
| "kl": 0.609033203125, | |
| "learning_rate": 1.2292817055673543e-05, | |
| "loss": 0.0244, | |
| "reward": 1.1658854559063911, | |
| "reward_std": 0.25000986782833934, | |
| "rewards/accuracy_reward": 0.21458333956543357, | |
| "rewards/format_reward": 0.9513021036982536, | |
| "step": 730 | |
| }, | |
| { | |
| "completion_length": 144.48672218322753, | |
| "epoch": 0.49030975650157366, | |
| "grad_norm": 0.5536672472953796, | |
| "kl": 0.535302734375, | |
| "learning_rate": 1.2067046830676947e-05, | |
| "loss": 0.0214, | |
| "reward": 1.1919271260499955, | |
| "reward_std": 0.29237424544990065, | |
| "rewards/accuracy_reward": 0.24843750819563865, | |
| "rewards/format_reward": 0.9434896007180213, | |
| "step": 740 | |
| }, | |
| { | |
| "completion_length": 139.08802452087403, | |
| "epoch": 0.4969355640218652, | |
| "grad_norm": 0.48829564452171326, | |
| "kl": 0.497509765625, | |
| "learning_rate": 1.1840170412051957e-05, | |
| "loss": 0.0199, | |
| "reward": 1.1994792014360427, | |
| "reward_std": 0.24491893574595452, | |
| "rewards/accuracy_reward": 0.2432291721459478, | |
| "rewards/format_reward": 0.9562500178813934, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.4969355640218652, | |
| "eval_completion_length": 172.40972900390625, | |
| "eval_kl": 0.3891059027777778, | |
| "eval_loss": 0.015797864645719528, | |
| "eval_reward": 1.290509303410848, | |
| "eval_reward_std": 0.2870280941327413, | |
| "eval_rewards/accuracy_reward": 0.3414351973268721, | |
| "eval_rewards/format_reward": 0.9490740829043918, | |
| "eval_runtime": 46.928, | |
| "eval_samples_per_second": 2.11, | |
| "eval_steps_per_second": 0.192, | |
| "step": 750 | |
| }, | |
| { | |
| "completion_length": 154.06302490234376, | |
| "epoch": 0.5035613715421567, | |
| "grad_norm": 0.4157249331474304, | |
| "kl": 0.80810546875, | |
| "learning_rate": 1.1612309214197599e-05, | |
| "loss": 0.0323, | |
| "reward": 1.2119792103767395, | |
| "reward_std": 0.2926496058702469, | |
| "rewards/accuracy_reward": 0.265885423310101, | |
| "rewards/format_reward": 0.9460937693715096, | |
| "step": 760 | |
| }, | |
| { | |
| "completion_length": 109.2606798171997, | |
| "epoch": 0.5101871790624483, | |
| "grad_norm": 0.35986068844795227, | |
| "kl": 0.56611328125, | |
| "learning_rate": 1.1383585178523955e-05, | |
| "loss": 0.0227, | |
| "reward": 1.204166702926159, | |
| "reward_std": 0.2575360298156738, | |
| "rewards/accuracy_reward": 0.2502604253590107, | |
| "rewards/format_reward": 0.9539062708616257, | |
| "step": 770 | |
| }, | |
| { | |
| "completion_length": 144.10937929153442, | |
| "epoch": 0.5168129865827398, | |
| "grad_norm": 0.30369916558265686, | |
| "kl": 0.592578125, | |
| "learning_rate": 1.1154120708194398e-05, | |
| "loss": 0.0237, | |
| "reward": 1.1294271171092987, | |
| "reward_std": 0.30647876001894475, | |
| "rewards/accuracy_reward": 0.20937500689178706, | |
| "rewards/format_reward": 0.9200521036982536, | |
| "step": 780 | |
| }, | |
| { | |
| "completion_length": 92.92942972183228, | |
| "epoch": 0.5234387941030313, | |
| "grad_norm": 0.40557360649108887, | |
| "kl": 0.62392578125, | |
| "learning_rate": 1.0924038602620757e-05, | |
| "loss": 0.025, | |
| "reward": 1.1776042118668557, | |
| "reward_std": 0.2612913876771927, | |
| "rewards/accuracy_reward": 0.2195312575204298, | |
| "rewards/format_reward": 0.9580729380249977, | |
| "step": 790 | |
| }, | |
| { | |
| "completion_length": 139.11432666778563, | |
| "epoch": 0.5300646016233228, | |
| "grad_norm": 0.4020339846611023, | |
| "kl": 0.4591796875, | |
| "learning_rate": 1.0693461991746389e-05, | |
| "loss": 0.0184, | |
| "reward": 1.1885417088866235, | |
| "reward_std": 0.29046612568199637, | |
| "rewards/accuracy_reward": 0.24661459056660534, | |
| "rewards/format_reward": 0.9419271036982536, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.5300646016233228, | |
| "eval_completion_length": 95.66782718234592, | |
| "eval_kl": 0.5199652777777778, | |
| "eval_loss": 0.021310841664671898, | |
| "eval_reward": 1.271990762816535, | |
| "eval_reward_std": 0.2451818229423629, | |
| "eval_rewards/accuracy_reward": 0.305555565489663, | |
| "eval_rewards/format_reward": 0.9664352072609795, | |
| "eval_runtime": 41.3833, | |
| "eval_samples_per_second": 2.392, | |
| "eval_steps_per_second": 0.217, | |
| "step": 800 | |
| }, | |
| { | |
| "completion_length": 88.05521087646484, | |
| "epoch": 0.5366904091436144, | |
| "grad_norm": 0.3212931156158447, | |
| "kl": 0.6345703125, | |
| "learning_rate": 1.046251427015241e-05, | |
| "loss": 0.0254, | |
| "reward": 1.2151042073965073, | |
| "reward_std": 0.21537913139909506, | |
| "rewards/accuracy_reward": 0.24713542337995023, | |
| "rewards/format_reward": 0.9679687708616257, | |
| "step": 810 | |
| }, | |
| { | |
| "completion_length": 114.12812786102295, | |
| "epoch": 0.543316216663906, | |
| "grad_norm": 0.36236339807510376, | |
| "kl": 0.535205078125, | |
| "learning_rate": 1.023131903102226e-05, | |
| "loss": 0.0214, | |
| "reward": 1.1872396260499953, | |
| "reward_std": 0.25381856635212896, | |
| "rewards/accuracy_reward": 0.22812500651925802, | |
| "rewards/format_reward": 0.9591146022081375, | |
| "step": 820 | |
| }, | |
| { | |
| "completion_length": 120.23437843322753, | |
| "epoch": 0.5499420241841975, | |
| "grad_norm": 0.33346128463745117, | |
| "kl": 0.491455078125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0197, | |
| "reward": 1.1893229603767395, | |
| "reward_std": 0.26861554831266404, | |
| "rewards/accuracy_reward": 0.2398437585681677, | |
| "rewards/format_reward": 0.9494791865348816, | |
| "step": 830 | |
| }, | |
| { | |
| "completion_length": 104.39948310852051, | |
| "epoch": 0.556567831704489, | |
| "grad_norm": 0.287349671125412, | |
| "kl": 0.52109375, | |
| "learning_rate": 9.768680968977743e-06, | |
| "loss": 0.0208, | |
| "reward": 1.2109375417232513, | |
| "reward_std": 0.23620927650481463, | |
| "rewards/accuracy_reward": 0.24895834140479564, | |
| "rewards/format_reward": 0.9619791895151139, | |
| "step": 840 | |
| }, | |
| { | |
| "completion_length": 141.53802490234375, | |
| "epoch": 0.5631936392247805, | |
| "grad_norm": 0.40245750546455383, | |
| "kl": 0.515869140625, | |
| "learning_rate": 9.537485729847594e-06, | |
| "loss": 0.0206, | |
| "reward": 1.1757812902331353, | |
| "reward_std": 0.2983800694346428, | |
| "rewards/accuracy_reward": 0.23984375763684512, | |
| "rewards/format_reward": 0.9359375193715096, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.5631936392247805, | |
| "eval_completion_length": 127.03241221110027, | |
| "eval_kl": 0.5015190972222222, | |
| "eval_loss": 0.020167144015431404, | |
| "eval_reward": 1.275462998284234, | |
| "eval_reward_std": 0.2603969905111525, | |
| "eval_rewards/accuracy_reward": 0.3298611177338494, | |
| "eval_rewards/format_reward": 0.9456018673049079, | |
| "eval_runtime": 45.7247, | |
| "eval_samples_per_second": 2.165, | |
| "eval_steps_per_second": 0.197, | |
| "step": 850 | |
| }, | |
| { | |
| "completion_length": 102.3765655517578, | |
| "epoch": 0.569819446745072, | |
| "grad_norm": 0.3569597601890564, | |
| "kl": 0.54814453125, | |
| "learning_rate": 9.306538008253611e-06, | |
| "loss": 0.0219, | |
| "reward": 1.191406287252903, | |
| "reward_std": 0.2607411756180227, | |
| "rewards/accuracy_reward": 0.23203125474974512, | |
| "rewards/format_reward": 0.9593750193715096, | |
| "step": 860 | |
| }, | |
| { | |
| "completion_length": 92.75234651565552, | |
| "epoch": 0.5764452542653636, | |
| "grad_norm": 1.2177760601043701, | |
| "kl": 0.564990234375, | |
| "learning_rate": 9.075961397379247e-06, | |
| "loss": 0.0226, | |
| "reward": 1.2388021230697632, | |
| "reward_std": 0.224729376193136, | |
| "rewards/accuracy_reward": 0.2682291749864817, | |
| "rewards/format_reward": 0.9705729380249977, | |
| "step": 870 | |
| }, | |
| { | |
| "completion_length": 129.5666706085205, | |
| "epoch": 0.5830710617856552, | |
| "grad_norm": 0.3381529450416565, | |
| "kl": 0.604443359375, | |
| "learning_rate": 8.845879291805605e-06, | |
| "loss": 0.0242, | |
| "reward": 1.163541705906391, | |
| "reward_std": 0.27646171739324926, | |
| "rewards/accuracy_reward": 0.22343750707805157, | |
| "rewards/format_reward": 0.9401041865348816, | |
| "step": 880 | |
| }, | |
| { | |
| "completion_length": 111.10937881469727, | |
| "epoch": 0.5896968693059467, | |
| "grad_norm": 0.3694465160369873, | |
| "kl": 0.60166015625, | |
| "learning_rate": 8.616414821476048e-06, | |
| "loss": 0.0241, | |
| "reward": 1.1927083715796472, | |
| "reward_std": 0.2659361926838756, | |
| "rewards/accuracy_reward": 0.24088542442768812, | |
| "rewards/format_reward": 0.9518229395151139, | |
| "step": 890 | |
| }, | |
| { | |
| "completion_length": 120.31693096160889, | |
| "epoch": 0.5963226768262382, | |
| "grad_norm": 0.2243836671113968, | |
| "kl": 0.60830078125, | |
| "learning_rate": 8.387690785802403e-06, | |
| "loss": 0.0243, | |
| "reward": 1.1526041999459267, | |
| "reward_std": 0.25904723536223173, | |
| "rewards/accuracy_reward": 0.2093750056810677, | |
| "rewards/format_reward": 0.9432291865348816, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.5963226768262382, | |
| "eval_completion_length": 118.34143914116754, | |
| "eval_kl": 0.5698784722222222, | |
| "eval_loss": 0.023158123716711998, | |
| "eval_reward": 1.29166669315762, | |
| "eval_reward_std": 0.26393843856122756, | |
| "eval_rewards/accuracy_reward": 0.34143519401550293, | |
| "eval_rewards/format_reward": 0.9502315057648553, | |
| "eval_runtime": 45.2036, | |
| "eval_samples_per_second": 2.19, | |
| "eval_steps_per_second": 0.199, | |
| "step": 900 | |
| }, | |
| { | |
| "completion_length": 120.57187824249267, | |
| "epoch": 0.6029484843465297, | |
| "grad_norm": 0.4404103755950928, | |
| "kl": 0.54873046875, | |
| "learning_rate": 8.159829587948048e-06, | |
| "loss": 0.0219, | |
| "reward": 1.199218788743019, | |
| "reward_std": 0.28078800477087495, | |
| "rewards/accuracy_reward": 0.2500000085681677, | |
| "rewards/format_reward": 0.9492187678813935, | |
| "step": 910 | |
| }, | |
| { | |
| "completion_length": 118.68646268844604, | |
| "epoch": 0.6095742918668212, | |
| "grad_norm": 0.33018258213996887, | |
| "kl": 0.52841796875, | |
| "learning_rate": 7.932953169323057e-06, | |
| "loss": 0.0211, | |
| "reward": 1.2174479514360428, | |
| "reward_std": 0.2616381015628576, | |
| "rewards/accuracy_reward": 0.26302084103226664, | |
| "rewards/format_reward": 0.9544271022081375, | |
| "step": 920 | |
| }, | |
| { | |
| "completion_length": 109.28151321411133, | |
| "epoch": 0.6162000993871128, | |
| "grad_norm": 0.2895837128162384, | |
| "kl": 0.546044921875, | |
| "learning_rate": 7.70718294432646e-06, | |
| "loss": 0.0218, | |
| "reward": 1.2164062917232514, | |
| "reward_std": 0.2531652105972171, | |
| "rewards/accuracy_reward": 0.25807292461395265, | |
| "rewards/format_reward": 0.958333358168602, | |
| "step": 930 | |
| }, | |
| { | |
| "completion_length": 119.46328525543213, | |
| "epoch": 0.6228259069074044, | |
| "grad_norm": 0.41079920530319214, | |
| "kl": 0.64697265625, | |
| "learning_rate": 7.482639735370536e-06, | |
| "loss": 0.0259, | |
| "reward": 1.1802083820104599, | |
| "reward_std": 0.2645995236933231, | |
| "rewards/accuracy_reward": 0.23177084028720857, | |
| "rewards/format_reward": 0.9484375163912773, | |
| "step": 940 | |
| }, | |
| { | |
| "completion_length": 121.86719093322753, | |
| "epoch": 0.6294517144276959, | |
| "grad_norm": 0.3652746379375458, | |
| "kl": 0.551953125, | |
| "learning_rate": 7.2594437082219074e-06, | |
| "loss": 0.0221, | |
| "reward": 1.2138021171092988, | |
| "reward_std": 0.2821790289133787, | |
| "rewards/accuracy_reward": 0.26484375884756445, | |
| "rewards/format_reward": 0.9489583566784858, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.6294517144276959, | |
| "eval_completion_length": 100.6828727722168, | |
| "eval_kl": 0.53515625, | |
| "eval_loss": 0.021696042269468307, | |
| "eval_reward": 1.31250003973643, | |
| "eval_reward_std": 0.2856230421198739, | |
| "eval_rewards/accuracy_reward": 0.346064825852712, | |
| "eval_rewards/format_reward": 0.9664352138837179, | |
| "eval_runtime": 45.1594, | |
| "eval_samples_per_second": 2.192, | |
| "eval_steps_per_second": 0.199, | |
| "step": 950 | |
| }, | |
| { | |
| "completion_length": 118.0130241394043, | |
| "epoch": 0.6360775219479874, | |
| "grad_norm": 0.34392473101615906, | |
| "kl": 0.523291015625, | |
| "learning_rate": 7.037714307694038e-06, | |
| "loss": 0.0209, | |
| "reward": 1.214843788743019, | |
| "reward_std": 0.26948558650910853, | |
| "rewards/accuracy_reward": 0.2541666762903333, | |
| "rewards/format_reward": 0.9606771022081375, | |
| "step": 960 | |
| }, | |
| { | |
| "completion_length": 129.96614933013916, | |
| "epoch": 0.6427033294682789, | |
| "grad_norm": 0.352062463760376, | |
| "kl": 0.509033203125, | |
| "learning_rate": 6.8175701937255645e-06, | |
| "loss": 0.0204, | |
| "reward": 1.208854216337204, | |
| "reward_std": 0.2845765814185143, | |
| "rewards/accuracy_reward": 0.25598959121853115, | |
| "rewards/format_reward": 0.9528646022081375, | |
| "step": 970 | |
| }, | |
| { | |
| "completion_length": 164.04010906219483, | |
| "epoch": 0.6493291369885705, | |
| "grad_norm": 0.3061555325984955, | |
| "kl": 0.554931640625, | |
| "learning_rate": 6.5991291778786556e-06, | |
| "loss": 0.0222, | |
| "reward": 1.2091146275401115, | |
| "reward_std": 0.34185091145336627, | |
| "rewards/accuracy_reward": 0.29166667480021713, | |
| "rewards/format_reward": 0.9174479395151138, | |
| "step": 980 | |
| }, | |
| { | |
| "completion_length": 109.25807552337646, | |
| "epoch": 0.655954944508862, | |
| "grad_norm": 0.42008864879608154, | |
| "kl": 0.569482421875, | |
| "learning_rate": 6.38250816029139e-06, | |
| "loss": 0.0228, | |
| "reward": 1.2244792103767395, | |
| "reward_std": 0.2647375027649105, | |
| "rewards/accuracy_reward": 0.2721354251727462, | |
| "rewards/format_reward": 0.9523437708616257, | |
| "step": 990 | |
| }, | |
| { | |
| "completion_length": 90.79297113418579, | |
| "epoch": 0.6625807520291536, | |
| "grad_norm": 0.37791749835014343, | |
| "kl": 0.554296875, | |
| "learning_rate": 6.167823067117868e-06, | |
| "loss": 0.0222, | |
| "reward": 1.2403646260499954, | |
| "reward_std": 0.2317359633743763, | |
| "rewards/accuracy_reward": 0.27578125735744835, | |
| "rewards/format_reward": 0.9645833507180214, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.6625807520291536, | |
| "eval_completion_length": 90.40393786960178, | |
| "eval_kl": 0.5368923611111112, | |
| "eval_loss": 0.021688800305128098, | |
| "eval_reward": 1.3587963183720906, | |
| "eval_reward_std": 0.21392729216151768, | |
| "eval_rewards/accuracy_reward": 0.38657407959302265, | |
| "eval_rewards/format_reward": 0.9722222288449606, | |
| "eval_runtime": 38.2921, | |
| "eval_samples_per_second": 2.585, | |
| "eval_steps_per_second": 0.235, | |
| "step": 1000 | |
| }, | |
| { | |
| "completion_length": 118.42370166778565, | |
| "epoch": 0.6692065595494451, | |
| "grad_norm": 0.32570692896842957, | |
| "kl": 0.533837890625, | |
| "learning_rate": 5.955188788489583e-06, | |
| "loss": 0.0214, | |
| "reward": 1.2507812827825546, | |
| "reward_std": 0.27148876488208773, | |
| "rewards/accuracy_reward": 0.30078125968575475, | |
| "rewards/format_reward": 0.9500000193715096, | |
| "step": 1010 | |
| }, | |
| { | |
| "completion_length": 134.26146202087403, | |
| "epoch": 0.6758323670697366, | |
| "grad_norm": 0.42975538969039917, | |
| "kl": 0.4955078125, | |
| "learning_rate": 5.744719117031217e-06, | |
| "loss": 0.0198, | |
| "reward": 1.2127604454755783, | |
| "reward_std": 0.2835965741425753, | |
| "rewards/accuracy_reward": 0.26354167312383653, | |
| "rewards/format_reward": 0.9492187723517418, | |
| "step": 1020 | |
| }, | |
| { | |
| "completion_length": 134.5783903121948, | |
| "epoch": 0.6824581745900281, | |
| "grad_norm": 0.36342352628707886, | |
| "kl": 0.459912109375, | |
| "learning_rate": 5.536526686963762e-06, | |
| "loss": 0.0184, | |
| "reward": 1.2119792059063912, | |
| "reward_std": 0.283594464790076, | |
| "rewards/accuracy_reward": 0.2687500087544322, | |
| "rewards/format_reward": 0.9432291865348816, | |
| "step": 1030 | |
| }, | |
| { | |
| "completion_length": 108.19531602859497, | |
| "epoch": 0.6890839821103197, | |
| "grad_norm": 0.332270085811615, | |
| "kl": 0.5037109375, | |
| "learning_rate": 5.330722913827594e-06, | |
| "loss": 0.0202, | |
| "reward": 1.2489583656191825, | |
| "reward_std": 0.26699374951422217, | |
| "rewards/accuracy_reward": 0.29557292610406877, | |
| "rewards/format_reward": 0.9533854365348816, | |
| "step": 1040 | |
| }, | |
| { | |
| "completion_length": 120.81276388168335, | |
| "epoch": 0.6957097896306113, | |
| "grad_norm": 0.32219088077545166, | |
| "kl": 0.5189453125, | |
| "learning_rate": 5.127417934857718e-06, | |
| "loss": 0.0208, | |
| "reward": 1.211718785762787, | |
| "reward_std": 0.2897366590797901, | |
| "rewards/accuracy_reward": 0.2705729234963655, | |
| "rewards/format_reward": 0.9411458536982537, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.6957097896306113, | |
| "eval_completion_length": 106.52430894639757, | |
| "eval_kl": 0.5453559027777778, | |
| "eval_loss": 0.02215980552136898, | |
| "eval_reward": 1.3611111508475409, | |
| "eval_reward_std": 0.23599610891607073, | |
| "eval_rewards/accuracy_reward": 0.4062500099341075, | |
| "eval_rewards/format_reward": 0.9548611243565878, | |
| "eval_runtime": 44.9455, | |
| "eval_samples_per_second": 2.203, | |
| "eval_steps_per_second": 0.2, | |
| "step": 1050 | |
| }, | |
| { | |
| "completion_length": 107.07890901565551, | |
| "epoch": 0.7023355971509028, | |
| "grad_norm": 0.2685917019844055, | |
| "kl": 0.576708984375, | |
| "learning_rate": 4.926720550043089e-06, | |
| "loss": 0.0231, | |
| "reward": 1.229166704416275, | |
| "reward_std": 0.2693328620865941, | |
| "rewards/accuracy_reward": 0.2820312589406967, | |
| "rewards/format_reward": 0.9471354424953461, | |
| "step": 1060 | |
| }, | |
| { | |
| "completion_length": 102.21718997955323, | |
| "epoch": 0.7089614046711943, | |
| "grad_norm": 0.42000600695610046, | |
| "kl": 0.53173828125, | |
| "learning_rate": 4.728738163901597e-06, | |
| "loss": 0.0213, | |
| "reward": 1.2791667014360428, | |
| "reward_std": 0.2512880745343864, | |
| "rewards/accuracy_reward": 0.32005209028720855, | |
| "rewards/format_reward": 0.9591146051883698, | |
| "step": 1070 | |
| }, | |
| { | |
| "completion_length": 106.98437767028808, | |
| "epoch": 0.7155872121914858, | |
| "grad_norm": 0.350292444229126, | |
| "kl": 0.598828125, | |
| "learning_rate": 4.533576728001858e-06, | |
| "loss": 0.0239, | |
| "reward": 1.210156288743019, | |
| "reward_std": 0.2518887486308813, | |
| "rewards/accuracy_reward": 0.257031256519258, | |
| "rewards/format_reward": 0.9531250268220901, | |
| "step": 1080 | |
| }, | |
| { | |
| "completion_length": 113.01172313690185, | |
| "epoch": 0.7222130197117774, | |
| "grad_norm": 0.41227367520332336, | |
| "kl": 0.553515625, | |
| "learning_rate": 4.341340684262498e-06, | |
| "loss": 0.0221, | |
| "reward": 1.2361979573965072, | |
| "reward_std": 0.24922715383581817, | |
| "rewards/accuracy_reward": 0.28906251015141604, | |
| "rewards/format_reward": 0.9471354365348816, | |
| "step": 1090 | |
| }, | |
| { | |
| "completion_length": 120.65026445388794, | |
| "epoch": 0.7288388272320689, | |
| "grad_norm": 0.35520121455192566, | |
| "kl": 0.53955078125, | |
| "learning_rate": 4.152132909059402e-06, | |
| "loss": 0.0216, | |
| "reward": 1.2221354484558105, | |
| "reward_std": 0.26356869330629706, | |
| "rewards/accuracy_reward": 0.27500000689178705, | |
| "rewards/format_reward": 0.9471354350447655, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.7288388272320689, | |
| "eval_completion_length": 118.43750423855252, | |
| "eval_kl": 0.4963107638888889, | |
| "eval_loss": 0.01998673938214779, | |
| "eval_reward": 1.339120414521959, | |
| "eval_reward_std": 0.21001804454459083, | |
| "eval_rewards/accuracy_reward": 0.3865740829043918, | |
| "eval_rewards/format_reward": 0.9525463117493523, | |
| "eval_runtime": 45.4912, | |
| "eval_samples_per_second": 2.176, | |
| "eval_steps_per_second": 0.198, | |
| "step": 1100 | |
| }, | |
| { | |
| "completion_length": 137.40937881469728, | |
| "epoch": 0.7354646347523605, | |
| "grad_norm": 0.48482638597488403, | |
| "kl": 0.568701171875, | |
| "learning_rate": 3.966054658170754e-06, | |
| "loss": 0.0227, | |
| "reward": 1.2109375342726707, | |
| "reward_std": 0.29270930401980877, | |
| "rewards/accuracy_reward": 0.2791666746139526, | |
| "rewards/format_reward": 0.931770858168602, | |
| "step": 1110 | |
| }, | |
| { | |
| "completion_length": 126.11432609558105, | |
| "epoch": 0.742090442272652, | |
| "grad_norm": 0.27705782651901245, | |
| "kl": 0.532177734375, | |
| "learning_rate": 3.7832055125893318e-06, | |
| "loss": 0.0213, | |
| "reward": 1.207291704416275, | |
| "reward_std": 0.246893934533, | |
| "rewards/accuracy_reward": 0.26119792368263006, | |
| "rewards/format_reward": 0.9460937708616257, | |
| "step": 1120 | |
| }, | |
| { | |
| "completion_length": 131.4484426498413, | |
| "epoch": 0.7487162497929435, | |
| "grad_norm": 0.530246376991272, | |
| "kl": 0.55673828125, | |
| "learning_rate": 3.6036833252310887e-06, | |
| "loss": 0.0223, | |
| "reward": 1.2307292073965073, | |
| "reward_std": 0.29129046984016893, | |
| "rewards/accuracy_reward": 0.2937500076368451, | |
| "rewards/format_reward": 0.9369791865348815, | |
| "step": 1130 | |
| }, | |
| { | |
| "completion_length": 120.24609689712524, | |
| "epoch": 0.755342057313235, | |
| "grad_norm": 0.24524401128292084, | |
| "kl": 0.5646484375, | |
| "learning_rate": 3.427584168568535e-06, | |
| "loss": 0.0226, | |
| "reward": 1.2190104603767395, | |
| "reward_std": 0.2685457500629127, | |
| "rewards/accuracy_reward": 0.27187500726431607, | |
| "rewards/format_reward": 0.9471354380249977, | |
| "step": 1140 | |
| }, | |
| { | |
| "completion_length": 102.51015863418579, | |
| "epoch": 0.7619678648335266, | |
| "grad_norm": 0.3422842025756836, | |
| "kl": 0.618212890625, | |
| "learning_rate": 3.2550022832169125e-06, | |
| "loss": 0.0247, | |
| "reward": 1.238281285762787, | |
| "reward_std": 0.23695164285600184, | |
| "rewards/accuracy_reward": 0.2778645927086473, | |
| "rewards/format_reward": 0.9604166835546494, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.7619678648335266, | |
| "eval_completion_length": 107.59259626600478, | |
| "eval_kl": 0.6030815972222222, | |
| "eval_loss": 0.023637007921934128, | |
| "eval_reward": 1.3310185670852661, | |
| "eval_reward_std": 0.23936733272340563, | |
| "eval_rewards/accuracy_reward": 0.37731482254134285, | |
| "eval_rewards/format_reward": 0.9537037213643392, | |
| "eval_runtime": 44.103, | |
| "eval_samples_per_second": 2.245, | |
| "eval_steps_per_second": 0.204, | |
| "step": 1150 | |
| }, | |
| { | |
| "completion_length": 113.57943124771118, | |
| "epoch": 0.7685936723538181, | |
| "grad_norm": 0.5340375900268555, | |
| "kl": 0.68115234375, | |
| "learning_rate": 3.086030027500728e-06, | |
| "loss": 0.0272, | |
| "reward": 1.2208333790302277, | |
| "reward_std": 0.27482070587575436, | |
| "rewards/accuracy_reward": 0.2721354249864817, | |
| "rewards/format_reward": 0.94869794100523, | |
| "step": 1160 | |
| }, | |
| { | |
| "completion_length": 121.99713821411133, | |
| "epoch": 0.7752194798741097, | |
| "grad_norm": 0.42114803194999695, | |
| "kl": 0.538671875, | |
| "learning_rate": 2.920757828027586e-06, | |
| "loss": 0.0216, | |
| "reward": 1.2221354573965073, | |
| "reward_std": 0.29092769548296926, | |
| "rewards/accuracy_reward": 0.2809895912185311, | |
| "rewards/format_reward": 0.9411458536982537, | |
| "step": 1170 | |
| }, | |
| { | |
| "completion_length": 109.28125295639038, | |
| "epoch": 0.7818452873944012, | |
| "grad_norm": 0.39234262704849243, | |
| "kl": 0.55908203125, | |
| "learning_rate": 2.759274131295787e-06, | |
| "loss": 0.0224, | |
| "reward": 1.261979202926159, | |
| "reward_std": 0.2803305257111788, | |
| "rewards/accuracy_reward": 0.3132812598254532, | |
| "rewards/format_reward": 0.9486979350447655, | |
| "step": 1180 | |
| }, | |
| { | |
| "completion_length": 104.11823234558105, | |
| "epoch": 0.7884710949146927, | |
| "grad_norm": 0.49963539838790894, | |
| "kl": 0.56357421875, | |
| "learning_rate": 2.60166535636162e-06, | |
| "loss": 0.0225, | |
| "reward": 1.253125038743019, | |
| "reward_std": 0.25115325963124635, | |
| "rewards/accuracy_reward": 0.2994791757315397, | |
| "rewards/format_reward": 0.9536458536982536, | |
| "step": 1190 | |
| }, | |
| { | |
| "completion_length": 106.06250343322753, | |
| "epoch": 0.7950969024349842, | |
| "grad_norm": 0.42556238174438477, | |
| "kl": 0.54150390625, | |
| "learning_rate": 2.448015848591638e-06, | |
| "loss": 0.0217, | |
| "reward": 1.253906285762787, | |
| "reward_std": 0.2610521188005805, | |
| "rewards/accuracy_reward": 0.2997395915910602, | |
| "rewards/format_reward": 0.9541666850447654, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.7950969024349842, | |
| "eval_completion_length": 104.84143914116754, | |
| "eval_kl": 0.5355902777777778, | |
| "eval_loss": 0.02168433926999569, | |
| "eval_reward": 1.376157455974155, | |
| "eval_reward_std": 0.18614381965663698, | |
| "eval_rewards/accuracy_reward": 0.4178240829043918, | |
| "eval_rewards/format_reward": 0.9583333532015482, | |
| "eval_runtime": 41.5934, | |
| "eval_samples_per_second": 2.38, | |
| "eval_steps_per_second": 0.216, | |
| "step": 1200 | |
| }, | |
| { | |
| "completion_length": 113.52969074249268, | |
| "epoch": 0.8017227099552758, | |
| "grad_norm": 0.3254588842391968, | |
| "kl": 0.56806640625, | |
| "learning_rate": 2.298407834524682e-06, | |
| "loss": 0.0227, | |
| "reward": 1.269270870089531, | |
| "reward_std": 0.253192731551826, | |
| "rewards/accuracy_reward": 0.31822917647659776, | |
| "rewards/format_reward": 0.9510416895151138, | |
| "step": 1210 | |
| }, | |
| { | |
| "completion_length": 112.07031574249268, | |
| "epoch": 0.8083485174755674, | |
| "grad_norm": 0.2393941879272461, | |
| "kl": 0.50380859375, | |
| "learning_rate": 2.1529213778677993e-06, | |
| "loss": 0.0202, | |
| "reward": 1.2791667133569717, | |
| "reward_std": 0.24380711056292056, | |
| "rewards/accuracy_reward": 0.32239584140479566, | |
| "rewards/format_reward": 0.9567708551883698, | |
| "step": 1220 | |
| }, | |
| { | |
| "completion_length": 103.30625295639038, | |
| "epoch": 0.8149743249958589, | |
| "grad_norm": 0.5480020046234131, | |
| "kl": 0.520361328125, | |
| "learning_rate": 2.0116343366496493e-06, | |
| "loss": 0.0208, | |
| "reward": 1.289062535762787, | |
| "reward_std": 0.2560120256617665, | |
| "rewards/accuracy_reward": 0.33281251094304026, | |
| "rewards/format_reward": 0.9562500178813934, | |
| "step": 1230 | |
| }, | |
| { | |
| "completion_length": 111.6716178894043, | |
| "epoch": 0.8216001325161504, | |
| "grad_norm": 0.4021623134613037, | |
| "kl": 0.5818359375, | |
| "learning_rate": 1.8746223215542482e-06, | |
| "loss": 0.0233, | |
| "reward": 1.2690104588866233, | |
| "reward_std": 0.2605089288204908, | |
| "rewards/accuracy_reward": 0.31901042386889455, | |
| "rewards/format_reward": 0.9500000193715096, | |
| "step": 1240 | |
| }, | |
| { | |
| "completion_length": 110.94271154403687, | |
| "epoch": 0.8282259400364419, | |
| "grad_norm": 1.2661796808242798, | |
| "kl": 0.56083984375, | |
| "learning_rate": 1.7419586554574364e-06, | |
| "loss": 0.0224, | |
| "reward": 1.2833333700895309, | |
| "reward_std": 0.27457276433706285, | |
| "rewards/accuracy_reward": 0.32838542591780423, | |
| "rewards/format_reward": 0.95494794100523, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.8282259400364419, | |
| "eval_completion_length": 103.17708587646484, | |
| "eval_kl": 0.4845920138888889, | |
| "eval_loss": 0.019484883174300194, | |
| "eval_reward": 1.3888889286253188, | |
| "eval_reward_std": 0.19302751620610556, | |
| "eval_rewards/accuracy_reward": 0.4236111210452186, | |
| "eval_rewards/format_reward": 0.9652777910232544, | |
| "eval_runtime": 44.455, | |
| "eval_samples_per_second": 2.227, | |
| "eval_steps_per_second": 0.202, | |
| "step": 1250 | |
| }, | |
| { | |
| "completion_length": 113.02031555175782, | |
| "epoch": 0.8348517475567335, | |
| "grad_norm": 0.4074391722679138, | |
| "kl": 0.49287109375, | |
| "learning_rate": 1.6137143341876439e-06, | |
| "loss": 0.0197, | |
| "reward": 1.2604166984558105, | |
| "reward_std": 0.25771117191761733, | |
| "rewards/accuracy_reward": 0.3046875074505806, | |
| "rewards/format_reward": 0.9557291865348816, | |
| "step": 1260 | |
| }, | |
| { | |
| "completion_length": 128.30312900543214, | |
| "epoch": 0.841477555077025, | |
| "grad_norm": 0.3753833472728729, | |
| "kl": 0.550244140625, | |
| "learning_rate": 1.4899579885320237e-06, | |
| "loss": 0.022, | |
| "reward": 1.2690104633569717, | |
| "reward_std": 0.29735342264175413, | |
| "rewards/accuracy_reward": 0.3283854264765978, | |
| "rewards/format_reward": 0.9406250223517418, | |
| "step": 1270 | |
| }, | |
| { | |
| "completion_length": 131.21823406219482, | |
| "epoch": 0.8481033625973166, | |
| "grad_norm": 0.3492971658706665, | |
| "kl": 0.51591796875, | |
| "learning_rate": 1.370755847508226e-06, | |
| "loss": 0.0206, | |
| "reward": 1.251562537252903, | |
| "reward_std": 0.28482332453131676, | |
| "rewards/accuracy_reward": 0.31432292619720104, | |
| "rewards/format_reward": 0.9372396051883698, | |
| "step": 1280 | |
| }, | |
| { | |
| "completion_length": 122.89974346160889, | |
| "epoch": 0.8547291701176081, | |
| "grad_norm": 0.4898810088634491, | |
| "kl": 0.5181640625, | |
| "learning_rate": 1.256171702921516e-06, | |
| "loss": 0.0207, | |
| "reward": 1.2828125476837158, | |
| "reward_std": 0.271611943654716, | |
| "rewards/accuracy_reward": 0.34062500968575476, | |
| "rewards/format_reward": 0.9421875193715096, | |
| "step": 1290 | |
| }, | |
| { | |
| "completion_length": 113.66250410079957, | |
| "epoch": 0.8613549776378996, | |
| "grad_norm": 0.4513114392757416, | |
| "kl": 0.523193359375, | |
| "learning_rate": 1.1462668752261652e-06, | |
| "loss": 0.0209, | |
| "reward": 1.2484375417232514, | |
| "reward_std": 0.2623301435261965, | |
| "rewards/accuracy_reward": 0.2979166738688946, | |
| "rewards/format_reward": 0.9505208566784858, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.8613549776378996, | |
| "eval_completion_length": 99.94213189019098, | |
| "eval_kl": 0.4971788194444444, | |
| "eval_loss": 0.02001408487558365, | |
| "eval_reward": 1.378472261958652, | |
| "eval_reward_std": 0.20051221052805582, | |
| "eval_rewards/accuracy_reward": 0.41203704807493424, | |
| "eval_rewards/format_reward": 0.9664352072609795, | |
| "eval_runtime": 44.0864, | |
| "eval_samples_per_second": 2.246, | |
| "eval_steps_per_second": 0.204, | |
| "step": 1300 | |
| }, | |
| { | |
| "completion_length": 104.8481806755066, | |
| "epoch": 0.8679807851581911, | |
| "grad_norm": 0.4572502672672272, | |
| "kl": 0.547412109375, | |
| "learning_rate": 1.04110018070941e-06, | |
| "loss": 0.0219, | |
| "reward": 1.236718800663948, | |
| "reward_std": 0.2456181443296373, | |
| "rewards/accuracy_reward": 0.2802083441987634, | |
| "rewards/format_reward": 0.9565104335546494, | |
| "step": 1310 | |
| }, | |
| { | |
| "completion_length": 106.76484680175781, | |
| "epoch": 0.8746065926784827, | |
| "grad_norm": 0.4408782422542572, | |
| "kl": 0.542333984375, | |
| "learning_rate": 9.407279000155311e-07, | |
| "loss": 0.0217, | |
| "reward": 1.2541667014360427, | |
| "reward_std": 0.24311191439628602, | |
| "rewards/accuracy_reward": 0.3036458421498537, | |
| "rewards/format_reward": 0.9505208551883697, | |
| "step": 1320 | |
| }, | |
| { | |
| "completion_length": 107.94062728881836, | |
| "epoch": 0.8812324001987742, | |
| "grad_norm": 0.3998057246208191, | |
| "kl": 0.515771484375, | |
| "learning_rate": 8.452037480269082e-07, | |
| "loss": 0.0206, | |
| "reward": 1.2924479633569717, | |
| "reward_std": 0.2722974482923746, | |
| "rewards/accuracy_reward": 0.3382812574505806, | |
| "rewards/format_reward": 0.9541666865348816, | |
| "step": 1330 | |
| }, | |
| { | |
| "completion_length": 95.11302337646484, | |
| "epoch": 0.8878582077190658, | |
| "grad_norm": 0.6737642288208008, | |
| "kl": 0.559130859375, | |
| "learning_rate": 7.545788451181313e-07, | |
| "loss": 0.0224, | |
| "reward": 1.2661458790302276, | |
| "reward_std": 0.23880463000386953, | |
| "rewards/accuracy_reward": 0.3049479253590107, | |
| "rewards/format_reward": 0.9611979395151138, | |
| "step": 1340 | |
| }, | |
| { | |
| "completion_length": 101.32344017028808, | |
| "epoch": 0.8944840152393573, | |
| "grad_norm": 0.4838802218437195, | |
| "kl": 0.556005859375, | |
| "learning_rate": 6.689016897986123e-07, | |
| "loss": 0.0222, | |
| "reward": 1.2570312947034836, | |
| "reward_std": 0.2493739674333483, | |
| "rewards/accuracy_reward": 0.30286459140479566, | |
| "rewards/format_reward": 0.9541666895151139, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.8944840152393573, | |
| "eval_completion_length": 93.02315097384982, | |
| "eval_kl": 0.5193142361111112, | |
| "eval_loss": 0.021245010197162628, | |
| "eval_reward": 1.3842592901653714, | |
| "eval_reward_std": 0.20025065706835854, | |
| "eval_rewards/accuracy_reward": 0.41550926367441815, | |
| "eval_rewards/format_reward": 0.968750019868215, | |
| "eval_runtime": 40.9157, | |
| "eval_samples_per_second": 2.42, | |
| "eval_steps_per_second": 0.22, | |
| "step": 1350 | |
| }, | |
| { | |
| "completion_length": 101.27943019866943, | |
| "epoch": 0.9011098227596488, | |
| "grad_norm": 0.5689716339111328, | |
| "kl": 0.5677734375, | |
| "learning_rate": 5.88218132758287e-07, | |
| "loss": 0.0227, | |
| "reward": 1.2895833730697632, | |
| "reward_std": 0.25514377616345885, | |
| "rewards/accuracy_reward": 0.3346354255452752, | |
| "rewards/format_reward": 0.9549479365348816, | |
| "step": 1360 | |
| }, | |
| { | |
| "completion_length": 106.12890930175782, | |
| "epoch": 0.9077356302799404, | |
| "grad_norm": 0.39908653497695923, | |
| "kl": 0.56484375, | |
| "learning_rate": 5.125713523303133e-07, | |
| "loss": 0.0226, | |
| "reward": 1.2458333760499953, | |
| "reward_std": 0.24504322968423367, | |
| "rewards/accuracy_reward": 0.29401042591780424, | |
| "rewards/format_reward": 0.9518229365348816, | |
| "step": 1370 | |
| }, | |
| { | |
| "completion_length": 115.99088821411132, | |
| "epoch": 0.9143614378002319, | |
| "grad_norm": 1.2312554121017456, | |
| "kl": 0.5458984375, | |
| "learning_rate": 4.420018313839147e-07, | |
| "loss": 0.0218, | |
| "reward": 1.2518229544162751, | |
| "reward_std": 0.27393123134970665, | |
| "rewards/accuracy_reward": 0.30729167610406877, | |
| "rewards/format_reward": 0.9445312738418579, | |
| "step": 1380 | |
| }, | |
| { | |
| "completion_length": 103.67317962646484, | |
| "epoch": 0.9209872453205235, | |
| "grad_norm": 0.2879469394683838, | |
| "kl": 0.53359375, | |
| "learning_rate": 3.7654733565969826e-07, | |
| "loss": 0.0213, | |
| "reward": 1.284375038743019, | |
| "reward_std": 0.26313075572252276, | |
| "rewards/accuracy_reward": 0.33307292610406875, | |
| "rewards/format_reward": 0.9513021066784859, | |
| "step": 1390 | |
| }, | |
| { | |
| "completion_length": 108.21693058013916, | |
| "epoch": 0.927613052840815, | |
| "grad_norm": 0.45352187752723694, | |
| "kl": 0.548974609375, | |
| "learning_rate": 3.1624289355907334e-07, | |
| "loss": 0.022, | |
| "reward": 1.2638021171092988, | |
| "reward_std": 0.25347734354436396, | |
| "rewards/accuracy_reward": 0.31223959382623434, | |
| "rewards/format_reward": 0.9515625208616256, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.927613052840815, | |
| "eval_completion_length": 89.58449384901259, | |
| "eval_kl": 0.5271267361111112, | |
| "eval_loss": 0.021207302808761597, | |
| "eval_reward": 1.3958333730697632, | |
| "eval_reward_std": 0.21908769508202872, | |
| "eval_rewards/accuracy_reward": 0.42939815587467617, | |
| "eval_rewards/format_reward": 0.9664352006382413, | |
| "eval_runtime": 40.9484, | |
| "eval_samples_per_second": 2.418, | |
| "eval_steps_per_second": 0.22, | |
| "step": 1400 | |
| }, | |
| { | |
| "completion_length": 103.04479503631592, | |
| "epoch": 0.9342388603611065, | |
| "grad_norm": 0.3517305552959442, | |
| "kl": 0.640283203125, | |
| "learning_rate": 2.6112077739857465e-07, | |
| "loss": 0.0256, | |
| "reward": 1.2666667103767395, | |
| "reward_std": 0.24853656738996505, | |
| "rewards/accuracy_reward": 0.3114583415910602, | |
| "rewards/format_reward": 0.9552083522081375, | |
| "step": 1410 | |
| }, | |
| { | |
| "completion_length": 101.159898853302, | |
| "epoch": 0.940864667881398, | |
| "grad_norm": 0.5444666743278503, | |
| "kl": 0.58466796875, | |
| "learning_rate": 2.1121048613912843e-07, | |
| "loss": 0.0234, | |
| "reward": 1.269270870089531, | |
| "reward_std": 0.2637180283665657, | |
| "rewards/accuracy_reward": 0.31562500819563866, | |
| "rewards/format_reward": 0.9536458507180214, | |
| "step": 1420 | |
| }, | |
| { | |
| "completion_length": 107.42942943572999, | |
| "epoch": 0.9474904754016896, | |
| "grad_norm": 0.3939830958843231, | |
| "kl": 0.549267578125, | |
| "learning_rate": 1.665387295994747e-07, | |
| "loss": 0.022, | |
| "reward": 1.2708333641290666, | |
| "reward_std": 0.2713921457529068, | |
| "rewards/accuracy_reward": 0.3205729253590107, | |
| "rewards/format_reward": 0.9502604350447654, | |
| "step": 1430 | |
| }, | |
| { | |
| "completion_length": 110.02474231719971, | |
| "epoch": 0.9541162829219811, | |
| "grad_norm": 0.45775726437568665, | |
| "kl": 0.557666015625, | |
| "learning_rate": 1.271294141622459e-07, | |
| "loss": 0.0223, | |
| "reward": 1.2721354573965074, | |
| "reward_std": 0.2720890769734979, | |
| "rewards/accuracy_reward": 0.3260416738688946, | |
| "rewards/format_reward": 0.9460937708616257, | |
| "step": 1440 | |
| }, | |
| { | |
| "completion_length": 107.73151302337646, | |
| "epoch": 0.9607420904422727, | |
| "grad_norm": 0.5883992910385132, | |
| "kl": 0.538671875, | |
| "learning_rate": 9.300362998030832e-08, | |
| "loss": 0.0215, | |
| "reward": 1.290364620089531, | |
| "reward_std": 0.27429741993546486, | |
| "rewards/accuracy_reward": 0.3398437587544322, | |
| "rewards/format_reward": 0.9505208551883697, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.9607420904422727, | |
| "eval_completion_length": 102.1273176405165, | |
| "eval_kl": 0.5319010416666666, | |
| "eval_loss": 0.021446138620376587, | |
| "eval_reward": 1.3912037346098158, | |
| "eval_reward_std": 0.23611565927664438, | |
| "eval_rewards/accuracy_reward": 0.4351851973268721, | |
| "eval_rewards/format_reward": 0.9560185339715745, | |
| "eval_runtime": 44.1752, | |
| "eval_samples_per_second": 2.241, | |
| "eval_steps_per_second": 0.204, | |
| "step": 1450 | |
| }, | |
| { | |
| "completion_length": 107.99531545639039, | |
| "epoch": 0.9673678979625642, | |
| "grad_norm": 0.46655991673469543, | |
| "kl": 0.56240234375, | |
| "learning_rate": 6.417963969022389e-08, | |
| "loss": 0.0225, | |
| "reward": 1.2661458730697632, | |
| "reward_std": 0.2660699520260096, | |
| "rewards/accuracy_reward": 0.3182291751727462, | |
| "rewards/format_reward": 0.9479166835546493, | |
| "step": 1460 | |
| }, | |
| { | |
| "completion_length": 113.53489923477173, | |
| "epoch": 0.9739937054828557, | |
| "grad_norm": 0.38386690616607666, | |
| "kl": 0.53984375, | |
| "learning_rate": 4.067286863888131e-08, | |
| "loss": 0.0216, | |
| "reward": 1.2630208790302277, | |
| "reward_std": 0.2672860164195299, | |
| "rewards/accuracy_reward": 0.3145833432674408, | |
| "rewards/format_reward": 0.9484375193715096, | |
| "step": 1470 | |
| }, | |
| { | |
| "completion_length": 101.46328430175781, | |
| "epoch": 0.9806195130031473, | |
| "grad_norm": 0.6313057541847229, | |
| "kl": 0.540576171875, | |
| "learning_rate": 2.2495896628529355e-08, | |
| "loss": 0.0216, | |
| "reward": 1.2971354603767395, | |
| "reward_std": 0.24062121249735355, | |
| "rewards/accuracy_reward": 0.34166667591780425, | |
| "rewards/format_reward": 0.9554687708616256, | |
| "step": 1480 | |
| }, | |
| { | |
| "completion_length": 106.94739875793456, | |
| "epoch": 0.9872453205234388, | |
| "grad_norm": 0.3773626685142517, | |
| "kl": 0.53203125, | |
| "learning_rate": 9.658451184600959e-09, | |
| "loss": 0.0213, | |
| "reward": 1.2572917103767396, | |
| "reward_std": 0.25889183739200233, | |
| "rewards/accuracy_reward": 0.3059895936399698, | |
| "rewards/format_reward": 0.9513021022081375, | |
| "step": 1490 | |
| }, | |
| { | |
| "completion_length": 110.40052452087403, | |
| "epoch": 0.9938711280437303, | |
| "grad_norm": 0.40007439255714417, | |
| "kl": 0.56728515625, | |
| "learning_rate": 2.167402349972925e-09, | |
| "loss": 0.0227, | |
| "reward": 1.252343788743019, | |
| "reward_std": 0.2644004987552762, | |
| "rewards/accuracy_reward": 0.30546875689178704, | |
| "rewards/format_reward": 0.946875025331974, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.9938711280437303, | |
| "eval_completion_length": 100.44560368855794, | |
| "eval_kl": 0.5397135416666666, | |
| "eval_loss": 0.0218037161976099, | |
| "eval_reward": 1.371527804268731, | |
| "eval_reward_std": 0.23327534563011593, | |
| "eval_rewards/accuracy_reward": 0.41782407959302265, | |
| "eval_rewards/format_reward": 0.9537037147416009, | |
| "eval_runtime": 43.5394, | |
| "eval_samples_per_second": 2.274, | |
| "eval_steps_per_second": 0.207, | |
| "step": 1500 | |
| }, | |
| { | |
| "completion_length": 108.79543187883165, | |
| "epoch": 0.9998343548119927, | |
| "kl": 0.5859917534722222, | |
| "reward": 1.2821180986033545, | |
| "reward_std": 0.28485159451762837, | |
| "rewards/accuracy_reward": 0.3333333449231254, | |
| "rewards/format_reward": 0.9487847404347526, | |
| "step": 1509, | |
| "total_flos": 0.0, | |
| "train_loss": 0.021261748100807297, | |
| "train_runtime": 42718.5249, | |
| "train_samples_per_second": 1.696, | |
| "train_steps_per_second": 0.035 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1509, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": false, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |