| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.37735849056603776, |
| "eval_steps": 500, |
| "global_step": 600, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 332.5, |
| "completions/max_terminated_length": 332.5, |
| "completions/mean_length": 279.3291778564453, |
| "completions/mean_terminated_length": 279.3291778564453, |
| "completions/min_length": 220.8, |
| "completions/min_terminated_length": 220.8, |
| "entropy": 0.24179386061926683, |
| "epoch": 0.006289308176100629, |
| "frac_reward_zero_std": 0.10000000223517418, |
| "grad_norm": 0.3676387071609497, |
| "items/detected_max": 32.0, |
| "items/detected_mean": 26.7125, |
| "items/detected_min": 21.0, |
| "kl": 0.00015798515381296359, |
| "learning_rate": 1e-06, |
| "loss": 0.021, |
| "num_tokens": 279138.0, |
| "reward": 0.03066593613475561, |
| "reward_std": 0.024809306021779776, |
| "reward_total": 0.19474640637636184, |
| "rewards/pos_1/mean": 0.19474640265107154, |
| "rewards/pos_1/std": 0.07326587401330471, |
| "rewards/pos_10/mean": 0.011071646504569798, |
| "rewards/pos_10/std": 0.02179219771642238, |
| "rewards/pos_11/mean": 0.007418489165138454, |
| "rewards/pos_11/std": 0.01720283292233944, |
| "rewards/pos_12/mean": 0.006210686359554529, |
| "rewards/pos_12/std": 0.0179185833549127, |
| "rewards/pos_13/mean": 0.0039982261369004846, |
| "rewards/pos_13/std": 0.009272939292714, |
| "rewards/pos_14/mean": 0.0012971448712050914, |
| "rewards/pos_14/std": 0.007776029966771602, |
| "rewards/pos_15/mean": -0.0006621864042244852, |
| "rewards/pos_15/std": 0.005226330552250147, |
| "rewards/pos_16/mean": 0.0002548443153500557, |
| "rewards/pos_16/std": 0.0015290660317987203, |
| "rewards/pos_17/mean": -0.00012490232475101994, |
| "rewards/pos_17/std": 0.0002498046495020399, |
| "rewards/pos_18/mean": -4.625929407134921e-19, |
| "rewards/pos_18/std": 9.251858814269843e-19, |
| "rewards/pos_19/mean": -4.625929407134921e-19, |
| "rewards/pos_19/std": 9.251858814269843e-19, |
| "rewards/pos_2/mean": 0.11575367655605077, |
| "rewards/pos_2/std": 0.07039788216352463, |
| "rewards/pos_20/mean": -4.625929407134921e-19, |
| "rewards/pos_20/std": 9.251858814269843e-19, |
| "rewards/pos_3/mean": 0.08213621266186237, |
| "rewards/pos_3/std": 0.05401332974433899, |
| "rewards/pos_4/mean": 0.059673574194312094, |
| "rewards/pos_4/std": 0.05429626442492008, |
| "rewards/pos_5/mean": 0.04313204605132341, |
| "rewards/pos_5/std": 0.04542485494166613, |
| "rewards/pos_6/mean": 0.03173917992971838, |
| "rewards/pos_6/std": 0.0341308044269681, |
| "rewards/pos_7/mean": 0.025256002554669976, |
| "rewards/pos_7/std": 0.03226408157497644, |
| "rewards/pos_8/mean": 0.016526030853856354, |
| "rewards/pos_8/std": 0.027043108874931932, |
| "rewards/pos_9/mean": 0.014891598536632956, |
| "rewards/pos_9/std": 0.024382113991305233, |
| "step": 10 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 337.5, |
| "completions/max_terminated_length": 337.5, |
| "completions/mean_length": 279.1000091552734, |
| "completions/mean_terminated_length": 279.1000091552734, |
| "completions/min_length": 221.7, |
| "completions/min_terminated_length": 221.7, |
| "entropy": 0.24853383572772145, |
| "epoch": 0.012578616352201259, |
| "frac_reward_zero_std": 0.07500000223517418, |
| "grad_norm": 0.3557882308959961, |
| "items/detected_max": 33.0, |
| "items/detected_mean": 26.795833333333334, |
| "items/detected_min": 20.0, |
| "kl": 0.00038001918375509073, |
| "learning_rate": 1e-06, |
| "loss": 0.0214, |
| "num_tokens": 561062.0, |
| "reward": 0.027369741536676883, |
| "reward_std": 0.028840648476034403, |
| "reward_total": 0.1867876473814249, |
| "rewards/pos_1/mean": 0.1867876447737217, |
| "rewards/pos_1/std": 0.09987052232027054, |
| "rewards/pos_10/mean": 0.005311872425954789, |
| "rewards/pos_10/std": 0.0194315308239311, |
| "rewards/pos_11/mean": 0.002390401461161673, |
| "rewards/pos_11/std": 0.012136956956237555, |
| "rewards/pos_12/mean": 0.0009011006564833224, |
| "rewards/pos_12/std": 0.013748793490231037, |
| "rewards/pos_13/mean": 0.0011169454897753893, |
| "rewards/pos_13/std": 0.010598769155330956, |
| "rewards/pos_14/mean": 0.000874718651175499, |
| "rewards/pos_14/std": 0.006548650283366442, |
| "rewards/pos_15/mean": 0.0003694009268656373, |
| "rewards/pos_15/std": 0.004986041854135692, |
| "rewards/pos_16/mean": 0.00024476498365402224, |
| "rewards/pos_16/std": 0.001508907275274396, |
| "rewards/pos_17/mean": 0.00012490232475101948, |
| "rewards/pos_17/std": 0.0017486325465142726, |
| "rewards/pos_18/mean": 0.0, |
| "rewards/pos_18/std": 0.0, |
| "rewards/pos_19/mean": -0.0002410189714282751, |
| "rewards/pos_19/std": 0.0002783047268167138, |
| "rewards/pos_2/mean": 0.10891713332384825, |
| "rewards/pos_2/std": 0.08113598320633172, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.07684338670223952, |
| "rewards/pos_3/std": 0.0730495484545827, |
| "rewards/pos_4/mean": 0.05329529843293131, |
| "rewards/pos_4/std": 0.06378404032438993, |
| "rewards/pos_5/mean": 0.038301121164113286, |
| "rewards/pos_5/std": 0.05285612549632788, |
| "rewards/pos_6/mean": 0.026770615531131626, |
| "rewards/pos_6/std": 0.047124340385198596, |
| "rewards/pos_7/mean": 0.01945003040600568, |
| "rewards/pos_7/std": 0.034271925175562504, |
| "rewards/pos_8/mean": 0.015267963660880924, |
| "rewards/pos_8/std": 0.028611483983695507, |
| "rewards/pos_9/mean": 0.010668529532267712, |
| "rewards/pos_9/std": 0.02512239357456565, |
| "step": 20 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 327.5, |
| "completions/max_terminated_length": 327.5, |
| "completions/mean_length": 275.84167938232423, |
| "completions/mean_terminated_length": 275.84167938232423, |
| "completions/min_length": 216.4, |
| "completions/min_terminated_length": 216.4, |
| "entropy": 0.2397403221887847, |
| "epoch": 0.018867924528301886, |
| "frac_reward_zero_std": 0.15833333730697632, |
| "grad_norm": 0.3486528694629669, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 26.129166666666666, |
| "items/detected_min": 19.0, |
| "kl": 0.0006169086136045128, |
| "learning_rate": 1e-06, |
| "loss": 0.0205, |
| "num_tokens": 846746.0, |
| "reward": 0.023525450099259614, |
| "reward_std": 0.02375112334266305, |
| "reward_total": 0.17002619691193105, |
| "rewards/pos_1/mean": 0.1700261954218149, |
| "rewards/pos_1/std": 0.06453945254907012, |
| "rewards/pos_10/mean": 0.007429246860556305, |
| "rewards/pos_10/std": 0.01979129179380834, |
| "rewards/pos_11/mean": 0.003910829390224535, |
| "rewards/pos_11/std": 0.013063314463943243, |
| "rewards/pos_12/mean": 0.0029709233087487517, |
| "rewards/pos_12/std": 0.012117477436549961, |
| "rewards/pos_13/mean": 0.001196519553195685, |
| "rewards/pos_13/std": 0.007730130874551833, |
| "rewards/pos_14/mean": 0.001231371099129319, |
| "rewards/pos_14/std": 0.006333466898649931, |
| "rewards/pos_15/mean": 0.0003137066261842847, |
| "rewards/pos_15/std": 0.00301144989207387, |
| "rewards/pos_16/mean": 0.00045505964662879707, |
| "rewards/pos_16/std": 0.0028057243209332226, |
| "rewards/pos_17/mean": 0.0008398459758609533, |
| "rewards/pos_17/std": 0.0021793012507259845, |
| "rewards/pos_18/mean": 0.0008421394741162658, |
| "rewards/pos_18/std": 0.002036854811012745, |
| "rewards/pos_19/mean": 0.0004743130411952734, |
| "rewards/pos_19/std": 0.0009486260823905468, |
| "rewards/pos_2/mean": 0.08502510460093618, |
| "rewards/pos_2/std": 0.058614648133516314, |
| "rewards/pos_20/mean": 0.0004743130411952734, |
| "rewards/pos_20/std": 0.0009486260823905468, |
| "rewards/pos_3/mean": 0.05779995219781995, |
| "rewards/pos_3/std": 0.04915156504139304, |
| "rewards/pos_4/mean": 0.046246381290256974, |
| "rewards/pos_4/std": 0.054492526780813934, |
| "rewards/pos_5/mean": 0.031023955554701388, |
| "rewards/pos_5/std": 0.04292720593512058, |
| "rewards/pos_6/mean": 0.023734602285549043, |
| "rewards/pos_6/std": 0.036466607637703416, |
| "rewards/pos_7/mean": 0.016808894340647383, |
| "rewards/pos_7/std": 0.03472186983563006, |
| "rewards/pos_8/mean": 0.012099898888845929, |
| "rewards/pos_8/std": 0.03379071457311511, |
| "rewards/pos_9/mean": 0.007605718693230301, |
| "rewards/pos_9/std": 0.029351587407290936, |
| "step": 30 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 339.4, |
| "completions/max_terminated_length": 339.4, |
| "completions/mean_length": 278.9750091552734, |
| "completions/mean_terminated_length": 278.9750091552734, |
| "completions/min_length": 217.7, |
| "completions/min_terminated_length": 217.7, |
| "entropy": 0.2447954393302401, |
| "epoch": 0.025157232704402517, |
| "frac_reward_zero_std": 0.11666666939854622, |
| "grad_norm": 0.22751903533935547, |
| "items/detected_max": 31.0, |
| "items/detected_mean": 26.841666666666665, |
| "items/detected_min": 18.0, |
| "kl": 0.0006590462109670626, |
| "learning_rate": 1e-06, |
| "loss": 0.0212, |
| "num_tokens": 1127066.0, |
| "reward": 0.01795863511506468, |
| "reward_std": 0.01903191963210702, |
| "reward_total": 0.19013034030795098, |
| "rewards/pos_1/mean": 0.19013033509254457, |
| "rewards/pos_1/std": 0.07505300045013427, |
| "rewards/pos_10/mean": 0.0008360685140360146, |
| "rewards/pos_10/std": 0.011655322345905005, |
| "rewards/pos_11/mean": -0.0004029428935609758, |
| "rewards/pos_11/std": 0.011080938763916493, |
| "rewards/pos_12/mean": -0.00019872152479365467, |
| "rewards/pos_12/std": 0.006120560830458999, |
| "rewards/pos_13/mean": -0.0004367430577985942, |
| "rewards/pos_13/std": 0.0051455656997859475, |
| "rewards/pos_14/mean": -0.0005456584040075541, |
| "rewards/pos_14/std": 0.0031746502034366133, |
| "rewards/pos_15/mean": -0.0002604166860692203, |
| "rewards/pos_15/std": 0.002604166674427688, |
| "rewards/pos_16/mean": -0.00038226647302508354, |
| "rewards/pos_16/std": 0.0007645330158993601, |
| "rewards/pos_17/mean": 0.0, |
| "rewards/pos_17/std": 0.0, |
| "rewards/pos_18/mean": -0.00012260881485417485, |
| "rewards/pos_18/std": 0.0002452176297083497, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.07478453110670671, |
| "rewards/pos_2/std": 0.05773529279977083, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.044285639701411125, |
| "rewards/pos_3/std": 0.04824868915602565, |
| "rewards/pos_4/mean": 0.024659152084495872, |
| "rewards/pos_4/std": 0.04134202003479004, |
| "rewards/pos_5/mean": 0.014307024469599128, |
| "rewards/pos_5/std": 0.03745766724459827, |
| "rewards/pos_6/mean": 0.007001709949690849, |
| "rewards/pos_6/std": 0.02652715602889657, |
| "rewards/pos_7/mean": 0.0030801351997070013, |
| "rewards/pos_7/std": 0.022172968951053917, |
| "rewards/pos_8/mean": 0.001393768424168229, |
| "rewards/pos_8/std": 0.017827122542075812, |
| "rewards/pos_9/mean": 0.0010436996119096876, |
| "rewards/pos_9/std": 0.01348349207546562, |
| "step": 40 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 336.2, |
| "completions/max_terminated_length": 336.2, |
| "completions/mean_length": 277.8895919799805, |
| "completions/mean_terminated_length": 277.8895919799805, |
| "completions/min_length": 220.9, |
| "completions/min_terminated_length": 220.9, |
| "entropy": 0.24365584701299667, |
| "epoch": 0.031446540880503145, |
| "frac_reward_zero_std": 0.1666666693985462, |
| "grad_norm": 0.2256021499633789, |
| "items/detected_max": 31.0, |
| "items/detected_mean": 26.358333333333334, |
| "items/detected_min": 19.0, |
| "kl": 0.000693480301076003, |
| "learning_rate": 1e-06, |
| "loss": 0.0207, |
| "num_tokens": 1402153.0, |
| "reward": 0.018253592820838094, |
| "reward_std": 0.024873400665819646, |
| "reward_total": 0.12365640453062951, |
| "rewards/pos_1/mean": 0.12365640229545534, |
| "rewards/pos_1/std": 0.08260059077292681, |
| "rewards/pos_10/mean": 0.003422046871855855, |
| "rewards/pos_10/std": 0.015368315065279603, |
| "rewards/pos_11/mean": 0.0008227722952142358, |
| "rewards/pos_11/std": 0.012859936244785786, |
| "rewards/pos_12/mean": 0.0011632090085186065, |
| "rewards/pos_12/std": 0.008362313848920166, |
| "rewards/pos_13/mean": -0.0005874803406186402, |
| "rewards/pos_13/std": 0.00946071040816605, |
| "rewards/pos_14/mean": 0.00012089891824871302, |
| "rewards/pos_14/std": 0.003749142773449421, |
| "rewards/pos_15/mean": -0.00013020834885537624, |
| "rewards/pos_15/std": 0.00234375, |
| "rewards/pos_16/mean": -0.0002548443153500557, |
| "rewards/pos_16/std": 0.0005096886772662402, |
| "rewards/pos_17/mean": -0.00024980464950203897, |
| "rewards/pos_17/std": 0.0004996092990040779, |
| "rewards/pos_18/mean": 0.0, |
| "rewards/pos_18/std": 0.0, |
| "rewards/pos_19/mean": -0.00012050948571413755, |
| "rewards/pos_19/std": 0.0002410189714282751, |
| "rewards/pos_2/mean": 0.08051420627161861, |
| "rewards/pos_2/std": 0.06426868531852961, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.05728820329532027, |
| "rewards/pos_3/std": 0.061889216490089896, |
| "rewards/pos_4/mean": 0.0330103222746402, |
| "rewards/pos_4/std": 0.054461258091032506, |
| "rewards/pos_5/mean": 0.021673589176498354, |
| "rewards/pos_5/std": 0.05066446699202061, |
| "rewards/pos_6/mean": 0.017316767123702448, |
| "rewards/pos_6/std": 0.04409534465521574, |
| "rewards/pos_7/mean": 0.013347537699155509, |
| "rewards/pos_7/std": 0.037946426495909694, |
| "rewards/pos_8/mean": 0.009339081990765408, |
| "rewards/pos_8/std": 0.027106760535389185, |
| "rewards/pos_9/mean": 0.004739647073438391, |
| "rewards/pos_9/std": 0.021040741237811744, |
| "step": 50 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 332.2, |
| "completions/max_terminated_length": 332.2, |
| "completions/mean_length": 275.77292785644534, |
| "completions/mean_terminated_length": 275.77292785644534, |
| "completions/min_length": 210.5, |
| "completions/min_terminated_length": 210.5, |
| "entropy": 0.24245100983728965, |
| "epoch": 0.03773584905660377, |
| "frac_reward_zero_std": 0.1833333380520344, |
| "grad_norm": 0.34496742486953735, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 26.470833333333335, |
| "items/detected_min": 19.0, |
| "kl": 0.0010004034176139007, |
| "learning_rate": 1e-06, |
| "loss": 0.0211, |
| "num_tokens": 1695536.0, |
| "reward": 0.027998355263844133, |
| "reward_std": 0.025541644264012574, |
| "reward_total": 0.23674236759543418, |
| "rewards/pos_1/mean": 0.23674236685037614, |
| "rewards/pos_1/std": 0.0875159040093422, |
| "rewards/pos_10/mean": 0.008996098558418453, |
| "rewards/pos_10/std": 0.018989878077991306, |
| "rewards/pos_11/mean": 0.005316582974046469, |
| "rewards/pos_11/std": 0.015308488998562098, |
| "rewards/pos_12/mean": 0.004086111392825842, |
| "rewards/pos_12/std": 0.014277325919829309, |
| "rewards/pos_13/mean": 0.0035586868296377363, |
| "rewards/pos_13/std": 0.010105705843307078, |
| "rewards/pos_14/mean": 0.0022290571592748164, |
| "rewards/pos_14/std": 0.00829645860940218, |
| "rewards/pos_15/mean": 0.0011718750232830645, |
| "rewards/pos_15/std": 0.0039062500465661286, |
| "rewards/pos_16/mean": -0.0002548443153500557, |
| "rewards/pos_16/std": 0.0005096886772662402, |
| "rewards/pos_17/mean": -0.00012490232475101948, |
| "rewards/pos_17/std": 0.00024980464950203897, |
| "rewards/pos_18/mean": -0.00012260881485417485, |
| "rewards/pos_18/std": 0.0002452176297083497, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.10075544621795415, |
| "rewards/pos_2/std": 0.07253548875451088, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.0666294975206256, |
| "rewards/pos_3/std": 0.05721334554255009, |
| "rewards/pos_4/mean": 0.04308908779639751, |
| "rewards/pos_4/std": 0.05217773541808128, |
| "rewards/pos_5/mean": 0.031524106953293086, |
| "rewards/pos_5/std": 0.04607770144939423, |
| "rewards/pos_6/mean": 0.019696346682030706, |
| "rewards/pos_6/std": 0.03438961319625378, |
| "rewards/pos_7/mean": 0.013458259997423738, |
| "rewards/pos_7/std": 0.03490288313478231, |
| "rewards/pos_8/mean": 0.013818001028266736, |
| "rewards/pos_8/std": 0.02888077814131975, |
| "rewards/pos_9/mean": 0.009397907659877091, |
| "rewards/pos_9/std": 0.02525058896280825, |
| "step": 60 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 330.6, |
| "completions/max_terminated_length": 330.6, |
| "completions/mean_length": 271.752099609375, |
| "completions/mean_terminated_length": 271.752099609375, |
| "completions/min_length": 213.8, |
| "completions/min_terminated_length": 213.8, |
| "entropy": 0.23684501089155674, |
| "epoch": 0.0440251572327044, |
| "frac_reward_zero_std": 0.1416666701436043, |
| "grad_norm": 0.30509424209594727, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 26.395833333333332, |
| "items/detected_min": 20.0, |
| "kl": 0.00137258033840529, |
| "learning_rate": 1e-06, |
| "loss": 0.0193, |
| "num_tokens": 1990757.0, |
| "reward": 0.019408026617020367, |
| "reward_std": 0.019861249532550574, |
| "reward_total": 0.15183117985725403, |
| "rewards/pos_1/mean": 0.15183117538690566, |
| "rewards/pos_1/std": 0.06800224892795086, |
| "rewards/pos_10/mean": 0.0017800831352360546, |
| "rewards/pos_10/std": 0.01105083387810737, |
| "rewards/pos_11/mean": -5.587483756244183e-05, |
| "rewards/pos_11/std": 0.011234215367585421, |
| "rewards/pos_12/mean": 0.0008566255099140107, |
| "rewards/pos_12/std": 0.006780216773040593, |
| "rewards/pos_13/mean": 0.0007554006297141314, |
| "rewards/pos_13/std": 0.006829833285883069, |
| "rewards/pos_14/mean": -0.0002901801839470863, |
| "rewards/pos_14/std": 0.006283610546961427, |
| "rewards/pos_15/mean": -0.0006621863809414208, |
| "rewards/pos_15/std": 0.0026221639243885876, |
| "rewards/pos_16/mean": 0.00038226647302508354, |
| "rewards/pos_16/std": 0.0012742216931656003, |
| "rewards/pos_17/mean": -0.0004996092990040779, |
| "rewards/pos_17/std": 0.0009992185980081559, |
| "rewards/pos_18/mean": 0.0, |
| "rewards/pos_18/std": 0.0, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.10222204849123954, |
| "rewards/pos_2/std": 0.06664478406310081, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.060004901699721815, |
| "rewards/pos_3/std": 0.06095823477953673, |
| "rewards/pos_4/mean": 0.0354589288122952, |
| "rewards/pos_4/std": 0.05521870870143175, |
| "rewards/pos_5/mean": 0.01933925971388817, |
| "rewards/pos_5/std": 0.03796455860137939, |
| "rewards/pos_6/mean": 0.008141869585961104, |
| "rewards/pos_6/std": 0.021356439776718618, |
| "rewards/pos_7/mean": 0.0011259438935667277, |
| "rewards/pos_7/std": 0.017314390907995404, |
| "rewards/pos_8/mean": 0.003906934190308675, |
| "rewards/pos_8/std": 0.012045483710244298, |
| "rewards/pos_9/mean": 0.003862919790359834, |
| "rewards/pos_9/std": 0.010645803553052247, |
| "step": 70 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 334.6, |
| "completions/max_terminated_length": 334.6, |
| "completions/mean_length": 270.6020904541016, |
| "completions/mean_terminated_length": 270.6020904541016, |
| "completions/min_length": 208.7, |
| "completions/min_terminated_length": 208.7, |
| "entropy": 0.2382718403513233, |
| "epoch": 0.050314465408805034, |
| "frac_reward_zero_std": 0.1333333358168602, |
| "grad_norm": 0.23278222978115082, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 26.154166666666665, |
| "items/detected_min": 15.0, |
| "kl": 0.0016101757981838696, |
| "learning_rate": 1e-06, |
| "loss": 0.0192, |
| "num_tokens": 2274070.0, |
| "reward": 0.018261409434489905, |
| "reward_std": 0.019813417922705412, |
| "reward_total": 0.13820529966615142, |
| "rewards/pos_1/mean": 0.138205295545049, |
| "rewards/pos_1/std": 0.06373181324452162, |
| "rewards/pos_10/mean": 0.001570706581696868, |
| "rewards/pos_10/std": 0.01301275547593832, |
| "rewards/pos_11/mean": 0.0004506188444793224, |
| "rewards/pos_11/std": 0.008641806431114674, |
| "rewards/pos_12/mean": -0.0003394705709069967, |
| "rewards/pos_12/std": 0.00797676993533969, |
| "rewards/pos_13/mean": 0.00011044344864785671, |
| "rewards/pos_13/std": 0.0031491465866565703, |
| "rewards/pos_14/mean": -0.0005456584272906184, |
| "rewards/pos_14/std": 0.0024994839914143085, |
| "rewards/pos_15/mean": -0.00026041667442768814, |
| "rewards/pos_15/std": 0.0026041667442768812, |
| "rewards/pos_16/mean": -0.0002548443153500557, |
| "rewards/pos_16/std": 0.0005096886772662402, |
| "rewards/pos_17/mean": 0.0, |
| "rewards/pos_17/std": 0.0, |
| "rewards/pos_18/mean": 0.0, |
| "rewards/pos_18/std": 0.0, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.07644948430825024, |
| "rewards/pos_2/std": 0.05021180454641581, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.04756893061567098, |
| "rewards/pos_3/std": 0.047028692159801724, |
| "rewards/pos_4/mean": 0.03352718676906079, |
| "rewards/pos_4/std": 0.04555289912968874, |
| "rewards/pos_5/mean": 0.02097760115284473, |
| "rewards/pos_5/std": 0.039121273113414644, |
| "rewards/pos_6/mean": 0.018290499058639397, |
| "rewards/pos_6/std": 0.03251297641545534, |
| "rewards/pos_7/mean": 0.016266733058728278, |
| "rewards/pos_7/std": 0.03300811811350286, |
| "rewards/pos_8/mean": 0.00988356041489169, |
| "rewards/pos_8/std": 0.028713068063370883, |
| "rewards/pos_9/mean": 0.0033275068912189455, |
| "rewards/pos_9/std": 0.017993874754756688, |
| "step": 80 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 329.7, |
| "completions/max_terminated_length": 329.7, |
| "completions/mean_length": 267.65834045410156, |
| "completions/mean_terminated_length": 267.65834045410156, |
| "completions/min_length": 209.7, |
| "completions/min_terminated_length": 209.7, |
| "entropy": 0.2301488552863399, |
| "epoch": 0.05660377358490566, |
| "frac_reward_zero_std": 0.1916666731238365, |
| "grad_norm": 0.18648235499858856, |
| "items/detected_max": 31.0, |
| "items/detected_mean": 25.570833333333333, |
| "items/detected_min": 19.0, |
| "kl": 0.0020055538250744577, |
| "learning_rate": 1e-06, |
| "loss": 0.0183, |
| "num_tokens": 2554026.0, |
| "reward": 0.023970679030753672, |
| "reward_std": 0.021331347804516555, |
| "reward_total": 0.19938781536184252, |
| "rewards/pos_1/mean": 0.19938780865631997, |
| "rewards/pos_1/std": 0.0798557098954916, |
| "rewards/pos_10/mean": 0.0008782313205301761, |
| "rewards/pos_10/std": 0.011960454075597227, |
| "rewards/pos_11/mean": 0.00036563378816936167, |
| "rewards/pos_11/std": 0.01036488190293312, |
| "rewards/pos_12/mean": 0.0002656884375028312, |
| "rewards/pos_12/std": 0.005923388386145234, |
| "rewards/pos_13/mean": -0.0001367966295219958, |
| "rewards/pos_13/std": 0.002462339331395924, |
| "rewards/pos_14/mean": -0.0006665573455393314, |
| "rewards/pos_14/std": 0.0013331146910786629, |
| "rewards/pos_15/mean": -0.0006510416860692203, |
| "rewards/pos_15/std": 0.0013020833721384406, |
| "rewards/pos_16/mean": -0.0002548443153500557, |
| "rewards/pos_16/std": 0.0005096886772662402, |
| "rewards/pos_17/mean": -0.00024980464950203897, |
| "rewards/pos_17/std": 0.0004996092990040779, |
| "rewards/pos_18/mean": 0.0, |
| "rewards/pos_18/std": 0.0, |
| "rewards/pos_19/mean": -0.00012050948571413755, |
| "rewards/pos_19/std": 0.0002410189714282751, |
| "rewards/pos_2/mean": 0.10163866644725203, |
| "rewards/pos_2/std": 0.06948479041457176, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.06862257414031774, |
| "rewards/pos_3/std": 0.06218107980675995, |
| "rewards/pos_4/mean": 0.044460686412639915, |
| "rewards/pos_4/std": 0.050121272914111614, |
| "rewards/pos_5/mean": 0.02706362712197006, |
| "rewards/pos_5/std": 0.04326969822868705, |
| "rewards/pos_6/mean": 0.017091114865615965, |
| "rewards/pos_6/std": 0.031835716869682076, |
| "rewards/pos_7/mean": 0.010764542536344379, |
| "rewards/pos_7/std": 0.019727899390272797, |
| "rewards/pos_8/mean": 0.007698593634025505, |
| "rewards/pos_8/std": 0.02043189574033022, |
| "rewards/pos_9/mean": 0.003255945723503828, |
| "rewards/pos_9/std": 0.01512229349464178, |
| "step": 90 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 333.0, |
| "completions/max_terminated_length": 333.0, |
| "completions/mean_length": 272.6104248046875, |
| "completions/mean_terminated_length": 272.6104248046875, |
| "completions/min_length": 210.0, |
| "completions/min_terminated_length": 210.0, |
| "entropy": 0.24247641147424778, |
| "epoch": 0.06289308176100629, |
| "frac_reward_zero_std": 0.2333333395421505, |
| "grad_norm": 0.4801751375198364, |
| "items/detected_max": 34.0, |
| "items/detected_mean": 26.158333333333335, |
| "items/detected_min": 18.0, |
| "kl": 0.0021246019886651387, |
| "learning_rate": 1e-06, |
| "loss": 0.0187, |
| "num_tokens": 2834739.0, |
| "reward": 0.02137735360302031, |
| "reward_std": 0.022431656159460543, |
| "reward_total": 0.15205592066049575, |
| "rewards/pos_1/mean": 0.1520559150725603, |
| "rewards/pos_1/std": 0.08682643361389637, |
| "rewards/pos_10/mean": 0.002806135616265237, |
| "rewards/pos_10/std": 0.014712602784857153, |
| "rewards/pos_11/mean": 0.0010838294634595512, |
| "rewards/pos_11/std": 0.010319896694272756, |
| "rewards/pos_12/mean": 9.85862105153501e-05, |
| "rewards/pos_12/std": 0.00413814561907202, |
| "rewards/pos_13/mean": -0.0005735396873205901, |
| "rewards/pos_13/std": 0.003230412770062685, |
| "rewards/pos_14/mean": -0.0004123469581827521, |
| "rewards/pos_14/std": 0.002757887914776802, |
| "rewards/pos_15/mean": 0.00013020833721384407, |
| "rewards/pos_15/std": 0.001822916720993817, |
| "rewards/pos_16/mean": 0.0, |
| "rewards/pos_16/std": 0.0, |
| "rewards/pos_17/mean": -0.00024980464950203897, |
| "rewards/pos_17/std": 0.0004996092990040779, |
| "rewards/pos_18/mean": 0.0, |
| "rewards/pos_18/std": 0.0, |
| "rewards/pos_19/mean": -0.00012050948571413755, |
| "rewards/pos_19/std": 0.0002410189714282751, |
| "rewards/pos_2/mean": 0.09235873296856881, |
| "rewards/pos_2/std": 0.0637791832908988, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.06637982809916139, |
| "rewards/pos_3/std": 0.05287711825221777, |
| "rewards/pos_4/mean": 0.03900537546724081, |
| "rewards/pos_4/std": 0.054096858203411105, |
| "rewards/pos_5/mean": 0.027999201137572528, |
| "rewards/pos_5/std": 0.04368511065840721, |
| "rewards/pos_6/mean": 0.021048975386656822, |
| "rewards/pos_6/std": 0.03628326542675495, |
| "rewards/pos_7/mean": 0.012417805101722478, |
| "rewards/pos_7/std": 0.03044475233182311, |
| "rewards/pos_8/mean": 0.009451015887316316, |
| "rewards/pos_8/std": 0.022707051225006582, |
| "rewards/pos_9/mean": 0.004067648795899004, |
| "rewards/pos_9/std": 0.02021083978470415, |
| "step": 100 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 330.6, |
| "completions/max_terminated_length": 330.6, |
| "completions/mean_length": 274.5708465576172, |
| "completions/mean_terminated_length": 274.5708465576172, |
| "completions/min_length": 214.3, |
| "completions/min_terminated_length": 214.3, |
| "entropy": 0.23819080513591567, |
| "epoch": 0.06918238993710692, |
| "frac_reward_zero_std": 0.0666666679084301, |
| "grad_norm": 0.2140370011329651, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 26.3875, |
| "items/detected_min": 19.0, |
| "kl": 0.0024520867267710856, |
| "learning_rate": 1e-06, |
| "loss": 0.0186, |
| "num_tokens": 3123245.0, |
| "reward": 0.028679715376347302, |
| "reward_std": 0.025797955319285394, |
| "reward_total": 0.1972983568906784, |
| "rewards/pos_1/mean": 0.1972983494400978, |
| "rewards/pos_1/std": 0.08312824480235577, |
| "rewards/pos_10/mean": 0.006511424528434872, |
| "rewards/pos_10/std": 0.012539930804632605, |
| "rewards/pos_11/mean": 0.004311096295714379, |
| "rewards/pos_11/std": 0.009560632146894931, |
| "rewards/pos_12/mean": 0.003057955822441727, |
| "rewards/pos_12/std": 0.009056452894583344, |
| "rewards/pos_13/mean": 0.0026791852549649777, |
| "rewards/pos_13/std": 0.0089504691073671, |
| "rewards/pos_14/mean": 0.003384079411625862, |
| "rewards/pos_14/std": 0.00754068112000823, |
| "rewards/pos_15/mean": 0.001009297906421125, |
| "rewards/pos_15/std": 0.004131007054820657, |
| "rewards/pos_16/mean": 0.0010092979297041899, |
| "rewards/pos_16/std": 0.0020185960456728944, |
| "rewards/pos_17/mean": -0.00012490234803408338, |
| "rewards/pos_17/std": 0.0021070301532745373, |
| "rewards/pos_18/mean": 4.625929407134921e-19, |
| "rewards/pos_18/std": 9.251858814269843e-19, |
| "rewards/pos_19/mean": 4.625929407134921e-19, |
| "rewards/pos_19/std": 9.251858814269843e-19, |
| "rewards/pos_2/mean": 0.11366754285991192, |
| "rewards/pos_2/std": 0.07409449964761734, |
| "rewards/pos_20/mean": 4.625929407134921e-19, |
| "rewards/pos_20/std": 9.251858814269843e-19, |
| "rewards/pos_3/mean": 0.078475221991539, |
| "rewards/pos_3/std": 0.06386412046849728, |
| "rewards/pos_4/mean": 0.05367650641128421, |
| "rewards/pos_4/std": 0.05735928248614073, |
| "rewards/pos_5/mean": 0.0359677754342556, |
| "rewards/pos_5/std": 0.051013645529747007, |
| "rewards/pos_6/mean": 0.030178581224754454, |
| "rewards/pos_6/std": 0.042489323485642674, |
| "rewards/pos_7/mean": 0.017106734635308384, |
| "rewards/pos_7/std": 0.031865698378533126, |
| "rewards/pos_8/mean": 0.013932319730520249, |
| "rewards/pos_8/std": 0.03123240638524294, |
| "rewards/pos_9/mean": 0.011453809647355229, |
| "rewards/pos_9/std": 0.025007078982889654, |
| "step": 110 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 334.4, |
| "completions/max_terminated_length": 334.4, |
| "completions/mean_length": 277.0229248046875, |
| "completions/mean_terminated_length": 277.0229248046875, |
| "completions/min_length": 216.8, |
| "completions/min_terminated_length": 216.8, |
| "entropy": 0.2525980423515042, |
| "epoch": 0.07547169811320754, |
| "frac_reward_zero_std": 0.15833333805203437, |
| "grad_norm": 0.4217924177646637, |
| "items/detected_max": 31.0, |
| "items/detected_mean": 26.629166666666666, |
| "items/detected_min": 14.0, |
| "kl": 0.0021200735282036476, |
| "learning_rate": 1e-06, |
| "loss": 0.0208, |
| "num_tokens": 3414244.0, |
| "reward": 0.021100316557567566, |
| "reward_std": 0.02556178504601121, |
| "reward_total": 0.15421188264153898, |
| "rewards/pos_1/mean": 0.1542118826182559, |
| "rewards/pos_1/std": 0.08668690882623195, |
| "rewards/pos_10/mean": 0.001462233636993915, |
| "rewards/pos_10/std": 0.014794380008243024, |
| "rewards/pos_11/mean": 0.0009601796511560679, |
| "rewards/pos_11/std": 0.009470279794186354, |
| "rewards/pos_12/mean": 0.00016555656911805273, |
| "rewards/pos_12/std": 0.008588731940835714, |
| "rewards/pos_13/mean": 0.0011903240345418453, |
| "rewards/pos_13/std": 0.006918352888897061, |
| "rewards/pos_14/mean": 0.0006779891438782216, |
| "rewards/pos_14/std": 0.005625644978135824, |
| "rewards/pos_15/mean": 0.0012298537883907556, |
| "rewards/pos_15/std": 0.004600600665435195, |
| "rewards/pos_16/mean": 0.0018836816772818566, |
| "rewards/pos_16/std": 0.003439636155962944, |
| "rewards/pos_17/mean": -1.7571356147527693e-05, |
| "rewards/pos_17/std": 0.001963294483721256, |
| "rewards/pos_18/mean": 0.0004820379428565502, |
| "rewards/pos_18/std": 0.0009640758857131004, |
| "rewards/pos_19/mean": 0.0004820379428565502, |
| "rewards/pos_19/std": 0.0009640758857131004, |
| "rewards/pos_2/mean": 0.09640580550767482, |
| "rewards/pos_2/std": 0.0769618958234787, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.05643308900762349, |
| "rewards/pos_3/std": 0.06533879688940943, |
| "rewards/pos_4/mean": 0.03552527402061969, |
| "rewards/pos_4/std": 0.0584185004234314, |
| "rewards/pos_5/mean": 0.02315435045547929, |
| "rewards/pos_5/std": 0.04358684956096113, |
| "rewards/pos_6/mean": 0.020684696536045523, |
| "rewards/pos_6/std": 0.03866914417594671, |
| "rewards/pos_7/mean": 0.012705407536122948, |
| "rewards/pos_7/std": 0.03549118069931865, |
| "rewards/pos_8/mean": 0.009155850886600092, |
| "rewards/pos_8/std": 0.02957823732867837, |
| "rewards/pos_9/mean": 0.005213634658139199, |
| "rewards/pos_9/std": 0.019175097346305847, |
| "step": 120 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 345.9, |
| "completions/max_terminated_length": 345.9, |
| "completions/mean_length": 271.9437591552734, |
| "completions/mean_terminated_length": 271.9437591552734, |
| "completions/min_length": 216.0, |
| "completions/min_terminated_length": 216.0, |
| "entropy": 0.22371312171841662, |
| "epoch": 0.08176100628930817, |
| "frac_reward_zero_std": 0.21666667237877846, |
| "grad_norm": 0.2381807267665863, |
| "items/detected_max": 31.0, |
| "items/detected_mean": 25.783333333333335, |
| "items/detected_min": 19.0, |
| "kl": 0.0023203926954010966, |
| "learning_rate": 1e-06, |
| "loss": 0.0193, |
| "num_tokens": 3687189.0, |
| "reward": 0.017484810762107374, |
| "reward_std": 0.019230466429144145, |
| "reward_total": 0.12171695046126843, |
| "rewards/pos_1/mean": 0.1217169489711523, |
| "rewards/pos_1/std": 0.06389217637479305, |
| "rewards/pos_10/mean": 0.0034612691204529257, |
| "rewards/pos_10/std": 0.012485883384943008, |
| "rewards/pos_11/mean": 0.0005503416177816689, |
| "rewards/pos_11/std": 0.010597484931349755, |
| "rewards/pos_12/mean": 0.00045946375466883184, |
| "rewards/pos_12/std": 0.006546087586320937, |
| "rewards/pos_13/mean": 0.000506892695557326, |
| "rewards/pos_13/std": 0.006360282865352928, |
| "rewards/pos_14/mean": -0.00014572404325008392, |
| "rewards/pos_14/std": 0.003616921044886112, |
| "rewards/pos_15/mean": 0.0, |
| "rewards/pos_15/std": 0.002083333395421505, |
| "rewards/pos_16/mean": -0.0007645329460501671, |
| "rewards/pos_16/std": 0.0015290660317987203, |
| "rewards/pos_17/mean": -0.00024980464950203897, |
| "rewards/pos_17/std": 0.0004996092990040779, |
| "rewards/pos_18/mean": -0.00012260881485417485, |
| "rewards/pos_18/std": 0.0002452176297083497, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.07369502820074558, |
| "rewards/pos_2/std": 0.0545389074832201, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.048825977463275196, |
| "rewards/pos_3/std": 0.04302537702023983, |
| "rewards/pos_4/mean": 0.03623074255883694, |
| "rewards/pos_4/std": 0.04152446836233139, |
| "rewards/pos_5/mean": 0.02578731467947364, |
| "rewards/pos_5/std": 0.039170566760003564, |
| "rewards/pos_6/mean": 0.01823263019323349, |
| "rewards/pos_6/std": 0.03099028319120407, |
| "rewards/pos_7/mean": 0.010217601037584245, |
| "rewards/pos_7/std": 0.02759239007718861, |
| "rewards/pos_8/mean": 0.006804429623298347, |
| "rewards/pos_8/std": 0.02312811641022563, |
| "rewards/pos_9/mean": 0.004490223817992955, |
| "rewards/pos_9/std": 0.016783139877952637, |
| "step": 130 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 326.6, |
| "completions/max_terminated_length": 326.6, |
| "completions/mean_length": 269.00001220703126, |
| "completions/mean_terminated_length": 269.00001220703126, |
| "completions/min_length": 212.8, |
| "completions/min_terminated_length": 212.8, |
| "entropy": 0.23122907076030969, |
| "epoch": 0.0880503144654088, |
| "frac_reward_zero_std": 0.1833333380520344, |
| "grad_norm": 0.15572181344032288, |
| "items/detected_max": 32.0, |
| "items/detected_mean": 25.520833333333332, |
| "items/detected_min": 19.0, |
| "kl": 0.0024950778378600566, |
| "learning_rate": 1e-06, |
| "loss": 0.0175, |
| "num_tokens": 3974029.0, |
| "reward": 0.014080746471881867, |
| "reward_std": 0.016236986499279737, |
| "reward_total": 0.1274184116628021, |
| "rewards/pos_1/mean": 0.12741840942762792, |
| "rewards/pos_1/std": 0.05708431825041771, |
| "rewards/pos_10/mean": 0.00020685511990450322, |
| "rewards/pos_10/std": 0.007606808561831713, |
| "rewards/pos_11/mean": -0.002086038514971733, |
| "rewards/pos_11/std": 0.007140537723898888, |
| "rewards/pos_12/mean": 0.00026568842586129906, |
| "rewards/pos_12/std": 0.003909353911876678, |
| "rewards/pos_13/mean": -0.0001367966178804636, |
| "rewards/pos_13/std": 0.002462339401245117, |
| "rewards/pos_14/mean": -0.0011998032219707965, |
| "rewards/pos_14/std": 0.002399606443941593, |
| "rewards/pos_15/mean": -0.00039062501164153216, |
| "rewards/pos_15/std": 0.0007812500232830643, |
| "rewards/pos_16/mean": -0.0005096886307001114, |
| "rewards/pos_16/std": 0.0010193773545324803, |
| "rewards/pos_17/mean": -0.00012490232475101948, |
| "rewards/pos_17/std": 0.00024980464950203897, |
| "rewards/pos_18/mean": -0.00012260881485417485, |
| "rewards/pos_18/std": 0.0002452176297083497, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.05554983280599117, |
| "rewards/pos_2/std": 0.04518025889992714, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.04318028772249818, |
| "rewards/pos_3/std": 0.044956205785274504, |
| "rewards/pos_4/mean": 0.030316960613708942, |
| "rewards/pos_4/std": 0.04388560689985752, |
| "rewards/pos_5/mean": 0.01796492321882397, |
| "rewards/pos_5/std": 0.03195359939709306, |
| "rewards/pos_6/mean": 0.008910082012880594, |
| "rewards/pos_6/std": 0.03093706537038088, |
| "rewards/pos_7/mean": 0.0019248059805249795, |
| "rewards/pos_7/std": 0.015874549094587565, |
| "rewards/pos_8/mean": 0.0006415006675524638, |
| "rewards/pos_8/std": 0.01841237680055201, |
| "rewards/pos_9/mean": -0.00019396404968574643, |
| "rewards/pos_9/std": 0.010641431924887002, |
| "step": 140 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 328.8, |
| "completions/max_terminated_length": 328.8, |
| "completions/mean_length": 269.02501068115237, |
| "completions/mean_terminated_length": 269.02501068115237, |
| "completions/min_length": 206.8, |
| "completions/min_terminated_length": 206.8, |
| "entropy": 0.23320260525991518, |
| "epoch": 0.09433962264150944, |
| "frac_reward_zero_std": 0.1666666716337204, |
| "grad_norm": 0.2597532272338867, |
| "items/detected_max": 32.0, |
| "items/detected_mean": 25.716666666666665, |
| "items/detected_min": 17.0, |
| "kl": 0.0030229106078574356, |
| "learning_rate": 1e-06, |
| "loss": 0.0171, |
| "num_tokens": 4257405.0, |
| "reward": 0.017440968845039605, |
| "reward_std": 0.014302454702556134, |
| "reward_total": 0.1550670871976763, |
| "rewards/pos_1/mean": 0.15506708160974086, |
| "rewards/pos_1/std": 0.04401266416534781, |
| "rewards/pos_10/mean": 0.0006411751499399542, |
| "rewards/pos_10/std": 0.00836768765002489, |
| "rewards/pos_11/mean": 0.0011508289724588394, |
| "rewards/pos_11/std": 0.007357292901724577, |
| "rewards/pos_12/mean": -0.0009468055446632207, |
| "rewards/pos_12/std": 0.0066732271574437615, |
| "rewards/pos_13/mean": -0.0007546751294285059, |
| "rewards/pos_13/std": 0.004247732646763325, |
| "rewards/pos_14/mean": -4.281066358089447e-05, |
| "rewards/pos_14/std": 0.0029963889624923467, |
| "rewards/pos_15/mean": 9.981024777516723e-05, |
| "rewards/pos_15/std": 0.001762120542116463, |
| "rewards/pos_16/mean": 0.00010816878639161587, |
| "rewards/pos_16/std": 0.001529983733780682, |
| "rewards/pos_17/mean": 0.00036553293466567993, |
| "rewards/pos_17/std": 0.0012306751683354377, |
| "rewards/pos_18/mean": 0.0002452176297083497, |
| "rewards/pos_18/std": 0.0014713057782500983, |
| "rewards/pos_19/mean": -0.0001205094857141394, |
| "rewards/pos_19/std": 0.00024101897142827724, |
| "rewards/pos_2/mean": 0.07839932842180133, |
| "rewards/pos_2/std": 0.03501760168001056, |
| "rewards/pos_20/mean": -1.8503717628539686e-18, |
| "rewards/pos_20/std": 2.136625223889579e-18, |
| "rewards/pos_3/mean": 0.04932655282318592, |
| "rewards/pos_3/std": 0.03529196488671005, |
| "rewards/pos_4/mean": 0.035052822227589785, |
| "rewards/pos_4/std": 0.03147873505949974, |
| "rewards/pos_5/mean": 0.014226565160788596, |
| "rewards/pos_5/std": 0.03353233290836215, |
| "rewards/pos_6/mean": 0.009943538601510227, |
| "rewards/pos_6/std": 0.02568354532122612, |
| "rewards/pos_7/mean": 0.0037480150582268833, |
| "rewards/pos_7/std": 0.01887619134504348, |
| "rewards/pos_8/mean": 0.00222595312516205, |
| "rewards/pos_8/std": 0.015316987968981266, |
| "rewards/pos_9/mean": 8.356973994523287e-05, |
| "rewards/pos_9/std": 0.010961612733080984, |
| "step": 150 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 331.5, |
| "completions/max_terminated_length": 331.5, |
| "completions/mean_length": 273.31251220703126, |
| "completions/mean_terminated_length": 273.31251220703126, |
| "completions/min_length": 217.2, |
| "completions/min_terminated_length": 217.2, |
| "entropy": 0.23213966513673465, |
| "epoch": 0.10062893081761007, |
| "frac_reward_zero_std": 0.14166666939854622, |
| "grad_norm": 0.3334786593914032, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 25.591666666666665, |
| "items/detected_min": 19.0, |
| "kl": 0.0035994505208994574, |
| "learning_rate": 1e-06, |
| "loss": 0.0189, |
| "num_tokens": 4548299.0, |
| "reward": 0.015756109496578574, |
| "reward_std": 0.017546102358028292, |
| "reward_total": 0.14928646758198738, |
| "rewards/pos_1/mean": 0.14928646497428416, |
| "rewards/pos_1/std": 0.0711632726714015, |
| "rewards/pos_10/mean": 0.002952051186002791, |
| "rewards/pos_10/std": 0.011590327834710479, |
| "rewards/pos_11/mean": 4.112366586923599e-05, |
| "rewards/pos_11/std": 0.009777330886572599, |
| "rewards/pos_12/mean": 0.0006721259327605366, |
| "rewards/pos_12/std": 0.005886415066197514, |
| "rewards/pos_13/mean": -0.0004103898769244552, |
| "rewards/pos_13/std": 0.004967003781348467, |
| "rewards/pos_14/mean": -0.00039993440732359885, |
| "rewards/pos_14/std": 0.0007998688146471977, |
| "rewards/pos_15/mean": -0.0006510416860692203, |
| "rewards/pos_15/std": 0.0013020833721384406, |
| "rewards/pos_16/mean": -0.00012742215767502785, |
| "rewards/pos_16/std": 0.0002548443386331201, |
| "rewards/pos_17/mean": -0.00012490232475101948, |
| "rewards/pos_17/std": 0.00024980464950203897, |
| "rewards/pos_18/mean": -0.00012260881485417485, |
| "rewards/pos_18/std": 0.0002452176297083497, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.06409315429627896, |
| "rewards/pos_2/std": 0.05032593030482531, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.04519490986131132, |
| "rewards/pos_3/std": 0.041902261599898336, |
| "rewards/pos_4/mean": 0.022920475325372537, |
| "rewards/pos_4/std": 0.036204461753368375, |
| "rewards/pos_5/mean": 0.013419941076426767, |
| "rewards/pos_5/std": 0.03253512904047966, |
| "rewards/pos_6/mean": 0.007387386483605951, |
| "rewards/pos_6/std": 0.025701201800256968, |
| "rewards/pos_7/mean": 0.005470841613714583, |
| "rewards/pos_7/std": 0.023291338980197907, |
| "rewards/pos_8/mean": 0.0032636421092320234, |
| "rewards/pos_8/std": 0.019500295352190732, |
| "rewards/pos_9/mean": 0.002256354584824294, |
| "rewards/pos_9/std": 0.01522524117026478, |
| "step": 160 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 336.9, |
| "completions/max_terminated_length": 336.9, |
| "completions/mean_length": 270.0791778564453, |
| "completions/mean_terminated_length": 270.0791778564453, |
| "completions/min_length": 207.9, |
| "completions/min_terminated_length": 207.9, |
| "entropy": 0.22610665323833626, |
| "epoch": 0.1069182389937107, |
| "frac_reward_zero_std": 0.15833333656191825, |
| "grad_norm": 0.2289900928735733, |
| "items/detected_max": 31.0, |
| "items/detected_mean": 25.416666666666668, |
| "items/detected_min": 18.0, |
| "kl": 0.003391667290755625, |
| "learning_rate": 1e-06, |
| "loss": 0.0179, |
| "num_tokens": 4831193.0, |
| "reward": 0.0186764198821038, |
| "reward_std": 0.01692580496892333, |
| "reward_total": 0.1566243339329958, |
| "rewards/pos_1/mean": 0.15662432909011842, |
| "rewards/pos_1/std": 0.05228233584202826, |
| "rewards/pos_10/mean": 0.0012737916316837073, |
| "rewards/pos_10/std": 0.009292289335280656, |
| "rewards/pos_11/mean": 0.0017623581690713764, |
| "rewards/pos_11/std": 0.006201159209012985, |
| "rewards/pos_12/mean": -0.0007756583509035408, |
| "rewards/pos_12/std": 0.00754371783696115, |
| "rewards/pos_13/mean": 0.000780485977884382, |
| "rewards/pos_13/std": 0.0026553449453786014, |
| "rewards/pos_14/mean": 0.0010540792252868413, |
| "rewards/pos_14/std": 0.0021081584505736827, |
| "rewards/pos_15/mean": -0.0006510416627861559, |
| "rewards/pos_15/std": 0.0031652866629883645, |
| "rewards/pos_16/mean": -0.00012742215767502877, |
| "rewards/pos_16/std": 0.0002548443386331219, |
| "rewards/pos_17/mean": -0.0003747069742530584, |
| "rewards/pos_17/std": 0.0007494139485061168, |
| "rewards/pos_18/mean": -0.0002452176297083506, |
| "rewards/pos_18/std": 0.0004904352594167012, |
| "rewards/pos_19/mean": -0.00012050948571413849, |
| "rewards/pos_19/std": 0.00024101897142827697, |
| "rewards/pos_2/mean": 0.09396323375403881, |
| "rewards/pos_2/std": 0.04918128461576998, |
| "rewards/pos_20/mean": -9.251858814269843e-19, |
| "rewards/pos_20/std": 1.8503717628539686e-18, |
| "rewards/pos_3/mean": 0.05363719742745161, |
| "rewards/pos_3/std": 0.038127432996407154, |
| "rewards/pos_4/mean": 0.028266194695606827, |
| "rewards/pos_4/std": 0.043761784210801125, |
| "rewards/pos_5/mean": 0.01885695867240429, |
| "rewards/pos_5/std": 0.04044422851875425, |
| "rewards/pos_6/mean": 0.010655944173777243, |
| "rewards/pos_6/std": 0.02998672341927886, |
| "rewards/pos_7/mean": 0.006501190612713497, |
| "rewards/pos_7/std": 0.022654754761606454, |
| "rewards/pos_8/mean": 0.002201350941322744, |
| "rewards/pos_8/std": 0.01561037302017212, |
| "rewards/pos_9/mean": 0.0002458267903421074, |
| "rewards/pos_9/std": 0.013765505608171224, |
| "step": 170 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 337.3, |
| "completions/max_terminated_length": 337.3, |
| "completions/mean_length": 268.4208419799805, |
| "completions/mean_terminated_length": 268.4208419799805, |
| "completions/min_length": 208.5, |
| "completions/min_terminated_length": 208.5, |
| "entropy": 0.22181829875335096, |
| "epoch": 0.11320754716981132, |
| "frac_reward_zero_std": 0.16666667088866233, |
| "grad_norm": 0.2745143473148346, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 25.6, |
| "items/detected_min": 19.0, |
| "kl": 0.00353732905665917, |
| "learning_rate": 1e-06, |
| "loss": 0.0165, |
| "num_tokens": 5095139.0, |
| "reward": 0.01957973469980061, |
| "reward_std": 0.019774758955463767, |
| "reward_total": 0.17836526185274124, |
| "rewards/pos_1/mean": 0.17836525700986386, |
| "rewards/pos_1/std": 0.0697622362524271, |
| "rewards/pos_10/mean": 0.0003152468823827803, |
| "rewards/pos_10/std": 0.008289119158871473, |
| "rewards/pos_11/mean": -0.0011112218722701072, |
| "rewards/pos_11/std": 0.005890296772122383, |
| "rewards/pos_12/mean": -0.0004802196519449353, |
| "rewards/pos_12/std": 0.004994568601250649, |
| "rewards/pos_13/mean": 0.0005208333488553763, |
| "rewards/pos_13/std": 0.003230412770062685, |
| "rewards/pos_14/mean": 0.00012089894153177738, |
| "rewards/pos_14/std": 0.0018415355123579502, |
| "rewards/pos_15/mean": -0.0009114583372138441, |
| "rewards/pos_15/std": 0.0033854166977107525, |
| "rewards/pos_16/mean": 0.0, |
| "rewards/pos_16/std": 0.0, |
| "rewards/pos_17/mean": 0.0, |
| "rewards/pos_17/std": 0.0, |
| "rewards/pos_18/mean": -0.00012260881485417485, |
| "rewards/pos_18/std": 0.0002452176297083497, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.08470222726464272, |
| "rewards/pos_2/std": 0.05917537584900856, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.0567516652867198, |
| "rewards/pos_3/std": 0.057415686547756195, |
| "rewards/pos_4/mean": 0.033095260383561254, |
| "rewards/pos_4/std": 0.050696741044521335, |
| "rewards/pos_5/mean": 0.020724335825070737, |
| "rewards/pos_5/std": 0.040101373288780454, |
| "rewards/pos_6/mean": 0.016150066489353777, |
| "rewards/pos_6/std": 0.0350839720107615, |
| "rewards/pos_7/mean": 0.005806612188462168, |
| "rewards/pos_7/std": 0.0280417391564697, |
| "rewards/pos_8/mean": -0.0006850265664979816, |
| "rewards/pos_8/std": 0.015849439846351742, |
| "rewards/pos_9/mean": -0.0016472050338052213, |
| "rewards/pos_9/std": 0.011492027598433197, |
| "step": 180 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 320.5, |
| "completions/max_terminated_length": 320.5, |
| "completions/mean_length": 266.6770935058594, |
| "completions/mean_terminated_length": 266.6770935058594, |
| "completions/min_length": 213.0, |
| "completions/min_terminated_length": 213.0, |
| "entropy": 0.2238691563097139, |
| "epoch": 0.11949685534591195, |
| "frac_reward_zero_std": 0.1333333358168602, |
| "grad_norm": 0.27348336577415466, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 25.604166666666668, |
| "items/detected_min": 20.0, |
| "kl": 0.00398559235521437, |
| "learning_rate": 1e-06, |
| "loss": 0.0177, |
| "num_tokens": 5367136.0, |
| "reward": 0.019349526677979158, |
| "reward_std": 0.020760491117835046, |
| "reward_total": 0.14788751676678658, |
| "rewards/pos_1/mean": 0.1478875134140253, |
| "rewards/pos_1/std": 0.07359497845172883, |
| "rewards/pos_10/mean": -0.00036135842092335226, |
| "rewards/pos_10/std": 0.009093766659498214, |
| "rewards/pos_11/mean": 0.00014302353374660016, |
| "rewards/pos_11/std": 0.006097358372062445, |
| "rewards/pos_12/mean": 0.0004698589909821749, |
| "rewards/pos_12/std": 0.005443687411025166, |
| "rewards/pos_13/mean": -0.00047190774930641057, |
| "rewards/pos_13/std": 0.004843967990018428, |
| "rewards/pos_14/mean": 0.0004996092990040779, |
| "rewards/pos_14/std": 0.003132202010601759, |
| "rewards/pos_15/mean": -0.0005420573987066745, |
| "rewards/pos_15/std": 0.003082551993429661, |
| "rewards/pos_16/mean": 0.00024476498365402224, |
| "rewards/pos_16/std": 0.001508907275274396, |
| "rewards/pos_17/mean": 0.00024980464950203897, |
| "rewards/pos_17/std": 0.0014988278970122337, |
| "rewards/pos_18/mean": 0.0, |
| "rewards/pos_18/std": 0.0, |
| "rewards/pos_19/mean": -0.00012050948571413755, |
| "rewards/pos_19/std": 0.0002410189714282751, |
| "rewards/pos_2/mean": 0.09178031878545881, |
| "rewards/pos_2/std": 0.05649234391748905, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.0645428148796782, |
| "rewards/pos_3/std": 0.04721329659223557, |
| "rewards/pos_4/mean": 0.034123249468393624, |
| "rewards/pos_4/std": 0.050931732170283794, |
| "rewards/pos_5/mean": 0.020877907634712756, |
| "rewards/pos_5/std": 0.04322007759474218, |
| "rewards/pos_6/mean": 0.014087293948978185, |
| "rewards/pos_6/std": 0.03900466645136476, |
| "rewards/pos_7/mean": 0.008370040811132639, |
| "rewards/pos_7/std": 0.03408770642708987, |
| "rewards/pos_8/mean": 0.004736725857947021, |
| "rewards/pos_8/std": 0.02168282950296998, |
| "rewards/pos_9/mean": 0.00047341859608422964, |
| "rewards/pos_9/std": 0.014039887744002045, |
| "step": 190 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 326.2, |
| "completions/max_terminated_length": 326.2, |
| "completions/mean_length": 265.7000045776367, |
| "completions/mean_terminated_length": 265.7000045776367, |
| "completions/min_length": 209.1, |
| "completions/min_terminated_length": 209.1, |
| "entropy": 0.22170269154012204, |
| "epoch": 0.12578616352201258, |
| "frac_reward_zero_std": 0.2000000074505806, |
| "grad_norm": 0.24164925515651703, |
| "items/detected_max": 31.0, |
| "items/detected_mean": 25.566666666666666, |
| "items/detected_min": 18.0, |
| "kl": 0.004068990563973784, |
| "learning_rate": 1e-06, |
| "loss": 0.0167, |
| "num_tokens": 5619256.0, |
| "reward": 0.017621663835598156, |
| "reward_std": 0.018194446712732314, |
| "reward_total": 0.13456704604905098, |
| "rewards/pos_1/mean": 0.1345670434413478, |
| "rewards/pos_1/std": 0.046814880799502134, |
| "rewards/pos_10/mean": 0.001287172012962401, |
| "rewards/pos_10/std": 0.010988512635231018, |
| "rewards/pos_11/mean": 0.0032654690090566873, |
| "rewards/pos_11/std": 0.007643049582839012, |
| "rewards/pos_12/mean": 0.0005549672991037369, |
| "rewards/pos_12/std": 0.0068269922398030754, |
| "rewards/pos_13/mean": 0.0007194308214820922, |
| "rewards/pos_13/std": 0.004568580654449761, |
| "rewards/pos_14/mean": 0.000740341772325337, |
| "rewards/pos_14/std": 0.004380421992391348, |
| "rewards/pos_15/mean": 0.000758960610255599, |
| "rewards/pos_15/std": 0.004496230185031891, |
| "rewards/pos_16/mean": 0.0006371107883751392, |
| "rewards/pos_16/std": 0.002659227070398629, |
| "rewards/pos_17/mean": -0.00012490232475101948, |
| "rewards/pos_17/std": 0.00024980464950203897, |
| "rewards/pos_18/mean": 0.0, |
| "rewards/pos_18/std": 0.0, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.07656875005923211, |
| "rewards/pos_2/std": 0.05108541529625654, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.04204599694348872, |
| "rewards/pos_3/std": 0.04441921235993505, |
| "rewards/pos_4/mean": 0.03253965424373746, |
| "rewards/pos_4/std": 0.043719809129834176, |
| "rewards/pos_5/mean": 0.02265897230245173, |
| "rewards/pos_5/std": 0.039311152510344984, |
| "rewards/pos_6/mean": 0.016424931492656468, |
| "rewards/pos_6/std": 0.03444936061277985, |
| "rewards/pos_7/mean": 0.011618296918459236, |
| "rewards/pos_7/std": 0.026631731120869518, |
| "rewards/pos_8/mean": 0.005892342084553093, |
| "rewards/pos_8/std": 0.01966498149558902, |
| "rewards/pos_9/mean": 0.002278735424624756, |
| "rewards/pos_9/std": 0.015979555086232722, |
| "step": 200 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 328.1, |
| "completions/max_terminated_length": 328.1, |
| "completions/mean_length": 265.3000061035156, |
| "completions/mean_terminated_length": 265.3000061035156, |
| "completions/min_length": 211.1, |
| "completions/min_terminated_length": 211.1, |
| "entropy": 0.22155435737222434, |
| "epoch": 0.1320754716981132, |
| "frac_reward_zero_std": 0.20000000149011612, |
| "grad_norm": 0.2742804288864136, |
| "items/detected_max": 31.0, |
| "items/detected_mean": 25.2, |
| "items/detected_min": 17.0, |
| "kl": 0.004075667206052458, |
| "learning_rate": 1e-06, |
| "loss": 0.0175, |
| "num_tokens": 5876148.0, |
| "reward": 0.0260989835485816, |
| "reward_std": 0.02093745619058609, |
| "reward_total": 0.2333697572350502, |
| "rewards/pos_1/mean": 0.23336975127458573, |
| "rewards/pos_1/std": 0.08852581698447466, |
| "rewards/pos_10/mean": 0.0008219308452680707, |
| "rewards/pos_10/std": 0.00651337243616581, |
| "rewards/pos_11/mean": 0.00041771340183913706, |
| "rewards/pos_11/std": 0.004837532434612513, |
| "rewards/pos_12/mean": -0.0018297375878319143, |
| "rewards/pos_12/std": 0.005276431818492711, |
| "rewards/pos_13/mean": -0.0008207797771319747, |
| "rewards/pos_13/std": 0.001410291320644319, |
| "rewards/pos_14/mean": -0.0007998688146471977, |
| "rewards/pos_14/std": 0.0015997376292943954, |
| "rewards/pos_15/mean": -0.0006510416860692203, |
| "rewards/pos_15/std": 0.001081953290849924, |
| "rewards/pos_16/mean": -0.0007645329460501671, |
| "rewards/pos_16/std": 0.0013136462308466434, |
| "rewards/pos_17/mean": -0.00024980464950203897, |
| "rewards/pos_17/std": 0.0004996092990040779, |
| "rewards/pos_18/mean": -0.00012260881485417485, |
| "rewards/pos_18/std": 0.0002452176297083497, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.11015560580417513, |
| "rewards/pos_2/std": 0.060006641410291196, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.06867622348072473, |
| "rewards/pos_3/std": 0.06746885739266872, |
| "rewards/pos_4/mean": 0.04123723595403135, |
| "rewards/pos_4/std": 0.04667948558926582, |
| "rewards/pos_5/mean": 0.030098050978267565, |
| "rewards/pos_5/std": 0.041850338224321604, |
| "rewards/pos_6/mean": 0.023275516787543894, |
| "rewards/pos_6/std": 0.0351828612620011, |
| "rewards/pos_7/mean": 0.011897218006197363, |
| "rewards/pos_7/std": 0.025269487011246382, |
| "rewards/pos_8/mean": 0.0065091788419522345, |
| "rewards/pos_8/std": 0.018076164787635206, |
| "rewards/pos_9/mean": 0.0007596121751703322, |
| "rewards/pos_9/std": 0.012911638477817178, |
| "step": 210 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 322.8, |
| "completions/max_terminated_length": 322.8, |
| "completions/mean_length": 260.1895919799805, |
| "completions/mean_terminated_length": 260.1895919799805, |
| "completions/min_length": 204.2, |
| "completions/min_terminated_length": 204.2, |
| "entropy": 0.2140471934651335, |
| "epoch": 0.13836477987421383, |
| "frac_reward_zero_std": 0.20000000447034835, |
| "grad_norm": 0.2579440176486969, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 24.854166666666668, |
| "items/detected_min": 18.0, |
| "kl": 0.005129828681189489, |
| "learning_rate": 1e-06, |
| "loss": 0.0144, |
| "num_tokens": 6144383.0, |
| "reward": 0.019254540232941507, |
| "reward_std": 0.01791708227247, |
| "reward_total": 0.17324519529938698, |
| "rewards/pos_1/mean": 0.1732451893389225, |
| "rewards/pos_1/std": 0.06843348871916533, |
| "rewards/pos_10/mean": 0.0009092353051528334, |
| "rewards/pos_10/std": 0.009819345409050583, |
| "rewards/pos_11/mean": 0.0003755505429580808, |
| "rewards/pos_11/std": 0.006670900527387858, |
| "rewards/pos_12/mean": 9.85862105153501e-05, |
| "rewards/pos_12/std": 0.0018861609743908048, |
| "rewards/pos_13/mean": -2.6353169232606888e-05, |
| "rewards/pos_13/std": 0.002136039733886719, |
| "rewards/pos_14/mean": 0.00012089894153177738, |
| "rewards/pos_14/std": 0.0018415355123579502, |
| "rewards/pos_15/mean": 0.00026041667442768814, |
| "rewards/pos_15/std": 0.0015625000465661286, |
| "rewards/pos_16/mean": -0.00012742215767502785, |
| "rewards/pos_16/std": 0.0002548443386331201, |
| "rewards/pos_17/mean": 0.0, |
| "rewards/pos_17/std": 0.0, |
| "rewards/pos_18/mean": -0.00012260881485417485, |
| "rewards/pos_18/std": 0.0002452176297083497, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.08983773421496152, |
| "rewards/pos_2/std": 0.061134057305753234, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.051768475910648704, |
| "rewards/pos_3/std": 0.048210570914670826, |
| "rewards/pos_4/mean": 0.03269893038086593, |
| "rewards/pos_4/std": 0.04562271069735289, |
| "rewards/pos_5/mean": 0.013085525191854686, |
| "rewards/pos_5/std": 0.03348905979655683, |
| "rewards/pos_6/mean": 0.009051869111135601, |
| "rewards/pos_6/std": 0.023240961600095034, |
| "rewards/pos_7/mean": 0.007865509623661638, |
| "rewards/pos_7/std": 0.02395617887377739, |
| "rewards/pos_8/mean": 0.00524594159796834, |
| "rewards/pos_8/std": 0.01783889103680849, |
| "rewards/pos_9/mean": 0.0008032934507355094, |
| "rewards/pos_9/std": 0.01199915090110153, |
| "step": 220 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 316.4, |
| "completions/max_terminated_length": 316.4, |
| "completions/mean_length": 256.67084045410155, |
| "completions/mean_terminated_length": 256.67084045410155, |
| "completions/min_length": 199.4, |
| "completions/min_terminated_length": 199.4, |
| "entropy": 0.21115152711669605, |
| "epoch": 0.14465408805031446, |
| "frac_reward_zero_std": 0.2250000074505806, |
| "grad_norm": 0.21807080507278442, |
| "items/detected_max": 33.0, |
| "items/detected_mean": 24.458333333333332, |
| "items/detected_min": 18.0, |
| "kl": 0.00473162657605523, |
| "learning_rate": 1e-06, |
| "loss": 0.0154, |
| "num_tokens": 6408505.0, |
| "reward": 0.013173915771767497, |
| "reward_std": 0.017613324616104365, |
| "reward_total": 0.11636085771024227, |
| "rewards/pos_1/mean": 0.116360854357481, |
| "rewards/pos_1/std": 0.06958029698580503, |
| "rewards/pos_10/mean": -0.0015289061702787876, |
| "rewards/pos_10/std": 0.006670845299959182, |
| "rewards/pos_11/mean": -0.0004223058931529522, |
| "rewards/pos_11/std": 0.004721084609627724, |
| "rewards/pos_12/mean": 0.00032677670242264867, |
| "rewards/pos_12/std": 0.0034685343271121383, |
| "rewards/pos_13/mean": -6.385107990354299e-05, |
| "rewards/pos_13/std": 0.004249790078029036, |
| "rewards/pos_14/mean": 0.00063058752566576, |
| "rewards/pos_14/std": 0.0028609128668904303, |
| "rewards/pos_15/mean": 0.0003794802469201386, |
| "rewards/pos_15/std": 0.0033631272381171584, |
| "rewards/pos_16/mean": 0.0005096886307001114, |
| "rewards/pos_16/std": 0.0010193773545324803, |
| "rewards/pos_17/mean": 0.0, |
| "rewards/pos_17/std": 0.0, |
| "rewards/pos_18/mean": 0.0, |
| "rewards/pos_18/std": 0.0, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.05666366685181856, |
| "rewards/pos_2/std": 0.05859119575470686, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.03987346403300762, |
| "rewards/pos_3/std": 0.05046229064464569, |
| "rewards/pos_4/mean": 0.02973797037266195, |
| "rewards/pos_4/std": 0.04398397598415613, |
| "rewards/pos_5/mean": 0.011401956726331264, |
| "rewards/pos_5/std": 0.033489659521728755, |
| "rewards/pos_6/mean": 0.007691465364769101, |
| "rewards/pos_6/std": 0.02227671444416046, |
| "rewards/pos_7/mean": 0.001962298701982945, |
| "rewards/pos_7/std": 0.021468895603902638, |
| "rewards/pos_8/mean": 0.0008060719235800206, |
| "rewards/pos_8/std": 0.016375967115163804, |
| "rewards/pos_9/mean": -0.0008509156294167041, |
| "rewards/pos_9/std": 0.009683813899755478, |
| "step": 230 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 344.9, |
| "completions/max_terminated_length": 344.9, |
| "completions/mean_length": 262.8958404541016, |
| "completions/mean_terminated_length": 262.8958404541016, |
| "completions/min_length": 202.1, |
| "completions/min_terminated_length": 202.1, |
| "entropy": 0.22451901606594524, |
| "epoch": 0.1509433962264151, |
| "frac_reward_zero_std": 0.20833333879709243, |
| "grad_norm": 0.3170786201953888, |
| "items/detected_max": 41.0, |
| "items/detected_mean": 25.341666666666665, |
| "items/detected_min": 19.0, |
| "kl": 0.005417148351746922, |
| "learning_rate": 1e-06, |
| "loss": 0.016, |
| "num_tokens": 6710187.0, |
| "reward": 0.026339445263147354, |
| "reward_std": 0.019665919244289398, |
| "reward_total": 0.1830954909324646, |
| "rewards/pos_1/mean": 0.1830954894423485, |
| "rewards/pos_1/std": 0.05954671716317535, |
| "rewards/pos_10/mean": 0.002383789012674242, |
| "rewards/pos_10/std": 0.0101193432463333, |
| "rewards/pos_11/mean": -0.0002576601458713412, |
| "rewards/pos_11/std": 0.007039384357631207, |
| "rewards/pos_12/mean": 0.0005050237057730555, |
| "rewards/pos_12/std": 0.00495102065615356, |
| "rewards/pos_13/mean": 0.0006576299783773683, |
| "rewards/pos_13/std": 0.0029568195110186934, |
| "rewards/pos_14/mean": 0.00012089891824870931, |
| "rewards/pos_14/std": 0.0016913962550461291, |
| "rewards/pos_15/mean": 0.00013020832557230816, |
| "rewards/pos_15/std": 0.0018229166511446238, |
| "rewards/pos_16/mean": -0.00038226647302508354, |
| "rewards/pos_16/std": 0.0007645330158993601, |
| "rewards/pos_17/mean": -0.0003747069742530621, |
| "rewards/pos_17/std": 0.0007494139485061168, |
| "rewards/pos_18/mean": -0.00036782644456252825, |
| "rewards/pos_18/std": 0.0007356528891250491, |
| "rewards/pos_19/mean": -0.0002410189714282751, |
| "rewards/pos_19/std": 0.0004820379428565502, |
| "rewards/pos_2/mean": 0.11684439983218908, |
| "rewards/pos_2/std": 0.05454435236752033, |
| "rewards/pos_20/mean": -3.700743525707937e-18, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.07016311129555106, |
| "rewards/pos_3/std": 0.05461613442748785, |
| "rewards/pos_4/mean": 0.05105745694600046, |
| "rewards/pos_4/std": 0.049194585625082256, |
| "rewards/pos_5/mean": 0.04083440322428942, |
| "rewards/pos_5/std": 0.04271569908596575, |
| "rewards/pos_6/mean": 0.02642920259386301, |
| "rewards/pos_6/std": 0.033980270475149156, |
| "rewards/pos_7/mean": 0.01962945028208196, |
| "rewards/pos_7/std": 0.028517392510548235, |
| "rewards/pos_8/mean": 0.010012811232202998, |
| "rewards/pos_8/std": 0.022616432514041662, |
| "rewards/pos_9/mean": 0.006548473122529685, |
| "rewards/pos_9/std": 0.016274257795885205, |
| "step": 240 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 335.2, |
| "completions/max_terminated_length": 335.2, |
| "completions/mean_length": 265.06250915527346, |
| "completions/mean_terminated_length": 265.06250915527346, |
| "completions/min_length": 206.5, |
| "completions/min_terminated_length": 206.5, |
| "entropy": 0.23201529448851943, |
| "epoch": 0.15723270440251572, |
| "frac_reward_zero_std": 0.15833333879709244, |
| "grad_norm": 0.18924497067928314, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 25.55, |
| "items/detected_min": 16.0, |
| "kl": 0.005678520092139176, |
| "learning_rate": 1e-06, |
| "loss": 0.0164, |
| "num_tokens": 6991369.0, |
| "reward": 0.025188619922846555, |
| "reward_std": 0.02295760540291667, |
| "reward_total": 0.189043590426445, |
| "rewards/pos_1/mean": 0.18904358446598052, |
| "rewards/pos_1/std": 0.07828422243474051, |
| "rewards/pos_10/mean": 0.005751247936859727, |
| "rewards/pos_10/std": 0.014085922576487064, |
| "rewards/pos_11/mean": 0.003690930129960179, |
| "rewards/pos_11/std": 0.01120929727330804, |
| "rewards/pos_12/mean": 0.0018294563284143805, |
| "rewards/pos_12/std": 0.006877762172371149, |
| "rewards/pos_13/mean": 0.0014269712381064891, |
| "rewards/pos_13/std": 0.005435216333717107, |
| "rewards/pos_14/mean": 0.0004972761031240225, |
| "rewards/pos_14/std": 0.005260519497096538, |
| "rewards/pos_15/mean": 0.0010305219795554868, |
| "rewards/pos_15/std": 0.0020610440522432346, |
| "rewards/pos_16/mean": 0.00012742215767502693, |
| "rewards/pos_16/std": 0.0017839103471487777, |
| "rewards/pos_17/mean": -0.0001249023247510204, |
| "rewards/pos_17/std": 0.0002498046495020408, |
| "rewards/pos_18/mean": -0.0001226088148541758, |
| "rewards/pos_18/std": 0.0002452176297083516, |
| "rewards/pos_19/mean": -0.00024101897142827602, |
| "rewards/pos_19/std": 0.0002783047268167157, |
| "rewards/pos_2/mean": 0.10324889775365591, |
| "rewards/pos_2/std": 0.06747091393917799, |
| "rewards/pos_20/mean": -9.251858814269843e-19, |
| "rewards/pos_20/std": 1.8503717628539686e-18, |
| "rewards/pos_3/mean": 0.06996003771200776, |
| "rewards/pos_3/std": 0.05566513775847852, |
| "rewards/pos_4/mean": 0.0460869993083179, |
| "rewards/pos_4/std": 0.05539451166987419, |
| "rewards/pos_5/mean": 0.02949588217306882, |
| "rewards/pos_5/std": 0.04718504603952169, |
| "rewards/pos_6/mean": 0.020764206012245268, |
| "rewards/pos_6/std": 0.03319612480700016, |
| "rewards/pos_7/mean": 0.012898960616439581, |
| "rewards/pos_7/std": 0.031006166944280267, |
| "rewards/pos_8/mean": 0.011339673865586519, |
| "rewards/pos_8/std": 0.025132232066243886, |
| "rewards/pos_9/mean": 0.007068848155904561, |
| "rewards/pos_9/std": 0.01833072875160724, |
| "step": 250 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 325.5, |
| "completions/max_terminated_length": 325.5, |
| "completions/mean_length": 264.6687637329102, |
| "completions/mean_terminated_length": 264.6687637329102, |
| "completions/min_length": 212.8, |
| "completions/min_terminated_length": 212.8, |
| "entropy": 0.21104721585288644, |
| "epoch": 0.16352201257861634, |
| "frac_reward_zero_std": 0.12500000298023223, |
| "grad_norm": 0.1994742453098297, |
| "items/detected_max": 31.0, |
| "items/detected_mean": 25.033333333333335, |
| "items/detected_min": 19.0, |
| "kl": 0.005256364249119846, |
| "learning_rate": 1e-06, |
| "loss": 0.0156, |
| "num_tokens": 7230110.0, |
| "reward": 0.01918751737102866, |
| "reward_std": 0.018093076907098292, |
| "reward_total": 0.17405339246615767, |
| "rewards/pos_1/mean": 0.17405338347889482, |
| "rewards/pos_1/std": 0.06521443482488394, |
| "rewards/pos_10/mean": -0.0009455020190216601, |
| "rewards/pos_10/std": 0.005048538488335907, |
| "rewards/pos_11/mean": -0.0004358483478426933, |
| "rewards/pos_11/std": 0.004538286104798317, |
| "rewards/pos_12/mean": -0.0009852433227933942, |
| "rewards/pos_12/std": 0.0019704866455867885, |
| "rewards/pos_13/mean": -0.0004103898885659873, |
| "rewards/pos_13/std": 0.0008207797771319747, |
| "rewards/pos_14/mean": -0.0009331803303211927, |
| "rewards/pos_14/std": 0.0016409843228757381, |
| "rewards/pos_15/mean": -0.00026041667442768814, |
| "rewards/pos_15/std": 0.0005208333488553763, |
| "rewards/pos_16/mean": -0.0005096886307001114, |
| "rewards/pos_16/std": 0.0010193773545324803, |
| "rewards/pos_17/mean": 0.0, |
| "rewards/pos_17/std": 0.0, |
| "rewards/pos_18/mean": -0.00012260881485417485, |
| "rewards/pos_18/std": 0.0002452176297083497, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.09146424243226647, |
| "rewards/pos_2/std": 0.05889338552951813, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.049712089775130155, |
| "rewards/pos_3/std": 0.0506577804684639, |
| "rewards/pos_4/mean": 0.03075085999444127, |
| "rewards/pos_4/std": 0.04471208192408085, |
| "rewards/pos_5/mean": 0.017505517741665245, |
| "rewards/pos_5/std": 0.03927737530320883, |
| "rewards/pos_6/mean": 0.011149797862162813, |
| "rewards/pos_6/std": 0.03163430448621511, |
| "rewards/pos_7/mean": 0.007539704500231892, |
| "rewards/pos_7/std": 0.02429402128327638, |
| "rewards/pos_8/mean": 0.005642501241527498, |
| "rewards/pos_8/std": 0.019538127072155475, |
| "rewards/pos_9/mean": 0.0005351163446903229, |
| "rewards/pos_9/std": 0.011835501319728792, |
| "step": 260 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 321.6, |
| "completions/max_terminated_length": 321.6, |
| "completions/mean_length": 266.3895889282227, |
| "completions/mean_terminated_length": 266.3895889282227, |
| "completions/min_length": 212.2, |
| "completions/min_terminated_length": 212.2, |
| "entropy": 0.22741040562589962, |
| "epoch": 0.16981132075471697, |
| "frac_reward_zero_std": 0.19166667237877846, |
| "grad_norm": 0.11161798983812332, |
| "items/detected_max": 31.0, |
| "items/detected_mean": 25.679166666666667, |
| "items/detected_min": 19.0, |
| "kl": 0.006663059488103803, |
| "learning_rate": 1e-06, |
| "loss": 0.0178, |
| "num_tokens": 7504485.0, |
| "reward": 0.02319504979532212, |
| "reward_std": 0.021584799420088528, |
| "reward_total": 0.2078800647519529, |
| "rewards/pos_1/mean": 0.20788006028160452, |
| "rewards/pos_1/std": 0.08386271893978119, |
| "rewards/pos_10/mean": 0.0002914302225690335, |
| "rewards/pos_10/std": 0.008567684050649404, |
| "rewards/pos_11/mean": 0.0008116275537759065, |
| "rewards/pos_11/std": 0.007888751104474068, |
| "rewards/pos_12/mean": 0.0014335623593069613, |
| "rewards/pos_12/std": 0.006815162324346602, |
| "rewards/pos_13/mean": 0.0010429344838485122, |
| "rewards/pos_13/std": 0.005427724355831743, |
| "rewards/pos_14/mean": 0.0003763771615922451, |
| "rewards/pos_14/std": 0.002538005914539099, |
| "rewards/pos_15/mean": 0.0003794802934862673, |
| "rewards/pos_15/std": 0.0012797940289601684, |
| "rewards/pos_16/mean": 0.00038226647302508354, |
| "rewards/pos_16/std": 0.0012742216931656003, |
| "rewards/pos_17/mean": 0.0, |
| "rewards/pos_17/std": 0.0, |
| "rewards/pos_18/mean": 0.0, |
| "rewards/pos_18/std": 0.0, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.08499452217947692, |
| "rewards/pos_2/std": 0.06867573540657759, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.04896511500701308, |
| "rewards/pos_3/std": 0.05700467079877854, |
| "rewards/pos_4/mean": 0.03908236059360206, |
| "rewards/pos_4/std": 0.04660152085125446, |
| "rewards/pos_5/mean": 0.02731589375374218, |
| "rewards/pos_5/std": 0.03737078960984945, |
| "rewards/pos_6/mean": 0.02225884739673347, |
| "rewards/pos_6/std": 0.03675803560763598, |
| "rewards/pos_7/mean": 0.01433403476839885, |
| "rewards/pos_7/std": 0.0255989148048684, |
| "rewards/pos_8/mean": 0.008673226041719318, |
| "rewards/pos_8/std": 0.024009060859680176, |
| "rewards/pos_9/mean": 0.005679246410727501, |
| "rewards/pos_9/std": 0.018023168295621873, |
| "step": 270 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 321.7, |
| "completions/max_terminated_length": 321.7, |
| "completions/mean_length": 260.93334045410154, |
| "completions/mean_terminated_length": 260.93334045410154, |
| "completions/min_length": 206.3, |
| "completions/min_terminated_length": 206.3, |
| "entropy": 0.22694507626195748, |
| "epoch": 0.1761006289308176, |
| "frac_reward_zero_std": 0.06666666865348816, |
| "grad_norm": 0.231883704662323, |
| "items/detected_max": 31.0, |
| "items/detected_mean": 24.904166666666665, |
| "items/detected_min": 19.0, |
| "kl": 0.00740518799527005, |
| "learning_rate": 1e-06, |
| "loss": 0.0152, |
| "num_tokens": 7805425.0, |
| "reward": 0.02050045975483954, |
| "reward_std": 0.020080117974430323, |
| "reward_total": 0.1723884368315339, |
| "rewards/pos_1/mean": 0.17238843599334358, |
| "rewards/pos_1/std": 0.08533855974674225, |
| "rewards/pos_10/mean": 0.0010272695566527546, |
| "rewards/pos_10/std": 0.011435505677945912, |
| "rewards/pos_11/mean": 0.00032194284722208977, |
| "rewards/pos_11/std": 0.0058740658685565, |
| "rewards/pos_12/mean": 0.0010664916946552694, |
| "rewards/pos_12/std": 0.004227267880924046, |
| "rewards/pos_13/mean": 0.0005193052464164793, |
| "rewards/pos_13/std": 0.0032273565186187623, |
| "rewards/pos_14/mean": 0.0005332458764314652, |
| "rewards/pos_14/std": 0.0031994753517210484, |
| "rewards/pos_15/mean": -0.0005208333488553763, |
| "rewards/pos_15/std": 0.0010416666977107526, |
| "rewards/pos_16/mean": -0.00012742215767502785, |
| "rewards/pos_16/std": 0.0002548443386331201, |
| "rewards/pos_17/mean": -0.00012490232475101948, |
| "rewards/pos_17/std": 0.00024980464950203897, |
| "rewards/pos_18/mean": -0.0002452176297083497, |
| "rewards/pos_18/std": 0.0004904352594166994, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.09616097155958414, |
| "rewards/pos_2/std": 0.05851127915084362, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.05270993052981794, |
| "rewards/pos_3/std": 0.05299665480852127, |
| "rewards/pos_4/mean": 0.02972645848058164, |
| "rewards/pos_4/std": 0.04188747890293598, |
| "rewards/pos_5/mean": 0.023145902354735882, |
| "rewards/pos_5/std": 0.0399086520075798, |
| "rewards/pos_6/mean": 0.014170866855420173, |
| "rewards/pos_6/std": 0.02930837543681264, |
| "rewards/pos_7/mean": 0.008681701379828155, |
| "rewards/pos_7/std": 0.022060082969255747, |
| "rewards/pos_8/mean": 0.006511726812459528, |
| "rewards/pos_8/std": 0.02124700667336583, |
| "rewards/pos_9/mean": 0.004063288937322796, |
| "rewards/pos_9/std": 0.02034384049475193, |
| "step": 280 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 318.9, |
| "completions/max_terminated_length": 318.9, |
| "completions/mean_length": 259.00834503173826, |
| "completions/mean_terminated_length": 259.00834503173826, |
| "completions/min_length": 205.2, |
| "completions/min_terminated_length": 205.2, |
| "entropy": 0.21961077268545826, |
| "epoch": 0.18238993710691823, |
| "frac_reward_zero_std": 0.2250000037252903, |
| "grad_norm": 0.3105533719062805, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 24.458333333333332, |
| "items/detected_min": 18.0, |
| "kl": 0.005909793743679378, |
| "learning_rate": 1e-06, |
| "loss": 0.0153, |
| "num_tokens": 8051641.0, |
| "reward": 0.01701394822448492, |
| "reward_std": 0.01878683748655021, |
| "reward_total": 0.12582458220422268, |
| "rewards/pos_1/mean": 0.12582458183169365, |
| "rewards/pos_1/std": 0.06831286940723658, |
| "rewards/pos_10/mean": 0.0021640648017637433, |
| "rewards/pos_10/std": 0.008456035633571446, |
| "rewards/pos_11/mean": -0.00034264513524249194, |
| "rewards/pos_11/std": 0.004963404871523381, |
| "rewards/pos_12/mean": -0.00015655870083719492, |
| "rewards/pos_12/std": 0.004190851980820298, |
| "rewards/pos_13/mean": -0.0004103898885659873, |
| "rewards/pos_13/std": 0.002778257662430406, |
| "rewards/pos_14/mean": -0.0006665573455393314, |
| "rewards/pos_14/std": 0.0013331146910786629, |
| "rewards/pos_15/mean": -0.00026041667442768814, |
| "rewards/pos_15/std": 0.0005208333488553763, |
| "rewards/pos_16/mean": -0.00038226647302508354, |
| "rewards/pos_16/std": 0.0007645330158993601, |
| "rewards/pos_17/mean": 0.0, |
| "rewards/pos_17/std": 0.0, |
| "rewards/pos_18/mean": -0.00012260881485417485, |
| "rewards/pos_18/std": 0.0002452176297083497, |
| "rewards/pos_19/mean": -0.00012050948571413755, |
| "rewards/pos_19/std": 0.0002410189714282751, |
| "rewards/pos_2/mean": 0.080518504884094, |
| "rewards/pos_2/std": 0.06397041976451874, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.05210294816643, |
| "rewards/pos_3/std": 0.05435645366087556, |
| "rewards/pos_4/mean": 0.03605008292943239, |
| "rewards/pos_4/std": 0.04513352606445551, |
| "rewards/pos_5/mean": 0.023162062186747788, |
| "rewards/pos_5/std": 0.04228055775165558, |
| "rewards/pos_6/mean": 0.011037050012964756, |
| "rewards/pos_6/std": 0.026784436777234077, |
| "rewards/pos_7/mean": 0.0049487472046166655, |
| "rewards/pos_7/std": 0.022359331441111862, |
| "rewards/pos_8/mean": 0.003727375087328255, |
| "rewards/pos_8/std": 0.013905423972755671, |
| "rewards/pos_9/mean": 0.0032054831506684424, |
| "rewards/pos_9/std": 0.015140429511666298, |
| "step": 290 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 329.0, |
| "completions/max_terminated_length": 329.0, |
| "completions/mean_length": 262.5062622070312, |
| "completions/mean_terminated_length": 262.5062622070312, |
| "completions/min_length": 210.3, |
| "completions/min_terminated_length": 210.3, |
| "entropy": 0.2162858149347206, |
| "epoch": 0.18867924528301888, |
| "frac_reward_zero_std": 0.17500000223517417, |
| "grad_norm": 0.2965948283672333, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 24.929166666666667, |
| "items/detected_min": 18.0, |
| "kl": 0.006186072509444784, |
| "learning_rate": 1e-06, |
| "loss": 0.0166, |
| "num_tokens": 8319180.0, |
| "reward": 0.021316327748354524, |
| "reward_std": 0.0197633249219507, |
| "reward_total": 0.17199704954400657, |
| "rewards/pos_1/mean": 0.17199704800732435, |
| "rewards/pos_1/std": 0.07101438688114285, |
| "rewards/pos_10/mean": 0.0023221148061566056, |
| "rewards/pos_10/std": 0.006007013586349785, |
| "rewards/pos_11/mean": 0.0007187323644757271, |
| "rewards/pos_11/std": 0.0063957332633435724, |
| "rewards/pos_12/mean": 0.0003055526991374791, |
| "rewards/pos_12/std": 0.0025730435503646732, |
| "rewards/pos_13/mean": 0.0004621113766916096, |
| "rewards/pos_13/std": 0.0022599261021241544, |
| "rewards/pos_14/mean": 0.0003427405841648579, |
| "rewards/pos_14/std": 0.0020978176034986973, |
| "rewards/pos_15/mean": 0.0007488812552765012, |
| "rewards/pos_15/std": 0.0016863864380866288, |
| "rewards/pos_16/mean": 0.0003721871413290501, |
| "rewards/pos_16/std": 0.0020621417788788676, |
| "rewards/pos_17/mean": 0.0002498046262189746, |
| "rewards/pos_17/std": 0.0014988278970122337, |
| "rewards/pos_18/mean": 0.0, |
| "rewards/pos_18/std": 0.0, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.0910015320405364, |
| "rewards/pos_2/std": 0.07098599858582019, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.05390574834309518, |
| "rewards/pos_3/std": 0.05022534327581525, |
| "rewards/pos_4/mean": 0.035313252126798034, |
| "rewards/pos_4/std": 0.045174385979771615, |
| "rewards/pos_5/mean": 0.02623851306270808, |
| "rewards/pos_5/std": 0.03764389446005225, |
| "rewards/pos_6/mean": 0.017836013311170972, |
| "rewards/pos_6/std": 0.03342308038845658, |
| "rewards/pos_7/mean": 0.015512678562663496, |
| "rewards/pos_7/std": 0.02850998127833009, |
| "rewards/pos_8/mean": 0.006407566787675023, |
| "rewards/pos_8/std": 0.021089862566441298, |
| "rewards/pos_9/mean": 0.002592064579948783, |
| "rewards/pos_9/std": 0.012618646910414099, |
| "step": 300 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 315.3, |
| "completions/max_terminated_length": 315.3, |
| "completions/mean_length": 258.1125122070313, |
| "completions/mean_terminated_length": 258.1125122070313, |
| "completions/min_length": 201.0, |
| "completions/min_terminated_length": 201.0, |
| "entropy": 0.22230708294858534, |
| "epoch": 0.1949685534591195, |
| "frac_reward_zero_std": 0.16666667014360428, |
| "grad_norm": 0.21009129285812378, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 24.758333333333333, |
| "items/detected_min": 18.0, |
| "kl": 0.00682944758979526, |
| "learning_rate": 1e-06, |
| "loss": 0.0156, |
| "num_tokens": 8583422.0, |
| "reward": 0.020729794539511202, |
| "reward_std": 0.019597215484827755, |
| "reward_total": 0.13572603426873683, |
| "rewards/pos_1/mean": 0.1357260297983885, |
| "rewards/pos_1/std": 0.06282865833491087, |
| "rewards/pos_10/mean": 0.002956245210953057, |
| "rewards/pos_10/std": 0.012046903045848012, |
| "rewards/pos_11/mean": 0.0010622974252328277, |
| "rewards/pos_11/std": 0.007764216605573892, |
| "rewards/pos_12/mean": 0.0009396833018399775, |
| "rewards/pos_12/std": 0.003768626763485372, |
| "rewards/pos_13/mean": 0.0003964492352679372, |
| "rewards/pos_13/std": 0.003297562850639224, |
| "rewards/pos_14/mean": 0.0, |
| "rewards/pos_14/std": 0.0021329835057258607, |
| "rewards/pos_15/mean": 0.0, |
| "rewards/pos_15/std": 0.0, |
| "rewards/pos_16/mean": -0.00038226647302508354, |
| "rewards/pos_16/std": 0.0007645330158993601, |
| "rewards/pos_17/mean": -0.00024980464950203897, |
| "rewards/pos_17/std": 0.0004996092990040779, |
| "rewards/pos_18/mean": 0.0, |
| "rewards/pos_18/std": 0.0, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.09809606708586216, |
| "rewards/pos_2/std": 0.05868534594774246, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.06796926595270633, |
| "rewards/pos_3/std": 0.05614562537521124, |
| "rewards/pos_4/mean": 0.04019765388220549, |
| "rewards/pos_4/std": 0.050240844232030214, |
| "rewards/pos_5/mean": 0.02712703449651599, |
| "rewards/pos_5/std": 0.038400023430585864, |
| "rewards/pos_6/mean": 0.016242858988698573, |
| "rewards/pos_6/std": 0.031802404299378396, |
| "rewards/pos_7/mean": 0.011441315151751041, |
| "rewards/pos_7/std": 0.028564795944839717, |
| "rewards/pos_8/mean": 0.009426339023048059, |
| "rewards/pos_8/std": 0.020744784269481897, |
| "rewards/pos_9/mean": 0.0036466998630203308, |
| "rewards/pos_9/std": 0.01425737168174237, |
| "step": 310 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 327.3, |
| "completions/max_terminated_length": 327.3, |
| "completions/mean_length": 265.0479278564453, |
| "completions/mean_terminated_length": 265.0479278564453, |
| "completions/min_length": 207.5, |
| "completions/min_terminated_length": 207.5, |
| "entropy": 0.23266545981168746, |
| "epoch": 0.20125786163522014, |
| "frac_reward_zero_std": 0.21666666865348816, |
| "grad_norm": 0.3723245859146118, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 25.533333333333335, |
| "items/detected_min": 18.0, |
| "kl": 0.015437694112673246, |
| "learning_rate": 1e-06, |
| "loss": 0.018, |
| "num_tokens": 8874261.0, |
| "reward": 0.01982590677216649, |
| "reward_std": 0.022089423984289168, |
| "reward_total": 0.15025124587118627, |
| "rewards/pos_1/mean": 0.15025124410167337, |
| "rewards/pos_1/std": 0.07577884886413813, |
| "rewards/pos_10/mean": 0.001985669624991715, |
| "rewards/pos_10/std": 0.011629964830353856, |
| "rewards/pos_11/mean": 0.0015761803835630417, |
| "rewards/pos_11/std": 0.007801409997045994, |
| "rewards/pos_12/mean": 0.00045018800301477313, |
| "rewards/pos_12/std": 0.004841349297203123, |
| "rewards/pos_13/mean": 0.0010289938771165907, |
| "rewards/pos_13/std": 0.004246733780018985, |
| "rewards/pos_14/mean": 0.0003763771615922451, |
| "rewards/pos_14/std": 0.005101214628666639, |
| "rewards/pos_15/mean": -0.0002715613809414208, |
| "rewards/pos_15/std": 0.0023617473198100924, |
| "rewards/pos_16/mean": -0.00012742215767502785, |
| "rewards/pos_16/std": 0.0022935990244150163, |
| "rewards/pos_17/mean": -0.00012490232475101948, |
| "rewards/pos_17/std": 0.00024980464950203897, |
| "rewards/pos_18/mean": -0.00012260881485417485, |
| "rewards/pos_18/std": 0.0002452176297083497, |
| "rewards/pos_19/mean": -0.00012050948571413755, |
| "rewards/pos_19/std": 0.0002410189714282751, |
| "rewards/pos_2/mean": 0.07865543810185045, |
| "rewards/pos_2/std": 0.061682610772550106, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.05971371060004458, |
| "rewards/pos_3/std": 0.057435553334653375, |
| "rewards/pos_4/mean": 0.03755526882596314, |
| "rewards/pos_4/std": 0.05022326456382871, |
| "rewards/pos_5/mean": 0.024758547963574528, |
| "rewards/pos_5/std": 0.05079233031719923, |
| "rewards/pos_6/mean": 0.021620591334067286, |
| "rewards/pos_6/std": 0.04191543236374855, |
| "rewards/pos_7/mean": 0.012061798262099426, |
| "rewards/pos_7/std": 0.028514722548425197, |
| "rewards/pos_8/mean": 0.005660012143198401, |
| "rewards/pos_8/std": 0.020607514400035144, |
| "rewards/pos_9/mean": 0.0015910826507024467, |
| "rewards/pos_9/std": 0.0158261253265664, |
| "step": 320 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 322.7, |
| "completions/max_terminated_length": 322.7, |
| "completions/mean_length": 260.2916717529297, |
| "completions/mean_terminated_length": 260.2916717529297, |
| "completions/min_length": 203.1, |
| "completions/min_terminated_length": 203.1, |
| "entropy": 0.2289639735283951, |
| "epoch": 0.20754716981132076, |
| "frac_reward_zero_std": 0.1666666693985462, |
| "grad_norm": 0.36900240182876587, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 24.883333333333333, |
| "items/detected_min": 19.0, |
| "kl": 0.007082089926310194, |
| "learning_rate": 1e-06, |
| "loss": 0.0139, |
| "num_tokens": 9170361.0, |
| "reward": 0.015596002503298224, |
| "reward_std": 0.018469544406980277, |
| "reward_total": 0.1345834530889988, |
| "rewards/pos_1/mean": 0.13458345010876654, |
| "rewards/pos_1/std": 0.05662547051906586, |
| "rewards/pos_10/mean": 0.0029571169754490255, |
| "rewards/pos_10/std": 0.011131436098366976, |
| "rewards/pos_11/mean": 0.0010631690733134747, |
| "rewards/pos_11/std": 0.007871837262064219, |
| "rewards/pos_12/mean": -8.095824159681797e-05, |
| "rewards/pos_12/std": 0.0019351853523403406, |
| "rewards/pos_13/mean": -0.00047553846379742024, |
| "rewards/pos_13/std": 0.002647960395552218, |
| "rewards/pos_14/mean": 8.210353553295135e-05, |
| "rewards/pos_14/std": 0.0016157859936356544, |
| "rewards/pos_15/mean": -0.0001690037315711379, |
| "rewards/pos_15/std": 0.002266159188002348, |
| "rewards/pos_16/mean": -0.0002824950031936169, |
| "rewards/pos_16/std": 0.0024931418942287565, |
| "rewards/pos_17/mean": 0.00023223331663757563, |
| "rewards/pos_17/std": 0.0014636851847171783, |
| "rewards/pos_18/mean": 0.0002368203247897327, |
| "rewards/pos_18/std": 0.0009640759089961648, |
| "rewards/pos_19/mean": 0.0004820379428565502, |
| "rewards/pos_19/std": 0.0009640758857131004, |
| "rewards/pos_2/mean": 0.06383709013462066, |
| "rewards/pos_2/std": 0.05119516551494598, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.04193787656258792, |
| "rewards/pos_3/std": 0.05090108048170805, |
| "rewards/pos_4/mean": 0.02564535040874034, |
| "rewards/pos_4/std": 0.04551187176257372, |
| "rewards/pos_5/mean": 0.01711053295293823, |
| "rewards/pos_5/std": 0.037988297548145054, |
| "rewards/pos_6/mean": 0.010198239896756908, |
| "rewards/pos_6/std": 0.03463809639215469, |
| "rewards/pos_7/mean": 0.006971109099686146, |
| "rewards/pos_7/std": 0.02396038032602519, |
| "rewards/pos_8/mean": 0.004388767573982477, |
| "rewards/pos_8/std": 0.01984125506132841, |
| "rewards/pos_9/mean": 0.003202139458153397, |
| "rewards/pos_9/std": 0.015375914028845727, |
| "step": 330 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 316.0, |
| "completions/max_terminated_length": 316.0, |
| "completions/mean_length": 257.50000915527346, |
| "completions/mean_terminated_length": 257.50000915527346, |
| "completions/min_length": 207.4, |
| "completions/min_terminated_length": 207.4, |
| "entropy": 0.22065137525399525, |
| "epoch": 0.2138364779874214, |
| "frac_reward_zero_std": 0.1916666716337204, |
| "grad_norm": 0.28421109914779663, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 24.35, |
| "items/detected_min": 19.0, |
| "kl": 0.007017532674944959, |
| "learning_rate": 1e-06, |
| "loss": 0.0149, |
| "num_tokens": 9446513.0, |
| "reward": 0.019425443094223737, |
| "reward_std": 0.018430884974077345, |
| "reward_total": 0.1281277360394597, |
| "rewards/pos_1/mean": 0.12812773007899522, |
| "rewards/pos_1/std": 0.05213469823356718, |
| "rewards/pos_10/mean": 0.002732968854252249, |
| "rewards/pos_10/std": 0.011932172696106136, |
| "rewards/pos_11/mean": 0.002629860525485128, |
| "rewards/pos_11/std": 0.009663155302405357, |
| "rewards/pos_12/mean": 0.0012087689246982336, |
| "rewards/pos_12/std": 0.005961433122865856, |
| "rewards/pos_13/mean": 0.0003840367076918483, |
| "rewards/pos_13/std": 0.002956819487735629, |
| "rewards/pos_14/mean": -1.241255085915327e-05, |
| "rewards/pos_14/std": 0.0021081584505736827, |
| "rewards/pos_15/mean": 0.00026041667442768814, |
| "rewards/pos_15/std": 0.0015625000465661286, |
| "rewards/pos_16/mean": -0.0002548443153500557, |
| "rewards/pos_16/std": 0.0005096886772662402, |
| "rewards/pos_17/mean": -0.00012490232475101948, |
| "rewards/pos_17/std": 0.00024980464950203897, |
| "rewards/pos_18/mean": 0.0, |
| "rewards/pos_18/std": 0.0, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.09696470415219664, |
| "rewards/pos_2/std": 0.05546027736272663, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.060085851489566265, |
| "rewards/pos_3/std": 0.04964439510367811, |
| "rewards/pos_4/mean": 0.03614059896208346, |
| "rewards/pos_4/std": 0.048409922327846286, |
| "rewards/pos_5/mean": 0.022982617770321668, |
| "rewards/pos_5/std": 0.039155343035236004, |
| "rewards/pos_6/mean": 0.014983089175075293, |
| "rewards/pos_6/std": 0.030907634552568196, |
| "rewards/pos_7/mean": 0.01095429282868281, |
| "rewards/pos_7/std": 0.021905113221146168, |
| "rewards/pos_8/mean": 0.007376816321630031, |
| "rewards/pos_8/std": 0.020483076642267405, |
| "rewards/pos_9/mean": 0.004069264605641365, |
| "rewards/pos_9/std": 0.015573488874360919, |
| "step": 340 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 331.2, |
| "completions/max_terminated_length": 331.2, |
| "completions/mean_length": 259.6979278564453, |
| "completions/mean_terminated_length": 259.6979278564453, |
| "completions/min_length": 198.2, |
| "completions/min_terminated_length": 198.2, |
| "entropy": 0.22916316849490007, |
| "epoch": 0.22012578616352202, |
| "frac_reward_zero_std": 0.20000000447034835, |
| "grad_norm": 0.3008638918399811, |
| "items/detected_max": 34.0, |
| "items/detected_mean": 25.133333333333333, |
| "items/detected_min": 19.0, |
| "kl": 0.007831003968021832, |
| "learning_rate": 1e-06, |
| "loss": 0.0162, |
| "num_tokens": 9722756.0, |
| "reward": 0.02412949312129058, |
| "reward_std": 0.01957730036228895, |
| "reward_total": 0.19526247452013196, |
| "rewards/pos_1/mean": 0.1952624715398997, |
| "rewards/pos_1/std": 0.07063707131892442, |
| "rewards/pos_10/mean": 0.004335769626777619, |
| "rewards/pos_10/std": 0.007979208161123097, |
| "rewards/pos_11/mean": 0.002140712761320174, |
| "rewards/pos_11/std": 0.00789201883599162, |
| "rewards/pos_12/mean": 0.002158847451210022, |
| "rewards/pos_12/std": 0.004358783643692732, |
| "rewards/pos_13/mean": -6.151784909889103e-05, |
| "rewards/pos_13/std": 0.004023188166320324, |
| "rewards/pos_14/mean": 0.0004996092990040779, |
| "rewards/pos_14/std": 0.002981483284384012, |
| "rewards/pos_15/mean": -0.0001514323754236102, |
| "rewards/pos_15/std": 0.002301301993429661, |
| "rewards/pos_16/mean": 0.00011734282597899436, |
| "rewards/pos_16/std": 0.0012540629599243403, |
| "rewards/pos_17/mean": 0.0003747069742530584, |
| "rewards/pos_17/std": 0.0012490232475101948, |
| "rewards/pos_18/mean": -0.00012260881485417485, |
| "rewards/pos_18/std": 0.0002452176297083497, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.09324139822274446, |
| "rewards/pos_2/std": 0.06644056802615524, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.0615955856628716, |
| "rewards/pos_3/std": 0.051313380664214495, |
| "rewards/pos_4/mean": 0.04688858595909551, |
| "rewards/pos_4/std": 0.04371719118207693, |
| "rewards/pos_5/mean": 0.03025575850624591, |
| "rewards/pos_5/std": 0.04001493654213846, |
| "rewards/pos_6/mean": 0.02127471216954291, |
| "rewards/pos_6/std": 0.030604945309460164, |
| "rewards/pos_7/mean": 0.01315246180165559, |
| "rewards/pos_7/std": 0.023588302149437367, |
| "rewards/pos_8/mean": 0.006093456991948187, |
| "rewards/pos_8/std": 0.019216742552816867, |
| "rewards/pos_9/mean": 0.005533974547870457, |
| "rewards/pos_9/std": 0.0137285475153476, |
| "step": 350 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 317.6, |
| "completions/max_terminated_length": 317.6, |
| "completions/mean_length": 260.34375762939453, |
| "completions/mean_terminated_length": 260.34375762939453, |
| "completions/min_length": 213.4, |
| "completions/min_terminated_length": 213.4, |
| "entropy": 0.21896101338788868, |
| "epoch": 0.22641509433962265, |
| "frac_reward_zero_std": 0.11666666939854622, |
| "grad_norm": 0.27910512685775757, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 24.866666666666667, |
| "items/detected_min": 17.0, |
| "kl": 0.006924989688074371, |
| "learning_rate": 1e-06, |
| "loss": 0.0149, |
| "num_tokens": 9980249.0, |
| "reward": 0.028933631628751753, |
| "reward_std": 0.019675091747194527, |
| "reward_total": 0.22213271036744117, |
| "rewards/pos_1/mean": 0.22213271036744117, |
| "rewards/pos_1/std": 0.07323975777253508, |
| "rewards/pos_10/mean": 0.0012367580784484744, |
| "rewards/pos_10/std": 0.007548078242689371, |
| "rewards/pos_11/mean": 0.0018706073053181172, |
| "rewards/pos_11/std": 0.004126033373177051, |
| "rewards/pos_12/mean": 0.00045404935954138634, |
| "rewards/pos_12/std": 0.004291756404563784, |
| "rewards/pos_13/mean": 0.0011696518515236676, |
| "rewards/pos_13/std": 0.003098501614294946, |
| "rewards/pos_14/mean": 0.0006329207681119442, |
| "rewards/pos_14/std": 0.002865579165518284, |
| "rewards/pos_15/mean": 0.00010898428736254573, |
| "rewards/pos_15/std": 0.0017804686212912202, |
| "rewards/pos_16/mean": 0.00011734282597899436, |
| "rewards/pos_16/std": 0.001763751613907516, |
| "rewards/pos_17/mean": 0.0, |
| "rewards/pos_17/std": 0.0017872774740681052, |
| "rewards/pos_18/mean": -0.00012260881485417485, |
| "rewards/pos_18/std": 0.0002452176297083497, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 1.5108222277709536e-18, |
| "rewards/pos_2/mean": 0.1439829993993044, |
| "rewards/pos_2/std": 0.06127358330413699, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 1.5108222277709536e-18, |
| "rewards/pos_3/mean": 0.08225387446582318, |
| "rewards/pos_3/std": 0.05610301522538066, |
| "rewards/pos_4/mean": 0.04696628153324127, |
| "rewards/pos_4/std": 0.04185368986800313, |
| "rewards/pos_5/mean": 0.030603414541110395, |
| "rewards/pos_5/std": 0.04589109029620886, |
| "rewards/pos_6/mean": 0.022831282811239362, |
| "rewards/pos_6/std": 0.02951878057792783, |
| "rewards/pos_7/mean": 0.014235854789149016, |
| "rewards/pos_7/std": 0.024564327974803745, |
| "rewards/pos_8/mean": 0.007167542690876871, |
| "rewards/pos_8/std": 0.019931788416579367, |
| "rewards/pos_9/mean": 0.0030309490975923836, |
| "rewards/pos_9/std": 0.013619128172285855, |
| "step": 360 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 315.6, |
| "completions/max_terminated_length": 315.6, |
| "completions/mean_length": 257.2437576293945, |
| "completions/mean_terminated_length": 257.2437576293945, |
| "completions/min_length": 209.5, |
| "completions/min_terminated_length": 209.5, |
| "entropy": 0.225166599607716, |
| "epoch": 0.23270440251572327, |
| "frac_reward_zero_std": 0.1666666693985462, |
| "grad_norm": 0.1677815020084381, |
| "items/detected_max": 31.0, |
| "items/detected_mean": 24.379166666666666, |
| "items/detected_min": 19.0, |
| "kl": 0.007016470714976701, |
| "learning_rate": 1e-06, |
| "loss": 0.014, |
| "num_tokens": 10236114.0, |
| "reward": 0.017631425987929106, |
| "reward_std": 0.022127764578908683, |
| "reward_total": 0.14379139244556427, |
| "rewards/pos_1/mean": 0.14379138946533204, |
| "rewards/pos_1/std": 0.0882169634103775, |
| "rewards/pos_10/mean": 0.0022585043567232787, |
| "rewards/pos_10/std": 0.009334756038151681, |
| "rewards/pos_11/mean": 0.001532090362161398, |
| "rewards/pos_11/std": 0.005388705246150494, |
| "rewards/pos_12/mean": 0.0005332458647899329, |
| "rewards/pos_12/std": 0.0033184764208272098, |
| "rewards/pos_13/mean": 0.0002596526173874736, |
| "rewards/pos_13/std": 0.0016136782709509135, |
| "rewards/pos_14/mean": 0.0001333114691078663, |
| "rewards/pos_14/std": 0.001866360567510128, |
| "rewards/pos_15/mean": -0.0005208333488553763, |
| "rewards/pos_15/std": 0.0010416666977107526, |
| "rewards/pos_16/mean": -0.0002548443153500557, |
| "rewards/pos_16/std": 0.0005096886772662402, |
| "rewards/pos_17/mean": 0.0, |
| "rewards/pos_17/std": 0.0, |
| "rewards/pos_18/mean": 0.0, |
| "rewards/pos_18/std": 0.0, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.08060946250334382, |
| "rewards/pos_2/std": 0.06794189997017383, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.054246106650680304, |
| "rewards/pos_3/std": 0.06542806439101696, |
| "rewards/pos_4/mean": 0.03207998964935541, |
| "rewards/pos_4/std": 0.05759974382817745, |
| "rewards/pos_5/mean": 0.017739857686683534, |
| "rewards/pos_5/std": 0.049878083541989325, |
| "rewards/pos_6/mean": 0.010747757053468376, |
| "rewards/pos_6/std": 0.03253991464152932, |
| "rewards/pos_7/mean": 0.0061981274979189035, |
| "rewards/pos_7/std": 0.023129416280426085, |
| "rewards/pos_8/mean": 0.0031042592250742017, |
| "rewards/pos_8/std": 0.019796630647033454, |
| "rewards/pos_9/mean": 0.00017042522376868873, |
| "rewards/pos_9/std": 0.0149512107251212, |
| "step": 370 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 325.4, |
| "completions/max_terminated_length": 325.4, |
| "completions/mean_length": 256.8833480834961, |
| "completions/mean_terminated_length": 256.8833480834961, |
| "completions/min_length": 202.0, |
| "completions/min_terminated_length": 202.0, |
| "entropy": 0.22051387261599303, |
| "epoch": 0.2389937106918239, |
| "frac_reward_zero_std": 0.19166667088866235, |
| "grad_norm": 0.40871769189834595, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 24.458333333333332, |
| "items/detected_min": 17.0, |
| "kl": 0.007386551384115592, |
| "learning_rate": 1e-06, |
| "loss": 0.0145, |
| "num_tokens": 10495002.0, |
| "reward": 0.023296015989035367, |
| "reward_std": 0.021536412183195353, |
| "reward_total": 0.18946771025657655, |
| "rewards/pos_1/mean": 0.18946770280599595, |
| "rewards/pos_1/std": 0.0835272241383791, |
| "rewards/pos_10/mean": 0.002924169832840562, |
| "rewards/pos_10/std": 0.007994202524423599, |
| "rewards/pos_11/mean": 0.0019019185565412045, |
| "rewards/pos_11/std": 0.006062317825853825, |
| "rewards/pos_12/mean": 0.0017702369252219797, |
| "rewards/pos_12/std": 0.004168829647824168, |
| "rewards/pos_13/mean": 0.0006561018875800073, |
| "rewards/pos_13/std": 0.001643702038563788, |
| "rewards/pos_14/mean": 0.0007998688146471977, |
| "rewards/pos_14/std": 0.0017647244036197661, |
| "rewards/pos_15/mean": -0.00039062501164153216, |
| "rewards/pos_15/std": 0.0005611199419945478, |
| "rewards/pos_16/mean": -0.0002548443153500557, |
| "rewards/pos_16/std": 0.0005096886772662402, |
| "rewards/pos_17/mean": -0.0003747069742530584, |
| "rewards/pos_17/std": 0.0007494139485061168, |
| "rewards/pos_18/mean": 0.0, |
| "rewards/pos_18/std": 0.0, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.09561244631186128, |
| "rewards/pos_2/std": 0.06435443423688411, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.06603119405917823, |
| "rewards/pos_3/std": 0.06552036330103875, |
| "rewards/pos_4/mean": 0.04784354259027168, |
| "rewards/pos_4/std": 0.05719647705554962, |
| "rewards/pos_5/mean": 0.024211431900039316, |
| "rewards/pos_5/std": 0.042845807410776614, |
| "rewards/pos_6/mean": 0.017558457027189434, |
| "rewards/pos_6/std": 0.035367506369948384, |
| "rewards/pos_7/mean": 0.010088088770862669, |
| "rewards/pos_7/std": 0.028223303635604678, |
| "rewards/pos_8/mean": 0.005037966254167259, |
| "rewards/pos_8/std": 0.01750833559781313, |
| "rewards/pos_9/mean": 0.0030373329878784715, |
| "rewards/pos_9/std": 0.01273077039513737, |
| "step": 380 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 326.7, |
| "completions/max_terminated_length": 326.7, |
| "completions/mean_length": 260.4479248046875, |
| "completions/mean_terminated_length": 260.4479248046875, |
| "completions/min_length": 209.6, |
| "completions/min_terminated_length": 209.6, |
| "entropy": 0.23070885874330999, |
| "epoch": 0.24528301886792453, |
| "frac_reward_zero_std": 0.14166666939854622, |
| "grad_norm": 0.1970636546611786, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 24.816666666666666, |
| "items/detected_min": 18.0, |
| "kl": 0.008075704663254631, |
| "learning_rate": 1e-06, |
| "loss": 0.015, |
| "num_tokens": 10768141.0, |
| "reward": 0.0202477783896029, |
| "reward_std": 0.020860963594168426, |
| "reward_total": 0.15246383249759674, |
| "rewards/pos_1/mean": 0.15246383231133223, |
| "rewards/pos_1/std": 0.07292928658425808, |
| "rewards/pos_10/mean": -0.0004727510269731283, |
| "rewards/pos_10/std": 0.007833246374502778, |
| "rewards/pos_11/mean": -0.0005811311304569245, |
| "rewards/pos_11/std": 0.00324117187410593, |
| "rewards/pos_12/mean": -0.0005629961844533682, |
| "rewards/pos_12/std": 0.0011259923689067365, |
| "rewards/pos_13/mean": -0.0009575764182955026, |
| "rewards/pos_13/std": 0.0019151528365910053, |
| "rewards/pos_14/mean": -0.0007998688146471972, |
| "rewards/pos_14/std": 0.0015997376292943965, |
| "rewards/pos_15/mean": -0.00026041667442768766, |
| "rewards/pos_15/std": 0.0005208333488553772, |
| "rewards/pos_16/mean": -0.00038226647302508305, |
| "rewards/pos_16/std": 0.0007645330158993611, |
| "rewards/pos_17/mean": -0.0004996092990040775, |
| "rewards/pos_17/std": 0.0009992185980081567, |
| "rewards/pos_18/mean": -0.00012260881485417439, |
| "rewards/pos_18/std": 0.0002452176297083506, |
| "rewards/pos_19/mean": 4.625929407134921e-19, |
| "rewards/pos_19/std": 9.251858814269843e-19, |
| "rewards/pos_2/mean": 0.10268718618899583, |
| "rewards/pos_2/std": 0.07012189291417599, |
| "rewards/pos_20/mean": 4.625929407134921e-19, |
| "rewards/pos_20/std": 9.251858814269843e-19, |
| "rewards/pos_3/mean": 0.06723444974049926, |
| "rewards/pos_3/std": 0.06676209792494774, |
| "rewards/pos_4/mean": 0.03524471241980791, |
| "rewards/pos_4/std": 0.055582412891089915, |
| "rewards/pos_5/mean": 0.022470815673295876, |
| "rewards/pos_5/std": 0.04258656706660986, |
| "rewards/pos_6/mean": 0.01445532594419395, |
| "rewards/pos_6/std": 0.031042478699237108, |
| "rewards/pos_7/mean": 0.007936406962107867, |
| "rewards/pos_7/std": 0.026095511973835528, |
| "rewards/pos_8/mean": 0.004532542885863222, |
| "rewards/pos_8/std": 0.01912364289164543, |
| "rewards/pos_9/mean": 0.002569500356912613, |
| "rewards/pos_9/std": 0.014730246970430017, |
| "step": 390 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 328.5, |
| "completions/max_terminated_length": 328.5, |
| "completions/mean_length": 256.43959350585936, |
| "completions/mean_terminated_length": 256.43959350585936, |
| "completions/min_length": 206.3, |
| "completions/min_terminated_length": 206.3, |
| "entropy": 0.21603431180119514, |
| "epoch": 0.25157232704402516, |
| "frac_reward_zero_std": 0.15833333730697632, |
| "grad_norm": 0.30788999795913696, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 24.45, |
| "items/detected_min": 19.0, |
| "kl": 0.008252161745622289, |
| "learning_rate": 1e-06, |
| "loss": 0.0147, |
| "num_tokens": 11031828.0, |
| "reward": 0.016329724296701896, |
| "reward_std": 0.016512975608929992, |
| "reward_total": 0.14265724457800388, |
| "rewards/pos_1/mean": 0.14265724280849099, |
| "rewards/pos_1/std": 0.0651577839627862, |
| "rewards/pos_10/mean": 0.00025893463753163816, |
| "rewards/pos_10/std": 0.005557593540288508, |
| "rewards/pos_11/mean": 0.0004358483478426933, |
| "rewards/pos_11/std": 0.002794892713427544, |
| "rewards/pos_12/mean": -0.0007037452305667102, |
| "rewards/pos_12/std": 0.0014074904611334205, |
| "rewards/pos_13/mean": -0.0010943730361759663, |
| "rewards/pos_13/std": 0.0021887460723519327, |
| "rewards/pos_14/mean": -0.000933180283755064, |
| "rewards/pos_14/std": 0.0016409843228757381, |
| "rewards/pos_15/mean": -0.00013020833721384407, |
| "rewards/pos_15/std": 0.00026041667442768814, |
| "rewards/pos_16/mean": -0.00012742215767502785, |
| "rewards/pos_16/std": 0.0002548443386331201, |
| "rewards/pos_17/mean": -0.00012490232475101948, |
| "rewards/pos_17/std": 0.00024980464950203897, |
| "rewards/pos_18/mean": 0.0, |
| "rewards/pos_18/std": 0.0, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.08800087512936443, |
| "rewards/pos_2/std": 0.05617033746093512, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.03881473926594481, |
| "rewards/pos_3/std": 0.04208221547305584, |
| "rewards/pos_4/mean": 0.023159040114842357, |
| "rewards/pos_4/std": 0.03938644733279943, |
| "rewards/pos_5/mean": 0.012225279369158671, |
| "rewards/pos_5/std": 0.03276767339557409, |
| "rewards/pos_6/mean": 0.009199051139876246, |
| "rewards/pos_6/std": 0.025476095825433732, |
| "rewards/pos_7/mean": 0.007665468950290233, |
| "rewards/pos_7/std": 0.023648051917552947, |
| "rewards/pos_8/mean": 0.004900209908373654, |
| "rewards/pos_8/std": 0.017847701627761125, |
| "rewards/pos_9/mean": 0.002391626697499305, |
| "rewards/pos_9/std": 0.013368407892994583, |
| "step": 400 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 316.3, |
| "completions/max_terminated_length": 316.3, |
| "completions/mean_length": 256.9125091552734, |
| "completions/mean_terminated_length": 256.9125091552734, |
| "completions/min_length": 198.5, |
| "completions/min_terminated_length": 198.5, |
| "entropy": 0.22275403228898843, |
| "epoch": 0.2578616352201258, |
| "frac_reward_zero_std": 0.18333333507180213, |
| "grad_norm": 0.2904108762741089, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 24.691666666666666, |
| "items/detected_min": 18.0, |
| "kl": 0.007832498365799741, |
| "learning_rate": 1e-06, |
| "loss": 0.0133, |
| "num_tokens": 11296478.0, |
| "reward": 0.028976143896579744, |
| "reward_std": 0.022660434059798718, |
| "reward_total": 0.2048182100057602, |
| "rewards/pos_1/mean": 0.20481819957494735, |
| "rewards/pos_1/std": 0.07572797238826752, |
| "rewards/pos_10/mean": 0.0031875847955234347, |
| "rewards/pos_10/std": 0.009897435712628067, |
| "rewards/pos_11/mean": 0.0017400289420038463, |
| "rewards/pos_11/std": 0.008549216762185096, |
| "rewards/pos_12/mean": 0.0019125141203403472, |
| "rewards/pos_12/std": 0.006288737198337913, |
| "rewards/pos_13/mean": 0.0005208333488553763, |
| "rewards/pos_13/std": 0.0029991445364430545, |
| "rewards/pos_14/mean": 0.00012089891824871302, |
| "rewards/pos_14/std": 0.0018415355123579502, |
| "rewards/pos_15/mean": -1.1641532182693482e-11, |
| "rewards/pos_15/std": 0.001936122798360884, |
| "rewards/pos_16/mean": -0.0006371107883751392, |
| "rewards/pos_16/std": 0.0012742216931656003, |
| "rewards/pos_17/mean": -0.00024980464950203897, |
| "rewards/pos_17/std": 0.0004996092990040779, |
| "rewards/pos_18/mean": -0.00012260881485417485, |
| "rewards/pos_18/std": 0.0002452176297083497, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.11721294932067394, |
| "rewards/pos_2/std": 0.06804916113615037, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.08078673807904124, |
| "rewards/pos_3/std": 0.055386250652372834, |
| "rewards/pos_4/mean": 0.06849569948390126, |
| "rewards/pos_4/std": 0.05045593734830618, |
| "rewards/pos_5/mean": 0.0406662215013057, |
| "rewards/pos_5/std": 0.04149941392242908, |
| "rewards/pos_6/mean": 0.02355194711126387, |
| "rewards/pos_6/std": 0.040676692873239516, |
| "rewards/pos_7/mean": 0.019361452711746097, |
| "rewards/pos_7/std": 0.03839451922103763, |
| "rewards/pos_8/mean": 0.011890080338343979, |
| "rewards/pos_8/std": 0.030131167080253363, |
| "rewards/pos_9/mean": 0.006267227104399353, |
| "rewards/pos_9/std": 0.01935631341766566, |
| "step": 410 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 314.6, |
| "completions/max_terminated_length": 314.6, |
| "completions/mean_length": 254.96250762939454, |
| "completions/mean_terminated_length": 254.96250762939454, |
| "completions/min_length": 202.4, |
| "completions/min_terminated_length": 202.4, |
| "entropy": 0.21695980379978816, |
| "epoch": 0.2641509433962264, |
| "frac_reward_zero_std": 0.2083333395421505, |
| "grad_norm": 0.30839505791664124, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 24.95, |
| "items/detected_min": 20.0, |
| "kl": 0.01072122643769641, |
| "learning_rate": 1e-06, |
| "loss": 0.0122, |
| "num_tokens": 11561600.0, |
| "reward": 0.01675855405628681, |
| "reward_std": 0.01783207766711712, |
| "reward_total": 0.14728330001235007, |
| "rewards/pos_1/mean": 0.14728329554200173, |
| "rewards/pos_1/std": 0.06452072151005268, |
| "rewards/pos_10/mean": 0.0008343550143763423, |
| "rewards/pos_10/std": 0.007690893951803446, |
| "rewards/pos_11/mean": -0.0001931680366396904, |
| "rewards/pos_11/std": 0.0025193195790052412, |
| "rewards/pos_12/mean": -0.00031124838860705493, |
| "rewards/pos_12/std": 0.0027554802829399703, |
| "rewards/pos_13/mean": -0.00028753390070050954, |
| "rewards/pos_13/std": 0.002476783096790314, |
| "rewards/pos_14/mean": 0.0001333114691078663, |
| "rewards/pos_14/std": 0.001866360567510128, |
| "rewards/pos_15/mean": -0.00026041667442768814, |
| "rewards/pos_15/std": 0.0005208333488553763, |
| "rewards/pos_16/mean": 0.0, |
| "rewards/pos_16/std": 0.0, |
| "rewards/pos_17/mean": -0.0003747069742530584, |
| "rewards/pos_17/std": 0.0007494139485061168, |
| "rewards/pos_18/mean": -0.00012260881485417485, |
| "rewards/pos_18/std": 0.0002452176297083497, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.07640054933726788, |
| "rewards/pos_2/std": 0.06290208585560322, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.052256912970915434, |
| "rewards/pos_3/std": 0.05409074127674103, |
| "rewards/pos_4/mean": 0.027486630342900754, |
| "rewards/pos_4/std": 0.043330289982259275, |
| "rewards/pos_5/mean": 0.018951812526211143, |
| "rewards/pos_5/std": 0.03892630822956562, |
| "rewards/pos_6/mean": 0.00856637657561805, |
| "rewards/pos_6/std": 0.027429566346108915, |
| "rewards/pos_7/mean": 0.0033461297823426618, |
| "rewards/pos_7/std": 0.022302147187292575, |
| "rewards/pos_8/mean": 0.0017868415918201209, |
| "rewards/pos_8/std": 0.01229224568232894, |
| "rewards/pos_9/mean": -0.0003254689043387771, |
| "rewards/pos_9/std": 0.012023128243163228, |
| "step": 420 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 322.3, |
| "completions/max_terminated_length": 322.3, |
| "completions/mean_length": 257.87501068115233, |
| "completions/mean_terminated_length": 257.87501068115233, |
| "completions/min_length": 202.9, |
| "completions/min_terminated_length": 202.9, |
| "entropy": 0.217508593822519, |
| "epoch": 0.27044025157232704, |
| "frac_reward_zero_std": 0.2083333373069763, |
| "grad_norm": 0.2622852623462677, |
| "items/detected_max": 35.0, |
| "items/detected_mean": 24.483333333333334, |
| "items/detected_min": 18.0, |
| "kl": 0.009693127268595466, |
| "learning_rate": 1e-06, |
| "loss": 0.0139, |
| "num_tokens": 11848076.0, |
| "reward": 0.023260173643939196, |
| "reward_std": 0.01729170950129628, |
| "reward_total": 0.2099197213537991, |
| "rewards/pos_1/mean": 0.20991971981711685, |
| "rewards/pos_1/std": 0.08556453036144376, |
| "rewards/pos_10/mean": -0.0009033275535330185, |
| "rewards/pos_10/std": 0.001806655107066037, |
| "rewards/pos_11/mean": -0.0007264139130711555, |
| "rewards/pos_11/std": 0.001452827826142311, |
| "rewards/pos_12/mean": -0.000985243311151862, |
| "rewards/pos_12/std": 0.001970486622303724, |
| "rewards/pos_13/mean": -0.0001367966295219958, |
| "rewards/pos_13/std": 0.0002735932590439916, |
| "rewards/pos_14/mean": -0.00039993440732359885, |
| "rewards/pos_14/std": 0.0007998688146471977, |
| "rewards/pos_15/mean": -0.0006510416860692203, |
| "rewards/pos_15/std": 0.0013020833721384406, |
| "rewards/pos_16/mean": -0.00012742215767502785, |
| "rewards/pos_16/std": 0.0002548443386331201, |
| "rewards/pos_17/mean": -0.0001249023247510204, |
| "rewards/pos_17/std": 0.0002498046495020408, |
| "rewards/pos_18/mean": -9.251858814269843e-19, |
| "rewards/pos_18/std": 1.8503717628539686e-18, |
| "rewards/pos_19/mean": -9.251858814269843e-19, |
| "rewards/pos_19/std": 1.8503717628539686e-18, |
| "rewards/pos_2/mean": 0.10043946434743703, |
| "rewards/pos_2/std": 0.05938351722434163, |
| "rewards/pos_20/mean": -9.251858814269843e-19, |
| "rewards/pos_20/std": 1.8503717628539686e-18, |
| "rewards/pos_3/mean": 0.05927633896353655, |
| "rewards/pos_3/std": 0.04616741510108113, |
| "rewards/pos_4/mean": 0.042810957436449824, |
| "rewards/pos_4/std": 0.0379714198410511, |
| "rewards/pos_5/mean": 0.030907541140913965, |
| "rewards/pos_5/std": 0.033647235855460166, |
| "rewards/pos_6/mean": 0.01658214651979506, |
| "rewards/pos_6/std": 0.031235989369452, |
| "rewards/pos_7/mean": 0.008262457395903765, |
| "rewards/pos_7/std": 0.01830614060163498, |
| "rewards/pos_8/mean": 0.0016870602848939598, |
| "rewards/pos_8/std": 0.01442420994862914, |
| "rewards/pos_9/mean": -0.0006271459045819938, |
| "rewards/pos_9/std": 0.01102356247138232, |
| "step": 430 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 319.4, |
| "completions/max_terminated_length": 319.4, |
| "completions/mean_length": 257.2979248046875, |
| "completions/mean_terminated_length": 257.2979248046875, |
| "completions/min_length": 202.6, |
| "completions/min_terminated_length": 202.6, |
| "entropy": 0.213536003852884, |
| "epoch": 0.27672955974842767, |
| "frac_reward_zero_std": 0.18333333879709243, |
| "grad_norm": 0.5048796534538269, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 24.325, |
| "items/detected_min": 17.0, |
| "kl": 0.010342476896767039, |
| "learning_rate": 1e-06, |
| "loss": 0.015, |
| "num_tokens": 12102227.0, |
| "reward": 0.025915137585252525, |
| "reward_std": 0.02463643942028284, |
| "reward_total": 0.17452566474676132, |
| "rewards/pos_1/mean": 0.1745256520807743, |
| "rewards/pos_1/std": 0.08961139153689146, |
| "rewards/pos_10/mean": 0.005741290561854839, |
| "rewards/pos_10/std": 0.01339336596429348, |
| "rewards/pos_11/mean": 0.002170154917985201, |
| "rewards/pos_11/std": 0.008989358879625797, |
| "rewards/pos_12/mean": 0.0031916681327857077, |
| "rewards/pos_12/std": 0.0069463324500247834, |
| "rewards/pos_13/mean": 0.0002596526290290058, |
| "rewards/pos_13/std": 0.0038024243200197815, |
| "rewards/pos_14/mean": 0.00039993440732359885, |
| "rewards/pos_14/std": 0.0013331146910786629, |
| "rewards/pos_15/mean": -0.00039062501164153173, |
| "rewards/pos_15/std": 0.0007812500232830652, |
| "rewards/pos_16/mean": 4.625929407134921e-19, |
| "rewards/pos_16/std": 9.251858814269843e-19, |
| "rewards/pos_17/mean": -0.0002498046495020385, |
| "rewards/pos_17/std": 0.0004996092990040789, |
| "rewards/pos_18/mean": -0.00012260881485417439, |
| "rewards/pos_18/std": 0.0002452176297083506, |
| "rewards/pos_19/mean": -0.0001205094857141371, |
| "rewards/pos_19/std": 0.00024101897142827602, |
| "rewards/pos_2/mean": 0.121816236525774, |
| "rewards/pos_2/std": 0.07342976573854684, |
| "rewards/pos_20/mean": -0.00011857826029881789, |
| "rewards/pos_20/std": 0.00023715652059763762, |
| "rewards/pos_3/mean": 0.07539536720141768, |
| "rewards/pos_3/std": 0.07011822983622551, |
| "rewards/pos_4/mean": 0.04657440833980218, |
| "rewards/pos_4/std": 0.06269463896751404, |
| "rewards/pos_5/mean": 0.03148499419912696, |
| "rewards/pos_5/std": 0.04455104125663638, |
| "rewards/pos_6/mean": 0.021406761987600477, |
| "rewards/pos_6/std": 0.041934129782021046, |
| "rewards/pos_7/mean": 0.016635867313016205, |
| "rewards/pos_7/std": 0.031167579744942487, |
| "rewards/pos_8/mean": 0.011998111993307248, |
| "rewards/pos_8/std": 0.025716074835509063, |
| "rewards/pos_9/mean": 0.0077047318452969195, |
| "rewards/pos_9/std": 0.01703706863336265, |
| "step": 440 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 333.4, |
| "completions/max_terminated_length": 333.4, |
| "completions/mean_length": 258.3291778564453, |
| "completions/mean_terminated_length": 258.3291778564453, |
| "completions/min_length": 205.5, |
| "completions/min_terminated_length": 205.5, |
| "entropy": 0.21201196995874247, |
| "epoch": 0.2830188679245283, |
| "frac_reward_zero_std": 0.24166667386889457, |
| "grad_norm": 0.28489571809768677, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 24.383333333333333, |
| "items/detected_min": 19.0, |
| "kl": 0.013525528619356918, |
| "learning_rate": 1e-06, |
| "loss": 0.0147, |
| "num_tokens": 12361545.0, |
| "reward": 0.02282193535938859, |
| "reward_std": 0.021278287377208473, |
| "reward_total": 0.15365880057215692, |
| "rewards/pos_1/mean": 0.15365879833698273, |
| "rewards/pos_1/std": 0.06613776870071889, |
| "rewards/pos_10/mean": 0.0027370264171622696, |
| "rewards/pos_10/std": 0.008689730637706818, |
| "rewards/pos_11/mean": 0.0024675481021404267, |
| "rewards/pos_11/std": 0.007933381665498018, |
| "rewards/pos_12/mean": 0.001032855163794011, |
| "rewards/pos_12/std": 0.0041585674742236735, |
| "rewards/pos_13/mean": 0.00034887201618403243, |
| "rewards/pos_13/std": 0.00343367662280798, |
| "rewards/pos_14/mean": 0.0008995437063276773, |
| "rewards/pos_14/std": 0.0023323331959545623, |
| "rewards/pos_15/mean": -2.1224049851298334e-05, |
| "rewards/pos_15/std": 0.001894681667909026, |
| "rewards/pos_16/mean": 0.00011734282597899482, |
| "rewards/pos_16/std": 0.0017637516837567101, |
| "rewards/pos_17/mean": 0.00012490232475101994, |
| "rewards/pos_17/std": 0.0017486325465142737, |
| "rewards/pos_18/mean": -0.00012260881485417439, |
| "rewards/pos_18/std": 0.0002452176297083506, |
| "rewards/pos_19/mean": 4.625929407134921e-19, |
| "rewards/pos_19/std": 9.251858814269843e-19, |
| "rewards/pos_2/mean": 0.1019910465925932, |
| "rewards/pos_2/std": 0.057884824462234974, |
| "rewards/pos_20/mean": -0.0002371565205976362, |
| "rewards/pos_20/std": 0.0004743130411952744, |
| "rewards/pos_3/mean": 0.07082258388400078, |
| "rewards/pos_3/std": 0.061851803213357925, |
| "rewards/pos_4/mean": 0.05265027731657028, |
| "rewards/pos_4/std": 0.05185747928917408, |
| "rewards/pos_5/mean": 0.029668276105076073, |
| "rewards/pos_5/std": 0.049659001640975475, |
| "rewards/pos_6/mean": 0.020517667289823294, |
| "rewards/pos_6/std": 0.03811166062951088, |
| "rewards/pos_7/mean": 0.008061961038038135, |
| "rewards/pos_7/std": 0.027415334060788155, |
| "rewards/pos_8/mean": 0.008093093032948672, |
| "rewards/pos_8/std": 0.024487523501738905, |
| "rewards/pos_9/mean": 0.0036278903018683193, |
| "rewards/pos_9/std": 0.015486051235347987, |
| "step": 450 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 336.9, |
| "completions/max_terminated_length": 336.9, |
| "completions/mean_length": 259.5166748046875, |
| "completions/mean_terminated_length": 259.5166748046875, |
| "completions/min_length": 206.9, |
| "completions/min_terminated_length": 206.9, |
| "entropy": 0.21642931795989473, |
| "epoch": 0.2893081761006289, |
| "frac_reward_zero_std": 0.2083333373069763, |
| "grad_norm": 0.2088499367237091, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 24.8875, |
| "items/detected_min": 19.0, |
| "kl": 0.01066165748294831, |
| "learning_rate": 1e-06, |
| "loss": 0.0149, |
| "num_tokens": 12617377.0, |
| "reward": 0.015607737097889185, |
| "reward_std": 0.023038433864712715, |
| "reward_total": 0.10255183950066567, |
| "rewards/pos_1/mean": 0.10255183652043343, |
| "rewards/pos_1/std": 0.07729223407804967, |
| "rewards/pos_10/mean": 0.0008214917848818004, |
| "rewards/pos_10/std": 0.006460730661638081, |
| "rewards/pos_11/mean": 0.0004120025085285306, |
| "rewards/pos_11/std": 0.004891922930255532, |
| "rewards/pos_12/mean": 0.0007297705044038594, |
| "rewards/pos_12/std": 0.00483751802239567, |
| "rewards/pos_13/mean": 0.00046408207854256036, |
| "rewards/pos_13/std": 0.0025697237113490702, |
| "rewards/pos_14/mean": 0.0004780227085575461, |
| "rewards/pos_14/std": 0.0025557830464094876, |
| "rewards/pos_15/mean": 0.0007508519338443875, |
| "rewards/pos_15/std": 0.0025433705653995276, |
| "rewards/pos_16/mean": 0.0001081687631085515, |
| "rewards/pos_16/std": 0.0016022538300603628, |
| "rewards/pos_17/mean": 0.00011572828516364098, |
| "rewards/pos_17/std": 0.0017302843742072582, |
| "rewards/pos_18/mean": 0.00036782644456252457, |
| "rewards/pos_18/std": 0.0012260881485417485, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.07489825356751681, |
| "rewards/pos_2/std": 0.07403903417289256, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.04760490842163563, |
| "rewards/pos_3/std": 0.0639339717105031, |
| "rewards/pos_4/mean": 0.031833214685320854, |
| "rewards/pos_4/std": 0.054455436579883096, |
| "rewards/pos_5/mean": 0.02260264167562127, |
| "rewards/pos_5/std": 0.04626600733026862, |
| "rewards/pos_6/mean": 0.01071704727364704, |
| "rewards/pos_6/std": 0.04206645265221596, |
| "rewards/pos_7/mean": 0.009195378096774221, |
| "rewards/pos_7/std": 0.03155197752639651, |
| "rewards/pos_8/mean": 0.00475915347924456, |
| "rewards/pos_8/std": 0.02537555042654276, |
| "rewards/pos_9/mean": 0.0037443477544002236, |
| "rewards/pos_9/std": 0.01737030434887856, |
| "step": 460 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 318.4, |
| "completions/max_terminated_length": 318.4, |
| "completions/mean_length": 253.72084045410156, |
| "completions/mean_terminated_length": 253.72084045410156, |
| "completions/min_length": 202.4, |
| "completions/min_terminated_length": 202.4, |
| "entropy": 0.20708943608527383, |
| "epoch": 0.29559748427672955, |
| "frac_reward_zero_std": 0.15833333656191825, |
| "grad_norm": 0.2726491093635559, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 24.233333333333334, |
| "items/detected_min": 18.0, |
| "kl": 0.010009914560456915, |
| "learning_rate": 1e-06, |
| "loss": 0.0141, |
| "num_tokens": 12867423.0, |
| "reward": 0.023438004404306413, |
| "reward_std": 0.022939678560942412, |
| "reward_total": 0.20833126753568648, |
| "rewards/pos_1/mean": 0.20833126306533814, |
| "rewards/pos_1/std": 0.08295661797747016, |
| "rewards/pos_10/mean": 0.003840826149098575, |
| "rewards/pos_10/std": 0.01358814868144691, |
| "rewards/pos_11/mean": 0.004028283571824431, |
| "rewards/pos_11/std": 0.011297739017754794, |
| "rewards/pos_12/mean": 0.0031792555819265544, |
| "rewards/pos_12/std": 0.008610495855100453, |
| "rewards/pos_13/mean": 0.0009312232374213636, |
| "rewards/pos_13/std": 0.0024096329929307104, |
| "rewards/pos_14/mean": 0.00038752187974750997, |
| "rewards/pos_14/std": 0.001308289635926485, |
| "rewards/pos_15/mean": 0.00039062501164153216, |
| "rewards/pos_15/std": 0.0013020833721384406, |
| "rewards/pos_16/mean": -0.0002548443153500557, |
| "rewards/pos_16/std": 0.0005096886772662402, |
| "rewards/pos_17/mean": 0.0, |
| "rewards/pos_17/std": 0.0, |
| "rewards/pos_18/mean": 0.0, |
| "rewards/pos_18/std": 0.0, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.09070989573374391, |
| "rewards/pos_2/std": 0.07176251346245408, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.05962197887711227, |
| "rewards/pos_3/std": 0.0604378919582814, |
| "rewards/pos_4/mean": 0.040848955418914555, |
| "rewards/pos_4/std": 0.05593613050878048, |
| "rewards/pos_5/mean": 0.025873664231039584, |
| "rewards/pos_5/std": 0.04659653939306736, |
| "rewards/pos_6/mean": 0.013208038732409477, |
| "rewards/pos_6/std": 0.03292058128863573, |
| "rewards/pos_7/mean": 0.009250723465811461, |
| "rewards/pos_7/std": 0.02974128625355661, |
| "rewards/pos_8/mean": 0.004960190316584583, |
| "rewards/pos_8/std": 0.023272993229329585, |
| "rewards/pos_9/mean": 0.0034524708811659367, |
| "rewards/pos_9/std": 0.016142918425612152, |
| "step": 470 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 314.6, |
| "completions/max_terminated_length": 314.6, |
| "completions/mean_length": 256.15625762939453, |
| "completions/mean_terminated_length": 256.15625762939453, |
| "completions/min_length": 202.4, |
| "completions/min_terminated_length": 202.4, |
| "entropy": 0.22198334643617273, |
| "epoch": 0.3018867924528302, |
| "frac_reward_zero_std": 0.17500000298023224, |
| "grad_norm": 0.2422315627336502, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 24.533333333333335, |
| "items/detected_min": 19.0, |
| "kl": 0.012731472958694211, |
| "learning_rate": 1e-06, |
| "loss": 0.0159, |
| "num_tokens": 13147890.0, |
| "reward": 0.019851746130734683, |
| "reward_std": 0.021506846975535156, |
| "reward_total": 0.13185822628438473, |
| "rewards/pos_1/mean": 0.13185822516679763, |
| "rewards/pos_1/std": 0.06748964563012123, |
| "rewards/pos_10/mean": 0.004497004637960345, |
| "rewards/pos_10/std": 0.010416024201549589, |
| "rewards/pos_11/mean": 0.0031894596293568613, |
| "rewards/pos_11/std": 0.008589645940810442, |
| "rewards/pos_12/mean": 0.002049866074230522, |
| "rewards/pos_12/std": 0.0058181500295177106, |
| "rewards/pos_13/mean": 0.0006700425175949932, |
| "rewards/pos_13/std": 0.004630456329323351, |
| "rewards/pos_14/mean": -0.0001333114691078663, |
| "rewards/pos_14/std": 0.002069247793406248, |
| "rewards/pos_15/mean": -0.0005208333488553763, |
| "rewards/pos_15/std": 0.0010416666977107526, |
| "rewards/pos_16/mean": -0.0006371107883751392, |
| "rewards/pos_16/std": 0.0012742216931656003, |
| "rewards/pos_17/mean": -0.0003747069742530584, |
| "rewards/pos_17/std": 0.0007494139485061168, |
| "rewards/pos_18/mean": -0.00036782644456252457, |
| "rewards/pos_18/std": 0.0007356528891250491, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.0955985370092094, |
| "rewards/pos_2/std": 0.06465087793767452, |
| "rewards/pos_20/mean": -0.00011857826029881835, |
| "rewards/pos_20/std": 0.0002371565205976367, |
| "rewards/pos_3/mean": 0.05621484536677599, |
| "rewards/pos_3/std": 0.056038703257218006, |
| "rewards/pos_4/mean": 0.03342725089751184, |
| "rewards/pos_4/std": 0.04756846679374575, |
| "rewards/pos_5/mean": 0.022668213793076576, |
| "rewards/pos_5/std": 0.043966184742748736, |
| "rewards/pos_6/mean": 0.01648205746896565, |
| "rewards/pos_6/std": 0.03921035025268793, |
| "rewards/pos_7/mean": 0.015421654691454022, |
| "rewards/pos_7/std": 0.032973642507568, |
| "rewards/pos_8/mean": 0.01127681282814592, |
| "rewards/pos_8/std": 0.026968108676373958, |
| "rewards/pos_9/mean": 0.00583330043591559, |
| "rewards/pos_9/std": 0.01570929940789938, |
| "step": 480 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 328.6, |
| "completions/max_terminated_length": 328.6, |
| "completions/mean_length": 259.42708892822264, |
| "completions/mean_terminated_length": 259.42708892822264, |
| "completions/min_length": 210.7, |
| "completions/min_terminated_length": 210.7, |
| "entropy": 0.2256676376486818, |
| "epoch": 0.3081761006289308, |
| "frac_reward_zero_std": 0.1750000037252903, |
| "grad_norm": 0.29480305314064026, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 25.2875, |
| "items/detected_min": 19.0, |
| "kl": 0.011438539253625398, |
| "learning_rate": 1e-06, |
| "loss": 0.0163, |
| "num_tokens": 13409963.0, |
| "reward": 0.021010580559959635, |
| "reward_std": 0.020790635049343108, |
| "reward_total": 0.16183093758299946, |
| "rewards/pos_1/mean": 0.1618309345562011, |
| "rewards/pos_1/std": 0.05938454018905759, |
| "rewards/pos_10/mean": 0.0036000264110043643, |
| "rewards/pos_10/std": 0.01220149858854711, |
| "rewards/pos_11/mean": 0.0015766112133860587, |
| "rewards/pos_11/std": 0.009965774789452552, |
| "rewards/pos_12/mean": 0.0016310159582644701, |
| "rewards/pos_12/std": 0.008706640219315886, |
| "rewards/pos_13/mean": -2.635318087413861e-05, |
| "rewards/pos_13/std": 0.0038622493389993914, |
| "rewards/pos_14/mean": -1.2412527576088906e-05, |
| "rewards/pos_14/std": 0.0021081584505736827, |
| "rewards/pos_15/mean": -1.164153172010054e-11, |
| "rewards/pos_15/std": 0.0020833333255723127, |
| "rewards/pos_16/mean": -0.0007645329460501671, |
| "rewards/pos_16/std": 0.0015290660317987203, |
| "rewards/pos_17/mean": -0.0004996092990040775, |
| "rewards/pos_17/std": 0.0007880588760599503, |
| "rewards/pos_18/mean": -0.0004904352594166994, |
| "rewards/pos_18/std": 0.0009808705188333988, |
| "rewards/pos_19/mean": 4.625929407134921e-19, |
| "rewards/pos_19/std": 9.251858814269843e-19, |
| "rewards/pos_2/mean": 0.0853801101911813, |
| "rewards/pos_2/std": 0.055204817373305556, |
| "rewards/pos_20/mean": 4.625929407134921e-19, |
| "rewards/pos_20/std": 9.251858814269843e-19, |
| "rewards/pos_3/mean": 0.06138521353714168, |
| "rewards/pos_3/std": 0.06057459069415927, |
| "rewards/pos_4/mean": 0.04186704335734248, |
| "rewards/pos_4/std": 0.05362486112862826, |
| "rewards/pos_5/mean": 0.02881924912217073, |
| "rewards/pos_5/std": 0.04141271626576781, |
| "rewards/pos_6/mean": 0.018322084681130946, |
| "rewards/pos_6/std": 0.03851036261767149, |
| "rewards/pos_7/mean": 0.010271314741112291, |
| "rewards/pos_7/std": 0.030041778273880482, |
| "rewards/pos_8/mean": 0.004253976827021689, |
| "rewards/pos_8/std": 0.02209998187609017, |
| "rewards/pos_9/mean": 0.003067348466720432, |
| "rewards/pos_9/std": 0.012733368459157646, |
| "step": 490 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 319.2, |
| "completions/max_terminated_length": 319.2, |
| "completions/mean_length": 257.12708892822263, |
| "completions/mean_terminated_length": 257.12708892822263, |
| "completions/min_length": 205.3, |
| "completions/min_terminated_length": 205.3, |
| "entropy": 0.2168938455792765, |
| "epoch": 0.31446540880503143, |
| "frac_reward_zero_std": 0.1916666693985462, |
| "grad_norm": 0.3637041747570038, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 24.4, |
| "items/detected_min": 19.0, |
| "kl": 0.011324638859756911, |
| "learning_rate": 1e-06, |
| "loss": 0.0145, |
| "num_tokens": 13669780.0, |
| "reward": 0.028080825228244066, |
| "reward_std": 0.026594148110598327, |
| "reward_total": 0.19896745309233665, |
| "rewards/pos_1/mean": 0.19896744936704636, |
| "rewards/pos_1/std": 0.09368667528033256, |
| "rewards/pos_10/mean": 0.0030387546285055578, |
| "rewards/pos_10/std": 0.011216811486519873, |
| "rewards/pos_11/mean": 0.0017206659773364662, |
| "rewards/pos_11/std": 0.00750925000756979, |
| "rewards/pos_12/mean": 0.001734267210122198, |
| "rewards/pos_12/std": 0.005157523066736757, |
| "rewards/pos_13/mean": 0.0003465388319455087, |
| "rewards/pos_13/std": 0.0034290103474631907, |
| "rewards/pos_14/mean": 0.0004972761031240225, |
| "rewards/pos_14/std": 0.003127535805106163, |
| "rewards/pos_15/mean": 0.0007701053051277995, |
| "rewards/pos_15/std": 0.002581877401098609, |
| "rewards/pos_16/mean": 0.00012742215767502785, |
| "rewards/pos_16/std": 0.001783910347148776, |
| "rewards/pos_17/mean": -0.0004996092990040779, |
| "rewards/pos_17/std": 0.0009992185980081559, |
| "rewards/pos_18/mean": -0.00012260881485417485, |
| "rewards/pos_18/std": 0.0002452176297083497, |
| "rewards/pos_19/mean": -0.00012050948571413755, |
| "rewards/pos_19/std": 0.0002410189714282751, |
| "rewards/pos_2/mean": 0.11547301951795816, |
| "rewards/pos_2/std": 0.08222815282642841, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.08356678988784552, |
| "rewards/pos_3/std": 0.07231327965855598, |
| "rewards/pos_4/mean": 0.05913681124026576, |
| "rewards/pos_4/std": 0.06760933250188828, |
| "rewards/pos_5/mean": 0.03961076997220516, |
| "rewards/pos_5/std": 0.0505744569003582, |
| "rewards/pos_6/mean": 0.025688346242532134, |
| "rewards/pos_6/std": 0.04620512621477246, |
| "rewards/pos_7/mean": 0.01850563199259341, |
| "rewards/pos_7/std": 0.03958346555009484, |
| "rewards/pos_8/mean": 0.008969539403915405, |
| "rewards/pos_8/std": 0.026932531129568817, |
| "rewards/pos_9/mean": 0.004205800127238035, |
| "rewards/pos_9/std": 0.016458538291044534, |
| "step": 500 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 317.6, |
| "completions/max_terminated_length": 317.6, |
| "completions/mean_length": 257.35000762939455, |
| "completions/mean_terminated_length": 257.35000762939455, |
| "completions/min_length": 206.9, |
| "completions/min_terminated_length": 206.9, |
| "entropy": 0.2289095859353741, |
| "epoch": 0.32075471698113206, |
| "frac_reward_zero_std": 0.20000000298023224, |
| "grad_norm": 0.3724060654640198, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 24.945833333333333, |
| "items/detected_min": 19.0, |
| "kl": 0.012526460485726905, |
| "learning_rate": 1e-06, |
| "loss": 0.0161, |
| "num_tokens": 13941636.0, |
| "reward": 0.013600436504930258, |
| "reward_std": 0.013879508594982326, |
| "reward_total": 0.12234384417533875, |
| "rewards/pos_1/mean": 0.12234384194016457, |
| "rewards/pos_1/std": 0.06252944990992546, |
| "rewards/pos_10/mean": 0.0011139262584038078, |
| "rewards/pos_10/std": 0.005974171520210803, |
| "rewards/pos_11/mean": 0.0007149806478992105, |
| "rewards/pos_11/std": 0.004180813068524003, |
| "rewards/pos_12/mean": 0.0007421830203384161, |
| "rewards/pos_12/std": 0.0037812422029674052, |
| "rewards/pos_13/mean": 0.00033969798823818564, |
| "rewards/pos_13/std": 0.0032613215735182166, |
| "rewards/pos_14/mean": 0.0008903696667402983, |
| "rewards/pos_14/std": 0.002313985209912062, |
| "rewards/pos_15/mean": -0.00016060643829405333, |
| "rewards/pos_15/std": 0.0022829538211226468, |
| "rewards/pos_16/mean": -0.00027409770991653225, |
| "rewards/pos_16/std": 0.002509936527349055, |
| "rewards/pos_17/mean": 0.00024063060991466023, |
| "rewards/pos_17/std": 0.0014804798178374773, |
| "rewards/pos_18/mean": 0.0004904352594166992, |
| "rewards/pos_18/std": 0.0009808705188333994, |
| "rewards/pos_19/mean": -0.0001205094857141378, |
| "rewards/pos_19/std": 0.0002410189714282756, |
| "rewards/pos_2/mean": 0.05304830120876432, |
| "rewards/pos_2/std": 0.04234171658754349, |
| "rewards/pos_20/mean": -2.3129647035674607e-19, |
| "rewards/pos_20/std": 4.625929407134921e-19, |
| "rewards/pos_3/mean": 0.03204201725311577, |
| "rewards/pos_3/std": 0.02614194988273084, |
| "rewards/pos_4/mean": 0.024951637379126625, |
| "rewards/pos_4/std": 0.02837535673752427, |
| "rewards/pos_5/mean": 0.01782722012139857, |
| "rewards/pos_5/std": 0.030218360666185617, |
| "rewards/pos_6/mean": 0.008628727751784027, |
| "rewards/pos_6/std": 0.022143259178847075, |
| "rewards/pos_7/mean": 0.00442632056074217, |
| "rewards/pos_7/std": 0.017501901322975754, |
| "rewards/pos_8/mean": 0.0035614772117696703, |
| "rewards/pos_8/std": 0.013929071929305792, |
| "rewards/pos_9/mean": 0.001202161784749478, |
| "rewards/pos_9/std": 0.00740228877402842, |
| "step": 510 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.008333333333333337, |
| "completions/max_length": 345.0, |
| "completions/max_terminated_length": 325.2, |
| "completions/mean_length": 259.12084197998047, |
| "completions/mean_terminated_length": 257.01781158447267, |
| "completions/min_length": 203.6, |
| "completions/min_terminated_length": 203.6, |
| "entropy": 0.21795971781636278, |
| "epoch": 0.3270440251572327, |
| "frac_reward_zero_std": 0.24166667610406875, |
| "grad_norm": 0.44123509526252747, |
| "items/detected_max": 47.0, |
| "items/detected_mean": 24.404166666666665, |
| "items/detected_min": 19.0, |
| "kl": 0.009536893166174802, |
| "learning_rate": 1e-06, |
| "loss": 0.0151, |
| "num_tokens": 14200682.0, |
| "reward": 0.018912672251462936, |
| "reward_std": 0.020285651087760925, |
| "reward_total": 0.16123745497316122, |
| "rewards/pos_1/mean": 0.1612374499440193, |
| "rewards/pos_1/std": 0.07130067069083453, |
| "rewards/pos_10/mean": 0.002000838378444314, |
| "rewards/pos_10/std": 0.009915737924166024, |
| "rewards/pos_11/mean": 0.0014460663311183452, |
| "rewards/pos_11/std": 0.006357286497950554, |
| "rewards/pos_12/mean": 0.002331364236306399, |
| "rewards/pos_12/std": 0.0051678218180313705, |
| "rewards/pos_13/mean": 0.0012172291171737015, |
| "rewards/pos_13/std": 0.0028921225806698204, |
| "rewards/pos_14/mean": 0.0001333114691078663, |
| "rewards/pos_14/std": 0.001866360567510128, |
| "rewards/pos_15/mean": -0.00026041667442768814, |
| "rewards/pos_15/std": 0.0005208333488553763, |
| "rewards/pos_16/mean": -0.0002548443153500557, |
| "rewards/pos_16/std": 0.0005096886772662402, |
| "rewards/pos_17/mean": -0.0004996092990040779, |
| "rewards/pos_17/std": 0.0009992185980081559, |
| "rewards/pos_18/mean": -0.00012260881485417485, |
| "rewards/pos_18/std": 0.0002452176297083497, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.08295135851949453, |
| "rewards/pos_2/std": 0.06343129687011242, |
| "rewards/pos_20/mean": -0.0004743130411952734, |
| "rewards/pos_20/std": 0.0009486260823905468, |
| "rewards/pos_3/mean": 0.05000346293672919, |
| "rewards/pos_3/std": 0.06582604087889195, |
| "rewards/pos_4/mean": 0.0349484842736274, |
| "rewards/pos_4/std": 0.051855390705168244, |
| "rewards/pos_5/mean": 0.01802287108730525, |
| "rewards/pos_5/std": 0.038701156992465256, |
| "rewards/pos_6/mean": 0.012096035139256856, |
| "rewards/pos_6/std": 0.02850966826081276, |
| "rewards/pos_7/mean": 0.006792393838986754, |
| "rewards/pos_7/std": 0.02349440399557352, |
| "rewards/pos_8/mean": 0.005078107689041644, |
| "rewards/pos_8/std": 0.019577187206596135, |
| "rewards/pos_9/mean": 0.0016062510956544429, |
| "rewards/pos_9/std": 0.01359426868148148, |
| "step": 520 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 335.6, |
| "completions/max_terminated_length": 335.6, |
| "completions/mean_length": 257.13334503173826, |
| "completions/mean_terminated_length": 257.13334503173826, |
| "completions/min_length": 201.4, |
| "completions/min_terminated_length": 201.4, |
| "entropy": 0.2276484333910048, |
| "epoch": 0.3333333333333333, |
| "frac_reward_zero_std": 0.16666667088866233, |
| "grad_norm": 0.39736422896385193, |
| "items/detected_max": 31.0, |
| "items/detected_mean": 24.495833333333334, |
| "items/detected_min": 17.0, |
| "kl": 0.011707671616750303, |
| "learning_rate": 1e-06, |
| "loss": 0.0136, |
| "num_tokens": 14494766.0, |
| "reward": 0.021821795543655754, |
| "reward_std": 0.018616222217679022, |
| "reward_total": 0.1836221881210804, |
| "rewards/pos_1/mean": 0.18362218141555786, |
| "rewards/pos_1/std": 0.06593040227890015, |
| "rewards/pos_10/mean": -8.139674318954348e-05, |
| "rewards/pos_10/std": 0.0068946087034419175, |
| "rewards/pos_11/mean": 0.00041771335527300834, |
| "rewards/pos_11/std": 0.005574377067387104, |
| "rewards/pos_12/mean": 0.0, |
| "rewards/pos_12/std": 0.0020140345441177487, |
| "rewards/pos_13/mean": -0.0008207797771319747, |
| "rewards/pos_13/std": 0.001410291320644319, |
| "rewards/pos_14/mean": -0.0002666229382157326, |
| "rewards/pos_14/std": 0.0005332458764314652, |
| "rewards/pos_15/mean": -0.0009114583604969084, |
| "rewards/pos_15/std": 0.001822916720993817, |
| "rewards/pos_16/mean": -0.00012742215767502785, |
| "rewards/pos_16/std": 0.0002548443386331201, |
| "rewards/pos_17/mean": -0.0003747069742530584, |
| "rewards/pos_17/std": 0.0007494139485061168, |
| "rewards/pos_18/mean": -0.00012260881485417485, |
| "rewards/pos_18/std": 0.0002452176297083497, |
| "rewards/pos_19/mean": -0.00012050948571413755, |
| "rewards/pos_19/std": 0.0002410189714282751, |
| "rewards/pos_2/mean": 0.09686637232080102, |
| "rewards/pos_2/std": 0.058121581189334394, |
| "rewards/pos_20/mean": -0.00011857826029881835, |
| "rewards/pos_20/std": 0.0002371565205976367, |
| "rewards/pos_3/mean": 0.06536929956637323, |
| "rewards/pos_3/std": 0.05751038622111082, |
| "rewards/pos_4/mean": 0.03915250832214952, |
| "rewards/pos_4/std": 0.048235630802810195, |
| "rewards/pos_5/mean": 0.02973933550529182, |
| "rewards/pos_5/std": 0.03755170316435397, |
| "rewards/pos_6/mean": 0.012826546526048333, |
| "rewards/pos_6/std": 0.030991469509899616, |
| "rewards/pos_7/mean": 0.008809663902502507, |
| "rewards/pos_7/std": 0.023566680843941867, |
| "rewards/pos_8/mean": 0.00226218614843674, |
| "rewards/pos_8/std": 0.016075075697153808, |
| "rewards/pos_9/mean": 0.00031418022699654103, |
| "rewards/pos_9/std": 0.01436436150688678, |
| "step": 530 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.002083333333333337, |
| "completions/max_length": 348.2, |
| "completions/max_terminated_length": 328.0, |
| "completions/mean_length": 259.75834197998046, |
| "completions/mean_terminated_length": 259.19579620361327, |
| "completions/min_length": 203.2, |
| "completions/min_terminated_length": 203.2, |
| "entropy": 0.23366477421174447, |
| "epoch": 0.33962264150943394, |
| "frac_reward_zero_std": 0.18333333730697632, |
| "grad_norm": 0.2925439774990082, |
| "items/detected_max": 50.0, |
| "items/detected_mean": 24.9375, |
| "items/detected_min": 19.0, |
| "kl": 0.010972671315539629, |
| "learning_rate": 1e-06, |
| "loss": 0.016, |
| "num_tokens": 14773898.0, |
| "reward": 0.02019151346758008, |
| "reward_std": 0.018987514078617096, |
| "reward_total": 0.17083178609609603, |
| "rewards/pos_1/mean": 0.17083178013563155, |
| "rewards/pos_1/std": 0.06076769046485424, |
| "rewards/pos_10/mean": 0.0035903325304389, |
| "rewards/pos_10/std": 0.011628140788525343, |
| "rewards/pos_11/mean": 0.0027449949644505977, |
| "rewards/pos_11/std": 0.009627606812864542, |
| "rewards/pos_12/mean": 0.002477097907103598, |
| "rewards/pos_12/std": 0.0066403827862814065, |
| "rewards/pos_13/mean": 0.0019338638870976866, |
| "rewards/pos_13/std": 0.0057128167012706395, |
| "rewards/pos_14/mean": 0.002217912580817938, |
| "rewards/pos_14/std": 0.004919342882931233, |
| "rewards/pos_15/mean": 0.0012909386656247079, |
| "rewards/pos_15/std": 0.0036235440289601683, |
| "rewards/pos_16/mean": 0.00012742215767502785, |
| "rewards/pos_16/std": 0.0017839103704318405, |
| "rewards/pos_17/mean": -0.00024980464950203897, |
| "rewards/pos_17/std": 0.0004996092990040779, |
| "rewards/pos_18/mean": -0.0002452176297083497, |
| "rewards/pos_18/std": 0.0004904352594166994, |
| "rewards/pos_19/mean": -0.00012050948571413755, |
| "rewards/pos_19/std": 0.0002410189714282751, |
| "rewards/pos_2/mean": 0.08002098277211189, |
| "rewards/pos_2/std": 0.0625394044443965, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.050222796481102706, |
| "rewards/pos_3/std": 0.047927550598979, |
| "rewards/pos_4/mean": 0.03301227022893727, |
| "rewards/pos_4/std": 0.045436117053031924, |
| "rewards/pos_5/mean": 0.01862254913430661, |
| "rewards/pos_5/std": 0.036664932314306495, |
| "rewards/pos_6/mean": 0.013114646542817354, |
| "rewards/pos_6/std": 0.024240608420223, |
| "rewards/pos_7/mean": 0.010791311541106552, |
| "rewards/pos_7/std": 0.022284213569946587, |
| "rewards/pos_8/mean": 0.00868327144999057, |
| "rewards/pos_8/std": 0.01720749761443585, |
| "rewards/pos_9/mean": 0.00476360940374434, |
| "rewards/pos_9/std": 0.017515426012687386, |
| "step": 540 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 326.3, |
| "completions/max_terminated_length": 326.3, |
| "completions/mean_length": 257.66875610351565, |
| "completions/mean_terminated_length": 257.66875610351565, |
| "completions/min_length": 205.4, |
| "completions/min_terminated_length": 205.4, |
| "entropy": 0.2219298164670666, |
| "epoch": 0.34591194968553457, |
| "frac_reward_zero_std": 0.16666667014360428, |
| "grad_norm": 0.18160945177078247, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 24.6875, |
| "items/detected_min": 19.0, |
| "kl": 0.013025570806348696, |
| "learning_rate": 1e-06, |
| "loss": 0.0135, |
| "num_tokens": 15045635.0, |
| "reward": 0.02094852374866605, |
| "reward_std": 0.018986396910622715, |
| "reward_total": 0.1696576789021492, |
| "rewards/pos_1/mean": 0.16965767368674278, |
| "rewards/pos_1/std": 0.06637014597654342, |
| "rewards/pos_10/mean": -0.0004375260672532022, |
| "rewards/pos_10/std": 0.004892604262568057, |
| "rewards/pos_11/mean": 0.0004868887132033706, |
| "rewards/pos_11/std": 0.0027171708177775146, |
| "rewards/pos_12/mean": 0.000505023670848459, |
| "rewards/pos_12/std": 0.003104969975538552, |
| "rewards/pos_13/mean": 0.0003840367076918483, |
| "rewards/pos_13/std": 0.0032727377722039817, |
| "rewards/pos_14/mean": -0.00014572401996701956, |
| "rewards/pos_14/std": 0.0023747813887894154, |
| "rewards/pos_15/mean": 0.00026041667442768814, |
| "rewards/pos_15/std": 0.0015625000465661286, |
| "rewards/pos_16/mean": -0.0005096886307001114, |
| "rewards/pos_16/std": 0.0008039575535804033, |
| "rewards/pos_17/mean": -0.00012490232475101948, |
| "rewards/pos_17/std": 0.00024980464950203897, |
| "rewards/pos_18/mean": 0.0, |
| "rewards/pos_18/std": 0.0, |
| "rewards/pos_19/mean": -0.00012050948571413755, |
| "rewards/pos_19/std": 0.0002410189714282751, |
| "rewards/pos_2/mean": 0.10143493013456464, |
| "rewards/pos_2/std": 0.06443125363439321, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.06565358490915969, |
| "rewards/pos_3/std": 0.0606054674834013, |
| "rewards/pos_4/mean": 0.04483333106618374, |
| "rewards/pos_4/std": 0.05223893783986568, |
| "rewards/pos_5/mean": 0.017703550728037955, |
| "rewards/pos_5/std": 0.040874285716563465, |
| "rewards/pos_6/mean": 0.013590086996555329, |
| "rewards/pos_6/std": 0.030414795130491258, |
| "rewards/pos_7/mean": 0.006060151034034789, |
| "rewards/pos_7/std": 0.022422701702453196, |
| "rewards/pos_8/mean": 0.0007093379856087268, |
| "rewards/pos_8/std": 0.01532193636521697, |
| "rewards/pos_9/mean": -0.0009702042210847139, |
| "rewards/pos_9/std": 0.00782886240631342, |
| "step": 550 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 318.8, |
| "completions/max_terminated_length": 318.8, |
| "completions/mean_length": 255.1791748046875, |
| "completions/mean_terminated_length": 255.1791748046875, |
| "completions/min_length": 206.8, |
| "completions/min_terminated_length": 206.8, |
| "entropy": 0.2227155176612238, |
| "epoch": 0.3522012578616352, |
| "frac_reward_zero_std": 0.20833333507180213, |
| "grad_norm": 0.4057275652885437, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 24.0875, |
| "items/detected_min": 18.0, |
| "kl": 0.013039527038442127, |
| "learning_rate": 1e-06, |
| "loss": 0.0158, |
| "num_tokens": 15341969.0, |
| "reward": 0.02573305545374751, |
| "reward_std": 0.025089992582798003, |
| "reward_total": 0.15084369219839572, |
| "rewards/pos_1/mean": 0.1508436843752861, |
| "rewards/pos_1/std": 0.08285234421491623, |
| "rewards/pos_10/mean": 0.0003826912841759622, |
| "rewards/pos_10/std": 0.00601864627096802, |
| "rewards/pos_11/mean": -0.0006290163844823837, |
| "rewards/pos_11/std": 0.0025642702355980875, |
| "rewards/pos_12/mean": -2.975030802190304e-05, |
| "rewards/pos_12/std": 0.0021924841217696666, |
| "rewards/pos_13/mean": -0.00015073727117851377, |
| "rewards/pos_13/std": 0.002434458048082888, |
| "rewards/pos_14/mean": -0.0002666229382157326, |
| "rewards/pos_14/std": 0.0026662293821573257, |
| "rewards/pos_15/mean": -0.00026041667442768814, |
| "rewards/pos_15/std": 0.0005208333488553763, |
| "rewards/pos_16/mean": -0.0002548443153500557, |
| "rewards/pos_16/std": 0.0005096886772662402, |
| "rewards/pos_17/mean": 0.0, |
| "rewards/pos_17/std": 0.0, |
| "rewards/pos_18/mean": 0.0, |
| "rewards/pos_18/std": 0.0, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.11551399044692516, |
| "rewards/pos_2/std": 0.08242853209376336, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.0830869296565652, |
| "rewards/pos_3/std": 0.0705016914755106, |
| "rewards/pos_4/mean": 0.058664625696837905, |
| "rewards/pos_4/std": 0.06315769106149674, |
| "rewards/pos_5/mean": 0.04447245486080646, |
| "rewards/pos_5/std": 0.057740448787808415, |
| "rewards/pos_6/mean": 0.03110660398378968, |
| "rewards/pos_6/std": 0.046270192787051204, |
| "rewards/pos_7/mean": 0.020051701087504627, |
| "rewards/pos_7/std": 0.0408051636070013, |
| "rewards/pos_8/mean": 0.008981027461898824, |
| "rewards/pos_8/std": 0.02832955224439502, |
| "rewards/pos_9/mean": 0.003148760786280036, |
| "rewards/pos_9/std": 0.012807594193145633, |
| "step": 560 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 315.7, |
| "completions/max_terminated_length": 315.7, |
| "completions/mean_length": 255.00833740234376, |
| "completions/mean_terminated_length": 255.00833740234376, |
| "completions/min_length": 200.4, |
| "completions/min_terminated_length": 200.4, |
| "entropy": 0.2198466069375475, |
| "epoch": 0.3584905660377358, |
| "frac_reward_zero_std": 0.18333333879709243, |
| "grad_norm": 0.2640518844127655, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 24.033333333333335, |
| "items/detected_min": 17.0, |
| "kl": 0.011254959223151673, |
| "learning_rate": 1e-06, |
| "loss": 0.014, |
| "num_tokens": 15608789.0, |
| "reward": 0.023907679412513972, |
| "reward_std": 0.02189104985445738, |
| "reward_total": 0.19489617720246316, |
| "rewards/pos_1/mean": 0.19489617235958576, |
| "rewards/pos_1/std": 0.08845191560685635, |
| "rewards/pos_10/mean": 0.002839635347481817, |
| "rewards/pos_10/std": 0.008555516763590277, |
| "rewards/pos_11/mean": 0.002113221539184451, |
| "rewards/pos_11/std": 0.006886483822017908, |
| "rewards/pos_12/mean": 0.0005332458764314652, |
| "rewards/pos_12/std": 0.003080526296980679, |
| "rewards/pos_13/mean": -0.00015073727117851377, |
| "rewards/pos_13/std": 0.002434458048082888, |
| "rewards/pos_14/mean": 0.00039993440732359885, |
| "rewards/pos_14/std": 0.0013331146910786629, |
| "rewards/pos_15/mean": -0.00013020833721384407, |
| "rewards/pos_15/std": 0.00026041667442768814, |
| "rewards/pos_16/mean": -0.0002548443153500557, |
| "rewards/pos_16/std": 0.0005096886772662402, |
| "rewards/pos_17/mean": 0.0, |
| "rewards/pos_17/std": 0.0, |
| "rewards/pos_18/mean": 0.0, |
| "rewards/pos_18/std": 0.0, |
| "rewards/pos_19/mean": -0.00012050948571413755, |
| "rewards/pos_19/std": 0.0002410189714282751, |
| "rewards/pos_2/mean": 0.09868481699377299, |
| "rewards/pos_2/std": 0.0687787925824523, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.06857036959845572, |
| "rewards/pos_3/std": 0.06220374419353902, |
| "rewards/pos_4/mean": 0.04410428605042398, |
| "rewards/pos_4/std": 0.05569797847419977, |
| "rewards/pos_5/mean": 0.024646718322765084, |
| "rewards/pos_5/std": 0.04680635128170252, |
| "rewards/pos_6/mean": 0.01727755949832499, |
| "rewards/pos_6/std": 0.03226131461560726, |
| "rewards/pos_7/mean": 0.010465893859509379, |
| "rewards/pos_7/std": 0.02676454761531204, |
| "rewards/pos_8/mean": 0.008822847646661103, |
| "rewards/pos_8/std": 0.018864544108510017, |
| "rewards/pos_9/mean": 0.0054551504668779675, |
| "rewards/pos_9/std": 0.014690567925572395, |
| "step": 570 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 318.2, |
| "completions/max_terminated_length": 318.2, |
| "completions/mean_length": 252.0562545776367, |
| "completions/mean_terminated_length": 252.0562545776367, |
| "completions/min_length": 200.3, |
| "completions/min_terminated_length": 200.3, |
| "entropy": 0.22020154868563016, |
| "epoch": 0.36477987421383645, |
| "frac_reward_zero_std": 0.20833333805203438, |
| "grad_norm": 0.19639542698860168, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 23.941666666666666, |
| "items/detected_min": 18.0, |
| "kl": 0.012878992815482585, |
| "learning_rate": 1e-06, |
| "loss": 0.0142, |
| "num_tokens": 15882596.0, |
| "reward": 0.01625635549426079, |
| "reward_std": 0.020171427354216575, |
| "reward_total": 0.13932952359318734, |
| "rewards/pos_1/mean": 0.13932951912283897, |
| "rewards/pos_1/std": 0.06653505899012088, |
| "rewards/pos_10/mean": 0.0011549281771294773, |
| "rewards/pos_10/std": 0.007822985318489373, |
| "rewards/pos_11/mean": 0.0019340602215379477, |
| "rewards/pos_11/std": 0.006773776095360518, |
| "rewards/pos_12/mean": 0.0022382270311936737, |
| "rewards/pos_12/std": 0.005602446431294083, |
| "rewards/pos_13/mean": 0.0021132876281626523, |
| "rewards/pos_13/std": 0.006415321328677237, |
| "rewards/pos_14/mean": 0.0011661666445434093, |
| "rewards/pos_14/std": 0.003782981541007757, |
| "rewards/pos_15/mean": 0.00010898428736254573, |
| "rewards/pos_15/std": 0.0017804686212912202, |
| "rewards/pos_16/mean": 0.00024476498365402224, |
| "rewards/pos_16/std": 0.0013653157511726022, |
| "rewards/pos_17/mean": 0.0003747069742530584, |
| "rewards/pos_17/std": 0.0012490232475101948, |
| "rewards/pos_18/mean": -0.00012260881485417485, |
| "rewards/pos_18/std": 0.0002452176297083497, |
| "rewards/pos_19/mean": -0.00036152845714241264, |
| "rewards/pos_19/std": 0.0005193236982449889, |
| "rewards/pos_2/mean": 0.07585011683404445, |
| "rewards/pos_2/std": 0.06930038519203663, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.04002528414130211, |
| "rewards/pos_3/std": 0.057030481100082395, |
| "rewards/pos_4/mean": 0.026532804872840644, |
| "rewards/pos_4/std": 0.04809789825230837, |
| "rewards/pos_5/mean": 0.015283433720469475, |
| "rewards/pos_5/std": 0.04078430999070406, |
| "rewards/pos_6/mean": 0.008291334100067615, |
| "rewards/pos_6/std": 0.03033105507493019, |
| "rewards/pos_7/mean": 0.006710097620089073, |
| "rewards/pos_7/std": 0.024835748365148902, |
| "rewards/pos_8/mean": 0.003944838512688875, |
| "rewards/pos_8/std": 0.019218799006193877, |
| "rewards/pos_9/mean": 0.00030867705354467037, |
| "rewards/pos_9/std": 0.011737936455756427, |
| "step": 580 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 315.5, |
| "completions/max_terminated_length": 315.5, |
| "completions/mean_length": 256.16459197998046, |
| "completions/mean_terminated_length": 256.16459197998046, |
| "completions/min_length": 208.2, |
| "completions/min_terminated_length": 208.2, |
| "entropy": 0.20928714523712794, |
| "epoch": 0.3710691823899371, |
| "frac_reward_zero_std": 0.1916666716337204, |
| "grad_norm": 0.19928312301635742, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 24.333333333333332, |
| "items/detected_min": 20.0, |
| "kl": 0.010333529350949297, |
| "learning_rate": 1e-06, |
| "loss": 0.0129, |
| "num_tokens": 16140451.0, |
| "reward": 0.018881635484285654, |
| "reward_std": 0.01549559785053134, |
| "reward_total": 0.15202718488872052, |
| "rewards/pos_1/mean": 0.1520271772518754, |
| "rewards/pos_1/std": 0.045391360158100726, |
| "rewards/pos_10/mean": -0.00017164183082059027, |
| "rewards/pos_10/std": 0.004313099361024797, |
| "rewards/pos_11/mean": -0.0007264139130711555, |
| "rewards/pos_11/std": 0.0036130989901721478, |
| "rewards/pos_12/mean": -0.0009852433227933942, |
| "rewards/pos_12/std": 0.0017325364518910647, |
| "rewards/pos_13/mean": -0.0008207797771319747, |
| "rewards/pos_13/std": 0.001410291320644319, |
| "rewards/pos_14/mean": -0.0007998688146471977, |
| "rewards/pos_14/std": 0.0013743613380938768, |
| "rewards/pos_15/mean": -0.0006510416860692203, |
| "rewards/pos_15/std": 0.0013020833721384406, |
| "rewards/pos_16/mean": -0.00038226647302508354, |
| "rewards/pos_16/std": 0.0007645330158993601, |
| "rewards/pos_17/mean": -0.00012490232475101948, |
| "rewards/pos_17/std": 0.00024980464950203897, |
| "rewards/pos_18/mean": -0.0002452176297083497, |
| "rewards/pos_18/std": 0.0004904352594166994, |
| "rewards/pos_19/mean": 0.0, |
| "rewards/pos_19/std": 0.0, |
| "rewards/pos_2/mean": 0.08341998513787985, |
| "rewards/pos_2/std": 0.052314735017716885, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.0601381441578269, |
| "rewards/pos_3/std": 0.04900498539209366, |
| "rewards/pos_4/mean": 0.03645331035368145, |
| "rewards/pos_4/std": 0.03896988406777382, |
| "rewards/pos_5/mean": 0.024979627598077057, |
| "rewards/pos_5/std": 0.03800388928502798, |
| "rewards/pos_6/mean": 0.01494928018655628, |
| "rewards/pos_6/std": 0.026825455389916895, |
| "rewards/pos_7/mean": 0.007466998533345759, |
| "rewards/pos_7/std": 0.022271700343117116, |
| "rewards/pos_8/mean": 0.0020789593312656508, |
| "rewards/pos_8/std": 0.01322312243282795, |
| "rewards/pos_9/mean": 0.0010265630786307155, |
| "rewards/pos_9/std": 0.008656562282703817, |
| "step": 590 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 316.6, |
| "completions/max_terminated_length": 316.6, |
| "completions/mean_length": 256.9187576293945, |
| "completions/mean_terminated_length": 256.9187576293945, |
| "completions/min_length": 200.8, |
| "completions/min_terminated_length": 200.8, |
| "entropy": 0.2289535324089229, |
| "epoch": 0.37735849056603776, |
| "frac_reward_zero_std": 0.2166666716337204, |
| "grad_norm": 0.11284702271223068, |
| "items/detected_max": 30.0, |
| "items/detected_mean": 24.408333333333335, |
| "items/detected_min": 20.0, |
| "kl": 0.014122579808463343, |
| "learning_rate": 1e-06, |
| "loss": 0.014, |
| "num_tokens": 16420668.0, |
| "reward": 0.020397144777234644, |
| "reward_std": 0.01687405025586486, |
| "reward_total": 0.18865224489709362, |
| "rewards/pos_1/mean": 0.18865224173059686, |
| "rewards/pos_1/std": 0.06502700969576836, |
| "rewards/pos_10/mean": -0.00011006370186805725, |
| "rewards/pos_10/std": 0.007006493420340121, |
| "rewards/pos_11/mean": 0.0002226763404905796, |
| "rewards/pos_11/std": 0.0039321394637227055, |
| "rewards/pos_12/mean": 0.00024987878277897834, |
| "rewards/pos_12/std": 0.0034018342383205892, |
| "rewards/pos_13/mean": -0.0002735932474024594, |
| "rewards/pos_13/std": 0.004693410592153668, |
| "rewards/pos_14/mean": -0.0006665573455393314, |
| "rewards/pos_14/std": 0.0013331146910786629, |
| "rewards/pos_15/mean": -0.0005208333488553763, |
| "rewards/pos_15/std": 0.0010416666977107526, |
| "rewards/pos_16/mean": 0.0, |
| "rewards/pos_16/std": 0.0, |
| "rewards/pos_17/mean": -0.00012490232475101948, |
| "rewards/pos_17/std": 0.00024980464950203897, |
| "rewards/pos_18/mean": -0.0002452176297083497, |
| "rewards/pos_18/std": 0.0004904352594166994, |
| "rewards/pos_19/mean": -0.00012050948571413755, |
| "rewards/pos_19/std": 0.0002410189714282751, |
| "rewards/pos_2/mean": 0.0862714304268593, |
| "rewards/pos_2/std": 0.05415545515716076, |
| "rewards/pos_20/mean": 0.0, |
| "rewards/pos_20/std": 0.0, |
| "rewards/pos_3/mean": 0.05922614808077924, |
| "rewards/pos_3/std": 0.0429378055036068, |
| "rewards/pos_4/mean": 0.04153857089113444, |
| "rewards/pos_4/std": 0.03988835141062737, |
| "rewards/pos_5/mean": 0.018643930507823824, |
| "rewards/pos_5/std": 0.038961370615288614, |
| "rewards/pos_6/mean": 0.01132265494670719, |
| "rewards/pos_6/std": 0.030610042996704577, |
| "rewards/pos_7/mean": 0.003978243132587522, |
| "rewards/pos_7/std": 0.021603101422078906, |
| "rewards/pos_8/mean": 0.0007107637298759073, |
| "rewards/pos_8/std": 0.01438979608938098, |
| "rewards/pos_9/mean": -0.0008119920123135671, |
| "rewards/pos_9/std": 0.007518141204491258, |
| "step": 600 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 3180, |
| "num_input_tokens_seen": 16420668, |
| "num_train_epochs": 2, |
| "save_steps": 200, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|