| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.07421150278293136, | |
| "eval_steps": 500, | |
| "global_step": 80, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 238.55, | |
| "epoch": 0.0009276437847866419, | |
| "grad_norm": 0.6362782716751099, | |
| "kl": 0.0, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0, | |
| "reward": 2.5, | |
| "reward_std": 1.3948675394058228, | |
| "rewards/execution_reward_func": 0.3, | |
| "rewards/import_check_reward_func": 0.65, | |
| "rewards/match_reward_func": 0.75, | |
| "rewards/syntax_reward_func": 0.8, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 89.2, | |
| "epoch": 0.0018552875695732839, | |
| "grad_norm": 1.435771107673645, | |
| "kl": 0.00012032143022224772, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0, | |
| "reward": 3.8, | |
| "reward_std": 1.0218252658843994, | |
| "rewards/execution_reward_func": 0.75, | |
| "rewards/import_check_reward_func": 0.8, | |
| "rewards/match_reward_func": 1.25, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 301.15, | |
| "epoch": 0.0027829313543599257, | |
| "grad_norm": 0.5544025301933289, | |
| "kl": 0.00015556021098745988, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0, | |
| "reward": 3.65, | |
| "reward_std": 2.213274621963501, | |
| "rewards/execution_reward_func": 0.15, | |
| "rewards/import_check_reward_func": 0.75, | |
| "rewards/match_reward_func": 2.0, | |
| "rewards/syntax_reward_func": 0.75, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 111.95, | |
| "epoch": 0.0037105751391465678, | |
| "grad_norm": 0.7285812497138977, | |
| "kl": 0.001399944696458988, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0001, | |
| "reward": 4.6, | |
| "reward_std": 2.024621105194092, | |
| "rewards/execution_reward_func": 0.7, | |
| "rewards/import_check_reward_func": 0.95, | |
| "rewards/match_reward_func": 2.0, | |
| "rewards/syntax_reward_func": 0.95, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 147.7, | |
| "epoch": 0.00463821892393321, | |
| "grad_norm": 0.6224641799926758, | |
| "kl": 0.002306692115962505, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0001, | |
| "reward": 5.15, | |
| "reward_std": 1.4712525367736817, | |
| "rewards/execution_reward_func": 0.6, | |
| "rewards/import_check_reward_func": 0.9, | |
| "rewards/match_reward_func": 2.75, | |
| "rewards/syntax_reward_func": 0.9, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 226.6, | |
| "epoch": 0.0055658627087198514, | |
| "grad_norm": 0.7274765372276306, | |
| "kl": 0.0017352548136841505, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0001, | |
| "reward": 3.0, | |
| "reward_std": 2.074456262588501, | |
| "rewards/execution_reward_func": 0.15, | |
| "rewards/import_check_reward_func": 0.8, | |
| "rewards/match_reward_func": 1.25, | |
| "rewards/syntax_reward_func": 0.8, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 269.95, | |
| "epoch": 0.006493506493506494, | |
| "grad_norm": 0.5622701048851013, | |
| "kl": 0.0028683518437901514, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0001, | |
| "reward": 3.3, | |
| "reward_std": 1.8375282287597656, | |
| "rewards/execution_reward_func": 0.3, | |
| "rewards/import_check_reward_func": 0.75, | |
| "rewards/match_reward_func": 1.5, | |
| "rewards/syntax_reward_func": 0.75, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 139.95, | |
| "epoch": 0.0074211502782931356, | |
| "grad_norm": 0.3798786997795105, | |
| "kl": 0.004316802701214329, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0002, | |
| "reward": 3.4, | |
| "reward_std": 0.2, | |
| "rewards/execution_reward_func": 0.4, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 1.0, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 266.7, | |
| "epoch": 0.008348794063079777, | |
| "grad_norm": 0.5406395196914673, | |
| "kl": 0.003787403281603474, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0002, | |
| "reward": 4.6, | |
| "reward_std": 1.6634664058685302, | |
| "rewards/execution_reward_func": 0.55, | |
| "rewards/import_check_reward_func": 0.9, | |
| "rewards/match_reward_func": 2.25, | |
| "rewards/syntax_reward_func": 0.9, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 285.6, | |
| "epoch": 0.00927643784786642, | |
| "grad_norm": 0.3494756519794464, | |
| "kl": 0.0056897306552855294, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0002, | |
| "reward": 4.0, | |
| "reward_std": 0.7464101552963257, | |
| "rewards/execution_reward_func": 0.3, | |
| "rewards/import_check_reward_func": 0.85, | |
| "rewards/match_reward_func": 2.0, | |
| "rewards/syntax_reward_func": 0.85, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 151.2, | |
| "epoch": 0.01020408163265306, | |
| "grad_norm": 0.20952889323234558, | |
| "kl": 0.0030223518115235493, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0001, | |
| "reward": 3.95, | |
| "reward_std": 0.6773502349853515, | |
| "rewards/execution_reward_func": 0.45, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 1.5, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 211.3, | |
| "epoch": 0.011131725417439703, | |
| "grad_norm": 0.11596839129924774, | |
| "kl": 0.0040574187733000144, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0002, | |
| "reward": 5.1, | |
| "reward_std": 0.2, | |
| "rewards/execution_reward_func": 0.2, | |
| "rewards/import_check_reward_func": 0.95, | |
| "rewards/match_reward_func": 3.0, | |
| "rewards/syntax_reward_func": 0.95, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 276.4, | |
| "epoch": 0.012059369202226345, | |
| "grad_norm": 0.7007413506507874, | |
| "kl": 0.0023531450337031857, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0001, | |
| "reward": 3.45, | |
| "reward_std": 1.240312433242798, | |
| "rewards/execution_reward_func": 0.35, | |
| "rewards/import_check_reward_func": 0.8, | |
| "rewards/match_reward_func": 1.5, | |
| "rewards/syntax_reward_func": 0.8, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 268.1, | |
| "epoch": 0.012987012987012988, | |
| "grad_norm": 0.7164338231086731, | |
| "kl": 0.012285257197800093, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0005, | |
| "reward": 3.8, | |
| "reward_std": 2.259043979644775, | |
| "rewards/execution_reward_func": 0.25, | |
| "rewards/import_check_reward_func": 0.9, | |
| "rewards/match_reward_func": 1.75, | |
| "rewards/syntax_reward_func": 0.9, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 103.4, | |
| "epoch": 0.013914656771799629, | |
| "grad_norm": 0.778170108795166, | |
| "kl": 0.016984900797251613, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0007, | |
| "reward": 3.05, | |
| "reward_std": 0.7, | |
| "rewards/execution_reward_func": 0.3, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 0.75, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 385.35, | |
| "epoch": 0.014842300556586271, | |
| "grad_norm": 0.35350680351257324, | |
| "kl": 0.0004109930072445422, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0, | |
| "reward": 2.9, | |
| "reward_std": 1.207443642616272, | |
| "rewards/execution_reward_func": 0.2, | |
| "rewards/import_check_reward_func": 0.6, | |
| "rewards/match_reward_func": 1.5, | |
| "rewards/syntax_reward_func": 0.6, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 226.9, | |
| "epoch": 0.015769944341372914, | |
| "grad_norm": 0.2457304149866104, | |
| "kl": 0.004650659079197794, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0002, | |
| "reward": 4.6, | |
| "reward_std": 0.8225053548812866, | |
| "rewards/execution_reward_func": 0.55, | |
| "rewards/import_check_reward_func": 0.9, | |
| "rewards/match_reward_func": 2.25, | |
| "rewards/syntax_reward_func": 0.9, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 143.4, | |
| "epoch": 0.016697588126159554, | |
| "grad_norm": 0.37925291061401367, | |
| "kl": 0.025710263662040232, | |
| "learning_rate": 2e-05, | |
| "loss": 0.001, | |
| "reward": 5.475, | |
| "reward_std": 0.8694802522659302, | |
| "rewards/execution_reward_func": 0.4, | |
| "rewards/import_check_reward_func": 0.925, | |
| "rewards/match_reward_func": 3.25, | |
| "rewards/syntax_reward_func": 0.9, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 195.9, | |
| "epoch": 0.017625231910946195, | |
| "grad_norm": 1.0235642194747925, | |
| "kl": 0.004265061870682985, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0002, | |
| "reward": 4.825, | |
| "reward_std": 1.95, | |
| "rewards/execution_reward_func": 0.65, | |
| "rewards/import_check_reward_func": 0.975, | |
| "rewards/match_reward_func": 2.25, | |
| "rewards/syntax_reward_func": 0.95, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 210.2, | |
| "epoch": 0.01855287569573284, | |
| "grad_norm": 0.8921510577201843, | |
| "kl": 0.01736406094278209, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0007, | |
| "reward": 3.75, | |
| "reward_std": 2.0600518703460695, | |
| "rewards/execution_reward_func": 0.5, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 1.25, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 60.7, | |
| "epoch": 0.01948051948051948, | |
| "grad_norm": 0.7613882422447205, | |
| "kl": 0.01977001652121544, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0008, | |
| "reward": 6.3, | |
| "reward_std": 1.3154700517654419, | |
| "rewards/execution_reward_func": 0.8, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 3.5, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 124.7, | |
| "epoch": 0.02040816326530612, | |
| "grad_norm": 0.8782172799110413, | |
| "kl": 0.007784434873610735, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0003, | |
| "reward": 5.95, | |
| "reward_std": 1.7160588264465333, | |
| "rewards/execution_reward_func": 0.7, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 3.25, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 48.45, | |
| "epoch": 0.021335807050092765, | |
| "grad_norm": 0.7235850095748901, | |
| "kl": 0.015565779805183411, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0006, | |
| "reward": 5.95, | |
| "reward_std": 1.2176626682281495, | |
| "rewards/execution_reward_func": 0.95, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 3.0, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 199.2, | |
| "epoch": 0.022263450834879406, | |
| "grad_norm": 0.36401212215423584, | |
| "kl": 0.008195015601813793, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0003, | |
| "reward": 5.65, | |
| "reward_std": 0.9087337732315064, | |
| "rewards/execution_reward_func": 0.5, | |
| "rewards/import_check_reward_func": 0.95, | |
| "rewards/match_reward_func": 3.25, | |
| "rewards/syntax_reward_func": 0.95, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 284.6, | |
| "epoch": 0.023191094619666047, | |
| "grad_norm": 0.28475916385650635, | |
| "kl": 0.01078999440651387, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0004, | |
| "reward": 4.5, | |
| "reward_std": 0.6, | |
| "rewards/execution_reward_func": 0.1, | |
| "rewards/import_check_reward_func": 0.7, | |
| "rewards/match_reward_func": 3.0, | |
| "rewards/syntax_reward_func": 0.7, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 191.1, | |
| "epoch": 0.02411873840445269, | |
| "grad_norm": 0.16215822100639343, | |
| "kl": 0.006734096398577094, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0003, | |
| "reward": 4.45, | |
| "reward_std": 0.5980713129043579, | |
| "rewards/execution_reward_func": 0.75, | |
| "rewards/import_check_reward_func": 0.85, | |
| "rewards/match_reward_func": 2.0, | |
| "rewards/syntax_reward_func": 0.85, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 120.6, | |
| "epoch": 0.02504638218923933, | |
| "grad_norm": 0.6018548011779785, | |
| "kl": 0.012047081184573472, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0005, | |
| "reward": 5.15, | |
| "reward_std": 0.6773502349853515, | |
| "rewards/execution_reward_func": 0.65, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 2.5, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 236.95, | |
| "epoch": 0.025974025974025976, | |
| "grad_norm": 0.9233574271202087, | |
| "kl": 0.007194043893832713, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0003, | |
| "reward": 4.4, | |
| "reward_std": 1.5498929023742676, | |
| "rewards/execution_reward_func": 0.45, | |
| "rewards/import_check_reward_func": 0.85, | |
| "rewards/match_reward_func": 2.25, | |
| "rewards/syntax_reward_func": 0.85, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 232.5, | |
| "epoch": 0.026901669758812616, | |
| "grad_norm": 0.7554066777229309, | |
| "kl": 0.005206135101616383, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0002, | |
| "reward": 3.8, | |
| "reward_std": 2.025991106033325, | |
| "rewards/execution_reward_func": 0.25, | |
| "rewards/import_check_reward_func": 0.8, | |
| "rewards/match_reward_func": 2.0, | |
| "rewards/syntax_reward_func": 0.75, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 168.4, | |
| "epoch": 0.027829313543599257, | |
| "grad_norm": 0.8817532658576965, | |
| "kl": 0.007167509943246842, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0003, | |
| "reward": 2.625, | |
| "reward_std": 1.0498347997665405, | |
| "rewards/execution_reward_func": 0.55, | |
| "rewards/import_check_reward_func": 0.925, | |
| "rewards/match_reward_func": 0.25, | |
| "rewards/syntax_reward_func": 0.9, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 150.0, | |
| "epoch": 0.0287569573283859, | |
| "grad_norm": 0.7171877026557922, | |
| "kl": 0.01311813194770366, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0005, | |
| "reward": 4.45, | |
| "reward_std": 1.8154700517654419, | |
| "rewards/execution_reward_func": 0.25, | |
| "rewards/import_check_reward_func": 0.85, | |
| "rewards/match_reward_func": 2.5, | |
| "rewards/syntax_reward_func": 0.85, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 126.65, | |
| "epoch": 0.029684601113172542, | |
| "grad_norm": 0.8615996837615967, | |
| "kl": 0.012906858464702964, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0005, | |
| "reward": 5.65, | |
| "reward_std": 1.1, | |
| "rewards/execution_reward_func": 0.65, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 3.0, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 127.2, | |
| "epoch": 0.030612244897959183, | |
| "grad_norm": 0.28843703866004944, | |
| "kl": 0.01665779766626656, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0007, | |
| "reward": 5.85, | |
| "reward_std": 0.7, | |
| "rewards/execution_reward_func": 0.6, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 3.25, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 122.25, | |
| "epoch": 0.03153988868274583, | |
| "grad_norm": 0.7831349968910217, | |
| "kl": 0.008541356842033564, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0003, | |
| "reward": 5.45, | |
| "reward_std": 0.3, | |
| "rewards/execution_reward_func": 0.55, | |
| "rewards/import_check_reward_func": 0.95, | |
| "rewards/match_reward_func": 3.0, | |
| "rewards/syntax_reward_func": 0.95, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 304.45, | |
| "epoch": 0.032467532467532464, | |
| "grad_norm": 0.33276575803756714, | |
| "kl": 0.002659273101016879, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0001, | |
| "reward": 3.15, | |
| "reward_std": 0.7, | |
| "rewards/execution_reward_func": 0.45, | |
| "rewards/import_check_reward_func": 0.85, | |
| "rewards/match_reward_func": 1.0, | |
| "rewards/syntax_reward_func": 0.85, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 104.0, | |
| "epoch": 0.03339517625231911, | |
| "grad_norm": 0.7017046809196472, | |
| "kl": 0.01652910131961107, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0007, | |
| "reward": 5.05, | |
| "reward_std": 1.970170545578003, | |
| "rewards/execution_reward_func": 0.3, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 2.75, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 218.9, | |
| "epoch": 0.03432282003710575, | |
| "grad_norm": 0.4842416048049927, | |
| "kl": 0.010277540143579244, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0004, | |
| "reward": 5.7, | |
| "reward_std": 1.3773502349853515, | |
| "rewards/execution_reward_func": 0.55, | |
| "rewards/import_check_reward_func": 0.95, | |
| "rewards/match_reward_func": 3.25, | |
| "rewards/syntax_reward_func": 0.95, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 162.4, | |
| "epoch": 0.03525046382189239, | |
| "grad_norm": 1.1983845233917236, | |
| "kl": 0.007826995360665024, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0003, | |
| "reward": 5.7, | |
| "reward_std": 0.3154700517654419, | |
| "rewards/execution_reward_func": 0.7, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 3.0, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 113.35, | |
| "epoch": 0.036178107606679034, | |
| "grad_norm": 0.41759467124938965, | |
| "kl": 0.01697214711457491, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0007, | |
| "reward": 6.25, | |
| "reward_std": 0.6773502349853515, | |
| "rewards/execution_reward_func": 0.75, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 3.5, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 149.85, | |
| "epoch": 0.03710575139146568, | |
| "grad_norm": 0.810123860836029, | |
| "kl": 0.009923209110274911, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0004, | |
| "reward": 3.15, | |
| "reward_std": 1.098312759399414, | |
| "rewards/execution_reward_func": 0.5, | |
| "rewards/import_check_reward_func": 0.95, | |
| "rewards/match_reward_func": 0.75, | |
| "rewards/syntax_reward_func": 0.95, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 124.25, | |
| "epoch": 0.038033395176252316, | |
| "grad_norm": 0.0389072448015213, | |
| "kl": 0.019343174435198306, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0008, | |
| "reward": 5.4, | |
| "reward_std": 0.0, | |
| "rewards/execution_reward_func": 0.4, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 3.0, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 183.95, | |
| "epoch": 0.03896103896103896, | |
| "grad_norm": 0.3545382022857666, | |
| "kl": 0.008350804960355163, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0003, | |
| "reward": 3.8, | |
| "reward_std": 0.7516611576080322, | |
| "rewards/execution_reward_func": 0.65, | |
| "rewards/import_check_reward_func": 0.95, | |
| "rewards/match_reward_func": 1.25, | |
| "rewards/syntax_reward_func": 0.95, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 189.65, | |
| "epoch": 0.039888682745825604, | |
| "grad_norm": 0.24783830344676971, | |
| "kl": 0.0119239964755252, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0005, | |
| "reward": 4.475, | |
| "reward_std": 0.25, | |
| "rewards/execution_reward_func": 0.55, | |
| "rewards/import_check_reward_func": 0.975, | |
| "rewards/match_reward_func": 2.0, | |
| "rewards/syntax_reward_func": 0.95, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 87.35, | |
| "epoch": 0.04081632653061224, | |
| "grad_norm": 0.22360822558403015, | |
| "kl": 0.023859670106321573, | |
| "learning_rate": 2e-05, | |
| "loss": 0.001, | |
| "reward": 5.25, | |
| "reward_std": 0.574456262588501, | |
| "rewards/execution_reward_func": 0.5, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 2.75, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 71.55, | |
| "epoch": 0.041743970315398886, | |
| "grad_norm": 1.2800755500793457, | |
| "kl": 0.025399934221059083, | |
| "learning_rate": 2e-05, | |
| "loss": 0.001, | |
| "reward": 6.75, | |
| "reward_std": 1.0416025638580322, | |
| "rewards/execution_reward_func": 0.75, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 4.0, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 128.95, | |
| "epoch": 0.04267161410018553, | |
| "grad_norm": 0.18902267515659332, | |
| "kl": 0.020552122965455055, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0008, | |
| "reward": 4.45, | |
| "reward_std": 0.1, | |
| "rewards/execution_reward_func": 0.45, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 2.0, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 94.0, | |
| "epoch": 0.04359925788497217, | |
| "grad_norm": 0.6993905305862427, | |
| "kl": 0.013059131056070327, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0005, | |
| "reward": 4.5, | |
| "reward_std": 0.6, | |
| "rewards/execution_reward_func": 0.75, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 1.75, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 186.9, | |
| "epoch": 0.04452690166975881, | |
| "grad_norm": 0.8022165894508362, | |
| "kl": 0.007941013853996991, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0003, | |
| "reward": 4.65, | |
| "reward_std": 1.648912525177002, | |
| "rewards/execution_reward_func": 0.4, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 2.25, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 69.1, | |
| "epoch": 0.045454545454545456, | |
| "grad_norm": 0.5193853974342346, | |
| "kl": 0.0152623875066638, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0006, | |
| "reward": 5.75, | |
| "reward_std": 0.5, | |
| "rewards/execution_reward_func": 1.0, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 2.75, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 108.0, | |
| "epoch": 0.04638218923933209, | |
| "grad_norm": 0.5317401885986328, | |
| "kl": 0.01654429854825139, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0007, | |
| "reward": 5.3, | |
| "reward_std": 0.5773502349853515, | |
| "rewards/execution_reward_func": 0.8, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 2.5, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 99.05, | |
| "epoch": 0.04730983302411874, | |
| "grad_norm": 0.25006845593452454, | |
| "kl": 0.015682655945420267, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0006, | |
| "reward": 5.65, | |
| "reward_std": 0.1, | |
| "rewards/execution_reward_func": 0.65, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 3.0, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 139.95, | |
| "epoch": 0.04823747680890538, | |
| "grad_norm": 0.2602873146533966, | |
| "kl": 0.01236535501666367, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0005, | |
| "reward": 5.15, | |
| "reward_std": 0.6154700517654419, | |
| "rewards/execution_reward_func": 0.9, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 2.25, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 200.35, | |
| "epoch": 0.04916512059369202, | |
| "grad_norm": 0.8118894696235657, | |
| "kl": 0.01205776953138411, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0005, | |
| "reward": 4.45, | |
| "reward_std": 1.7, | |
| "rewards/execution_reward_func": 0.7, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 1.75, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 136.0, | |
| "epoch": 0.05009276437847866, | |
| "grad_norm": 0.30172234773635864, | |
| "kl": 0.012935555540025234, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0005, | |
| "reward": 3.9, | |
| "reward_std": 0.5773502349853515, | |
| "rewards/execution_reward_func": 0.4, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 1.5, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 139.45, | |
| "epoch": 0.05102040816326531, | |
| "grad_norm": 0.24473513662815094, | |
| "kl": 0.010810322500765324, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0004, | |
| "reward": 4.55, | |
| "reward_std": 0.5, | |
| "rewards/execution_reward_func": 0.8, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 1.75, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 206.95, | |
| "epoch": 0.05194805194805195, | |
| "grad_norm": 0.5211365222930908, | |
| "kl": 0.00964691461995244, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0004, | |
| "reward": 4.1, | |
| "reward_std": 1.0, | |
| "rewards/execution_reward_func": 0.55, | |
| "rewards/import_check_reward_func": 0.9, | |
| "rewards/match_reward_func": 1.75, | |
| "rewards/syntax_reward_func": 0.9, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 154.15, | |
| "epoch": 0.05287569573283859, | |
| "grad_norm": 0.4427022933959961, | |
| "kl": 0.015764740016311406, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0006, | |
| "reward": 2.75, | |
| "reward_std": 0.9928203105926514, | |
| "rewards/execution_reward_func": 0.35, | |
| "rewards/import_check_reward_func": 0.95, | |
| "rewards/match_reward_func": 0.5, | |
| "rewards/syntax_reward_func": 0.95, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 180.4, | |
| "epoch": 0.05380333951762523, | |
| "grad_norm": 0.33605360984802246, | |
| "kl": 0.012487533967942, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0005, | |
| "reward": 4.175, | |
| "reward_std": 0.85, | |
| "rewards/execution_reward_func": 0.5, | |
| "rewards/import_check_reward_func": 0.975, | |
| "rewards/match_reward_func": 1.75, | |
| "rewards/syntax_reward_func": 0.95, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 115.35, | |
| "epoch": 0.05473098330241188, | |
| "grad_norm": 0.195700541138649, | |
| "kl": 0.01763147208839655, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0007, | |
| "reward": 5.45, | |
| "reward_std": 0.1, | |
| "rewards/execution_reward_func": 0.45, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 3.0, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 118.45, | |
| "epoch": 0.055658627087198514, | |
| "grad_norm": 0.20909559726715088, | |
| "kl": 0.0156675161793828, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0006, | |
| "reward": 4.3, | |
| "reward_std": 0.6, | |
| "rewards/execution_reward_func": 0.55, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 1.75, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 106.85, | |
| "epoch": 0.05658627087198516, | |
| "grad_norm": 0.6488444209098816, | |
| "kl": 0.018127623945474625, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0007, | |
| "reward": 5.75, | |
| "reward_std": 1.2773502349853516, | |
| "rewards/execution_reward_func": 0.5, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 3.25, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 158.65, | |
| "epoch": 0.0575139146567718, | |
| "grad_norm": 0.6527036428451538, | |
| "kl": 0.02021341770887375, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0008, | |
| "reward": 5.45, | |
| "reward_std": 1.615470051765442, | |
| "rewards/execution_reward_func": 0.7, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 2.75, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 48.15, | |
| "epoch": 0.05844155844155844, | |
| "grad_norm": 1.3493740558624268, | |
| "kl": 0.1433052785694599, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0057, | |
| "reward": 7.55, | |
| "reward_std": 0.5, | |
| "rewards/execution_reward_func": 0.8, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 4.75, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 185.65, | |
| "epoch": 0.059369202226345084, | |
| "grad_norm": 0.1428665667772293, | |
| "kl": 0.01815747832879424, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0007, | |
| "reward": 4.325, | |
| "reward_std": 0.18929693698883057, | |
| "rewards/execution_reward_func": 0.6, | |
| "rewards/import_check_reward_func": 0.875, | |
| "rewards/match_reward_func": 2.0, | |
| "rewards/syntax_reward_func": 0.85, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 162.3, | |
| "epoch": 0.06029684601113173, | |
| "grad_norm": 0.6011455059051514, | |
| "kl": 0.012370403949171304, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0005, | |
| "reward": 4.0, | |
| "reward_std": 1.02315514087677, | |
| "rewards/execution_reward_func": 0.6, | |
| "rewards/import_check_reward_func": 0.95, | |
| "rewards/match_reward_func": 1.5, | |
| "rewards/syntax_reward_func": 0.95, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 111.0, | |
| "epoch": 0.061224489795918366, | |
| "grad_norm": 0.1835913360118866, | |
| "kl": 0.014051128178834915, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0006, | |
| "reward": 6.5, | |
| "reward_std": 0.6, | |
| "rewards/execution_reward_func": 0.25, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 4.25, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 195.7, | |
| "epoch": 0.06215213358070501, | |
| "grad_norm": 1.6659204959869385, | |
| "kl": 0.014673770777881146, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0006, | |
| "reward": 3.85, | |
| "reward_std": 0.6516611576080322, | |
| "rewards/execution_reward_func": 0.1, | |
| "rewards/import_check_reward_func": 0.85, | |
| "rewards/match_reward_func": 2.0, | |
| "rewards/syntax_reward_func": 0.9, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 42.05, | |
| "epoch": 0.06307977736549165, | |
| "grad_norm": 0.46125537157058716, | |
| "kl": 0.01653697257861495, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0007, | |
| "reward": 5.85, | |
| "reward_std": 0.1, | |
| "rewards/execution_reward_func": 0.85, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 3.0, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 53.2, | |
| "epoch": 0.0640074211502783, | |
| "grad_norm": 0.6094168424606323, | |
| "kl": 0.018516975454986095, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0007, | |
| "reward": 6.2, | |
| "reward_std": 1.2, | |
| "rewards/execution_reward_func": 0.7, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 3.5, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 193.55, | |
| "epoch": 0.06493506493506493, | |
| "grad_norm": 0.586470365524292, | |
| "kl": 0.014461748860776424, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0006, | |
| "reward": 3.825, | |
| "reward_std": 0.9016611576080322, | |
| "rewards/execution_reward_func": 0.25, | |
| "rewards/import_check_reward_func": 0.925, | |
| "rewards/match_reward_func": 1.75, | |
| "rewards/syntax_reward_func": 0.9, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 102.05, | |
| "epoch": 0.06586270871985157, | |
| "grad_norm": 0.42713698744773865, | |
| "kl": 0.015026956889778375, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0006, | |
| "reward": 5.55, | |
| "reward_std": 1.1, | |
| "rewards/execution_reward_func": 0.55, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 3.0, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 179.7, | |
| "epoch": 0.06679035250463822, | |
| "grad_norm": 0.2813414931297302, | |
| "kl": 0.0110438191331923, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0004, | |
| "reward": 5.3, | |
| "reward_std": 0.7403124332427978, | |
| "rewards/execution_reward_func": 0.8, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 2.5, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 188.3, | |
| "epoch": 0.06771799628942486, | |
| "grad_norm": 0.26418182253837585, | |
| "kl": 0.013095348398201168, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0005, | |
| "reward": 5.45, | |
| "reward_std": 0.38284270763397216, | |
| "rewards/execution_reward_func": 0.55, | |
| "rewards/import_check_reward_func": 0.95, | |
| "rewards/match_reward_func": 3.0, | |
| "rewards/syntax_reward_func": 0.95, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 223.4, | |
| "epoch": 0.0686456400742115, | |
| "grad_norm": 0.23218098282814026, | |
| "kl": 0.013283595815300942, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0005, | |
| "reward": 4.7, | |
| "reward_std": 0.3983127593994141, | |
| "rewards/execution_reward_func": 0.8, | |
| "rewards/import_check_reward_func": 0.95, | |
| "rewards/match_reward_func": 2.0, | |
| "rewards/syntax_reward_func": 0.95, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 210.85, | |
| "epoch": 0.06957328385899815, | |
| "grad_norm": 0.32164114713668823, | |
| "kl": 0.014888664055615664, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0006, | |
| "reward": 3.45, | |
| "reward_std": 0.2154700517654419, | |
| "rewards/execution_reward_func": 0.45, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 1.0, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 43.05, | |
| "epoch": 0.07050092764378478, | |
| "grad_norm": 0.06279900670051575, | |
| "kl": 0.026899104565382005, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0011, | |
| "reward": 7.6, | |
| "reward_std": 0.0, | |
| "rewards/execution_reward_func": 0.6, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 5.0, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 143.75, | |
| "epoch": 0.07142857142857142, | |
| "grad_norm": 0.38623046875, | |
| "kl": 0.012465272471308709, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0005, | |
| "reward": 3.55, | |
| "reward_std": 0.1, | |
| "rewards/execution_reward_func": 0.55, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 1.0, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 137.65, | |
| "epoch": 0.07235621521335807, | |
| "grad_norm": 0.3098149001598358, | |
| "kl": 0.015718556847423314, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0006, | |
| "reward": 4.15, | |
| "reward_std": 0.7928203105926513, | |
| "rewards/execution_reward_func": 0.65, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 1.5, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 156.3, | |
| "epoch": 0.07328385899814471, | |
| "grad_norm": 0.5492616891860962, | |
| "kl": 0.015015369467437267, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0006, | |
| "reward": 4.35, | |
| "reward_std": 1.3, | |
| "rewards/execution_reward_func": 0.45, | |
| "rewards/import_check_reward_func": 0.95, | |
| "rewards/match_reward_func": 2.0, | |
| "rewards/syntax_reward_func": 0.95, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 209.25, | |
| "epoch": 0.07421150278293136, | |
| "grad_norm": 0.39929476380348206, | |
| "kl": 0.010218921303749084, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0004, | |
| "reward": 3.85, | |
| "reward_std": 0.8154700517654419, | |
| "rewards/execution_reward_func": 0.6, | |
| "rewards/import_check_reward_func": 1.0, | |
| "rewards/match_reward_func": 1.25, | |
| "rewards/syntax_reward_func": 1.0, | |
| "step": 80 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 80, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 20, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |