{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.07421150278293136, "eval_steps": 500, "global_step": 80, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 238.55, "epoch": 0.0009276437847866419, "grad_norm": 0.6362782716751099, "kl": 0.0, "learning_rate": 2e-05, "loss": 0.0, "reward": 2.5, "reward_std": 1.3948675394058228, "rewards/execution_reward_func": 0.3, "rewards/import_check_reward_func": 0.65, "rewards/match_reward_func": 0.75, "rewards/syntax_reward_func": 0.8, "step": 1 }, { "completion_length": 89.2, "epoch": 0.0018552875695732839, "grad_norm": 1.435771107673645, "kl": 0.00012032143022224772, "learning_rate": 2e-05, "loss": 0.0, "reward": 3.8, "reward_std": 1.0218252658843994, "rewards/execution_reward_func": 0.75, "rewards/import_check_reward_func": 0.8, "rewards/match_reward_func": 1.25, "rewards/syntax_reward_func": 1.0, "step": 2 }, { "completion_length": 301.15, "epoch": 0.0027829313543599257, "grad_norm": 0.5544025301933289, "kl": 0.00015556021098745988, "learning_rate": 2e-05, "loss": 0.0, "reward": 3.65, "reward_std": 2.213274621963501, "rewards/execution_reward_func": 0.15, "rewards/import_check_reward_func": 0.75, "rewards/match_reward_func": 2.0, "rewards/syntax_reward_func": 0.75, "step": 3 }, { "completion_length": 111.95, "epoch": 0.0037105751391465678, "grad_norm": 0.7285812497138977, "kl": 0.001399944696458988, "learning_rate": 2e-05, "loss": 0.0001, "reward": 4.6, "reward_std": 2.024621105194092, "rewards/execution_reward_func": 0.7, "rewards/import_check_reward_func": 0.95, "rewards/match_reward_func": 2.0, "rewards/syntax_reward_func": 0.95, "step": 4 }, { "completion_length": 147.7, "epoch": 0.00463821892393321, "grad_norm": 0.6224641799926758, "kl": 0.002306692115962505, "learning_rate": 2e-05, "loss": 0.0001, "reward": 5.15, "reward_std": 1.4712525367736817, "rewards/execution_reward_func": 0.6, "rewards/import_check_reward_func": 0.9, "rewards/match_reward_func": 2.75, "rewards/syntax_reward_func": 0.9, "step": 5 }, { "completion_length": 226.6, "epoch": 0.0055658627087198514, "grad_norm": 0.7274765372276306, "kl": 0.0017352548136841505, "learning_rate": 2e-05, "loss": 0.0001, "reward": 3.0, "reward_std": 2.074456262588501, "rewards/execution_reward_func": 0.15, "rewards/import_check_reward_func": 0.8, "rewards/match_reward_func": 1.25, "rewards/syntax_reward_func": 0.8, "step": 6 }, { "completion_length": 269.95, "epoch": 0.006493506493506494, "grad_norm": 0.5622701048851013, "kl": 0.0028683518437901514, "learning_rate": 2e-05, "loss": 0.0001, "reward": 3.3, "reward_std": 1.8375282287597656, "rewards/execution_reward_func": 0.3, "rewards/import_check_reward_func": 0.75, "rewards/match_reward_func": 1.5, "rewards/syntax_reward_func": 0.75, "step": 7 }, { "completion_length": 139.95, "epoch": 0.0074211502782931356, "grad_norm": 0.3798786997795105, "kl": 0.004316802701214329, "learning_rate": 2e-05, "loss": 0.0002, "reward": 3.4, "reward_std": 0.2, "rewards/execution_reward_func": 0.4, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 1.0, "rewards/syntax_reward_func": 1.0, "step": 8 }, { "completion_length": 266.7, "epoch": 0.008348794063079777, "grad_norm": 0.5406395196914673, "kl": 0.003787403281603474, "learning_rate": 2e-05, "loss": 0.0002, "reward": 4.6, "reward_std": 1.6634664058685302, "rewards/execution_reward_func": 0.55, "rewards/import_check_reward_func": 0.9, "rewards/match_reward_func": 2.25, "rewards/syntax_reward_func": 0.9, "step": 9 }, { "completion_length": 285.6, "epoch": 0.00927643784786642, "grad_norm": 0.3494756519794464, "kl": 0.0056897306552855294, "learning_rate": 2e-05, "loss": 0.0002, "reward": 4.0, "reward_std": 0.7464101552963257, "rewards/execution_reward_func": 0.3, "rewards/import_check_reward_func": 0.85, "rewards/match_reward_func": 2.0, "rewards/syntax_reward_func": 0.85, "step": 10 }, { "completion_length": 151.2, "epoch": 0.01020408163265306, "grad_norm": 0.20952889323234558, "kl": 0.0030223518115235493, "learning_rate": 2e-05, "loss": 0.0001, "reward": 3.95, "reward_std": 0.6773502349853515, "rewards/execution_reward_func": 0.45, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 1.5, "rewards/syntax_reward_func": 1.0, "step": 11 }, { "completion_length": 211.3, "epoch": 0.011131725417439703, "grad_norm": 0.11596839129924774, "kl": 0.0040574187733000144, "learning_rate": 2e-05, "loss": 0.0002, "reward": 5.1, "reward_std": 0.2, "rewards/execution_reward_func": 0.2, "rewards/import_check_reward_func": 0.95, "rewards/match_reward_func": 3.0, "rewards/syntax_reward_func": 0.95, "step": 12 }, { "completion_length": 276.4, "epoch": 0.012059369202226345, "grad_norm": 0.7007413506507874, "kl": 0.0023531450337031857, "learning_rate": 2e-05, "loss": 0.0001, "reward": 3.45, "reward_std": 1.240312433242798, "rewards/execution_reward_func": 0.35, "rewards/import_check_reward_func": 0.8, "rewards/match_reward_func": 1.5, "rewards/syntax_reward_func": 0.8, "step": 13 }, { "completion_length": 268.1, "epoch": 0.012987012987012988, "grad_norm": 0.7164338231086731, "kl": 0.012285257197800093, "learning_rate": 2e-05, "loss": 0.0005, "reward": 3.8, "reward_std": 2.259043979644775, "rewards/execution_reward_func": 0.25, "rewards/import_check_reward_func": 0.9, "rewards/match_reward_func": 1.75, "rewards/syntax_reward_func": 0.9, "step": 14 }, { "completion_length": 103.4, "epoch": 0.013914656771799629, "grad_norm": 0.778170108795166, "kl": 0.016984900797251613, "learning_rate": 2e-05, "loss": 0.0007, "reward": 3.05, "reward_std": 0.7, "rewards/execution_reward_func": 0.3, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 0.75, "rewards/syntax_reward_func": 1.0, "step": 15 }, { "completion_length": 385.35, "epoch": 0.014842300556586271, "grad_norm": 0.35350680351257324, "kl": 0.0004109930072445422, "learning_rate": 2e-05, "loss": 0.0, "reward": 2.9, "reward_std": 1.207443642616272, "rewards/execution_reward_func": 0.2, "rewards/import_check_reward_func": 0.6, "rewards/match_reward_func": 1.5, "rewards/syntax_reward_func": 0.6, "step": 16 }, { "completion_length": 226.9, "epoch": 0.015769944341372914, "grad_norm": 0.2457304149866104, "kl": 0.004650659079197794, "learning_rate": 2e-05, "loss": 0.0002, "reward": 4.6, "reward_std": 0.8225053548812866, "rewards/execution_reward_func": 0.55, "rewards/import_check_reward_func": 0.9, "rewards/match_reward_func": 2.25, "rewards/syntax_reward_func": 0.9, "step": 17 }, { "completion_length": 143.4, "epoch": 0.016697588126159554, "grad_norm": 0.37925291061401367, "kl": 0.025710263662040232, "learning_rate": 2e-05, "loss": 0.001, "reward": 5.475, "reward_std": 0.8694802522659302, "rewards/execution_reward_func": 0.4, "rewards/import_check_reward_func": 0.925, "rewards/match_reward_func": 3.25, "rewards/syntax_reward_func": 0.9, "step": 18 }, { "completion_length": 195.9, "epoch": 0.017625231910946195, "grad_norm": 1.0235642194747925, "kl": 0.004265061870682985, "learning_rate": 2e-05, "loss": 0.0002, "reward": 4.825, "reward_std": 1.95, "rewards/execution_reward_func": 0.65, "rewards/import_check_reward_func": 0.975, "rewards/match_reward_func": 2.25, "rewards/syntax_reward_func": 0.95, "step": 19 }, { "completion_length": 210.2, "epoch": 0.01855287569573284, "grad_norm": 0.8921510577201843, "kl": 0.01736406094278209, "learning_rate": 2e-05, "loss": 0.0007, "reward": 3.75, "reward_std": 2.0600518703460695, "rewards/execution_reward_func": 0.5, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 1.25, "rewards/syntax_reward_func": 1.0, "step": 20 }, { "completion_length": 60.7, "epoch": 0.01948051948051948, "grad_norm": 0.7613882422447205, "kl": 0.01977001652121544, "learning_rate": 2e-05, "loss": 0.0008, "reward": 6.3, "reward_std": 1.3154700517654419, "rewards/execution_reward_func": 0.8, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 3.5, "rewards/syntax_reward_func": 1.0, "step": 21 }, { "completion_length": 124.7, "epoch": 0.02040816326530612, "grad_norm": 0.8782172799110413, "kl": 0.007784434873610735, "learning_rate": 2e-05, "loss": 0.0003, "reward": 5.95, "reward_std": 1.7160588264465333, "rewards/execution_reward_func": 0.7, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 3.25, "rewards/syntax_reward_func": 1.0, "step": 22 }, { "completion_length": 48.45, "epoch": 0.021335807050092765, "grad_norm": 0.7235850095748901, "kl": 0.015565779805183411, "learning_rate": 2e-05, "loss": 0.0006, "reward": 5.95, "reward_std": 1.2176626682281495, "rewards/execution_reward_func": 0.95, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 3.0, "rewards/syntax_reward_func": 1.0, "step": 23 }, { "completion_length": 199.2, "epoch": 0.022263450834879406, "grad_norm": 0.36401212215423584, "kl": 0.008195015601813793, "learning_rate": 2e-05, "loss": 0.0003, "reward": 5.65, "reward_std": 0.9087337732315064, "rewards/execution_reward_func": 0.5, "rewards/import_check_reward_func": 0.95, "rewards/match_reward_func": 3.25, "rewards/syntax_reward_func": 0.95, "step": 24 }, { "completion_length": 284.6, "epoch": 0.023191094619666047, "grad_norm": 0.28475916385650635, "kl": 0.01078999440651387, "learning_rate": 2e-05, "loss": 0.0004, "reward": 4.5, "reward_std": 0.6, "rewards/execution_reward_func": 0.1, "rewards/import_check_reward_func": 0.7, "rewards/match_reward_func": 3.0, "rewards/syntax_reward_func": 0.7, "step": 25 }, { "completion_length": 191.1, "epoch": 0.02411873840445269, "grad_norm": 0.16215822100639343, "kl": 0.006734096398577094, "learning_rate": 2e-05, "loss": 0.0003, "reward": 4.45, "reward_std": 0.5980713129043579, "rewards/execution_reward_func": 0.75, "rewards/import_check_reward_func": 0.85, "rewards/match_reward_func": 2.0, "rewards/syntax_reward_func": 0.85, "step": 26 }, { "completion_length": 120.6, "epoch": 0.02504638218923933, "grad_norm": 0.6018548011779785, "kl": 0.012047081184573472, "learning_rate": 2e-05, "loss": 0.0005, "reward": 5.15, "reward_std": 0.6773502349853515, "rewards/execution_reward_func": 0.65, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 2.5, "rewards/syntax_reward_func": 1.0, "step": 27 }, { "completion_length": 236.95, "epoch": 0.025974025974025976, "grad_norm": 0.9233574271202087, "kl": 0.007194043893832713, "learning_rate": 2e-05, "loss": 0.0003, "reward": 4.4, "reward_std": 1.5498929023742676, "rewards/execution_reward_func": 0.45, "rewards/import_check_reward_func": 0.85, "rewards/match_reward_func": 2.25, "rewards/syntax_reward_func": 0.85, "step": 28 }, { "completion_length": 232.5, "epoch": 0.026901669758812616, "grad_norm": 0.7554066777229309, "kl": 0.005206135101616383, "learning_rate": 2e-05, "loss": 0.0002, "reward": 3.8, "reward_std": 2.025991106033325, "rewards/execution_reward_func": 0.25, "rewards/import_check_reward_func": 0.8, "rewards/match_reward_func": 2.0, "rewards/syntax_reward_func": 0.75, "step": 29 }, { "completion_length": 168.4, "epoch": 0.027829313543599257, "grad_norm": 0.8817532658576965, "kl": 0.007167509943246842, "learning_rate": 2e-05, "loss": 0.0003, "reward": 2.625, "reward_std": 1.0498347997665405, "rewards/execution_reward_func": 0.55, "rewards/import_check_reward_func": 0.925, "rewards/match_reward_func": 0.25, "rewards/syntax_reward_func": 0.9, "step": 30 }, { "completion_length": 150.0, "epoch": 0.0287569573283859, "grad_norm": 0.7171877026557922, "kl": 0.01311813194770366, "learning_rate": 2e-05, "loss": 0.0005, "reward": 4.45, "reward_std": 1.8154700517654419, "rewards/execution_reward_func": 0.25, "rewards/import_check_reward_func": 0.85, "rewards/match_reward_func": 2.5, "rewards/syntax_reward_func": 0.85, "step": 31 }, { "completion_length": 126.65, "epoch": 0.029684601113172542, "grad_norm": 0.8615996837615967, "kl": 0.012906858464702964, "learning_rate": 2e-05, "loss": 0.0005, "reward": 5.65, "reward_std": 1.1, "rewards/execution_reward_func": 0.65, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 3.0, "rewards/syntax_reward_func": 1.0, "step": 32 }, { "completion_length": 127.2, "epoch": 0.030612244897959183, "grad_norm": 0.28843703866004944, "kl": 0.01665779766626656, "learning_rate": 2e-05, "loss": 0.0007, "reward": 5.85, "reward_std": 0.7, "rewards/execution_reward_func": 0.6, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 3.25, "rewards/syntax_reward_func": 1.0, "step": 33 }, { "completion_length": 122.25, "epoch": 0.03153988868274583, "grad_norm": 0.7831349968910217, "kl": 0.008541356842033564, "learning_rate": 2e-05, "loss": 0.0003, "reward": 5.45, "reward_std": 0.3, "rewards/execution_reward_func": 0.55, "rewards/import_check_reward_func": 0.95, "rewards/match_reward_func": 3.0, "rewards/syntax_reward_func": 0.95, "step": 34 }, { "completion_length": 304.45, "epoch": 0.032467532467532464, "grad_norm": 0.33276575803756714, "kl": 0.002659273101016879, "learning_rate": 2e-05, "loss": 0.0001, "reward": 3.15, "reward_std": 0.7, "rewards/execution_reward_func": 0.45, "rewards/import_check_reward_func": 0.85, "rewards/match_reward_func": 1.0, "rewards/syntax_reward_func": 0.85, "step": 35 }, { "completion_length": 104.0, "epoch": 0.03339517625231911, "grad_norm": 0.7017046809196472, "kl": 0.01652910131961107, "learning_rate": 2e-05, "loss": 0.0007, "reward": 5.05, "reward_std": 1.970170545578003, "rewards/execution_reward_func": 0.3, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 2.75, "rewards/syntax_reward_func": 1.0, "step": 36 }, { "completion_length": 218.9, "epoch": 0.03432282003710575, "grad_norm": 0.4842416048049927, "kl": 0.010277540143579244, "learning_rate": 2e-05, "loss": 0.0004, "reward": 5.7, "reward_std": 1.3773502349853515, "rewards/execution_reward_func": 0.55, "rewards/import_check_reward_func": 0.95, "rewards/match_reward_func": 3.25, "rewards/syntax_reward_func": 0.95, "step": 37 }, { "completion_length": 162.4, "epoch": 0.03525046382189239, "grad_norm": 1.1983845233917236, "kl": 0.007826995360665024, "learning_rate": 2e-05, "loss": 0.0003, "reward": 5.7, "reward_std": 0.3154700517654419, "rewards/execution_reward_func": 0.7, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 3.0, "rewards/syntax_reward_func": 1.0, "step": 38 }, { "completion_length": 113.35, "epoch": 0.036178107606679034, "grad_norm": 0.41759467124938965, "kl": 0.01697214711457491, "learning_rate": 2e-05, "loss": 0.0007, "reward": 6.25, "reward_std": 0.6773502349853515, "rewards/execution_reward_func": 0.75, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 3.5, "rewards/syntax_reward_func": 1.0, "step": 39 }, { "completion_length": 149.85, "epoch": 0.03710575139146568, "grad_norm": 0.810123860836029, "kl": 0.009923209110274911, "learning_rate": 2e-05, "loss": 0.0004, "reward": 3.15, "reward_std": 1.098312759399414, "rewards/execution_reward_func": 0.5, "rewards/import_check_reward_func": 0.95, "rewards/match_reward_func": 0.75, "rewards/syntax_reward_func": 0.95, "step": 40 }, { "completion_length": 124.25, "epoch": 0.038033395176252316, "grad_norm": 0.0389072448015213, "kl": 0.019343174435198306, "learning_rate": 2e-05, "loss": 0.0008, "reward": 5.4, "reward_std": 0.0, "rewards/execution_reward_func": 0.4, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 3.0, "rewards/syntax_reward_func": 1.0, "step": 41 }, { "completion_length": 183.95, "epoch": 0.03896103896103896, "grad_norm": 0.3545382022857666, "kl": 0.008350804960355163, "learning_rate": 2e-05, "loss": 0.0003, "reward": 3.8, "reward_std": 0.7516611576080322, "rewards/execution_reward_func": 0.65, "rewards/import_check_reward_func": 0.95, "rewards/match_reward_func": 1.25, "rewards/syntax_reward_func": 0.95, "step": 42 }, { "completion_length": 189.65, "epoch": 0.039888682745825604, "grad_norm": 0.24783830344676971, "kl": 0.0119239964755252, "learning_rate": 2e-05, "loss": 0.0005, "reward": 4.475, "reward_std": 0.25, "rewards/execution_reward_func": 0.55, "rewards/import_check_reward_func": 0.975, "rewards/match_reward_func": 2.0, "rewards/syntax_reward_func": 0.95, "step": 43 }, { "completion_length": 87.35, "epoch": 0.04081632653061224, "grad_norm": 0.22360822558403015, "kl": 0.023859670106321573, "learning_rate": 2e-05, "loss": 0.001, "reward": 5.25, "reward_std": 0.574456262588501, "rewards/execution_reward_func": 0.5, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 2.75, "rewards/syntax_reward_func": 1.0, "step": 44 }, { "completion_length": 71.55, "epoch": 0.041743970315398886, "grad_norm": 1.2800755500793457, "kl": 0.025399934221059083, "learning_rate": 2e-05, "loss": 0.001, "reward": 6.75, "reward_std": 1.0416025638580322, "rewards/execution_reward_func": 0.75, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 4.0, "rewards/syntax_reward_func": 1.0, "step": 45 }, { "completion_length": 128.95, "epoch": 0.04267161410018553, "grad_norm": 0.18902267515659332, "kl": 0.020552122965455055, "learning_rate": 2e-05, "loss": 0.0008, "reward": 4.45, "reward_std": 0.1, "rewards/execution_reward_func": 0.45, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 2.0, "rewards/syntax_reward_func": 1.0, "step": 46 }, { "completion_length": 94.0, "epoch": 0.04359925788497217, "grad_norm": 0.6993905305862427, "kl": 0.013059131056070327, "learning_rate": 2e-05, "loss": 0.0005, "reward": 4.5, "reward_std": 0.6, "rewards/execution_reward_func": 0.75, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 1.75, "rewards/syntax_reward_func": 1.0, "step": 47 }, { "completion_length": 186.9, "epoch": 0.04452690166975881, "grad_norm": 0.8022165894508362, "kl": 0.007941013853996991, "learning_rate": 2e-05, "loss": 0.0003, "reward": 4.65, "reward_std": 1.648912525177002, "rewards/execution_reward_func": 0.4, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 2.25, "rewards/syntax_reward_func": 1.0, "step": 48 }, { "completion_length": 69.1, "epoch": 0.045454545454545456, "grad_norm": 0.5193853974342346, "kl": 0.0152623875066638, "learning_rate": 2e-05, "loss": 0.0006, "reward": 5.75, "reward_std": 0.5, "rewards/execution_reward_func": 1.0, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 2.75, "rewards/syntax_reward_func": 1.0, "step": 49 }, { "completion_length": 108.0, "epoch": 0.04638218923933209, "grad_norm": 0.5317401885986328, "kl": 0.01654429854825139, "learning_rate": 2e-05, "loss": 0.0007, "reward": 5.3, "reward_std": 0.5773502349853515, "rewards/execution_reward_func": 0.8, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 2.5, "rewards/syntax_reward_func": 1.0, "step": 50 }, { "completion_length": 99.05, "epoch": 0.04730983302411874, "grad_norm": 0.25006845593452454, "kl": 0.015682655945420267, "learning_rate": 2e-05, "loss": 0.0006, "reward": 5.65, "reward_std": 0.1, "rewards/execution_reward_func": 0.65, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 3.0, "rewards/syntax_reward_func": 1.0, "step": 51 }, { "completion_length": 139.95, "epoch": 0.04823747680890538, "grad_norm": 0.2602873146533966, "kl": 0.01236535501666367, "learning_rate": 2e-05, "loss": 0.0005, "reward": 5.15, "reward_std": 0.6154700517654419, "rewards/execution_reward_func": 0.9, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 2.25, "rewards/syntax_reward_func": 1.0, "step": 52 }, { "completion_length": 200.35, "epoch": 0.04916512059369202, "grad_norm": 0.8118894696235657, "kl": 0.01205776953138411, "learning_rate": 2e-05, "loss": 0.0005, "reward": 4.45, "reward_std": 1.7, "rewards/execution_reward_func": 0.7, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 1.75, "rewards/syntax_reward_func": 1.0, "step": 53 }, { "completion_length": 136.0, "epoch": 0.05009276437847866, "grad_norm": 0.30172234773635864, "kl": 0.012935555540025234, "learning_rate": 2e-05, "loss": 0.0005, "reward": 3.9, "reward_std": 0.5773502349853515, "rewards/execution_reward_func": 0.4, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 1.5, "rewards/syntax_reward_func": 1.0, "step": 54 }, { "completion_length": 139.45, "epoch": 0.05102040816326531, "grad_norm": 0.24473513662815094, "kl": 0.010810322500765324, "learning_rate": 2e-05, "loss": 0.0004, "reward": 4.55, "reward_std": 0.5, "rewards/execution_reward_func": 0.8, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 1.75, "rewards/syntax_reward_func": 1.0, "step": 55 }, { "completion_length": 206.95, "epoch": 0.05194805194805195, "grad_norm": 0.5211365222930908, "kl": 0.00964691461995244, "learning_rate": 2e-05, "loss": 0.0004, "reward": 4.1, "reward_std": 1.0, "rewards/execution_reward_func": 0.55, "rewards/import_check_reward_func": 0.9, "rewards/match_reward_func": 1.75, "rewards/syntax_reward_func": 0.9, "step": 56 }, { "completion_length": 154.15, "epoch": 0.05287569573283859, "grad_norm": 0.4427022933959961, "kl": 0.015764740016311406, "learning_rate": 2e-05, "loss": 0.0006, "reward": 2.75, "reward_std": 0.9928203105926514, "rewards/execution_reward_func": 0.35, "rewards/import_check_reward_func": 0.95, "rewards/match_reward_func": 0.5, "rewards/syntax_reward_func": 0.95, "step": 57 }, { "completion_length": 180.4, "epoch": 0.05380333951762523, "grad_norm": 0.33605360984802246, "kl": 0.012487533967942, "learning_rate": 2e-05, "loss": 0.0005, "reward": 4.175, "reward_std": 0.85, "rewards/execution_reward_func": 0.5, "rewards/import_check_reward_func": 0.975, "rewards/match_reward_func": 1.75, "rewards/syntax_reward_func": 0.95, "step": 58 }, { "completion_length": 115.35, "epoch": 0.05473098330241188, "grad_norm": 0.195700541138649, "kl": 0.01763147208839655, "learning_rate": 2e-05, "loss": 0.0007, "reward": 5.45, "reward_std": 0.1, "rewards/execution_reward_func": 0.45, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 3.0, "rewards/syntax_reward_func": 1.0, "step": 59 }, { "completion_length": 118.45, "epoch": 0.055658627087198514, "grad_norm": 0.20909559726715088, "kl": 0.0156675161793828, "learning_rate": 2e-05, "loss": 0.0006, "reward": 4.3, "reward_std": 0.6, "rewards/execution_reward_func": 0.55, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 1.75, "rewards/syntax_reward_func": 1.0, "step": 60 }, { "completion_length": 106.85, "epoch": 0.05658627087198516, "grad_norm": 0.6488444209098816, "kl": 0.018127623945474625, "learning_rate": 2e-05, "loss": 0.0007, "reward": 5.75, "reward_std": 1.2773502349853516, "rewards/execution_reward_func": 0.5, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 3.25, "rewards/syntax_reward_func": 1.0, "step": 61 }, { "completion_length": 158.65, "epoch": 0.0575139146567718, "grad_norm": 0.6527036428451538, "kl": 0.02021341770887375, "learning_rate": 2e-05, "loss": 0.0008, "reward": 5.45, "reward_std": 1.615470051765442, "rewards/execution_reward_func": 0.7, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 2.75, "rewards/syntax_reward_func": 1.0, "step": 62 }, { "completion_length": 48.15, "epoch": 0.05844155844155844, "grad_norm": 1.3493740558624268, "kl": 0.1433052785694599, "learning_rate": 2e-05, "loss": 0.0057, "reward": 7.55, "reward_std": 0.5, "rewards/execution_reward_func": 0.8, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 4.75, "rewards/syntax_reward_func": 1.0, "step": 63 }, { "completion_length": 185.65, "epoch": 0.059369202226345084, "grad_norm": 0.1428665667772293, "kl": 0.01815747832879424, "learning_rate": 2e-05, "loss": 0.0007, "reward": 4.325, "reward_std": 0.18929693698883057, "rewards/execution_reward_func": 0.6, "rewards/import_check_reward_func": 0.875, "rewards/match_reward_func": 2.0, "rewards/syntax_reward_func": 0.85, "step": 64 }, { "completion_length": 162.3, "epoch": 0.06029684601113173, "grad_norm": 0.6011455059051514, "kl": 0.012370403949171304, "learning_rate": 2e-05, "loss": 0.0005, "reward": 4.0, "reward_std": 1.02315514087677, "rewards/execution_reward_func": 0.6, "rewards/import_check_reward_func": 0.95, "rewards/match_reward_func": 1.5, "rewards/syntax_reward_func": 0.95, "step": 65 }, { "completion_length": 111.0, "epoch": 0.061224489795918366, "grad_norm": 0.1835913360118866, "kl": 0.014051128178834915, "learning_rate": 2e-05, "loss": 0.0006, "reward": 6.5, "reward_std": 0.6, "rewards/execution_reward_func": 0.25, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 4.25, "rewards/syntax_reward_func": 1.0, "step": 66 }, { "completion_length": 195.7, "epoch": 0.06215213358070501, "grad_norm": 1.6659204959869385, "kl": 0.014673770777881146, "learning_rate": 2e-05, "loss": 0.0006, "reward": 3.85, "reward_std": 0.6516611576080322, "rewards/execution_reward_func": 0.1, "rewards/import_check_reward_func": 0.85, "rewards/match_reward_func": 2.0, "rewards/syntax_reward_func": 0.9, "step": 67 }, { "completion_length": 42.05, "epoch": 0.06307977736549165, "grad_norm": 0.46125537157058716, "kl": 0.01653697257861495, "learning_rate": 2e-05, "loss": 0.0007, "reward": 5.85, "reward_std": 0.1, "rewards/execution_reward_func": 0.85, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 3.0, "rewards/syntax_reward_func": 1.0, "step": 68 }, { "completion_length": 53.2, "epoch": 0.0640074211502783, "grad_norm": 0.6094168424606323, "kl": 0.018516975454986095, "learning_rate": 2e-05, "loss": 0.0007, "reward": 6.2, "reward_std": 1.2, "rewards/execution_reward_func": 0.7, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 3.5, "rewards/syntax_reward_func": 1.0, "step": 69 }, { "completion_length": 193.55, "epoch": 0.06493506493506493, "grad_norm": 0.586470365524292, "kl": 0.014461748860776424, "learning_rate": 2e-05, "loss": 0.0006, "reward": 3.825, "reward_std": 0.9016611576080322, "rewards/execution_reward_func": 0.25, "rewards/import_check_reward_func": 0.925, "rewards/match_reward_func": 1.75, "rewards/syntax_reward_func": 0.9, "step": 70 }, { "completion_length": 102.05, "epoch": 0.06586270871985157, "grad_norm": 0.42713698744773865, "kl": 0.015026956889778375, "learning_rate": 2e-05, "loss": 0.0006, "reward": 5.55, "reward_std": 1.1, "rewards/execution_reward_func": 0.55, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 3.0, "rewards/syntax_reward_func": 1.0, "step": 71 }, { "completion_length": 179.7, "epoch": 0.06679035250463822, "grad_norm": 0.2813414931297302, "kl": 0.0110438191331923, "learning_rate": 2e-05, "loss": 0.0004, "reward": 5.3, "reward_std": 0.7403124332427978, "rewards/execution_reward_func": 0.8, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 2.5, "rewards/syntax_reward_func": 1.0, "step": 72 }, { "completion_length": 188.3, "epoch": 0.06771799628942486, "grad_norm": 0.26418182253837585, "kl": 0.013095348398201168, "learning_rate": 2e-05, "loss": 0.0005, "reward": 5.45, "reward_std": 0.38284270763397216, "rewards/execution_reward_func": 0.55, "rewards/import_check_reward_func": 0.95, "rewards/match_reward_func": 3.0, "rewards/syntax_reward_func": 0.95, "step": 73 }, { "completion_length": 223.4, "epoch": 0.0686456400742115, "grad_norm": 0.23218098282814026, "kl": 0.013283595815300942, "learning_rate": 2e-05, "loss": 0.0005, "reward": 4.7, "reward_std": 0.3983127593994141, "rewards/execution_reward_func": 0.8, "rewards/import_check_reward_func": 0.95, "rewards/match_reward_func": 2.0, "rewards/syntax_reward_func": 0.95, "step": 74 }, { "completion_length": 210.85, "epoch": 0.06957328385899815, "grad_norm": 0.32164114713668823, "kl": 0.014888664055615664, "learning_rate": 2e-05, "loss": 0.0006, "reward": 3.45, "reward_std": 0.2154700517654419, "rewards/execution_reward_func": 0.45, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 1.0, "rewards/syntax_reward_func": 1.0, "step": 75 }, { "completion_length": 43.05, "epoch": 0.07050092764378478, "grad_norm": 0.06279900670051575, "kl": 0.026899104565382005, "learning_rate": 2e-05, "loss": 0.0011, "reward": 7.6, "reward_std": 0.0, "rewards/execution_reward_func": 0.6, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 5.0, "rewards/syntax_reward_func": 1.0, "step": 76 }, { "completion_length": 143.75, "epoch": 0.07142857142857142, "grad_norm": 0.38623046875, "kl": 0.012465272471308709, "learning_rate": 2e-05, "loss": 0.0005, "reward": 3.55, "reward_std": 0.1, "rewards/execution_reward_func": 0.55, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 1.0, "rewards/syntax_reward_func": 1.0, "step": 77 }, { "completion_length": 137.65, "epoch": 0.07235621521335807, "grad_norm": 0.3098149001598358, "kl": 0.015718556847423314, "learning_rate": 2e-05, "loss": 0.0006, "reward": 4.15, "reward_std": 0.7928203105926513, "rewards/execution_reward_func": 0.65, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 1.5, "rewards/syntax_reward_func": 1.0, "step": 78 }, { "completion_length": 156.3, "epoch": 0.07328385899814471, "grad_norm": 0.5492616891860962, "kl": 0.015015369467437267, "learning_rate": 2e-05, "loss": 0.0006, "reward": 4.35, "reward_std": 1.3, "rewards/execution_reward_func": 0.45, "rewards/import_check_reward_func": 0.95, "rewards/match_reward_func": 2.0, "rewards/syntax_reward_func": 0.95, "step": 79 }, { "completion_length": 209.25, "epoch": 0.07421150278293136, "grad_norm": 0.39929476380348206, "kl": 0.010218921303749084, "learning_rate": 2e-05, "loss": 0.0004, "reward": 3.85, "reward_std": 0.8154700517654419, "rewards/execution_reward_func": 0.6, "rewards/import_check_reward_func": 1.0, "rewards/match_reward_func": 1.25, "rewards/syntax_reward_func": 1.0, "step": 80 } ], "logging_steps": 1, "max_steps": 80, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }