| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 39, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 3541.9607543945312, |
| "epoch": 0.02564102564102564, |
| "grad_norm": 5.551374912261963, |
| "kl": 0.0, |
| "learning_rate": 0.0, |
| "loss": 0.0, |
| "reward": 0.3934539742767811, |
| "reward_std": 0.3425147756934166, |
| "rewards/code_reward": 0.3934539742767811, |
| "rewards/format_reward": 0.0, |
| "step": 1 |
| }, |
| { |
| "completion_length": 3543.0106201171875, |
| "epoch": 0.05128205128205128, |
| "grad_norm": 1.4287736415863037, |
| "kl": 0.0, |
| "learning_rate": 5e-06, |
| "loss": 0.0, |
| "reward": 0.46500000543892384, |
| "reward_std": 0.19661623612046242, |
| "rewards/code_reward": 0.46499999053776264, |
| "rewards/format_reward": 0.0, |
| "step": 2 |
| }, |
| { |
| "completion_length": 3270.2929077148438, |
| "epoch": 0.07692307692307693, |
| "grad_norm": 2.6348750591278076, |
| "kl": 0.00787353515625, |
| "learning_rate": 1e-05, |
| "loss": 0.0003, |
| "reward": 0.5235317498445511, |
| "reward_std": 0.30637360364198685, |
| "rewards/code_reward": 0.5235317498445511, |
| "rewards/format_reward": 0.0, |
| "step": 3 |
| }, |
| { |
| "completion_length": 2943.4500122070312, |
| "epoch": 0.10256410256410256, |
| "grad_norm": 0.7752403020858765, |
| "kl": 0.0511474609375, |
| "learning_rate": 9.98378869844137e-06, |
| "loss": 0.002, |
| "reward": 0.5904354751110077, |
| "reward_std": 0.21103981602936983, |
| "rewards/code_reward": 0.5904354676604271, |
| "rewards/format_reward": 0.0, |
| "step": 4 |
| }, |
| { |
| "completion_length": 3438.1785888671875, |
| "epoch": 0.1282051282051282, |
| "grad_norm": 3.3206803798675537, |
| "kl": 0.120361328125, |
| "learning_rate": 9.935271596564688e-06, |
| "loss": 0.0048, |
| "reward": 0.47174738347530365, |
| "reward_std": 0.1500780526548624, |
| "rewards/code_reward": 0.47174738347530365, |
| "rewards/format_reward": 0.0, |
| "step": 5 |
| }, |
| { |
| "completion_length": 3396.6856689453125, |
| "epoch": 0.15384615384615385, |
| "grad_norm": 0.2983720600605011, |
| "kl": 0.2265625, |
| "learning_rate": 9.854798261200746e-06, |
| "loss": 0.0091, |
| "reward": 0.5083690956234932, |
| "reward_std": 0.23467476293444633, |
| "rewards/code_reward": 0.5083690956234932, |
| "rewards/format_reward": 0.0, |
| "step": 6 |
| }, |
| { |
| "completion_length": 3636.0071411132812, |
| "epoch": 0.1794871794871795, |
| "grad_norm": 0.29332682490348816, |
| "kl": 0.26611328125, |
| "learning_rate": 9.74294850457488e-06, |
| "loss": 0.0106, |
| "reward": 0.387291356921196, |
| "reward_std": 0.23526490107178688, |
| "rewards/code_reward": 0.387291356921196, |
| "rewards/format_reward": 0.0, |
| "step": 7 |
| }, |
| { |
| "completion_length": 3375.800048828125, |
| "epoch": 0.20512820512820512, |
| "grad_norm": 0.3416775166988373, |
| "kl": 0.3359375, |
| "learning_rate": 9.600528206746613e-06, |
| "loss": 0.0134, |
| "reward": 0.48297154158353806, |
| "reward_std": 0.25456428155303, |
| "rewards/code_reward": 0.48297156393527985, |
| "rewards/format_reward": 0.0, |
| "step": 8 |
| }, |
| { |
| "completion_length": 3155.9608154296875, |
| "epoch": 0.23076923076923078, |
| "grad_norm": 0.240010604262352, |
| "kl": 0.3779296875, |
| "learning_rate": 9.428563509225348e-06, |
| "loss": 0.0151, |
| "reward": 0.5900027677416801, |
| "reward_std": 0.18181878328323364, |
| "rewards/code_reward": 0.5900027677416801, |
| "rewards/format_reward": 0.0, |
| "step": 9 |
| }, |
| { |
| "completion_length": 3539.346435546875, |
| "epoch": 0.2564102564102564, |
| "grad_norm": 0.23563309013843536, |
| "kl": 0.3955078125, |
| "learning_rate": 9.22829342159729e-06, |
| "loss": 0.0158, |
| "reward": 0.4020918384194374, |
| "reward_std": 0.22648156061768532, |
| "rewards/code_reward": 0.4020918384194374, |
| "rewards/format_reward": 0.0, |
| "step": 10 |
| }, |
| { |
| "completion_length": 3615.935791015625, |
| "epoch": 0.28205128205128205, |
| "grad_norm": 0.19946347177028656, |
| "kl": 0.47216796875, |
| "learning_rate": 9.001160894432979e-06, |
| "loss": 0.0189, |
| "reward": 0.41117217019200325, |
| "reward_std": 0.20069691445678473, |
| "rewards/code_reward": 0.41117217019200325, |
| "rewards/format_reward": 0.0, |
| "step": 11 |
| }, |
| { |
| "completion_length": 3652.20361328125, |
| "epoch": 0.3076923076923077, |
| "grad_norm": 0.21409198641777039, |
| "kl": 0.556640625, |
| "learning_rate": 8.748802422795361e-06, |
| "loss": 0.0222, |
| "reward": 0.41303257271647453, |
| "reward_std": 0.2046195026487112, |
| "rewards/code_reward": 0.41303258016705513, |
| "rewards/format_reward": 0.0, |
| "step": 12 |
| }, |
| { |
| "completion_length": 3682.6357421875, |
| "epoch": 0.3333333333333333, |
| "grad_norm": 0.6496288180351257, |
| "kl": 0.580078125, |
| "learning_rate": 8.473036255255368e-06, |
| "loss": 0.0232, |
| "reward": 0.4388655610382557, |
| "reward_std": 0.22819811291992664, |
| "rewards/code_reward": 0.4388655610382557, |
| "rewards/format_reward": 0.0, |
| "step": 13 |
| }, |
| { |
| "completion_length": 3709.0142211914062, |
| "epoch": 0.358974358974359, |
| "grad_norm": 0.23685197532176971, |
| "kl": 0.6474609375, |
| "learning_rate": 8.175849293369292e-06, |
| "loss": 0.0259, |
| "reward": 0.37063881754875183, |
| "reward_std": 0.23854901269078255, |
| "rewards/code_reward": 0.37063881754875183, |
| "rewards/format_reward": 0.0, |
| "step": 14 |
| }, |
| { |
| "completion_length": 3758.8857421875, |
| "epoch": 0.38461538461538464, |
| "grad_norm": 0.22902953624725342, |
| "kl": 0.66015625, |
| "learning_rate": 7.859382776007544e-06, |
| "loss": 0.0264, |
| "reward": 0.31185516342520714, |
| "reward_std": 0.22547182254493237, |
| "rewards/code_reward": 0.31185516342520714, |
| "rewards/format_reward": 0.0, |
| "step": 15 |
| }, |
| { |
| "completion_length": 3474.4249877929688, |
| "epoch": 0.41025641025641024, |
| "grad_norm": 0.2739468514919281, |
| "kl": 0.658203125, |
| "learning_rate": 7.52591685167953e-06, |
| "loss": 0.0263, |
| "reward": 0.46756455302238464, |
| "reward_std": 0.21291000582277775, |
| "rewards/code_reward": 0.46756456792354584, |
| "rewards/format_reward": 0.0, |
| "step": 16 |
| }, |
| { |
| "completion_length": 3608.0107421875, |
| "epoch": 0.4358974358974359, |
| "grad_norm": 0.26821818947792053, |
| "kl": 0.66796875, |
| "learning_rate": 7.1778541500113895e-06, |
| "loss": 0.0267, |
| "reward": 0.33532585576176643, |
| "reward_std": 0.29637256264686584, |
| "rewards/code_reward": 0.33532586693763733, |
| "rewards/format_reward": 0.0, |
| "step": 17 |
| }, |
| { |
| "completion_length": 3718.9964599609375, |
| "epoch": 0.46153846153846156, |
| "grad_norm": 0.24430078268051147, |
| "kl": 0.6259765625, |
| "learning_rate": 6.817702470744477e-06, |
| "loss": 0.025, |
| "reward": 0.22364513762295246, |
| "reward_std": 0.23529299348592758, |
| "rewards/code_reward": 0.22364513762295246, |
| "rewards/format_reward": 0.0, |
| "step": 18 |
| }, |
| { |
| "completion_length": 3312.9071044921875, |
| "epoch": 0.48717948717948717, |
| "grad_norm": 0.2842939794063568, |
| "kl": 0.5859375, |
| "learning_rate": 6.448056714980768e-06, |
| "loss": 0.0234, |
| "reward": 0.46438145264983177, |
| "reward_std": 0.2700807861983776, |
| "rewards/code_reward": 0.46438145637512207, |
| "rewards/format_reward": 0.0, |
| "step": 19 |
| }, |
| { |
| "completion_length": 3342.6749877929688, |
| "epoch": 0.5128205128205128, |
| "grad_norm": 0.2545294761657715, |
| "kl": 0.5576171875, |
| "learning_rate": 6.071580188860955e-06, |
| "loss": 0.0223, |
| "reward": 0.5150031447410583, |
| "reward_std": 0.2922932505607605, |
| "rewards/code_reward": 0.5150031298398972, |
| "rewards/format_reward": 0.0, |
| "step": 20 |
| }, |
| { |
| "completion_length": 2976.3214111328125, |
| "epoch": 0.5384615384615384, |
| "grad_norm": 0.23293468356132507, |
| "kl": 0.6181640625, |
| "learning_rate": 5.690985414382668e-06, |
| "loss": 0.0247, |
| "reward": 0.46903225034475327, |
| "reward_std": 0.23096787184476852, |
| "rewards/code_reward": 0.4690322279930115, |
| "rewards/format_reward": 0.0, |
| "step": 21 |
| }, |
| { |
| "completion_length": 3359.9892578125, |
| "epoch": 0.5641025641025641, |
| "grad_norm": 0.2094847410917282, |
| "kl": 0.5703125, |
| "learning_rate": 5.309014585617335e-06, |
| "loss": 0.0228, |
| "reward": 0.40152605809271336, |
| "reward_std": 0.2346283309161663, |
| "rewards/code_reward": 0.40152604319155216, |
| "rewards/format_reward": 0.0, |
| "step": 22 |
| }, |
| { |
| "completion_length": 2678.5321655273438, |
| "epoch": 0.5897435897435898, |
| "grad_norm": 0.24114222824573517, |
| "kl": 0.5654296875, |
| "learning_rate": 4.928419811139046e-06, |
| "loss": 0.0227, |
| "reward": 0.608155146241188, |
| "reward_std": 0.14721081405878067, |
| "rewards/code_reward": 0.6081551611423492, |
| "rewards/format_reward": 0.0, |
| "step": 23 |
| }, |
| { |
| "completion_length": 2912.8821411132812, |
| "epoch": 0.6153846153846154, |
| "grad_norm": 0.3605176508426666, |
| "kl": 0.5908203125, |
| "learning_rate": 4.551943285019233e-06, |
| "loss": 0.0236, |
| "reward": 0.5329480394721031, |
| "reward_std": 0.1990287434309721, |
| "rewards/code_reward": 0.5329480245709419, |
| "rewards/format_reward": 0.0, |
| "step": 24 |
| }, |
| { |
| "completion_length": 3301.3857421875, |
| "epoch": 0.6410256410256411, |
| "grad_norm": 6.35435152053833, |
| "kl": 0.716796875, |
| "learning_rate": 4.182297529255525e-06, |
| "loss": 0.0287, |
| "reward": 0.36159154400229454, |
| "reward_std": 0.2206190638244152, |
| "rewards/code_reward": 0.36159154027700424, |
| "rewards/format_reward": 0.0, |
| "step": 25 |
| }, |
| { |
| "completion_length": 3339.2214965820312, |
| "epoch": 0.6666666666666666, |
| "grad_norm": 0.4265838861465454, |
| "kl": 0.7275390625, |
| "learning_rate": 3.822145849988612e-06, |
| "loss": 0.0291, |
| "reward": 0.3386395741254091, |
| "reward_std": 0.18014118261635303, |
| "rewards/code_reward": 0.3386395741254091, |
| "rewards/format_reward": 0.0, |
| "step": 26 |
| }, |
| { |
| "completion_length": 3056.1214599609375, |
| "epoch": 0.6923076923076923, |
| "grad_norm": 0.4635300636291504, |
| "kl": 0.7158203125, |
| "learning_rate": 3.4740831483204696e-06, |
| "loss": 0.0286, |
| "reward": 0.4216299280524254, |
| "reward_std": 0.24011335149407387, |
| "rewards/code_reward": 0.4216299429535866, |
| "rewards/format_reward": 0.0, |
| "step": 27 |
| }, |
| { |
| "completion_length": 2741.1821899414062, |
| "epoch": 0.717948717948718, |
| "grad_norm": 0.26547616720199585, |
| "kl": 0.6884765625, |
| "learning_rate": 3.1406172239924583e-06, |
| "loss": 0.0275, |
| "reward": 0.5913942456245422, |
| "reward_std": 0.19475560076534748, |
| "rewards/code_reward": 0.5913942456245422, |
| "rewards/format_reward": 0.0, |
| "step": 28 |
| }, |
| { |
| "completion_length": 3502.3857421875, |
| "epoch": 0.7435897435897436, |
| "grad_norm": 0.3135121464729309, |
| "kl": 0.8955078125, |
| "learning_rate": 2.8241507066307106e-06, |
| "loss": 0.0358, |
| "reward": 0.33309811167418957, |
| "reward_std": 0.2021036557853222, |
| "rewards/code_reward": 0.3330980967730284, |
| "rewards/format_reward": 0.0, |
| "step": 29 |
| }, |
| { |
| "completion_length": 3053.2321166992188, |
| "epoch": 0.7692307692307693, |
| "grad_norm": 0.25711485743522644, |
| "kl": 0.880859375, |
| "learning_rate": 2.526963744744635e-06, |
| "loss": 0.0352, |
| "reward": 0.44350775331258774, |
| "reward_std": 0.19116137735545635, |
| "rewards/code_reward": 0.44350775331258774, |
| "rewards/format_reward": 0.0, |
| "step": 30 |
| }, |
| { |
| "completion_length": 3173.0178833007812, |
| "epoch": 0.7948717948717948, |
| "grad_norm": 0.3827289044857025, |
| "kl": 0.8798828125, |
| "learning_rate": 2.2511975772046403e-06, |
| "loss": 0.0352, |
| "reward": 0.42228348553180695, |
| "reward_std": 0.205778568983078, |
| "rewards/code_reward": 0.42228347808122635, |
| "rewards/format_reward": 0.0, |
| "step": 31 |
| }, |
| { |
| "completion_length": 3241.3535766601562, |
| "epoch": 0.8205128205128205, |
| "grad_norm": 0.4431661367416382, |
| "kl": 0.9033203125, |
| "learning_rate": 1.9988391055670234e-06, |
| "loss": 0.0362, |
| "reward": 0.3359471336007118, |
| "reward_std": 0.20435638166964054, |
| "rewards/code_reward": 0.3359471336007118, |
| "rewards/format_reward": 0.0, |
| "step": 32 |
| }, |
| { |
| "completion_length": 2643.057159423828, |
| "epoch": 0.8461538461538461, |
| "grad_norm": 0.5116384029388428, |
| "kl": 0.8017578125, |
| "learning_rate": 1.771706578402711e-06, |
| "loss": 0.0321, |
| "reward": 0.5609605759382248, |
| "reward_std": 0.18299106322228909, |
| "rewards/code_reward": 0.5609605610370636, |
| "rewards/format_reward": 0.0, |
| "step": 33 |
| }, |
| { |
| "completion_length": 2872.3214721679688, |
| "epoch": 0.8717948717948718, |
| "grad_norm": 0.44135168194770813, |
| "kl": 0.962890625, |
| "learning_rate": 1.5714364907746535e-06, |
| "loss": 0.0385, |
| "reward": 0.41167889907956123, |
| "reward_std": 0.1887526996433735, |
| "rewards/code_reward": 0.41167885810136795, |
| "rewards/format_reward": 0.0, |
| "step": 34 |
| }, |
| { |
| "completion_length": 2978.9928588867188, |
| "epoch": 0.8974358974358975, |
| "grad_norm": 0.6080113053321838, |
| "kl": 1.0380859375, |
| "learning_rate": 1.399471793253389e-06, |
| "loss": 0.0416, |
| "reward": 0.3960940055549145, |
| "reward_std": 0.20829082280397415, |
| "rewards/code_reward": 0.3960940055549145, |
| "rewards/format_reward": 0.0, |
| "step": 35 |
| }, |
| { |
| "completion_length": 3258.403564453125, |
| "epoch": 0.9230769230769231, |
| "grad_norm": 0.41639065742492676, |
| "kl": 1.072265625, |
| "learning_rate": 1.257051495425121e-06, |
| "loss": 0.0429, |
| "reward": 0.1503874734044075, |
| "reward_std": 0.12068512104451656, |
| "rewards/code_reward": 0.15038747526705265, |
| "rewards/format_reward": 0.0, |
| "step": 36 |
| }, |
| { |
| "completion_length": 2731.4713745117188, |
| "epoch": 0.9487179487179487, |
| "grad_norm": 0.33002978563308716, |
| "kl": 1.0263671875, |
| "learning_rate": 1.1452017387992552e-06, |
| "loss": 0.041, |
| "reward": 0.234993327409029, |
| "reward_std": 0.1311760265380144, |
| "rewards/code_reward": 0.234993327409029, |
| "rewards/format_reward": 0.0, |
| "step": 37 |
| }, |
| { |
| "completion_length": 2790.7571411132812, |
| "epoch": 0.9743589743589743, |
| "grad_norm": 0.3518536388874054, |
| "kl": 1.005859375, |
| "learning_rate": 1.0647284034353122e-06, |
| "loss": 0.0402, |
| "reward": 0.28501298278570175, |
| "reward_std": 0.07613634853623807, |
| "rewards/code_reward": 0.28501297906041145, |
| "rewards/format_reward": 0.0, |
| "step": 38 |
| }, |
| { |
| "completion_length": 2785.1597290039062, |
| "epoch": 1.0, |
| "grad_norm": 0.6018698215484619, |
| "kl": 1.119140625, |
| "learning_rate": 1.0162113015586309e-06, |
| "loss": 0.0448, |
| "reward": 0.26603568717837334, |
| "reward_std": 0.09410964651033282, |
| "rewards/code_reward": 0.26603569462895393, |
| "rewards/format_reward": 0.0, |
| "step": 39 |
| }, |
| { |
| "epoch": 1.0, |
| "step": 39, |
| "total_flos": 0.0, |
| "train_loss": 0.024176058914698324, |
| "train_runtime": 19938.5063, |
| "train_samples_per_second": 0.054, |
| "train_steps_per_second": 0.002 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 39, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 10, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|