{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9893390191897655, "eval_steps": 60, "global_step": 232, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 664.3515930175781, "epoch": 0.017057569296375266, "grad_norm": 0.11565207690000534, "kl": 0.0, "learning_rate": 5e-07, "loss": 0.1249, "reward": 0.8191964775323868, "reward_std": 0.1755836745724082, "rewards/accuracy_reward": 0.8191964775323868, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 661.427487373352, "epoch": 0.08528784648187633, "grad_norm": 0.1410389542579651, "kl": 0.00010322034358978271, "learning_rate": 5e-07, "loss": 0.0815, "reward": 0.7940848618745804, "reward_std": 0.16921476647257805, "rewards/accuracy_reward": 0.7940848618745804, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 648.45962600708, "epoch": 0.17057569296375266, "grad_norm": 0.09059495478868484, "kl": 0.00012706518173217775, "learning_rate": 5e-07, "loss": 0.092, "reward": 0.8165178954601288, "reward_std": 0.1695016896352172, "rewards/accuracy_reward": 0.8165178954601288, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 635.6861877441406, "epoch": 0.255863539445629, "grad_norm": 0.23655401170253754, "kl": 0.0001537799835205078, "learning_rate": 5e-07, "loss": 0.1002, "reward": 0.8232143238186836, "reward_std": 0.17031898349523544, "rewards/accuracy_reward": 0.8232143238186836, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 636.7087341308594, "epoch": 0.3411513859275053, "grad_norm": 0.15169048309326172, "kl": 0.0003520965576171875, "learning_rate": 5e-07, "loss": 0.0965, "reward": 0.8183036118745803, "reward_std": 0.16691437950357796, "rewards/accuracy_reward": 0.8183036118745803, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 660.6172164916992, "epoch": 0.42643923240938164, "grad_norm": 0.10549971461296082, "kl": 0.00020837783813476562, "learning_rate": 5e-07, "loss": 0.0838, "reward": 0.813392898440361, "reward_std": 0.17468413366004826, "rewards/accuracy_reward": 0.813392898440361, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 659.3672187805175, "epoch": 0.511727078891258, "grad_norm": 0.13681049644947052, "kl": 0.00038820505142211914, "learning_rate": 5e-07, "loss": 0.0786, "reward": 0.80357146859169, "reward_std": 0.17490468453615904, "rewards/accuracy_reward": 0.80357146859169, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 660.5917663574219, "epoch": 0.5970149253731343, "grad_norm": 0.09065572917461395, "kl": 0.0004504680633544922, "learning_rate": 5e-07, "loss": 0.0824, "reward": 0.8071428924798966, "reward_std": 0.1621523329988122, "rewards/accuracy_reward": 0.8071428924798966, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 638.1212364196778, "epoch": 0.6823027718550106, "grad_norm": 0.09261901676654816, "kl": 0.0005172014236450196, "learning_rate": 5e-07, "loss": 0.0585, "reward": 0.8138393208384513, "reward_std": 0.1534264313057065, "rewards/accuracy_reward": 0.8138393208384513, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 664.7774856567382, "epoch": 0.767590618336887, "grad_norm": 0.09141222387552261, "kl": 0.00053253173828125, "learning_rate": 5e-07, "loss": 0.0687, "reward": 0.8071428909897804, "reward_std": 0.16072208830155432, "rewards/accuracy_reward": 0.8071428909897804, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 662.5498062133789, "epoch": 0.8528784648187633, "grad_norm": 0.24577292799949646, "kl": 0.0011467933654785156, "learning_rate": 5e-07, "loss": 0.0649, "reward": 0.8042411059141159, "reward_std": 0.16364638023078443, "rewards/accuracy_reward": 0.8042411059141159, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 643.9475730895996, "epoch": 0.9381663113006397, "grad_norm": 0.10820304602384567, "kl": 0.000740814208984375, "learning_rate": 5e-07, "loss": 0.0622, "reward": 0.8261161133646965, "reward_std": 0.15972621561959385, "rewards/accuracy_reward": 0.8261161133646965, "step": 55 }, { "epoch": 1.0341151385927505, "grad_norm": 0.1097937524318695, "learning_rate": 5e-07, "loss": 0.0662, "step": 60 }, { "epoch": 1.0341151385927505, "eval_clip_ratio": 0.0, "eval_completion_length": 638.2177686691284, "eval_kl": 0.0012085437774658203, "eval_loss": 0.027663394808769226, "eval_reward": 0.7151227928698063, "eval_reward_std": 0.2182473847642541, "eval_rewards/accuracy_reward": 0.7151227928698063, "eval_runtime": 835.396, "eval_samples_per_second": 0.599, "eval_steps_per_second": 0.006, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 630.2452568054199, "epoch": 1.1194029850746268, "grad_norm": 0.08139240741729736, "kl": 0.0015056610107421875, "learning_rate": 5e-07, "loss": 0.0579, "reward": 0.8170759312808513, "reward_std": 0.16009651254862547, "rewards/accuracy_reward": 0.8170759312808513, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 620.5440017700196, "epoch": 1.2046908315565032, "grad_norm": 0.10313019156455994, "kl": 0.0016027450561523437, "learning_rate": 5e-07, "loss": 0.0586, "reward": 0.8310268223285675, "reward_std": 0.1424413041677326, "rewards/accuracy_reward": 0.8310268223285675, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 649.887752532959, "epoch": 1.2899786780383795, "grad_norm": 0.09998168796300888, "kl": 0.00717315673828125, "learning_rate": 5e-07, "loss": 0.0564, "reward": 0.8100446775555611, "reward_std": 0.1757219755090773, "rewards/accuracy_reward": 0.8100446775555611, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 614.6263717651367, "epoch": 1.375266524520256, "grad_norm": 0.08961261808872223, "kl": 0.0022918701171875, "learning_rate": 5e-07, "loss": 0.0376, "reward": 0.8328125387430191, "reward_std": 0.13861298179253936, "rewards/accuracy_reward": 0.8328125387430191, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 631.008511352539, "epoch": 1.4605543710021323, "grad_norm": 0.1273442804813385, "kl": 0.002947235107421875, "learning_rate": 5e-07, "loss": 0.0541, "reward": 0.8229911118745804, "reward_std": 0.14886255729943515, "rewards/accuracy_reward": 0.8229911118745804, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 646.872346496582, "epoch": 1.5458422174840085, "grad_norm": 0.15443700551986694, "kl": 0.0033596038818359377, "learning_rate": 5e-07, "loss": 0.0595, "reward": 0.809821467101574, "reward_std": 0.15138995712623, "rewards/accuracy_reward": 0.809821467101574, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 631.2808311462402, "epoch": 1.6311300639658848, "grad_norm": 0.09066915512084961, "kl": 0.004022216796875, "learning_rate": 5e-07, "loss": 0.0418, "reward": 0.8258928924798965, "reward_std": 0.1533732468262315, "rewards/accuracy_reward": 0.8258928924798965, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 627.2167701721191, "epoch": 1.716417910447761, "grad_norm": 0.10236337780952454, "kl": 0.011969375610351562, "learning_rate": 5e-07, "loss": 0.0372, "reward": 0.8267857551574707, "reward_std": 0.13705341126769782, "rewards/accuracy_reward": 0.8267857551574707, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 643.0727920532227, "epoch": 1.8017057569296375, "grad_norm": 0.09229780733585358, "kl": 0.00559539794921875, "learning_rate": 5e-07, "loss": 0.0289, "reward": 0.8116071805357933, "reward_std": 0.147033178107813, "rewards/accuracy_reward": 0.8116071805357933, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 634.0770401000976, "epoch": 1.886993603411514, "grad_norm": 0.1279992163181305, "kl": 0.006862640380859375, "learning_rate": 5e-07, "loss": 0.0364, "reward": 0.8312500357627869, "reward_std": 0.14459644490852952, "rewards/accuracy_reward": 0.8312500357627869, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 643.7125259399414, "epoch": 1.9722814498933903, "grad_norm": 0.12527693808078766, "kl": 0.00738983154296875, "learning_rate": 5e-07, "loss": 0.0394, "reward": 0.8138393253087998, "reward_std": 0.15650860401801764, "rewards/accuracy_reward": 0.8138393253087998, "step": 115 }, { "epoch": 2.068230277185501, "grad_norm": 0.13853299617767334, "learning_rate": 5e-07, "loss": 0.0354, "step": 120 }, { "epoch": 2.068230277185501, "eval_clip_ratio": 0.0, "eval_completion_length": 627.0459775924683, "eval_kl": 0.009876251220703125, "eval_loss": 0.023924430832266808, "eval_reward": 0.7343750353902578, "eval_reward_std": 0.19236661097966135, "eval_rewards/accuracy_reward": 0.7343750353902578, "eval_runtime": 697.2301, "eval_samples_per_second": 0.717, "eval_steps_per_second": 0.007, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 621.6253170013428, "epoch": 2.1535181236673773, "grad_norm": 0.11815498024225235, "kl": 0.01000518798828125, "learning_rate": 5e-07, "loss": 0.0358, "reward": 0.8255580753087998, "reward_std": 0.14198732506483794, "rewards/accuracy_reward": 0.8255580753087998, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 621.314752960205, "epoch": 2.2388059701492535, "grad_norm": 0.113522969186306, "kl": 0.0125152587890625, "learning_rate": 5e-07, "loss": 0.0269, "reward": 0.8386161178350449, "reward_std": 0.14197837365791202, "rewards/accuracy_reward": 0.8386161178350449, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 642.9511466979981, "epoch": 2.3240938166311302, "grad_norm": 0.14222967624664307, "kl": 0.01525726318359375, "learning_rate": 5e-07, "loss": 0.0476, "reward": 0.7872768193483353, "reward_std": 0.14514056108891965, "rewards/accuracy_reward": 0.7872768193483353, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 628.785961151123, "epoch": 2.4093816631130065, "grad_norm": 0.13704024255275726, "kl": 0.01926116943359375, "learning_rate": 5e-07, "loss": 0.0403, "reward": 0.8256696745753288, "reward_std": 0.14226720854640007, "rewards/accuracy_reward": 0.8256696745753288, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 614.7006958007812, "epoch": 2.4946695095948828, "grad_norm": 0.19874536991119385, "kl": 0.0266326904296875, "learning_rate": 5e-07, "loss": 0.0278, "reward": 0.8165178939700126, "reward_std": 0.16517118187621235, "rewards/accuracy_reward": 0.8165178939700126, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 649.61431350708, "epoch": 2.579957356076759, "grad_norm": 0.40368160605430603, "kl": 0.0365936279296875, "learning_rate": 5e-07, "loss": 0.0341, "reward": 0.7767857447266578, "reward_std": 0.1682931227609515, "rewards/accuracy_reward": 0.7767857447266578, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 620.872575378418, "epoch": 2.6652452025586353, "grad_norm": 0.37761253118515015, "kl": 0.049951171875, "learning_rate": 5e-07, "loss": 0.0415, "reward": 0.7785714641213417, "reward_std": 0.19512954521924258, "rewards/accuracy_reward": 0.7785714641213417, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 617.9544944763184, "epoch": 2.750533049040512, "grad_norm": 0.44903331995010376, "kl": 0.0691650390625, "learning_rate": 5e-07, "loss": 0.0422, "reward": 0.7671875387430191, "reward_std": 0.19579849690198897, "rewards/accuracy_reward": 0.7671875387430191, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 616.282169342041, "epoch": 2.835820895522388, "grad_norm": 0.7222861647605896, "kl": 0.10626220703125, "learning_rate": 5e-07, "loss": 0.0487, "reward": 0.7156250298023223, "reward_std": 0.2289330180734396, "rewards/accuracy_reward": 0.7156250298023223, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 649.1158721923828, "epoch": 2.9211087420042645, "grad_norm": 1.717586636543274, "kl": 0.194580078125, "learning_rate": 5e-07, "loss": 0.0679, "reward": 0.614955385774374, "reward_std": 0.2752906741574407, "rewards/accuracy_reward": 0.614955385774374, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 786.9139389038086, "epoch": 3.0170575692963753, "grad_norm": 1.529920220375061, "kl": 0.439892578125, "learning_rate": 5e-07, "loss": 0.1198, "reward": 0.368080372735858, "reward_std": 0.29059169851243494, "rewards/accuracy_reward": 0.368080372735858, "step": 175 }, { "epoch": 3.1023454157782515, "grad_norm": 1.5960689783096313, "learning_rate": 5e-07, "loss": 0.0887, "step": 180 }, { "epoch": 3.1023454157782515, "eval_clip_ratio": 0.0, "eval_completion_length": 791.9263305664062, "eval_kl": 2.4365234375, "eval_loss": 0.06115880608558655, "eval_reward": 0.08565848605940118, "eval_reward_std": 0.13261561130639166, "eval_rewards/accuracy_reward": 0.08565848605940118, "eval_runtime": 821.1595, "eval_samples_per_second": 0.609, "eval_steps_per_second": 0.006, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 818.8542762756348, "epoch": 3.1876332622601278, "grad_norm": 11.959312438964844, "kl": 2.426806640625, "learning_rate": 5e-07, "loss": 0.0522, "reward": 0.10647321877768263, "reward_std": 0.14978813820052891, "rewards/accuracy_reward": 0.10647321877768263, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 679.0966827392579, "epoch": 3.272921108742004, "grad_norm": 19.53175163269043, "kl": 3.7345703125, "learning_rate": 5e-07, "loss": 0.0359, "reward": 0.039732144516892734, "reward_std": 0.07693687449209392, "rewards/accuracy_reward": 0.039732144516892734, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 561.9143058776856, "epoch": 3.3582089552238807, "grad_norm": 8.676216125488281, "kl": 5.15078125, "learning_rate": 5e-07, "loss": 0.0272, "reward": 0.026116072735749184, "reward_std": 0.05382296503521502, "rewards/accuracy_reward": 0.026116072735749184, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 466.9301559448242, "epoch": 3.443496801705757, "grad_norm": 16.412755966186523, "kl": 7.519140625, "learning_rate": 5e-07, "loss": 0.0222, "reward": 0.02700892973225564, "reward_std": 0.058131046639755365, "rewards/accuracy_reward": 0.02700892973225564, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 409.43640365600584, "epoch": 3.5287846481876333, "grad_norm": 10.202103614807129, "kl": 4.88203125, "learning_rate": 5e-07, "loss": 0.0116, "reward": 0.02410714393481612, "reward_std": 0.051837433129549026, "rewards/accuracy_reward": 0.02410714393481612, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 366.6959976196289, "epoch": 3.6140724946695095, "grad_norm": 33.47189712524414, "kl": 6.95859375, "learning_rate": 5e-07, "loss": 0.0144, "reward": 0.026116072852164506, "reward_std": 0.04902788205072284, "rewards/accuracy_reward": 0.026116072852164506, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 356.4863983154297, "epoch": 3.699360341151386, "grad_norm": 7.119285583496094, "kl": 3.1953125, "learning_rate": 5e-07, "loss": 0.0123, "reward": 0.0292410729220137, "reward_std": 0.05716597293503582, "rewards/accuracy_reward": 0.0292410729220137, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 343.4513526916504, "epoch": 3.7846481876332625, "grad_norm": 15.441688537597656, "kl": 4.012109375, "learning_rate": 5e-07, "loss": 0.0177, "reward": 0.028571429941803218, "reward_std": 0.058770314510911706, "rewards/accuracy_reward": 0.028571429941803218, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 310.74108505249023, "epoch": 3.8699360341151388, "grad_norm": 7.061368942260742, "kl": 5.580859375, "learning_rate": 5e-07, "loss": 0.0081, "reward": 0.02857143001165241, "reward_std": 0.06492680269293487, "rewards/accuracy_reward": 0.02857143001165241, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 314.79555015563966, "epoch": 3.955223880597015, "grad_norm": 22.824426651000977, "kl": 8.496484375, "learning_rate": 5e-07, "loss": 0.0108, "reward": 0.033258930104784666, "reward_std": 0.06678469418548047, "rewards/accuracy_reward": 0.033258930104784666, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 311.46373558044434, "epoch": 3.9893390191897655, "kl": 4.9599609375, "reward": 0.042968751688022166, "reward_std": 0.08240398659836501, "rewards/accuracy_reward": 0.042968751688022166, "step": 232, "total_flos": 0.0, "train_loss": 0.0500773029434013, "train_runtime": 52194.0457, "train_samples_per_second": 0.575, "train_steps_per_second": 0.004 } ], "logging_steps": 5, "max_steps": 232, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }