| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.8, | |
| "eval_steps": 500, | |
| "global_step": 1000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 46.553125, | |
| "epoch": 0.008, | |
| "grad_norm": 0.05134107172489166, | |
| "kl": 0.012939453125, | |
| "learning_rate": 9.95e-07, | |
| "loss": 0.0001, | |
| "reward": 2.703125, | |
| "reward_std": 0.11205126643180847, | |
| "rewards/accuracy_reward": 1.7125, | |
| "rewards/format_reward": 0.990625, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 49.34375, | |
| "epoch": 0.016, | |
| "grad_norm": 0.06966069340705872, | |
| "kl": 0.01898193359375, | |
| "learning_rate": 9.9e-07, | |
| "loss": 0.0002, | |
| "reward": 2.775, | |
| "reward_std": 0.05, | |
| "rewards/accuracy_reward": 1.78125, | |
| "rewards/format_reward": 0.99375, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 44.63125, | |
| "epoch": 0.024, | |
| "grad_norm": 5.11226749420166, | |
| "kl": 0.0212158203125, | |
| "learning_rate": 9.849999999999999e-07, | |
| "loss": 0.0002, | |
| "reward": 2.546875, | |
| "reward_std": 0.09568375647068024, | |
| "rewards/accuracy_reward": 1.55625, | |
| "rewards/format_reward": 0.990625, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 42.775, | |
| "epoch": 0.032, | |
| "grad_norm": 0.0820818841457367, | |
| "kl": 0.042626953125, | |
| "learning_rate": 9.8e-07, | |
| "loss": 0.0004, | |
| "reward": 2.775, | |
| "reward_std": 0.03943375647068024, | |
| "rewards/accuracy_reward": 1.775, | |
| "rewards/format_reward": 1.0, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 44.275, | |
| "epoch": 0.04, | |
| "grad_norm": 0.06030546873807907, | |
| "kl": 0.03828125, | |
| "learning_rate": 9.75e-07, | |
| "loss": 0.0004, | |
| "reward": 2.74375, | |
| "reward_std": 0.026933756470680238, | |
| "rewards/accuracy_reward": 1.74375, | |
| "rewards/format_reward": 1.0, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 50.990625, | |
| "epoch": 0.048, | |
| "grad_norm": 0.10005596280097961, | |
| "kl": 0.03060302734375, | |
| "learning_rate": 9.7e-07, | |
| "loss": 0.0003, | |
| "reward": 2.60625, | |
| "reward_std": 0.10193375647068023, | |
| "rewards/accuracy_reward": 1.60625, | |
| "rewards/format_reward": 1.0, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 54.375, | |
| "epoch": 0.056, | |
| "grad_norm": 4.453707695007324, | |
| "kl": 0.0556640625, | |
| "learning_rate": 9.649999999999999e-07, | |
| "loss": 0.0006, | |
| "reward": 2.590625, | |
| "reward_std": 0.08318375647068024, | |
| "rewards/accuracy_reward": 1.59375, | |
| "rewards/format_reward": 0.996875, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 48.815625, | |
| "epoch": 0.064, | |
| "grad_norm": 2.5629329681396484, | |
| "kl": 0.040283203125, | |
| "learning_rate": 9.6e-07, | |
| "loss": 0.0004, | |
| "reward": 2.765625, | |
| "reward_std": 0.058183756470680234, | |
| "rewards/accuracy_reward": 1.76875, | |
| "rewards/format_reward": 0.996875, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 47.53125, | |
| "epoch": 0.072, | |
| "grad_norm": 0.08292120695114136, | |
| "kl": 0.0712646484375, | |
| "learning_rate": 9.55e-07, | |
| "loss": 0.0007, | |
| "reward": 2.825, | |
| "reward_std": 0.05, | |
| "rewards/accuracy_reward": 1.825, | |
| "rewards/format_reward": 1.0, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 46.703125, | |
| "epoch": 0.08, | |
| "grad_norm": 2.7465286254882812, | |
| "kl": 0.05367431640625, | |
| "learning_rate": 9.499999999999999e-07, | |
| "loss": 0.0005, | |
| "reward": 2.71875, | |
| "reward_std": 0.07693375647068024, | |
| "rewards/accuracy_reward": 1.71875, | |
| "rewards/format_reward": 1.0, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 46.225, | |
| "epoch": 0.088, | |
| "grad_norm": 2.1839213371276855, | |
| "kl": 0.0655517578125, | |
| "learning_rate": 9.45e-07, | |
| "loss": 0.0007, | |
| "reward": 2.609375, | |
| "reward_std": 0.03125, | |
| "rewards/accuracy_reward": 1.61875, | |
| "rewards/format_reward": 0.990625, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 44.096875, | |
| "epoch": 0.096, | |
| "grad_norm": 0.07181887328624725, | |
| "kl": 0.06865234375, | |
| "learning_rate": 9.399999999999999e-07, | |
| "loss": 0.0007, | |
| "reward": 2.71875, | |
| "reward_std": 0.0125, | |
| "rewards/accuracy_reward": 1.71875, | |
| "rewards/format_reward": 1.0, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 44.590625, | |
| "epoch": 0.104, | |
| "grad_norm": 0.09902142733335495, | |
| "kl": 0.0936767578125, | |
| "learning_rate": 9.35e-07, | |
| "loss": 0.0009, | |
| "reward": 2.56875, | |
| "reward_std": 0.0625, | |
| "rewards/accuracy_reward": 1.575, | |
| "rewards/format_reward": 0.99375, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 43.81875, | |
| "epoch": 0.112, | |
| "grad_norm": 2.340815305709839, | |
| "kl": 0.066015625, | |
| "learning_rate": 9.3e-07, | |
| "loss": 0.0007, | |
| "reward": 2.75, | |
| "reward_std": 0.025, | |
| "rewards/accuracy_reward": 1.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 49.06875, | |
| "epoch": 0.12, | |
| "grad_norm": 2.58245849609375, | |
| "kl": 0.0600341796875, | |
| "learning_rate": 9.25e-07, | |
| "loss": 0.0006, | |
| "reward": 2.7125, | |
| "reward_std": 0.125, | |
| "rewards/accuracy_reward": 1.7125, | |
| "rewards/format_reward": 1.0, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 52.84375, | |
| "epoch": 0.128, | |
| "grad_norm": 0.06839890778064728, | |
| "kl": 0.0785400390625, | |
| "learning_rate": 9.2e-07, | |
| "loss": 0.0008, | |
| "reward": 2.7, | |
| "reward_std": 0.03943375647068024, | |
| "rewards/accuracy_reward": 1.7, | |
| "rewards/format_reward": 1.0, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 47.7, | |
| "epoch": 0.136, | |
| "grad_norm": 0.11428700387477875, | |
| "kl": 0.06865234375, | |
| "learning_rate": 9.15e-07, | |
| "loss": 0.0007, | |
| "reward": 2.75, | |
| "reward_std": 0.025, | |
| "rewards/accuracy_reward": 1.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 43.478125, | |
| "epoch": 0.144, | |
| "grad_norm": 2.188392400741577, | |
| "kl": 0.062451171875, | |
| "learning_rate": 9.1e-07, | |
| "loss": 0.0006, | |
| "reward": 2.615625, | |
| "reward_std": 0.06875, | |
| "rewards/accuracy_reward": 1.61875, | |
| "rewards/format_reward": 0.996875, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 42.540625, | |
| "epoch": 0.152, | |
| "grad_norm": 3.399991512298584, | |
| "kl": 0.076953125, | |
| "learning_rate": 9.05e-07, | |
| "loss": 0.0008, | |
| "reward": 2.64375, | |
| "reward_std": 0.09136751294136047, | |
| "rewards/accuracy_reward": 1.64375, | |
| "rewards/format_reward": 1.0, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 50.21875, | |
| "epoch": 0.16, | |
| "grad_norm": 0.10214658826589584, | |
| "kl": 0.09365234375, | |
| "learning_rate": 9e-07, | |
| "loss": 0.0009, | |
| "reward": 2.784375, | |
| "reward_std": 0.05625, | |
| "rewards/accuracy_reward": 1.7875, | |
| "rewards/format_reward": 0.996875, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 54.35, | |
| "epoch": 0.168, | |
| "grad_norm": 0.08639144152402878, | |
| "kl": 0.1749267578125, | |
| "learning_rate": 8.95e-07, | |
| "loss": 0.0017, | |
| "reward": 2.7875, | |
| "reward_std": 0.014433756470680237, | |
| "rewards/accuracy_reward": 1.7875, | |
| "rewards/format_reward": 1.0, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 56.29375, | |
| "epoch": 0.176, | |
| "grad_norm": 0.06954076141119003, | |
| "kl": 0.119091796875, | |
| "learning_rate": 8.9e-07, | |
| "loss": 0.0012, | |
| "reward": 2.75, | |
| "reward_std": 0.025, | |
| "rewards/accuracy_reward": 1.75625, | |
| "rewards/format_reward": 0.99375, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 48.8875, | |
| "epoch": 0.184, | |
| "grad_norm": 0.06490013003349304, | |
| "kl": 0.12080078125, | |
| "learning_rate": 8.85e-07, | |
| "loss": 0.0012, | |
| "reward": 2.65, | |
| "reward_std": 0.07886751294136048, | |
| "rewards/accuracy_reward": 1.65, | |
| "rewards/format_reward": 1.0, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 41.115625, | |
| "epoch": 0.192, | |
| "grad_norm": 0.12679292261600494, | |
| "kl": 0.12470703125, | |
| "learning_rate": 8.799999999999999e-07, | |
| "loss": 0.0012, | |
| "reward": 2.8125, | |
| "reward_std": 0.07886751294136048, | |
| "rewards/accuracy_reward": 1.81875, | |
| "rewards/format_reward": 0.99375, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 40.4125, | |
| "epoch": 0.2, | |
| "grad_norm": 0.11438746750354767, | |
| "kl": 30.11142578125, | |
| "learning_rate": 8.75e-07, | |
| "loss": 0.3012, | |
| "reward": 2.725, | |
| "reward_std": 0.05, | |
| "rewards/accuracy_reward": 1.73125, | |
| "rewards/format_reward": 0.99375, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 46.790625, | |
| "epoch": 0.208, | |
| "grad_norm": 2.282456159591675, | |
| "kl": 0.10205078125, | |
| "learning_rate": 8.699999999999999e-07, | |
| "loss": 0.001, | |
| "reward": 2.578125, | |
| "reward_std": 0.06875, | |
| "rewards/accuracy_reward": 1.58125, | |
| "rewards/format_reward": 0.996875, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 52.253125, | |
| "epoch": 0.216, | |
| "grad_norm": 1.9098315238952637, | |
| "kl": 0.105810546875, | |
| "learning_rate": 8.65e-07, | |
| "loss": 0.0011, | |
| "reward": 2.8, | |
| "reward_std": 0.07886751294136048, | |
| "rewards/accuracy_reward": 1.8, | |
| "rewards/format_reward": 1.0, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 49.828125, | |
| "epoch": 0.224, | |
| "grad_norm": 0.058336157351732254, | |
| "kl": 0.0714599609375, | |
| "learning_rate": 8.599999999999999e-07, | |
| "loss": 0.0007, | |
| "reward": 2.73125, | |
| "reward_std": 0.0375, | |
| "rewards/accuracy_reward": 1.73125, | |
| "rewards/format_reward": 1.0, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 47.14375, | |
| "epoch": 0.232, | |
| "grad_norm": 0.07711385935544968, | |
| "kl": 0.08037109375, | |
| "learning_rate": 8.55e-07, | |
| "loss": 0.0008, | |
| "reward": 2.875, | |
| "reward_std": 0.03943375647068024, | |
| "rewards/accuracy_reward": 1.875, | |
| "rewards/format_reward": 1.0, | |
| "step": 290 | |
| }, | |
| { | |
| "completion_length": 46.759375, | |
| "epoch": 0.24, | |
| "grad_norm": 0.059466563165187836, | |
| "kl": 0.079248046875, | |
| "learning_rate": 8.499999999999999e-07, | |
| "loss": 0.0008, | |
| "reward": 2.70625, | |
| "reward_std": 0.051933756470680235, | |
| "rewards/accuracy_reward": 1.70625, | |
| "rewards/format_reward": 1.0, | |
| "step": 300 | |
| }, | |
| { | |
| "completion_length": 48.540625, | |
| "epoch": 0.248, | |
| "grad_norm": 3.2264294624328613, | |
| "kl": 0.0768310546875, | |
| "learning_rate": 8.45e-07, | |
| "loss": 0.0008, | |
| "reward": 2.7375, | |
| "reward_std": 0.075, | |
| "rewards/accuracy_reward": 1.7375, | |
| "rewards/format_reward": 1.0, | |
| "step": 310 | |
| }, | |
| { | |
| "completion_length": 46.85, | |
| "epoch": 0.256, | |
| "grad_norm": 0.08373435586690903, | |
| "kl": 0.088037109375, | |
| "learning_rate": 8.399999999999999e-07, | |
| "loss": 0.0009, | |
| "reward": 2.728125, | |
| "reward_std": 0.08318375647068024, | |
| "rewards/accuracy_reward": 1.73125, | |
| "rewards/format_reward": 0.996875, | |
| "step": 320 | |
| }, | |
| { | |
| "completion_length": 45.0125, | |
| "epoch": 0.264, | |
| "grad_norm": 0.08248328417539597, | |
| "kl": 0.084375, | |
| "learning_rate": 8.349999999999999e-07, | |
| "loss": 0.0008, | |
| "reward": 2.684375, | |
| "reward_std": 0.04568375647068024, | |
| "rewards/accuracy_reward": 1.6875, | |
| "rewards/format_reward": 0.996875, | |
| "step": 330 | |
| }, | |
| { | |
| "completion_length": 47.1, | |
| "epoch": 0.272, | |
| "grad_norm": 0.08357389271259308, | |
| "kl": 0.07880859375, | |
| "learning_rate": 8.299999999999999e-07, | |
| "loss": 0.0008, | |
| "reward": 2.628125, | |
| "reward_std": 0.03318375647068024, | |
| "rewards/accuracy_reward": 1.63125, | |
| "rewards/format_reward": 0.996875, | |
| "step": 340 | |
| }, | |
| { | |
| "completion_length": 48.95625, | |
| "epoch": 0.28, | |
| "grad_norm": 1.7901896238327026, | |
| "kl": 0.084033203125, | |
| "learning_rate": 8.249999999999999e-07, | |
| "loss": 0.0008, | |
| "reward": 2.609375, | |
| "reward_std": 0.03125, | |
| "rewards/accuracy_reward": 1.6125, | |
| "rewards/format_reward": 0.996875, | |
| "step": 350 | |
| }, | |
| { | |
| "completion_length": 46.15, | |
| "epoch": 0.288, | |
| "grad_norm": 0.07559721171855927, | |
| "kl": 0.14404296875, | |
| "learning_rate": 8.199999999999999e-07, | |
| "loss": 0.0014, | |
| "reward": 2.778125, | |
| "reward_std": 0.04375, | |
| "rewards/accuracy_reward": 1.78125, | |
| "rewards/format_reward": 0.996875, | |
| "step": 360 | |
| }, | |
| { | |
| "completion_length": 43.8375, | |
| "epoch": 0.296, | |
| "grad_norm": 3.8727450370788574, | |
| "kl": 0.109521484375, | |
| "learning_rate": 8.149999999999999e-07, | |
| "loss": 0.0011, | |
| "reward": 2.83125, | |
| "reward_std": 0.0375, | |
| "rewards/accuracy_reward": 1.83125, | |
| "rewards/format_reward": 1.0, | |
| "step": 370 | |
| }, | |
| { | |
| "completion_length": 45.9625, | |
| "epoch": 0.304, | |
| "grad_norm": 0.05233932286500931, | |
| "kl": 0.0930908203125, | |
| "learning_rate": 8.1e-07, | |
| "loss": 0.0009, | |
| "reward": 2.796875, | |
| "reward_std": 0.00625, | |
| "rewards/accuracy_reward": 1.8, | |
| "rewards/format_reward": 0.996875, | |
| "step": 380 | |
| }, | |
| { | |
| "completion_length": 49.55, | |
| "epoch": 0.312, | |
| "grad_norm": 4.457919120788574, | |
| "kl": 0.0723876953125, | |
| "learning_rate": 8.05e-07, | |
| "loss": 0.0007, | |
| "reward": 2.75, | |
| "reward_std": 0.053867512941360475, | |
| "rewards/accuracy_reward": 1.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 390 | |
| }, | |
| { | |
| "completion_length": 50.909375, | |
| "epoch": 0.32, | |
| "grad_norm": 0.050397127866744995, | |
| "kl": 0.08388671875, | |
| "learning_rate": 8e-07, | |
| "loss": 0.0008, | |
| "reward": 2.7625, | |
| "reward_std": 0.025, | |
| "rewards/accuracy_reward": 1.7625, | |
| "rewards/format_reward": 1.0, | |
| "step": 400 | |
| }, | |
| { | |
| "completion_length": 49.165625, | |
| "epoch": 0.328, | |
| "grad_norm": 0.1388678401708603, | |
| "kl": 0.084033203125, | |
| "learning_rate": 7.95e-07, | |
| "loss": 0.0008, | |
| "reward": 2.6875, | |
| "reward_std": 0.014433756470680237, | |
| "rewards/accuracy_reward": 1.6875, | |
| "rewards/format_reward": 1.0, | |
| "step": 410 | |
| }, | |
| { | |
| "completion_length": 48.190625, | |
| "epoch": 0.336, | |
| "grad_norm": 2.034395933151245, | |
| "kl": 0.078125, | |
| "learning_rate": 7.9e-07, | |
| "loss": 0.0008, | |
| "reward": 2.76875, | |
| "reward_std": 0.0375, | |
| "rewards/accuracy_reward": 1.76875, | |
| "rewards/format_reward": 1.0, | |
| "step": 420 | |
| }, | |
| { | |
| "completion_length": 49.45, | |
| "epoch": 0.344, | |
| "grad_norm": 2.2621846199035645, | |
| "kl": 0.075048828125, | |
| "learning_rate": 7.85e-07, | |
| "loss": 0.0008, | |
| "reward": 2.634375, | |
| "reward_std": 0.03125, | |
| "rewards/accuracy_reward": 1.6375, | |
| "rewards/format_reward": 0.996875, | |
| "step": 430 | |
| }, | |
| { | |
| "completion_length": 52.03125, | |
| "epoch": 0.352, | |
| "grad_norm": 2.9660024642944336, | |
| "kl": 0.0776123046875, | |
| "learning_rate": 7.799999999999999e-07, | |
| "loss": 0.0008, | |
| "reward": 2.7625, | |
| "reward_std": 0.03943375647068024, | |
| "rewards/accuracy_reward": 1.7625, | |
| "rewards/format_reward": 1.0, | |
| "step": 440 | |
| }, | |
| { | |
| "completion_length": 52.496875, | |
| "epoch": 0.36, | |
| "grad_norm": 0.040182050317525864, | |
| "kl": 0.0726806640625, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.0007, | |
| "reward": 2.6875, | |
| "reward_std": 0.025, | |
| "rewards/accuracy_reward": 1.6875, | |
| "rewards/format_reward": 1.0, | |
| "step": 450 | |
| }, | |
| { | |
| "completion_length": 51.725, | |
| "epoch": 0.368, | |
| "grad_norm": 0.06841447949409485, | |
| "kl": 0.0802001953125, | |
| "learning_rate": 7.699999999999999e-07, | |
| "loss": 0.0008, | |
| "reward": 2.8, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.8, | |
| "rewards/format_reward": 1.0, | |
| "step": 460 | |
| }, | |
| { | |
| "completion_length": 48.14375, | |
| "epoch": 0.376, | |
| "grad_norm": 0.04733005911111832, | |
| "kl": 0.0659912109375, | |
| "learning_rate": 7.65e-07, | |
| "loss": 0.0007, | |
| "reward": 2.61875, | |
| "reward_std": 0.0125, | |
| "rewards/accuracy_reward": 1.61875, | |
| "rewards/format_reward": 1.0, | |
| "step": 470 | |
| }, | |
| { | |
| "completion_length": 46.89375, | |
| "epoch": 0.384, | |
| "grad_norm": 2.7484917640686035, | |
| "kl": 0.0697998046875, | |
| "learning_rate": 7.599999999999999e-07, | |
| "loss": 0.0007, | |
| "reward": 2.74375, | |
| "reward_std": 0.09136751294136047, | |
| "rewards/accuracy_reward": 1.74375, | |
| "rewards/format_reward": 1.0, | |
| "step": 480 | |
| }, | |
| { | |
| "completion_length": 48.48125, | |
| "epoch": 0.392, | |
| "grad_norm": 1.7968782186508179, | |
| "kl": 0.0580078125, | |
| "learning_rate": 7.55e-07, | |
| "loss": 0.0006, | |
| "reward": 2.7125, | |
| "reward_std": 0.025, | |
| "rewards/accuracy_reward": 1.7125, | |
| "rewards/format_reward": 1.0, | |
| "step": 490 | |
| }, | |
| { | |
| "completion_length": 50.034375, | |
| "epoch": 0.4, | |
| "grad_norm": 0.08426347374916077, | |
| "kl": 0.077099609375, | |
| "learning_rate": 7.5e-07, | |
| "loss": 0.0008, | |
| "reward": 2.68125, | |
| "reward_std": 0.04136751294136047, | |
| "rewards/accuracy_reward": 1.68125, | |
| "rewards/format_reward": 1.0, | |
| "step": 500 | |
| }, | |
| { | |
| "completion_length": 51.378125, | |
| "epoch": 0.408, | |
| "grad_norm": 0.040815118700265884, | |
| "kl": 0.06416015625, | |
| "learning_rate": 7.45e-07, | |
| "loss": 0.0006, | |
| "reward": 2.73125, | |
| "reward_std": 0.026933756470680238, | |
| "rewards/accuracy_reward": 1.73125, | |
| "rewards/format_reward": 1.0, | |
| "step": 510 | |
| }, | |
| { | |
| "completion_length": 49.878125, | |
| "epoch": 0.416, | |
| "grad_norm": 0.06027600169181824, | |
| "kl": 0.0675537109375, | |
| "learning_rate": 7.4e-07, | |
| "loss": 0.0007, | |
| "reward": 2.671875, | |
| "reward_std": 0.00625, | |
| "rewards/accuracy_reward": 1.675, | |
| "rewards/format_reward": 0.996875, | |
| "step": 520 | |
| }, | |
| { | |
| "completion_length": 47.921875, | |
| "epoch": 0.424, | |
| "grad_norm": 0.06604389101266861, | |
| "kl": 0.07177734375, | |
| "learning_rate": 7.35e-07, | |
| "loss": 0.0007, | |
| "reward": 2.675, | |
| "reward_std": 0.08943375647068023, | |
| "rewards/accuracy_reward": 1.68125, | |
| "rewards/format_reward": 0.99375, | |
| "step": 530 | |
| }, | |
| { | |
| "completion_length": 41.890625, | |
| "epoch": 0.432, | |
| "grad_norm": 2.579275608062744, | |
| "kl": 0.080859375, | |
| "learning_rate": 7.3e-07, | |
| "loss": 0.0008, | |
| "reward": 2.75, | |
| "reward_std": 0.03943375647068024, | |
| "rewards/accuracy_reward": 1.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 540 | |
| }, | |
| { | |
| "completion_length": 44.046875, | |
| "epoch": 0.44, | |
| "grad_norm": 0.04179125651717186, | |
| "kl": 0.076025390625, | |
| "learning_rate": 7.249999999999999e-07, | |
| "loss": 0.0008, | |
| "reward": 2.5375, | |
| "reward_std": 0.025, | |
| "rewards/accuracy_reward": 1.5375, | |
| "rewards/format_reward": 1.0, | |
| "step": 550 | |
| }, | |
| { | |
| "completion_length": 46.725, | |
| "epoch": 0.448, | |
| "grad_norm": 0.04865502566099167, | |
| "kl": 0.075830078125, | |
| "learning_rate": 7.2e-07, | |
| "loss": 0.0008, | |
| "reward": 2.66875, | |
| "reward_std": 0.0125, | |
| "rewards/accuracy_reward": 1.66875, | |
| "rewards/format_reward": 1.0, | |
| "step": 560 | |
| }, | |
| { | |
| "completion_length": 48.0875, | |
| "epoch": 0.456, | |
| "grad_norm": 0.1781499981880188, | |
| "kl": 92.47451171875, | |
| "learning_rate": 7.149999999999999e-07, | |
| "loss": 0.9243, | |
| "reward": 2.8, | |
| "reward_std": 0.025, | |
| "rewards/accuracy_reward": 1.8, | |
| "rewards/format_reward": 1.0, | |
| "step": 570 | |
| }, | |
| { | |
| "completion_length": 49.703125, | |
| "epoch": 0.464, | |
| "grad_norm": 0.05255131423473358, | |
| "kl": 0.0656982421875, | |
| "learning_rate": 7.1e-07, | |
| "loss": 0.0007, | |
| "reward": 2.6625, | |
| "reward_std": 0.025, | |
| "rewards/accuracy_reward": 1.6625, | |
| "rewards/format_reward": 1.0, | |
| "step": 580 | |
| }, | |
| { | |
| "completion_length": 52.06875, | |
| "epoch": 0.472, | |
| "grad_norm": 0.1266418695449829, | |
| "kl": 0.0781005859375, | |
| "learning_rate": 7.049999999999999e-07, | |
| "loss": 0.0008, | |
| "reward": 2.75, | |
| "reward_std": 0.025, | |
| "rewards/accuracy_reward": 1.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 590 | |
| }, | |
| { | |
| "completion_length": 53.475, | |
| "epoch": 0.48, | |
| "grad_norm": 0.07561592757701874, | |
| "kl": 0.0699951171875, | |
| "learning_rate": 7e-07, | |
| "loss": 0.0007, | |
| "reward": 2.6875, | |
| "reward_std": 0.053867512941360475, | |
| "rewards/accuracy_reward": 1.6875, | |
| "rewards/format_reward": 1.0, | |
| "step": 600 | |
| }, | |
| { | |
| "completion_length": 52.0625, | |
| "epoch": 0.488, | |
| "grad_norm": 0.04883831739425659, | |
| "kl": 0.0799560546875, | |
| "learning_rate": 6.949999999999999e-07, | |
| "loss": 0.0008, | |
| "reward": 2.65625, | |
| "reward_std": 0.0125, | |
| "rewards/accuracy_reward": 1.65625, | |
| "rewards/format_reward": 1.0, | |
| "step": 610 | |
| }, | |
| { | |
| "completion_length": 49.54375, | |
| "epoch": 0.496, | |
| "grad_norm": 2.3243064880371094, | |
| "kl": 0.0752685546875, | |
| "learning_rate": 6.9e-07, | |
| "loss": 0.0008, | |
| "reward": 2.815625, | |
| "reward_std": 0.058183756470680234, | |
| "rewards/accuracy_reward": 1.81875, | |
| "rewards/format_reward": 0.996875, | |
| "step": 620 | |
| }, | |
| { | |
| "completion_length": 48.690625, | |
| "epoch": 0.504, | |
| "grad_norm": 0.06750122457742691, | |
| "kl": 0.06513671875, | |
| "learning_rate": 6.85e-07, | |
| "loss": 0.0007, | |
| "reward": 2.84375, | |
| "reward_std": 0.0375, | |
| "rewards/accuracy_reward": 1.84375, | |
| "rewards/format_reward": 1.0, | |
| "step": 630 | |
| }, | |
| { | |
| "completion_length": 49.271875, | |
| "epoch": 0.512, | |
| "grad_norm": 0.056099992245435715, | |
| "kl": 0.0666259765625, | |
| "learning_rate": 6.800000000000001e-07, | |
| "loss": 0.0007, | |
| "reward": 2.69375, | |
| "reward_std": 0.0125, | |
| "rewards/accuracy_reward": 1.69375, | |
| "rewards/format_reward": 1.0, | |
| "step": 640 | |
| }, | |
| { | |
| "completion_length": 46.4375, | |
| "epoch": 0.52, | |
| "grad_norm": 0.0455087348818779, | |
| "kl": 0.0546630859375, | |
| "learning_rate": 6.75e-07, | |
| "loss": 0.0005, | |
| "reward": 2.75625, | |
| "reward_std": 0.0125, | |
| "rewards/accuracy_reward": 1.75625, | |
| "rewards/format_reward": 1.0, | |
| "step": 650 | |
| }, | |
| { | |
| "completion_length": 46.496875, | |
| "epoch": 0.528, | |
| "grad_norm": 0.05418640747666359, | |
| "kl": 0.0645263671875, | |
| "learning_rate": 6.7e-07, | |
| "loss": 0.0006, | |
| "reward": 2.6875, | |
| "reward_std": 0.014433756470680237, | |
| "rewards/accuracy_reward": 1.6875, | |
| "rewards/format_reward": 1.0, | |
| "step": 660 | |
| }, | |
| { | |
| "completion_length": 46.328125, | |
| "epoch": 0.536, | |
| "grad_norm": 4.0458455085754395, | |
| "kl": 0.081103515625, | |
| "learning_rate": 6.65e-07, | |
| "loss": 0.0008, | |
| "reward": 2.65625, | |
| "reward_std": 0.08080126941204072, | |
| "rewards/accuracy_reward": 1.65625, | |
| "rewards/format_reward": 1.0, | |
| "step": 670 | |
| }, | |
| { | |
| "completion_length": 48.91875, | |
| "epoch": 0.544, | |
| "grad_norm": 0.04970540851354599, | |
| "kl": 0.0717041015625, | |
| "learning_rate": 6.6e-07, | |
| "loss": 0.0007, | |
| "reward": 2.7625, | |
| "reward_std": 0.04330126941204071, | |
| "rewards/accuracy_reward": 1.7625, | |
| "rewards/format_reward": 1.0, | |
| "step": 680 | |
| }, | |
| { | |
| "completion_length": 49.01875, | |
| "epoch": 0.552, | |
| "grad_norm": 0.1746923178434372, | |
| "kl": 0.073779296875, | |
| "learning_rate": 6.55e-07, | |
| "loss": 0.0007, | |
| "reward": 2.75, | |
| "reward_std": 0.025, | |
| "rewards/accuracy_reward": 1.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 690 | |
| }, | |
| { | |
| "completion_length": 48.3125, | |
| "epoch": 0.56, | |
| "grad_norm": 0.051023293286561966, | |
| "kl": 0.06783447265625, | |
| "learning_rate": 6.5e-07, | |
| "loss": 0.0007, | |
| "reward": 2.7625, | |
| "reward_std": 0.025, | |
| "rewards/accuracy_reward": 1.7625, | |
| "rewards/format_reward": 1.0, | |
| "step": 700 | |
| }, | |
| { | |
| "completion_length": 49.11875, | |
| "epoch": 0.568, | |
| "grad_norm": 0.07166194915771484, | |
| "kl": 0.0619384765625, | |
| "learning_rate": 6.45e-07, | |
| "loss": 0.0006, | |
| "reward": 2.7375, | |
| "reward_std": 0.014433756470680237, | |
| "rewards/accuracy_reward": 1.7375, | |
| "rewards/format_reward": 1.0, | |
| "step": 710 | |
| }, | |
| { | |
| "completion_length": 51.103125, | |
| "epoch": 0.576, | |
| "grad_norm": 0.08520376682281494, | |
| "kl": 0.0830078125, | |
| "learning_rate": 6.4e-07, | |
| "loss": 0.0008, | |
| "reward": 2.7375, | |
| "reward_std": 0.014433756470680237, | |
| "rewards/accuracy_reward": 1.7375, | |
| "rewards/format_reward": 1.0, | |
| "step": 720 | |
| }, | |
| { | |
| "completion_length": 49.615625, | |
| "epoch": 0.584, | |
| "grad_norm": 0.10399647802114487, | |
| "kl": 0.0688232421875, | |
| "learning_rate": 6.35e-07, | |
| "loss": 0.0007, | |
| "reward": 2.69375, | |
| "reward_std": 0.0125, | |
| "rewards/accuracy_reward": 1.69375, | |
| "rewards/format_reward": 1.0, | |
| "step": 730 | |
| }, | |
| { | |
| "completion_length": 50.596875, | |
| "epoch": 0.592, | |
| "grad_norm": 0.06369677186012268, | |
| "kl": 0.087890625, | |
| "learning_rate": 6.3e-07, | |
| "loss": 0.0009, | |
| "reward": 2.61875, | |
| "reward_std": 0.0375, | |
| "rewards/accuracy_reward": 1.61875, | |
| "rewards/format_reward": 1.0, | |
| "step": 740 | |
| }, | |
| { | |
| "completion_length": 50.56875, | |
| "epoch": 0.6, | |
| "grad_norm": 0.07198835164308548, | |
| "kl": 0.10087890625, | |
| "learning_rate": 6.249999999999999e-07, | |
| "loss": 0.001, | |
| "reward": 2.625, | |
| "reward_std": 0.06443375647068024, | |
| "rewards/accuracy_reward": 1.625, | |
| "rewards/format_reward": 1.0, | |
| "step": 750 | |
| }, | |
| { | |
| "completion_length": 50.953125, | |
| "epoch": 0.608, | |
| "grad_norm": 0.04980659857392311, | |
| "kl": 0.101806640625, | |
| "learning_rate": 6.2e-07, | |
| "loss": 0.001, | |
| "reward": 2.721875, | |
| "reward_std": 0.03125, | |
| "rewards/accuracy_reward": 1.725, | |
| "rewards/format_reward": 0.996875, | |
| "step": 760 | |
| }, | |
| { | |
| "completion_length": 46.609375, | |
| "epoch": 0.616, | |
| "grad_norm": 2.673631191253662, | |
| "kl": 0.0730224609375, | |
| "learning_rate": 6.149999999999999e-07, | |
| "loss": 0.0007, | |
| "reward": 2.6875, | |
| "reward_std": 0.03943375647068024, | |
| "rewards/accuracy_reward": 1.6875, | |
| "rewards/format_reward": 1.0, | |
| "step": 770 | |
| }, | |
| { | |
| "completion_length": 46.16875, | |
| "epoch": 0.624, | |
| "grad_norm": 0.07191024720668793, | |
| "kl": 0.07197265625, | |
| "learning_rate": 6.1e-07, | |
| "loss": 0.0007, | |
| "reward": 2.6125, | |
| "reward_std": 0.03943375647068024, | |
| "rewards/accuracy_reward": 1.6125, | |
| "rewards/format_reward": 1.0, | |
| "step": 780 | |
| }, | |
| { | |
| "completion_length": 47.346875, | |
| "epoch": 0.632, | |
| "grad_norm": 0.31487828493118286, | |
| "kl": 0.0890625, | |
| "learning_rate": 6.049999999999999e-07, | |
| "loss": 0.0009, | |
| "reward": 2.7, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.7, | |
| "rewards/format_reward": 1.0, | |
| "step": 790 | |
| }, | |
| { | |
| "completion_length": 48.5125, | |
| "epoch": 0.64, | |
| "grad_norm": 0.04281134530901909, | |
| "kl": 0.0651611328125, | |
| "learning_rate": 6e-07, | |
| "loss": 0.0007, | |
| "reward": 2.65, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.65, | |
| "rewards/format_reward": 1.0, | |
| "step": 800 | |
| }, | |
| { | |
| "completion_length": 47.9625, | |
| "epoch": 0.648, | |
| "grad_norm": 1.7782899141311646, | |
| "kl": 0.0711669921875, | |
| "learning_rate": 5.949999999999999e-07, | |
| "loss": 0.0007, | |
| "reward": 2.634375, | |
| "reward_std": 0.04568375647068024, | |
| "rewards/accuracy_reward": 1.6375, | |
| "rewards/format_reward": 0.996875, | |
| "step": 810 | |
| }, | |
| { | |
| "completion_length": 47.7, | |
| "epoch": 0.656, | |
| "grad_norm": 0.9939271211624146, | |
| "kl": 0.07099609375, | |
| "learning_rate": 5.9e-07, | |
| "loss": 0.0007, | |
| "reward": 2.6625, | |
| "reward_std": 0.025, | |
| "rewards/accuracy_reward": 1.6625, | |
| "rewards/format_reward": 1.0, | |
| "step": 820 | |
| }, | |
| { | |
| "completion_length": 45.790625, | |
| "epoch": 0.664, | |
| "grad_norm": 0.05890406668186188, | |
| "kl": 0.0596923828125, | |
| "learning_rate": 5.849999999999999e-07, | |
| "loss": 0.0006, | |
| "reward": 2.696875, | |
| "reward_std": 0.00625, | |
| "rewards/accuracy_reward": 1.7, | |
| "rewards/format_reward": 0.996875, | |
| "step": 830 | |
| }, | |
| { | |
| "completion_length": 46.675, | |
| "epoch": 0.672, | |
| "grad_norm": 0.062360286712646484, | |
| "kl": 0.071240234375, | |
| "learning_rate": 5.8e-07, | |
| "loss": 0.0007, | |
| "reward": 2.5875, | |
| "reward_std": 0.025, | |
| "rewards/accuracy_reward": 1.5875, | |
| "rewards/format_reward": 1.0, | |
| "step": 840 | |
| }, | |
| { | |
| "completion_length": 47.621875, | |
| "epoch": 0.68, | |
| "grad_norm": 2.2732224464416504, | |
| "kl": 1.6703369140625, | |
| "learning_rate": 5.749999999999999e-07, | |
| "loss": 0.0167, | |
| "reward": 2.64375, | |
| "reward_std": 0.04136751294136047, | |
| "rewards/accuracy_reward": 1.64375, | |
| "rewards/format_reward": 1.0, | |
| "step": 850 | |
| }, | |
| { | |
| "completion_length": 50.35, | |
| "epoch": 0.688, | |
| "grad_norm": 2.1026487350463867, | |
| "kl": 0.09111328125, | |
| "learning_rate": 5.699999999999999e-07, | |
| "loss": 0.0009, | |
| "reward": 2.6625, | |
| "reward_std": 0.053867512941360475, | |
| "rewards/accuracy_reward": 1.6625, | |
| "rewards/format_reward": 1.0, | |
| "step": 860 | |
| }, | |
| { | |
| "completion_length": 52.321875, | |
| "epoch": 0.696, | |
| "grad_norm": 3.0173561573028564, | |
| "kl": 321.521728515625, | |
| "learning_rate": 5.649999999999999e-07, | |
| "loss": 3.2171, | |
| "reward": 2.46875, | |
| "reward_std": 0.04136751294136047, | |
| "rewards/accuracy_reward": 1.46875, | |
| "rewards/format_reward": 1.0, | |
| "step": 870 | |
| }, | |
| { | |
| "completion_length": 49.7625, | |
| "epoch": 0.704, | |
| "grad_norm": 0.06468257308006287, | |
| "kl": 0.39306640625, | |
| "learning_rate": 5.6e-07, | |
| "loss": 0.0039, | |
| "reward": 2.7125, | |
| "reward_std": 0.06443375647068024, | |
| "rewards/accuracy_reward": 1.7125, | |
| "rewards/format_reward": 1.0, | |
| "step": 880 | |
| }, | |
| { | |
| "completion_length": 48.240625, | |
| "epoch": 0.712, | |
| "grad_norm": 0.07906866073608398, | |
| "kl": 0.12939453125, | |
| "learning_rate": 5.55e-07, | |
| "loss": 0.0013, | |
| "reward": 2.68125, | |
| "reward_std": 0.026933756470680238, | |
| "rewards/accuracy_reward": 1.68125, | |
| "rewards/format_reward": 1.0, | |
| "step": 890 | |
| }, | |
| { | |
| "completion_length": 49.03125, | |
| "epoch": 0.72, | |
| "grad_norm": 0.07313551008701324, | |
| "kl": 0.18505859375, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.0019, | |
| "reward": 2.69375, | |
| "reward_std": 0.026933756470680238, | |
| "rewards/accuracy_reward": 1.69375, | |
| "rewards/format_reward": 1.0, | |
| "step": 900 | |
| }, | |
| { | |
| "completion_length": 49.60625, | |
| "epoch": 0.728, | |
| "grad_norm": 4.0763630867004395, | |
| "kl": 2.13671875, | |
| "learning_rate": 5.45e-07, | |
| "loss": 0.0213, | |
| "reward": 2.753125, | |
| "reward_std": 0.058183756470680234, | |
| "rewards/accuracy_reward": 1.75625, | |
| "rewards/format_reward": 0.996875, | |
| "step": 910 | |
| }, | |
| { | |
| "completion_length": 49.83125, | |
| "epoch": 0.736, | |
| "grad_norm": 4.245804786682129, | |
| "kl": 0.094384765625, | |
| "learning_rate": 5.4e-07, | |
| "loss": 0.0009, | |
| "reward": 2.75625, | |
| "reward_std": 0.04136751294136047, | |
| "rewards/accuracy_reward": 1.75625, | |
| "rewards/format_reward": 1.0, | |
| "step": 920 | |
| }, | |
| { | |
| "completion_length": 50.996875, | |
| "epoch": 0.744, | |
| "grad_norm": 39.86748504638672, | |
| "kl": 3.122021484375, | |
| "learning_rate": 5.35e-07, | |
| "loss": 0.0313, | |
| "reward": 2.8125, | |
| "reward_std": 0.03943375647068024, | |
| "rewards/accuracy_reward": 1.8125, | |
| "rewards/format_reward": 1.0, | |
| "step": 930 | |
| }, | |
| { | |
| "completion_length": 49.59375, | |
| "epoch": 0.752, | |
| "grad_norm": 0.05779128894209862, | |
| "kl": 0.0998046875, | |
| "learning_rate": 5.3e-07, | |
| "loss": 0.001, | |
| "reward": 2.79375, | |
| "reward_std": 0.051933756470680235, | |
| "rewards/accuracy_reward": 1.79375, | |
| "rewards/format_reward": 1.0, | |
| "step": 940 | |
| }, | |
| { | |
| "completion_length": 47.74375, | |
| "epoch": 0.76, | |
| "grad_norm": 0.056427907198667526, | |
| "kl": 0.445361328125, | |
| "learning_rate": 5.25e-07, | |
| "loss": 0.0045, | |
| "reward": 2.76875, | |
| "reward_std": 0.0125, | |
| "rewards/accuracy_reward": 1.76875, | |
| "rewards/format_reward": 1.0, | |
| "step": 950 | |
| }, | |
| { | |
| "completion_length": 48.50625, | |
| "epoch": 0.768, | |
| "grad_norm": 66.90420532226562, | |
| "kl": 2.232958984375, | |
| "learning_rate": 5.2e-07, | |
| "loss": 0.0223, | |
| "reward": 2.80625, | |
| "reward_std": 0.0375, | |
| "rewards/accuracy_reward": 1.80625, | |
| "rewards/format_reward": 1.0, | |
| "step": 960 | |
| }, | |
| { | |
| "completion_length": 51.01875, | |
| "epoch": 0.776, | |
| "grad_norm": 0.09945037215948105, | |
| "kl": 0.09228515625, | |
| "learning_rate": 5.149999999999999e-07, | |
| "loss": 0.0009, | |
| "reward": 2.775, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.775, | |
| "rewards/format_reward": 1.0, | |
| "step": 970 | |
| }, | |
| { | |
| "completion_length": 50.090625, | |
| "epoch": 0.784, | |
| "grad_norm": 0.07448896020650864, | |
| "kl": 0.15419921875, | |
| "learning_rate": 5.1e-07, | |
| "loss": 0.0015, | |
| "reward": 2.76875, | |
| "reward_std": 0.0375, | |
| "rewards/accuracy_reward": 1.76875, | |
| "rewards/format_reward": 1.0, | |
| "step": 980 | |
| }, | |
| { | |
| "completion_length": 49.23125, | |
| "epoch": 0.792, | |
| "grad_norm": 2.0038902759552, | |
| "kl": 0.098828125, | |
| "learning_rate": 5.049999999999999e-07, | |
| "loss": 0.001, | |
| "reward": 2.64375, | |
| "reward_std": 0.0125, | |
| "rewards/accuracy_reward": 1.64375, | |
| "rewards/format_reward": 1.0, | |
| "step": 990 | |
| }, | |
| { | |
| "completion_length": 49.790625, | |
| "epoch": 0.8, | |
| "grad_norm": 0.03871207684278488, | |
| "kl": 0.31083984375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0031, | |
| "reward": 2.7625, | |
| "reward_std": 0.025, | |
| "rewards/accuracy_reward": 1.7625, | |
| "rewards/format_reward": 1.0, | |
| "step": 1000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 2000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |