{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 46.553125, "epoch": 0.008, "grad_norm": 0.05134107172489166, "kl": 0.012939453125, "learning_rate": 9.95e-07, "loss": 0.0001, "reward": 2.703125, "reward_std": 0.11205126643180847, "rewards/accuracy_reward": 1.7125, "rewards/format_reward": 0.990625, "step": 10 }, { "completion_length": 49.34375, "epoch": 0.016, "grad_norm": 0.06966069340705872, "kl": 0.01898193359375, "learning_rate": 9.9e-07, "loss": 0.0002, "reward": 2.775, "reward_std": 0.05, "rewards/accuracy_reward": 1.78125, "rewards/format_reward": 0.99375, "step": 20 }, { "completion_length": 44.63125, "epoch": 0.024, "grad_norm": 5.11226749420166, "kl": 0.0212158203125, "learning_rate": 9.849999999999999e-07, "loss": 0.0002, "reward": 2.546875, "reward_std": 0.09568375647068024, "rewards/accuracy_reward": 1.55625, "rewards/format_reward": 0.990625, "step": 30 }, { "completion_length": 42.775, "epoch": 0.032, "grad_norm": 0.0820818841457367, "kl": 0.042626953125, "learning_rate": 9.8e-07, "loss": 0.0004, "reward": 2.775, "reward_std": 0.03943375647068024, "rewards/accuracy_reward": 1.775, "rewards/format_reward": 1.0, "step": 40 }, { "completion_length": 44.275, "epoch": 0.04, "grad_norm": 0.06030546873807907, "kl": 0.03828125, "learning_rate": 9.75e-07, "loss": 0.0004, "reward": 2.74375, "reward_std": 0.026933756470680238, "rewards/accuracy_reward": 1.74375, "rewards/format_reward": 1.0, "step": 50 }, { "completion_length": 50.990625, "epoch": 0.048, "grad_norm": 0.10005596280097961, "kl": 0.03060302734375, "learning_rate": 9.7e-07, "loss": 0.0003, "reward": 2.60625, "reward_std": 0.10193375647068023, "rewards/accuracy_reward": 1.60625, "rewards/format_reward": 1.0, "step": 60 }, { "completion_length": 54.375, "epoch": 0.056, "grad_norm": 4.453707695007324, "kl": 0.0556640625, "learning_rate": 9.649999999999999e-07, "loss": 0.0006, "reward": 2.590625, "reward_std": 0.08318375647068024, "rewards/accuracy_reward": 1.59375, "rewards/format_reward": 0.996875, "step": 70 }, { "completion_length": 48.815625, "epoch": 0.064, "grad_norm": 2.5629329681396484, "kl": 0.040283203125, "learning_rate": 9.6e-07, "loss": 0.0004, "reward": 2.765625, "reward_std": 0.058183756470680234, "rewards/accuracy_reward": 1.76875, "rewards/format_reward": 0.996875, "step": 80 }, { "completion_length": 47.53125, "epoch": 0.072, "grad_norm": 0.08292120695114136, "kl": 0.0712646484375, "learning_rate": 9.55e-07, "loss": 0.0007, "reward": 2.825, "reward_std": 0.05, "rewards/accuracy_reward": 1.825, "rewards/format_reward": 1.0, "step": 90 }, { "completion_length": 46.703125, "epoch": 0.08, "grad_norm": 2.7465286254882812, "kl": 0.05367431640625, "learning_rate": 9.499999999999999e-07, "loss": 0.0005, "reward": 2.71875, "reward_std": 0.07693375647068024, "rewards/accuracy_reward": 1.71875, "rewards/format_reward": 1.0, "step": 100 }, { "completion_length": 46.225, "epoch": 0.088, "grad_norm": 2.1839213371276855, "kl": 0.0655517578125, "learning_rate": 9.45e-07, "loss": 0.0007, "reward": 2.609375, "reward_std": 0.03125, "rewards/accuracy_reward": 1.61875, "rewards/format_reward": 0.990625, "step": 110 }, { "completion_length": 44.096875, "epoch": 0.096, "grad_norm": 0.07181887328624725, "kl": 0.06865234375, "learning_rate": 9.399999999999999e-07, "loss": 0.0007, "reward": 2.71875, "reward_std": 0.0125, "rewards/accuracy_reward": 1.71875, "rewards/format_reward": 1.0, "step": 120 }, { "completion_length": 44.590625, "epoch": 0.104, "grad_norm": 0.09902142733335495, "kl": 0.0936767578125, "learning_rate": 9.35e-07, "loss": 0.0009, "reward": 2.56875, "reward_std": 0.0625, "rewards/accuracy_reward": 1.575, "rewards/format_reward": 0.99375, "step": 130 }, { "completion_length": 43.81875, "epoch": 0.112, "grad_norm": 2.340815305709839, "kl": 0.066015625, "learning_rate": 9.3e-07, "loss": 0.0007, "reward": 2.75, "reward_std": 0.025, "rewards/accuracy_reward": 1.75, "rewards/format_reward": 1.0, "step": 140 }, { "completion_length": 49.06875, "epoch": 0.12, "grad_norm": 2.58245849609375, "kl": 0.0600341796875, "learning_rate": 9.25e-07, "loss": 0.0006, "reward": 2.7125, "reward_std": 0.125, "rewards/accuracy_reward": 1.7125, "rewards/format_reward": 1.0, "step": 150 }, { "completion_length": 52.84375, "epoch": 0.128, "grad_norm": 0.06839890778064728, "kl": 0.0785400390625, "learning_rate": 9.2e-07, "loss": 0.0008, "reward": 2.7, "reward_std": 0.03943375647068024, "rewards/accuracy_reward": 1.7, "rewards/format_reward": 1.0, "step": 160 }, { "completion_length": 47.7, "epoch": 0.136, "grad_norm": 0.11428700387477875, "kl": 0.06865234375, "learning_rate": 9.15e-07, "loss": 0.0007, "reward": 2.75, "reward_std": 0.025, "rewards/accuracy_reward": 1.75, "rewards/format_reward": 1.0, "step": 170 }, { "completion_length": 43.478125, "epoch": 0.144, "grad_norm": 2.188392400741577, "kl": 0.062451171875, "learning_rate": 9.1e-07, "loss": 0.0006, "reward": 2.615625, "reward_std": 0.06875, "rewards/accuracy_reward": 1.61875, "rewards/format_reward": 0.996875, "step": 180 }, { "completion_length": 42.540625, "epoch": 0.152, "grad_norm": 3.399991512298584, "kl": 0.076953125, "learning_rate": 9.05e-07, "loss": 0.0008, "reward": 2.64375, "reward_std": 0.09136751294136047, "rewards/accuracy_reward": 1.64375, "rewards/format_reward": 1.0, "step": 190 }, { "completion_length": 50.21875, "epoch": 0.16, "grad_norm": 0.10214658826589584, "kl": 0.09365234375, "learning_rate": 9e-07, "loss": 0.0009, "reward": 2.784375, "reward_std": 0.05625, "rewards/accuracy_reward": 1.7875, "rewards/format_reward": 0.996875, "step": 200 }, { "completion_length": 54.35, "epoch": 0.168, "grad_norm": 0.08639144152402878, "kl": 0.1749267578125, "learning_rate": 8.95e-07, "loss": 0.0017, "reward": 2.7875, "reward_std": 0.014433756470680237, "rewards/accuracy_reward": 1.7875, "rewards/format_reward": 1.0, "step": 210 }, { "completion_length": 56.29375, "epoch": 0.176, "grad_norm": 0.06954076141119003, "kl": 0.119091796875, "learning_rate": 8.9e-07, "loss": 0.0012, "reward": 2.75, "reward_std": 0.025, "rewards/accuracy_reward": 1.75625, "rewards/format_reward": 0.99375, "step": 220 }, { "completion_length": 48.8875, "epoch": 0.184, "grad_norm": 0.06490013003349304, "kl": 0.12080078125, "learning_rate": 8.85e-07, "loss": 0.0012, "reward": 2.65, "reward_std": 0.07886751294136048, "rewards/accuracy_reward": 1.65, "rewards/format_reward": 1.0, "step": 230 }, { "completion_length": 41.115625, "epoch": 0.192, "grad_norm": 0.12679292261600494, "kl": 0.12470703125, "learning_rate": 8.799999999999999e-07, "loss": 0.0012, "reward": 2.8125, "reward_std": 0.07886751294136048, "rewards/accuracy_reward": 1.81875, "rewards/format_reward": 0.99375, "step": 240 }, { "completion_length": 40.4125, "epoch": 0.2, "grad_norm": 0.11438746750354767, "kl": 30.11142578125, "learning_rate": 8.75e-07, "loss": 0.3012, "reward": 2.725, "reward_std": 0.05, "rewards/accuracy_reward": 1.73125, "rewards/format_reward": 0.99375, "step": 250 }, { "completion_length": 46.790625, "epoch": 0.208, "grad_norm": 2.282456159591675, "kl": 0.10205078125, "learning_rate": 8.699999999999999e-07, "loss": 0.001, "reward": 2.578125, "reward_std": 0.06875, "rewards/accuracy_reward": 1.58125, "rewards/format_reward": 0.996875, "step": 260 }, { "completion_length": 52.253125, "epoch": 0.216, "grad_norm": 1.9098315238952637, "kl": 0.105810546875, "learning_rate": 8.65e-07, "loss": 0.0011, "reward": 2.8, "reward_std": 0.07886751294136048, "rewards/accuracy_reward": 1.8, "rewards/format_reward": 1.0, "step": 270 }, { "completion_length": 49.828125, "epoch": 0.224, "grad_norm": 0.058336157351732254, "kl": 0.0714599609375, "learning_rate": 8.599999999999999e-07, "loss": 0.0007, "reward": 2.73125, "reward_std": 0.0375, "rewards/accuracy_reward": 1.73125, "rewards/format_reward": 1.0, "step": 280 }, { "completion_length": 47.14375, "epoch": 0.232, "grad_norm": 0.07711385935544968, "kl": 0.08037109375, "learning_rate": 8.55e-07, "loss": 0.0008, "reward": 2.875, "reward_std": 0.03943375647068024, "rewards/accuracy_reward": 1.875, "rewards/format_reward": 1.0, "step": 290 }, { "completion_length": 46.759375, "epoch": 0.24, "grad_norm": 0.059466563165187836, "kl": 0.079248046875, "learning_rate": 8.499999999999999e-07, "loss": 0.0008, "reward": 2.70625, "reward_std": 0.051933756470680235, "rewards/accuracy_reward": 1.70625, "rewards/format_reward": 1.0, "step": 300 }, { "completion_length": 48.540625, "epoch": 0.248, "grad_norm": 3.2264294624328613, "kl": 0.0768310546875, "learning_rate": 8.45e-07, "loss": 0.0008, "reward": 2.7375, "reward_std": 0.075, "rewards/accuracy_reward": 1.7375, "rewards/format_reward": 1.0, "step": 310 }, { "completion_length": 46.85, "epoch": 0.256, "grad_norm": 0.08373435586690903, "kl": 0.088037109375, "learning_rate": 8.399999999999999e-07, "loss": 0.0009, "reward": 2.728125, "reward_std": 0.08318375647068024, "rewards/accuracy_reward": 1.73125, "rewards/format_reward": 0.996875, "step": 320 }, { "completion_length": 45.0125, "epoch": 0.264, "grad_norm": 0.08248328417539597, "kl": 0.084375, "learning_rate": 8.349999999999999e-07, "loss": 0.0008, "reward": 2.684375, "reward_std": 0.04568375647068024, "rewards/accuracy_reward": 1.6875, "rewards/format_reward": 0.996875, "step": 330 }, { "completion_length": 47.1, "epoch": 0.272, "grad_norm": 0.08357389271259308, "kl": 0.07880859375, "learning_rate": 8.299999999999999e-07, "loss": 0.0008, "reward": 2.628125, "reward_std": 0.03318375647068024, "rewards/accuracy_reward": 1.63125, "rewards/format_reward": 0.996875, "step": 340 }, { "completion_length": 48.95625, "epoch": 0.28, "grad_norm": 1.7901896238327026, "kl": 0.084033203125, "learning_rate": 8.249999999999999e-07, "loss": 0.0008, "reward": 2.609375, "reward_std": 0.03125, "rewards/accuracy_reward": 1.6125, "rewards/format_reward": 0.996875, "step": 350 }, { "completion_length": 46.15, "epoch": 0.288, "grad_norm": 0.07559721171855927, "kl": 0.14404296875, "learning_rate": 8.199999999999999e-07, "loss": 0.0014, "reward": 2.778125, "reward_std": 0.04375, "rewards/accuracy_reward": 1.78125, "rewards/format_reward": 0.996875, "step": 360 }, { "completion_length": 43.8375, "epoch": 0.296, "grad_norm": 3.8727450370788574, "kl": 0.109521484375, "learning_rate": 8.149999999999999e-07, "loss": 0.0011, "reward": 2.83125, "reward_std": 0.0375, "rewards/accuracy_reward": 1.83125, "rewards/format_reward": 1.0, "step": 370 }, { "completion_length": 45.9625, "epoch": 0.304, "grad_norm": 0.05233932286500931, "kl": 0.0930908203125, "learning_rate": 8.1e-07, "loss": 0.0009, "reward": 2.796875, "reward_std": 0.00625, "rewards/accuracy_reward": 1.8, "rewards/format_reward": 0.996875, "step": 380 }, { "completion_length": 49.55, "epoch": 0.312, "grad_norm": 4.457919120788574, "kl": 0.0723876953125, "learning_rate": 8.05e-07, "loss": 0.0007, "reward": 2.75, "reward_std": 0.053867512941360475, "rewards/accuracy_reward": 1.75, "rewards/format_reward": 1.0, "step": 390 }, { "completion_length": 50.909375, "epoch": 0.32, "grad_norm": 0.050397127866744995, "kl": 0.08388671875, "learning_rate": 8e-07, "loss": 0.0008, "reward": 2.7625, "reward_std": 0.025, "rewards/accuracy_reward": 1.7625, "rewards/format_reward": 1.0, "step": 400 }, { "completion_length": 49.165625, "epoch": 0.328, "grad_norm": 0.1388678401708603, "kl": 0.084033203125, "learning_rate": 7.95e-07, "loss": 0.0008, "reward": 2.6875, "reward_std": 0.014433756470680237, "rewards/accuracy_reward": 1.6875, "rewards/format_reward": 1.0, "step": 410 }, { "completion_length": 48.190625, "epoch": 0.336, "grad_norm": 2.034395933151245, "kl": 0.078125, "learning_rate": 7.9e-07, "loss": 0.0008, "reward": 2.76875, "reward_std": 0.0375, "rewards/accuracy_reward": 1.76875, "rewards/format_reward": 1.0, "step": 420 }, { "completion_length": 49.45, "epoch": 0.344, "grad_norm": 2.2621846199035645, "kl": 0.075048828125, "learning_rate": 7.85e-07, "loss": 0.0008, "reward": 2.634375, "reward_std": 0.03125, "rewards/accuracy_reward": 1.6375, "rewards/format_reward": 0.996875, "step": 430 }, { "completion_length": 52.03125, "epoch": 0.352, "grad_norm": 2.9660024642944336, "kl": 0.0776123046875, "learning_rate": 7.799999999999999e-07, "loss": 0.0008, "reward": 2.7625, "reward_std": 0.03943375647068024, "rewards/accuracy_reward": 1.7625, "rewards/format_reward": 1.0, "step": 440 }, { "completion_length": 52.496875, "epoch": 0.36, "grad_norm": 0.040182050317525864, "kl": 0.0726806640625, "learning_rate": 7.75e-07, "loss": 0.0007, "reward": 2.6875, "reward_std": 0.025, "rewards/accuracy_reward": 1.6875, "rewards/format_reward": 1.0, "step": 450 }, { "completion_length": 51.725, "epoch": 0.368, "grad_norm": 0.06841447949409485, "kl": 0.0802001953125, "learning_rate": 7.699999999999999e-07, "loss": 0.0008, "reward": 2.8, "reward_std": 0.0, "rewards/accuracy_reward": 1.8, "rewards/format_reward": 1.0, "step": 460 }, { "completion_length": 48.14375, "epoch": 0.376, "grad_norm": 0.04733005911111832, "kl": 0.0659912109375, "learning_rate": 7.65e-07, "loss": 0.0007, "reward": 2.61875, "reward_std": 0.0125, "rewards/accuracy_reward": 1.61875, "rewards/format_reward": 1.0, "step": 470 }, { "completion_length": 46.89375, "epoch": 0.384, "grad_norm": 2.7484917640686035, "kl": 0.0697998046875, "learning_rate": 7.599999999999999e-07, "loss": 0.0007, "reward": 2.74375, "reward_std": 0.09136751294136047, "rewards/accuracy_reward": 1.74375, "rewards/format_reward": 1.0, "step": 480 }, { "completion_length": 48.48125, "epoch": 0.392, "grad_norm": 1.7968782186508179, "kl": 0.0580078125, "learning_rate": 7.55e-07, "loss": 0.0006, "reward": 2.7125, "reward_std": 0.025, "rewards/accuracy_reward": 1.7125, "rewards/format_reward": 1.0, "step": 490 }, { "completion_length": 50.034375, "epoch": 0.4, "grad_norm": 0.08426347374916077, "kl": 0.077099609375, "learning_rate": 7.5e-07, "loss": 0.0008, "reward": 2.68125, "reward_std": 0.04136751294136047, "rewards/accuracy_reward": 1.68125, "rewards/format_reward": 1.0, "step": 500 }, { "completion_length": 51.378125, "epoch": 0.408, "grad_norm": 0.040815118700265884, "kl": 0.06416015625, "learning_rate": 7.45e-07, "loss": 0.0006, "reward": 2.73125, "reward_std": 0.026933756470680238, "rewards/accuracy_reward": 1.73125, "rewards/format_reward": 1.0, "step": 510 }, { "completion_length": 49.878125, "epoch": 0.416, "grad_norm": 0.06027600169181824, "kl": 0.0675537109375, "learning_rate": 7.4e-07, "loss": 0.0007, "reward": 2.671875, "reward_std": 0.00625, "rewards/accuracy_reward": 1.675, "rewards/format_reward": 0.996875, "step": 520 }, { "completion_length": 47.921875, "epoch": 0.424, "grad_norm": 0.06604389101266861, "kl": 0.07177734375, "learning_rate": 7.35e-07, "loss": 0.0007, "reward": 2.675, "reward_std": 0.08943375647068023, "rewards/accuracy_reward": 1.68125, "rewards/format_reward": 0.99375, "step": 530 }, { "completion_length": 41.890625, "epoch": 0.432, "grad_norm": 2.579275608062744, "kl": 0.080859375, "learning_rate": 7.3e-07, "loss": 0.0008, "reward": 2.75, "reward_std": 0.03943375647068024, "rewards/accuracy_reward": 1.75, "rewards/format_reward": 1.0, "step": 540 }, { "completion_length": 44.046875, "epoch": 0.44, "grad_norm": 0.04179125651717186, "kl": 0.076025390625, "learning_rate": 7.249999999999999e-07, "loss": 0.0008, "reward": 2.5375, "reward_std": 0.025, "rewards/accuracy_reward": 1.5375, "rewards/format_reward": 1.0, "step": 550 }, { "completion_length": 46.725, "epoch": 0.448, "grad_norm": 0.04865502566099167, "kl": 0.075830078125, "learning_rate": 7.2e-07, "loss": 0.0008, "reward": 2.66875, "reward_std": 0.0125, "rewards/accuracy_reward": 1.66875, "rewards/format_reward": 1.0, "step": 560 }, { "completion_length": 48.0875, "epoch": 0.456, "grad_norm": 0.1781499981880188, "kl": 92.47451171875, "learning_rate": 7.149999999999999e-07, "loss": 0.9243, "reward": 2.8, "reward_std": 0.025, "rewards/accuracy_reward": 1.8, "rewards/format_reward": 1.0, "step": 570 }, { "completion_length": 49.703125, "epoch": 0.464, "grad_norm": 0.05255131423473358, "kl": 0.0656982421875, "learning_rate": 7.1e-07, "loss": 0.0007, "reward": 2.6625, "reward_std": 0.025, "rewards/accuracy_reward": 1.6625, "rewards/format_reward": 1.0, "step": 580 }, { "completion_length": 52.06875, "epoch": 0.472, "grad_norm": 0.1266418695449829, "kl": 0.0781005859375, "learning_rate": 7.049999999999999e-07, "loss": 0.0008, "reward": 2.75, "reward_std": 0.025, "rewards/accuracy_reward": 1.75, "rewards/format_reward": 1.0, "step": 590 }, { "completion_length": 53.475, "epoch": 0.48, "grad_norm": 0.07561592757701874, "kl": 0.0699951171875, "learning_rate": 7e-07, "loss": 0.0007, "reward": 2.6875, "reward_std": 0.053867512941360475, "rewards/accuracy_reward": 1.6875, "rewards/format_reward": 1.0, "step": 600 }, { "completion_length": 52.0625, "epoch": 0.488, "grad_norm": 0.04883831739425659, "kl": 0.0799560546875, "learning_rate": 6.949999999999999e-07, "loss": 0.0008, "reward": 2.65625, "reward_std": 0.0125, "rewards/accuracy_reward": 1.65625, "rewards/format_reward": 1.0, "step": 610 }, { "completion_length": 49.54375, "epoch": 0.496, "grad_norm": 2.3243064880371094, "kl": 0.0752685546875, "learning_rate": 6.9e-07, "loss": 0.0008, "reward": 2.815625, "reward_std": 0.058183756470680234, "rewards/accuracy_reward": 1.81875, "rewards/format_reward": 0.996875, "step": 620 }, { "completion_length": 48.690625, "epoch": 0.504, "grad_norm": 0.06750122457742691, "kl": 0.06513671875, "learning_rate": 6.85e-07, "loss": 0.0007, "reward": 2.84375, "reward_std": 0.0375, "rewards/accuracy_reward": 1.84375, "rewards/format_reward": 1.0, "step": 630 }, { "completion_length": 49.271875, "epoch": 0.512, "grad_norm": 0.056099992245435715, "kl": 0.0666259765625, "learning_rate": 6.800000000000001e-07, "loss": 0.0007, "reward": 2.69375, "reward_std": 0.0125, "rewards/accuracy_reward": 1.69375, "rewards/format_reward": 1.0, "step": 640 }, { "completion_length": 46.4375, "epoch": 0.52, "grad_norm": 0.0455087348818779, "kl": 0.0546630859375, "learning_rate": 6.75e-07, "loss": 0.0005, "reward": 2.75625, "reward_std": 0.0125, "rewards/accuracy_reward": 1.75625, "rewards/format_reward": 1.0, "step": 650 }, { "completion_length": 46.496875, "epoch": 0.528, "grad_norm": 0.05418640747666359, "kl": 0.0645263671875, "learning_rate": 6.7e-07, "loss": 0.0006, "reward": 2.6875, "reward_std": 0.014433756470680237, "rewards/accuracy_reward": 1.6875, "rewards/format_reward": 1.0, "step": 660 }, { "completion_length": 46.328125, "epoch": 0.536, "grad_norm": 4.0458455085754395, "kl": 0.081103515625, "learning_rate": 6.65e-07, "loss": 0.0008, "reward": 2.65625, "reward_std": 0.08080126941204072, "rewards/accuracy_reward": 1.65625, "rewards/format_reward": 1.0, "step": 670 }, { "completion_length": 48.91875, "epoch": 0.544, "grad_norm": 0.04970540851354599, "kl": 0.0717041015625, "learning_rate": 6.6e-07, "loss": 0.0007, "reward": 2.7625, "reward_std": 0.04330126941204071, "rewards/accuracy_reward": 1.7625, "rewards/format_reward": 1.0, "step": 680 }, { "completion_length": 49.01875, "epoch": 0.552, "grad_norm": 0.1746923178434372, "kl": 0.073779296875, "learning_rate": 6.55e-07, "loss": 0.0007, "reward": 2.75, "reward_std": 0.025, "rewards/accuracy_reward": 1.75, "rewards/format_reward": 1.0, "step": 690 }, { "completion_length": 48.3125, "epoch": 0.56, "grad_norm": 0.051023293286561966, "kl": 0.06783447265625, "learning_rate": 6.5e-07, "loss": 0.0007, "reward": 2.7625, "reward_std": 0.025, "rewards/accuracy_reward": 1.7625, "rewards/format_reward": 1.0, "step": 700 }, { "completion_length": 49.11875, "epoch": 0.568, "grad_norm": 0.07166194915771484, "kl": 0.0619384765625, "learning_rate": 6.45e-07, "loss": 0.0006, "reward": 2.7375, "reward_std": 0.014433756470680237, "rewards/accuracy_reward": 1.7375, "rewards/format_reward": 1.0, "step": 710 }, { "completion_length": 51.103125, "epoch": 0.576, "grad_norm": 0.08520376682281494, "kl": 0.0830078125, "learning_rate": 6.4e-07, "loss": 0.0008, "reward": 2.7375, "reward_std": 0.014433756470680237, "rewards/accuracy_reward": 1.7375, "rewards/format_reward": 1.0, "step": 720 }, { "completion_length": 49.615625, "epoch": 0.584, "grad_norm": 0.10399647802114487, "kl": 0.0688232421875, "learning_rate": 6.35e-07, "loss": 0.0007, "reward": 2.69375, "reward_std": 0.0125, "rewards/accuracy_reward": 1.69375, "rewards/format_reward": 1.0, "step": 730 }, { "completion_length": 50.596875, "epoch": 0.592, "grad_norm": 0.06369677186012268, "kl": 0.087890625, "learning_rate": 6.3e-07, "loss": 0.0009, "reward": 2.61875, "reward_std": 0.0375, "rewards/accuracy_reward": 1.61875, "rewards/format_reward": 1.0, "step": 740 }, { "completion_length": 50.56875, "epoch": 0.6, "grad_norm": 0.07198835164308548, "kl": 0.10087890625, "learning_rate": 6.249999999999999e-07, "loss": 0.001, "reward": 2.625, "reward_std": 0.06443375647068024, "rewards/accuracy_reward": 1.625, "rewards/format_reward": 1.0, "step": 750 }, { "completion_length": 50.953125, "epoch": 0.608, "grad_norm": 0.04980659857392311, "kl": 0.101806640625, "learning_rate": 6.2e-07, "loss": 0.001, "reward": 2.721875, "reward_std": 0.03125, "rewards/accuracy_reward": 1.725, "rewards/format_reward": 0.996875, "step": 760 }, { "completion_length": 46.609375, "epoch": 0.616, "grad_norm": 2.673631191253662, "kl": 0.0730224609375, "learning_rate": 6.149999999999999e-07, "loss": 0.0007, "reward": 2.6875, "reward_std": 0.03943375647068024, "rewards/accuracy_reward": 1.6875, "rewards/format_reward": 1.0, "step": 770 }, { "completion_length": 46.16875, "epoch": 0.624, "grad_norm": 0.07191024720668793, "kl": 0.07197265625, "learning_rate": 6.1e-07, "loss": 0.0007, "reward": 2.6125, "reward_std": 0.03943375647068024, "rewards/accuracy_reward": 1.6125, "rewards/format_reward": 1.0, "step": 780 }, { "completion_length": 47.346875, "epoch": 0.632, "grad_norm": 0.31487828493118286, "kl": 0.0890625, "learning_rate": 6.049999999999999e-07, "loss": 0.0009, "reward": 2.7, "reward_std": 0.0, "rewards/accuracy_reward": 1.7, "rewards/format_reward": 1.0, "step": 790 }, { "completion_length": 48.5125, "epoch": 0.64, "grad_norm": 0.04281134530901909, "kl": 0.0651611328125, "learning_rate": 6e-07, "loss": 0.0007, "reward": 2.65, "reward_std": 0.0, "rewards/accuracy_reward": 1.65, "rewards/format_reward": 1.0, "step": 800 }, { "completion_length": 47.9625, "epoch": 0.648, "grad_norm": 1.7782899141311646, "kl": 0.0711669921875, "learning_rate": 5.949999999999999e-07, "loss": 0.0007, "reward": 2.634375, "reward_std": 0.04568375647068024, "rewards/accuracy_reward": 1.6375, "rewards/format_reward": 0.996875, "step": 810 }, { "completion_length": 47.7, "epoch": 0.656, "grad_norm": 0.9939271211624146, "kl": 0.07099609375, "learning_rate": 5.9e-07, "loss": 0.0007, "reward": 2.6625, "reward_std": 0.025, "rewards/accuracy_reward": 1.6625, "rewards/format_reward": 1.0, "step": 820 }, { "completion_length": 45.790625, "epoch": 0.664, "grad_norm": 0.05890406668186188, "kl": 0.0596923828125, "learning_rate": 5.849999999999999e-07, "loss": 0.0006, "reward": 2.696875, "reward_std": 0.00625, "rewards/accuracy_reward": 1.7, "rewards/format_reward": 0.996875, "step": 830 }, { "completion_length": 46.675, "epoch": 0.672, "grad_norm": 0.062360286712646484, "kl": 0.071240234375, "learning_rate": 5.8e-07, "loss": 0.0007, "reward": 2.5875, "reward_std": 0.025, "rewards/accuracy_reward": 1.5875, "rewards/format_reward": 1.0, "step": 840 }, { "completion_length": 47.621875, "epoch": 0.68, "grad_norm": 2.2732224464416504, "kl": 1.6703369140625, "learning_rate": 5.749999999999999e-07, "loss": 0.0167, "reward": 2.64375, "reward_std": 0.04136751294136047, "rewards/accuracy_reward": 1.64375, "rewards/format_reward": 1.0, "step": 850 }, { "completion_length": 50.35, "epoch": 0.688, "grad_norm": 2.1026487350463867, "kl": 0.09111328125, "learning_rate": 5.699999999999999e-07, "loss": 0.0009, "reward": 2.6625, "reward_std": 0.053867512941360475, "rewards/accuracy_reward": 1.6625, "rewards/format_reward": 1.0, "step": 860 }, { "completion_length": 52.321875, "epoch": 0.696, "grad_norm": 3.0173561573028564, "kl": 321.521728515625, "learning_rate": 5.649999999999999e-07, "loss": 3.2171, "reward": 2.46875, "reward_std": 0.04136751294136047, "rewards/accuracy_reward": 1.46875, "rewards/format_reward": 1.0, "step": 870 }, { "completion_length": 49.7625, "epoch": 0.704, "grad_norm": 0.06468257308006287, "kl": 0.39306640625, "learning_rate": 5.6e-07, "loss": 0.0039, "reward": 2.7125, "reward_std": 0.06443375647068024, "rewards/accuracy_reward": 1.7125, "rewards/format_reward": 1.0, "step": 880 }, { "completion_length": 48.240625, "epoch": 0.712, "grad_norm": 0.07906866073608398, "kl": 0.12939453125, "learning_rate": 5.55e-07, "loss": 0.0013, "reward": 2.68125, "reward_std": 0.026933756470680238, "rewards/accuracy_reward": 1.68125, "rewards/format_reward": 1.0, "step": 890 }, { "completion_length": 49.03125, "epoch": 0.72, "grad_norm": 0.07313551008701324, "kl": 0.18505859375, "learning_rate": 5.5e-07, "loss": 0.0019, "reward": 2.69375, "reward_std": 0.026933756470680238, "rewards/accuracy_reward": 1.69375, "rewards/format_reward": 1.0, "step": 900 }, { "completion_length": 49.60625, "epoch": 0.728, "grad_norm": 4.0763630867004395, "kl": 2.13671875, "learning_rate": 5.45e-07, "loss": 0.0213, "reward": 2.753125, "reward_std": 0.058183756470680234, "rewards/accuracy_reward": 1.75625, "rewards/format_reward": 0.996875, "step": 910 }, { "completion_length": 49.83125, "epoch": 0.736, "grad_norm": 4.245804786682129, "kl": 0.094384765625, "learning_rate": 5.4e-07, "loss": 0.0009, "reward": 2.75625, "reward_std": 0.04136751294136047, "rewards/accuracy_reward": 1.75625, "rewards/format_reward": 1.0, "step": 920 }, { "completion_length": 50.996875, "epoch": 0.744, "grad_norm": 39.86748504638672, "kl": 3.122021484375, "learning_rate": 5.35e-07, "loss": 0.0313, "reward": 2.8125, "reward_std": 0.03943375647068024, "rewards/accuracy_reward": 1.8125, "rewards/format_reward": 1.0, "step": 930 }, { "completion_length": 49.59375, "epoch": 0.752, "grad_norm": 0.05779128894209862, "kl": 0.0998046875, "learning_rate": 5.3e-07, "loss": 0.001, "reward": 2.79375, "reward_std": 0.051933756470680235, "rewards/accuracy_reward": 1.79375, "rewards/format_reward": 1.0, "step": 940 }, { "completion_length": 47.74375, "epoch": 0.76, "grad_norm": 0.056427907198667526, "kl": 0.445361328125, "learning_rate": 5.25e-07, "loss": 0.0045, "reward": 2.76875, "reward_std": 0.0125, "rewards/accuracy_reward": 1.76875, "rewards/format_reward": 1.0, "step": 950 }, { "completion_length": 48.50625, "epoch": 0.768, "grad_norm": 66.90420532226562, "kl": 2.232958984375, "learning_rate": 5.2e-07, "loss": 0.0223, "reward": 2.80625, "reward_std": 0.0375, "rewards/accuracy_reward": 1.80625, "rewards/format_reward": 1.0, "step": 960 }, { "completion_length": 51.01875, "epoch": 0.776, "grad_norm": 0.09945037215948105, "kl": 0.09228515625, "learning_rate": 5.149999999999999e-07, "loss": 0.0009, "reward": 2.775, "reward_std": 0.0, "rewards/accuracy_reward": 1.775, "rewards/format_reward": 1.0, "step": 970 }, { "completion_length": 50.090625, "epoch": 0.784, "grad_norm": 0.07448896020650864, "kl": 0.15419921875, "learning_rate": 5.1e-07, "loss": 0.0015, "reward": 2.76875, "reward_std": 0.0375, "rewards/accuracy_reward": 1.76875, "rewards/format_reward": 1.0, "step": 980 }, { "completion_length": 49.23125, "epoch": 0.792, "grad_norm": 2.0038902759552, "kl": 0.098828125, "learning_rate": 5.049999999999999e-07, "loss": 0.001, "reward": 2.64375, "reward_std": 0.0125, "rewards/accuracy_reward": 1.64375, "rewards/format_reward": 1.0, "step": 990 }, { "completion_length": 49.790625, "epoch": 0.8, "grad_norm": 0.03871207684278488, "kl": 0.31083984375, "learning_rate": 5e-07, "loss": 0.0031, "reward": 2.7625, "reward_std": 0.025, "rewards/accuracy_reward": 1.7625, "rewards/format_reward": 1.0, "step": 1000 } ], "logging_steps": 10, "max_steps": 2000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }