{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 210, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.023809523809523808, "grad_norm": 23.0, "learning_rate": 4.20004080527265e-07, "loss": 0.3575, "step": 5 }, { "epoch": 0.047619047619047616, "grad_norm": 17.625, "learning_rate": 9.450091811863462e-07, "loss": 0.3096, "step": 10 }, { "epoch": 0.07142857142857142, "grad_norm": 11.9375, "learning_rate": 1.4700142818454278e-06, "loss": 0.2658, "step": 15 }, { "epoch": 0.09523809523809523, "grad_norm": 10.625, "learning_rate": 1.9950193825045087e-06, "loss": 0.1987, "step": 20 }, { "epoch": 0.11904761904761904, "grad_norm": 13.375, "learning_rate": 2.52002448316359e-06, "loss": 0.204, "step": 25 }, { "epoch": 0.14285714285714285, "grad_norm": 12.875, "learning_rate": 3.0450295838226718e-06, "loss": 0.1673, "step": 30 }, { "epoch": 0.16666666666666666, "grad_norm": 10.75, "learning_rate": 3.5700346844817527e-06, "loss": 0.1856, "step": 35 }, { "epoch": 0.19047619047619047, "grad_norm": 11.5, "learning_rate": 3.674728355129646e-06, "loss": 0.1819, "step": 40 }, { "epoch": 0.21428571428571427, "grad_norm": 11.4375, "learning_rate": 3.673479982799582e-06, "loss": 0.1577, "step": 45 }, { "epoch": 0.23809523809523808, "grad_norm": 9.0, "learning_rate": 3.671272247622192e-06, "loss": 0.1598, "step": 50 }, { "epoch": 0.2619047619047619, "grad_norm": 9.75, "learning_rate": 3.668106688204025e-06, "loss": 0.1579, "step": 55 }, { "epoch": 0.2857142857142857, "grad_norm": 9.625, "learning_rate": 3.6639855106749177e-06, "loss": 0.1868, "step": 60 }, { "epoch": 0.30952380952380953, "grad_norm": 9.5, "learning_rate": 3.6589115871505097e-06, "loss": 0.1898, "step": 65 }, { "epoch": 0.3333333333333333, "grad_norm": 12.6875, "learning_rate": 3.65288845373062e-06, "loss": 0.1883, "step": 70 }, { "epoch": 0.35714285714285715, "grad_norm": 9.5625, "learning_rate": 3.6459203080348775e-06, "loss": 0.1548, "step": 75 }, { "epoch": 0.38095238095238093, "grad_norm": 8.625, "learning_rate": 3.6380120062773375e-06, "loss": 0.148, "step": 80 }, { "epoch": 0.40476190476190477, "grad_norm": 9.375, "learning_rate": 3.6291690598821e-06, "loss": 0.1656, "step": 85 }, { "epoch": 0.42857142857142855, "grad_norm": 8.9375, "learning_rate": 3.6193976316423157e-06, "loss": 0.1621, "step": 90 }, { "epoch": 0.4523809523809524, "grad_norm": 9.5625, "learning_rate": 3.6087045314252307e-06, "loss": 0.1627, "step": 95 }, { "epoch": 0.47619047619047616, "grad_norm": 10.0, "learning_rate": 3.597097211426281e-06, "loss": 0.1831, "step": 100 }, { "epoch": 0.5, "grad_norm": 11.5625, "learning_rate": 3.5845837609755354e-06, "loss": 0.1655, "step": 105 }, { "epoch": 0.5238095238095238, "grad_norm": 9.9375, "learning_rate": 3.5711729009001095e-06, "loss": 0.1668, "step": 110 }, { "epoch": 0.5476190476190477, "grad_norm": 10.125, "learning_rate": 3.556873977446483e-06, "loss": 0.1767, "step": 115 }, { "epoch": 0.5714285714285714, "grad_norm": 8.0, "learning_rate": 3.541696955766943e-06, "loss": 0.1581, "step": 120 }, { "epoch": 0.5952380952380952, "grad_norm": 10.1875, "learning_rate": 3.5256524129747123e-06, "loss": 0.1574, "step": 125 }, { "epoch": 0.6190476190476191, "grad_norm": 11.3125, "learning_rate": 3.5087515307725825e-06, "loss": 0.186, "step": 130 }, { "epoch": 0.6428571428571429, "grad_norm": 9.0, "learning_rate": 3.4910060876602087e-06, "loss": 0.1665, "step": 135 }, { "epoch": 0.6666666666666666, "grad_norm": 8.875, "learning_rate": 3.472428450725478e-06, "loss": 0.1725, "step": 140 }, { "epoch": 0.6904761904761905, "grad_norm": 10.125, "learning_rate": 3.4530315670256834e-06, "loss": 0.178, "step": 145 }, { "epoch": 0.7142857142857143, "grad_norm": 9.4375, "learning_rate": 3.432828954564515e-06, "loss": 0.1679, "step": 150 }, { "epoch": 0.7380952380952381, "grad_norm": 9.375, "learning_rate": 3.411834692871138e-06, "loss": 0.1819, "step": 155 }, { "epoch": 0.7619047619047619, "grad_norm": 9.9375, "learning_rate": 3.390063413187941e-06, "loss": 0.1764, "step": 160 }, { "epoch": 0.7857142857142857, "grad_norm": 10.5, "learning_rate": 3.3675302882737866e-06, "loss": 0.1655, "step": 165 }, { "epoch": 0.8095238095238095, "grad_norm": 10.0, "learning_rate": 3.344251021829867e-06, "loss": 0.1759, "step": 170 }, { "epoch": 0.8333333333333334, "grad_norm": 8.375, "learning_rate": 3.3202418375555435e-06, "loss": 0.1502, "step": 175 }, { "epoch": 0.8571428571428571, "grad_norm": 10.6875, "learning_rate": 3.29551946784178e-06, "loss": 0.1667, "step": 180 }, { "epoch": 0.8809523809523809, "grad_norm": 9.375, "learning_rate": 3.2701011421100723e-06, "loss": 0.1684, "step": 185 }, { "epoch": 0.9047619047619048, "grad_norm": 7.875, "learning_rate": 3.2440045748049846e-06, "loss": 0.1475, "step": 190 }, { "epoch": 0.9285714285714286, "grad_norm": 9.4375, "learning_rate": 3.217247953048661e-06, "loss": 0.1741, "step": 195 }, { "epoch": 0.9523809523809523, "grad_norm": 9.5625, "learning_rate": 3.1898499239659283e-06, "loss": 0.1498, "step": 200 }, { "epoch": 0.9761904761904762, "grad_norm": 9.3125, "learning_rate": 3.1618295816888093e-06, "loss": 0.1632, "step": 205 }, { "epoch": 1.0, "grad_norm": 9.0625, "learning_rate": 3.133206454049512e-06, "loss": 0.1533, "step": 210 }, { "epoch": 1.0, "eval_loss": 0.1544312834739685, "eval_runtime": 1.7978, "eval_samples_per_second": 18.912, "eval_steps_per_second": 9.456, "step": 210 } ], "logging_steps": 5, "max_steps": 630, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0275582854792479e+18, "train_batch_size": 28, "trial_name": null, "trial_params": null }