| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 210, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.023809523809523808, | |
| "grad_norm": 23.0, | |
| "learning_rate": 4.20004080527265e-07, | |
| "loss": 0.3575, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.047619047619047616, | |
| "grad_norm": 17.625, | |
| "learning_rate": 9.450091811863462e-07, | |
| "loss": 0.3096, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.07142857142857142, | |
| "grad_norm": 11.9375, | |
| "learning_rate": 1.4700142818454278e-06, | |
| "loss": 0.2658, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.09523809523809523, | |
| "grad_norm": 10.625, | |
| "learning_rate": 1.9950193825045087e-06, | |
| "loss": 0.1987, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.11904761904761904, | |
| "grad_norm": 13.375, | |
| "learning_rate": 2.52002448316359e-06, | |
| "loss": 0.204, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 12.875, | |
| "learning_rate": 3.0450295838226718e-06, | |
| "loss": 0.1673, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.16666666666666666, | |
| "grad_norm": 10.75, | |
| "learning_rate": 3.5700346844817527e-06, | |
| "loss": 0.1856, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.19047619047619047, | |
| "grad_norm": 11.5, | |
| "learning_rate": 3.674728355129646e-06, | |
| "loss": 0.1819, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.21428571428571427, | |
| "grad_norm": 11.4375, | |
| "learning_rate": 3.673479982799582e-06, | |
| "loss": 0.1577, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.23809523809523808, | |
| "grad_norm": 9.0, | |
| "learning_rate": 3.671272247622192e-06, | |
| "loss": 0.1598, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.2619047619047619, | |
| "grad_norm": 9.75, | |
| "learning_rate": 3.668106688204025e-06, | |
| "loss": 0.1579, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 9.625, | |
| "learning_rate": 3.6639855106749177e-06, | |
| "loss": 0.1868, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.30952380952380953, | |
| "grad_norm": 9.5, | |
| "learning_rate": 3.6589115871505097e-06, | |
| "loss": 0.1898, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.3333333333333333, | |
| "grad_norm": 12.6875, | |
| "learning_rate": 3.65288845373062e-06, | |
| "loss": 0.1883, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.35714285714285715, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 3.6459203080348775e-06, | |
| "loss": 0.1548, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.38095238095238093, | |
| "grad_norm": 8.625, | |
| "learning_rate": 3.6380120062773375e-06, | |
| "loss": 0.148, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.40476190476190477, | |
| "grad_norm": 9.375, | |
| "learning_rate": 3.6291690598821e-06, | |
| "loss": 0.1656, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.42857142857142855, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 3.6193976316423157e-06, | |
| "loss": 0.1621, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.4523809523809524, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 3.6087045314252307e-06, | |
| "loss": 0.1627, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.47619047619047616, | |
| "grad_norm": 10.0, | |
| "learning_rate": 3.597097211426281e-06, | |
| "loss": 0.1831, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 11.5625, | |
| "learning_rate": 3.5845837609755354e-06, | |
| "loss": 0.1655, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.5238095238095238, | |
| "grad_norm": 9.9375, | |
| "learning_rate": 3.5711729009001095e-06, | |
| "loss": 0.1668, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.5476190476190477, | |
| "grad_norm": 10.125, | |
| "learning_rate": 3.556873977446483e-06, | |
| "loss": 0.1767, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 8.0, | |
| "learning_rate": 3.541696955766943e-06, | |
| "loss": 0.1581, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.5952380952380952, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 3.5256524129747123e-06, | |
| "loss": 0.1574, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.6190476190476191, | |
| "grad_norm": 11.3125, | |
| "learning_rate": 3.5087515307725825e-06, | |
| "loss": 0.186, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.6428571428571429, | |
| "grad_norm": 9.0, | |
| "learning_rate": 3.4910060876602087e-06, | |
| "loss": 0.1665, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 8.875, | |
| "learning_rate": 3.472428450725478e-06, | |
| "loss": 0.1725, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.6904761904761905, | |
| "grad_norm": 10.125, | |
| "learning_rate": 3.4530315670256834e-06, | |
| "loss": 0.178, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.7142857142857143, | |
| "grad_norm": 9.4375, | |
| "learning_rate": 3.432828954564515e-06, | |
| "loss": 0.1679, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.7380952380952381, | |
| "grad_norm": 9.375, | |
| "learning_rate": 3.411834692871138e-06, | |
| "loss": 0.1819, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.7619047619047619, | |
| "grad_norm": 9.9375, | |
| "learning_rate": 3.390063413187941e-06, | |
| "loss": 0.1764, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.7857142857142857, | |
| "grad_norm": 10.5, | |
| "learning_rate": 3.3675302882737866e-06, | |
| "loss": 0.1655, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.8095238095238095, | |
| "grad_norm": 10.0, | |
| "learning_rate": 3.344251021829867e-06, | |
| "loss": 0.1759, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.8333333333333334, | |
| "grad_norm": 8.375, | |
| "learning_rate": 3.3202418375555435e-06, | |
| "loss": 0.1502, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.8571428571428571, | |
| "grad_norm": 10.6875, | |
| "learning_rate": 3.29551946784178e-06, | |
| "loss": 0.1667, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.8809523809523809, | |
| "grad_norm": 9.375, | |
| "learning_rate": 3.2701011421100723e-06, | |
| "loss": 0.1684, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.9047619047619048, | |
| "grad_norm": 7.875, | |
| "learning_rate": 3.2440045748049846e-06, | |
| "loss": 0.1475, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.9285714285714286, | |
| "grad_norm": 9.4375, | |
| "learning_rate": 3.217247953048661e-06, | |
| "loss": 0.1741, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.9523809523809523, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 3.1898499239659283e-06, | |
| "loss": 0.1498, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.9761904761904762, | |
| "grad_norm": 9.3125, | |
| "learning_rate": 3.1618295816888093e-06, | |
| "loss": 0.1632, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 3.133206454049512e-06, | |
| "loss": 0.1533, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.1544312834739685, | |
| "eval_runtime": 1.7978, | |
| "eval_samples_per_second": 18.912, | |
| "eval_steps_per_second": 9.456, | |
| "step": 210 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 630, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.0275582854792479e+18, | |
| "train_batch_size": 28, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |