{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 280,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03571428571428571,
      "grad_norm": 12.5625,
      "learning_rate": 1.602001754412024e-05,
      "loss": 2.3742,
      "step": 5
    },
    {
      "epoch": 0.07142857142857142,
      "grad_norm": 3.203125,
      "learning_rate": 3.604503947427054e-05,
      "loss": 0.875,
      "step": 10
    },
    {
      "epoch": 0.10714285714285714,
      "grad_norm": 3.28125,
      "learning_rate": 5.607006140442084e-05,
      "loss": 0.5145,
      "step": 15
    },
    {
      "epoch": 0.14285714285714285,
      "grad_norm": 1.484375,
      "learning_rate": 7.609508333457115e-05,
      "loss": 0.3936,
      "step": 20
    },
    {
      "epoch": 0.17857142857142858,
      "grad_norm": 1.484375,
      "learning_rate": 8.409453412069171e-05,
      "loss": 0.3117,
      "step": 25
    },
    {
      "epoch": 0.21428571428571427,
      "grad_norm": 1.078125,
      "learning_rate": 8.403003442637811e-05,
      "loss": 0.2541,
      "step": 30
    },
    {
      "epoch": 0.25,
      "grad_norm": 1.046875,
      "learning_rate": 8.39070004172113e-05,
      "loss": 0.2357,
      "step": 35
    },
    {
      "epoch": 0.2857142857142857,
      "grad_norm": 1.0546875,
      "learning_rate": 8.372562275470936e-05,
      "loss": 0.1999,
      "step": 40
    },
    {
      "epoch": 0.32142857142857145,
      "grad_norm": 0.76171875,
      "learning_rate": 8.348618251351745e-05,
      "loss": 0.1941,
      "step": 45
    },
    {
      "epoch": 0.35714285714285715,
      "grad_norm": 0.74609375,
      "learning_rate": 8.318905074583633e-05,
      "loss": 0.1594,
      "step": 50
    },
    {
      "epoch": 0.39285714285714285,
      "grad_norm": 0.68359375,
      "learning_rate": 8.283468790641561e-05,
      "loss": 0.1508,
      "step": 55
    },
    {
      "epoch": 0.42857142857142855,
      "grad_norm": 0.765625,
      "learning_rate": 8.242364313900313e-05,
      "loss": 0.1409,
      "step": 60
    },
    {
      "epoch": 0.4642857142857143,
      "grad_norm": 0.7109375,
      "learning_rate": 8.195655342535598e-05,
      "loss": 0.1295,
      "step": 65
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.64453125,
      "learning_rate": 8.143414259813207e-05,
      "loss": 0.1175,
      "step": 70
    },
    {
      "epoch": 0.5357142857142857,
      "grad_norm": 0.6875,
      "learning_rate": 8.085722021919186e-05,
      "loss": 0.1099,
      "step": 75
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 0.64453125,
      "learning_rate": 8.022668032504859e-05,
      "loss": 0.1056,
      "step": 80
    },
    {
      "epoch": 0.6071428571428571,
      "grad_norm": 0.5390625,
      "learning_rate": 7.954350004141101e-05,
      "loss": 0.1048,
      "step": 85
    },
    {
      "epoch": 0.6428571428571429,
      "grad_norm": 0.61328125,
      "learning_rate": 7.880873806896572e-05,
      "loss": 0.0955,
      "step": 90
    },
    {
      "epoch": 0.6785714285714286,
      "grad_norm": 0.51171875,
      "learning_rate": 7.802353304274557e-05,
      "loss": 0.0919,
      "step": 95
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 0.50390625,
      "learning_rate": 7.718910176762666e-05,
      "loss": 0.0877,
      "step": 100
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.56640625,
      "learning_rate": 7.630673733268813e-05,
      "loss": 0.0926,
      "step": 105
    },
    {
      "epoch": 0.7857142857142857,
      "grad_norm": 0.51953125,
      "learning_rate": 7.537780710735718e-05,
      "loss": 0.0855,
      "step": 110
    },
    {
      "epoch": 0.8214285714285714,
      "grad_norm": 0.443359375,
      "learning_rate": 7.440375062244419e-05,
      "loss": 0.0783,
      "step": 115
    },
    {
      "epoch": 0.8571428571428571,
      "grad_norm": 0.455078125,
      "learning_rate": 7.338607733935205e-05,
      "loss": 0.0783,
      "step": 120
    },
    {
      "epoch": 0.8928571428571429,
      "grad_norm": 0.50390625,
      "learning_rate": 7.232636431091635e-05,
      "loss": 0.0735,
      "step": 125
    },
    {
      "epoch": 0.9285714285714286,
      "grad_norm": 0.474609375,
      "learning_rate": 7.122625373750156e-05,
      "loss": 0.0675,
      "step": 130
    },
    {
      "epoch": 0.9642857142857143,
      "grad_norm": 0.48828125,
      "learning_rate": 7.008745042214034e-05,
      "loss": 0.0726,
      "step": 135
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.470703125,
      "learning_rate": 6.891171912865967e-05,
      "loss": 0.063,
      "step": 140
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.04865735396742821,
      "eval_runtime": 0.7956,
      "eval_samples_per_second": 12.569,
      "eval_steps_per_second": 12.569,
      "step": 140
    },
    {
      "epoch": 1.0357142857142858,
      "grad_norm": 0.392578125,
      "learning_rate": 6.770088184688781e-05,
      "loss": 0.046,
      "step": 145
    },
    {
      "epoch": 1.0714285714285714,
      "grad_norm": 0.42578125,
      "learning_rate": 6.64568149691801e-05,
      "loss": 0.0467,
      "step": 150
    },
    {
      "epoch": 1.1071428571428572,
      "grad_norm": 0.443359375,
      "learning_rate": 6.518144638263924e-05,
      "loss": 0.0435,
      "step": 155
    },
    {
      "epoch": 1.1428571428571428,
      "grad_norm": 0.345703125,
      "learning_rate": 6.387675248153556e-05,
      "loss": 0.0399,
      "step": 160
    },
    {
      "epoch": 1.1785714285714286,
      "grad_norm": 0.37890625,
      "learning_rate": 6.254475510455786e-05,
      "loss": 0.0411,
      "step": 165
    },
    {
      "epoch": 1.2142857142857142,
      "grad_norm": 0.34765625,
      "learning_rate": 6.118751840164043e-05,
      "loss": 0.0407,
      "step": 170
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.310546875,
      "learning_rate": 5.980714563522164e-05,
      "loss": 0.0355,
      "step": 175
    },
    {
      "epoch": 1.2857142857142856,
      "grad_norm": 0.38671875,
      "learning_rate": 5.84057759208916e-05,
      "loss": 0.0388,
      "step": 180
    },
    {
      "epoch": 1.3214285714285714,
      "grad_norm": 0.333984375,
      "learning_rate": 5.698558091247909e-05,
      "loss": 0.0359,
      "step": 185
    },
    {
      "epoch": 1.3571428571428572,
      "grad_norm": 0.330078125,
      "learning_rate": 5.554876143671537e-05,
      "loss": 0.0364,
      "step": 190
    },
    {
      "epoch": 1.3928571428571428,
      "grad_norm": 0.306640625,
      "learning_rate": 5.409754408268964e-05,
      "loss": 0.0385,
      "step": 195
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 0.39453125,
      "learning_rate": 5.263417775138147e-05,
      "loss": 0.0384,
      "step": 200
    },
    {
      "epoch": 1.4642857142857144,
      "grad_norm": 0.345703125,
      "learning_rate": 5.116093017061732e-05,
      "loss": 0.0379,
      "step": 205
    },
    {
      "epoch": 1.5,
      "grad_norm": 0.36328125,
      "learning_rate": 4.9680084380851704e-05,
      "loss": 0.0393,
      "step": 210
    },
    {
      "epoch": 1.5357142857142856,
      "grad_norm": 0.3203125,
      "learning_rate": 4.819393519721892e-05,
      "loss": 0.0338,
      "step": 215
    },
    {
      "epoch": 1.5714285714285714,
      "grad_norm": 0.349609375,
      "learning_rate": 4.670478565333803e-05,
      "loss": 0.0345,
      "step": 220
    },
    {
      "epoch": 1.6071428571428572,
      "grad_norm": 0.291015625,
      "learning_rate": 4.5214943432381836e-05,
      "loss": 0.0301,
      "step": 225
    },
    {
      "epoch": 1.6428571428571428,
      "grad_norm": 0.341796875,
      "learning_rate": 4.372671729094068e-05,
      "loss": 0.031,
      "step": 230
    },
    {
      "epoch": 1.6785714285714286,
      "grad_norm": 0.3828125,
      "learning_rate": 4.224241348122277e-05,
      "loss": 0.0324,
      "step": 235
    },
    {
      "epoch": 1.7142857142857144,
      "grad_norm": 0.32421875,
      "learning_rate": 4.076433217713548e-05,
      "loss": 0.0315,
      "step": 240
    },
    {
      "epoch": 1.75,
      "grad_norm": 0.361328125,
      "learning_rate": 3.9294763909785973e-05,
      "loss": 0.0318,
      "step": 245
    },
    {
      "epoch": 1.7857142857142856,
      "grad_norm": 0.365234375,
      "learning_rate": 3.7835986017924955e-05,
      "loss": 0.0331,
      "step": 250
    },
    {
      "epoch": 1.8214285714285714,
      "grad_norm": 0.359375,
      "learning_rate": 3.639025911883407e-05,
      "loss": 0.0291,
      "step": 255
    },
    {
      "epoch": 1.8571428571428572,
      "grad_norm": 0.26171875,
      "learning_rate": 3.495982360512619e-05,
      "loss": 0.0307,
      "step": 260
    },
    {
      "epoch": 1.8928571428571428,
      "grad_norm": 0.32421875,
      "learning_rate": 3.354689617288684e-05,
      "loss": 0.0337,
      "step": 265
    },
    {
      "epoch": 1.9285714285714286,
      "grad_norm": 0.369140625,
      "learning_rate": 3.215366638653771e-05,
      "loss": 0.033,
      "step": 270
    },
    {
      "epoch": 1.9642857142857144,
      "grad_norm": 0.302734375,
      "learning_rate": 3.078229328574474e-05,
      "loss": 0.0292,
      "step": 275
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.294921875,
      "learning_rate": 2.9434902039629697e-05,
      "loss": 0.0283,
      "step": 280
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.04047780483961105,
      "eval_runtime": 0.7957,
      "eval_samples_per_second": 12.568,
      "eval_steps_per_second": 12.568,
      "step": 280
    }
  ],
  "logging_steps": 5,
  "max_steps": 420,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.972069268553728e+17,
  "train_batch_size": 6,
  "trial_name": null,
  "trial_params": null
}