| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.896, | |
| "eval_steps": 500, | |
| "global_step": 45, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 5.865464631994252, | |
| "learning_rate": 1.0000000000000002e-06, | |
| "loss": 0.8384, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 5.795282414900805, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 0.8522, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 6.004920342985478, | |
| "learning_rate": 3e-06, | |
| "loss": 0.8846, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 5.2374685746239775, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 0.8557, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 4.177667527282572, | |
| "learning_rate": 5e-06, | |
| "loss": 0.8018, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 2.4660509268420956, | |
| "learning_rate": 4.992293334332821e-06, | |
| "loss": 0.8114, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 2.1484055104781894, | |
| "learning_rate": 4.9692208514878445e-06, | |
| "loss": 0.7613, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 4.24941370523086, | |
| "learning_rate": 4.930924800994192e-06, | |
| "loss": 0.7823, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": 4.4742275329617796, | |
| "learning_rate": 4.8776412907378845e-06, | |
| "loss": 0.7947, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 4.584150310828626, | |
| "learning_rate": 4.809698831278217e-06, | |
| "loss": 0.7633, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 4.214193969501364, | |
| "learning_rate": 4.72751631047092e-06, | |
| "loss": 0.7346, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 3.991993764075393, | |
| "learning_rate": 4.631600410885231e-06, | |
| "loss": 0.746, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 3.1675527640947263, | |
| "learning_rate": 4.522542485937369e-06, | |
| "loss": 0.7606, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 2.335842962931273, | |
| "learning_rate": 4.401014914000078e-06, | |
| "loss": 0.7106, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 1.7295030845724642, | |
| "learning_rate": 4.267766952966369e-06, | |
| "loss": 0.73, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 1.032, | |
| "grad_norm": 1.9808122172905522, | |
| "learning_rate": 4.123620120825459e-06, | |
| "loss": 1.0288, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 1.096, | |
| "grad_norm": 1.549926635365851, | |
| "learning_rate": 3.969463130731183e-06, | |
| "loss": 0.8017, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 1.1443613272680522, | |
| "learning_rate": 3.806246411789872e-06, | |
| "loss": 0.5691, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 1.224, | |
| "grad_norm": 1.2766961031761175, | |
| "learning_rate": 3.634976249348867e-06, | |
| "loss": 0.6708, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 1.288, | |
| "grad_norm": 0.9945115738673014, | |
| "learning_rate": 3.4567085809127247e-06, | |
| "loss": 0.5849, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 1.3519999999999999, | |
| "grad_norm": 0.8883087926133657, | |
| "learning_rate": 3.272542485937369e-06, | |
| "loss": 0.6678, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 1.416, | |
| "grad_norm": 0.8678409731064457, | |
| "learning_rate": 3.0836134096397642e-06, | |
| "loss": 0.6192, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 0.8592932615188835, | |
| "learning_rate": 2.8910861626005774e-06, | |
| "loss": 0.6421, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 1.544, | |
| "grad_norm": 0.8360777220132355, | |
| "learning_rate": 2.696147739319613e-06, | |
| "loss": 0.6162, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 1.608, | |
| "grad_norm": 0.7344073667637827, | |
| "learning_rate": 2.5e-06, | |
| "loss": 0.6482, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 1.6720000000000002, | |
| "grad_norm": 0.7979431332300274, | |
| "learning_rate": 2.3038522606803882e-06, | |
| "loss": 0.6589, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 1.736, | |
| "grad_norm": 0.6706917123119454, | |
| "learning_rate": 2.1089138373994226e-06, | |
| "loss": 0.5894, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 0.5516361015903626, | |
| "learning_rate": 1.9163865903602374e-06, | |
| "loss": 0.6103, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 1.8639999999999999, | |
| "grad_norm": 0.585020748716762, | |
| "learning_rate": 1.7274575140626318e-06, | |
| "loss": 0.5794, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 1.928, | |
| "grad_norm": 0.6557857522167144, | |
| "learning_rate": 1.5432914190872757e-06, | |
| "loss": 0.6615, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 1.992, | |
| "grad_norm": 1.0124709491728257, | |
| "learning_rate": 1.3650237506511333e-06, | |
| "loss": 0.8842, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 2.064, | |
| "grad_norm": 0.6368456188165972, | |
| "learning_rate": 1.193753588210128e-06, | |
| "loss": 0.6189, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 2.128, | |
| "grad_norm": 0.5133965301911704, | |
| "learning_rate": 1.0305368692688175e-06, | |
| "loss": 0.5628, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 2.192, | |
| "grad_norm": 0.5303466898294212, | |
| "learning_rate": 8.763798791745413e-07, | |
| "loss": 0.5932, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 2.2560000000000002, | |
| "grad_norm": 0.6189893333744867, | |
| "learning_rate": 7.322330470336314e-07, | |
| "loss": 0.5959, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 0.49962903266405023, | |
| "learning_rate": 5.989850859999227e-07, | |
| "loss": 0.6223, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 2.384, | |
| "grad_norm": 0.4805054080903501, | |
| "learning_rate": 4.774575140626317e-07, | |
| "loss": 0.5717, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 2.448, | |
| "grad_norm": 0.5109754242918464, | |
| "learning_rate": 3.683995891147696e-07, | |
| "loss": 0.5771, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 2.512, | |
| "grad_norm": 0.44837516000440286, | |
| "learning_rate": 2.7248368952908055e-07, | |
| "loss": 0.5931, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 2.576, | |
| "grad_norm": 0.4870172383822943, | |
| "learning_rate": 1.9030116872178317e-07, | |
| "loss": 0.6105, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 0.4254891486184633, | |
| "learning_rate": 1.223587092621162e-07, | |
| "loss": 0.5796, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 2.7039999999999997, | |
| "grad_norm": 0.5097241752627314, | |
| "learning_rate": 6.907519900580862e-08, | |
| "loss": 0.6076, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 2.768, | |
| "grad_norm": 0.4670809129547463, | |
| "learning_rate": 3.077914851215585e-08, | |
| "loss": 0.5931, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 2.832, | |
| "grad_norm": 0.4245230557723331, | |
| "learning_rate": 7.70666566718009e-09, | |
| "loss": 0.554, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 2.896, | |
| "grad_norm": 0.4713555028487441, | |
| "learning_rate": 0.0, | |
| "loss": 0.6079, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 2.896, | |
| "step": 45, | |
| "total_flos": 5.682962857616998e+16, | |
| "train_loss": 0.687734149561988, | |
| "train_runtime": 1863.975, | |
| "train_samples_per_second": 1.603, | |
| "train_steps_per_second": 0.024 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 45, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.682962857616998e+16, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |