| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.6736842105263158, |
| "eval_steps": 500, |
| "global_step": 6000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.011228070175438596, |
| "grad_norm": 0.60546875, |
| "learning_rate": 0.00024749999999999994, |
| "loss": 8.5199, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.02245614035087719, |
| "grad_norm": 0.55859375, |
| "learning_rate": 0.0002998664031981949, |
| "loss": 6.415, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.03368421052631579, |
| "grad_norm": 0.48046875, |
| "learning_rate": 0.0002993145411731054, |
| "loss": 5.8846, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.04491228070175438, |
| "grad_norm": 0.5, |
| "learning_rate": 0.00029833654740795074, |
| "loss": 5.5615, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.056140350877192984, |
| "grad_norm": 0.63671875, |
| "learning_rate": 0.00029693521301859697, |
| "loss": 5.3436, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.056140350877192984, |
| "eval_loss": 5.2718000411987305, |
| "eval_runtime": 296.6131, |
| "eval_samples_per_second": 50.571, |
| "eval_steps_per_second": 6.321, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.06736842105263158, |
| "grad_norm": 0.484375, |
| "learning_rate": 0.00029511453730114126, |
| "loss": 5.1964, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.07859649122807018, |
| "grad_norm": 0.45703125, |
| "learning_rate": 0.0002928797163182408, |
| "loss": 5.0532, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.08982456140350877, |
| "grad_norm": 0.3984375, |
| "learning_rate": 0.00029023712806996646, |
| "loss": 4.9288, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.10105263157894737, |
| "grad_norm": 0.408203125, |
| "learning_rate": 0.0002871943142915013, |
| "loss": 4.8118, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.11228070175438597, |
| "grad_norm": 0.609375, |
| "learning_rate": 0.0002837599589296326, |
| "loss": 4.6787, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.11228070175438597, |
| "eval_loss": 4.601714134216309, |
| "eval_runtime": 296.6412, |
| "eval_samples_per_second": 50.566, |
| "eval_steps_per_second": 6.321, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.12350877192982457, |
| "grad_norm": 0.49609375, |
| "learning_rate": 0.00027994386335946324, |
| "loss": 4.5393, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.13473684210526315, |
| "grad_norm": 0.4375, |
| "learning_rate": 0.0002757569184120724, |
| "loss": 4.3818, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.14596491228070174, |
| "grad_norm": 0.431640625, |
| "learning_rate": 0.00027121107329295584, |
| "loss": 4.2778, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.15719298245614036, |
| "grad_norm": 0.380859375, |
| "learning_rate": 0.0002663193014799507, |
| "loss": 4.2072, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.16842105263157894, |
| "grad_norm": 0.462890625, |
| "learning_rate": 0.000261095563697969, |
| "loss": 4.1242, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.16842105263157894, |
| "eval_loss": 4.072988510131836, |
| "eval_runtime": 296.6152, |
| "eval_samples_per_second": 50.571, |
| "eval_steps_per_second": 6.321, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.17964912280701753, |
| "grad_norm": 0.40625, |
| "learning_rate": 0.0002555547680762069, |
| "loss": 4.0549, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.19087719298245615, |
| "grad_norm": 0.47265625, |
| "learning_rate": 0.00024971272760153834, |
| "loss": 4.0018, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.20210526315789473, |
| "grad_norm": 0.4453125, |
| "learning_rate": 0.00024358611498951694, |
| "loss": 3.9404, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.21333333333333335, |
| "grad_norm": 0.40234375, |
| "learning_rate": 0.0002371924151017814, |
| "loss": 3.9074, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.22456140350877193, |
| "grad_norm": 0.39453125, |
| "learning_rate": 0.00023054987504566113, |
| "loss": 3.8638, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.22456140350877193, |
| "eval_loss": 3.8390953540802, |
| "eval_runtime": 296.6249, |
| "eval_samples_per_second": 50.569, |
| "eval_steps_per_second": 6.321, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.23578947368421052, |
| "grad_norm": 0.40625, |
| "learning_rate": 0.0002236774520983933, |
| "loss": 3.8229, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.24701754385964914, |
| "grad_norm": 0.40625, |
| "learning_rate": 0.0002165947596045723, |
| "loss": 3.7931, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.2582456140350877, |
| "grad_norm": 0.408203125, |
| "learning_rate": 0.0002093220110012354, |
| "loss": 3.753, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.2694736842105263, |
| "grad_norm": 0.40234375, |
| "learning_rate": 0.00020187996213033227, |
| "loss": 3.6916, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.2807017543859649, |
| "grad_norm": 0.421875, |
| "learning_rate": 0.0001942898520032151, |
| "loss": 3.7182, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.2807017543859649, |
| "eval_loss": 3.7045886516571045, |
| "eval_runtime": 296.6142, |
| "eval_samples_per_second": 50.571, |
| "eval_steps_per_second": 6.321, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.2919298245614035, |
| "grad_norm": 0.3984375, |
| "learning_rate": 0.00018657334218620119, |
| "loss": 3.6917, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.3031578947368421, |
| "grad_norm": 0.39453125, |
| "learning_rate": 0.00017875245498019782, |
| "loss": 3.6778, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.3143859649122807, |
| "grad_norm": 0.361328125, |
| "learning_rate": 0.0001708495105708196, |
| "loss": 3.6465, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.3256140350877193, |
| "grad_norm": 0.380859375, |
| "learning_rate": 0.0001628870633283667, |
| "loss": 3.61, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.3368421052631579, |
| "grad_norm": 0.373046875, |
| "learning_rate": 0.00015488783743945845, |
| "loss": 3.6202, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.3368421052631579, |
| "eval_loss": 3.6153271198272705, |
| "eval_runtime": 296.6146, |
| "eval_samples_per_second": 50.571, |
| "eval_steps_per_second": 6.321, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.3480701754385965, |
| "grad_norm": 0.384765625, |
| "learning_rate": 0.00014687466205402512, |
| "loss": 3.6382, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.35929824561403506, |
| "grad_norm": 0.3828125, |
| "learning_rate": 0.00013887040613274276, |
| "loss": 3.6321, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.3705263157894737, |
| "grad_norm": 0.369140625, |
| "learning_rate": 0.00013089791318085124, |
| "loss": 3.5585, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.3817543859649123, |
| "grad_norm": 0.453125, |
| "learning_rate": 0.00012297993605461978, |
| "loss": 3.5688, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.3929824561403509, |
| "grad_norm": 0.380859375, |
| "learning_rate": 0.00011513907202651833, |
| "loss": 3.5655, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.3929824561403509, |
| "eval_loss": 3.5610458850860596, |
| "eval_runtime": 296.6199, |
| "eval_samples_per_second": 50.57, |
| "eval_steps_per_second": 6.321, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.40421052631578946, |
| "grad_norm": 0.392578125, |
| "learning_rate": 0.0001073976982944116, |
| "loss": 3.5872, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.41543859649122805, |
| "grad_norm": 0.5078125, |
| "learning_rate": 9.977790811882895e-05, |
| "loss": 3.5564, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.4266666666666667, |
| "grad_norm": 0.390625, |
| "learning_rate": 9.23014477705685e-05, |
| "loss": 3.5286, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.4378947368421053, |
| "grad_norm": 0.419921875, |
| "learning_rate": 8.498965446858242e-05, |
| "loss": 3.5338, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.44912280701754387, |
| "grad_norm": 0.36328125, |
| "learning_rate": 7.786339548526267e-05, |
| "loss": 3.522, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.44912280701754387, |
| "eval_loss": 3.527006149291992, |
| "eval_runtime": 296.6237, |
| "eval_samples_per_second": 50.569, |
| "eval_steps_per_second": 6.321, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.46035087719298246, |
| "grad_norm": 0.365234375, |
| "learning_rate": 7.094300859291779e-05, |
| "loss": 3.5287, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.47157894736842104, |
| "grad_norm": 0.36328125, |
| "learning_rate": 6.424824402139943e-05, |
| "loss": 3.5926, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.48280701754385963, |
| "grad_norm": 0.384765625, |
| "learning_rate": 5.779820809252842e-05, |
| "loss": 3.5062, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.49403508771929827, |
| "grad_norm": 0.38671875, |
| "learning_rate": 5.161130869218288e-05, |
| "loss": 3.4921, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.5052631578947369, |
| "grad_norm": 0.375, |
| "learning_rate": 4.5705202735666346e-05, |
| "loss": 3.5462, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.5052631578947369, |
| "eval_loss": 3.5098726749420166, |
| "eval_runtime": 296.5947, |
| "eval_samples_per_second": 50.574, |
| "eval_steps_per_second": 6.322, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.5164912280701754, |
| "grad_norm": 0.380859375, |
| "learning_rate": 4.0096745776285776e-05, |
| "loss": 3.5366, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.527719298245614, |
| "grad_norm": 0.37890625, |
| "learning_rate": 3.4801943900952816e-05, |
| "loss": 3.4956, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.5389473684210526, |
| "grad_norm": 0.361328125, |
| "learning_rate": 2.9835908050093693e-05, |
| "loss": 3.4891, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.5501754385964912, |
| "grad_norm": 0.357421875, |
| "learning_rate": 2.521281089223571e-05, |
| "loss": 3.5134, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.5614035087719298, |
| "grad_norm": 0.36328125, |
| "learning_rate": 2.094584637634653e-05, |
| "loss": 3.5214, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.5614035087719298, |
| "eval_loss": 3.503505229949951, |
| "eval_runtime": 296.6246, |
| "eval_samples_per_second": 50.569, |
| "eval_steps_per_second": 6.321, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.5726315789473684, |
| "grad_norm": 0.376953125, |
| "learning_rate": 1.7047192077360732e-05, |
| "loss": 3.4897, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.583859649122807, |
| "grad_norm": 0.35546875, |
| "learning_rate": 1.352797444235626e-05, |
| "loss": 3.4981, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.5950877192982457, |
| "grad_norm": 0.3515625, |
| "learning_rate": 1.0398237036565915e-05, |
| "loss": 3.5027, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.6063157894736843, |
| "grad_norm": 0.349609375, |
| "learning_rate": 7.66691187984656e-06, |
| "loss": 3.5306, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.6175438596491228, |
| "grad_norm": 0.361328125, |
| "learning_rate": 5.341793955409951e-06, |
| "loss": 3.4939, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.6175438596491228, |
| "eval_loss": 3.502210855484009, |
| "eval_runtime": 296.5914, |
| "eval_samples_per_second": 50.575, |
| "eval_steps_per_second": 6.322, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.6287719298245614, |
| "grad_norm": 0.359375, |
| "learning_rate": 3.42951896356507e-06, |
| "loss": 3.5079, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 0.373046875, |
| "learning_rate": 1.9355443839609632e-06, |
| "loss": 3.5295, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.6512280701754386, |
| "grad_norm": 0.353515625, |
| "learning_rate": 8.64133900376951e-07, |
| "loss": 3.5329, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.6624561403508772, |
| "grad_norm": 0.38671875, |
| "learning_rate": 2.1834523251072488e-07, |
| "loss": 3.5276, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.6736842105263158, |
| "grad_norm": 0.373046875, |
| "learning_rate": 2.1409491196555663e-11, |
| "loss": 3.483, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.6736842105263158, |
| "eval_loss": 3.5021510124206543, |
| "eval_runtime": 296.6149, |
| "eval_samples_per_second": 50.571, |
| "eval_steps_per_second": 6.321, |
| "step": 6000 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 6000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 2000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.91820076531712e+17, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|