| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.992688870836718, |
| "eval_steps": 500, |
| "global_step": 921, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.03249390739236393, |
| "grad_norm": 2.5250117454296688, |
| "learning_rate": 5e-06, |
| "loss": 0.822, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.06498781478472786, |
| "grad_norm": 1.1696004146252543, |
| "learning_rate": 5e-06, |
| "loss": 0.7337, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.09748172217709179, |
| "grad_norm": 1.0985755367960321, |
| "learning_rate": 5e-06, |
| "loss": 0.7074, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.12997562956945571, |
| "grad_norm": 0.9210546044771958, |
| "learning_rate": 5e-06, |
| "loss": 0.6975, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.16246953696181965, |
| "grad_norm": 1.205608692095207, |
| "learning_rate": 5e-06, |
| "loss": 0.6824, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.19496344435418358, |
| "grad_norm": 0.9213739096950729, |
| "learning_rate": 5e-06, |
| "loss": 0.6687, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.22745735174654752, |
| "grad_norm": 1.1268962045000313, |
| "learning_rate": 5e-06, |
| "loss": 0.6663, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.25995125913891143, |
| "grad_norm": 0.7485158028875047, |
| "learning_rate": 5e-06, |
| "loss": 0.651, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.2924451665312754, |
| "grad_norm": 1.1057133229023792, |
| "learning_rate": 5e-06, |
| "loss": 0.6586, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.3249390739236393, |
| "grad_norm": 0.6278383743313128, |
| "learning_rate": 5e-06, |
| "loss": 0.6473, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.35743298131600326, |
| "grad_norm": 0.5769967627285304, |
| "learning_rate": 5e-06, |
| "loss": 0.6511, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.38992688870836717, |
| "grad_norm": 0.4919697275903666, |
| "learning_rate": 5e-06, |
| "loss": 0.6441, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.42242079610073113, |
| "grad_norm": 0.9547279637518388, |
| "learning_rate": 5e-06, |
| "loss": 0.6543, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.45491470349309504, |
| "grad_norm": 0.6806699547566448, |
| "learning_rate": 5e-06, |
| "loss": 0.6445, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.487408610885459, |
| "grad_norm": 0.7128274001677305, |
| "learning_rate": 5e-06, |
| "loss": 0.6364, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.5199025182778229, |
| "grad_norm": 0.5534613632007734, |
| "learning_rate": 5e-06, |
| "loss": 0.6325, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.5523964256701869, |
| "grad_norm": 0.6402876644636555, |
| "learning_rate": 5e-06, |
| "loss": 0.631, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.5848903330625508, |
| "grad_norm": 0.5061595935903519, |
| "learning_rate": 5e-06, |
| "loss": 0.629, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.6173842404549147, |
| "grad_norm": 0.5151243164239879, |
| "learning_rate": 5e-06, |
| "loss": 0.6383, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.6498781478472786, |
| "grad_norm": 0.5209850069165646, |
| "learning_rate": 5e-06, |
| "loss": 0.6277, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.6823720552396426, |
| "grad_norm": 0.53784457051734, |
| "learning_rate": 5e-06, |
| "loss": 0.6287, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.7148659626320065, |
| "grad_norm": 0.6457223875849278, |
| "learning_rate": 5e-06, |
| "loss": 0.6358, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.7473598700243704, |
| "grad_norm": 0.4969903844480984, |
| "learning_rate": 5e-06, |
| "loss": 0.6331, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.7798537774167343, |
| "grad_norm": 1.0671390274200754, |
| "learning_rate": 5e-06, |
| "loss": 0.6224, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.8123476848090982, |
| "grad_norm": 1.9438042138910412, |
| "learning_rate": 5e-06, |
| "loss": 0.6198, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.8448415922014623, |
| "grad_norm": 0.6831461136116804, |
| "learning_rate": 5e-06, |
| "loss": 0.6157, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.8773354995938262, |
| "grad_norm": 0.548956486134229, |
| "learning_rate": 5e-06, |
| "loss": 0.6116, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.9098294069861901, |
| "grad_norm": 0.5540177414913263, |
| "learning_rate": 5e-06, |
| "loss": 0.6326, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.942323314378554, |
| "grad_norm": 0.7706090314968392, |
| "learning_rate": 5e-06, |
| "loss": 0.6331, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.974817221770918, |
| "grad_norm": 0.59177053251295, |
| "learning_rate": 5e-06, |
| "loss": 0.6185, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.9975629569455727, |
| "eval_loss": 0.6177791357040405, |
| "eval_runtime": 107.8067, |
| "eval_samples_per_second": 76.878, |
| "eval_steps_per_second": 0.603, |
| "step": 307 |
| }, |
| { |
| "epoch": 1.007311129163282, |
| "grad_norm": 1.4811327109031245, |
| "learning_rate": 5e-06, |
| "loss": 0.6074, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.0398050365556457, |
| "grad_norm": 0.924221358685102, |
| "learning_rate": 5e-06, |
| "loss": 0.5701, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.0722989439480097, |
| "grad_norm": 0.7293714439923176, |
| "learning_rate": 5e-06, |
| "loss": 0.5836, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.1047928513403737, |
| "grad_norm": 0.5947409116978826, |
| "learning_rate": 5e-06, |
| "loss": 0.5676, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.1372867587327375, |
| "grad_norm": 0.7000520581211176, |
| "learning_rate": 5e-06, |
| "loss": 0.5722, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.1697806661251016, |
| "grad_norm": 0.5866183764561665, |
| "learning_rate": 5e-06, |
| "loss": 0.5692, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.2022745735174656, |
| "grad_norm": 0.5380736496064281, |
| "learning_rate": 5e-06, |
| "loss": 0.5716, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.2347684809098294, |
| "grad_norm": 0.4885682503135813, |
| "learning_rate": 5e-06, |
| "loss": 0.573, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.2672623883021934, |
| "grad_norm": 0.48634335674401963, |
| "learning_rate": 5e-06, |
| "loss": 0.5759, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.2997562956945572, |
| "grad_norm": 0.6079027143194111, |
| "learning_rate": 5e-06, |
| "loss": 0.5812, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.3322502030869212, |
| "grad_norm": 0.5095044846593028, |
| "learning_rate": 5e-06, |
| "loss": 0.5781, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.3647441104792852, |
| "grad_norm": 0.7887736133740892, |
| "learning_rate": 5e-06, |
| "loss": 0.5829, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.397238017871649, |
| "grad_norm": 0.546342811666024, |
| "learning_rate": 5e-06, |
| "loss": 0.5743, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.429731925264013, |
| "grad_norm": 0.49153267657911065, |
| "learning_rate": 5e-06, |
| "loss": 0.568, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.4622258326563768, |
| "grad_norm": 0.5236274223462694, |
| "learning_rate": 5e-06, |
| "loss": 0.5688, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.4947197400487409, |
| "grad_norm": 0.4672834690085644, |
| "learning_rate": 5e-06, |
| "loss": 0.5765, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.5272136474411049, |
| "grad_norm": 0.5572476008854027, |
| "learning_rate": 5e-06, |
| "loss": 0.5709, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.5597075548334687, |
| "grad_norm": 0.6292993165160258, |
| "learning_rate": 5e-06, |
| "loss": 0.5744, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.5922014622258327, |
| "grad_norm": 0.7710487341485719, |
| "learning_rate": 5e-06, |
| "loss": 0.5736, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.6246953696181965, |
| "grad_norm": 0.5087134161939325, |
| "learning_rate": 5e-06, |
| "loss": 0.5644, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.6571892770105605, |
| "grad_norm": 0.7400405946910072, |
| "learning_rate": 5e-06, |
| "loss": 0.5631, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.6896831844029245, |
| "grad_norm": 0.7142563791899093, |
| "learning_rate": 5e-06, |
| "loss": 0.5736, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.7221770917952883, |
| "grad_norm": 0.5733442942289497, |
| "learning_rate": 5e-06, |
| "loss": 0.565, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.7546709991876523, |
| "grad_norm": 0.5068224110056896, |
| "learning_rate": 5e-06, |
| "loss": 0.5735, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.7871649065800161, |
| "grad_norm": 0.5248635762437218, |
| "learning_rate": 5e-06, |
| "loss": 0.5673, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.8196588139723802, |
| "grad_norm": 0.5160341588912364, |
| "learning_rate": 5e-06, |
| "loss": 0.5678, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.8521527213647442, |
| "grad_norm": 0.5597640569403803, |
| "learning_rate": 5e-06, |
| "loss": 0.5685, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.8846466287571082, |
| "grad_norm": 0.5725824151776507, |
| "learning_rate": 5e-06, |
| "loss": 0.5717, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.917140536149472, |
| "grad_norm": 0.6677340004988277, |
| "learning_rate": 5e-06, |
| "loss": 0.5744, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.9496344435418358, |
| "grad_norm": 0.48981238508621994, |
| "learning_rate": 5e-06, |
| "loss": 0.5646, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.9821283509341998, |
| "grad_norm": 0.5139052165921854, |
| "learning_rate": 5e-06, |
| "loss": 0.5652, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.9983753046303818, |
| "eval_loss": 0.6079972982406616, |
| "eval_runtime": 105.9269, |
| "eval_samples_per_second": 78.243, |
| "eval_steps_per_second": 0.614, |
| "step": 615 |
| }, |
| { |
| "epoch": 2.014622258326564, |
| "grad_norm": 0.8669205295109212, |
| "learning_rate": 5e-06, |
| "loss": 0.5462, |
| "step": 620 |
| }, |
| { |
| "epoch": 2.047116165718928, |
| "grad_norm": 0.5112180094253919, |
| "learning_rate": 5e-06, |
| "loss": 0.5103, |
| "step": 630 |
| }, |
| { |
| "epoch": 2.0796100731112914, |
| "grad_norm": 0.6398716969047498, |
| "learning_rate": 5e-06, |
| "loss": 0.5161, |
| "step": 640 |
| }, |
| { |
| "epoch": 2.1121039805036554, |
| "grad_norm": 0.6665002162537883, |
| "learning_rate": 5e-06, |
| "loss": 0.5166, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.1445978878960195, |
| "grad_norm": 0.611641023084313, |
| "learning_rate": 5e-06, |
| "loss": 0.5057, |
| "step": 660 |
| }, |
| { |
| "epoch": 2.1770917952883835, |
| "grad_norm": 0.5429214844688849, |
| "learning_rate": 5e-06, |
| "loss": 0.5193, |
| "step": 670 |
| }, |
| { |
| "epoch": 2.2095857026807475, |
| "grad_norm": 0.590201521626834, |
| "learning_rate": 5e-06, |
| "loss": 0.5261, |
| "step": 680 |
| }, |
| { |
| "epoch": 2.2420796100731115, |
| "grad_norm": 0.5384309983189058, |
| "learning_rate": 5e-06, |
| "loss": 0.5216, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.274573517465475, |
| "grad_norm": 0.6332831563355791, |
| "learning_rate": 5e-06, |
| "loss": 0.5239, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.307067424857839, |
| "grad_norm": 0.5774048376947836, |
| "learning_rate": 5e-06, |
| "loss": 0.518, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.339561332250203, |
| "grad_norm": 0.5715485935547556, |
| "learning_rate": 5e-06, |
| "loss": 0.5311, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.372055239642567, |
| "grad_norm": 0.6112818977221842, |
| "learning_rate": 5e-06, |
| "loss": 0.5225, |
| "step": 730 |
| }, |
| { |
| "epoch": 2.404549147034931, |
| "grad_norm": 0.5043334772464461, |
| "learning_rate": 5e-06, |
| "loss": 0.5228, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.4370430544272947, |
| "grad_norm": 0.5427442635398124, |
| "learning_rate": 5e-06, |
| "loss": 0.5269, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.4695369618196588, |
| "grad_norm": 0.48115676937900054, |
| "learning_rate": 5e-06, |
| "loss": 0.5175, |
| "step": 760 |
| }, |
| { |
| "epoch": 2.502030869212023, |
| "grad_norm": 0.5657773427745864, |
| "learning_rate": 5e-06, |
| "loss": 0.5219, |
| "step": 770 |
| }, |
| { |
| "epoch": 2.534524776604387, |
| "grad_norm": 0.6022938542861616, |
| "learning_rate": 5e-06, |
| "loss": 0.5196, |
| "step": 780 |
| }, |
| { |
| "epoch": 2.567018683996751, |
| "grad_norm": 0.501463044421915, |
| "learning_rate": 5e-06, |
| "loss": 0.5167, |
| "step": 790 |
| }, |
| { |
| "epoch": 2.5995125913891144, |
| "grad_norm": 0.5141918965616773, |
| "learning_rate": 5e-06, |
| "loss": 0.5186, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.6320064987814784, |
| "grad_norm": 0.549194326600886, |
| "learning_rate": 5e-06, |
| "loss": 0.5269, |
| "step": 810 |
| }, |
| { |
| "epoch": 2.6645004061738424, |
| "grad_norm": 0.6157662245530849, |
| "learning_rate": 5e-06, |
| "loss": 0.5241, |
| "step": 820 |
| }, |
| { |
| "epoch": 2.6969943135662064, |
| "grad_norm": 0.6492362584129086, |
| "learning_rate": 5e-06, |
| "loss": 0.52, |
| "step": 830 |
| }, |
| { |
| "epoch": 2.7294882209585705, |
| "grad_norm": 0.532657814156255, |
| "learning_rate": 5e-06, |
| "loss": 0.5218, |
| "step": 840 |
| }, |
| { |
| "epoch": 2.761982128350934, |
| "grad_norm": 0.5072209705902165, |
| "learning_rate": 5e-06, |
| "loss": 0.5295, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.794476035743298, |
| "grad_norm": 0.5590113792642208, |
| "learning_rate": 5e-06, |
| "loss": 0.5237, |
| "step": 860 |
| }, |
| { |
| "epoch": 2.826969943135662, |
| "grad_norm": 0.5766994462509862, |
| "learning_rate": 5e-06, |
| "loss": 0.522, |
| "step": 870 |
| }, |
| { |
| "epoch": 2.859463850528026, |
| "grad_norm": 0.5390978647646242, |
| "learning_rate": 5e-06, |
| "loss": 0.5188, |
| "step": 880 |
| }, |
| { |
| "epoch": 2.89195775792039, |
| "grad_norm": 0.5874802051466262, |
| "learning_rate": 5e-06, |
| "loss": 0.5252, |
| "step": 890 |
| }, |
| { |
| "epoch": 2.9244516653127537, |
| "grad_norm": 0.4949352435636162, |
| "learning_rate": 5e-06, |
| "loss": 0.5181, |
| "step": 900 |
| }, |
| { |
| "epoch": 2.9569455727051177, |
| "grad_norm": 0.5960858577490875, |
| "learning_rate": 5e-06, |
| "loss": 0.5282, |
| "step": 910 |
| }, |
| { |
| "epoch": 2.9894394800974817, |
| "grad_norm": 0.551557074132645, |
| "learning_rate": 5e-06, |
| "loss": 0.5197, |
| "step": 920 |
| }, |
| { |
| "epoch": 2.992688870836718, |
| "eval_loss": 0.6120628714561462, |
| "eval_runtime": 104.3409, |
| "eval_samples_per_second": 79.432, |
| "eval_steps_per_second": 0.623, |
| "step": 921 |
| }, |
| { |
| "epoch": 2.992688870836718, |
| "step": 921, |
| "total_flos": 1542290543738880.0, |
| "train_loss": 0.5815144646245416, |
| "train_runtime": 15433.4875, |
| "train_samples_per_second": 30.607, |
| "train_steps_per_second": 0.06 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 921, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1542290543738880.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |