{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.99775617053104,
  "eval_steps": 500,
  "global_step": 1002,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.029917726252804786,
      "grad_norm": 3.1024108625766647,
      "learning_rate": 5e-06,
      "loss": 0.7821,
      "step": 10
    },
    {
      "epoch": 0.05983545250560957,
      "grad_norm": 2.0154961377920744,
      "learning_rate": 5e-06,
      "loss": 0.709,
      "step": 20
    },
    {
      "epoch": 0.08975317875841436,
      "grad_norm": 4.098110110830773,
      "learning_rate": 5e-06,
      "loss": 0.716,
      "step": 30
    },
    {
      "epoch": 0.11967090501121914,
      "grad_norm": 1.3839237187441724,
      "learning_rate": 5e-06,
      "loss": 0.6884,
      "step": 40
    },
    {
      "epoch": 0.14958863126402394,
      "grad_norm": 1.6642156675636246,
      "learning_rate": 5e-06,
      "loss": 0.6742,
      "step": 50
    },
    {
      "epoch": 0.17950635751682872,
      "grad_norm": 0.8382342346916078,
      "learning_rate": 5e-06,
      "loss": 0.6578,
      "step": 60
    },
    {
      "epoch": 0.2094240837696335,
      "grad_norm": 1.0785514818670323,
      "learning_rate": 5e-06,
      "loss": 0.6526,
      "step": 70
    },
    {
      "epoch": 0.2393418100224383,
      "grad_norm": 0.6806480615454938,
      "learning_rate": 5e-06,
      "loss": 0.6534,
      "step": 80
    },
    {
      "epoch": 0.26925953627524307,
      "grad_norm": 0.6822998110330022,
      "learning_rate": 5e-06,
      "loss": 0.6396,
      "step": 90
    },
    {
      "epoch": 0.2991772625280479,
      "grad_norm": 0.6407539938463268,
      "learning_rate": 5e-06,
      "loss": 0.6336,
      "step": 100
    },
    {
      "epoch": 0.32909498878085264,
      "grad_norm": 0.6023614683907215,
      "learning_rate": 5e-06,
      "loss": 0.6345,
      "step": 110
    },
    {
      "epoch": 0.35901271503365745,
      "grad_norm": 1.2991344935679592,
      "learning_rate": 5e-06,
      "loss": 0.6284,
      "step": 120
    },
    {
      "epoch": 0.3889304412864622,
      "grad_norm": 1.2836405178889319,
      "learning_rate": 5e-06,
      "loss": 0.6206,
      "step": 130
    },
    {
      "epoch": 0.418848167539267,
      "grad_norm": 0.5455928257982399,
      "learning_rate": 5e-06,
      "loss": 0.626,
      "step": 140
    },
    {
      "epoch": 0.4487658937920718,
      "grad_norm": 0.8705277514817991,
      "learning_rate": 5e-06,
      "loss": 0.6243,
      "step": 150
    },
    {
      "epoch": 0.4786836200448766,
      "grad_norm": 0.6015797080220293,
      "learning_rate": 5e-06,
      "loss": 0.6239,
      "step": 160
    },
    {
      "epoch": 0.5086013462976814,
      "grad_norm": 0.5799933689114735,
      "learning_rate": 5e-06,
      "loss": 0.6242,
      "step": 170
    },
    {
      "epoch": 0.5385190725504861,
      "grad_norm": 0.7181956170370547,
      "learning_rate": 5e-06,
      "loss": 0.6187,
      "step": 180
    },
    {
      "epoch": 0.5684367988032909,
      "grad_norm": 1.1457159660009166,
      "learning_rate": 5e-06,
      "loss": 0.6249,
      "step": 190
    },
    {
      "epoch": 0.5983545250560958,
      "grad_norm": 0.8081668167489676,
      "learning_rate": 5e-06,
      "loss": 0.6262,
      "step": 200
    },
    {
      "epoch": 0.6282722513089005,
      "grad_norm": 0.494100005031363,
      "learning_rate": 5e-06,
      "loss": 0.6115,
      "step": 210
    },
    {
      "epoch": 0.6581899775617053,
      "grad_norm": 0.49011911418037035,
      "learning_rate": 5e-06,
      "loss": 0.6154,
      "step": 220
    },
    {
      "epoch": 0.6881077038145101,
      "grad_norm": 0.5619549814669743,
      "learning_rate": 5e-06,
      "loss": 0.6182,
      "step": 230
    },
    {
      "epoch": 0.7180254300673149,
      "grad_norm": 0.47978866623290667,
      "learning_rate": 5e-06,
      "loss": 0.6136,
      "step": 240
    },
    {
      "epoch": 0.7479431563201197,
      "grad_norm": 0.4834220711675942,
      "learning_rate": 5e-06,
      "loss": 0.6092,
      "step": 250
    },
    {
      "epoch": 0.7778608825729244,
      "grad_norm": 0.5842572852141615,
      "learning_rate": 5e-06,
      "loss": 0.6183,
      "step": 260
    },
    {
      "epoch": 0.8077786088257293,
      "grad_norm": 0.5124590739903697,
      "learning_rate": 5e-06,
      "loss": 0.6091,
      "step": 270
    },
    {
      "epoch": 0.837696335078534,
      "grad_norm": 0.571561034937845,
      "learning_rate": 5e-06,
      "loss": 0.6089,
      "step": 280
    },
    {
      "epoch": 0.8676140613313388,
      "grad_norm": 0.50639640711907,
      "learning_rate": 5e-06,
      "loss": 0.6075,
      "step": 290
    },
    {
      "epoch": 0.8975317875841436,
      "grad_norm": 0.47922109749059255,
      "learning_rate": 5e-06,
      "loss": 0.6113,
      "step": 300
    },
    {
      "epoch": 0.9274495138369484,
      "grad_norm": 0.6105089487425442,
      "learning_rate": 5e-06,
      "loss": 0.6123,
      "step": 310
    },
    {
      "epoch": 0.9573672400897532,
      "grad_norm": 0.5056360719681718,
      "learning_rate": 5e-06,
      "loss": 0.6156,
      "step": 320
    },
    {
      "epoch": 0.9872849663425579,
      "grad_norm": 0.48485936511026195,
      "learning_rate": 5e-06,
      "loss": 0.6008,
      "step": 330
    },
    {
      "epoch": 0.9992520568436799,
      "eval_loss": 0.6066673398017883,
      "eval_runtime": 179.254,
      "eval_samples_per_second": 50.242,
      "eval_steps_per_second": 0.396,
      "step": 334
    },
    {
      "epoch": 1.0172026925953628,
      "grad_norm": 0.6922596264407767,
      "learning_rate": 5e-06,
      "loss": 0.5765,
      "step": 340
    },
    {
      "epoch": 1.0471204188481675,
      "grad_norm": 0.49267857964550554,
      "learning_rate": 5e-06,
      "loss": 0.5603,
      "step": 350
    },
    {
      "epoch": 1.0770381451009723,
      "grad_norm": 0.49751934654599217,
      "learning_rate": 5e-06,
      "loss": 0.5686,
      "step": 360
    },
    {
      "epoch": 1.106955871353777,
      "grad_norm": 0.6837370299507576,
      "learning_rate": 5e-06,
      "loss": 0.5612,
      "step": 370
    },
    {
      "epoch": 1.136873597606582,
      "grad_norm": 0.47693359913296135,
      "learning_rate": 5e-06,
      "loss": 0.5539,
      "step": 380
    },
    {
      "epoch": 1.1667913238593868,
      "grad_norm": 0.648584636579785,
      "learning_rate": 5e-06,
      "loss": 0.5622,
      "step": 390
    },
    {
      "epoch": 1.1967090501121915,
      "grad_norm": 0.5670898037838431,
      "learning_rate": 5e-06,
      "loss": 0.554,
      "step": 400
    },
    {
      "epoch": 1.2266267763649963,
      "grad_norm": 0.5703136712557304,
      "learning_rate": 5e-06,
      "loss": 0.5609,
      "step": 410
    },
    {
      "epoch": 1.256544502617801,
      "grad_norm": 0.6102946772879293,
      "learning_rate": 5e-06,
      "loss": 0.559,
      "step": 420
    },
    {
      "epoch": 1.2864622288706058,
      "grad_norm": 0.5772564364701466,
      "learning_rate": 5e-06,
      "loss": 0.5603,
      "step": 430
    },
    {
      "epoch": 1.3163799551234106,
      "grad_norm": 0.450931906177121,
      "learning_rate": 5e-06,
      "loss": 0.5601,
      "step": 440
    },
    {
      "epoch": 1.3462976813762153,
      "grad_norm": 0.5054266947188414,
      "learning_rate": 5e-06,
      "loss": 0.5572,
      "step": 450
    },
    {
      "epoch": 1.37621540762902,
      "grad_norm": 0.48292336210926307,
      "learning_rate": 5e-06,
      "loss": 0.5587,
      "step": 460
    },
    {
      "epoch": 1.406133133881825,
      "grad_norm": 0.5249447315851029,
      "learning_rate": 5e-06,
      "loss": 0.5575,
      "step": 470
    },
    {
      "epoch": 1.4360508601346298,
      "grad_norm": 0.5140359027555277,
      "learning_rate": 5e-06,
      "loss": 0.5546,
      "step": 480
    },
    {
      "epoch": 1.4659685863874345,
      "grad_norm": 0.5354245491327346,
      "learning_rate": 5e-06,
      "loss": 0.5578,
      "step": 490
    },
    {
      "epoch": 1.4958863126402393,
      "grad_norm": 0.5682468395490031,
      "learning_rate": 5e-06,
      "loss": 0.5602,
      "step": 500
    },
    {
      "epoch": 1.5258040388930443,
      "grad_norm": 0.5900754898799866,
      "learning_rate": 5e-06,
      "loss": 0.5616,
      "step": 510
    },
    {
      "epoch": 1.555721765145849,
      "grad_norm": 0.4996963352974884,
      "learning_rate": 5e-06,
      "loss": 0.5557,
      "step": 520
    },
    {
      "epoch": 1.5856394913986538,
      "grad_norm": 0.5075965675534208,
      "learning_rate": 5e-06,
      "loss": 0.5573,
      "step": 530
    },
    {
      "epoch": 1.6155572176514585,
      "grad_norm": 0.6139589992144621,
      "learning_rate": 5e-06,
      "loss": 0.5601,
      "step": 540
    },
    {
      "epoch": 1.6454749439042633,
      "grad_norm": 0.5479413460230939,
      "learning_rate": 5e-06,
      "loss": 0.5549,
      "step": 550
    },
    {
      "epoch": 1.675392670157068,
      "grad_norm": 0.5917928857307124,
      "learning_rate": 5e-06,
      "loss": 0.553,
      "step": 560
    },
    {
      "epoch": 1.7053103964098728,
      "grad_norm": 0.5837997645694308,
      "learning_rate": 5e-06,
      "loss": 0.557,
      "step": 570
    },
    {
      "epoch": 1.7352281226626776,
      "grad_norm": 0.5008632390644033,
      "learning_rate": 5e-06,
      "loss": 0.5497,
      "step": 580
    },
    {
      "epoch": 1.7651458489154823,
      "grad_norm": 0.5321160422034145,
      "learning_rate": 5e-06,
      "loss": 0.554,
      "step": 590
    },
    {
      "epoch": 1.795063575168287,
      "grad_norm": 0.5334951007964042,
      "learning_rate": 5e-06,
      "loss": 0.5594,
      "step": 600
    },
    {
      "epoch": 1.824981301421092,
      "grad_norm": 0.5210632059648287,
      "learning_rate": 5e-06,
      "loss": 0.56,
      "step": 610
    },
    {
      "epoch": 1.8548990276738968,
      "grad_norm": 0.5043445814465957,
      "learning_rate": 5e-06,
      "loss": 0.5575,
      "step": 620
    },
    {
      "epoch": 1.8848167539267016,
      "grad_norm": 0.613976408483885,
      "learning_rate": 5e-06,
      "loss": 0.5583,
      "step": 630
    },
    {
      "epoch": 1.9147344801795063,
      "grad_norm": 0.5363674682216356,
      "learning_rate": 5e-06,
      "loss": 0.5643,
      "step": 640
    },
    {
      "epoch": 1.9446522064323113,
      "grad_norm": 0.5061979916241445,
      "learning_rate": 5e-06,
      "loss": 0.5617,
      "step": 650
    },
    {
      "epoch": 1.974569932685116,
      "grad_norm": 0.5681791964553616,
      "learning_rate": 5e-06,
      "loss": 0.5567,
      "step": 660
    },
    {
      "epoch": 1.9985041136873598,
      "eval_loss": 0.5981965661048889,
      "eval_runtime": 179.6452,
      "eval_samples_per_second": 50.132,
      "eval_steps_per_second": 0.395,
      "step": 668
    },
    {
      "epoch": 2.004487658937921,
      "grad_norm": 0.7426343168495358,
      "learning_rate": 5e-06,
      "loss": 0.5487,
      "step": 670
    },
    {
      "epoch": 2.0344053851907256,
      "grad_norm": 0.8360352246281931,
      "learning_rate": 5e-06,
      "loss": 0.509,
      "step": 680
    },
    {
      "epoch": 2.0643231114435303,
      "grad_norm": 0.5586788759798959,
      "learning_rate": 5e-06,
      "loss": 0.5107,
      "step": 690
    },
    {
      "epoch": 2.094240837696335,
      "grad_norm": 0.5334495185578801,
      "learning_rate": 5e-06,
      "loss": 0.5083,
      "step": 700
    },
    {
      "epoch": 2.12415856394914,
      "grad_norm": 0.5451729047772684,
      "learning_rate": 5e-06,
      "loss": 0.5103,
      "step": 710
    },
    {
      "epoch": 2.1540762902019446,
      "grad_norm": 0.5384304992109907,
      "learning_rate": 5e-06,
      "loss": 0.5083,
      "step": 720
    },
    {
      "epoch": 2.1839940164547493,
      "grad_norm": 0.6897130607106118,
      "learning_rate": 5e-06,
      "loss": 0.5087,
      "step": 730
    },
    {
      "epoch": 2.213911742707554,
      "grad_norm": 0.4914356466355389,
      "learning_rate": 5e-06,
      "loss": 0.5028,
      "step": 740
    },
    {
      "epoch": 2.243829468960359,
      "grad_norm": 0.4859395300154547,
      "learning_rate": 5e-06,
      "loss": 0.5107,
      "step": 750
    },
    {
      "epoch": 2.273747195213164,
      "grad_norm": 0.5080772320686796,
      "learning_rate": 5e-06,
      "loss": 0.5085,
      "step": 760
    },
    {
      "epoch": 2.303664921465969,
      "grad_norm": 0.5928831782745143,
      "learning_rate": 5e-06,
      "loss": 0.5065,
      "step": 770
    },
    {
      "epoch": 2.3335826477187736,
      "grad_norm": 0.5292235530530627,
      "learning_rate": 5e-06,
      "loss": 0.5064,
      "step": 780
    },
    {
      "epoch": 2.3635003739715783,
      "grad_norm": 0.5612309376826925,
      "learning_rate": 5e-06,
      "loss": 0.5047,
      "step": 790
    },
    {
      "epoch": 2.393418100224383,
      "grad_norm": 0.7840153330542589,
      "learning_rate": 5e-06,
      "loss": 0.5107,
      "step": 800
    },
    {
      "epoch": 2.423335826477188,
      "grad_norm": 0.563462388459372,
      "learning_rate": 5e-06,
      "loss": 0.5074,
      "step": 810
    },
    {
      "epoch": 2.4532535527299926,
      "grad_norm": 0.6037763909811528,
      "learning_rate": 5e-06,
      "loss": 0.5027,
      "step": 820
    },
    {
      "epoch": 2.4831712789827973,
      "grad_norm": 0.512144492512373,
      "learning_rate": 5e-06,
      "loss": 0.5059,
      "step": 830
    },
    {
      "epoch": 2.513089005235602,
      "grad_norm": 0.5423885548086916,
      "learning_rate": 5e-06,
      "loss": 0.5087,
      "step": 840
    },
    {
      "epoch": 2.543006731488407,
      "grad_norm": 0.5435953936785038,
      "learning_rate": 5e-06,
      "loss": 0.5097,
      "step": 850
    },
    {
      "epoch": 2.5729244577412116,
      "grad_norm": 0.5702298253328785,
      "learning_rate": 5e-06,
      "loss": 0.5139,
      "step": 860
    },
    {
      "epoch": 2.6028421839940163,
      "grad_norm": 0.5598880036547539,
      "learning_rate": 5e-06,
      "loss": 0.5108,
      "step": 870
    },
    {
      "epoch": 2.632759910246821,
      "grad_norm": 0.5408931335977699,
      "learning_rate": 5e-06,
      "loss": 0.5123,
      "step": 880
    },
    {
      "epoch": 2.662677636499626,
      "grad_norm": 0.5650280550038018,
      "learning_rate": 5e-06,
      "loss": 0.5024,
      "step": 890
    },
    {
      "epoch": 2.6925953627524306,
      "grad_norm": 0.5162753582027944,
      "learning_rate": 5e-06,
      "loss": 0.5086,
      "step": 900
    },
    {
      "epoch": 2.7225130890052354,
      "grad_norm": 0.562255619044257,
      "learning_rate": 5e-06,
      "loss": 0.5091,
      "step": 910
    },
    {
      "epoch": 2.75243081525804,
      "grad_norm": 0.525530348245258,
      "learning_rate": 5e-06,
      "loss": 0.5101,
      "step": 920
    },
    {
      "epoch": 2.7823485415108453,
      "grad_norm": 0.5416570745705145,
      "learning_rate": 5e-06,
      "loss": 0.5099,
      "step": 930
    },
    {
      "epoch": 2.81226626776365,
      "grad_norm": 0.5094752288812316,
      "learning_rate": 5e-06,
      "loss": 0.5091,
      "step": 940
    },
    {
      "epoch": 2.842183994016455,
      "grad_norm": 0.48328984684734566,
      "learning_rate": 5e-06,
      "loss": 0.5115,
      "step": 950
    },
    {
      "epoch": 2.8721017202692596,
      "grad_norm": 0.5309507328482131,
      "learning_rate": 5e-06,
      "loss": 0.5145,
      "step": 960
    },
    {
      "epoch": 2.9020194465220643,
      "grad_norm": 0.5645199920156511,
      "learning_rate": 5e-06,
      "loss": 0.507,
      "step": 970
    },
    {
      "epoch": 2.931937172774869,
      "grad_norm": 0.6341772078202893,
      "learning_rate": 5e-06,
      "loss": 0.5164,
      "step": 980
    },
    {
      "epoch": 2.961854899027674,
      "grad_norm": 0.5241928497019043,
      "learning_rate": 5e-06,
      "loss": 0.5075,
      "step": 990
    },
    {
      "epoch": 2.9917726252804786,
      "grad_norm": 0.5568699384966846,
      "learning_rate": 5e-06,
      "loss": 0.5124,
      "step": 1000
    },
    {
      "epoch": 2.99775617053104,
      "eval_loss": 0.6033644080162048,
      "eval_runtime": 179.8158,
      "eval_samples_per_second": 50.085,
      "eval_steps_per_second": 0.395,
      "step": 1002
    },
    {
      "epoch": 2.99775617053104,
      "step": 1002,
      "total_flos": 1677968560619520.0,
      "train_loss": 0.5679433697949865,
      "train_runtime": 29958.3489,
      "train_samples_per_second": 17.134,
      "train_steps_per_second": 0.033
    }
  ],
  "logging_steps": 10,
  "max_steps": 1002,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1677968560619520.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}