{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.99775617053104, "eval_steps": 500, "global_step": 1002, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.029917726252804786, "grad_norm": 3.1024108625766647, "learning_rate": 5e-06, "loss": 0.7821, "step": 10 }, { "epoch": 0.05983545250560957, "grad_norm": 2.0154961377920744, "learning_rate": 5e-06, "loss": 0.709, "step": 20 }, { "epoch": 0.08975317875841436, "grad_norm": 4.098110110830773, "learning_rate": 5e-06, "loss": 0.716, "step": 30 }, { "epoch": 0.11967090501121914, "grad_norm": 1.3839237187441724, "learning_rate": 5e-06, "loss": 0.6884, "step": 40 }, { "epoch": 0.14958863126402394, "grad_norm": 1.6642156675636246, "learning_rate": 5e-06, "loss": 0.6742, "step": 50 }, { "epoch": 0.17950635751682872, "grad_norm": 0.8382342346916078, "learning_rate": 5e-06, "loss": 0.6578, "step": 60 }, { "epoch": 0.2094240837696335, "grad_norm": 1.0785514818670323, "learning_rate": 5e-06, "loss": 0.6526, "step": 70 }, { "epoch": 0.2393418100224383, "grad_norm": 0.6806480615454938, "learning_rate": 5e-06, "loss": 0.6534, "step": 80 }, { "epoch": 0.26925953627524307, "grad_norm": 0.6822998110330022, "learning_rate": 5e-06, "loss": 0.6396, "step": 90 }, { "epoch": 0.2991772625280479, "grad_norm": 0.6407539938463268, "learning_rate": 5e-06, "loss": 0.6336, "step": 100 }, { "epoch": 0.32909498878085264, "grad_norm": 0.6023614683907215, "learning_rate": 5e-06, "loss": 0.6345, "step": 110 }, { "epoch": 0.35901271503365745, "grad_norm": 1.2991344935679592, "learning_rate": 5e-06, "loss": 0.6284, "step": 120 }, { "epoch": 0.3889304412864622, "grad_norm": 1.2836405178889319, "learning_rate": 5e-06, "loss": 0.6206, "step": 130 }, { "epoch": 0.418848167539267, "grad_norm": 0.5455928257982399, "learning_rate": 5e-06, "loss": 0.626, "step": 140 }, { "epoch": 0.4487658937920718, "grad_norm": 0.8705277514817991, "learning_rate": 5e-06, "loss": 0.6243, "step": 150 }, { "epoch": 0.4786836200448766, "grad_norm": 0.6015797080220293, "learning_rate": 5e-06, "loss": 0.6239, "step": 160 }, { "epoch": 0.5086013462976814, "grad_norm": 0.5799933689114735, "learning_rate": 5e-06, "loss": 0.6242, "step": 170 }, { "epoch": 0.5385190725504861, "grad_norm": 0.7181956170370547, "learning_rate": 5e-06, "loss": 0.6187, "step": 180 }, { "epoch": 0.5684367988032909, "grad_norm": 1.1457159660009166, "learning_rate": 5e-06, "loss": 0.6249, "step": 190 }, { "epoch": 0.5983545250560958, "grad_norm": 0.8081668167489676, "learning_rate": 5e-06, "loss": 0.6262, "step": 200 }, { "epoch": 0.6282722513089005, "grad_norm": 0.494100005031363, "learning_rate": 5e-06, "loss": 0.6115, "step": 210 }, { "epoch": 0.6581899775617053, "grad_norm": 0.49011911418037035, "learning_rate": 5e-06, "loss": 0.6154, "step": 220 }, { "epoch": 0.6881077038145101, "grad_norm": 0.5619549814669743, "learning_rate": 5e-06, "loss": 0.6182, "step": 230 }, { "epoch": 0.7180254300673149, "grad_norm": 0.47978866623290667, "learning_rate": 5e-06, "loss": 0.6136, "step": 240 }, { "epoch": 0.7479431563201197, "grad_norm": 0.4834220711675942, "learning_rate": 5e-06, "loss": 0.6092, "step": 250 }, { "epoch": 0.7778608825729244, "grad_norm": 0.5842572852141615, "learning_rate": 5e-06, "loss": 0.6183, "step": 260 }, { "epoch": 0.8077786088257293, "grad_norm": 0.5124590739903697, "learning_rate": 5e-06, "loss": 0.6091, "step": 270 }, { "epoch": 0.837696335078534, "grad_norm": 0.571561034937845, "learning_rate": 5e-06, "loss": 0.6089, "step": 280 }, { "epoch": 0.8676140613313388, "grad_norm": 0.50639640711907, "learning_rate": 5e-06, "loss": 0.6075, "step": 290 }, { "epoch": 0.8975317875841436, "grad_norm": 0.47922109749059255, "learning_rate": 5e-06, "loss": 0.6113, "step": 300 }, { "epoch": 0.9274495138369484, "grad_norm": 0.6105089487425442, "learning_rate": 5e-06, "loss": 0.6123, "step": 310 }, { "epoch": 0.9573672400897532, "grad_norm": 0.5056360719681718, "learning_rate": 5e-06, "loss": 0.6156, "step": 320 }, { "epoch": 0.9872849663425579, "grad_norm": 0.48485936511026195, "learning_rate": 5e-06, "loss": 0.6008, "step": 330 }, { "epoch": 0.9992520568436799, "eval_loss": 0.6066673398017883, "eval_runtime": 179.254, "eval_samples_per_second": 50.242, "eval_steps_per_second": 0.396, "step": 334 }, { "epoch": 1.0172026925953628, "grad_norm": 0.6922596264407767, "learning_rate": 5e-06, "loss": 0.5765, "step": 340 }, { "epoch": 1.0471204188481675, "grad_norm": 0.49267857964550554, "learning_rate": 5e-06, "loss": 0.5603, "step": 350 }, { "epoch": 1.0770381451009723, "grad_norm": 0.49751934654599217, "learning_rate": 5e-06, "loss": 0.5686, "step": 360 }, { "epoch": 1.106955871353777, "grad_norm": 0.6837370299507576, "learning_rate": 5e-06, "loss": 0.5612, "step": 370 }, { "epoch": 1.136873597606582, "grad_norm": 0.47693359913296135, "learning_rate": 5e-06, "loss": 0.5539, "step": 380 }, { "epoch": 1.1667913238593868, "grad_norm": 0.648584636579785, "learning_rate": 5e-06, "loss": 0.5622, "step": 390 }, { "epoch": 1.1967090501121915, "grad_norm": 0.5670898037838431, "learning_rate": 5e-06, "loss": 0.554, "step": 400 }, { "epoch": 1.2266267763649963, "grad_norm": 0.5703136712557304, "learning_rate": 5e-06, "loss": 0.5609, "step": 410 }, { "epoch": 1.256544502617801, "grad_norm": 0.6102946772879293, "learning_rate": 5e-06, "loss": 0.559, "step": 420 }, { "epoch": 1.2864622288706058, "grad_norm": 0.5772564364701466, "learning_rate": 5e-06, "loss": 0.5603, "step": 430 }, { "epoch": 1.3163799551234106, "grad_norm": 0.450931906177121, "learning_rate": 5e-06, "loss": 0.5601, "step": 440 }, { "epoch": 1.3462976813762153, "grad_norm": 0.5054266947188414, "learning_rate": 5e-06, "loss": 0.5572, "step": 450 }, { "epoch": 1.37621540762902, "grad_norm": 0.48292336210926307, "learning_rate": 5e-06, "loss": 0.5587, "step": 460 }, { "epoch": 1.406133133881825, "grad_norm": 0.5249447315851029, "learning_rate": 5e-06, "loss": 0.5575, "step": 470 }, { "epoch": 1.4360508601346298, "grad_norm": 0.5140359027555277, "learning_rate": 5e-06, "loss": 0.5546, "step": 480 }, { "epoch": 1.4659685863874345, "grad_norm": 0.5354245491327346, "learning_rate": 5e-06, "loss": 0.5578, "step": 490 }, { "epoch": 1.4958863126402393, "grad_norm": 0.5682468395490031, "learning_rate": 5e-06, "loss": 0.5602, "step": 500 }, { "epoch": 1.5258040388930443, "grad_norm": 0.5900754898799866, "learning_rate": 5e-06, "loss": 0.5616, "step": 510 }, { "epoch": 1.555721765145849, "grad_norm": 0.4996963352974884, "learning_rate": 5e-06, "loss": 0.5557, "step": 520 }, { "epoch": 1.5856394913986538, "grad_norm": 0.5075965675534208, "learning_rate": 5e-06, "loss": 0.5573, "step": 530 }, { "epoch": 1.6155572176514585, "grad_norm": 0.6139589992144621, "learning_rate": 5e-06, "loss": 0.5601, "step": 540 }, { "epoch": 1.6454749439042633, "grad_norm": 0.5479413460230939, "learning_rate": 5e-06, "loss": 0.5549, "step": 550 }, { "epoch": 1.675392670157068, "grad_norm": 0.5917928857307124, "learning_rate": 5e-06, "loss": 0.553, "step": 560 }, { "epoch": 1.7053103964098728, "grad_norm": 0.5837997645694308, "learning_rate": 5e-06, "loss": 0.557, "step": 570 }, { "epoch": 1.7352281226626776, "grad_norm": 0.5008632390644033, "learning_rate": 5e-06, "loss": 0.5497, "step": 580 }, { "epoch": 1.7651458489154823, "grad_norm": 0.5321160422034145, "learning_rate": 5e-06, "loss": 0.554, "step": 590 }, { "epoch": 1.795063575168287, "grad_norm": 0.5334951007964042, "learning_rate": 5e-06, "loss": 0.5594, "step": 600 }, { "epoch": 1.824981301421092, "grad_norm": 0.5210632059648287, "learning_rate": 5e-06, "loss": 0.56, "step": 610 }, { "epoch": 1.8548990276738968, "grad_norm": 0.5043445814465957, "learning_rate": 5e-06, "loss": 0.5575, "step": 620 }, { "epoch": 1.8848167539267016, "grad_norm": 0.613976408483885, "learning_rate": 5e-06, "loss": 0.5583, "step": 630 }, { "epoch": 1.9147344801795063, "grad_norm": 0.5363674682216356, "learning_rate": 5e-06, "loss": 0.5643, "step": 640 }, { "epoch": 1.9446522064323113, "grad_norm": 0.5061979916241445, "learning_rate": 5e-06, "loss": 0.5617, "step": 650 }, { "epoch": 1.974569932685116, "grad_norm": 0.5681791964553616, "learning_rate": 5e-06, "loss": 0.5567, "step": 660 }, { "epoch": 1.9985041136873598, "eval_loss": 0.5981965661048889, "eval_runtime": 179.6452, "eval_samples_per_second": 50.132, "eval_steps_per_second": 0.395, "step": 668 }, { "epoch": 2.004487658937921, "grad_norm": 0.7426343168495358, "learning_rate": 5e-06, "loss": 0.5487, "step": 670 }, { "epoch": 2.0344053851907256, "grad_norm": 0.8360352246281931, "learning_rate": 5e-06, "loss": 0.509, "step": 680 }, { "epoch": 2.0643231114435303, "grad_norm": 0.5586788759798959, "learning_rate": 5e-06, "loss": 0.5107, "step": 690 }, { "epoch": 2.094240837696335, "grad_norm": 0.5334495185578801, "learning_rate": 5e-06, "loss": 0.5083, "step": 700 }, { "epoch": 2.12415856394914, "grad_norm": 0.5451729047772684, "learning_rate": 5e-06, "loss": 0.5103, "step": 710 }, { "epoch": 2.1540762902019446, "grad_norm": 0.5384304992109907, "learning_rate": 5e-06, "loss": 0.5083, "step": 720 }, { "epoch": 2.1839940164547493, "grad_norm": 0.6897130607106118, "learning_rate": 5e-06, "loss": 0.5087, "step": 730 }, { "epoch": 2.213911742707554, "grad_norm": 0.4914356466355389, "learning_rate": 5e-06, "loss": 0.5028, "step": 740 }, { "epoch": 2.243829468960359, "grad_norm": 0.4859395300154547, "learning_rate": 5e-06, "loss": 0.5107, "step": 750 }, { "epoch": 2.273747195213164, "grad_norm": 0.5080772320686796, "learning_rate": 5e-06, "loss": 0.5085, "step": 760 }, { "epoch": 2.303664921465969, "grad_norm": 0.5928831782745143, "learning_rate": 5e-06, "loss": 0.5065, "step": 770 }, { "epoch": 2.3335826477187736, "grad_norm": 0.5292235530530627, "learning_rate": 5e-06, "loss": 0.5064, "step": 780 }, { "epoch": 2.3635003739715783, "grad_norm": 0.5612309376826925, "learning_rate": 5e-06, "loss": 0.5047, "step": 790 }, { "epoch": 2.393418100224383, "grad_norm": 0.7840153330542589, "learning_rate": 5e-06, "loss": 0.5107, "step": 800 }, { "epoch": 2.423335826477188, "grad_norm": 0.563462388459372, "learning_rate": 5e-06, "loss": 0.5074, "step": 810 }, { "epoch": 2.4532535527299926, "grad_norm": 0.6037763909811528, "learning_rate": 5e-06, "loss": 0.5027, "step": 820 }, { "epoch": 2.4831712789827973, "grad_norm": 0.512144492512373, "learning_rate": 5e-06, "loss": 0.5059, "step": 830 }, { "epoch": 2.513089005235602, "grad_norm": 0.5423885548086916, "learning_rate": 5e-06, "loss": 0.5087, "step": 840 }, { "epoch": 2.543006731488407, "grad_norm": 0.5435953936785038, "learning_rate": 5e-06, "loss": 0.5097, "step": 850 }, { "epoch": 2.5729244577412116, "grad_norm": 0.5702298253328785, "learning_rate": 5e-06, "loss": 0.5139, "step": 860 }, { "epoch": 2.6028421839940163, "grad_norm": 0.5598880036547539, "learning_rate": 5e-06, "loss": 0.5108, "step": 870 }, { "epoch": 2.632759910246821, "grad_norm": 0.5408931335977699, "learning_rate": 5e-06, "loss": 0.5123, "step": 880 }, { "epoch": 2.662677636499626, "grad_norm": 0.5650280550038018, "learning_rate": 5e-06, "loss": 0.5024, "step": 890 }, { "epoch": 2.6925953627524306, "grad_norm": 0.5162753582027944, "learning_rate": 5e-06, "loss": 0.5086, "step": 900 }, { "epoch": 2.7225130890052354, "grad_norm": 0.562255619044257, "learning_rate": 5e-06, "loss": 0.5091, "step": 910 }, { "epoch": 2.75243081525804, "grad_norm": 0.525530348245258, "learning_rate": 5e-06, "loss": 0.5101, "step": 920 }, { "epoch": 2.7823485415108453, "grad_norm": 0.5416570745705145, "learning_rate": 5e-06, "loss": 0.5099, "step": 930 }, { "epoch": 2.81226626776365, "grad_norm": 0.5094752288812316, "learning_rate": 5e-06, "loss": 0.5091, "step": 940 }, { "epoch": 2.842183994016455, "grad_norm": 0.48328984684734566, "learning_rate": 5e-06, "loss": 0.5115, "step": 950 }, { "epoch": 2.8721017202692596, "grad_norm": 0.5309507328482131, "learning_rate": 5e-06, "loss": 0.5145, "step": 960 }, { "epoch": 2.9020194465220643, "grad_norm": 0.5645199920156511, "learning_rate": 5e-06, "loss": 0.507, "step": 970 }, { "epoch": 2.931937172774869, "grad_norm": 0.6341772078202893, "learning_rate": 5e-06, "loss": 0.5164, "step": 980 }, { "epoch": 2.961854899027674, "grad_norm": 0.5241928497019043, "learning_rate": 5e-06, "loss": 0.5075, "step": 990 }, { "epoch": 2.9917726252804786, "grad_norm": 0.5568699384966846, "learning_rate": 5e-06, "loss": 0.5124, "step": 1000 }, { "epoch": 2.99775617053104, "eval_loss": 0.6033644080162048, "eval_runtime": 179.8158, "eval_samples_per_second": 50.085, "eval_steps_per_second": 0.395, "step": 1002 }, { "epoch": 2.99775617053104, "step": 1002, "total_flos": 1677968560619520.0, "train_loss": 0.5679433697949865, "train_runtime": 29958.3489, "train_samples_per_second": 17.134, "train_steps_per_second": 0.033 } ], "logging_steps": 10, "max_steps": 1002, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1677968560619520.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }