{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 682, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014670823399963322, "grad_norm": 4.686307907104492, "learning_rate": 2.608695652173913e-05, "loss": 1.086164951324463, "step": 10 }, { "epoch": 0.029341646799926645, "grad_norm": 1.3878921270370483, "learning_rate": 5.507246376811594e-05, "loss": 0.13360737562179564, "step": 20 }, { "epoch": 0.04401247019988997, "grad_norm": 0.7091318368911743, "learning_rate": 8.405797101449276e-05, "loss": 0.09536288380622863, "step": 30 }, { "epoch": 0.05868329359985329, "grad_norm": 0.9854250550270081, "learning_rate": 0.00011304347826086956, "loss": 0.06767122745513916, "step": 40 }, { "epoch": 0.07335411699981662, "grad_norm": 0.6373249888420105, "learning_rate": 0.00014202898550724638, "loss": 0.07146756052970886, "step": 50 }, { "epoch": 0.08802494039977994, "grad_norm": 0.6071982979774475, "learning_rate": 0.0001710144927536232, "loss": 0.07726313471794129, "step": 60 }, { "epoch": 0.10269576379974327, "grad_norm": 0.9975550174713135, "learning_rate": 0.0002, "loss": 0.06706693768501282, "step": 70 }, { "epoch": 0.11736658719970658, "grad_norm": 0.5094273090362549, "learning_rate": 0.00019986870332074194, "loss": 0.07236328125, "step": 80 }, { "epoch": 0.13203741059966992, "grad_norm": 0.7871644496917725, "learning_rate": 0.00019947515805932744, "loss": 0.07177542448043824, "step": 90 }, { "epoch": 0.14670823399963323, "grad_norm": 1.741397738456726, "learning_rate": 0.0001988203976394757, "loss": 0.048826560378074646, "step": 100 }, { "epoch": 0.16137905739959654, "grad_norm": 0.5896034836769104, "learning_rate": 0.0001979061414185635, "loss": 0.06617986559867858, "step": 110 }, { "epoch": 0.17604988079955988, "grad_norm": 0.7956095337867737, "learning_rate": 0.0001967347901727067, "loss": 0.08305451273918152, "step": 120 }, { "epoch": 0.1907207041995232, "grad_norm": 0.492136150598526, "learning_rate": 0.0001953094197924819, "loss": 0.06995530128479004, "step": 130 }, { "epoch": 0.20539152759948653, "grad_norm": 0.6558647751808167, "learning_rate": 0.00019363377320584174, "loss": 0.05795600414276123, "step": 140 }, { "epoch": 0.22006235099944985, "grad_norm": 0.5106936097145081, "learning_rate": 0.000191712250549435, "loss": 0.055801987648010254, "step": 150 }, { "epoch": 0.23473317439941316, "grad_norm": 0.44069793820381165, "learning_rate": 0.0001895498976141398, "loss": 0.06355689167976379, "step": 160 }, { "epoch": 0.2494039977993765, "grad_norm": 0.405668169260025, "learning_rate": 0.00018715239259515184, "loss": 0.05164743065834045, "step": 170 }, { "epoch": 0.26407482119933984, "grad_norm": 0.3792116045951843, "learning_rate": 0.00018452603118142112, "loss": 0.0679062008857727, "step": 180 }, { "epoch": 0.2787456445993031, "grad_norm": 0.6442562937736511, "learning_rate": 0.00018167771002359072, "loss": 0.06293455362319947, "step": 190 }, { "epoch": 0.29341646799926646, "grad_norm": 0.5488963723182678, "learning_rate": 0.0001786149086238503, "loss": 0.0556623637676239, "step": 200 }, { "epoch": 0.3080872913992298, "grad_norm": 0.29444584250450134, "learning_rate": 0.0001753456696952601, "loss": 0.0685071349143982, "step": 210 }, { "epoch": 0.3227581147991931, "grad_norm": 0.414754718542099, "learning_rate": 0.0001718785780421207, "loss": 0.06181240677833557, "step": 220 }, { "epoch": 0.3374289381991564, "grad_norm": 0.32311928272247314, "learning_rate": 0.00016822273801684682, "loss": 0.07574231624603271, "step": 230 }, { "epoch": 0.35209976159911976, "grad_norm": 0.48792004585266113, "learning_rate": 0.00016438774961254285, "loss": 0.043923291563987735, "step": 240 }, { "epoch": 0.36677058499908305, "grad_norm": 1.073688268661499, "learning_rate": 0.00016038368325405834, "loss": 0.05729702115058899, "step": 250 }, { "epoch": 0.3814414083990464, "grad_norm": 0.5740509629249573, "learning_rate": 0.00015622105335372127, "loss": 0.06541360020637513, "step": 260 }, { "epoch": 0.39611223179900973, "grad_norm": 0.6383430361747742, "learning_rate": 0.0001519107907011895, "loss": 0.057945191860198975, "step": 270 }, { "epoch": 0.41078305519897307, "grad_norm": 0.46824830770492554, "learning_rate": 0.0001474642137599232, "loss": 0.05833690166473389, "step": 280 }, { "epoch": 0.42545387859893635, "grad_norm": 0.689471960067749, "learning_rate": 0.00014289299894565147, "loss": 0.055763131380081175, "step": 290 }, { "epoch": 0.4401247019988997, "grad_norm": 0.362657368183136, "learning_rate": 0.00013820914996488004, "loss": 0.07186369895935059, "step": 300 }, { "epoch": 0.45479552539886303, "grad_norm": 0.40035027265548706, "learning_rate": 0.00013342496629395538, "loss": 0.06522900462150574, "step": 310 }, { "epoch": 0.4694663487988263, "grad_norm": 0.4143030345439911, "learning_rate": 0.00012855301088145652, "loss": 0.040158060193061826, "step": 320 }, { "epoch": 0.48413717219878966, "grad_norm": 0.24200226366519928, "learning_rate": 0.0001236060771587266, "loss": 0.05714940428733826, "step": 330 }, { "epoch": 0.498807995598753, "grad_norm": 0.2773888111114502, "learning_rate": 0.00011859715544517164, "loss": 0.04442446827888489, "step": 340 }, { "epoch": 0.5134788189987163, "grad_norm": 0.3424382209777832, "learning_rate": 0.00011353939883654476, "loss": 0.0499860942363739, "step": 350 }, { "epoch": 0.5281496423986797, "grad_norm": 0.4579457938671112, "learning_rate": 0.0001084460886657901, "loss": 0.05433698296546936, "step": 360 }, { "epoch": 0.542820465798643, "grad_norm": 0.6211843490600586, "learning_rate": 0.00010333059962714469, "loss": 0.048888799548149106, "step": 370 }, { "epoch": 0.5574912891986062, "grad_norm": 0.5006217360496521, "learning_rate": 9.820636465507961e-05, "loss": 0.05106990933418274, "step": 380 }, { "epoch": 0.5721621125985696, "grad_norm": 0.3458799719810486, "learning_rate": 9.308683965030631e-05, "loss": 0.03896746933460236, "step": 390 }, { "epoch": 0.5868329359985329, "grad_norm": 0.29490038752555847, "learning_rate": 8.798546814547487e-05, "loss": 0.044534245133399965, "step": 400 }, { "epoch": 0.6015037593984962, "grad_norm": 0.28574299812316895, "learning_rate": 8.291564600335022e-05, "loss": 0.04822182059288025, "step": 410 }, { "epoch": 0.6161745827984596, "grad_norm": 0.3831021785736084, "learning_rate": 7.789068624016616e-05, "loss": 0.04143353998661041, "step": 420 }, { "epoch": 0.6308454061984229, "grad_norm": 0.3398614525794983, "learning_rate": 7.292378406652891e-05, "loss": 0.039598295092582704, "step": 430 }, { "epoch": 0.6455162295983862, "grad_norm": 0.22855930030345917, "learning_rate": 6.802798223767044e-05, "loss": 0.04850543141365051, "step": 440 }, { "epoch": 0.6601870529983496, "grad_norm": 0.26312509179115295, "learning_rate": 6.321613680403946e-05, "loss": 0.05014724731445312, "step": 450 }, { "epoch": 0.6748578763983129, "grad_norm": 0.343281090259552, "learning_rate": 5.8500883352166715e-05, "loss": 0.03788905143737793, "step": 460 }, { "epoch": 0.6895286997982761, "grad_norm": 0.6305585503578186, "learning_rate": 5.3894603824454056e-05, "loss": 0.05027334094047546, "step": 470 }, { "epoch": 0.7041995231982395, "grad_norm": 0.09157969057559967, "learning_rate": 4.940939400501593e-05, "loss": 0.04882456958293915, "step": 480 }, { "epoch": 0.7188703465982028, "grad_norm": 0.2803705632686615, "learning_rate": 4.505703175695366e-05, "loss": 0.051465296745300294, "step": 490 }, { "epoch": 0.7335411699981661, "grad_norm": 0.32852405309677124, "learning_rate": 4.0848946094469334e-05, "loss": 0.04661123156547546, "step": 500 }, { "epoch": 0.7482119933981295, "grad_norm": 0.31889474391937256, "learning_rate": 3.679618717103316e-05, "loss": 0.04635309278964996, "step": 510 }, { "epoch": 0.7628828167980928, "grad_norm": 0.2887394428253174, "learning_rate": 3.2909397262414845e-05, "loss": 0.04521143436431885, "step": 520 }, { "epoch": 0.7775536401980561, "grad_norm": 0.4780530035495758, "learning_rate": 2.9198782820773828e-05, "loss": 0.041201579570770266, "step": 530 }, { "epoch": 0.7922244635980195, "grad_norm": 0.4354000985622406, "learning_rate": 2.5674087673194115e-05, "loss": 0.036979615688323975, "step": 540 }, { "epoch": 0.8068952869979827, "grad_norm": 0.11631964892148972, "learning_rate": 2.2344567435041054e-05, "loss": 0.03683710396289826, "step": 550 }, { "epoch": 0.8215661103979461, "grad_norm": 0.40628868341445923, "learning_rate": 1.9218965205330576e-05, "loss": 0.04675011336803436, "step": 560 }, { "epoch": 0.8362369337979094, "grad_norm": 0.31028568744659424, "learning_rate": 1.6305488607931486e-05, "loss": 0.033157148957252504, "step": 570 }, { "epoch": 0.8509077571978727, "grad_norm": 0.26061493158340454, "learning_rate": 1.3611788238890511e-05, "loss": 0.04655841886997223, "step": 580 }, { "epoch": 0.8655785805978361, "grad_norm": 0.2419964224100113, "learning_rate": 1.114493757647508e-05, "loss": 0.030328187346458434, "step": 590 }, { "epoch": 0.8802494039977994, "grad_norm": 0.3614746332168579, "learning_rate": 8.911414406689145e-06, "loss": 0.03218616545200348, "step": 600 }, { "epoch": 0.8949202273977627, "grad_norm": 0.37310630083084106, "learning_rate": 6.9170838130375505e-06, "loss": 0.04881116449832916, "step": 610 }, { "epoch": 0.9095910507977261, "grad_norm": 0.2813904881477356, "learning_rate": 5.167182775206026e-06, "loss": 0.05659586191177368, "step": 620 }, { "epoch": 0.9242618741976893, "grad_norm": 0.13661852478981018, "learning_rate": 3.6663064171005956e-06, "loss": 0.042176204919815066, "step": 630 }, { "epoch": 0.9389326975976526, "grad_norm": 0.3304874300956726, "learning_rate": 2.418395940357099e-06, "loss": 0.05347890257835388, "step": 640 }, { "epoch": 0.953603520997616, "grad_norm": 0.34348776936531067, "learning_rate": 1.4267282750077493e-06, "loss": 0.04495801329612732, "step": 650 }, { "epoch": 0.9682743443975793, "grad_norm": 0.27644288539886475, "learning_rate": 6.93907474480282e-07, "loss": 0.04055593609809875, "step": 660 }, { "epoch": 0.9829451677975426, "grad_norm": 0.2929557263851166, "learning_rate": 2.2185787752672104e-07, "loss": 0.03816842138767242, "step": 670 }, { "epoch": 0.997615991197506, "grad_norm": 0.2137778252363205, "learning_rate": 1.1819055037554095e-08, "loss": 0.05327551364898682, "step": 680 }, { "epoch": 1.0, "step": 682, "total_flos": 3.3343785449628713e+18, "train_loss": 0.07003523399807578, "train_runtime": 66098.6339, "train_samples_per_second": 0.165, "train_steps_per_second": 0.01 } ], "logging_steps": 10, "max_steps": 682, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.3343785449628713e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }