| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.9988776655443323, |
| "eval_steps": 500, |
| "global_step": 1002, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.029928918817807706, |
| "grad_norm": 1.3197502899687559, |
| "learning_rate": 5e-06, |
| "loss": 0.7729, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.05985783763561541, |
| "grad_norm": 0.7957246856225982, |
| "learning_rate": 5e-06, |
| "loss": 0.694, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.08978675645342311, |
| "grad_norm": 0.7023919119728642, |
| "learning_rate": 5e-06, |
| "loss": 0.6692, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.11971567527123082, |
| "grad_norm": 0.6852471491957217, |
| "learning_rate": 5e-06, |
| "loss": 0.6652, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.14964459408903855, |
| "grad_norm": 0.7441284057804172, |
| "learning_rate": 5e-06, |
| "loss": 0.6567, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.17957351290684623, |
| "grad_norm": 0.6737131800519109, |
| "learning_rate": 5e-06, |
| "loss": 0.6489, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.20950243172465394, |
| "grad_norm": 0.8645698704938743, |
| "learning_rate": 5e-06, |
| "loss": 0.6458, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.23943135054246165, |
| "grad_norm": 0.6824403788440216, |
| "learning_rate": 5e-06, |
| "loss": 0.6472, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.26936026936026936, |
| "grad_norm": 0.8355879527708924, |
| "learning_rate": 5e-06, |
| "loss": 0.6382, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.2992891881780771, |
| "grad_norm": 0.6566317269166482, |
| "learning_rate": 5e-06, |
| "loss": 0.6394, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.3292181069958848, |
| "grad_norm": 0.7025002610859795, |
| "learning_rate": 5e-06, |
| "loss": 0.6352, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.35914702581369246, |
| "grad_norm": 0.7294514273893201, |
| "learning_rate": 5e-06, |
| "loss": 0.6341, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.3890759446315002, |
| "grad_norm": 0.7204998726570041, |
| "learning_rate": 5e-06, |
| "loss": 0.6342, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.4190048634493079, |
| "grad_norm": 0.9245929000779519, |
| "learning_rate": 5e-06, |
| "loss": 0.6279, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.4489337822671156, |
| "grad_norm": 0.8312008040431372, |
| "learning_rate": 5e-06, |
| "loss": 0.6298, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.4788627010849233, |
| "grad_norm": 0.6941447661619787, |
| "learning_rate": 5e-06, |
| "loss": 0.6287, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.508791619902731, |
| "grad_norm": 0.7880713474277835, |
| "learning_rate": 5e-06, |
| "loss": 0.623, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.5387205387205387, |
| "grad_norm": 0.7199931353143368, |
| "learning_rate": 5e-06, |
| "loss": 0.6247, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.5686494575383464, |
| "grad_norm": 0.7680737861171968, |
| "learning_rate": 5e-06, |
| "loss": 0.6223, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.5985783763561542, |
| "grad_norm": 0.7601491643468152, |
| "learning_rate": 5e-06, |
| "loss": 0.6299, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.6285072951739619, |
| "grad_norm": 0.76786698349262, |
| "learning_rate": 5e-06, |
| "loss": 0.6236, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.6584362139917695, |
| "grad_norm": 0.9029566246000676, |
| "learning_rate": 5e-06, |
| "loss": 0.6224, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.6883651328095772, |
| "grad_norm": 0.7045261038164553, |
| "learning_rate": 5e-06, |
| "loss": 0.6245, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.7182940516273849, |
| "grad_norm": 0.6774924026654922, |
| "learning_rate": 5e-06, |
| "loss": 0.6234, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.7482229704451927, |
| "grad_norm": 0.6011441610841004, |
| "learning_rate": 5e-06, |
| "loss": 0.6201, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.7781518892630004, |
| "grad_norm": 0.6589701033868924, |
| "learning_rate": 5e-06, |
| "loss": 0.6188, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.8080808080808081, |
| "grad_norm": 0.7793955701511873, |
| "learning_rate": 5e-06, |
| "loss": 0.6263, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.8380097268986157, |
| "grad_norm": 0.6801997659823543, |
| "learning_rate": 5e-06, |
| "loss": 0.6164, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.8679386457164235, |
| "grad_norm": 0.7863034594758392, |
| "learning_rate": 5e-06, |
| "loss": 0.6133, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.8978675645342312, |
| "grad_norm": 0.8097674195506819, |
| "learning_rate": 5e-06, |
| "loss": 0.6145, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.9277964833520389, |
| "grad_norm": 0.6976711471967793, |
| "learning_rate": 5e-06, |
| "loss": 0.6132, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.9577254021698466, |
| "grad_norm": 0.6845188306806823, |
| "learning_rate": 5e-06, |
| "loss": 0.615, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.9876543209876543, |
| "grad_norm": 0.9820656021369418, |
| "learning_rate": 5e-06, |
| "loss": 0.6085, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.9996258885147774, |
| "eval_loss": 0.6192271709442139, |
| "eval_runtime": 270.7058, |
| "eval_samples_per_second": 33.25, |
| "eval_steps_per_second": 0.521, |
| "step": 334 |
| }, |
| { |
| "epoch": 1.017583239805462, |
| "grad_norm": 0.8670558349646079, |
| "learning_rate": 5e-06, |
| "loss": 0.6314, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.0475121586232696, |
| "grad_norm": 1.0194196930172406, |
| "learning_rate": 5e-06, |
| "loss": 0.5525, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.0774410774410774, |
| "grad_norm": 0.7571264421758325, |
| "learning_rate": 5e-06, |
| "loss": 0.5475, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.1073699962588852, |
| "grad_norm": 0.716142300432686, |
| "learning_rate": 5e-06, |
| "loss": 0.5479, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.1372989150766928, |
| "grad_norm": 0.7134968159345548, |
| "learning_rate": 5e-06, |
| "loss": 0.5483, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.1672278338945006, |
| "grad_norm": 0.7093422370162528, |
| "learning_rate": 5e-06, |
| "loss": 0.5497, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.1971567527123081, |
| "grad_norm": 0.6758306313904245, |
| "learning_rate": 5e-06, |
| "loss": 0.5499, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.227085671530116, |
| "grad_norm": 0.6590188596738886, |
| "learning_rate": 5e-06, |
| "loss": 0.5523, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.2570145903479237, |
| "grad_norm": 0.7115281205352587, |
| "learning_rate": 5e-06, |
| "loss": 0.557, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.2869435091657313, |
| "grad_norm": 0.6651956769462775, |
| "learning_rate": 5e-06, |
| "loss": 0.5521, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.316872427983539, |
| "grad_norm": 0.686904033471436, |
| "learning_rate": 5e-06, |
| "loss": 0.5542, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.3468013468013469, |
| "grad_norm": 0.7052326227629313, |
| "learning_rate": 5e-06, |
| "loss": 0.5473, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.3767302656191545, |
| "grad_norm": 0.6603203892732427, |
| "learning_rate": 5e-06, |
| "loss": 0.558, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.4066591844369623, |
| "grad_norm": 0.7204930864199384, |
| "learning_rate": 5e-06, |
| "loss": 0.557, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.43658810325477, |
| "grad_norm": 0.6582974125304011, |
| "learning_rate": 5e-06, |
| "loss": 0.5598, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.4665170220725776, |
| "grad_norm": 0.653408089340934, |
| "learning_rate": 5e-06, |
| "loss": 0.5589, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.4964459408903854, |
| "grad_norm": 0.7722703692356943, |
| "learning_rate": 5e-06, |
| "loss": 0.5549, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.5263748597081932, |
| "grad_norm": 0.6410463952946445, |
| "learning_rate": 5e-06, |
| "loss": 0.5571, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.5563037785260008, |
| "grad_norm": 0.6788292489082296, |
| "learning_rate": 5e-06, |
| "loss": 0.5567, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.5862326973438083, |
| "grad_norm": 0.7347495173956178, |
| "learning_rate": 5e-06, |
| "loss": 0.5557, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.6161616161616161, |
| "grad_norm": 0.7489697605253047, |
| "learning_rate": 5e-06, |
| "loss": 0.56, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.646090534979424, |
| "grad_norm": 0.6649594456868578, |
| "learning_rate": 5e-06, |
| "loss": 0.5596, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.6760194537972315, |
| "grad_norm": 0.6944801894329058, |
| "learning_rate": 5e-06, |
| "loss": 0.5499, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.7059483726150393, |
| "grad_norm": 0.7516636245416078, |
| "learning_rate": 5e-06, |
| "loss": 0.5519, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.735877291432847, |
| "grad_norm": 0.834145985540098, |
| "learning_rate": 5e-06, |
| "loss": 0.5525, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.7658062102506547, |
| "grad_norm": 0.6940488546001392, |
| "learning_rate": 5e-06, |
| "loss": 0.5561, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.7957351290684624, |
| "grad_norm": 0.6996951151429136, |
| "learning_rate": 5e-06, |
| "loss": 0.5568, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.8256640478862702, |
| "grad_norm": 0.6321044767548653, |
| "learning_rate": 5e-06, |
| "loss": 0.5543, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.8555929667040778, |
| "grad_norm": 0.6380400908901183, |
| "learning_rate": 5e-06, |
| "loss": 0.5521, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.8855218855218854, |
| "grad_norm": 0.6726516418071744, |
| "learning_rate": 5e-06, |
| "loss": 0.5536, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.9154508043396934, |
| "grad_norm": 0.6952484734366503, |
| "learning_rate": 5e-06, |
| "loss": 0.5556, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.945379723157501, |
| "grad_norm": 0.6339074390401458, |
| "learning_rate": 5e-06, |
| "loss": 0.554, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.9753086419753085, |
| "grad_norm": 0.8335015680516275, |
| "learning_rate": 5e-06, |
| "loss": 0.5595, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.9992517770295548, |
| "eval_loss": 0.615513026714325, |
| "eval_runtime": 271.7727, |
| "eval_samples_per_second": 33.12, |
| "eval_steps_per_second": 0.519, |
| "step": 668 |
| }, |
| { |
| "epoch": 2.0052375607931165, |
| "grad_norm": 1.095470898789856, |
| "learning_rate": 5e-06, |
| "loss": 0.5957, |
| "step": 670 |
| }, |
| { |
| "epoch": 2.035166479610924, |
| "grad_norm": 0.7925440628175368, |
| "learning_rate": 5e-06, |
| "loss": 0.4851, |
| "step": 680 |
| }, |
| { |
| "epoch": 2.0650953984287317, |
| "grad_norm": 0.7600969395946293, |
| "learning_rate": 5e-06, |
| "loss": 0.4799, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.0950243172465393, |
| "grad_norm": 0.8685890982294241, |
| "learning_rate": 5e-06, |
| "loss": 0.4845, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.1249532360643473, |
| "grad_norm": 0.7159762779954674, |
| "learning_rate": 5e-06, |
| "loss": 0.4862, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.154882154882155, |
| "grad_norm": 0.7850425626912287, |
| "learning_rate": 5e-06, |
| "loss": 0.4882, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.1848110736999624, |
| "grad_norm": 0.7829173560959974, |
| "learning_rate": 5e-06, |
| "loss": 0.4894, |
| "step": 730 |
| }, |
| { |
| "epoch": 2.2147399925177704, |
| "grad_norm": 0.7053202412118417, |
| "learning_rate": 5e-06, |
| "loss": 0.4898, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.244668911335578, |
| "grad_norm": 0.7275797577145928, |
| "learning_rate": 5e-06, |
| "loss": 0.4917, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.2745978301533856, |
| "grad_norm": 0.684143630508004, |
| "learning_rate": 5e-06, |
| "loss": 0.4878, |
| "step": 760 |
| }, |
| { |
| "epoch": 2.3045267489711936, |
| "grad_norm": 0.778690697436679, |
| "learning_rate": 5e-06, |
| "loss": 0.4936, |
| "step": 770 |
| }, |
| { |
| "epoch": 2.334455667789001, |
| "grad_norm": 0.6973756438711023, |
| "learning_rate": 5e-06, |
| "loss": 0.4885, |
| "step": 780 |
| }, |
| { |
| "epoch": 2.3643845866068087, |
| "grad_norm": 0.7512378015475496, |
| "learning_rate": 5e-06, |
| "loss": 0.4902, |
| "step": 790 |
| }, |
| { |
| "epoch": 2.3943135054246163, |
| "grad_norm": 0.6954041240036626, |
| "learning_rate": 5e-06, |
| "loss": 0.4947, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.4242424242424243, |
| "grad_norm": 0.7661445266388807, |
| "learning_rate": 5e-06, |
| "loss": 0.4995, |
| "step": 810 |
| }, |
| { |
| "epoch": 2.454171343060232, |
| "grad_norm": 0.7288724567709918, |
| "learning_rate": 5e-06, |
| "loss": 0.4979, |
| "step": 820 |
| }, |
| { |
| "epoch": 2.48410026187804, |
| "grad_norm": 0.7507674417043292, |
| "learning_rate": 5e-06, |
| "loss": 0.4968, |
| "step": 830 |
| }, |
| { |
| "epoch": 2.5140291806958475, |
| "grad_norm": 0.6886877322873068, |
| "learning_rate": 5e-06, |
| "loss": 0.4951, |
| "step": 840 |
| }, |
| { |
| "epoch": 2.543958099513655, |
| "grad_norm": 0.710314562589874, |
| "learning_rate": 5e-06, |
| "loss": 0.498, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.5738870183314626, |
| "grad_norm": 0.6994762876301733, |
| "learning_rate": 5e-06, |
| "loss": 0.4959, |
| "step": 860 |
| }, |
| { |
| "epoch": 2.6038159371492706, |
| "grad_norm": 0.7582356365854407, |
| "learning_rate": 5e-06, |
| "loss": 0.4938, |
| "step": 870 |
| }, |
| { |
| "epoch": 2.633744855967078, |
| "grad_norm": 0.731935619090177, |
| "learning_rate": 5e-06, |
| "loss": 0.4921, |
| "step": 880 |
| }, |
| { |
| "epoch": 2.6636737747848858, |
| "grad_norm": 0.74782144362319, |
| "learning_rate": 5e-06, |
| "loss": 0.4977, |
| "step": 890 |
| }, |
| { |
| "epoch": 2.6936026936026938, |
| "grad_norm": 0.6942188030457457, |
| "learning_rate": 5e-06, |
| "loss": 0.5011, |
| "step": 900 |
| }, |
| { |
| "epoch": 2.7235316124205013, |
| "grad_norm": 0.6881327401867374, |
| "learning_rate": 5e-06, |
| "loss": 0.4947, |
| "step": 910 |
| }, |
| { |
| "epoch": 2.753460531238309, |
| "grad_norm": 0.6856202337817314, |
| "learning_rate": 5e-06, |
| "loss": 0.4959, |
| "step": 920 |
| }, |
| { |
| "epoch": 2.7833894500561165, |
| "grad_norm": 0.7141040450174527, |
| "learning_rate": 5e-06, |
| "loss": 0.5016, |
| "step": 930 |
| }, |
| { |
| "epoch": 2.8133183688739245, |
| "grad_norm": 0.6857610208401852, |
| "learning_rate": 5e-06, |
| "loss": 0.4985, |
| "step": 940 |
| }, |
| { |
| "epoch": 2.843247287691732, |
| "grad_norm": 0.6698180625003869, |
| "learning_rate": 5e-06, |
| "loss": 0.4986, |
| "step": 950 |
| }, |
| { |
| "epoch": 2.87317620650954, |
| "grad_norm": 0.8039520213911328, |
| "learning_rate": 5e-06, |
| "loss": 0.502, |
| "step": 960 |
| }, |
| { |
| "epoch": 2.9031051253273477, |
| "grad_norm": 0.7415409936401505, |
| "learning_rate": 5e-06, |
| "loss": 0.4959, |
| "step": 970 |
| }, |
| { |
| "epoch": 2.9330340441451552, |
| "grad_norm": 0.7856625436324756, |
| "learning_rate": 5e-06, |
| "loss": 0.5023, |
| "step": 980 |
| }, |
| { |
| "epoch": 2.962962962962963, |
| "grad_norm": 0.761345605606732, |
| "learning_rate": 5e-06, |
| "loss": 0.5048, |
| "step": 990 |
| }, |
| { |
| "epoch": 2.992891881780771, |
| "grad_norm": 0.7302412373936236, |
| "learning_rate": 5e-06, |
| "loss": 0.5047, |
| "step": 1000 |
| }, |
| { |
| "epoch": 2.9988776655443323, |
| "eval_loss": 0.6354114413261414, |
| "eval_runtime": 270.917, |
| "eval_samples_per_second": 33.224, |
| "eval_steps_per_second": 0.52, |
| "step": 1002 |
| }, |
| { |
| "epoch": 2.9988776655443323, |
| "step": 1002, |
| "total_flos": 3817854814126080.0, |
| "train_loss": 0.5626692353727337, |
| "train_runtime": 46778.8688, |
| "train_samples_per_second": 10.967, |
| "train_steps_per_second": 0.021 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1002, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3817854814126080.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|