{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9976507439310884,
  "eval_steps": 500,
  "global_step": 957,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.031323414252153486,
      "grad_norm": 2.8610078308107854,
      "learning_rate": 5e-06,
      "loss": 0.9785,
      "step": 10
    },
    {
      "epoch": 0.06264682850430697,
      "grad_norm": 1.7702609085485501,
      "learning_rate": 5e-06,
      "loss": 0.8565,
      "step": 20
    },
    {
      "epoch": 0.09397024275646046,
      "grad_norm": 1.0245069679485275,
      "learning_rate": 5e-06,
      "loss": 0.8274,
      "step": 30
    },
    {
      "epoch": 0.12529365700861395,
      "grad_norm": 1.3708437833993996,
      "learning_rate": 5e-06,
      "loss": 0.799,
      "step": 40
    },
    {
      "epoch": 0.15661707126076743,
      "grad_norm": 0.8356008055369265,
      "learning_rate": 5e-06,
      "loss": 0.7847,
      "step": 50
    },
    {
      "epoch": 0.18794048551292092,
      "grad_norm": 1.0931171880514496,
      "learning_rate": 5e-06,
      "loss": 0.7733,
      "step": 60
    },
    {
      "epoch": 0.2192638997650744,
      "grad_norm": 0.8028201655311987,
      "learning_rate": 5e-06,
      "loss": 0.7543,
      "step": 70
    },
    {
      "epoch": 0.2505873140172279,
      "grad_norm": 0.7488845273909638,
      "learning_rate": 5e-06,
      "loss": 0.7505,
      "step": 80
    },
    {
      "epoch": 0.28191072826938135,
      "grad_norm": 0.6273923877919437,
      "learning_rate": 5e-06,
      "loss": 0.7414,
      "step": 90
    },
    {
      "epoch": 0.31323414252153486,
      "grad_norm": 0.5605687620675001,
      "learning_rate": 5e-06,
      "loss": 0.741,
      "step": 100
    },
    {
      "epoch": 0.3445575567736883,
      "grad_norm": 0.6510334014280172,
      "learning_rate": 5e-06,
      "loss": 0.7386,
      "step": 110
    },
    {
      "epoch": 0.37588097102584184,
      "grad_norm": 0.6570705408467123,
      "learning_rate": 5e-06,
      "loss": 0.7283,
      "step": 120
    },
    {
      "epoch": 0.4072043852779953,
      "grad_norm": 0.6492497055594569,
      "learning_rate": 5e-06,
      "loss": 0.7326,
      "step": 130
    },
    {
      "epoch": 0.4385277995301488,
      "grad_norm": 0.8036703292766153,
      "learning_rate": 5e-06,
      "loss": 0.7239,
      "step": 140
    },
    {
      "epoch": 0.46985121378230227,
      "grad_norm": 0.5816796226058352,
      "learning_rate": 5e-06,
      "loss": 0.7294,
      "step": 150
    },
    {
      "epoch": 0.5011746280344558,
      "grad_norm": 0.6877992121274088,
      "learning_rate": 5e-06,
      "loss": 0.7287,
      "step": 160
    },
    {
      "epoch": 0.5324980422866092,
      "grad_norm": 0.7272574314246748,
      "learning_rate": 5e-06,
      "loss": 0.7208,
      "step": 170
    },
    {
      "epoch": 0.5638214565387627,
      "grad_norm": 0.5709516023324598,
      "learning_rate": 5e-06,
      "loss": 0.7194,
      "step": 180
    },
    {
      "epoch": 0.5951448707909162,
      "grad_norm": 0.5707557600894976,
      "learning_rate": 5e-06,
      "loss": 0.7209,
      "step": 190
    },
    {
      "epoch": 0.6264682850430697,
      "grad_norm": 0.6274568023357758,
      "learning_rate": 5e-06,
      "loss": 0.719,
      "step": 200
    },
    {
      "epoch": 0.6577916992952232,
      "grad_norm": 0.5482714368683154,
      "learning_rate": 5e-06,
      "loss": 0.7155,
      "step": 210
    },
    {
      "epoch": 0.6891151135473766,
      "grad_norm": 0.5435355545098447,
      "learning_rate": 5e-06,
      "loss": 0.7137,
      "step": 220
    },
    {
      "epoch": 0.7204385277995301,
      "grad_norm": 0.5549463909209981,
      "learning_rate": 5e-06,
      "loss": 0.7145,
      "step": 230
    },
    {
      "epoch": 0.7517619420516837,
      "grad_norm": 0.5872929591126732,
      "learning_rate": 5e-06,
      "loss": 0.7151,
      "step": 240
    },
    {
      "epoch": 0.7830853563038371,
      "grad_norm": 0.6595124362809732,
      "learning_rate": 5e-06,
      "loss": 0.7145,
      "step": 250
    },
    {
      "epoch": 0.8144087705559906,
      "grad_norm": 0.5459174781598405,
      "learning_rate": 5e-06,
      "loss": 0.7174,
      "step": 260
    },
    {
      "epoch": 0.845732184808144,
      "grad_norm": 0.659643905818339,
      "learning_rate": 5e-06,
      "loss": 0.7089,
      "step": 270
    },
    {
      "epoch": 0.8770555990602976,
      "grad_norm": 0.686424151578199,
      "learning_rate": 5e-06,
      "loss": 0.7113,
      "step": 280
    },
    {
      "epoch": 0.9083790133124511,
      "grad_norm": 0.7152116087909246,
      "learning_rate": 5e-06,
      "loss": 0.7087,
      "step": 290
    },
    {
      "epoch": 0.9397024275646045,
      "grad_norm": 0.5755513442821608,
      "learning_rate": 5e-06,
      "loss": 0.7035,
      "step": 300
    },
    {
      "epoch": 0.971025841816758,
      "grad_norm": 0.49802238359887663,
      "learning_rate": 5e-06,
      "loss": 0.7101,
      "step": 310
    },
    {
      "epoch": 0.9992169146436961,
      "eval_loss": 0.7040486931800842,
      "eval_runtime": 340.4664,
      "eval_samples_per_second": 25.265,
      "eval_steps_per_second": 0.397,
      "step": 319
    },
    {
      "epoch": 1.0023492560689116,
      "grad_norm": 1.39557569608369,
      "learning_rate": 5e-06,
      "loss": 0.7543,
      "step": 320
    },
    {
      "epoch": 1.033672670321065,
      "grad_norm": 0.7048516666631669,
      "learning_rate": 5e-06,
      "loss": 0.6574,
      "step": 330
    },
    {
      "epoch": 1.0649960845732185,
      "grad_norm": 0.7154001922458556,
      "learning_rate": 5e-06,
      "loss": 0.6596,
      "step": 340
    },
    {
      "epoch": 1.096319498825372,
      "grad_norm": 0.8075870470182475,
      "learning_rate": 5e-06,
      "loss": 0.6634,
      "step": 350
    },
    {
      "epoch": 1.1276429130775254,
      "grad_norm": 0.6958604595694347,
      "learning_rate": 5e-06,
      "loss": 0.6588,
      "step": 360
    },
    {
      "epoch": 1.1589663273296789,
      "grad_norm": 0.6457902283198051,
      "learning_rate": 5e-06,
      "loss": 0.6627,
      "step": 370
    },
    {
      "epoch": 1.1902897415818323,
      "grad_norm": 0.7060779787846581,
      "learning_rate": 5e-06,
      "loss": 0.6623,
      "step": 380
    },
    {
      "epoch": 1.221613155833986,
      "grad_norm": 0.7178631366122828,
      "learning_rate": 5e-06,
      "loss": 0.6595,
      "step": 390
    },
    {
      "epoch": 1.2529365700861395,
      "grad_norm": 0.7303589998987047,
      "learning_rate": 5e-06,
      "loss": 0.6602,
      "step": 400
    },
    {
      "epoch": 1.284259984338293,
      "grad_norm": 0.614614426793837,
      "learning_rate": 5e-06,
      "loss": 0.6623,
      "step": 410
    },
    {
      "epoch": 1.3155833985904464,
      "grad_norm": 0.5603517373438384,
      "learning_rate": 5e-06,
      "loss": 0.6623,
      "step": 420
    },
    {
      "epoch": 1.3469068128425998,
      "grad_norm": 0.5439036905330217,
      "learning_rate": 5e-06,
      "loss": 0.6624,
      "step": 430
    },
    {
      "epoch": 1.3782302270947533,
      "grad_norm": 0.5345730868641901,
      "learning_rate": 5e-06,
      "loss": 0.6597,
      "step": 440
    },
    {
      "epoch": 1.4095536413469067,
      "grad_norm": 0.6120306566548636,
      "learning_rate": 5e-06,
      "loss": 0.6601,
      "step": 450
    },
    {
      "epoch": 1.4408770555990604,
      "grad_norm": 0.7662777275751876,
      "learning_rate": 5e-06,
      "loss": 0.6579,
      "step": 460
    },
    {
      "epoch": 1.4722004698512139,
      "grad_norm": 0.5861171635982056,
      "learning_rate": 5e-06,
      "loss": 0.6617,
      "step": 470
    },
    {
      "epoch": 1.5035238841033673,
      "grad_norm": 0.550703096699278,
      "learning_rate": 5e-06,
      "loss": 0.6575,
      "step": 480
    },
    {
      "epoch": 1.5348472983555208,
      "grad_norm": 0.8724124589877997,
      "learning_rate": 5e-06,
      "loss": 0.6619,
      "step": 490
    },
    {
      "epoch": 1.5661707126076743,
      "grad_norm": 0.5466331808335562,
      "learning_rate": 5e-06,
      "loss": 0.6582,
      "step": 500
    },
    {
      "epoch": 1.5974941268598277,
      "grad_norm": 0.5658899060560815,
      "learning_rate": 5e-06,
      "loss": 0.6606,
      "step": 510
    },
    {
      "epoch": 1.6288175411119812,
      "grad_norm": 0.5924530569896768,
      "learning_rate": 5e-06,
      "loss": 0.6591,
      "step": 520
    },
    {
      "epoch": 1.6601409553641346,
      "grad_norm": 0.6898298377267612,
      "learning_rate": 5e-06,
      "loss": 0.6567,
      "step": 530
    },
    {
      "epoch": 1.691464369616288,
      "grad_norm": 0.5747921428491914,
      "learning_rate": 5e-06,
      "loss": 0.6558,
      "step": 540
    },
    {
      "epoch": 1.7227877838684416,
      "grad_norm": 0.5318671363061512,
      "learning_rate": 5e-06,
      "loss": 0.6552,
      "step": 550
    },
    {
      "epoch": 1.754111198120595,
      "grad_norm": 0.6440423203167484,
      "learning_rate": 5e-06,
      "loss": 0.6569,
      "step": 560
    },
    {
      "epoch": 1.7854346123727485,
      "grad_norm": 0.6871334064396125,
      "learning_rate": 5e-06,
      "loss": 0.6579,
      "step": 570
    },
    {
      "epoch": 1.8167580266249022,
      "grad_norm": 0.6410119301707079,
      "learning_rate": 5e-06,
      "loss": 0.6586,
      "step": 580
    },
    {
      "epoch": 1.8480814408770556,
      "grad_norm": 0.5225004293954756,
      "learning_rate": 5e-06,
      "loss": 0.6602,
      "step": 590
    },
    {
      "epoch": 1.879404855129209,
      "grad_norm": 0.5478451699362562,
      "learning_rate": 5e-06,
      "loss": 0.6562,
      "step": 600
    },
    {
      "epoch": 1.9107282693813625,
      "grad_norm": 0.6535107349232507,
      "learning_rate": 5e-06,
      "loss": 0.6598,
      "step": 610
    },
    {
      "epoch": 1.9420516836335162,
      "grad_norm": 0.5332260801259929,
      "learning_rate": 5e-06,
      "loss": 0.6577,
      "step": 620
    },
    {
      "epoch": 1.9733750978856697,
      "grad_norm": 0.6351126336858737,
      "learning_rate": 5e-06,
      "loss": 0.6588,
      "step": 630
    },
    {
      "epoch": 1.9984338292873924,
      "eval_loss": 0.6915597319602966,
      "eval_runtime": 342.8543,
      "eval_samples_per_second": 25.089,
      "eval_steps_per_second": 0.394,
      "step": 638
    },
    {
      "epoch": 2.004698512137823,
      "grad_norm": 0.9348794210283986,
      "learning_rate": 5e-06,
      "loss": 0.7005,
      "step": 640
    },
    {
      "epoch": 2.0360219263899766,
      "grad_norm": 0.6116207803075494,
      "learning_rate": 5e-06,
      "loss": 0.61,
      "step": 650
    },
    {
      "epoch": 2.06734534064213,
      "grad_norm": 0.7136818100838924,
      "learning_rate": 5e-06,
      "loss": 0.6129,
      "step": 660
    },
    {
      "epoch": 2.0986687548942835,
      "grad_norm": 0.6893268088593335,
      "learning_rate": 5e-06,
      "loss": 0.6073,
      "step": 670
    },
    {
      "epoch": 2.129992169146437,
      "grad_norm": 0.5885891280735964,
      "learning_rate": 5e-06,
      "loss": 0.6069,
      "step": 680
    },
    {
      "epoch": 2.1613155833985904,
      "grad_norm": 0.5966990660667627,
      "learning_rate": 5e-06,
      "loss": 0.6115,
      "step": 690
    },
    {
      "epoch": 2.192638997650744,
      "grad_norm": 0.5903227979936005,
      "learning_rate": 5e-06,
      "loss": 0.6083,
      "step": 700
    },
    {
      "epoch": 2.2239624119028973,
      "grad_norm": 0.5378339870907247,
      "learning_rate": 5e-06,
      "loss": 0.6055,
      "step": 710
    },
    {
      "epoch": 2.255285826155051,
      "grad_norm": 0.7262899779231057,
      "learning_rate": 5e-06,
      "loss": 0.612,
      "step": 720
    },
    {
      "epoch": 2.2866092404072043,
      "grad_norm": 0.5969529088102685,
      "learning_rate": 5e-06,
      "loss": 0.6119,
      "step": 730
    },
    {
      "epoch": 2.3179326546593577,
      "grad_norm": 0.6265310864883772,
      "learning_rate": 5e-06,
      "loss": 0.6113,
      "step": 740
    },
    {
      "epoch": 2.349256068911511,
      "grad_norm": 0.6715123614069597,
      "learning_rate": 5e-06,
      "loss": 0.6111,
      "step": 750
    },
    {
      "epoch": 2.3805794831636646,
      "grad_norm": 0.5903854838476866,
      "learning_rate": 5e-06,
      "loss": 0.6121,
      "step": 760
    },
    {
      "epoch": 2.4119028974158185,
      "grad_norm": 0.7108455834296159,
      "learning_rate": 5e-06,
      "loss": 0.6159,
      "step": 770
    },
    {
      "epoch": 2.443226311667972,
      "grad_norm": 0.5396295194681806,
      "learning_rate": 5e-06,
      "loss": 0.6142,
      "step": 780
    },
    {
      "epoch": 2.4745497259201255,
      "grad_norm": 0.6709263277652183,
      "learning_rate": 5e-06,
      "loss": 0.6181,
      "step": 790
    },
    {
      "epoch": 2.505873140172279,
      "grad_norm": 0.6922212368293488,
      "learning_rate": 5e-06,
      "loss": 0.6127,
      "step": 800
    },
    {
      "epoch": 2.5371965544244324,
      "grad_norm": 0.802266836685208,
      "learning_rate": 5e-06,
      "loss": 0.6192,
      "step": 810
    },
    {
      "epoch": 2.568519968676586,
      "grad_norm": 0.7055921712823672,
      "learning_rate": 5e-06,
      "loss": 0.6098,
      "step": 820
    },
    {
      "epoch": 2.5998433829287393,
      "grad_norm": 0.6549155076226189,
      "learning_rate": 5e-06,
      "loss": 0.6126,
      "step": 830
    },
    {
      "epoch": 2.6311667971808927,
      "grad_norm": 0.6145335107184124,
      "learning_rate": 5e-06,
      "loss": 0.618,
      "step": 840
    },
    {
      "epoch": 2.662490211433046,
      "grad_norm": 0.6520405733421761,
      "learning_rate": 5e-06,
      "loss": 0.6132,
      "step": 850
    },
    {
      "epoch": 2.6938136256851997,
      "grad_norm": 0.5955304451757282,
      "learning_rate": 5e-06,
      "loss": 0.6211,
      "step": 860
    },
    {
      "epoch": 2.725137039937353,
      "grad_norm": 0.6957752696961866,
      "learning_rate": 5e-06,
      "loss": 0.6072,
      "step": 870
    },
    {
      "epoch": 2.7564604541895066,
      "grad_norm": 0.7212196624619537,
      "learning_rate": 5e-06,
      "loss": 0.6181,
      "step": 880
    },
    {
      "epoch": 2.78778386844166,
      "grad_norm": 0.6362474481595267,
      "learning_rate": 5e-06,
      "loss": 0.6183,
      "step": 890
    },
    {
      "epoch": 2.8191072826938135,
      "grad_norm": 0.6043748136471383,
      "learning_rate": 5e-06,
      "loss": 0.6185,
      "step": 900
    },
    {
      "epoch": 2.850430696945967,
      "grad_norm": 0.6293339433347603,
      "learning_rate": 5e-06,
      "loss": 0.6113,
      "step": 910
    },
    {
      "epoch": 2.881754111198121,
      "grad_norm": 0.5487837401420806,
      "learning_rate": 5e-06,
      "loss": 0.613,
      "step": 920
    },
    {
      "epoch": 2.913077525450274,
      "grad_norm": 0.6366669947057394,
      "learning_rate": 5e-06,
      "loss": 0.6178,
      "step": 930
    },
    {
      "epoch": 2.9444009397024278,
      "grad_norm": 0.6539470297076938,
      "learning_rate": 5e-06,
      "loss": 0.6157,
      "step": 940
    },
    {
      "epoch": 2.975724353954581,
      "grad_norm": 0.7862259105525281,
      "learning_rate": 5e-06,
      "loss": 0.6176,
      "step": 950
    },
    {
      "epoch": 2.9976507439310884,
      "eval_loss": 0.6914501786231995,
      "eval_runtime": 343.0486,
      "eval_samples_per_second": 25.075,
      "eval_steps_per_second": 0.394,
      "step": 957
    },
    {
      "epoch": 2.9976507439310884,
      "step": 957,
      "total_flos": 1602801264230400.0,
      "train_loss": 0.6732874076069958,
      "train_runtime": 56787.3035,
      "train_samples_per_second": 8.633,
      "train_steps_per_second": 0.017
    }
  ],
  "logging_steps": 10,
  "max_steps": 957,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1602801264230400.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}