{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9976507439310884, "eval_steps": 500, "global_step": 957, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.031323414252153486, "grad_norm": 2.8610078308107854, "learning_rate": 5e-06, "loss": 0.9785, "step": 10 }, { "epoch": 0.06264682850430697, "grad_norm": 1.7702609085485501, "learning_rate": 5e-06, "loss": 0.8565, "step": 20 }, { "epoch": 0.09397024275646046, "grad_norm": 1.0245069679485275, "learning_rate": 5e-06, "loss": 0.8274, "step": 30 }, { "epoch": 0.12529365700861395, "grad_norm": 1.3708437833993996, "learning_rate": 5e-06, "loss": 0.799, "step": 40 }, { "epoch": 0.15661707126076743, "grad_norm": 0.8356008055369265, "learning_rate": 5e-06, "loss": 0.7847, "step": 50 }, { "epoch": 0.18794048551292092, "grad_norm": 1.0931171880514496, "learning_rate": 5e-06, "loss": 0.7733, "step": 60 }, { "epoch": 0.2192638997650744, "grad_norm": 0.8028201655311987, "learning_rate": 5e-06, "loss": 0.7543, "step": 70 }, { "epoch": 0.2505873140172279, "grad_norm": 0.7488845273909638, "learning_rate": 5e-06, "loss": 0.7505, "step": 80 }, { "epoch": 0.28191072826938135, "grad_norm": 0.6273923877919437, "learning_rate": 5e-06, "loss": 0.7414, "step": 90 }, { "epoch": 0.31323414252153486, "grad_norm": 0.5605687620675001, "learning_rate": 5e-06, "loss": 0.741, "step": 100 }, { "epoch": 0.3445575567736883, "grad_norm": 0.6510334014280172, "learning_rate": 5e-06, "loss": 0.7386, "step": 110 }, { "epoch": 0.37588097102584184, "grad_norm": 0.6570705408467123, "learning_rate": 5e-06, "loss": 0.7283, "step": 120 }, { "epoch": 0.4072043852779953, "grad_norm": 0.6492497055594569, "learning_rate": 5e-06, "loss": 0.7326, "step": 130 }, { "epoch": 0.4385277995301488, "grad_norm": 0.8036703292766153, "learning_rate": 5e-06, "loss": 0.7239, "step": 140 }, { "epoch": 0.46985121378230227, "grad_norm": 0.5816796226058352, "learning_rate": 5e-06, "loss": 0.7294, "step": 150 }, { "epoch": 0.5011746280344558, "grad_norm": 0.6877992121274088, "learning_rate": 5e-06, "loss": 0.7287, "step": 160 }, { "epoch": 0.5324980422866092, "grad_norm": 0.7272574314246748, "learning_rate": 5e-06, "loss": 0.7208, "step": 170 }, { "epoch": 0.5638214565387627, "grad_norm": 0.5709516023324598, "learning_rate": 5e-06, "loss": 0.7194, "step": 180 }, { "epoch": 0.5951448707909162, "grad_norm": 0.5707557600894976, "learning_rate": 5e-06, "loss": 0.7209, "step": 190 }, { "epoch": 0.6264682850430697, "grad_norm": 0.6274568023357758, "learning_rate": 5e-06, "loss": 0.719, "step": 200 }, { "epoch": 0.6577916992952232, "grad_norm": 0.5482714368683154, "learning_rate": 5e-06, "loss": 0.7155, "step": 210 }, { "epoch": 0.6891151135473766, "grad_norm": 0.5435355545098447, "learning_rate": 5e-06, "loss": 0.7137, "step": 220 }, { "epoch": 0.7204385277995301, "grad_norm": 0.5549463909209981, "learning_rate": 5e-06, "loss": 0.7145, "step": 230 }, { "epoch": 0.7517619420516837, "grad_norm": 0.5872929591126732, "learning_rate": 5e-06, "loss": 0.7151, "step": 240 }, { "epoch": 0.7830853563038371, "grad_norm": 0.6595124362809732, "learning_rate": 5e-06, "loss": 0.7145, "step": 250 }, { "epoch": 0.8144087705559906, "grad_norm": 0.5459174781598405, "learning_rate": 5e-06, "loss": 0.7174, "step": 260 }, { "epoch": 0.845732184808144, "grad_norm": 0.659643905818339, "learning_rate": 5e-06, "loss": 0.7089, "step": 270 }, { "epoch": 0.8770555990602976, "grad_norm": 0.686424151578199, "learning_rate": 5e-06, "loss": 0.7113, "step": 280 }, { "epoch": 0.9083790133124511, "grad_norm": 0.7152116087909246, "learning_rate": 5e-06, "loss": 0.7087, "step": 290 }, { "epoch": 0.9397024275646045, "grad_norm": 0.5755513442821608, "learning_rate": 5e-06, "loss": 0.7035, "step": 300 }, { "epoch": 0.971025841816758, "grad_norm": 0.49802238359887663, "learning_rate": 5e-06, "loss": 0.7101, "step": 310 }, { "epoch": 0.9992169146436961, "eval_loss": 0.7040486931800842, "eval_runtime": 340.4664, "eval_samples_per_second": 25.265, "eval_steps_per_second": 0.397, "step": 319 }, { "epoch": 1.0023492560689116, "grad_norm": 1.39557569608369, "learning_rate": 5e-06, "loss": 0.7543, "step": 320 }, { "epoch": 1.033672670321065, "grad_norm": 0.7048516666631669, "learning_rate": 5e-06, "loss": 0.6574, "step": 330 }, { "epoch": 1.0649960845732185, "grad_norm": 0.7154001922458556, "learning_rate": 5e-06, "loss": 0.6596, "step": 340 }, { "epoch": 1.096319498825372, "grad_norm": 0.8075870470182475, "learning_rate": 5e-06, "loss": 0.6634, "step": 350 }, { "epoch": 1.1276429130775254, "grad_norm": 0.6958604595694347, "learning_rate": 5e-06, "loss": 0.6588, "step": 360 }, { "epoch": 1.1589663273296789, "grad_norm": 0.6457902283198051, "learning_rate": 5e-06, "loss": 0.6627, "step": 370 }, { "epoch": 1.1902897415818323, "grad_norm": 0.7060779787846581, "learning_rate": 5e-06, "loss": 0.6623, "step": 380 }, { "epoch": 1.221613155833986, "grad_norm": 0.7178631366122828, "learning_rate": 5e-06, "loss": 0.6595, "step": 390 }, { "epoch": 1.2529365700861395, "grad_norm": 0.7303589998987047, "learning_rate": 5e-06, "loss": 0.6602, "step": 400 }, { "epoch": 1.284259984338293, "grad_norm": 0.614614426793837, "learning_rate": 5e-06, "loss": 0.6623, "step": 410 }, { "epoch": 1.3155833985904464, "grad_norm": 0.5603517373438384, "learning_rate": 5e-06, "loss": 0.6623, "step": 420 }, { "epoch": 1.3469068128425998, "grad_norm": 0.5439036905330217, "learning_rate": 5e-06, "loss": 0.6624, "step": 430 }, { "epoch": 1.3782302270947533, "grad_norm": 0.5345730868641901, "learning_rate": 5e-06, "loss": 0.6597, "step": 440 }, { "epoch": 1.4095536413469067, "grad_norm": 0.6120306566548636, "learning_rate": 5e-06, "loss": 0.6601, "step": 450 }, { "epoch": 1.4408770555990604, "grad_norm": 0.7662777275751876, "learning_rate": 5e-06, "loss": 0.6579, "step": 460 }, { "epoch": 1.4722004698512139, "grad_norm": 0.5861171635982056, "learning_rate": 5e-06, "loss": 0.6617, "step": 470 }, { "epoch": 1.5035238841033673, "grad_norm": 0.550703096699278, "learning_rate": 5e-06, "loss": 0.6575, "step": 480 }, { "epoch": 1.5348472983555208, "grad_norm": 0.8724124589877997, "learning_rate": 5e-06, "loss": 0.6619, "step": 490 }, { "epoch": 1.5661707126076743, "grad_norm": 0.5466331808335562, "learning_rate": 5e-06, "loss": 0.6582, "step": 500 }, { "epoch": 1.5974941268598277, "grad_norm": 0.5658899060560815, "learning_rate": 5e-06, "loss": 0.6606, "step": 510 }, { "epoch": 1.6288175411119812, "grad_norm": 0.5924530569896768, "learning_rate": 5e-06, "loss": 0.6591, "step": 520 }, { "epoch": 1.6601409553641346, "grad_norm": 0.6898298377267612, "learning_rate": 5e-06, "loss": 0.6567, "step": 530 }, { "epoch": 1.691464369616288, "grad_norm": 0.5747921428491914, "learning_rate": 5e-06, "loss": 0.6558, "step": 540 }, { "epoch": 1.7227877838684416, "grad_norm": 0.5318671363061512, "learning_rate": 5e-06, "loss": 0.6552, "step": 550 }, { "epoch": 1.754111198120595, "grad_norm": 0.6440423203167484, "learning_rate": 5e-06, "loss": 0.6569, "step": 560 }, { "epoch": 1.7854346123727485, "grad_norm": 0.6871334064396125, "learning_rate": 5e-06, "loss": 0.6579, "step": 570 }, { "epoch": 1.8167580266249022, "grad_norm": 0.6410119301707079, "learning_rate": 5e-06, "loss": 0.6586, "step": 580 }, { "epoch": 1.8480814408770556, "grad_norm": 0.5225004293954756, "learning_rate": 5e-06, "loss": 0.6602, "step": 590 }, { "epoch": 1.879404855129209, "grad_norm": 0.5478451699362562, "learning_rate": 5e-06, "loss": 0.6562, "step": 600 }, { "epoch": 1.9107282693813625, "grad_norm": 0.6535107349232507, "learning_rate": 5e-06, "loss": 0.6598, "step": 610 }, { "epoch": 1.9420516836335162, "grad_norm": 0.5332260801259929, "learning_rate": 5e-06, "loss": 0.6577, "step": 620 }, { "epoch": 1.9733750978856697, "grad_norm": 0.6351126336858737, "learning_rate": 5e-06, "loss": 0.6588, "step": 630 }, { "epoch": 1.9984338292873924, "eval_loss": 0.6915597319602966, "eval_runtime": 342.8543, "eval_samples_per_second": 25.089, "eval_steps_per_second": 0.394, "step": 638 }, { "epoch": 2.004698512137823, "grad_norm": 0.9348794210283986, "learning_rate": 5e-06, "loss": 0.7005, "step": 640 }, { "epoch": 2.0360219263899766, "grad_norm": 0.6116207803075494, "learning_rate": 5e-06, "loss": 0.61, "step": 650 }, { "epoch": 2.06734534064213, "grad_norm": 0.7136818100838924, "learning_rate": 5e-06, "loss": 0.6129, "step": 660 }, { "epoch": 2.0986687548942835, "grad_norm": 0.6893268088593335, "learning_rate": 5e-06, "loss": 0.6073, "step": 670 }, { "epoch": 2.129992169146437, "grad_norm": 0.5885891280735964, "learning_rate": 5e-06, "loss": 0.6069, "step": 680 }, { "epoch": 2.1613155833985904, "grad_norm": 0.5966990660667627, "learning_rate": 5e-06, "loss": 0.6115, "step": 690 }, { "epoch": 2.192638997650744, "grad_norm": 0.5903227979936005, "learning_rate": 5e-06, "loss": 0.6083, "step": 700 }, { "epoch": 2.2239624119028973, "grad_norm": 0.5378339870907247, "learning_rate": 5e-06, "loss": 0.6055, "step": 710 }, { "epoch": 2.255285826155051, "grad_norm": 0.7262899779231057, "learning_rate": 5e-06, "loss": 0.612, "step": 720 }, { "epoch": 2.2866092404072043, "grad_norm": 0.5969529088102685, "learning_rate": 5e-06, "loss": 0.6119, "step": 730 }, { "epoch": 2.3179326546593577, "grad_norm": 0.6265310864883772, "learning_rate": 5e-06, "loss": 0.6113, "step": 740 }, { "epoch": 2.349256068911511, "grad_norm": 0.6715123614069597, "learning_rate": 5e-06, "loss": 0.6111, "step": 750 }, { "epoch": 2.3805794831636646, "grad_norm": 0.5903854838476866, "learning_rate": 5e-06, "loss": 0.6121, "step": 760 }, { "epoch": 2.4119028974158185, "grad_norm": 0.7108455834296159, "learning_rate": 5e-06, "loss": 0.6159, "step": 770 }, { "epoch": 2.443226311667972, "grad_norm": 0.5396295194681806, "learning_rate": 5e-06, "loss": 0.6142, "step": 780 }, { "epoch": 2.4745497259201255, "grad_norm": 0.6709263277652183, "learning_rate": 5e-06, "loss": 0.6181, "step": 790 }, { "epoch": 2.505873140172279, "grad_norm": 0.6922212368293488, "learning_rate": 5e-06, "loss": 0.6127, "step": 800 }, { "epoch": 2.5371965544244324, "grad_norm": 0.802266836685208, "learning_rate": 5e-06, "loss": 0.6192, "step": 810 }, { "epoch": 2.568519968676586, "grad_norm": 0.7055921712823672, "learning_rate": 5e-06, "loss": 0.6098, "step": 820 }, { "epoch": 2.5998433829287393, "grad_norm": 0.6549155076226189, "learning_rate": 5e-06, "loss": 0.6126, "step": 830 }, { "epoch": 2.6311667971808927, "grad_norm": 0.6145335107184124, "learning_rate": 5e-06, "loss": 0.618, "step": 840 }, { "epoch": 2.662490211433046, "grad_norm": 0.6520405733421761, "learning_rate": 5e-06, "loss": 0.6132, "step": 850 }, { "epoch": 2.6938136256851997, "grad_norm": 0.5955304451757282, "learning_rate": 5e-06, "loss": 0.6211, "step": 860 }, { "epoch": 2.725137039937353, "grad_norm": 0.6957752696961866, "learning_rate": 5e-06, "loss": 0.6072, "step": 870 }, { "epoch": 2.7564604541895066, "grad_norm": 0.7212196624619537, "learning_rate": 5e-06, "loss": 0.6181, "step": 880 }, { "epoch": 2.78778386844166, "grad_norm": 0.6362474481595267, "learning_rate": 5e-06, "loss": 0.6183, "step": 890 }, { "epoch": 2.8191072826938135, "grad_norm": 0.6043748136471383, "learning_rate": 5e-06, "loss": 0.6185, "step": 900 }, { "epoch": 2.850430696945967, "grad_norm": 0.6293339433347603, "learning_rate": 5e-06, "loss": 0.6113, "step": 910 }, { "epoch": 2.881754111198121, "grad_norm": 0.5487837401420806, "learning_rate": 5e-06, "loss": 0.613, "step": 920 }, { "epoch": 2.913077525450274, "grad_norm": 0.6366669947057394, "learning_rate": 5e-06, "loss": 0.6178, "step": 930 }, { "epoch": 2.9444009397024278, "grad_norm": 0.6539470297076938, "learning_rate": 5e-06, "loss": 0.6157, "step": 940 }, { "epoch": 2.975724353954581, "grad_norm": 0.7862259105525281, "learning_rate": 5e-06, "loss": 0.6176, "step": 950 }, { "epoch": 2.9976507439310884, "eval_loss": 0.6914501786231995, "eval_runtime": 343.0486, "eval_samples_per_second": 25.075, "eval_steps_per_second": 0.394, "step": 957 }, { "epoch": 2.9976507439310884, "step": 957, "total_flos": 1602801264230400.0, "train_loss": 0.6732874076069958, "train_runtime": 56787.3035, "train_samples_per_second": 8.633, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 957, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1602801264230400.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }