{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.976, "eval_steps": 500, "global_step": 93, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.032, "grad_norm": 6.08677457937853, "learning_rate": 4.000000000000001e-06, "loss": 0.8709, "step": 1 }, { "epoch": 0.064, "grad_norm": 5.862503603501722, "learning_rate": 8.000000000000001e-06, "loss": 0.8625, "step": 2 }, { "epoch": 0.096, "grad_norm": 4.5723036455815205, "learning_rate": 1.2e-05, "loss": 0.839, "step": 3 }, { "epoch": 0.128, "grad_norm": 2.0695238404961547, "learning_rate": 1.6000000000000003e-05, "loss": 0.7376, "step": 4 }, { "epoch": 0.16, "grad_norm": 5.589012282844458, "learning_rate": 2e-05, "loss": 0.825, "step": 5 }, { "epoch": 0.192, "grad_norm": 8.490554966738163, "learning_rate": 2.4e-05, "loss": 0.8478, "step": 6 }, { "epoch": 0.224, "grad_norm": 6.29894861103046, "learning_rate": 2.8e-05, "loss": 0.7658, "step": 7 }, { "epoch": 0.256, "grad_norm": 3.558435531359445, "learning_rate": 3.2000000000000005e-05, "loss": 0.7555, "step": 8 }, { "epoch": 0.288, "grad_norm": 2.722863714076688, "learning_rate": 3.6e-05, "loss": 0.7274, "step": 9 }, { "epoch": 0.32, "grad_norm": 2.088640250697761, "learning_rate": 4e-05, "loss": 0.6627, "step": 10 }, { "epoch": 0.352, "grad_norm": 1.5096876602344986, "learning_rate": 3.998567509632663e-05, "loss": 0.6817, "step": 11 }, { "epoch": 0.384, "grad_norm": 1.6896637753034105, "learning_rate": 3.9942720905593045e-05, "loss": 0.6761, "step": 12 }, { "epoch": 0.416, "grad_norm": 1.5060644054718806, "learning_rate": 3.98711989592637e-05, "loss": 0.6519, "step": 13 }, { "epoch": 0.448, "grad_norm": 1.4049366508424377, "learning_rate": 3.9771211711837774e-05, "loss": 0.6333, "step": 14 }, { "epoch": 0.48, "grad_norm": 1.411091031460123, "learning_rate": 3.9642902394084056e-05, "loss": 0.5874, "step": 15 }, { "epoch": 0.512, "grad_norm": 0.8883192699998052, "learning_rate": 3.948645480786427e-05, "loss": 0.6116, "step": 16 }, { "epoch": 0.544, "grad_norm": 1.129257283929009, "learning_rate": 3.930209306283867e-05, "loss": 0.5852, "step": 17 }, { "epoch": 0.576, "grad_norm": 0.901306643045694, "learning_rate": 3.909008125543111e-05, "loss": 0.5821, "step": 18 }, { "epoch": 0.608, "grad_norm": 0.9465933700636773, "learning_rate": 3.885072309051346e-05, "loss": 0.5833, "step": 19 }, { "epoch": 0.64, "grad_norm": 0.8046120689103757, "learning_rate": 3.858436144635131e-05, "loss": 0.5597, "step": 20 }, { "epoch": 0.672, "grad_norm": 0.9228859026973337, "learning_rate": 3.829137788343415e-05, "loss": 0.5571, "step": 21 }, { "epoch": 0.704, "grad_norm": 0.9202261747479373, "learning_rate": 3.797219209789365e-05, "loss": 0.6007, "step": 22 }, { "epoch": 0.736, "grad_norm": 0.9336252436543613, "learning_rate": 3.762726132029298e-05, "loss": 0.5618, "step": 23 }, { "epoch": 0.768, "grad_norm": 1.2080051557421407, "learning_rate": 3.725707966064846e-05, "loss": 0.6057, "step": 24 }, { "epoch": 0.8, "grad_norm": 1.1231601773572062, "learning_rate": 3.686217740062169e-05, "loss": 0.5603, "step": 25 }, { "epoch": 0.832, "grad_norm": 0.8382981788712457, "learning_rate": 3.644312023389621e-05, "loss": 0.5491, "step": 26 }, { "epoch": 0.864, "grad_norm": 1.1862939971134692, "learning_rate": 3.600050845582669e-05, "loss": 0.5887, "step": 27 }, { "epoch": 0.896, "grad_norm": 1.0957634618366314, "learning_rate": 3.5534976103521716e-05, "loss": 0.5958, "step": 28 }, { "epoch": 0.928, "grad_norm": 0.6590817382859444, "learning_rate": 3.504719004759163e-05, "loss": 0.5528, "step": 29 }, { "epoch": 0.96, "grad_norm": 0.875749033008583, "learning_rate": 3.4537849036862874e-05, "loss": 0.565, "step": 30 }, { "epoch": 0.992, "grad_norm": 0.783563683071678, "learning_rate": 3.400768269742702e-05, "loss": 0.5595, "step": 31 }, { "epoch": 1.024, "grad_norm": 1.4075572002103194, "learning_rate": 3.345745048745838e-05, "loss": 0.919, "step": 32 }, { "epoch": 1.056, "grad_norm": 0.7062499705952857, "learning_rate": 3.288794060929754e-05, "loss": 0.4404, "step": 33 }, { "epoch": 1.088, "grad_norm": 0.9967541412280615, "learning_rate": 3.229996888035908e-05, "loss": 0.4984, "step": 34 }, { "epoch": 1.12, "grad_norm": 0.8310704240859569, "learning_rate": 3.169437756448095e-05, "loss": 0.4807, "step": 35 }, { "epoch": 1.152, "grad_norm": 0.7016314595292313, "learning_rate": 3.107203416538969e-05, "loss": 0.4703, "step": 36 }, { "epoch": 1.184, "grad_norm": 1.4372935885534768, "learning_rate": 3.0433830184009694e-05, "loss": 0.4739, "step": 37 }, { "epoch": 1.216, "grad_norm": 0.8650508524606009, "learning_rate": 2.9780679841396668e-05, "loss": 0.4525, "step": 38 }, { "epoch": 1.248, "grad_norm": 1.0766618304625992, "learning_rate": 2.9113518769124836e-05, "loss": 0.4987, "step": 39 }, { "epoch": 1.28, "grad_norm": 0.6767070574855524, "learning_rate": 2.843330266900368e-05, "loss": 0.4475, "step": 40 }, { "epoch": 1.312, "grad_norm": 1.026763400096595, "learning_rate": 2.774100594404435e-05, "loss": 0.4667, "step": 41 }, { "epoch": 1.3439999999999999, "grad_norm": 0.9002290625499892, "learning_rate": 2.703762030263666e-05, "loss": 0.4916, "step": 42 }, { "epoch": 1.376, "grad_norm": 0.9903797867735974, "learning_rate": 2.632415333793648e-05, "loss": 0.4771, "step": 43 }, { "epoch": 1.408, "grad_norm": 0.7218070561744779, "learning_rate": 2.5601627084498146e-05, "loss": 0.407, "step": 44 }, { "epoch": 1.44, "grad_norm": 0.8313194407823631, "learning_rate": 2.4871076554219838e-05, "loss": 0.4442, "step": 45 }, { "epoch": 1.472, "grad_norm": 0.8236730874850681, "learning_rate": 2.413354825369906e-05, "loss": 0.5223, "step": 46 }, { "epoch": 1.504, "grad_norm": 0.6125019115754542, "learning_rate": 2.3390098685121938e-05, "loss": 0.42, "step": 47 }, { "epoch": 1.536, "grad_norm": 0.6737099841054438, "learning_rate": 2.264179283283405e-05, "loss": 0.4665, "step": 48 }, { "epoch": 1.568, "grad_norm": 0.5907056602966384, "learning_rate": 2.1889702637760627e-05, "loss": 0.4445, "step": 49 }, { "epoch": 1.6, "grad_norm": 0.6965345367451425, "learning_rate": 2.1134905461861486e-05, "loss": 0.5221, "step": 50 }, { "epoch": 1.6320000000000001, "grad_norm": 0.5006105897500711, "learning_rate": 2.0378482544820383e-05, "loss": 0.4218, "step": 51 }, { "epoch": 1.6640000000000001, "grad_norm": 0.6174830888739168, "learning_rate": 1.9621517455179627e-05, "loss": 0.476, "step": 52 }, { "epoch": 1.696, "grad_norm": 0.503990909520708, "learning_rate": 1.886509453813852e-05, "loss": 0.4275, "step": 53 }, { "epoch": 1.728, "grad_norm": 0.5974502021657286, "learning_rate": 1.8110297362239376e-05, "loss": 0.4757, "step": 54 }, { "epoch": 1.76, "grad_norm": 0.551329913445271, "learning_rate": 1.735820716716596e-05, "loss": 0.4757, "step": 55 }, { "epoch": 1.792, "grad_norm": 0.579064058765967, "learning_rate": 1.660990131487807e-05, "loss": 0.4182, "step": 56 }, { "epoch": 1.8239999999999998, "grad_norm": 0.548961385728647, "learning_rate": 1.586645174630094e-05, "loss": 0.4731, "step": 57 }, { "epoch": 1.8559999999999999, "grad_norm": 0.5764590922933827, "learning_rate": 1.5128923445780163e-05, "loss": 0.4271, "step": 58 }, { "epoch": 1.888, "grad_norm": 0.5781209646736115, "learning_rate": 1.4398372915501862e-05, "loss": 0.4644, "step": 59 }, { "epoch": 1.92, "grad_norm": 0.42862823912049036, "learning_rate": 1.3675846662063521e-05, "loss": 0.4071, "step": 60 }, { "epoch": 1.952, "grad_norm": 0.569159061133135, "learning_rate": 1.296237969736334e-05, "loss": 0.4561, "step": 61 }, { "epoch": 1.984, "grad_norm": 0.47530501617214926, "learning_rate": 1.2258994055955658e-05, "loss": 0.3817, "step": 62 }, { "epoch": 2.016, "grad_norm": 0.9251054706846084, "learning_rate": 1.156669733099632e-05, "loss": 0.7898, "step": 63 }, { "epoch": 2.048, "grad_norm": 0.5716376901076641, "learning_rate": 1.0886481230875172e-05, "loss": 0.3525, "step": 64 }, { "epoch": 2.08, "grad_norm": 0.5177936896448316, "learning_rate": 1.0219320158603337e-05, "loss": 0.3394, "step": 65 }, { "epoch": 2.112, "grad_norm": 0.5247806458447061, "learning_rate": 9.566169815990311e-06, "loss": 0.3834, "step": 66 }, { "epoch": 2.144, "grad_norm": 0.5455234154497576, "learning_rate": 8.92796583461031e-06, "loss": 0.3577, "step": 67 }, { "epoch": 2.176, "grad_norm": 0.5925096096878631, "learning_rate": 8.305622435519058e-06, "loss": 0.3831, "step": 68 }, { "epoch": 2.208, "grad_norm": 0.6820182428585542, "learning_rate": 7.70003111964093e-06, "loss": 0.376, "step": 69 }, { "epoch": 2.24, "grad_norm": 0.6320620904769954, "learning_rate": 7.112059390702459e-06, "loss": 0.3715, "step": 70 }, { "epoch": 2.2720000000000002, "grad_norm": 0.4928197929862798, "learning_rate": 6.542549512541623e-06, "loss": 0.3713, "step": 71 }, { "epoch": 2.304, "grad_norm": 0.4767133735569691, "learning_rate": 5.9923173025729895e-06, "loss": 0.3303, "step": 72 }, { "epoch": 2.336, "grad_norm": 0.7059312169326228, "learning_rate": 5.462150963137125e-06, "loss": 0.4568, "step": 73 }, { "epoch": 2.368, "grad_norm": 0.5003954153261982, "learning_rate": 4.952809952408375e-06, "loss": 0.3514, "step": 74 }, { "epoch": 2.4, "grad_norm": 0.5944579046423205, "learning_rate": 4.465023896478293e-06, "loss": 0.3627, "step": 75 }, { "epoch": 2.432, "grad_norm": 0.4180864471254852, "learning_rate": 3.999491544173311e-06, "loss": 0.3054, "step": 76 }, { "epoch": 2.464, "grad_norm": 0.44019542396333683, "learning_rate": 3.5568797661038004e-06, "loss": 0.375, "step": 77 }, { "epoch": 2.496, "grad_norm": 0.4294310051147678, "learning_rate": 3.137822599378315e-06, "loss": 0.3537, "step": 78 }, { "epoch": 2.528, "grad_norm": 0.3888176996168452, "learning_rate": 2.7429203393515426e-06, "loss": 0.378, "step": 79 }, { "epoch": 2.56, "grad_norm": 0.3576643131385393, "learning_rate": 2.372738679707023e-06, "loss": 0.3232, "step": 80 }, { "epoch": 2.592, "grad_norm": 0.3696160994931973, "learning_rate": 2.02780790210636e-06, "loss": 0.3542, "step": 81 }, { "epoch": 2.624, "grad_norm": 0.36882809964571234, "learning_rate": 1.7086221165658544e-06, "loss": 0.351, "step": 82 }, { "epoch": 2.656, "grad_norm": 0.34603236345776744, "learning_rate": 1.4156385536486973e-06, "loss": 0.3212, "step": 83 }, { "epoch": 2.6879999999999997, "grad_norm": 0.3805023899734686, "learning_rate": 1.1492769094865475e-06, "loss": 0.3744, "step": 84 }, { "epoch": 2.7199999999999998, "grad_norm": 0.3005431184449355, "learning_rate": 9.099187445688984e-07, "loss": 0.3071, "step": 85 }, { "epoch": 2.752, "grad_norm": 0.3625697661026582, "learning_rate": 6.979069371613345e-07, "loss": 0.3755, "step": 86 }, { "epoch": 2.784, "grad_norm": 0.30922554074419895, "learning_rate": 5.135451921357337e-07, "loss": 0.2993, "step": 87 }, { "epoch": 2.816, "grad_norm": 0.3209396414531254, "learning_rate": 3.570976059159481e-07, "loss": 0.3725, "step": 88 }, { "epoch": 2.848, "grad_norm": 0.3188890164441534, "learning_rate": 2.2878828816222942e-07, "loss": 0.3691, "step": 89 }, { "epoch": 2.88, "grad_norm": 0.30765746035254077, "learning_rate": 1.2880104073630163e-07, "loss": 0.3218, "step": 90 }, { "epoch": 2.912, "grad_norm": 0.3085708159717203, "learning_rate": 5.7279094406959e-08, "loss": 0.3625, "step": 91 }, { "epoch": 2.944, "grad_norm": 0.3092205424287526, "learning_rate": 1.4324903673370583e-08, "loss": 0.3543, "step": 92 }, { "epoch": 2.976, "grad_norm": 0.3121521435612877, "learning_rate": 0.0, "loss": 0.3917, "step": 93 }, { "epoch": 2.976, "step": 93, "total_flos": 1.818538711009198e+17, "train_loss": 0.5000655266546434, "train_runtime": 9080.5183, "train_samples_per_second": 0.99, "train_steps_per_second": 0.01 } ], "logging_steps": 1.0, "max_steps": 93, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.818538711009198e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }