| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.976, | |
| "eval_steps": 500, | |
| "global_step": 93, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 6.08677457937853, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 0.8709, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 5.862503603501722, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 0.8625, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 4.5723036455815205, | |
| "learning_rate": 1.2e-05, | |
| "loss": 0.839, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 2.0695238404961547, | |
| "learning_rate": 1.6000000000000003e-05, | |
| "loss": 0.7376, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 5.589012282844458, | |
| "learning_rate": 2e-05, | |
| "loss": 0.825, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 8.490554966738163, | |
| "learning_rate": 2.4e-05, | |
| "loss": 0.8478, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": 6.29894861103046, | |
| "learning_rate": 2.8e-05, | |
| "loss": 0.7658, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 3.558435531359445, | |
| "learning_rate": 3.2000000000000005e-05, | |
| "loss": 0.7555, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": 2.722863714076688, | |
| "learning_rate": 3.6e-05, | |
| "loss": 0.7274, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 2.088640250697761, | |
| "learning_rate": 4e-05, | |
| "loss": 0.6627, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": 1.5096876602344986, | |
| "learning_rate": 3.998567509632663e-05, | |
| "loss": 0.6817, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 1.6896637753034105, | |
| "learning_rate": 3.9942720905593045e-05, | |
| "loss": 0.6761, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": 1.5060644054718806, | |
| "learning_rate": 3.98711989592637e-05, | |
| "loss": 0.6519, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 1.4049366508424377, | |
| "learning_rate": 3.9771211711837774e-05, | |
| "loss": 0.6333, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 1.411091031460123, | |
| "learning_rate": 3.9642902394084056e-05, | |
| "loss": 0.5874, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 0.8883192699998052, | |
| "learning_rate": 3.948645480786427e-05, | |
| "loss": 0.6116, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "grad_norm": 1.129257283929009, | |
| "learning_rate": 3.930209306283867e-05, | |
| "loss": 0.5852, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": 0.901306643045694, | |
| "learning_rate": 3.909008125543111e-05, | |
| "loss": 0.5821, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.608, | |
| "grad_norm": 0.9465933700636773, | |
| "learning_rate": 3.885072309051346e-05, | |
| "loss": 0.5833, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.8046120689103757, | |
| "learning_rate": 3.858436144635131e-05, | |
| "loss": 0.5597, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.672, | |
| "grad_norm": 0.9228859026973337, | |
| "learning_rate": 3.829137788343415e-05, | |
| "loss": 0.5571, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 0.9202261747479373, | |
| "learning_rate": 3.797219209789365e-05, | |
| "loss": 0.6007, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": 0.9336252436543613, | |
| "learning_rate": 3.762726132029298e-05, | |
| "loss": 0.5618, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 1.2080051557421407, | |
| "learning_rate": 3.725707966064846e-05, | |
| "loss": 0.6057, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 1.1231601773572062, | |
| "learning_rate": 3.686217740062169e-05, | |
| "loss": 0.5603, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 0.8382981788712457, | |
| "learning_rate": 3.644312023389621e-05, | |
| "loss": 0.5491, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": 1.1862939971134692, | |
| "learning_rate": 3.600050845582669e-05, | |
| "loss": 0.5887, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 1.0957634618366314, | |
| "learning_rate": 3.5534976103521716e-05, | |
| "loss": 0.5958, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": 0.6590817382859444, | |
| "learning_rate": 3.504719004759163e-05, | |
| "loss": 0.5528, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.875749033008583, | |
| "learning_rate": 3.4537849036862874e-05, | |
| "loss": 0.565, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.992, | |
| "grad_norm": 0.783563683071678, | |
| "learning_rate": 3.400768269742702e-05, | |
| "loss": 0.5595, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 1.024, | |
| "grad_norm": 1.4075572002103194, | |
| "learning_rate": 3.345745048745838e-05, | |
| "loss": 0.919, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 1.056, | |
| "grad_norm": 0.7062499705952857, | |
| "learning_rate": 3.288794060929754e-05, | |
| "loss": 0.4404, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 1.088, | |
| "grad_norm": 0.9967541412280615, | |
| "learning_rate": 3.229996888035908e-05, | |
| "loss": 0.4984, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 0.8310704240859569, | |
| "learning_rate": 3.169437756448095e-05, | |
| "loss": 0.4807, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 1.152, | |
| "grad_norm": 0.7016314595292313, | |
| "learning_rate": 3.107203416538969e-05, | |
| "loss": 0.4703, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 1.184, | |
| "grad_norm": 1.4372935885534768, | |
| "learning_rate": 3.0433830184009694e-05, | |
| "loss": 0.4739, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 1.216, | |
| "grad_norm": 0.8650508524606009, | |
| "learning_rate": 2.9780679841396668e-05, | |
| "loss": 0.4525, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 1.248, | |
| "grad_norm": 1.0766618304625992, | |
| "learning_rate": 2.9113518769124836e-05, | |
| "loss": 0.4987, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 0.6767070574855524, | |
| "learning_rate": 2.843330266900368e-05, | |
| "loss": 0.4475, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 1.312, | |
| "grad_norm": 1.026763400096595, | |
| "learning_rate": 2.774100594404435e-05, | |
| "loss": 0.4667, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 1.3439999999999999, | |
| "grad_norm": 0.9002290625499892, | |
| "learning_rate": 2.703762030263666e-05, | |
| "loss": 0.4916, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 1.376, | |
| "grad_norm": 0.9903797867735974, | |
| "learning_rate": 2.632415333793648e-05, | |
| "loss": 0.4771, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 1.408, | |
| "grad_norm": 0.7218070561744779, | |
| "learning_rate": 2.5601627084498146e-05, | |
| "loss": 0.407, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 0.8313194407823631, | |
| "learning_rate": 2.4871076554219838e-05, | |
| "loss": 0.4442, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 1.472, | |
| "grad_norm": 0.8236730874850681, | |
| "learning_rate": 2.413354825369906e-05, | |
| "loss": 0.5223, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 1.504, | |
| "grad_norm": 0.6125019115754542, | |
| "learning_rate": 2.3390098685121938e-05, | |
| "loss": 0.42, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 1.536, | |
| "grad_norm": 0.6737099841054438, | |
| "learning_rate": 2.264179283283405e-05, | |
| "loss": 0.4665, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 1.568, | |
| "grad_norm": 0.5907056602966384, | |
| "learning_rate": 2.1889702637760627e-05, | |
| "loss": 0.4445, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 0.6965345367451425, | |
| "learning_rate": 2.1134905461861486e-05, | |
| "loss": 0.5221, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 1.6320000000000001, | |
| "grad_norm": 0.5006105897500711, | |
| "learning_rate": 2.0378482544820383e-05, | |
| "loss": 0.4218, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 1.6640000000000001, | |
| "grad_norm": 0.6174830888739168, | |
| "learning_rate": 1.9621517455179627e-05, | |
| "loss": 0.476, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 1.696, | |
| "grad_norm": 0.503990909520708, | |
| "learning_rate": 1.886509453813852e-05, | |
| "loss": 0.4275, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 1.728, | |
| "grad_norm": 0.5974502021657286, | |
| "learning_rate": 1.8110297362239376e-05, | |
| "loss": 0.4757, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 0.551329913445271, | |
| "learning_rate": 1.735820716716596e-05, | |
| "loss": 0.4757, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 1.792, | |
| "grad_norm": 0.579064058765967, | |
| "learning_rate": 1.660990131487807e-05, | |
| "loss": 0.4182, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 1.8239999999999998, | |
| "grad_norm": 0.548961385728647, | |
| "learning_rate": 1.586645174630094e-05, | |
| "loss": 0.4731, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 1.8559999999999999, | |
| "grad_norm": 0.5764590922933827, | |
| "learning_rate": 1.5128923445780163e-05, | |
| "loss": 0.4271, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 1.888, | |
| "grad_norm": 0.5781209646736115, | |
| "learning_rate": 1.4398372915501862e-05, | |
| "loss": 0.4644, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 0.42862823912049036, | |
| "learning_rate": 1.3675846662063521e-05, | |
| "loss": 0.4071, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 1.952, | |
| "grad_norm": 0.569159061133135, | |
| "learning_rate": 1.296237969736334e-05, | |
| "loss": 0.4561, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 1.984, | |
| "grad_norm": 0.47530501617214926, | |
| "learning_rate": 1.2258994055955658e-05, | |
| "loss": 0.3817, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 2.016, | |
| "grad_norm": 0.9251054706846084, | |
| "learning_rate": 1.156669733099632e-05, | |
| "loss": 0.7898, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 2.048, | |
| "grad_norm": 0.5716376901076641, | |
| "learning_rate": 1.0886481230875172e-05, | |
| "loss": 0.3525, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 0.5177936896448316, | |
| "learning_rate": 1.0219320158603337e-05, | |
| "loss": 0.3394, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 2.112, | |
| "grad_norm": 0.5247806458447061, | |
| "learning_rate": 9.566169815990311e-06, | |
| "loss": 0.3834, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 2.144, | |
| "grad_norm": 0.5455234154497576, | |
| "learning_rate": 8.92796583461031e-06, | |
| "loss": 0.3577, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 2.176, | |
| "grad_norm": 0.5925096096878631, | |
| "learning_rate": 8.305622435519058e-06, | |
| "loss": 0.3831, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 2.208, | |
| "grad_norm": 0.6820182428585542, | |
| "learning_rate": 7.70003111964093e-06, | |
| "loss": 0.376, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 0.6320620904769954, | |
| "learning_rate": 7.112059390702459e-06, | |
| "loss": 0.3715, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 2.2720000000000002, | |
| "grad_norm": 0.4928197929862798, | |
| "learning_rate": 6.542549512541623e-06, | |
| "loss": 0.3713, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 2.304, | |
| "grad_norm": 0.4767133735569691, | |
| "learning_rate": 5.9923173025729895e-06, | |
| "loss": 0.3303, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 2.336, | |
| "grad_norm": 0.7059312169326228, | |
| "learning_rate": 5.462150963137125e-06, | |
| "loss": 0.4568, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 2.368, | |
| "grad_norm": 0.5003954153261982, | |
| "learning_rate": 4.952809952408375e-06, | |
| "loss": 0.3514, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 0.5944579046423205, | |
| "learning_rate": 4.465023896478293e-06, | |
| "loss": 0.3627, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 2.432, | |
| "grad_norm": 0.4180864471254852, | |
| "learning_rate": 3.999491544173311e-06, | |
| "loss": 0.3054, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 2.464, | |
| "grad_norm": 0.44019542396333683, | |
| "learning_rate": 3.5568797661038004e-06, | |
| "loss": 0.375, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 2.496, | |
| "grad_norm": 0.4294310051147678, | |
| "learning_rate": 3.137822599378315e-06, | |
| "loss": 0.3537, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 2.528, | |
| "grad_norm": 0.3888176996168452, | |
| "learning_rate": 2.7429203393515426e-06, | |
| "loss": 0.378, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 0.3576643131385393, | |
| "learning_rate": 2.372738679707023e-06, | |
| "loss": 0.3232, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 2.592, | |
| "grad_norm": 0.3696160994931973, | |
| "learning_rate": 2.02780790210636e-06, | |
| "loss": 0.3542, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 2.624, | |
| "grad_norm": 0.36882809964571234, | |
| "learning_rate": 1.7086221165658544e-06, | |
| "loss": 0.351, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 2.656, | |
| "grad_norm": 0.34603236345776744, | |
| "learning_rate": 1.4156385536486973e-06, | |
| "loss": 0.3212, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 2.6879999999999997, | |
| "grad_norm": 0.3805023899734686, | |
| "learning_rate": 1.1492769094865475e-06, | |
| "loss": 0.3744, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "grad_norm": 0.3005431184449355, | |
| "learning_rate": 9.099187445688984e-07, | |
| "loss": 0.3071, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 2.752, | |
| "grad_norm": 0.3625697661026582, | |
| "learning_rate": 6.979069371613345e-07, | |
| "loss": 0.3755, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 2.784, | |
| "grad_norm": 0.30922554074419895, | |
| "learning_rate": 5.135451921357337e-07, | |
| "loss": 0.2993, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 2.816, | |
| "grad_norm": 0.3209396414531254, | |
| "learning_rate": 3.570976059159481e-07, | |
| "loss": 0.3725, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 2.848, | |
| "grad_norm": 0.3188890164441534, | |
| "learning_rate": 2.2878828816222942e-07, | |
| "loss": 0.3691, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 0.30765746035254077, | |
| "learning_rate": 1.2880104073630163e-07, | |
| "loss": 0.3218, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 2.912, | |
| "grad_norm": 0.3085708159717203, | |
| "learning_rate": 5.7279094406959e-08, | |
| "loss": 0.3625, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 2.944, | |
| "grad_norm": 0.3092205424287526, | |
| "learning_rate": 1.4324903673370583e-08, | |
| "loss": 0.3543, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 2.976, | |
| "grad_norm": 0.3121521435612877, | |
| "learning_rate": 0.0, | |
| "loss": 0.3917, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 2.976, | |
| "step": 93, | |
| "total_flos": 1.818538711009198e+17, | |
| "train_loss": 0.5000655266546434, | |
| "train_runtime": 9080.5183, | |
| "train_samples_per_second": 0.99, | |
| "train_steps_per_second": 0.01 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 93, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.818538711009198e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |