{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9985401459854014, "eval_steps": 500, "global_step": 228, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004379562043795621, "grad_norm": 34.64235305786133, "learning_rate": 5.0000000000000004e-08, "loss": 2.6583, "step": 1 }, { "epoch": 0.008759124087591242, "grad_norm": 33.89678192138672, "learning_rate": 1.0000000000000001e-07, "loss": 2.5074, "step": 2 }, { "epoch": 0.013138686131386862, "grad_norm": 35.2148551940918, "learning_rate": 1.5000000000000002e-07, "loss": 2.7094, "step": 3 }, { "epoch": 0.017518248175182483, "grad_norm": 35.11457061767578, "learning_rate": 2.0000000000000002e-07, "loss": 2.7266, "step": 4 }, { "epoch": 0.021897810218978103, "grad_norm": 35.70753479003906, "learning_rate": 2.5000000000000004e-07, "loss": 2.7442, "step": 5 }, { "epoch": 0.026277372262773723, "grad_norm": 34.34943771362305, "learning_rate": 3.0000000000000004e-07, "loss": 2.5578, "step": 6 }, { "epoch": 0.030656934306569343, "grad_norm": 34.31540298461914, "learning_rate": 3.5000000000000004e-07, "loss": 2.5893, "step": 7 }, { "epoch": 0.035036496350364967, "grad_norm": 32.545223236083984, "learning_rate": 4.0000000000000003e-07, "loss": 2.5039, "step": 8 }, { "epoch": 0.03941605839416058, "grad_norm": 35.70431137084961, "learning_rate": 4.5000000000000003e-07, "loss": 2.6719, "step": 9 }, { "epoch": 0.043795620437956206, "grad_norm": 34.14265441894531, "learning_rate": 5.000000000000001e-07, "loss": 2.5764, "step": 10 }, { "epoch": 0.04817518248175182, "grad_norm": 32.08097839355469, "learning_rate": 5.5e-07, "loss": 2.4564, "step": 11 }, { "epoch": 0.052554744525547446, "grad_norm": 32.66060256958008, "learning_rate": 6.000000000000001e-07, "loss": 2.458, "step": 12 }, { "epoch": 0.05693430656934306, "grad_norm": 33.21636962890625, "learning_rate": 6.5e-07, "loss": 2.4835, "step": 13 }, { "epoch": 0.061313868613138686, "grad_norm": 33.92257308959961, "learning_rate": 7.000000000000001e-07, "loss": 2.4288, "step": 14 }, { "epoch": 0.06569343065693431, "grad_norm": 32.19805145263672, "learning_rate": 7.5e-07, "loss": 2.2411, "step": 15 }, { "epoch": 0.07007299270072993, "grad_norm": 32.355220794677734, "learning_rate": 8.000000000000001e-07, "loss": 2.1597, "step": 16 }, { "epoch": 0.07445255474452554, "grad_norm": 33.08480453491211, "learning_rate": 8.500000000000001e-07, "loss": 2.1377, "step": 17 }, { "epoch": 0.07883211678832117, "grad_norm": 33.459957122802734, "learning_rate": 9.000000000000001e-07, "loss": 2.0306, "step": 18 }, { "epoch": 0.08321167883211679, "grad_norm": 32.897315979003906, "learning_rate": 9.500000000000001e-07, "loss": 1.8697, "step": 19 }, { "epoch": 0.08759124087591241, "grad_norm": 33.81785202026367, "learning_rate": 1.0000000000000002e-06, "loss": 1.8147, "step": 20 }, { "epoch": 0.09197080291970802, "grad_norm": 32.52595520019531, "learning_rate": 1.0500000000000001e-06, "loss": 1.6526, "step": 21 }, { "epoch": 0.09635036496350365, "grad_norm": 34.09442138671875, "learning_rate": 1.1e-06, "loss": 1.6127, "step": 22 }, { "epoch": 0.10072992700729927, "grad_norm": 30.89822769165039, "learning_rate": 1.1500000000000002e-06, "loss": 1.3872, "step": 23 }, { "epoch": 0.10510948905109489, "grad_norm": 29.566524505615234, "learning_rate": 1.2000000000000002e-06, "loss": 1.2755, "step": 24 }, { "epoch": 0.10948905109489052, "grad_norm": 28.26628875732422, "learning_rate": 1.25e-06, "loss": 1.1409, "step": 25 }, { "epoch": 0.11386861313868613, "grad_norm": 30.7103328704834, "learning_rate": 1.3e-06, "loss": 0.966, "step": 26 }, { "epoch": 0.11824817518248175, "grad_norm": 28.975385665893555, "learning_rate": 1.3500000000000002e-06, "loss": 0.7579, "step": 27 }, { "epoch": 0.12262773722627737, "grad_norm": 26.821529388427734, "learning_rate": 1.4000000000000001e-06, "loss": 0.6013, "step": 28 }, { "epoch": 0.12700729927007298, "grad_norm": 23.804439544677734, "learning_rate": 1.45e-06, "loss": 0.4978, "step": 29 }, { "epoch": 0.13138686131386862, "grad_norm": 21.404451370239258, "learning_rate": 1.5e-06, "loss": 0.3926, "step": 30 }, { "epoch": 0.13576642335766423, "grad_norm": 17.63161849975586, "learning_rate": 1.5500000000000002e-06, "loss": 0.2568, "step": 31 }, { "epoch": 0.14014598540145987, "grad_norm": 10.998854637145996, "learning_rate": 1.6000000000000001e-06, "loss": 0.2373, "step": 32 }, { "epoch": 0.14452554744525548, "grad_norm": 6.9544997215271, "learning_rate": 1.6500000000000003e-06, "loss": 0.1689, "step": 33 }, { "epoch": 0.14890510948905109, "grad_norm": 5.1013102531433105, "learning_rate": 1.7000000000000002e-06, "loss": 0.1471, "step": 34 }, { "epoch": 0.15328467153284672, "grad_norm": 4.501709461212158, "learning_rate": 1.75e-06, "loss": 0.132, "step": 35 }, { "epoch": 0.15766423357664233, "grad_norm": 3.198529005050659, "learning_rate": 1.8000000000000001e-06, "loss": 0.1065, "step": 36 }, { "epoch": 0.16204379562043797, "grad_norm": 3.2325005531311035, "learning_rate": 1.85e-06, "loss": 0.0907, "step": 37 }, { "epoch": 0.16642335766423358, "grad_norm": 1.5125375986099243, "learning_rate": 1.9000000000000002e-06, "loss": 0.0782, "step": 38 }, { "epoch": 0.1708029197080292, "grad_norm": 1.9160635471343994, "learning_rate": 1.9500000000000004e-06, "loss": 0.0852, "step": 39 }, { "epoch": 0.17518248175182483, "grad_norm": 1.6062333583831787, "learning_rate": 2.0000000000000003e-06, "loss": 0.074, "step": 40 }, { "epoch": 0.17956204379562044, "grad_norm": 1.5675855875015259, "learning_rate": 2.05e-06, "loss": 0.0704, "step": 41 }, { "epoch": 0.18394160583941604, "grad_norm": 1.440182089805603, "learning_rate": 2.1000000000000002e-06, "loss": 0.0674, "step": 42 }, { "epoch": 0.18832116788321168, "grad_norm": 1.1466726064682007, "learning_rate": 2.15e-06, "loss": 0.0702, "step": 43 }, { "epoch": 0.1927007299270073, "grad_norm": 1.2195515632629395, "learning_rate": 2.2e-06, "loss": 0.0723, "step": 44 }, { "epoch": 0.19708029197080293, "grad_norm": 1.743561029434204, "learning_rate": 2.25e-06, "loss": 0.0875, "step": 45 }, { "epoch": 0.20145985401459854, "grad_norm": 0.9764343500137329, "learning_rate": 2.3000000000000004e-06, "loss": 0.062, "step": 46 }, { "epoch": 0.20583941605839415, "grad_norm": 0.8891277313232422, "learning_rate": 2.35e-06, "loss": 0.0576, "step": 47 }, { "epoch": 0.21021897810218979, "grad_norm": 0.9648666977882385, "learning_rate": 2.4000000000000003e-06, "loss": 0.0656, "step": 48 }, { "epoch": 0.2145985401459854, "grad_norm": 0.784566342830658, "learning_rate": 2.4500000000000003e-06, "loss": 0.0548, "step": 49 }, { "epoch": 0.21897810218978103, "grad_norm": 0.9402966499328613, "learning_rate": 2.5e-06, "loss": 0.0626, "step": 50 }, { "epoch": 0.22335766423357664, "grad_norm": 1.3284685611724854, "learning_rate": 2.55e-06, "loss": 0.0632, "step": 51 }, { "epoch": 0.22773722627737225, "grad_norm": 1.0913968086242676, "learning_rate": 2.6e-06, "loss": 0.0675, "step": 52 }, { "epoch": 0.2321167883211679, "grad_norm": 1.1069140434265137, "learning_rate": 2.6500000000000005e-06, "loss": 0.0541, "step": 53 }, { "epoch": 0.2364963503649635, "grad_norm": 0.8529757857322693, "learning_rate": 2.7000000000000004e-06, "loss": 0.0657, "step": 54 }, { "epoch": 0.24087591240875914, "grad_norm": 0.7182446718215942, "learning_rate": 2.7500000000000004e-06, "loss": 0.0607, "step": 55 }, { "epoch": 0.24525547445255474, "grad_norm": 1.0538653135299683, "learning_rate": 2.8000000000000003e-06, "loss": 0.0556, "step": 56 }, { "epoch": 0.24963503649635035, "grad_norm": 1.2083594799041748, "learning_rate": 2.85e-06, "loss": 0.0532, "step": 57 }, { "epoch": 0.25401459854014596, "grad_norm": 0.8183572888374329, "learning_rate": 2.9e-06, "loss": 0.0529, "step": 58 }, { "epoch": 0.2583941605839416, "grad_norm": 0.9014842510223389, "learning_rate": 2.95e-06, "loss": 0.0601, "step": 59 }, { "epoch": 0.26277372262773724, "grad_norm": 0.9017247557640076, "learning_rate": 3e-06, "loss": 0.0584, "step": 60 }, { "epoch": 0.2671532846715328, "grad_norm": 1.1078683137893677, "learning_rate": 3.05e-06, "loss": 0.0635, "step": 61 }, { "epoch": 0.27153284671532846, "grad_norm": 1.174526572227478, "learning_rate": 3.1000000000000004e-06, "loss": 0.0523, "step": 62 }, { "epoch": 0.2759124087591241, "grad_norm": 0.9296770095825195, "learning_rate": 3.1500000000000003e-06, "loss": 0.0588, "step": 63 }, { "epoch": 0.28029197080291973, "grad_norm": 0.8549372553825378, "learning_rate": 3.2000000000000003e-06, "loss": 0.0639, "step": 64 }, { "epoch": 0.2846715328467153, "grad_norm": 0.8956279158592224, "learning_rate": 3.2500000000000002e-06, "loss": 0.059, "step": 65 }, { "epoch": 0.28905109489051095, "grad_norm": 0.7937710285186768, "learning_rate": 3.3000000000000006e-06, "loss": 0.0579, "step": 66 }, { "epoch": 0.2934306569343066, "grad_norm": 0.7786620855331421, "learning_rate": 3.3500000000000005e-06, "loss": 0.0586, "step": 67 }, { "epoch": 0.29781021897810217, "grad_norm": 0.7562637329101562, "learning_rate": 3.4000000000000005e-06, "loss": 0.046, "step": 68 }, { "epoch": 0.3021897810218978, "grad_norm": 0.8958250880241394, "learning_rate": 3.45e-06, "loss": 0.0566, "step": 69 }, { "epoch": 0.30656934306569344, "grad_norm": 0.9434528946876526, "learning_rate": 3.5e-06, "loss": 0.0548, "step": 70 }, { "epoch": 0.310948905109489, "grad_norm": 1.0564453601837158, "learning_rate": 3.5500000000000003e-06, "loss": 0.0529, "step": 71 }, { "epoch": 0.31532846715328466, "grad_norm": 0.896443247795105, "learning_rate": 3.6000000000000003e-06, "loss": 0.0517, "step": 72 }, { "epoch": 0.3197080291970803, "grad_norm": 1.1364223957061768, "learning_rate": 3.65e-06, "loss": 0.0489, "step": 73 }, { "epoch": 0.32408759124087594, "grad_norm": 1.1319010257720947, "learning_rate": 3.7e-06, "loss": 0.0548, "step": 74 }, { "epoch": 0.3284671532846715, "grad_norm": 0.9694503545761108, "learning_rate": 3.7500000000000005e-06, "loss": 0.0525, "step": 75 }, { "epoch": 0.33284671532846716, "grad_norm": 0.8128111958503723, "learning_rate": 3.8000000000000005e-06, "loss": 0.0566, "step": 76 }, { "epoch": 0.3372262773722628, "grad_norm": 0.9068273901939392, "learning_rate": 3.85e-06, "loss": 0.0475, "step": 77 }, { "epoch": 0.3416058394160584, "grad_norm": 0.9689438343048096, "learning_rate": 3.900000000000001e-06, "loss": 0.048, "step": 78 }, { "epoch": 0.345985401459854, "grad_norm": 0.940131664276123, "learning_rate": 3.95e-06, "loss": 0.0567, "step": 79 }, { "epoch": 0.35036496350364965, "grad_norm": 0.8836082220077515, "learning_rate": 4.000000000000001e-06, "loss": 0.0542, "step": 80 }, { "epoch": 0.35474452554744523, "grad_norm": 0.9325949549674988, "learning_rate": 4.05e-06, "loss": 0.0551, "step": 81 }, { "epoch": 0.35912408759124087, "grad_norm": 0.8954764008522034, "learning_rate": 4.1e-06, "loss": 0.0517, "step": 82 }, { "epoch": 0.3635036496350365, "grad_norm": 0.6444959044456482, "learning_rate": 4.15e-06, "loss": 0.0434, "step": 83 }, { "epoch": 0.3678832116788321, "grad_norm": 0.9097581505775452, "learning_rate": 4.2000000000000004e-06, "loss": 0.0471, "step": 84 }, { "epoch": 0.3722627737226277, "grad_norm": 0.849006712436676, "learning_rate": 4.25e-06, "loss": 0.0529, "step": 85 }, { "epoch": 0.37664233576642336, "grad_norm": 0.8611392378807068, "learning_rate": 4.3e-06, "loss": 0.0513, "step": 86 }, { "epoch": 0.381021897810219, "grad_norm": 0.7885357737541199, "learning_rate": 4.350000000000001e-06, "loss": 0.0523, "step": 87 }, { "epoch": 0.3854014598540146, "grad_norm": 0.7642116546630859, "learning_rate": 4.4e-06, "loss": 0.0407, "step": 88 }, { "epoch": 0.3897810218978102, "grad_norm": 0.8920945525169373, "learning_rate": 4.450000000000001e-06, "loss": 0.0485, "step": 89 }, { "epoch": 0.39416058394160586, "grad_norm": 0.9801046848297119, "learning_rate": 4.5e-06, "loss": 0.0404, "step": 90 }, { "epoch": 0.39854014598540144, "grad_norm": 1.0874953269958496, "learning_rate": 4.5500000000000005e-06, "loss": 0.0588, "step": 91 }, { "epoch": 0.4029197080291971, "grad_norm": 0.9019029140472412, "learning_rate": 4.600000000000001e-06, "loss": 0.0466, "step": 92 }, { "epoch": 0.4072992700729927, "grad_norm": 0.7258988618850708, "learning_rate": 4.65e-06, "loss": 0.0493, "step": 93 }, { "epoch": 0.4116788321167883, "grad_norm": 1.103407859802246, "learning_rate": 4.7e-06, "loss": 0.0495, "step": 94 }, { "epoch": 0.41605839416058393, "grad_norm": 0.751805305480957, "learning_rate": 4.75e-06, "loss": 0.0484, "step": 95 }, { "epoch": 0.42043795620437957, "grad_norm": 0.7717764973640442, "learning_rate": 4.800000000000001e-06, "loss": 0.0447, "step": 96 }, { "epoch": 0.4248175182481752, "grad_norm": 0.7147190570831299, "learning_rate": 4.85e-06, "loss": 0.0523, "step": 97 }, { "epoch": 0.4291970802919708, "grad_norm": 0.9990110993385315, "learning_rate": 4.9000000000000005e-06, "loss": 0.0454, "step": 98 }, { "epoch": 0.4335766423357664, "grad_norm": 0.7766187191009521, "learning_rate": 4.95e-06, "loss": 0.0472, "step": 99 }, { "epoch": 0.43795620437956206, "grad_norm": 0.7124347686767578, "learning_rate": 5e-06, "loss": 0.0473, "step": 100 }, { "epoch": 0.44233576642335765, "grad_norm": 0.9340270757675171, "learning_rate": 4.99999232689698e-06, "loss": 0.0499, "step": 101 }, { "epoch": 0.4467153284671533, "grad_norm": 0.7429985404014587, "learning_rate": 4.999969307635021e-06, "loss": 0.042, "step": 102 }, { "epoch": 0.4510948905109489, "grad_norm": 0.9131317138671875, "learning_rate": 4.999930942355425e-06, "loss": 0.0519, "step": 103 }, { "epoch": 0.4554744525547445, "grad_norm": 0.9970843195915222, "learning_rate": 4.999877231293698e-06, "loss": 0.0428, "step": 104 }, { "epoch": 0.45985401459854014, "grad_norm": 0.7625145316123962, "learning_rate": 4.999808174779543e-06, "loss": 0.0442, "step": 105 }, { "epoch": 0.4642335766423358, "grad_norm": 0.6059474945068359, "learning_rate": 4.999723773236865e-06, "loss": 0.0456, "step": 106 }, { "epoch": 0.4686131386861314, "grad_norm": 0.6798833608627319, "learning_rate": 4.999624027183758e-06, "loss": 0.0408, "step": 107 }, { "epoch": 0.472992700729927, "grad_norm": 1.0250803232192993, "learning_rate": 4.999508937232514e-06, "loss": 0.0471, "step": 108 }, { "epoch": 0.47737226277372263, "grad_norm": 0.8457198739051819, "learning_rate": 4.999378504089609e-06, "loss": 0.0425, "step": 109 }, { "epoch": 0.48175182481751827, "grad_norm": 0.9417868852615356, "learning_rate": 4.999232728555705e-06, "loss": 0.0388, "step": 110 }, { "epoch": 0.48613138686131385, "grad_norm": 0.8558921813964844, "learning_rate": 4.999071611525643e-06, "loss": 0.0423, "step": 111 }, { "epoch": 0.4905109489051095, "grad_norm": 0.7070104479789734, "learning_rate": 4.998895153988437e-06, "loss": 0.0354, "step": 112 }, { "epoch": 0.4948905109489051, "grad_norm": 0.8162719011306763, "learning_rate": 4.998703357027268e-06, "loss": 0.0465, "step": 113 }, { "epoch": 0.4992700729927007, "grad_norm": 0.9140358567237854, "learning_rate": 4.998496221819479e-06, "loss": 0.0457, "step": 114 }, { "epoch": 0.5036496350364964, "grad_norm": 0.6447531580924988, "learning_rate": 4.998273749636564e-06, "loss": 0.039, "step": 115 }, { "epoch": 0.5080291970802919, "grad_norm": 0.9157156944274902, "learning_rate": 4.998035941844167e-06, "loss": 0.0469, "step": 116 }, { "epoch": 0.5124087591240876, "grad_norm": 0.7706230878829956, "learning_rate": 4.997782799902065e-06, "loss": 0.0325, "step": 117 }, { "epoch": 0.5167883211678832, "grad_norm": 0.9391443729400635, "learning_rate": 4.997514325364168e-06, "loss": 0.0397, "step": 118 }, { "epoch": 0.5211678832116788, "grad_norm": 1.0085054636001587, "learning_rate": 4.997230519878499e-06, "loss": 0.0403, "step": 119 }, { "epoch": 0.5255474452554745, "grad_norm": 1.8318824768066406, "learning_rate": 4.996931385187195e-06, "loss": 0.0463, "step": 120 }, { "epoch": 0.5299270072992701, "grad_norm": 1.0216630697250366, "learning_rate": 4.9966169231264885e-06, "loss": 0.0406, "step": 121 }, { "epoch": 0.5343065693430656, "grad_norm": 1.4819082021713257, "learning_rate": 4.9962871356267e-06, "loss": 0.0485, "step": 122 }, { "epoch": 0.5386861313868613, "grad_norm": 0.9435060024261475, "learning_rate": 4.995942024712222e-06, "loss": 0.04, "step": 123 }, { "epoch": 0.5430656934306569, "grad_norm": 0.7887905240058899, "learning_rate": 4.995581592501514e-06, "loss": 0.0397, "step": 124 }, { "epoch": 0.5474452554744526, "grad_norm": 0.8321148753166199, "learning_rate": 4.995205841207082e-06, "loss": 0.0413, "step": 125 }, { "epoch": 0.5518248175182482, "grad_norm": 1.0303553342819214, "learning_rate": 4.99481477313547e-06, "loss": 0.0422, "step": 126 }, { "epoch": 0.5562043795620438, "grad_norm": 0.7056427001953125, "learning_rate": 4.994408390687241e-06, "loss": 0.0362, "step": 127 }, { "epoch": 0.5605839416058395, "grad_norm": 0.9762740135192871, "learning_rate": 4.993986696356966e-06, "loss": 0.0385, "step": 128 }, { "epoch": 0.564963503649635, "grad_norm": 0.9447624683380127, "learning_rate": 4.9935496927332095e-06, "loss": 0.0402, "step": 129 }, { "epoch": 0.5693430656934306, "grad_norm": 0.6106760501861572, "learning_rate": 4.993097382498511e-06, "loss": 0.0319, "step": 130 }, { "epoch": 0.5737226277372263, "grad_norm": 1.0554594993591309, "learning_rate": 4.992629768429367e-06, "loss": 0.0437, "step": 131 }, { "epoch": 0.5781021897810219, "grad_norm": 1.066218376159668, "learning_rate": 4.992146853396219e-06, "loss": 0.0382, "step": 132 }, { "epoch": 0.5824817518248175, "grad_norm": 0.7517623901367188, "learning_rate": 4.991648640363434e-06, "loss": 0.0317, "step": 133 }, { "epoch": 0.5868613138686132, "grad_norm": 0.8136976957321167, "learning_rate": 4.991135132389282e-06, "loss": 0.0339, "step": 134 }, { "epoch": 0.5912408759124088, "grad_norm": 0.9254240989685059, "learning_rate": 4.990606332625923e-06, "loss": 0.0413, "step": 135 }, { "epoch": 0.5956204379562043, "grad_norm": 0.6778447031974792, "learning_rate": 4.990062244319387e-06, "loss": 0.0377, "step": 136 }, { "epoch": 0.6, "grad_norm": 1.1036059856414795, "learning_rate": 4.989502870809547e-06, "loss": 0.0376, "step": 137 }, { "epoch": 0.6043795620437956, "grad_norm": 0.8054158091545105, "learning_rate": 4.988928215530111e-06, "loss": 0.0367, "step": 138 }, { "epoch": 0.6087591240875913, "grad_norm": 0.9227175116539001, "learning_rate": 4.988338282008588e-06, "loss": 0.0374, "step": 139 }, { "epoch": 0.6131386861313869, "grad_norm": 0.8502228260040283, "learning_rate": 4.9877330738662755e-06, "loss": 0.0384, "step": 140 }, { "epoch": 0.6175182481751825, "grad_norm": 0.684752881526947, "learning_rate": 4.987112594818232e-06, "loss": 0.0366, "step": 141 }, { "epoch": 0.621897810218978, "grad_norm": 0.7456391453742981, "learning_rate": 4.9864768486732585e-06, "loss": 0.037, "step": 142 }, { "epoch": 0.6262773722627737, "grad_norm": 0.6797431111335754, "learning_rate": 4.985825839333872e-06, "loss": 0.0325, "step": 143 }, { "epoch": 0.6306569343065693, "grad_norm": 0.8098205924034119, "learning_rate": 4.985159570796279e-06, "loss": 0.0343, "step": 144 }, { "epoch": 0.635036496350365, "grad_norm": 0.8089592456817627, "learning_rate": 4.984478047150361e-06, "loss": 0.026, "step": 145 }, { "epoch": 0.6394160583941606, "grad_norm": 0.9282512664794922, "learning_rate": 4.983781272579637e-06, "loss": 0.0334, "step": 146 }, { "epoch": 0.6437956204379562, "grad_norm": 0.802608072757721, "learning_rate": 4.9830692513612445e-06, "loss": 0.0259, "step": 147 }, { "epoch": 0.6481751824817519, "grad_norm": 1.3046361207962036, "learning_rate": 4.982341987865914e-06, "loss": 0.045, "step": 148 }, { "epoch": 0.6525547445255474, "grad_norm": 1.0812411308288574, "learning_rate": 4.9815994865579405e-06, "loss": 0.0329, "step": 149 }, { "epoch": 0.656934306569343, "grad_norm": 0.7856137156486511, "learning_rate": 4.980841751995155e-06, "loss": 0.0341, "step": 150 }, { "epoch": 0.6613138686131387, "grad_norm": 1.0517083406448364, "learning_rate": 4.980068788828897e-06, "loss": 0.0299, "step": 151 }, { "epoch": 0.6656934306569343, "grad_norm": 0.6148231029510498, "learning_rate": 4.979280601803988e-06, "loss": 0.0304, "step": 152 }, { "epoch": 0.67007299270073, "grad_norm": 0.7572031021118164, "learning_rate": 4.9784771957586995e-06, "loss": 0.0309, "step": 153 }, { "epoch": 0.6744525547445256, "grad_norm": 2.0948777198791504, "learning_rate": 4.977658575624727e-06, "loss": 0.0307, "step": 154 }, { "epoch": 0.6788321167883211, "grad_norm": 0.624940037727356, "learning_rate": 4.976824746427153e-06, "loss": 0.03, "step": 155 }, { "epoch": 0.6832116788321168, "grad_norm": 0.8346346616744995, "learning_rate": 4.975975713284426e-06, "loss": 0.036, "step": 156 }, { "epoch": 0.6875912408759124, "grad_norm": 0.742098867893219, "learning_rate": 4.975111481408319e-06, "loss": 0.0325, "step": 157 }, { "epoch": 0.691970802919708, "grad_norm": 0.8000304102897644, "learning_rate": 4.9742320561039055e-06, "loss": 0.0332, "step": 158 }, { "epoch": 0.6963503649635037, "grad_norm": 1.063854694366455, "learning_rate": 4.973337442769523e-06, "loss": 0.0366, "step": 159 }, { "epoch": 0.7007299270072993, "grad_norm": 0.965560257434845, "learning_rate": 4.972427646896738e-06, "loss": 0.0331, "step": 160 }, { "epoch": 0.7051094890510949, "grad_norm": 1.5070244073867798, "learning_rate": 4.971502674070317e-06, "loss": 0.0446, "step": 161 }, { "epoch": 0.7094890510948905, "grad_norm": 0.8810545206069946, "learning_rate": 4.970562529968189e-06, "loss": 0.0299, "step": 162 }, { "epoch": 0.7138686131386861, "grad_norm": 0.7683446407318115, "learning_rate": 4.969607220361414e-06, "loss": 0.0244, "step": 163 }, { "epoch": 0.7182481751824817, "grad_norm": 0.7444891929626465, "learning_rate": 4.968636751114141e-06, "loss": 0.0338, "step": 164 }, { "epoch": 0.7226277372262774, "grad_norm": 0.7077688574790955, "learning_rate": 4.96765112818358e-06, "loss": 0.0285, "step": 165 }, { "epoch": 0.727007299270073, "grad_norm": 0.5648500919342041, "learning_rate": 4.9666503576199574e-06, "loss": 0.026, "step": 166 }, { "epoch": 0.7313868613138687, "grad_norm": 0.763556718826294, "learning_rate": 4.965634445566489e-06, "loss": 0.0299, "step": 167 }, { "epoch": 0.7357664233576642, "grad_norm": 0.6892725825309753, "learning_rate": 4.9646033982593315e-06, "loss": 0.023, "step": 168 }, { "epoch": 0.7401459854014598, "grad_norm": 1.0332573652267456, "learning_rate": 4.963557222027551e-06, "loss": 0.0313, "step": 169 }, { "epoch": 0.7445255474452555, "grad_norm": 1.214428424835205, "learning_rate": 4.962495923293081e-06, "loss": 0.027, "step": 170 }, { "epoch": 0.7489051094890511, "grad_norm": 0.9823130965232849, "learning_rate": 4.961419508570686e-06, "loss": 0.0231, "step": 171 }, { "epoch": 0.7532846715328467, "grad_norm": 1.2535115480422974, "learning_rate": 4.960327984467919e-06, "loss": 0.0326, "step": 172 }, { "epoch": 0.7576642335766424, "grad_norm": 0.9383441209793091, "learning_rate": 4.959221357685081e-06, "loss": 0.0286, "step": 173 }, { "epoch": 0.762043795620438, "grad_norm": 1.0426976680755615, "learning_rate": 4.958099635015182e-06, "loss": 0.0298, "step": 174 }, { "epoch": 0.7664233576642335, "grad_norm": 0.9159742593765259, "learning_rate": 4.956962823343895e-06, "loss": 0.025, "step": 175 }, { "epoch": 0.7708029197080292, "grad_norm": 0.8746912479400635, "learning_rate": 4.95581092964952e-06, "loss": 0.0299, "step": 176 }, { "epoch": 0.7751824817518248, "grad_norm": 0.9875199198722839, "learning_rate": 4.954643961002936e-06, "loss": 0.0309, "step": 177 }, { "epoch": 0.7795620437956204, "grad_norm": 0.7389516234397888, "learning_rate": 4.953461924567559e-06, "loss": 0.0291, "step": 178 }, { "epoch": 0.7839416058394161, "grad_norm": 0.790238082408905, "learning_rate": 4.952264827599299e-06, "loss": 0.0236, "step": 179 }, { "epoch": 0.7883211678832117, "grad_norm": 0.6766819953918457, "learning_rate": 4.951052677446515e-06, "loss": 0.0238, "step": 180 }, { "epoch": 0.7927007299270074, "grad_norm": 0.8832846283912659, "learning_rate": 4.94982548154997e-06, "loss": 0.0259, "step": 181 }, { "epoch": 0.7970802919708029, "grad_norm": 0.7298055291175842, "learning_rate": 4.948583247442783e-06, "loss": 0.023, "step": 182 }, { "epoch": 0.8014598540145985, "grad_norm": 0.911920428276062, "learning_rate": 4.947325982750387e-06, "loss": 0.0272, "step": 183 }, { "epoch": 0.8058394160583942, "grad_norm": 0.9145316481590271, "learning_rate": 4.946053695190479e-06, "loss": 0.0248, "step": 184 }, { "epoch": 0.8102189781021898, "grad_norm": 0.8759565353393555, "learning_rate": 4.9447663925729735e-06, "loss": 0.0263, "step": 185 }, { "epoch": 0.8145985401459854, "grad_norm": 1.1927592754364014, "learning_rate": 4.943464082799956e-06, "loss": 0.0305, "step": 186 }, { "epoch": 0.8189781021897811, "grad_norm": 0.752566933631897, "learning_rate": 4.942146773865631e-06, "loss": 0.0247, "step": 187 }, { "epoch": 0.8233576642335766, "grad_norm": 1.1121447086334229, "learning_rate": 4.940814473856278e-06, "loss": 0.0293, "step": 188 }, { "epoch": 0.8277372262773722, "grad_norm": 1.0319955348968506, "learning_rate": 4.939467190950195e-06, "loss": 0.0247, "step": 189 }, { "epoch": 0.8321167883211679, "grad_norm": 0.7960589528083801, "learning_rate": 4.938104933417655e-06, "loss": 0.0232, "step": 190 }, { "epoch": 0.8364963503649635, "grad_norm": 0.593197226524353, "learning_rate": 4.936727709620853e-06, "loss": 0.0232, "step": 191 }, { "epoch": 0.8408759124087591, "grad_norm": 0.6710584759712219, "learning_rate": 4.9353355280138525e-06, "loss": 0.0278, "step": 192 }, { "epoch": 0.8452554744525548, "grad_norm": 0.7627159357070923, "learning_rate": 4.933928397142535e-06, "loss": 0.0291, "step": 193 }, { "epoch": 0.8496350364963504, "grad_norm": 0.4998359680175781, "learning_rate": 4.93250632564455e-06, "loss": 0.018, "step": 194 }, { "epoch": 0.8540145985401459, "grad_norm": 0.8028760552406311, "learning_rate": 4.931069322249258e-06, "loss": 0.0193, "step": 195 }, { "epoch": 0.8583941605839416, "grad_norm": 0.6061640977859497, "learning_rate": 4.929617395777678e-06, "loss": 0.0142, "step": 196 }, { "epoch": 0.8627737226277372, "grad_norm": 0.5901748538017273, "learning_rate": 4.928150555142436e-06, "loss": 0.0177, "step": 197 }, { "epoch": 0.8671532846715329, "grad_norm": 0.7800254225730896, "learning_rate": 4.926668809347707e-06, "loss": 0.0264, "step": 198 }, { "epoch": 0.8715328467153285, "grad_norm": 0.9308339357376099, "learning_rate": 4.925172167489162e-06, "loss": 0.0247, "step": 199 }, { "epoch": 0.8759124087591241, "grad_norm": 0.9651213884353638, "learning_rate": 4.923660638753911e-06, "loss": 0.0216, "step": 200 }, { "epoch": 0.8802919708029197, "grad_norm": 1.1258251667022705, "learning_rate": 4.9221342324204455e-06, "loss": 0.0249, "step": 201 }, { "epoch": 0.8846715328467153, "grad_norm": 1.0175387859344482, "learning_rate": 4.9205929578585845e-06, "loss": 0.0201, "step": 202 }, { "epoch": 0.8890510948905109, "grad_norm": 1.5190610885620117, "learning_rate": 4.9190368245294155e-06, "loss": 0.0319, "step": 203 }, { "epoch": 0.8934306569343066, "grad_norm": 0.9947767853736877, "learning_rate": 4.917465841985234e-06, "loss": 0.0228, "step": 204 }, { "epoch": 0.8978102189781022, "grad_norm": 0.6416967511177063, "learning_rate": 4.91588001986949e-06, "loss": 0.0198, "step": 205 }, { "epoch": 0.9021897810218978, "grad_norm": 0.6980161666870117, "learning_rate": 4.914279367916724e-06, "loss": 0.0172, "step": 206 }, { "epoch": 0.9065693430656935, "grad_norm": 0.5301483869552612, "learning_rate": 4.912663895952511e-06, "loss": 0.0208, "step": 207 }, { "epoch": 0.910948905109489, "grad_norm": 0.6047857999801636, "learning_rate": 4.911033613893397e-06, "loss": 0.0227, "step": 208 }, { "epoch": 0.9153284671532846, "grad_norm": 0.6069537997245789, "learning_rate": 4.909388531746837e-06, "loss": 0.0195, "step": 209 }, { "epoch": 0.9197080291970803, "grad_norm": 0.6859843730926514, "learning_rate": 4.907728659611143e-06, "loss": 0.0244, "step": 210 }, { "epoch": 0.9240875912408759, "grad_norm": 0.6074005365371704, "learning_rate": 4.906054007675408e-06, "loss": 0.0195, "step": 211 }, { "epoch": 0.9284671532846716, "grad_norm": 1.1983692646026611, "learning_rate": 4.9043645862194545e-06, "loss": 0.023, "step": 212 }, { "epoch": 0.9328467153284672, "grad_norm": 0.8806214928627014, "learning_rate": 4.902660405613767e-06, "loss": 0.0243, "step": 213 }, { "epoch": 0.9372262773722628, "grad_norm": 0.6523962616920471, "learning_rate": 4.900941476319426e-06, "loss": 0.016, "step": 214 }, { "epoch": 0.9416058394160584, "grad_norm": 0.5673899054527283, "learning_rate": 4.899207808888051e-06, "loss": 0.0158, "step": 215 }, { "epoch": 0.945985401459854, "grad_norm": 0.9643133282661438, "learning_rate": 4.897459413961729e-06, "loss": 0.0194, "step": 216 }, { "epoch": 0.9503649635036496, "grad_norm": 0.6007612347602844, "learning_rate": 4.8956963022729495e-06, "loss": 0.0187, "step": 217 }, { "epoch": 0.9547445255474453, "grad_norm": 0.968173623085022, "learning_rate": 4.893918484644545e-06, "loss": 0.0223, "step": 218 }, { "epoch": 0.9591240875912409, "grad_norm": 0.6649457216262817, "learning_rate": 4.892125971989616e-06, "loss": 0.0205, "step": 219 }, { "epoch": 0.9635036496350365, "grad_norm": 0.48259082436561584, "learning_rate": 4.890318775311471e-06, "loss": 0.0121, "step": 220 }, { "epoch": 0.9678832116788321, "grad_norm": 0.8284991383552551, "learning_rate": 4.888496905703554e-06, "loss": 0.0176, "step": 221 }, { "epoch": 0.9722627737226277, "grad_norm": 0.5141683220863342, "learning_rate": 4.8866603743493805e-06, "loss": 0.0154, "step": 222 }, { "epoch": 0.9766423357664233, "grad_norm": 1.0223891735076904, "learning_rate": 4.884809192522466e-06, "loss": 0.0111, "step": 223 }, { "epoch": 0.981021897810219, "grad_norm": 0.844782292842865, "learning_rate": 4.882943371586256e-06, "loss": 0.016, "step": 224 }, { "epoch": 0.9854014598540146, "grad_norm": 0.6978311538696289, "learning_rate": 4.881062922994061e-06, "loss": 0.0129, "step": 225 }, { "epoch": 0.9897810218978103, "grad_norm": 0.8764100074768066, "learning_rate": 4.879167858288982e-06, "loss": 0.0213, "step": 226 }, { "epoch": 0.9941605839416059, "grad_norm": 1.0449023246765137, "learning_rate": 4.877258189103839e-06, "loss": 0.015, "step": 227 }, { "epoch": 0.9985401459854014, "grad_norm": 0.7534664869308472, "learning_rate": 4.875333927161104e-06, "loss": 0.0144, "step": 228 } ], "logging_steps": 1, "max_steps": 1368, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 228, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.7593636686135296e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }