{ "best_metric": 0.7210479974746704, "best_model_checkpoint": "sean_test_out/checkpoint-81250", "epoch": 1.0, "eval_steps": 500, "global_step": 81250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 143092.3125, "learning_rate": 5.9630769230769234e-05, "loss": 1.284, "step": 500 }, { "epoch": 0.01, "grad_norm": 220955.625, "learning_rate": 5.926153846153846e-05, "loss": 1.0569, "step": 1000 }, { "epoch": 0.02, "grad_norm": 195870.921875, "learning_rate": 5.88923076923077e-05, "loss": 1.02, "step": 1500 }, { "epoch": 0.02, "grad_norm": 375734.96875, "learning_rate": 5.8523076923076926e-05, "loss": 0.9793, "step": 2000 }, { "epoch": 0.03, "grad_norm": 294788.75, "learning_rate": 5.815384615384616e-05, "loss": 0.9703, "step": 2500 }, { "epoch": 0.04, "grad_norm": 185062.484375, "learning_rate": 5.7784615384615384e-05, "loss": 0.9628, "step": 3000 }, { "epoch": 0.04, "grad_norm": 91022.734375, "learning_rate": 5.741538461538462e-05, "loss": 0.9524, "step": 3500 }, { "epoch": 0.05, "grad_norm": 262511.375, "learning_rate": 5.704615384615385e-05, "loss": 0.9158, "step": 4000 }, { "epoch": 0.06, "grad_norm": 205652.171875, "learning_rate": 5.667692307692308e-05, "loss": 0.9065, "step": 4500 }, { "epoch": 0.06, "grad_norm": 133203.75, "learning_rate": 5.630769230769231e-05, "loss": 0.9291, "step": 5000 }, { "epoch": 0.07, "grad_norm": 287849.9375, "learning_rate": 5.593846153846154e-05, "loss": 0.8981, "step": 5500 }, { "epoch": 0.07, "grad_norm": 295450.40625, "learning_rate": 5.556923076923077e-05, "loss": 0.8868, "step": 6000 }, { "epoch": 0.08, "grad_norm": 229412.40625, "learning_rate": 5.520000000000001e-05, "loss": 0.9001, "step": 6500 }, { "epoch": 0.09, "grad_norm": 362417.71875, "learning_rate": 5.483076923076923e-05, "loss": 0.8748, "step": 7000 }, { "epoch": 0.09, "grad_norm": 232139.9375, "learning_rate": 5.4461538461538466e-05, "loss": 0.8565, "step": 7500 }, { "epoch": 0.1, "grad_norm": 439068.125, "learning_rate": 5.409230769230769e-05, "loss": 0.8653, "step": 8000 }, { "epoch": 0.1, "grad_norm": 176573.96875, "learning_rate": 5.3723076923076924e-05, "loss": 0.8592, "step": 8500 }, { "epoch": 0.11, "grad_norm": 122979.9140625, "learning_rate": 5.335384615384616e-05, "loss": 0.8881, "step": 9000 }, { "epoch": 0.12, "grad_norm": 163604.125, "learning_rate": 5.298461538461539e-05, "loss": 0.8512, "step": 9500 }, { "epoch": 0.12, "grad_norm": 265370.75, "learning_rate": 5.2615384615384616e-05, "loss": 0.8654, "step": 10000 }, { "epoch": 0.13, "grad_norm": 308622.75, "learning_rate": 5.224615384615385e-05, "loss": 0.849, "step": 10500 }, { "epoch": 0.14, "grad_norm": 121160.9609375, "learning_rate": 5.1876923076923074e-05, "loss": 0.8561, "step": 11000 }, { "epoch": 0.14, "grad_norm": 172874.421875, "learning_rate": 5.1507692307692314e-05, "loss": 0.8487, "step": 11500 }, { "epoch": 0.15, "grad_norm": 227611.28125, "learning_rate": 5.113846153846154e-05, "loss": 0.8266, "step": 12000 }, { "epoch": 0.15, "grad_norm": 61996.62890625, "learning_rate": 5.076923076923077e-05, "loss": 0.8243, "step": 12500 }, { "epoch": 0.16, "grad_norm": 157856.609375, "learning_rate": 5.04e-05, "loss": 0.8127, "step": 13000 }, { "epoch": 0.17, "grad_norm": 308874.09375, "learning_rate": 5.003076923076923e-05, "loss": 0.8439, "step": 13500 }, { "epoch": 0.17, "grad_norm": 234965.546875, "learning_rate": 4.9661538461538464e-05, "loss": 0.8501, "step": 14000 }, { "epoch": 0.18, "grad_norm": 146659.5, "learning_rate": 4.92923076923077e-05, "loss": 0.8142, "step": 14500 }, { "epoch": 0.18, "grad_norm": 330394.90625, "learning_rate": 4.892307692307692e-05, "loss": 0.8347, "step": 15000 }, { "epoch": 0.19, "grad_norm": 134570.78125, "learning_rate": 4.8553846153846155e-05, "loss": 0.8189, "step": 15500 }, { "epoch": 0.2, "grad_norm": 231298.65625, "learning_rate": 4.818461538461538e-05, "loss": 0.8235, "step": 16000 }, { "epoch": 0.2, "grad_norm": 180303.484375, "learning_rate": 4.781538461538462e-05, "loss": 0.8131, "step": 16500 }, { "epoch": 0.21, "grad_norm": 326101.53125, "learning_rate": 4.744615384615385e-05, "loss": 0.8432, "step": 17000 }, { "epoch": 0.22, "grad_norm": 184914.640625, "learning_rate": 4.707692307692308e-05, "loss": 0.8138, "step": 17500 }, { "epoch": 0.22, "grad_norm": 159594.78125, "learning_rate": 4.6707692307692306e-05, "loss": 0.8243, "step": 18000 }, { "epoch": 0.23, "grad_norm": 282742.4375, "learning_rate": 4.6338461538461545e-05, "loss": 0.827, "step": 18500 }, { "epoch": 0.23, "grad_norm": 203088.765625, "learning_rate": 4.596923076923077e-05, "loss": 0.8547, "step": 19000 }, { "epoch": 0.24, "grad_norm": 220744.96875, "learning_rate": 4.5600000000000004e-05, "loss": 0.8126, "step": 19500 }, { "epoch": 0.25, "grad_norm": 244869.15625, "learning_rate": 4.523076923076923e-05, "loss": 0.8038, "step": 20000 }, { "epoch": 0.25, "grad_norm": 168005.03125, "learning_rate": 4.486153846153846e-05, "loss": 0.8196, "step": 20500 }, { "epoch": 0.26, "grad_norm": 201658.421875, "learning_rate": 4.4492307692307695e-05, "loss": 0.8087, "step": 21000 }, { "epoch": 0.26, "grad_norm": 149320.546875, "learning_rate": 4.412307692307693e-05, "loss": 0.8014, "step": 21500 }, { "epoch": 0.27, "grad_norm": 174892.421875, "learning_rate": 4.3753846153846154e-05, "loss": 0.8044, "step": 22000 }, { "epoch": 0.28, "grad_norm": 91686.6953125, "learning_rate": 4.338461538461539e-05, "loss": 0.8066, "step": 22500 }, { "epoch": 0.28, "grad_norm": 159807.546875, "learning_rate": 4.301538461538461e-05, "loss": 0.7999, "step": 23000 }, { "epoch": 0.29, "grad_norm": 74645.9609375, "learning_rate": 4.264615384615385e-05, "loss": 0.8069, "step": 23500 }, { "epoch": 0.3, "grad_norm": 150263.703125, "learning_rate": 4.227692307692308e-05, "loss": 0.8012, "step": 24000 }, { "epoch": 0.3, "grad_norm": 128120.828125, "learning_rate": 4.190769230769231e-05, "loss": 0.8068, "step": 24500 }, { "epoch": 0.31, "grad_norm": 179413.015625, "learning_rate": 4.153846153846154e-05, "loss": 0.7907, "step": 25000 }, { "epoch": 0.31, "grad_norm": 185550.65625, "learning_rate": 4.116923076923077e-05, "loss": 0.8009, "step": 25500 }, { "epoch": 0.32, "grad_norm": 188985.90625, "learning_rate": 4.08e-05, "loss": 0.7801, "step": 26000 }, { "epoch": 0.33, "grad_norm": 144887.1875, "learning_rate": 4.0430769230769235e-05, "loss": 0.8008, "step": 26500 }, { "epoch": 0.33, "grad_norm": 127779.3515625, "learning_rate": 4.006153846153846e-05, "loss": 0.8072, "step": 27000 }, { "epoch": 0.34, "grad_norm": 170718.90625, "learning_rate": 3.9692307692307694e-05, "loss": 0.7795, "step": 27500 }, { "epoch": 0.34, "grad_norm": 144447.703125, "learning_rate": 3.932307692307692e-05, "loss": 0.7961, "step": 28000 }, { "epoch": 0.35, "grad_norm": 214981.546875, "learning_rate": 3.895384615384616e-05, "loss": 0.8085, "step": 28500 }, { "epoch": 0.36, "grad_norm": 152927.65625, "learning_rate": 3.8584615384615385e-05, "loss": 0.8134, "step": 29000 }, { "epoch": 0.36, "grad_norm": 139227.46875, "learning_rate": 3.821538461538462e-05, "loss": 0.7859, "step": 29500 }, { "epoch": 0.37, "grad_norm": 180853.71875, "learning_rate": 3.7846153846153844e-05, "loss": 0.7803, "step": 30000 }, { "epoch": 0.38, "grad_norm": 223101.5625, "learning_rate": 3.747692307692308e-05, "loss": 0.7783, "step": 30500 }, { "epoch": 0.38, "grad_norm": 135408.609375, "learning_rate": 3.710769230769231e-05, "loss": 0.7729, "step": 31000 }, { "epoch": 0.39, "grad_norm": 162892.96875, "learning_rate": 3.673846153846154e-05, "loss": 0.7834, "step": 31500 }, { "epoch": 0.39, "grad_norm": 100030.2265625, "learning_rate": 3.636923076923077e-05, "loss": 0.7979, "step": 32000 }, { "epoch": 0.4, "grad_norm": 139896.234375, "learning_rate": 3.6e-05, "loss": 0.8103, "step": 32500 }, { "epoch": 0.41, "grad_norm": 252651.6875, "learning_rate": 3.563076923076923e-05, "loss": 0.7809, "step": 33000 }, { "epoch": 0.41, "grad_norm": 203613.046875, "learning_rate": 3.5261538461538466e-05, "loss": 0.7929, "step": 33500 }, { "epoch": 0.42, "grad_norm": 242859.828125, "learning_rate": 3.489230769230769e-05, "loss": 0.7644, "step": 34000 }, { "epoch": 0.42, "grad_norm": 119250.8203125, "learning_rate": 3.4523076923076925e-05, "loss": 0.783, "step": 34500 }, { "epoch": 0.43, "grad_norm": 242804.640625, "learning_rate": 3.415384615384615e-05, "loss": 0.7716, "step": 35000 }, { "epoch": 0.44, "grad_norm": 190967.953125, "learning_rate": 3.3784615384615384e-05, "loss": 0.7651, "step": 35500 }, { "epoch": 0.44, "grad_norm": 94749.5390625, "learning_rate": 3.3415384615384617e-05, "loss": 0.776, "step": 36000 }, { "epoch": 0.45, "grad_norm": 207242.640625, "learning_rate": 3.304615384615385e-05, "loss": 0.7691, "step": 36500 }, { "epoch": 0.46, "grad_norm": 168760.109375, "learning_rate": 3.2676923076923075e-05, "loss": 0.7726, "step": 37000 }, { "epoch": 0.46, "grad_norm": 118607.4921875, "learning_rate": 3.230769230769231e-05, "loss": 0.7882, "step": 37500 }, { "epoch": 0.47, "grad_norm": 273119.8125, "learning_rate": 3.1938461538461534e-05, "loss": 0.7551, "step": 38000 }, { "epoch": 0.47, "grad_norm": 146971.109375, "learning_rate": 3.1569230769230773e-05, "loss": 0.7744, "step": 38500 }, { "epoch": 0.48, "grad_norm": 137494.625, "learning_rate": 3.12e-05, "loss": 0.7544, "step": 39000 }, { "epoch": 0.49, "grad_norm": 157850.96875, "learning_rate": 3.083076923076923e-05, "loss": 0.7666, "step": 39500 }, { "epoch": 0.49, "grad_norm": 121842.53125, "learning_rate": 3.046153846153846e-05, "loss": 0.779, "step": 40000 }, { "epoch": 0.5, "grad_norm": 238816.578125, "learning_rate": 3.009230769230769e-05, "loss": 0.7595, "step": 40500 }, { "epoch": 0.5, "grad_norm": 202905.109375, "learning_rate": 2.9723076923076924e-05, "loss": 0.7742, "step": 41000 }, { "epoch": 0.51, "grad_norm": 118714.484375, "learning_rate": 2.9353846153846156e-05, "loss": 0.7616, "step": 41500 }, { "epoch": 0.52, "grad_norm": 78076.921875, "learning_rate": 2.8984615384615386e-05, "loss": 0.7365, "step": 42000 }, { "epoch": 0.52, "grad_norm": 185485.65625, "learning_rate": 2.861538461538462e-05, "loss": 0.7758, "step": 42500 }, { "epoch": 0.53, "grad_norm": 281552.0625, "learning_rate": 2.8246153846153848e-05, "loss": 0.7684, "step": 43000 }, { "epoch": 0.54, "grad_norm": 158893.921875, "learning_rate": 2.7876923076923077e-05, "loss": 0.7746, "step": 43500 }, { "epoch": 0.54, "grad_norm": 107463.6015625, "learning_rate": 2.750769230769231e-05, "loss": 0.7434, "step": 44000 }, { "epoch": 0.55, "grad_norm": 167811.15625, "learning_rate": 2.713846153846154e-05, "loss": 0.7878, "step": 44500 }, { "epoch": 0.55, "grad_norm": 246179.9375, "learning_rate": 2.6769230769230772e-05, "loss": 0.7566, "step": 45000 }, { "epoch": 0.56, "grad_norm": 192624.828125, "learning_rate": 2.64e-05, "loss": 0.7549, "step": 45500 }, { "epoch": 0.57, "grad_norm": 298613.28125, "learning_rate": 2.603076923076923e-05, "loss": 0.7586, "step": 46000 }, { "epoch": 0.57, "grad_norm": 188642.6875, "learning_rate": 2.5661538461538463e-05, "loss": 0.7435, "step": 46500 }, { "epoch": 0.58, "grad_norm": 170600.96875, "learning_rate": 2.5292307692307693e-05, "loss": 0.7559, "step": 47000 }, { "epoch": 0.58, "grad_norm": 114553.796875, "learning_rate": 2.4923076923076926e-05, "loss": 0.7369, "step": 47500 }, { "epoch": 0.59, "grad_norm": 177816.875, "learning_rate": 2.4553846153846155e-05, "loss": 0.7704, "step": 48000 }, { "epoch": 0.6, "grad_norm": 137487.265625, "learning_rate": 2.4184615384615384e-05, "loss": 0.7455, "step": 48500 }, { "epoch": 0.6, "grad_norm": 245769.703125, "learning_rate": 2.3815384615384617e-05, "loss": 0.7404, "step": 49000 }, { "epoch": 0.61, "grad_norm": 119365.515625, "learning_rate": 2.3446153846153846e-05, "loss": 0.7554, "step": 49500 }, { "epoch": 0.62, "grad_norm": 236374.578125, "learning_rate": 2.307692307692308e-05, "loss": 0.7582, "step": 50000 }, { "epoch": 0.62, "grad_norm": 167086.234375, "learning_rate": 2.270769230769231e-05, "loss": 0.7535, "step": 50500 }, { "epoch": 0.63, "grad_norm": 162424.609375, "learning_rate": 2.2338461538461538e-05, "loss": 0.7548, "step": 51000 }, { "epoch": 0.63, "grad_norm": 184460.578125, "learning_rate": 2.196923076923077e-05, "loss": 0.7365, "step": 51500 }, { "epoch": 0.64, "grad_norm": 82718.4375, "learning_rate": 2.16e-05, "loss": 0.7843, "step": 52000 }, { "epoch": 0.65, "grad_norm": 192023.21875, "learning_rate": 2.1230769230769233e-05, "loss": 0.7276, "step": 52500 }, { "epoch": 0.65, "grad_norm": 168197.328125, "learning_rate": 2.0861538461538462e-05, "loss": 0.759, "step": 53000 }, { "epoch": 0.66, "grad_norm": 215071.3125, "learning_rate": 2.049230769230769e-05, "loss": 0.733, "step": 53500 }, { "epoch": 0.66, "grad_norm": 215896.6875, "learning_rate": 2.0123076923076924e-05, "loss": 0.7423, "step": 54000 }, { "epoch": 0.67, "grad_norm": 231679.890625, "learning_rate": 1.9753846153846153e-05, "loss": 0.7586, "step": 54500 }, { "epoch": 0.68, "grad_norm": 198304.984375, "learning_rate": 1.9384615384615386e-05, "loss": 0.7276, "step": 55000 }, { "epoch": 0.68, "grad_norm": 154414.9375, "learning_rate": 1.9015384615384616e-05, "loss": 0.7266, "step": 55500 }, { "epoch": 0.69, "grad_norm": 104532.3984375, "learning_rate": 1.8646153846153845e-05, "loss": 0.7646, "step": 56000 }, { "epoch": 0.7, "grad_norm": 135327.46875, "learning_rate": 1.8276923076923078e-05, "loss": 0.7516, "step": 56500 }, { "epoch": 0.7, "grad_norm": 211435.890625, "learning_rate": 1.7907692307692307e-05, "loss": 0.748, "step": 57000 }, { "epoch": 0.71, "grad_norm": 198907.0, "learning_rate": 1.753846153846154e-05, "loss": 0.7445, "step": 57500 }, { "epoch": 0.71, "grad_norm": 159631.171875, "learning_rate": 1.716923076923077e-05, "loss": 0.738, "step": 58000 }, { "epoch": 0.72, "grad_norm": 99158.5234375, "learning_rate": 1.6800000000000002e-05, "loss": 0.7502, "step": 58500 }, { "epoch": 0.73, "grad_norm": 211235.359375, "learning_rate": 1.643076923076923e-05, "loss": 0.7467, "step": 59000 }, { "epoch": 0.73, "grad_norm": 196873.21875, "learning_rate": 1.606153846153846e-05, "loss": 0.7409, "step": 59500 }, { "epoch": 0.74, "grad_norm": 129529.453125, "learning_rate": 1.5692307692307693e-05, "loss": 0.7482, "step": 60000 }, { "epoch": 0.74, "grad_norm": 377672.9375, "learning_rate": 1.5323076923076923e-05, "loss": 0.7327, "step": 60500 }, { "epoch": 0.75, "grad_norm": 193886.515625, "learning_rate": 1.4953846153846154e-05, "loss": 0.7412, "step": 61000 }, { "epoch": 0.76, "grad_norm": 381785.875, "learning_rate": 1.4584615384615385e-05, "loss": 0.7404, "step": 61500 }, { "epoch": 0.76, "grad_norm": 180874.8125, "learning_rate": 1.4215384615384616e-05, "loss": 0.7358, "step": 62000 }, { "epoch": 0.77, "grad_norm": 128392.703125, "learning_rate": 1.3846153846153847e-05, "loss": 0.7432, "step": 62500 }, { "epoch": 0.78, "grad_norm": 86952.875, "learning_rate": 1.3476923076923076e-05, "loss": 0.7302, "step": 63000 }, { "epoch": 0.78, "grad_norm": 149977.921875, "learning_rate": 1.3107692307692307e-05, "loss": 0.7252, "step": 63500 }, { "epoch": 0.79, "grad_norm": 294148.46875, "learning_rate": 1.2738461538461538e-05, "loss": 0.7197, "step": 64000 }, { "epoch": 0.79, "grad_norm": 165598.09375, "learning_rate": 1.236923076923077e-05, "loss": 0.7557, "step": 64500 }, { "epoch": 0.8, "grad_norm": 70225.3125, "learning_rate": 1.2e-05, "loss": 0.7364, "step": 65000 }, { "epoch": 0.81, "grad_norm": 182316.828125, "learning_rate": 1.163076923076923e-05, "loss": 0.7468, "step": 65500 }, { "epoch": 0.81, "grad_norm": 214259.90625, "learning_rate": 1.126153846153846e-05, "loss": 0.7108, "step": 66000 }, { "epoch": 0.82, "grad_norm": 149377.9375, "learning_rate": 1.0892307692307692e-05, "loss": 0.7428, "step": 66500 }, { "epoch": 0.82, "grad_norm": 204207.546875, "learning_rate": 1.0523076923076923e-05, "loss": 0.709, "step": 67000 }, { "epoch": 0.83, "grad_norm": 117305.4296875, "learning_rate": 1.0153846153846154e-05, "loss": 0.7298, "step": 67500 }, { "epoch": 0.84, "grad_norm": 102029.6015625, "learning_rate": 9.784615384615385e-06, "loss": 0.7149, "step": 68000 }, { "epoch": 0.84, "grad_norm": 128112.8046875, "learning_rate": 9.415384615384616e-06, "loss": 0.7377, "step": 68500 }, { "epoch": 0.85, "grad_norm": 152489.640625, "learning_rate": 9.046153846153847e-06, "loss": 0.7092, "step": 69000 }, { "epoch": 0.86, "grad_norm": 163935.234375, "learning_rate": 8.676923076923078e-06, "loss": 0.7336, "step": 69500 }, { "epoch": 0.86, "grad_norm": 280511.15625, "learning_rate": 8.307692307692309e-06, "loss": 0.7268, "step": 70000 }, { "epoch": 0.87, "grad_norm": 99928.0234375, "learning_rate": 7.93846153846154e-06, "loss": 0.7194, "step": 70500 }, { "epoch": 0.87, "grad_norm": 154504.671875, "learning_rate": 7.569230769230769e-06, "loss": 0.7235, "step": 71000 }, { "epoch": 0.88, "grad_norm": 164383.453125, "learning_rate": 7.2e-06, "loss": 0.7418, "step": 71500 }, { "epoch": 0.89, "grad_norm": 228512.703125, "learning_rate": 6.830769230769231e-06, "loss": 0.7193, "step": 72000 }, { "epoch": 0.89, "grad_norm": 245047.8125, "learning_rate": 6.461538461538462e-06, "loss": 0.7192, "step": 72500 }, { "epoch": 0.9, "grad_norm": 159019.28125, "learning_rate": 6.092307692307692e-06, "loss": 0.7181, "step": 73000 }, { "epoch": 0.9, "grad_norm": 175369.8125, "learning_rate": 5.723076923076923e-06, "loss": 0.7187, "step": 73500 }, { "epoch": 0.91, "grad_norm": 328376.28125, "learning_rate": 5.353846153846153e-06, "loss": 0.728, "step": 74000 }, { "epoch": 0.92, "grad_norm": 217310.265625, "learning_rate": 4.984615384615385e-06, "loss": 0.7318, "step": 74500 }, { "epoch": 0.92, "grad_norm": 260174.71875, "learning_rate": 4.615384615384616e-06, "loss": 0.7076, "step": 75000 }, { "epoch": 0.93, "grad_norm": 208830.875, "learning_rate": 4.246153846153846e-06, "loss": 0.7062, "step": 75500 }, { "epoch": 0.94, "grad_norm": 293483.0, "learning_rate": 3.876923076923077e-06, "loss": 0.7124, "step": 76000 }, { "epoch": 0.94, "grad_norm": 120294.0078125, "learning_rate": 3.5076923076923076e-06, "loss": 0.7144, "step": 76500 }, { "epoch": 0.95, "grad_norm": 161758.796875, "learning_rate": 3.1384615384615382e-06, "loss": 0.7206, "step": 77000 }, { "epoch": 0.95, "grad_norm": 218907.75, "learning_rate": 2.7692307692307693e-06, "loss": 0.71, "step": 77500 }, { "epoch": 0.96, "grad_norm": 185302.8125, "learning_rate": 2.4000000000000003e-06, "loss": 0.7216, "step": 78000 }, { "epoch": 0.97, "grad_norm": 178341.28125, "learning_rate": 2.030769230769231e-06, "loss": 0.7362, "step": 78500 }, { "epoch": 0.97, "grad_norm": 149915.484375, "learning_rate": 1.6615384615384616e-06, "loss": 0.709, "step": 79000 }, { "epoch": 0.98, "grad_norm": 167170.640625, "learning_rate": 1.2923076923076922e-06, "loss": 0.7186, "step": 79500 }, { "epoch": 0.98, "grad_norm": 184650.28125, "learning_rate": 9.230769230769231e-07, "loss": 0.7077, "step": 80000 }, { "epoch": 0.99, "grad_norm": 111304.5078125, "learning_rate": 5.53846153846154e-07, "loss": 0.6904, "step": 80500 }, { "epoch": 1.0, "grad_norm": 277845.34375, "learning_rate": 1.846153846153846e-07, "loss": 0.7259, "step": 81000 }, { "epoch": 1.0, "eval_accuracy": 0.68412, "eval_loss": 0.7210479974746704, "eval_runtime": 1756.6277, "eval_samples_per_second": 28.464, "eval_steps_per_second": 3.558, "step": 81250 } ], "logging_steps": 500, "max_steps": 81250, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 2.1137491040083968e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }