{ "best_global_step": 300, "best_metric": 0.017029576003551483, "best_model_checkpoint": "./results/checkpoint-300", "epoch": 0.6730769230769231, "eval_steps": 1, "global_step": 350, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0019230769230769232, "grad_norm": 7.937829494476318, "learning_rate": 0.0001, "loss": 0.9513, "step": 1 }, { "epoch": 0.0019230769230769232, "eval_loss": 0.2997034788131714, "eval_runtime": 193.365, "eval_samples_per_second": 238.518, "eval_steps_per_second": 1.867, "step": 1 }, { "epoch": 0.0038461538461538464, "grad_norm": 7.157807350158691, "learning_rate": 9.980769230769231e-05, "loss": 0.5562, "step": 2 }, { "epoch": 0.0038461538461538464, "eval_loss": 0.2078278511762619, "eval_runtime": 194.5982, "eval_samples_per_second": 237.006, "eval_steps_per_second": 1.855, "step": 2 }, { "epoch": 0.0057692307692307696, "grad_norm": 8.241674423217773, "learning_rate": 9.961538461538463e-05, "loss": 0.439, "step": 3 }, { "epoch": 0.0057692307692307696, "eval_loss": 0.12207631766796112, "eval_runtime": 227.6594, "eval_samples_per_second": 202.588, "eval_steps_per_second": 1.586, "step": 3 }, { "epoch": 0.007692307692307693, "grad_norm": 5.947928428649902, "learning_rate": 9.942307692307693e-05, "loss": 0.3473, "step": 4 }, { "epoch": 0.007692307692307693, "eval_loss": 0.0955578088760376, "eval_runtime": 201.925, "eval_samples_per_second": 228.407, "eval_steps_per_second": 1.788, "step": 4 }, { "epoch": 0.009615384615384616, "grad_norm": 3.876149892807007, "learning_rate": 9.923076923076923e-05, "loss": 0.1832, "step": 5 }, { "epoch": 0.009615384615384616, "eval_loss": 0.07960505038499832, "eval_runtime": 194.1962, "eval_samples_per_second": 237.497, "eval_steps_per_second": 1.859, "step": 5 }, { "epoch": 0.011538461538461539, "grad_norm": 2.4056999683380127, "learning_rate": 9.903846153846155e-05, "loss": 0.145, "step": 6 }, { "epoch": 0.011538461538461539, "eval_loss": 0.07136845588684082, "eval_runtime": 190.7225, "eval_samples_per_second": 241.823, "eval_steps_per_second": 1.893, "step": 6 }, { "epoch": 0.013461538461538462, "grad_norm": 3.1581223011016846, "learning_rate": 9.884615384615386e-05, "loss": 0.15, "step": 7 }, { "epoch": 0.013461538461538462, "eval_loss": 0.0684390515089035, "eval_runtime": 195.3393, "eval_samples_per_second": 236.107, "eval_steps_per_second": 1.848, "step": 7 }, { "epoch": 0.015384615384615385, "grad_norm": 3.687472343444824, "learning_rate": 9.865384615384616e-05, "loss": 0.1821, "step": 8 }, { "epoch": 0.015384615384615385, "eval_loss": 0.07759178429841995, "eval_runtime": 192.748, "eval_samples_per_second": 239.281, "eval_steps_per_second": 1.873, "step": 8 }, { "epoch": 0.01730769230769231, "grad_norm": 1.418228268623352, "learning_rate": 9.846153846153848e-05, "loss": 0.0854, "step": 9 }, { "epoch": 0.01730769230769231, "eval_loss": 0.10003213584423065, "eval_runtime": 190.3642, "eval_samples_per_second": 242.278, "eval_steps_per_second": 1.896, "step": 9 }, { "epoch": 0.019230769230769232, "grad_norm": 4.458162784576416, "learning_rate": 9.826923076923077e-05, "loss": 0.1871, "step": 10 }, { "epoch": 0.019230769230769232, "eval_loss": 0.1014280766248703, "eval_runtime": 189.4477, "eval_samples_per_second": 243.45, "eval_steps_per_second": 1.906, "step": 10 }, { "epoch": 0.021153846153846155, "grad_norm": 5.193276882171631, "learning_rate": 9.807692307692307e-05, "loss": 0.1438, "step": 11 }, { "epoch": 0.021153846153846155, "eval_loss": 0.08028294146060944, "eval_runtime": 186.9296, "eval_samples_per_second": 246.729, "eval_steps_per_second": 1.931, "step": 11 }, { "epoch": 0.023076923076923078, "grad_norm": 2.2287089824676514, "learning_rate": 9.788461538461539e-05, "loss": 0.1486, "step": 12 }, { "epoch": 0.023076923076923078, "eval_loss": 0.07001757621765137, "eval_runtime": 184.0254, "eval_samples_per_second": 250.623, "eval_steps_per_second": 1.962, "step": 12 }, { "epoch": 0.025, "grad_norm": 1.668630838394165, "learning_rate": 9.76923076923077e-05, "loss": 0.0869, "step": 13 }, { "epoch": 0.025, "eval_loss": 0.059553906321525574, "eval_runtime": 181.9616, "eval_samples_per_second": 253.466, "eval_steps_per_second": 1.984, "step": 13 }, { "epoch": 0.026923076923076925, "grad_norm": 1.8616667985916138, "learning_rate": 9.75e-05, "loss": 0.1096, "step": 14 }, { "epoch": 0.026923076923076925, "eval_loss": 0.0529908612370491, "eval_runtime": 183.3273, "eval_samples_per_second": 251.577, "eval_steps_per_second": 1.969, "step": 14 }, { "epoch": 0.028846153846153848, "grad_norm": 2.9516372680664062, "learning_rate": 9.730769230769232e-05, "loss": 0.1237, "step": 15 }, { "epoch": 0.028846153846153848, "eval_loss": 0.052440524101257324, "eval_runtime": 184.1599, "eval_samples_per_second": 250.44, "eval_steps_per_second": 1.96, "step": 15 }, { "epoch": 0.03076923076923077, "grad_norm": 1.757940649986267, "learning_rate": 9.711538461538462e-05, "loss": 0.0928, "step": 16 }, { "epoch": 0.03076923076923077, "eval_loss": 0.05516982078552246, "eval_runtime": 184.2604, "eval_samples_per_second": 250.303, "eval_steps_per_second": 1.959, "step": 16 }, { "epoch": 0.032692307692307694, "grad_norm": 2.269965887069702, "learning_rate": 9.692307692307692e-05, "loss": 0.0939, "step": 17 }, { "epoch": 0.032692307692307694, "eval_loss": 0.058670658618211746, "eval_runtime": 184.3309, "eval_samples_per_second": 250.208, "eval_steps_per_second": 1.958, "step": 17 }, { "epoch": 0.03461538461538462, "grad_norm": 2.7135467529296875, "learning_rate": 9.673076923076924e-05, "loss": 0.0731, "step": 18 }, { "epoch": 0.03461538461538462, "eval_loss": 0.06141166388988495, "eval_runtime": 184.7185, "eval_samples_per_second": 249.683, "eval_steps_per_second": 1.954, "step": 18 }, { "epoch": 0.03653846153846154, "grad_norm": 1.853324294090271, "learning_rate": 9.653846153846155e-05, "loss": 0.0995, "step": 19 }, { "epoch": 0.03653846153846154, "eval_loss": 0.06201282888650894, "eval_runtime": 185.1901, "eval_samples_per_second": 249.047, "eval_steps_per_second": 1.949, "step": 19 }, { "epoch": 0.038461538461538464, "grad_norm": 3.1507911682128906, "learning_rate": 9.634615384615385e-05, "loss": 0.1082, "step": 20 }, { "epoch": 0.038461538461538464, "eval_loss": 0.054486844688653946, "eval_runtime": 186.7571, "eval_samples_per_second": 246.957, "eval_steps_per_second": 1.933, "step": 20 }, { "epoch": 0.04038461538461539, "grad_norm": 1.8622870445251465, "learning_rate": 9.615384615384617e-05, "loss": 0.1058, "step": 21 }, { "epoch": 0.04038461538461539, "eval_loss": 0.04959222301840782, "eval_runtime": 185.3392, "eval_samples_per_second": 248.846, "eval_steps_per_second": 1.948, "step": 21 }, { "epoch": 0.04230769230769231, "grad_norm": 2.0501298904418945, "learning_rate": 9.596153846153847e-05, "loss": 0.0793, "step": 22 }, { "epoch": 0.04230769230769231, "eval_loss": 0.04307747259736061, "eval_runtime": 186.5253, "eval_samples_per_second": 247.264, "eval_steps_per_second": 1.935, "step": 22 }, { "epoch": 0.04423076923076923, "grad_norm": 2.891623020172119, "learning_rate": 9.576923076923078e-05, "loss": 0.108, "step": 23 }, { "epoch": 0.04423076923076923, "eval_loss": 0.039741210639476776, "eval_runtime": 186.9639, "eval_samples_per_second": 246.684, "eval_steps_per_second": 1.931, "step": 23 }, { "epoch": 0.046153846153846156, "grad_norm": 3.0970637798309326, "learning_rate": 9.557692307692308e-05, "loss": 0.1315, "step": 24 }, { "epoch": 0.046153846153846156, "eval_loss": 0.03945652395486832, "eval_runtime": 184.7227, "eval_samples_per_second": 249.677, "eval_steps_per_second": 1.954, "step": 24 }, { "epoch": 0.04807692307692308, "grad_norm": 2.6016592979431152, "learning_rate": 9.53846153846154e-05, "loss": 0.0548, "step": 25 }, { "epoch": 0.04807692307692308, "eval_loss": 0.041967593133449554, "eval_runtime": 184.27, "eval_samples_per_second": 250.29, "eval_steps_per_second": 1.959, "step": 25 }, { "epoch": 0.05, "grad_norm": 2.6659207344055176, "learning_rate": 9.519230769230769e-05, "loss": 0.0879, "step": 26 }, { "epoch": 0.05, "eval_loss": 0.05009755492210388, "eval_runtime": 183.7036, "eval_samples_per_second": 251.062, "eval_steps_per_second": 1.965, "step": 26 }, { "epoch": 0.051923076923076926, "grad_norm": 1.676653265953064, "learning_rate": 9.5e-05, "loss": 0.0619, "step": 27 }, { "epoch": 0.051923076923076926, "eval_loss": 0.05302096903324127, "eval_runtime": 184.6409, "eval_samples_per_second": 249.788, "eval_steps_per_second": 1.955, "step": 27 }, { "epoch": 0.05384615384615385, "grad_norm": 1.7307082414627075, "learning_rate": 9.480769230769231e-05, "loss": 0.0603, "step": 28 }, { "epoch": 0.05384615384615385, "eval_loss": 0.050088509917259216, "eval_runtime": 185.2218, "eval_samples_per_second": 249.004, "eval_steps_per_second": 1.949, "step": 28 }, { "epoch": 0.05576923076923077, "grad_norm": 2.250020742416382, "learning_rate": 9.461538461538461e-05, "loss": 0.1022, "step": 29 }, { "epoch": 0.05576923076923077, "eval_loss": 0.043293166905641556, "eval_runtime": 184.2912, "eval_samples_per_second": 250.262, "eval_steps_per_second": 1.959, "step": 29 }, { "epoch": 0.057692307692307696, "grad_norm": 1.9879577159881592, "learning_rate": 9.442307692307693e-05, "loss": 0.0528, "step": 30 }, { "epoch": 0.057692307692307696, "eval_loss": 0.04137587174773216, "eval_runtime": 185.0258, "eval_samples_per_second": 249.268, "eval_steps_per_second": 1.951, "step": 30 }, { "epoch": 0.05961538461538462, "grad_norm": 1.5150986909866333, "learning_rate": 9.423076923076924e-05, "loss": 0.0844, "step": 31 }, { "epoch": 0.05961538461538462, "eval_loss": 0.0410715714097023, "eval_runtime": 183.2326, "eval_samples_per_second": 251.707, "eval_steps_per_second": 1.97, "step": 31 }, { "epoch": 0.06153846153846154, "grad_norm": 2.154343605041504, "learning_rate": 9.403846153846154e-05, "loss": 0.0756, "step": 32 }, { "epoch": 0.06153846153846154, "eval_loss": 0.04246753454208374, "eval_runtime": 185.129, "eval_samples_per_second": 249.129, "eval_steps_per_second": 1.95, "step": 32 }, { "epoch": 0.06346153846153846, "grad_norm": 2.0272536277770996, "learning_rate": 9.384615384615386e-05, "loss": 0.0892, "step": 33 }, { "epoch": 0.06346153846153846, "eval_loss": 0.04080929979681969, "eval_runtime": 182.0873, "eval_samples_per_second": 253.291, "eval_steps_per_second": 1.983, "step": 33 }, { "epoch": 0.06538461538461539, "grad_norm": 1.3116830587387085, "learning_rate": 9.365384615384616e-05, "loss": 0.0569, "step": 34 }, { "epoch": 0.06538461538461539, "eval_loss": 0.04051681235432625, "eval_runtime": 182.1351, "eval_samples_per_second": 253.224, "eval_steps_per_second": 1.982, "step": 34 }, { "epoch": 0.0673076923076923, "grad_norm": 1.3111695051193237, "learning_rate": 9.346153846153846e-05, "loss": 0.0717, "step": 35 }, { "epoch": 0.0673076923076923, "eval_loss": 0.0393197201192379, "eval_runtime": 184.6064, "eval_samples_per_second": 249.834, "eval_steps_per_second": 1.956, "step": 35 }, { "epoch": 0.06923076923076923, "grad_norm": 0.8019259572029114, "learning_rate": 9.326923076923077e-05, "loss": 0.0428, "step": 36 }, { "epoch": 0.06923076923076923, "eval_loss": 0.03660094365477562, "eval_runtime": 182.2104, "eval_samples_per_second": 253.12, "eval_steps_per_second": 1.981, "step": 36 }, { "epoch": 0.07115384615384615, "grad_norm": 2.83894419670105, "learning_rate": 9.307692307692309e-05, "loss": 0.105, "step": 37 }, { "epoch": 0.07115384615384615, "eval_loss": 0.03613066300749779, "eval_runtime": 184.1752, "eval_samples_per_second": 250.419, "eval_steps_per_second": 1.96, "step": 37 }, { "epoch": 0.07307692307692308, "grad_norm": 1.8959332704544067, "learning_rate": 9.288461538461539e-05, "loss": 0.0419, "step": 38 }, { "epoch": 0.07307692307692308, "eval_loss": 0.03350270912051201, "eval_runtime": 184.6287, "eval_samples_per_second": 249.804, "eval_steps_per_second": 1.955, "step": 38 }, { "epoch": 0.075, "grad_norm": 2.977259397506714, "learning_rate": 9.26923076923077e-05, "loss": 0.1345, "step": 39 }, { "epoch": 0.075, "eval_loss": 0.03360990434885025, "eval_runtime": 184.2045, "eval_samples_per_second": 250.379, "eval_steps_per_second": 1.96, "step": 39 }, { "epoch": 0.07692307692307693, "grad_norm": 1.0986078977584839, "learning_rate": 9.250000000000001e-05, "loss": 0.0586, "step": 40 }, { "epoch": 0.07692307692307693, "eval_loss": 0.03480533882975578, "eval_runtime": 185.2469, "eval_samples_per_second": 248.97, "eval_steps_per_second": 1.949, "step": 40 }, { "epoch": 0.07884615384615384, "grad_norm": 2.1644158363342285, "learning_rate": 9.230769230769232e-05, "loss": 0.048, "step": 41 }, { "epoch": 0.07884615384615384, "eval_loss": 0.03521328046917915, "eval_runtime": 183.8053, "eval_samples_per_second": 250.923, "eval_steps_per_second": 1.964, "step": 41 }, { "epoch": 0.08076923076923077, "grad_norm": 4.704512596130371, "learning_rate": 9.211538461538462e-05, "loss": 0.0967, "step": 42 }, { "epoch": 0.08076923076923077, "eval_loss": 0.039289865642786026, "eval_runtime": 183.4419, "eval_samples_per_second": 251.42, "eval_steps_per_second": 1.968, "step": 42 }, { "epoch": 0.08269230769230769, "grad_norm": 1.3928428888320923, "learning_rate": 9.192307692307692e-05, "loss": 0.0492, "step": 43 }, { "epoch": 0.08269230769230769, "eval_loss": 0.04399557411670685, "eval_runtime": 183.3842, "eval_samples_per_second": 251.499, "eval_steps_per_second": 1.969, "step": 43 }, { "epoch": 0.08461538461538462, "grad_norm": 1.3426079750061035, "learning_rate": 9.173076923076923e-05, "loss": 0.0756, "step": 44 }, { "epoch": 0.08461538461538462, "eval_loss": 0.04704272374510765, "eval_runtime": 184.0623, "eval_samples_per_second": 250.573, "eval_steps_per_second": 1.961, "step": 44 }, { "epoch": 0.08653846153846154, "grad_norm": 2.435299873352051, "learning_rate": 9.153846153846155e-05, "loss": 0.0769, "step": 45 }, { "epoch": 0.08653846153846154, "eval_loss": 0.04405398294329643, "eval_runtime": 186.5663, "eval_samples_per_second": 247.21, "eval_steps_per_second": 1.935, "step": 45 }, { "epoch": 0.08846153846153847, "grad_norm": 2.430859088897705, "learning_rate": 9.134615384615385e-05, "loss": 0.114, "step": 46 }, { "epoch": 0.08846153846153847, "eval_loss": 0.041341982781887054, "eval_runtime": 184.1332, "eval_samples_per_second": 250.476, "eval_steps_per_second": 1.961, "step": 46 }, { "epoch": 0.09038461538461538, "grad_norm": 2.121349334716797, "learning_rate": 9.115384615384615e-05, "loss": 0.0448, "step": 47 }, { "epoch": 0.09038461538461538, "eval_loss": 0.036010172218084335, "eval_runtime": 182.9905, "eval_samples_per_second": 252.04, "eval_steps_per_second": 1.973, "step": 47 }, { "epoch": 0.09230769230769231, "grad_norm": 1.8270937204360962, "learning_rate": 9.096153846153846e-05, "loss": 0.0642, "step": 48 }, { "epoch": 0.09230769230769231, "eval_loss": 0.034751046448946, "eval_runtime": 182.6922, "eval_samples_per_second": 252.452, "eval_steps_per_second": 1.976, "step": 48 }, { "epoch": 0.09423076923076923, "grad_norm": 1.234506368637085, "learning_rate": 9.076923076923078e-05, "loss": 0.0556, "step": 49 }, { "epoch": 0.09423076923076923, "eval_loss": 0.0333593524992466, "eval_runtime": 183.8994, "eval_samples_per_second": 250.795, "eval_steps_per_second": 1.963, "step": 49 }, { "epoch": 0.09615384615384616, "grad_norm": 1.265337347984314, "learning_rate": 9.057692307692308e-05, "loss": 0.0641, "step": 50 }, { "epoch": 0.09615384615384616, "eval_loss": 0.03294428065419197, "eval_runtime": 183.2342, "eval_samples_per_second": 251.705, "eval_steps_per_second": 1.97, "step": 50 }, { "epoch": 0.09807692307692308, "grad_norm": 1.0051465034484863, "learning_rate": 9.038461538461538e-05, "loss": 0.0402, "step": 51 }, { "epoch": 0.09807692307692308, "eval_loss": 0.03119943104684353, "eval_runtime": 183.9091, "eval_samples_per_second": 250.781, "eval_steps_per_second": 1.963, "step": 51 }, { "epoch": 0.1, "grad_norm": 1.3566607236862183, "learning_rate": 9.01923076923077e-05, "loss": 0.0447, "step": 52 }, { "epoch": 0.1, "eval_loss": 0.03182150423526764, "eval_runtime": 183.9643, "eval_samples_per_second": 250.706, "eval_steps_per_second": 1.962, "step": 52 }, { "epoch": 0.10192307692307692, "grad_norm": 3.3966872692108154, "learning_rate": 9e-05, "loss": 0.08, "step": 53 }, { "epoch": 0.10192307692307692, "eval_loss": 0.03427828475832939, "eval_runtime": 183.0367, "eval_samples_per_second": 251.977, "eval_steps_per_second": 1.972, "step": 53 }, { "epoch": 0.10384615384615385, "grad_norm": 1.4918586015701294, "learning_rate": 8.980769230769231e-05, "loss": 0.0785, "step": 54 }, { "epoch": 0.10384615384615385, "eval_loss": 0.03516368940472603, "eval_runtime": 182.9678, "eval_samples_per_second": 252.072, "eval_steps_per_second": 1.973, "step": 54 }, { "epoch": 0.10576923076923077, "grad_norm": 1.2416688203811646, "learning_rate": 8.961538461538463e-05, "loss": 0.0422, "step": 55 }, { "epoch": 0.10576923076923077, "eval_loss": 0.0376507006585598, "eval_runtime": 183.9714, "eval_samples_per_second": 250.697, "eval_steps_per_second": 1.962, "step": 55 }, { "epoch": 0.1076923076923077, "grad_norm": 2.5123279094696045, "learning_rate": 8.942307692307693e-05, "loss": 0.1594, "step": 56 }, { "epoch": 0.1076923076923077, "eval_loss": 0.038452692329883575, "eval_runtime": 182.5423, "eval_samples_per_second": 252.659, "eval_steps_per_second": 1.978, "step": 56 }, { "epoch": 0.10961538461538461, "grad_norm": 1.8067007064819336, "learning_rate": 8.923076923076924e-05, "loss": 0.0918, "step": 57 }, { "epoch": 0.10961538461538461, "eval_loss": 0.036170173436403275, "eval_runtime": 182.3949, "eval_samples_per_second": 252.863, "eval_steps_per_second": 1.979, "step": 57 }, { "epoch": 0.11153846153846154, "grad_norm": 2.269913911819458, "learning_rate": 8.903846153846154e-05, "loss": 0.0715, "step": 58 }, { "epoch": 0.11153846153846154, "eval_loss": 0.03197428211569786, "eval_runtime": 182.2905, "eval_samples_per_second": 253.008, "eval_steps_per_second": 1.98, "step": 58 }, { "epoch": 0.11346153846153846, "grad_norm": 1.736000418663025, "learning_rate": 8.884615384615384e-05, "loss": 0.0571, "step": 59 }, { "epoch": 0.11346153846153846, "eval_loss": 0.02888152375817299, "eval_runtime": 182.3969, "eval_samples_per_second": 252.861, "eval_steps_per_second": 1.979, "step": 59 }, { "epoch": 0.11538461538461539, "grad_norm": 1.3405442237854004, "learning_rate": 8.865384615384615e-05, "loss": 0.0856, "step": 60 }, { "epoch": 0.11538461538461539, "eval_loss": 0.026720238849520683, "eval_runtime": 182.7564, "eval_samples_per_second": 252.363, "eval_steps_per_second": 1.975, "step": 60 }, { "epoch": 0.11730769230769231, "grad_norm": 3.9098620414733887, "learning_rate": 8.846153846153847e-05, "loss": 0.0663, "step": 61 }, { "epoch": 0.11730769230769231, "eval_loss": 0.02634381875395775, "eval_runtime": 182.2338, "eval_samples_per_second": 253.087, "eval_steps_per_second": 1.981, "step": 61 }, { "epoch": 0.11923076923076924, "grad_norm": 2.598466634750366, "learning_rate": 8.826923076923077e-05, "loss": 0.0571, "step": 62 }, { "epoch": 0.11923076923076924, "eval_loss": 0.027414267882704735, "eval_runtime": 184.1358, "eval_samples_per_second": 250.473, "eval_steps_per_second": 1.961, "step": 62 }, { "epoch": 0.12115384615384615, "grad_norm": 2.4408273696899414, "learning_rate": 8.807692307692307e-05, "loss": 0.0538, "step": 63 }, { "epoch": 0.12115384615384615, "eval_loss": 0.029992803931236267, "eval_runtime": 183.9982, "eval_samples_per_second": 250.66, "eval_steps_per_second": 1.962, "step": 63 }, { "epoch": 0.12307692307692308, "grad_norm": 1.904429316520691, "learning_rate": 8.788461538461539e-05, "loss": 0.0478, "step": 64 }, { "epoch": 0.12307692307692308, "eval_loss": 0.03255239129066467, "eval_runtime": 183.9668, "eval_samples_per_second": 250.703, "eval_steps_per_second": 1.962, "step": 64 }, { "epoch": 0.125, "grad_norm": 2.515045166015625, "learning_rate": 8.76923076923077e-05, "loss": 0.0336, "step": 65 }, { "epoch": 0.125, "eval_loss": 0.03170439973473549, "eval_runtime": 184.8246, "eval_samples_per_second": 249.539, "eval_steps_per_second": 1.953, "step": 65 }, { "epoch": 0.12692307692307692, "grad_norm": 1.5397083759307861, "learning_rate": 8.75e-05, "loss": 0.0335, "step": 66 }, { "epoch": 0.12692307692307692, "eval_loss": 0.029536928981542587, "eval_runtime": 184.1714, "eval_samples_per_second": 250.424, "eval_steps_per_second": 1.96, "step": 66 }, { "epoch": 0.12884615384615383, "grad_norm": 1.85903000831604, "learning_rate": 8.730769230769232e-05, "loss": 0.0543, "step": 67 }, { "epoch": 0.12884615384615383, "eval_loss": 0.02915562316775322, "eval_runtime": 184.0688, "eval_samples_per_second": 250.564, "eval_steps_per_second": 1.961, "step": 67 }, { "epoch": 0.13076923076923078, "grad_norm": 2.114628791809082, "learning_rate": 8.711538461538462e-05, "loss": 0.0557, "step": 68 }, { "epoch": 0.13076923076923078, "eval_loss": 0.02956259623169899, "eval_runtime": 185.074, "eval_samples_per_second": 249.203, "eval_steps_per_second": 1.951, "step": 68 }, { "epoch": 0.1326923076923077, "grad_norm": 0.6971794962882996, "learning_rate": 8.692307692307692e-05, "loss": 0.0516, "step": 69 }, { "epoch": 0.1326923076923077, "eval_loss": 0.030795959755778313, "eval_runtime": 184.0759, "eval_samples_per_second": 250.554, "eval_steps_per_second": 1.961, "step": 69 }, { "epoch": 0.1346153846153846, "grad_norm": 0.9613048434257507, "learning_rate": 8.673076923076924e-05, "loss": 0.0615, "step": 70 }, { "epoch": 0.1346153846153846, "eval_loss": 0.032335903495550156, "eval_runtime": 183.9235, "eval_samples_per_second": 250.762, "eval_steps_per_second": 1.963, "step": 70 }, { "epoch": 0.13653846153846153, "grad_norm": 2.4344446659088135, "learning_rate": 8.653846153846155e-05, "loss": 0.0757, "step": 71 }, { "epoch": 0.13653846153846153, "eval_loss": 0.030952226370573044, "eval_runtime": 182.946, "eval_samples_per_second": 252.102, "eval_steps_per_second": 1.973, "step": 71 }, { "epoch": 0.13846153846153847, "grad_norm": 0.8824633955955505, "learning_rate": 8.634615384615385e-05, "loss": 0.0385, "step": 72 }, { "epoch": 0.13846153846153847, "eval_loss": 0.031768955290317535, "eval_runtime": 182.333, "eval_samples_per_second": 252.949, "eval_steps_per_second": 1.98, "step": 72 }, { "epoch": 0.14038461538461539, "grad_norm": 1.980722188949585, "learning_rate": 8.615384615384617e-05, "loss": 0.0624, "step": 73 }, { "epoch": 0.14038461538461539, "eval_loss": 0.03059910237789154, "eval_runtime": 182.0241, "eval_samples_per_second": 253.378, "eval_steps_per_second": 1.983, "step": 73 }, { "epoch": 0.1423076923076923, "grad_norm": 1.5362180471420288, "learning_rate": 8.596153846153847e-05, "loss": 0.0779, "step": 74 }, { "epoch": 0.1423076923076923, "eval_loss": 0.028434548527002335, "eval_runtime": 181.9839, "eval_samples_per_second": 253.435, "eval_steps_per_second": 1.984, "step": 74 }, { "epoch": 0.14423076923076922, "grad_norm": 4.635537147521973, "learning_rate": 8.576923076923076e-05, "loss": 0.093, "step": 75 }, { "epoch": 0.14423076923076922, "eval_loss": 0.029211917892098427, "eval_runtime": 182.7811, "eval_samples_per_second": 252.329, "eval_steps_per_second": 1.975, "step": 75 }, { "epoch": 0.14615384615384616, "grad_norm": 0.548228919506073, "learning_rate": 8.557692307692308e-05, "loss": 0.0144, "step": 76 }, { "epoch": 0.14615384615384616, "eval_loss": 0.02987842820584774, "eval_runtime": 183.1098, "eval_samples_per_second": 251.876, "eval_steps_per_second": 1.971, "step": 76 }, { "epoch": 0.14807692307692308, "grad_norm": 2.241633176803589, "learning_rate": 8.538461538461538e-05, "loss": 0.0331, "step": 77 }, { "epoch": 0.14807692307692308, "eval_loss": 0.029068879783153534, "eval_runtime": 182.0148, "eval_samples_per_second": 253.391, "eval_steps_per_second": 1.983, "step": 77 }, { "epoch": 0.15, "grad_norm": 2.180568218231201, "learning_rate": 8.519230769230769e-05, "loss": 0.0683, "step": 78 }, { "epoch": 0.15, "eval_loss": 0.02795836329460144, "eval_runtime": 183.975, "eval_samples_per_second": 250.692, "eval_steps_per_second": 1.962, "step": 78 }, { "epoch": 0.1519230769230769, "grad_norm": 2.788595199584961, "learning_rate": 8.5e-05, "loss": 0.0753, "step": 79 }, { "epoch": 0.1519230769230769, "eval_loss": 0.025826551020145416, "eval_runtime": 182.2724, "eval_samples_per_second": 253.033, "eval_steps_per_second": 1.981, "step": 79 }, { "epoch": 0.15384615384615385, "grad_norm": 3.0569944381713867, "learning_rate": 8.480769230769231e-05, "loss": 0.0658, "step": 80 }, { "epoch": 0.15384615384615385, "eval_loss": 0.023890940472483635, "eval_runtime": 183.5645, "eval_samples_per_second": 251.252, "eval_steps_per_second": 1.967, "step": 80 }, { "epoch": 0.15576923076923077, "grad_norm": 2.4523251056671143, "learning_rate": 8.461538461538461e-05, "loss": 0.0924, "step": 81 }, { "epoch": 0.15576923076923077, "eval_loss": 0.023335987702012062, "eval_runtime": 182.5326, "eval_samples_per_second": 252.673, "eval_steps_per_second": 1.978, "step": 81 }, { "epoch": 0.1576923076923077, "grad_norm": 2.6388676166534424, "learning_rate": 8.442307692307693e-05, "loss": 0.0725, "step": 82 }, { "epoch": 0.1576923076923077, "eval_loss": 0.023239314556121826, "eval_runtime": 183.2766, "eval_samples_per_second": 251.647, "eval_steps_per_second": 1.97, "step": 82 }, { "epoch": 0.1596153846153846, "grad_norm": 5.005173206329346, "learning_rate": 8.423076923076924e-05, "loss": 0.0981, "step": 83 }, { "epoch": 0.1596153846153846, "eval_loss": 0.023996710777282715, "eval_runtime": 182.5391, "eval_samples_per_second": 252.664, "eval_steps_per_second": 1.978, "step": 83 }, { "epoch": 0.16153846153846155, "grad_norm": 2.1891043186187744, "learning_rate": 8.403846153846154e-05, "loss": 0.0505, "step": 84 }, { "epoch": 0.16153846153846155, "eval_loss": 0.026021044701337814, "eval_runtime": 181.6905, "eval_samples_per_second": 253.844, "eval_steps_per_second": 1.987, "step": 84 }, { "epoch": 0.16346153846153846, "grad_norm": 1.2925877571105957, "learning_rate": 8.384615384615386e-05, "loss": 0.0465, "step": 85 }, { "epoch": 0.16346153846153846, "eval_loss": 0.029900116845965385, "eval_runtime": 183.3081, "eval_samples_per_second": 251.604, "eval_steps_per_second": 1.969, "step": 85 }, { "epoch": 0.16538461538461538, "grad_norm": 2.1117990016937256, "learning_rate": 8.365384615384616e-05, "loss": 0.0444, "step": 86 }, { "epoch": 0.16538461538461538, "eval_loss": 0.03299647569656372, "eval_runtime": 182.6014, "eval_samples_per_second": 252.578, "eval_steps_per_second": 1.977, "step": 86 }, { "epoch": 0.1673076923076923, "grad_norm": 2.243436813354492, "learning_rate": 8.346153846153847e-05, "loss": 0.0571, "step": 87 }, { "epoch": 0.1673076923076923, "eval_loss": 0.034737322479486465, "eval_runtime": 181.6185, "eval_samples_per_second": 253.944, "eval_steps_per_second": 1.988, "step": 87 }, { "epoch": 0.16923076923076924, "grad_norm": 2.0375781059265137, "learning_rate": 8.326923076923078e-05, "loss": 0.0529, "step": 88 }, { "epoch": 0.16923076923076924, "eval_loss": 0.033401209861040115, "eval_runtime": 182.5847, "eval_samples_per_second": 252.601, "eval_steps_per_second": 1.977, "step": 88 }, { "epoch": 0.17115384615384616, "grad_norm": 1.781933307647705, "learning_rate": 8.307692307692309e-05, "loss": 0.0823, "step": 89 }, { "epoch": 0.17115384615384616, "eval_loss": 0.030983150005340576, "eval_runtime": 182.1974, "eval_samples_per_second": 253.137, "eval_steps_per_second": 1.981, "step": 89 }, { "epoch": 0.17307692307692307, "grad_norm": 1.0016299486160278, "learning_rate": 8.288461538461539e-05, "loss": 0.0234, "step": 90 }, { "epoch": 0.17307692307692307, "eval_loss": 0.02751440368592739, "eval_runtime": 182.5732, "eval_samples_per_second": 252.616, "eval_steps_per_second": 1.977, "step": 90 }, { "epoch": 0.175, "grad_norm": 2.1391496658325195, "learning_rate": 8.26923076923077e-05, "loss": 0.0802, "step": 91 }, { "epoch": 0.175, "eval_loss": 0.025574011728167534, "eval_runtime": 181.6703, "eval_samples_per_second": 253.872, "eval_steps_per_second": 1.987, "step": 91 }, { "epoch": 0.17692307692307693, "grad_norm": 1.8795677423477173, "learning_rate": 8.25e-05, "loss": 0.041, "step": 92 }, { "epoch": 0.17692307692307693, "eval_loss": 0.02361590415239334, "eval_runtime": 181.4712, "eval_samples_per_second": 254.151, "eval_steps_per_second": 1.989, "step": 92 }, { "epoch": 0.17884615384615385, "grad_norm": 2.376096487045288, "learning_rate": 8.23076923076923e-05, "loss": 0.0387, "step": 93 }, { "epoch": 0.17884615384615385, "eval_loss": 0.022694583982229233, "eval_runtime": 181.0427, "eval_samples_per_second": 254.752, "eval_steps_per_second": 1.994, "step": 93 }, { "epoch": 0.18076923076923077, "grad_norm": 2.218397855758667, "learning_rate": 8.211538461538462e-05, "loss": 0.0409, "step": 94 }, { "epoch": 0.18076923076923077, "eval_loss": 0.02242557518184185, "eval_runtime": 182.2902, "eval_samples_per_second": 253.009, "eval_steps_per_second": 1.98, "step": 94 }, { "epoch": 0.18269230769230768, "grad_norm": 2.596670150756836, "learning_rate": 8.192307692307693e-05, "loss": 0.0595, "step": 95 }, { "epoch": 0.18269230769230768, "eval_loss": 0.02277774177491665, "eval_runtime": 182.0412, "eval_samples_per_second": 253.355, "eval_steps_per_second": 1.983, "step": 95 }, { "epoch": 0.18461538461538463, "grad_norm": 1.311848759651184, "learning_rate": 8.173076923076923e-05, "loss": 0.0389, "step": 96 }, { "epoch": 0.18461538461538463, "eval_loss": 0.024142242968082428, "eval_runtime": 181.193, "eval_samples_per_second": 254.541, "eval_steps_per_second": 1.992, "step": 96 }, { "epoch": 0.18653846153846154, "grad_norm": 0.9079859256744385, "learning_rate": 8.153846153846155e-05, "loss": 0.017, "step": 97 }, { "epoch": 0.18653846153846154, "eval_loss": 0.027621854096651077, "eval_runtime": 181.9685, "eval_samples_per_second": 253.456, "eval_steps_per_second": 1.984, "step": 97 }, { "epoch": 0.18846153846153846, "grad_norm": 1.0217198133468628, "learning_rate": 8.134615384615385e-05, "loss": 0.0186, "step": 98 }, { "epoch": 0.18846153846153846, "eval_loss": 0.03133901208639145, "eval_runtime": 181.8849, "eval_samples_per_second": 253.572, "eval_steps_per_second": 1.985, "step": 98 }, { "epoch": 0.19038461538461537, "grad_norm": 1.6041982173919678, "learning_rate": 8.115384615384616e-05, "loss": 0.0528, "step": 99 }, { "epoch": 0.19038461538461537, "eval_loss": 0.03536279872059822, "eval_runtime": 181.8467, "eval_samples_per_second": 253.626, "eval_steps_per_second": 1.985, "step": 99 }, { "epoch": 0.19230769230769232, "grad_norm": 2.190931558609009, "learning_rate": 8.096153846153847e-05, "loss": 0.0483, "step": 100 }, { "epoch": 0.19230769230769232, "eval_loss": 0.03583548963069916, "eval_runtime": 181.8924, "eval_samples_per_second": 253.562, "eval_steps_per_second": 1.985, "step": 100 }, { "epoch": 0.19423076923076923, "grad_norm": 1.281062126159668, "learning_rate": 8.076923076923078e-05, "loss": 0.021, "step": 101 }, { "epoch": 0.19423076923076923, "eval_loss": 0.03581365570425987, "eval_runtime": 181.2306, "eval_samples_per_second": 254.488, "eval_steps_per_second": 1.992, "step": 101 }, { "epoch": 0.19615384615384615, "grad_norm": 1.3938978910446167, "learning_rate": 8.057692307692308e-05, "loss": 0.0326, "step": 102 }, { "epoch": 0.19615384615384615, "eval_loss": 0.03531914949417114, "eval_runtime": 182.1448, "eval_samples_per_second": 253.211, "eval_steps_per_second": 1.982, "step": 102 }, { "epoch": 0.19807692307692307, "grad_norm": 3.451387643814087, "learning_rate": 8.038461538461538e-05, "loss": 0.1205, "step": 103 }, { "epoch": 0.19807692307692307, "eval_loss": 0.03354247659444809, "eval_runtime": 182.8587, "eval_samples_per_second": 252.222, "eval_steps_per_second": 1.974, "step": 103 }, { "epoch": 0.2, "grad_norm": 1.6962478160858154, "learning_rate": 8.01923076923077e-05, "loss": 0.0423, "step": 104 }, { "epoch": 0.2, "eval_loss": 0.031101541593670845, "eval_runtime": 183.5946, "eval_samples_per_second": 251.211, "eval_steps_per_second": 1.966, "step": 104 }, { "epoch": 0.20192307692307693, "grad_norm": 1.4025405645370483, "learning_rate": 8e-05, "loss": 0.0895, "step": 105 }, { "epoch": 0.20192307692307693, "eval_loss": 0.02922738529741764, "eval_runtime": 185.2005, "eval_samples_per_second": 249.033, "eval_steps_per_second": 1.949, "step": 105 }, { "epoch": 0.20384615384615384, "grad_norm": 1.1363201141357422, "learning_rate": 7.980769230769231e-05, "loss": 0.0254, "step": 106 }, { "epoch": 0.20384615384615384, "eval_loss": 0.027069460600614548, "eval_runtime": 183.2009, "eval_samples_per_second": 251.751, "eval_steps_per_second": 1.971, "step": 106 }, { "epoch": 0.20576923076923076, "grad_norm": 0.6673398613929749, "learning_rate": 7.961538461538461e-05, "loss": 0.0149, "step": 107 }, { "epoch": 0.20576923076923076, "eval_loss": 0.025280024856328964, "eval_runtime": 182.5339, "eval_samples_per_second": 252.671, "eval_steps_per_second": 1.978, "step": 107 }, { "epoch": 0.2076923076923077, "grad_norm": 2.2569580078125, "learning_rate": 7.942307692307692e-05, "loss": 0.0917, "step": 108 }, { "epoch": 0.2076923076923077, "eval_loss": 0.024966858327388763, "eval_runtime": 183.4982, "eval_samples_per_second": 251.343, "eval_steps_per_second": 1.967, "step": 108 }, { "epoch": 0.20961538461538462, "grad_norm": 2.1165764331817627, "learning_rate": 7.923076923076924e-05, "loss": 0.0888, "step": 109 }, { "epoch": 0.20961538461538462, "eval_loss": 0.02572954259812832, "eval_runtime": 182.2057, "eval_samples_per_second": 253.126, "eval_steps_per_second": 1.981, "step": 109 }, { "epoch": 0.21153846153846154, "grad_norm": 1.5322438478469849, "learning_rate": 7.903846153846154e-05, "loss": 0.0378, "step": 110 }, { "epoch": 0.21153846153846154, "eval_loss": 0.027774129062891006, "eval_runtime": 183.4476, "eval_samples_per_second": 251.412, "eval_steps_per_second": 1.968, "step": 110 }, { "epoch": 0.21346153846153845, "grad_norm": 1.5025537014007568, "learning_rate": 7.884615384615384e-05, "loss": 0.089, "step": 111 }, { "epoch": 0.21346153846153845, "eval_loss": 0.03216475620865822, "eval_runtime": 182.6688, "eval_samples_per_second": 252.484, "eval_steps_per_second": 1.976, "step": 111 }, { "epoch": 0.2153846153846154, "grad_norm": 1.4053198099136353, "learning_rate": 7.865384615384616e-05, "loss": 0.0632, "step": 112 }, { "epoch": 0.2153846153846154, "eval_loss": 0.03782927617430687, "eval_runtime": 183.2384, "eval_samples_per_second": 251.699, "eval_steps_per_second": 1.97, "step": 112 }, { "epoch": 0.2173076923076923, "grad_norm": 5.42258882522583, "learning_rate": 7.846153846153847e-05, "loss": 0.1271, "step": 113 }, { "epoch": 0.2173076923076923, "eval_loss": 0.040466830134391785, "eval_runtime": 181.9963, "eval_samples_per_second": 253.417, "eval_steps_per_second": 1.984, "step": 113 }, { "epoch": 0.21923076923076923, "grad_norm": 2.1966440677642822, "learning_rate": 7.826923076923077e-05, "loss": 0.0392, "step": 114 }, { "epoch": 0.21923076923076923, "eval_loss": 0.03886817768216133, "eval_runtime": 182.0201, "eval_samples_per_second": 253.384, "eval_steps_per_second": 1.983, "step": 114 }, { "epoch": 0.22115384615384615, "grad_norm": 1.9059343338012695, "learning_rate": 7.807692307692307e-05, "loss": 0.026, "step": 115 }, { "epoch": 0.22115384615384615, "eval_loss": 0.034843236207962036, "eval_runtime": 182.0596, "eval_samples_per_second": 253.329, "eval_steps_per_second": 1.983, "step": 115 }, { "epoch": 0.2230769230769231, "grad_norm": 3.3110170364379883, "learning_rate": 7.788461538461539e-05, "loss": 0.0732, "step": 116 }, { "epoch": 0.2230769230769231, "eval_loss": 0.03086087852716446, "eval_runtime": 181.1715, "eval_samples_per_second": 254.571, "eval_steps_per_second": 1.993, "step": 116 }, { "epoch": 0.225, "grad_norm": 0.64942467212677, "learning_rate": 7.76923076923077e-05, "loss": 0.0175, "step": 117 }, { "epoch": 0.225, "eval_loss": 0.028490141034126282, "eval_runtime": 181.1702, "eval_samples_per_second": 254.573, "eval_steps_per_second": 1.993, "step": 117 }, { "epoch": 0.22692307692307692, "grad_norm": 1.8217730522155762, "learning_rate": 7.75e-05, "loss": 0.0407, "step": 118 }, { "epoch": 0.22692307692307692, "eval_loss": 0.026924345642328262, "eval_runtime": 181.6397, "eval_samples_per_second": 253.915, "eval_steps_per_second": 1.987, "step": 118 }, { "epoch": 0.22884615384615384, "grad_norm": 5.165238380432129, "learning_rate": 7.730769230769232e-05, "loss": 0.1066, "step": 119 }, { "epoch": 0.22884615384615384, "eval_loss": 0.026207981631159782, "eval_runtime": 181.6267, "eval_samples_per_second": 253.933, "eval_steps_per_second": 1.988, "step": 119 }, { "epoch": 0.23076923076923078, "grad_norm": 4.6868977546691895, "learning_rate": 7.711538461538462e-05, "loss": 0.0843, "step": 120 }, { "epoch": 0.23076923076923078, "eval_loss": 0.02519201673567295, "eval_runtime": 181.6069, "eval_samples_per_second": 253.961, "eval_steps_per_second": 1.988, "step": 120 }, { "epoch": 0.2326923076923077, "grad_norm": 3.41184139251709, "learning_rate": 7.692307692307693e-05, "loss": 0.0988, "step": 121 }, { "epoch": 0.2326923076923077, "eval_loss": 0.023151233792304993, "eval_runtime": 181.4734, "eval_samples_per_second": 254.147, "eval_steps_per_second": 1.989, "step": 121 }, { "epoch": 0.23461538461538461, "grad_norm": 4.652563095092773, "learning_rate": 7.673076923076924e-05, "loss": 0.1316, "step": 122 }, { "epoch": 0.23461538461538461, "eval_loss": 0.02223026566207409, "eval_runtime": 181.4249, "eval_samples_per_second": 254.215, "eval_steps_per_second": 1.99, "step": 122 }, { "epoch": 0.23653846153846153, "grad_norm": 3.011662721633911, "learning_rate": 7.653846153846153e-05, "loss": 0.0805, "step": 123 }, { "epoch": 0.23653846153846153, "eval_loss": 0.0226932130753994, "eval_runtime": 182.1195, "eval_samples_per_second": 253.246, "eval_steps_per_second": 1.982, "step": 123 }, { "epoch": 0.23846153846153847, "grad_norm": 1.1616426706314087, "learning_rate": 7.634615384615385e-05, "loss": 0.0182, "step": 124 }, { "epoch": 0.23846153846153847, "eval_loss": 0.024565137922763824, "eval_runtime": 182.4427, "eval_samples_per_second": 252.797, "eval_steps_per_second": 1.979, "step": 124 }, { "epoch": 0.2403846153846154, "grad_norm": 0.7270947098731995, "learning_rate": 7.615384615384616e-05, "loss": 0.0133, "step": 125 }, { "epoch": 0.2403846153846154, "eval_loss": 0.026539519429206848, "eval_runtime": 183.1212, "eval_samples_per_second": 251.861, "eval_steps_per_second": 1.971, "step": 125 }, { "epoch": 0.2423076923076923, "grad_norm": 1.8908319473266602, "learning_rate": 7.596153846153846e-05, "loss": 0.0293, "step": 126 }, { "epoch": 0.2423076923076923, "eval_loss": 0.028904786333441734, "eval_runtime": 181.1513, "eval_samples_per_second": 254.599, "eval_steps_per_second": 1.993, "step": 126 }, { "epoch": 0.24423076923076922, "grad_norm": 1.1141525506973267, "learning_rate": 7.576923076923076e-05, "loss": 0.0208, "step": 127 }, { "epoch": 0.24423076923076922, "eval_loss": 0.030000800266861916, "eval_runtime": 181.2927, "eval_samples_per_second": 254.401, "eval_steps_per_second": 1.991, "step": 127 }, { "epoch": 0.24615384615384617, "grad_norm": 1.4708863496780396, "learning_rate": 7.557692307692308e-05, "loss": 0.033, "step": 128 }, { "epoch": 0.24615384615384617, "eval_loss": 0.03249819576740265, "eval_runtime": 181.507, "eval_samples_per_second": 254.1, "eval_steps_per_second": 1.989, "step": 128 }, { "epoch": 0.24807692307692308, "grad_norm": 3.061389684677124, "learning_rate": 7.538461538461539e-05, "loss": 0.0634, "step": 129 }, { "epoch": 0.24807692307692308, "eval_loss": 0.03363075479865074, "eval_runtime": 182.3989, "eval_samples_per_second": 252.858, "eval_steps_per_second": 1.979, "step": 129 }, { "epoch": 0.25, "grad_norm": 2.663599729537964, "learning_rate": 7.519230769230769e-05, "loss": 0.0414, "step": 130 }, { "epoch": 0.25, "eval_loss": 0.03339559584856033, "eval_runtime": 182.3749, "eval_samples_per_second": 252.891, "eval_steps_per_second": 1.979, "step": 130 }, { "epoch": 0.2519230769230769, "grad_norm": 0.3709014058113098, "learning_rate": 7.500000000000001e-05, "loss": 0.0081, "step": 131 }, { "epoch": 0.2519230769230769, "eval_loss": 0.03324908763170242, "eval_runtime": 181.6302, "eval_samples_per_second": 253.928, "eval_steps_per_second": 1.988, "step": 131 }, { "epoch": 0.25384615384615383, "grad_norm": 2.106776475906372, "learning_rate": 7.480769230769231e-05, "loss": 0.0534, "step": 132 }, { "epoch": 0.25384615384615383, "eval_loss": 0.03283924236893654, "eval_runtime": 182.1872, "eval_samples_per_second": 253.152, "eval_steps_per_second": 1.981, "step": 132 }, { "epoch": 0.25576923076923075, "grad_norm": 3.1101605892181396, "learning_rate": 7.461538461538462e-05, "loss": 0.1828, "step": 133 }, { "epoch": 0.25576923076923075, "eval_loss": 0.03169732913374901, "eval_runtime": 181.8198, "eval_samples_per_second": 253.663, "eval_steps_per_second": 1.985, "step": 133 }, { "epoch": 0.25769230769230766, "grad_norm": 3.3294451236724854, "learning_rate": 7.442307692307693e-05, "loss": 0.0441, "step": 134 }, { "epoch": 0.25769230769230766, "eval_loss": 0.031509336084127426, "eval_runtime": 181.3532, "eval_samples_per_second": 254.316, "eval_steps_per_second": 1.991, "step": 134 }, { "epoch": 0.25961538461538464, "grad_norm": 0.6319352984428406, "learning_rate": 7.423076923076924e-05, "loss": 0.0126, "step": 135 }, { "epoch": 0.25961538461538464, "eval_loss": 0.03166291490197182, "eval_runtime": 180.5095, "eval_samples_per_second": 255.505, "eval_steps_per_second": 2.0, "step": 135 }, { "epoch": 0.26153846153846155, "grad_norm": 3.598060131072998, "learning_rate": 7.403846153846154e-05, "loss": 0.0637, "step": 136 }, { "epoch": 0.26153846153846155, "eval_loss": 0.032380297780036926, "eval_runtime": 180.7907, "eval_samples_per_second": 255.107, "eval_steps_per_second": 1.997, "step": 136 }, { "epoch": 0.26346153846153847, "grad_norm": 2.8882503509521484, "learning_rate": 7.384615384615386e-05, "loss": 0.0655, "step": 137 }, { "epoch": 0.26346153846153847, "eval_loss": 0.03351568803191185, "eval_runtime": 181.8359, "eval_samples_per_second": 253.641, "eval_steps_per_second": 1.985, "step": 137 }, { "epoch": 0.2653846153846154, "grad_norm": 1.519407868385315, "learning_rate": 7.365384615384616e-05, "loss": 0.0199, "step": 138 }, { "epoch": 0.2653846153846154, "eval_loss": 0.03430071100592613, "eval_runtime": 181.9857, "eval_samples_per_second": 253.432, "eval_steps_per_second": 1.984, "step": 138 }, { "epoch": 0.2673076923076923, "grad_norm": 0.8318182229995728, "learning_rate": 7.346153846153847e-05, "loss": 0.0158, "step": 139 }, { "epoch": 0.2673076923076923, "eval_loss": 0.03548838198184967, "eval_runtime": 180.9902, "eval_samples_per_second": 254.826, "eval_steps_per_second": 1.995, "step": 139 }, { "epoch": 0.2692307692307692, "grad_norm": 2.658010721206665, "learning_rate": 7.326923076923077e-05, "loss": 0.0582, "step": 140 }, { "epoch": 0.2692307692307692, "eval_loss": 0.03817412257194519, "eval_runtime": 181.1846, "eval_samples_per_second": 254.553, "eval_steps_per_second": 1.992, "step": 140 }, { "epoch": 0.27115384615384613, "grad_norm": 3.6118085384368896, "learning_rate": 7.307692307692307e-05, "loss": 0.0506, "step": 141 }, { "epoch": 0.27115384615384613, "eval_loss": 0.03875559940934181, "eval_runtime": 182.2632, "eval_samples_per_second": 253.046, "eval_steps_per_second": 1.981, "step": 141 }, { "epoch": 0.27307692307692305, "grad_norm": 1.4768801927566528, "learning_rate": 7.288461538461538e-05, "loss": 0.0823, "step": 142 }, { "epoch": 0.27307692307692305, "eval_loss": 0.03805309161543846, "eval_runtime": 181.8144, "eval_samples_per_second": 253.671, "eval_steps_per_second": 1.986, "step": 142 }, { "epoch": 0.275, "grad_norm": 4.359960079193115, "learning_rate": 7.26923076923077e-05, "loss": 0.0601, "step": 143 }, { "epoch": 0.275, "eval_loss": 0.03436999395489693, "eval_runtime": 182.7585, "eval_samples_per_second": 252.36, "eval_steps_per_second": 1.975, "step": 143 }, { "epoch": 0.27692307692307694, "grad_norm": 3.49018931388855, "learning_rate": 7.25e-05, "loss": 0.0697, "step": 144 }, { "epoch": 0.27692307692307694, "eval_loss": 0.029680771753191948, "eval_runtime": 182.5284, "eval_samples_per_second": 252.678, "eval_steps_per_second": 1.978, "step": 144 }, { "epoch": 0.27884615384615385, "grad_norm": 1.6799707412719727, "learning_rate": 7.23076923076923e-05, "loss": 0.047, "step": 145 }, { "epoch": 0.27884615384615385, "eval_loss": 0.027619585394859314, "eval_runtime": 181.8266, "eval_samples_per_second": 253.654, "eval_steps_per_second": 1.985, "step": 145 }, { "epoch": 0.28076923076923077, "grad_norm": 1.9224464893341064, "learning_rate": 7.211538461538462e-05, "loss": 0.0265, "step": 146 }, { "epoch": 0.28076923076923077, "eval_loss": 0.025914136320352554, "eval_runtime": 181.3461, "eval_samples_per_second": 254.326, "eval_steps_per_second": 1.991, "step": 146 }, { "epoch": 0.2826923076923077, "grad_norm": 1.8353599309921265, "learning_rate": 7.192307692307693e-05, "loss": 0.0265, "step": 147 }, { "epoch": 0.2826923076923077, "eval_loss": 0.024255190044641495, "eval_runtime": 180.3297, "eval_samples_per_second": 255.759, "eval_steps_per_second": 2.002, "step": 147 }, { "epoch": 0.2846153846153846, "grad_norm": 1.9630978107452393, "learning_rate": 7.173076923076923e-05, "loss": 0.0514, "step": 148 }, { "epoch": 0.2846153846153846, "eval_loss": 0.023349367082118988, "eval_runtime": 181.2689, "eval_samples_per_second": 254.434, "eval_steps_per_second": 1.992, "step": 148 }, { "epoch": 0.2865384615384615, "grad_norm": 1.8774313926696777, "learning_rate": 7.153846153846155e-05, "loss": 0.0339, "step": 149 }, { "epoch": 0.2865384615384615, "eval_loss": 0.022075794637203217, "eval_runtime": 182.4232, "eval_samples_per_second": 252.824, "eval_steps_per_second": 1.979, "step": 149 }, { "epoch": 0.28846153846153844, "grad_norm": 0.6925719380378723, "learning_rate": 7.134615384615385e-05, "loss": 0.013, "step": 150 }, { "epoch": 0.28846153846153844, "eval_loss": 0.021298719570040703, "eval_runtime": 182.5149, "eval_samples_per_second": 252.697, "eval_steps_per_second": 1.978, "step": 150 }, { "epoch": 0.2903846153846154, "grad_norm": 1.8816715478897095, "learning_rate": 7.115384615384616e-05, "loss": 0.0645, "step": 151 }, { "epoch": 0.2903846153846154, "eval_loss": 0.021134961396455765, "eval_runtime": 182.282, "eval_samples_per_second": 253.02, "eval_steps_per_second": 1.98, "step": 151 }, { "epoch": 0.2923076923076923, "grad_norm": 0.7385954260826111, "learning_rate": 7.096153846153847e-05, "loss": 0.0175, "step": 152 }, { "epoch": 0.2923076923076923, "eval_loss": 0.020952800288796425, "eval_runtime": 181.8302, "eval_samples_per_second": 253.649, "eval_steps_per_second": 1.985, "step": 152 }, { "epoch": 0.29423076923076924, "grad_norm": 1.84195077419281, "learning_rate": 7.076923076923078e-05, "loss": 0.0512, "step": 153 }, { "epoch": 0.29423076923076924, "eval_loss": 0.020642004907131195, "eval_runtime": 181.2698, "eval_samples_per_second": 254.433, "eval_steps_per_second": 1.992, "step": 153 }, { "epoch": 0.29615384615384616, "grad_norm": 0.2837388813495636, "learning_rate": 7.057692307692308e-05, "loss": 0.0073, "step": 154 }, { "epoch": 0.29615384615384616, "eval_loss": 0.020667677745223045, "eval_runtime": 181.7385, "eval_samples_per_second": 253.777, "eval_steps_per_second": 1.986, "step": 154 }, { "epoch": 0.2980769230769231, "grad_norm": 2.8022420406341553, "learning_rate": 7.03846153846154e-05, "loss": 0.0436, "step": 155 }, { "epoch": 0.2980769230769231, "eval_loss": 0.02192995697259903, "eval_runtime": 180.8781, "eval_samples_per_second": 254.984, "eval_steps_per_second": 1.996, "step": 155 }, { "epoch": 0.3, "grad_norm": 1.9743694067001343, "learning_rate": 7.019230769230769e-05, "loss": 0.0238, "step": 156 }, { "epoch": 0.3, "eval_loss": 0.02315637096762657, "eval_runtime": 182.1362, "eval_samples_per_second": 253.223, "eval_steps_per_second": 1.982, "step": 156 }, { "epoch": 0.3019230769230769, "grad_norm": 2.248816967010498, "learning_rate": 7e-05, "loss": 0.08, "step": 157 }, { "epoch": 0.3019230769230769, "eval_loss": 0.02545558102428913, "eval_runtime": 181.4173, "eval_samples_per_second": 254.226, "eval_steps_per_second": 1.99, "step": 157 }, { "epoch": 0.3038461538461538, "grad_norm": 2.220722198486328, "learning_rate": 6.980769230769231e-05, "loss": 0.0797, "step": 158 }, { "epoch": 0.3038461538461538, "eval_loss": 0.02986188232898712, "eval_runtime": 180.9052, "eval_samples_per_second": 254.946, "eval_steps_per_second": 1.996, "step": 158 }, { "epoch": 0.3057692307692308, "grad_norm": 3.1965675354003906, "learning_rate": 6.961538461538462e-05, "loss": 0.0801, "step": 159 }, { "epoch": 0.3057692307692308, "eval_loss": 0.03733256086707115, "eval_runtime": 181.8686, "eval_samples_per_second": 253.595, "eval_steps_per_second": 1.985, "step": 159 }, { "epoch": 0.3076923076923077, "grad_norm": 3.019501209259033, "learning_rate": 6.942307692307692e-05, "loss": 0.0599, "step": 160 }, { "epoch": 0.3076923076923077, "eval_loss": 0.04430044814944267, "eval_runtime": 182.2711, "eval_samples_per_second": 253.035, "eval_steps_per_second": 1.981, "step": 160 }, { "epoch": 0.3096153846153846, "grad_norm": 3.3681864738464355, "learning_rate": 6.923076923076924e-05, "loss": 0.0276, "step": 161 }, { "epoch": 0.3096153846153846, "eval_loss": 0.04622860625386238, "eval_runtime": 182.9267, "eval_samples_per_second": 252.128, "eval_steps_per_second": 1.973, "step": 161 }, { "epoch": 0.31153846153846154, "grad_norm": 1.2851427793502808, "learning_rate": 6.903846153846154e-05, "loss": 0.0426, "step": 162 }, { "epoch": 0.31153846153846154, "eval_loss": 0.04630015045404434, "eval_runtime": 181.3811, "eval_samples_per_second": 254.277, "eval_steps_per_second": 1.99, "step": 162 }, { "epoch": 0.31346153846153846, "grad_norm": 3.0502898693084717, "learning_rate": 6.884615384615385e-05, "loss": 0.045, "step": 163 }, { "epoch": 0.31346153846153846, "eval_loss": 0.04202108457684517, "eval_runtime": 181.4928, "eval_samples_per_second": 254.12, "eval_steps_per_second": 1.989, "step": 163 }, { "epoch": 0.3153846153846154, "grad_norm": 3.149498462677002, "learning_rate": 6.865384615384616e-05, "loss": 0.0708, "step": 164 }, { "epoch": 0.3153846153846154, "eval_loss": 0.03506983816623688, "eval_runtime": 181.8089, "eval_samples_per_second": 253.678, "eval_steps_per_second": 1.986, "step": 164 }, { "epoch": 0.3173076923076923, "grad_norm": 1.2693160772323608, "learning_rate": 6.846153846153847e-05, "loss": 0.0456, "step": 165 }, { "epoch": 0.3173076923076923, "eval_loss": 0.028076015412807465, "eval_runtime": 182.1103, "eval_samples_per_second": 253.259, "eval_steps_per_second": 1.982, "step": 165 }, { "epoch": 0.3192307692307692, "grad_norm": 1.5411460399627686, "learning_rate": 6.826923076923077e-05, "loss": 0.0261, "step": 166 }, { "epoch": 0.3192307692307692, "eval_loss": 0.024974722415208817, "eval_runtime": 181.8849, "eval_samples_per_second": 253.572, "eval_steps_per_second": 1.985, "step": 166 }, { "epoch": 0.3211538461538462, "grad_norm": 2.7267351150512695, "learning_rate": 6.807692307692309e-05, "loss": 0.0427, "step": 167 }, { "epoch": 0.3211538461538462, "eval_loss": 0.022312704473733902, "eval_runtime": 181.8135, "eval_samples_per_second": 253.672, "eval_steps_per_second": 1.986, "step": 167 }, { "epoch": 0.3230769230769231, "grad_norm": 1.551154613494873, "learning_rate": 6.788461538461539e-05, "loss": 0.038, "step": 168 }, { "epoch": 0.3230769230769231, "eval_loss": 0.020363658666610718, "eval_runtime": 180.8456, "eval_samples_per_second": 255.03, "eval_steps_per_second": 1.996, "step": 168 }, { "epoch": 0.325, "grad_norm": 1.9637490510940552, "learning_rate": 6.76923076923077e-05, "loss": 0.0275, "step": 169 }, { "epoch": 0.325, "eval_loss": 0.019464140757918358, "eval_runtime": 181.1515, "eval_samples_per_second": 254.599, "eval_steps_per_second": 1.993, "step": 169 }, { "epoch": 0.3269230769230769, "grad_norm": 0.9584169983863831, "learning_rate": 6.750000000000001e-05, "loss": 0.0351, "step": 170 }, { "epoch": 0.3269230769230769, "eval_loss": 0.018735043704509735, "eval_runtime": 182.4595, "eval_samples_per_second": 252.774, "eval_steps_per_second": 1.979, "step": 170 }, { "epoch": 0.32884615384615384, "grad_norm": 2.4790542125701904, "learning_rate": 6.730769230769232e-05, "loss": 0.0812, "step": 171 }, { "epoch": 0.32884615384615384, "eval_loss": 0.01843821443617344, "eval_runtime": 184.1966, "eval_samples_per_second": 250.39, "eval_steps_per_second": 1.96, "step": 171 }, { "epoch": 0.33076923076923076, "grad_norm": 1.239414095878601, "learning_rate": 6.711538461538461e-05, "loss": 0.0338, "step": 172 }, { "epoch": 0.33076923076923076, "eval_loss": 0.018382636830210686, "eval_runtime": 182.3178, "eval_samples_per_second": 252.97, "eval_steps_per_second": 1.98, "step": 172 }, { "epoch": 0.3326923076923077, "grad_norm": 2.3932952880859375, "learning_rate": 6.692307692307693e-05, "loss": 0.0716, "step": 173 }, { "epoch": 0.3326923076923077, "eval_loss": 0.018522335216403008, "eval_runtime": 182.6222, "eval_samples_per_second": 252.549, "eval_steps_per_second": 1.977, "step": 173 }, { "epoch": 0.3346153846153846, "grad_norm": 3.787052869796753, "learning_rate": 6.673076923076923e-05, "loss": 0.0869, "step": 174 }, { "epoch": 0.3346153846153846, "eval_loss": 0.019300226122140884, "eval_runtime": 181.3346, "eval_samples_per_second": 254.342, "eval_steps_per_second": 1.991, "step": 174 }, { "epoch": 0.33653846153846156, "grad_norm": 1.6532280445098877, "learning_rate": 6.653846153846153e-05, "loss": 0.0501, "step": 175 }, { "epoch": 0.33653846153846156, "eval_loss": 0.021606482565402985, "eval_runtime": 182.1641, "eval_samples_per_second": 253.184, "eval_steps_per_second": 1.982, "step": 175 }, { "epoch": 0.3384615384615385, "grad_norm": 5.240301132202148, "learning_rate": 6.634615384615385e-05, "loss": 0.0421, "step": 176 }, { "epoch": 0.3384615384615385, "eval_loss": 0.02588193118572235, "eval_runtime": 182.0323, "eval_samples_per_second": 253.367, "eval_steps_per_second": 1.983, "step": 176 }, { "epoch": 0.3403846153846154, "grad_norm": 2.0743260383605957, "learning_rate": 6.615384615384616e-05, "loss": 0.0653, "step": 177 }, { "epoch": 0.3403846153846154, "eval_loss": 0.031672630459070206, "eval_runtime": 182.8966, "eval_samples_per_second": 252.17, "eval_steps_per_second": 1.974, "step": 177 }, { "epoch": 0.3423076923076923, "grad_norm": 2.0226855278015137, "learning_rate": 6.596153846153846e-05, "loss": 0.0776, "step": 178 }, { "epoch": 0.3423076923076923, "eval_loss": 0.03493071347475052, "eval_runtime": 179.3302, "eval_samples_per_second": 257.185, "eval_steps_per_second": 2.013, "step": 178 }, { "epoch": 0.34423076923076923, "grad_norm": 1.797101378440857, "learning_rate": 6.576923076923078e-05, "loss": 0.0338, "step": 179 }, { "epoch": 0.34423076923076923, "eval_loss": 0.03656415641307831, "eval_runtime": 178.9522, "eval_samples_per_second": 257.728, "eval_steps_per_second": 2.017, "step": 179 }, { "epoch": 0.34615384615384615, "grad_norm": 3.3137032985687256, "learning_rate": 6.557692307692308e-05, "loss": 0.0469, "step": 180 }, { "epoch": 0.34615384615384615, "eval_loss": 0.03430218622088432, "eval_runtime": 179.502, "eval_samples_per_second": 256.939, "eval_steps_per_second": 2.011, "step": 180 }, { "epoch": 0.34807692307692306, "grad_norm": 0.8624777793884277, "learning_rate": 6.538461538461539e-05, "loss": 0.0218, "step": 181 }, { "epoch": 0.34807692307692306, "eval_loss": 0.029889389872550964, "eval_runtime": 179.85, "eval_samples_per_second": 256.441, "eval_steps_per_second": 2.007, "step": 181 }, { "epoch": 0.35, "grad_norm": 1.7499721050262451, "learning_rate": 6.519230769230769e-05, "loss": 0.0426, "step": 182 }, { "epoch": 0.35, "eval_loss": 0.02500992640852928, "eval_runtime": 179.0168, "eval_samples_per_second": 257.635, "eval_steps_per_second": 2.017, "step": 182 }, { "epoch": 0.35192307692307695, "grad_norm": 0.598138689994812, "learning_rate": 6.500000000000001e-05, "loss": 0.0166, "step": 183 }, { "epoch": 0.35192307692307695, "eval_loss": 0.021099161356687546, "eval_runtime": 179.3361, "eval_samples_per_second": 257.176, "eval_steps_per_second": 2.013, "step": 183 }, { "epoch": 0.35384615384615387, "grad_norm": 1.0341280698776245, "learning_rate": 6.480769230769231e-05, "loss": 0.0245, "step": 184 }, { "epoch": 0.35384615384615387, "eval_loss": 0.01857278123497963, "eval_runtime": 178.9082, "eval_samples_per_second": 257.791, "eval_steps_per_second": 2.018, "step": 184 }, { "epoch": 0.3557692307692308, "grad_norm": 0.775191605091095, "learning_rate": 6.461538461538462e-05, "loss": 0.0173, "step": 185 }, { "epoch": 0.3557692307692308, "eval_loss": 0.01758418418467045, "eval_runtime": 178.4482, "eval_samples_per_second": 258.456, "eval_steps_per_second": 2.023, "step": 185 }, { "epoch": 0.3576923076923077, "grad_norm": 1.1633071899414062, "learning_rate": 6.442307692307693e-05, "loss": 0.0204, "step": 186 }, { "epoch": 0.3576923076923077, "eval_loss": 0.017436116933822632, "eval_runtime": 177.889, "eval_samples_per_second": 259.268, "eval_steps_per_second": 2.029, "step": 186 }, { "epoch": 0.3596153846153846, "grad_norm": 2.2689030170440674, "learning_rate": 6.423076923076924e-05, "loss": 0.0402, "step": 187 }, { "epoch": 0.3596153846153846, "eval_loss": 0.017784688621759415, "eval_runtime": 179.0045, "eval_samples_per_second": 257.653, "eval_steps_per_second": 2.017, "step": 187 }, { "epoch": 0.36153846153846153, "grad_norm": 1.899922490119934, "learning_rate": 6.403846153846154e-05, "loss": 0.0392, "step": 188 }, { "epoch": 0.36153846153846153, "eval_loss": 0.018613914027810097, "eval_runtime": 178.4172, "eval_samples_per_second": 258.501, "eval_steps_per_second": 2.023, "step": 188 }, { "epoch": 0.36346153846153845, "grad_norm": 1.4108256101608276, "learning_rate": 6.384615384615385e-05, "loss": 0.0481, "step": 189 }, { "epoch": 0.36346153846153845, "eval_loss": 0.019706113263964653, "eval_runtime": 178.8412, "eval_samples_per_second": 257.888, "eval_steps_per_second": 2.019, "step": 189 }, { "epoch": 0.36538461538461536, "grad_norm": 2.3156585693359375, "learning_rate": 6.365384615384615e-05, "loss": 0.0475, "step": 190 }, { "epoch": 0.36538461538461536, "eval_loss": 0.02033383585512638, "eval_runtime": 179.2749, "eval_samples_per_second": 257.264, "eval_steps_per_second": 2.014, "step": 190 }, { "epoch": 0.36730769230769234, "grad_norm": 1.200846552848816, "learning_rate": 6.346153846153847e-05, "loss": 0.0364, "step": 191 }, { "epoch": 0.36730769230769234, "eval_loss": 0.02068951539695263, "eval_runtime": 179.023, "eval_samples_per_second": 257.626, "eval_steps_per_second": 2.017, "step": 191 }, { "epoch": 0.36923076923076925, "grad_norm": 0.5756196975708008, "learning_rate": 6.326923076923077e-05, "loss": 0.0139, "step": 192 }, { "epoch": 0.36923076923076925, "eval_loss": 0.020952697843313217, "eval_runtime": 178.3352, "eval_samples_per_second": 258.62, "eval_steps_per_second": 2.024, "step": 192 }, { "epoch": 0.37115384615384617, "grad_norm": 0.9504797458648682, "learning_rate": 6.307692307692308e-05, "loss": 0.012, "step": 193 }, { "epoch": 0.37115384615384617, "eval_loss": 0.021903619170188904, "eval_runtime": 179.2178, "eval_samples_per_second": 257.346, "eval_steps_per_second": 2.014, "step": 193 }, { "epoch": 0.3730769230769231, "grad_norm": 0.8241443634033203, "learning_rate": 6.288461538461538e-05, "loss": 0.0095, "step": 194 }, { "epoch": 0.3730769230769231, "eval_loss": 0.023630516603589058, "eval_runtime": 177.9821, "eval_samples_per_second": 259.133, "eval_steps_per_second": 2.028, "step": 194 }, { "epoch": 0.375, "grad_norm": 3.045523166656494, "learning_rate": 6.26923076923077e-05, "loss": 0.0798, "step": 195 }, { "epoch": 0.375, "eval_loss": 0.02465650625526905, "eval_runtime": 179.008, "eval_samples_per_second": 257.648, "eval_steps_per_second": 2.017, "step": 195 }, { "epoch": 0.3769230769230769, "grad_norm": 3.0229413509368896, "learning_rate": 6.25e-05, "loss": 0.0548, "step": 196 }, { "epoch": 0.3769230769230769, "eval_loss": 0.02465982548892498, "eval_runtime": 179.1942, "eval_samples_per_second": 257.38, "eval_steps_per_second": 2.015, "step": 196 }, { "epoch": 0.37884615384615383, "grad_norm": 1.4179023504257202, "learning_rate": 6.23076923076923e-05, "loss": 0.0456, "step": 197 }, { "epoch": 0.37884615384615383, "eval_loss": 0.02369517832994461, "eval_runtime": 179.1101, "eval_samples_per_second": 257.501, "eval_steps_per_second": 2.016, "step": 197 }, { "epoch": 0.38076923076923075, "grad_norm": 1.207181692123413, "learning_rate": 6.211538461538462e-05, "loss": 0.0167, "step": 198 }, { "epoch": 0.38076923076923075, "eval_loss": 0.02231222204864025, "eval_runtime": 178.044, "eval_samples_per_second": 259.043, "eval_steps_per_second": 2.028, "step": 198 }, { "epoch": 0.38269230769230766, "grad_norm": 0.838901937007904, "learning_rate": 6.192307692307693e-05, "loss": 0.0182, "step": 199 }, { "epoch": 0.38269230769230766, "eval_loss": 0.021893635392189026, "eval_runtime": 178.3295, "eval_samples_per_second": 258.628, "eval_steps_per_second": 2.024, "step": 199 }, { "epoch": 0.38461538461538464, "grad_norm": 0.957854151725769, "learning_rate": 6.173076923076923e-05, "loss": 0.0156, "step": 200 }, { "epoch": 0.38461538461538464, "eval_loss": 0.021150017157197, "eval_runtime": 178.9639, "eval_samples_per_second": 257.711, "eval_steps_per_second": 2.017, "step": 200 }, { "epoch": 0.38653846153846155, "grad_norm": 4.174772262573242, "learning_rate": 6.153846153846155e-05, "loss": 0.0678, "step": 201 }, { "epoch": 0.38653846153846155, "eval_loss": 0.02083815075457096, "eval_runtime": 179.5827, "eval_samples_per_second": 256.823, "eval_steps_per_second": 2.01, "step": 201 }, { "epoch": 0.38846153846153847, "grad_norm": 1.1510279178619385, "learning_rate": 6.134615384615385e-05, "loss": 0.0575, "step": 202 }, { "epoch": 0.38846153846153847, "eval_loss": 0.020570380613207817, "eval_runtime": 180.2019, "eval_samples_per_second": 255.941, "eval_steps_per_second": 2.003, "step": 202 }, { "epoch": 0.3903846153846154, "grad_norm": 1.705539345741272, "learning_rate": 6.115384615384616e-05, "loss": 0.0345, "step": 203 }, { "epoch": 0.3903846153846154, "eval_loss": 0.020152855664491653, "eval_runtime": 179.2612, "eval_samples_per_second": 257.284, "eval_steps_per_second": 2.014, "step": 203 }, { "epoch": 0.3923076923076923, "grad_norm": 3.794814348220825, "learning_rate": 6.096153846153847e-05, "loss": 0.1773, "step": 204 }, { "epoch": 0.3923076923076923, "eval_loss": 0.019848283380270004, "eval_runtime": 178.9587, "eval_samples_per_second": 257.719, "eval_steps_per_second": 2.017, "step": 204 }, { "epoch": 0.3942307692307692, "grad_norm": 2.470301866531372, "learning_rate": 6.0769230769230765e-05, "loss": 0.0211, "step": 205 }, { "epoch": 0.3942307692307692, "eval_loss": 0.019784526899456978, "eval_runtime": 179.1875, "eval_samples_per_second": 257.39, "eval_steps_per_second": 2.015, "step": 205 }, { "epoch": 0.39615384615384613, "grad_norm": 3.789724349975586, "learning_rate": 6.0576923076923076e-05, "loss": 0.0377, "step": 206 }, { "epoch": 0.39615384615384613, "eval_loss": 0.02005017176270485, "eval_runtime": 178.7582, "eval_samples_per_second": 258.008, "eval_steps_per_second": 2.019, "step": 206 }, { "epoch": 0.39807692307692305, "grad_norm": 2.2620668411254883, "learning_rate": 6.038461538461539e-05, "loss": 0.0709, "step": 207 }, { "epoch": 0.39807692307692305, "eval_loss": 0.020129531621932983, "eval_runtime": 179.2997, "eval_samples_per_second": 257.228, "eval_steps_per_second": 2.013, "step": 207 }, { "epoch": 0.4, "grad_norm": 2.889387607574463, "learning_rate": 6.019230769230769e-05, "loss": 0.0436, "step": 208 }, { "epoch": 0.4, "eval_loss": 0.02092353254556656, "eval_runtime": 178.7779, "eval_samples_per_second": 257.979, "eval_steps_per_second": 2.019, "step": 208 }, { "epoch": 0.40192307692307694, "grad_norm": 1.8757256269454956, "learning_rate": 6e-05, "loss": 0.0242, "step": 209 }, { "epoch": 0.40192307692307694, "eval_loss": 0.022796517238020897, "eval_runtime": 179.275, "eval_samples_per_second": 257.264, "eval_steps_per_second": 2.014, "step": 209 }, { "epoch": 0.40384615384615385, "grad_norm": 0.5746337175369263, "learning_rate": 5.980769230769231e-05, "loss": 0.0093, "step": 210 }, { "epoch": 0.40384615384615385, "eval_loss": 0.026022404432296753, "eval_runtime": 179.3393, "eval_samples_per_second": 257.172, "eval_steps_per_second": 2.013, "step": 210 }, { "epoch": 0.40576923076923077, "grad_norm": 0.6468233466148376, "learning_rate": 5.9615384615384616e-05, "loss": 0.0291, "step": 211 }, { "epoch": 0.40576923076923077, "eval_loss": 0.028801048174500465, "eval_runtime": 180.0726, "eval_samples_per_second": 256.124, "eval_steps_per_second": 2.005, "step": 211 }, { "epoch": 0.4076923076923077, "grad_norm": 3.5019261837005615, "learning_rate": 5.942307692307693e-05, "loss": 0.0955, "step": 212 }, { "epoch": 0.4076923076923077, "eval_loss": 0.03047165460884571, "eval_runtime": 179.3877, "eval_samples_per_second": 257.102, "eval_steps_per_second": 2.012, "step": 212 }, { "epoch": 0.4096153846153846, "grad_norm": 2.398637056350708, "learning_rate": 5.923076923076923e-05, "loss": 0.0332, "step": 213 }, { "epoch": 0.4096153846153846, "eval_loss": 0.03037206083536148, "eval_runtime": 179.177, "eval_samples_per_second": 257.405, "eval_steps_per_second": 2.015, "step": 213 }, { "epoch": 0.4115384615384615, "grad_norm": 0.37438809871673584, "learning_rate": 5.903846153846154e-05, "loss": 0.0086, "step": 214 }, { "epoch": 0.4115384615384615, "eval_loss": 0.02970583364367485, "eval_runtime": 180.2418, "eval_samples_per_second": 255.884, "eval_steps_per_second": 2.003, "step": 214 }, { "epoch": 0.41346153846153844, "grad_norm": 3.2650182247161865, "learning_rate": 5.884615384615385e-05, "loss": 0.0226, "step": 215 }, { "epoch": 0.41346153846153844, "eval_loss": 0.027145324274897575, "eval_runtime": 179.8572, "eval_samples_per_second": 256.431, "eval_steps_per_second": 2.007, "step": 215 }, { "epoch": 0.4153846153846154, "grad_norm": 3.719679832458496, "learning_rate": 5.865384615384616e-05, "loss": 0.088, "step": 216 }, { "epoch": 0.4153846153846154, "eval_loss": 0.02398001216351986, "eval_runtime": 179.4786, "eval_samples_per_second": 256.972, "eval_steps_per_second": 2.011, "step": 216 }, { "epoch": 0.4173076923076923, "grad_norm": 3.390564441680908, "learning_rate": 5.846153846153847e-05, "loss": 0.073, "step": 217 }, { "epoch": 0.4173076923076923, "eval_loss": 0.022082313895225525, "eval_runtime": 178.9292, "eval_samples_per_second": 257.761, "eval_steps_per_second": 2.018, "step": 217 }, { "epoch": 0.41923076923076924, "grad_norm": 0.4555812180042267, "learning_rate": 5.826923076923078e-05, "loss": 0.0085, "step": 218 }, { "epoch": 0.41923076923076924, "eval_loss": 0.020960917696356773, "eval_runtime": 179.7744, "eval_samples_per_second": 256.549, "eval_steps_per_second": 2.008, "step": 218 }, { "epoch": 0.42115384615384616, "grad_norm": 1.5771949291229248, "learning_rate": 5.807692307692308e-05, "loss": 0.0355, "step": 219 }, { "epoch": 0.42115384615384616, "eval_loss": 0.020832329988479614, "eval_runtime": 179.0768, "eval_samples_per_second": 257.549, "eval_steps_per_second": 2.016, "step": 219 }, { "epoch": 0.4230769230769231, "grad_norm": 1.863141417503357, "learning_rate": 5.7884615384615394e-05, "loss": 0.0319, "step": 220 }, { "epoch": 0.4230769230769231, "eval_loss": 0.020400503650307655, "eval_runtime": 179.8136, "eval_samples_per_second": 256.493, "eval_steps_per_second": 2.008, "step": 220 }, { "epoch": 0.425, "grad_norm": 2.5015952587127686, "learning_rate": 5.769230769230769e-05, "loss": 0.0297, "step": 221 }, { "epoch": 0.425, "eval_loss": 0.019669881090521812, "eval_runtime": 179.629, "eval_samples_per_second": 256.757, "eval_steps_per_second": 2.01, "step": 221 }, { "epoch": 0.4269230769230769, "grad_norm": 2.183319568634033, "learning_rate": 5.7499999999999995e-05, "loss": 0.041, "step": 222 }, { "epoch": 0.4269230769230769, "eval_loss": 0.018956847488880157, "eval_runtime": 179.7634, "eval_samples_per_second": 256.565, "eval_steps_per_second": 2.008, "step": 222 }, { "epoch": 0.4288461538461538, "grad_norm": 0.6641192436218262, "learning_rate": 5.7307692307692306e-05, "loss": 0.0103, "step": 223 }, { "epoch": 0.4288461538461538, "eval_loss": 0.018505413085222244, "eval_runtime": 178.939, "eval_samples_per_second": 257.747, "eval_steps_per_second": 2.017, "step": 223 }, { "epoch": 0.4307692307692308, "grad_norm": 2.056015968322754, "learning_rate": 5.711538461538462e-05, "loss": 0.0509, "step": 224 }, { "epoch": 0.4307692307692308, "eval_loss": 0.018195876851677895, "eval_runtime": 178.7256, "eval_samples_per_second": 258.055, "eval_steps_per_second": 2.02, "step": 224 }, { "epoch": 0.4326923076923077, "grad_norm": 3.368030071258545, "learning_rate": 5.692307692307692e-05, "loss": 0.0265, "step": 225 }, { "epoch": 0.4326923076923077, "eval_loss": 0.01840364933013916, "eval_runtime": 178.4536, "eval_samples_per_second": 258.448, "eval_steps_per_second": 2.023, "step": 225 }, { "epoch": 0.4346153846153846, "grad_norm": 2.7104804515838623, "learning_rate": 5.673076923076923e-05, "loss": 0.0607, "step": 226 }, { "epoch": 0.4346153846153846, "eval_loss": 0.018871352076530457, "eval_runtime": 179.4064, "eval_samples_per_second": 257.076, "eval_steps_per_second": 2.012, "step": 226 }, { "epoch": 0.43653846153846154, "grad_norm": 1.095210313796997, "learning_rate": 5.653846153846154e-05, "loss": 0.0148, "step": 227 }, { "epoch": 0.43653846153846154, "eval_loss": 0.019451051950454712, "eval_runtime": 178.892, "eval_samples_per_second": 257.815, "eval_steps_per_second": 2.018, "step": 227 }, { "epoch": 0.43846153846153846, "grad_norm": 1.179794192314148, "learning_rate": 5.6346153846153846e-05, "loss": 0.0146, "step": 228 }, { "epoch": 0.43846153846153846, "eval_loss": 0.01976831443607807, "eval_runtime": 179.5309, "eval_samples_per_second": 256.897, "eval_steps_per_second": 2.011, "step": 228 }, { "epoch": 0.4403846153846154, "grad_norm": 1.9117586612701416, "learning_rate": 5.615384615384616e-05, "loss": 0.029, "step": 229 }, { "epoch": 0.4403846153846154, "eval_loss": 0.020064150914549828, "eval_runtime": 179.5303, "eval_samples_per_second": 256.898, "eval_steps_per_second": 2.011, "step": 229 }, { "epoch": 0.4423076923076923, "grad_norm": 2.375997304916382, "learning_rate": 5.596153846153847e-05, "loss": 0.0324, "step": 230 }, { "epoch": 0.4423076923076923, "eval_loss": 0.02067544311285019, "eval_runtime": 180.6883, "eval_samples_per_second": 255.252, "eval_steps_per_second": 1.998, "step": 230 }, { "epoch": 0.4442307692307692, "grad_norm": 2.2388057708740234, "learning_rate": 5.576923076923077e-05, "loss": 0.0335, "step": 231 }, { "epoch": 0.4442307692307692, "eval_loss": 0.021712226793169975, "eval_runtime": 178.3336, "eval_samples_per_second": 258.622, "eval_steps_per_second": 2.024, "step": 231 }, { "epoch": 0.4461538461538462, "grad_norm": 2.2240569591522217, "learning_rate": 5.557692307692308e-05, "loss": 0.0512, "step": 232 }, { "epoch": 0.4461538461538462, "eval_loss": 0.022264475002884865, "eval_runtime": 183.6952, "eval_samples_per_second": 251.074, "eval_steps_per_second": 1.965, "step": 232 }, { "epoch": 0.4480769230769231, "grad_norm": 0.18751874566078186, "learning_rate": 5.538461538461539e-05, "loss": 0.006, "step": 233 }, { "epoch": 0.4480769230769231, "eval_loss": 0.022908175364136696, "eval_runtime": 180.2674, "eval_samples_per_second": 255.848, "eval_steps_per_second": 2.003, "step": 233 }, { "epoch": 0.45, "grad_norm": 1.9546294212341309, "learning_rate": 5.51923076923077e-05, "loss": 0.02, "step": 234 }, { "epoch": 0.45, "eval_loss": 0.023024283349514008, "eval_runtime": 178.9188, "eval_samples_per_second": 257.776, "eval_steps_per_second": 2.018, "step": 234 }, { "epoch": 0.4519230769230769, "grad_norm": 2.1330456733703613, "learning_rate": 5.500000000000001e-05, "loss": 0.0322, "step": 235 }, { "epoch": 0.4519230769230769, "eval_loss": 0.023195333778858185, "eval_runtime": 179.3369, "eval_samples_per_second": 257.175, "eval_steps_per_second": 2.013, "step": 235 }, { "epoch": 0.45384615384615384, "grad_norm": 1.1413002014160156, "learning_rate": 5.480769230769231e-05, "loss": 0.0535, "step": 236 }, { "epoch": 0.45384615384615384, "eval_loss": 0.02287602610886097, "eval_runtime": 180.2062, "eval_samples_per_second": 255.935, "eval_steps_per_second": 2.003, "step": 236 }, { "epoch": 0.45576923076923076, "grad_norm": 1.1129988431930542, "learning_rate": 5.461538461538461e-05, "loss": 0.0301, "step": 237 }, { "epoch": 0.45576923076923076, "eval_loss": 0.022405732423067093, "eval_runtime": 178.7035, "eval_samples_per_second": 258.087, "eval_steps_per_second": 2.02, "step": 237 }, { "epoch": 0.4576923076923077, "grad_norm": 0.41665732860565186, "learning_rate": 5.442307692307692e-05, "loss": 0.0073, "step": 238 }, { "epoch": 0.4576923076923077, "eval_loss": 0.022149918600916862, "eval_runtime": 177.8064, "eval_samples_per_second": 259.389, "eval_steps_per_second": 2.03, "step": 238 }, { "epoch": 0.4596153846153846, "grad_norm": 1.4682824611663818, "learning_rate": 5.423076923076923e-05, "loss": 0.0206, "step": 239 }, { "epoch": 0.4596153846153846, "eval_loss": 0.02222280018031597, "eval_runtime": 178.3278, "eval_samples_per_second": 258.63, "eval_steps_per_second": 2.024, "step": 239 }, { "epoch": 0.46153846153846156, "grad_norm": 0.2500247657299042, "learning_rate": 5.4038461538461536e-05, "loss": 0.0058, "step": 240 }, { "epoch": 0.46153846153846156, "eval_loss": 0.022345269098877907, "eval_runtime": 178.187, "eval_samples_per_second": 258.835, "eval_steps_per_second": 2.026, "step": 240 }, { "epoch": 0.4634615384615385, "grad_norm": 1.4518128633499146, "learning_rate": 5.384615384615385e-05, "loss": 0.0333, "step": 241 }, { "epoch": 0.4634615384615385, "eval_loss": 0.022659137845039368, "eval_runtime": 178.3156, "eval_samples_per_second": 258.648, "eval_steps_per_second": 2.025, "step": 241 }, { "epoch": 0.4653846153846154, "grad_norm": 0.997244119644165, "learning_rate": 5.365384615384616e-05, "loss": 0.0407, "step": 242 }, { "epoch": 0.4653846153846154, "eval_loss": 0.02270101197063923, "eval_runtime": 178.6996, "eval_samples_per_second": 258.092, "eval_steps_per_second": 2.02, "step": 242 }, { "epoch": 0.4673076923076923, "grad_norm": 2.6662564277648926, "learning_rate": 5.346153846153846e-05, "loss": 0.0426, "step": 243 }, { "epoch": 0.4673076923076923, "eval_loss": 0.02220647782087326, "eval_runtime": 179.6128, "eval_samples_per_second": 256.78, "eval_steps_per_second": 2.01, "step": 243 }, { "epoch": 0.46923076923076923, "grad_norm": 0.8665458559989929, "learning_rate": 5.326923076923077e-05, "loss": 0.0114, "step": 244 }, { "epoch": 0.46923076923076923, "eval_loss": 0.02131798304617405, "eval_runtime": 179.1555, "eval_samples_per_second": 257.436, "eval_steps_per_second": 2.015, "step": 244 }, { "epoch": 0.47115384615384615, "grad_norm": 1.1648316383361816, "learning_rate": 5.3076923076923076e-05, "loss": 0.0122, "step": 245 }, { "epoch": 0.47115384615384615, "eval_loss": 0.02076887898147106, "eval_runtime": 179.2242, "eval_samples_per_second": 257.337, "eval_steps_per_second": 2.014, "step": 245 }, { "epoch": 0.47307692307692306, "grad_norm": 1.3646942377090454, "learning_rate": 5.288461538461539e-05, "loss": 0.0202, "step": 246 }, { "epoch": 0.47307692307692306, "eval_loss": 0.0202037263661623, "eval_runtime": 179.1515, "eval_samples_per_second": 257.441, "eval_steps_per_second": 2.015, "step": 246 }, { "epoch": 0.475, "grad_norm": 2.266969919204712, "learning_rate": 5.26923076923077e-05, "loss": 0.0168, "step": 247 }, { "epoch": 0.475, "eval_loss": 0.02008403278887272, "eval_runtime": 179.642, "eval_samples_per_second": 256.738, "eval_steps_per_second": 2.01, "step": 247 }, { "epoch": 0.47692307692307695, "grad_norm": 1.709193229675293, "learning_rate": 5.25e-05, "loss": 0.0222, "step": 248 }, { "epoch": 0.47692307692307695, "eval_loss": 0.02022576704621315, "eval_runtime": 179.6915, "eval_samples_per_second": 256.668, "eval_steps_per_second": 2.009, "step": 248 }, { "epoch": 0.47884615384615387, "grad_norm": 0.7432993054389954, "learning_rate": 5.230769230769231e-05, "loss": 0.0091, "step": 249 }, { "epoch": 0.47884615384615387, "eval_loss": 0.0207006074488163, "eval_runtime": 178.7806, "eval_samples_per_second": 257.975, "eval_steps_per_second": 2.019, "step": 249 }, { "epoch": 0.4807692307692308, "grad_norm": 1.0147693157196045, "learning_rate": 5.2115384615384624e-05, "loss": 0.0336, "step": 250 }, { "epoch": 0.4807692307692308, "eval_loss": 0.020908081904053688, "eval_runtime": 179.032, "eval_samples_per_second": 257.613, "eval_steps_per_second": 2.016, "step": 250 }, { "epoch": 0.4826923076923077, "grad_norm": 0.11277324706315994, "learning_rate": 5.192307692307693e-05, "loss": 0.0049, "step": 251 }, { "epoch": 0.4826923076923077, "eval_loss": 0.021289991214871407, "eval_runtime": 178.7696, "eval_samples_per_second": 257.991, "eval_steps_per_second": 2.019, "step": 251 }, { "epoch": 0.4846153846153846, "grad_norm": 1.4250966310501099, "learning_rate": 5.173076923076924e-05, "loss": 0.0725, "step": 252 }, { "epoch": 0.4846153846153846, "eval_loss": 0.02207464724779129, "eval_runtime": 178.046, "eval_samples_per_second": 259.04, "eval_steps_per_second": 2.028, "step": 252 }, { "epoch": 0.48653846153846153, "grad_norm": 0.6005804538726807, "learning_rate": 5.1538461538461536e-05, "loss": 0.0069, "step": 253 }, { "epoch": 0.48653846153846153, "eval_loss": 0.02242344059050083, "eval_runtime": 178.0571, "eval_samples_per_second": 259.024, "eval_steps_per_second": 2.027, "step": 253 }, { "epoch": 0.48846153846153845, "grad_norm": 2.921394109725952, "learning_rate": 5.134615384615385e-05, "loss": 0.0435, "step": 254 }, { "epoch": 0.48846153846153845, "eval_loss": 0.022368701174855232, "eval_runtime": 177.6433, "eval_samples_per_second": 259.627, "eval_steps_per_second": 2.032, "step": 254 }, { "epoch": 0.49038461538461536, "grad_norm": 2.527122974395752, "learning_rate": 5.115384615384615e-05, "loss": 0.0227, "step": 255 }, { "epoch": 0.49038461538461536, "eval_loss": 0.023167185485363007, "eval_runtime": 178.2025, "eval_samples_per_second": 258.812, "eval_steps_per_second": 2.026, "step": 255 }, { "epoch": 0.49230769230769234, "grad_norm": 1.260136604309082, "learning_rate": 5.096153846153846e-05, "loss": 0.0641, "step": 256 }, { "epoch": 0.49230769230769234, "eval_loss": 0.02456340193748474, "eval_runtime": 179.2064, "eval_samples_per_second": 257.362, "eval_steps_per_second": 2.014, "step": 256 }, { "epoch": 0.49423076923076925, "grad_norm": 4.960824489593506, "learning_rate": 5.0769230769230766e-05, "loss": 0.1312, "step": 257 }, { "epoch": 0.49423076923076925, "eval_loss": 0.026521550491452217, "eval_runtime": 177.6317, "eval_samples_per_second": 259.644, "eval_steps_per_second": 2.032, "step": 257 }, { "epoch": 0.49615384615384617, "grad_norm": 2.000896692276001, "learning_rate": 5.057692307692308e-05, "loss": 0.022, "step": 258 }, { "epoch": 0.49615384615384617, "eval_loss": 0.027118589729070663, "eval_runtime": 179.2158, "eval_samples_per_second": 257.349, "eval_steps_per_second": 2.014, "step": 258 }, { "epoch": 0.4980769230769231, "grad_norm": 2.8646159172058105, "learning_rate": 5.038461538461539e-05, "loss": 0.0257, "step": 259 }, { "epoch": 0.4980769230769231, "eval_loss": 0.026049936190247536, "eval_runtime": 179.4408, "eval_samples_per_second": 257.026, "eval_steps_per_second": 2.012, "step": 259 }, { "epoch": 0.5, "grad_norm": 0.943261981010437, "learning_rate": 5.019230769230769e-05, "loss": 0.0233, "step": 260 }, { "epoch": 0.5, "eval_loss": 0.025271113961935043, "eval_runtime": 178.2347, "eval_samples_per_second": 258.766, "eval_steps_per_second": 2.025, "step": 260 }, { "epoch": 0.5019230769230769, "grad_norm": 0.20943577587604523, "learning_rate": 5e-05, "loss": 0.0049, "step": 261 }, { "epoch": 0.5019230769230769, "eval_loss": 0.024457741528749466, "eval_runtime": 178.8024, "eval_samples_per_second": 257.944, "eval_steps_per_second": 2.019, "step": 261 }, { "epoch": 0.5038461538461538, "grad_norm": 1.5787253379821777, "learning_rate": 4.980769230769231e-05, "loss": 0.0253, "step": 262 }, { "epoch": 0.5038461538461538, "eval_loss": 0.023463794961571693, "eval_runtime": 179.4617, "eval_samples_per_second": 256.996, "eval_steps_per_second": 2.012, "step": 262 }, { "epoch": 0.5057692307692307, "grad_norm": 1.8370299339294434, "learning_rate": 4.961538461538462e-05, "loss": 0.123, "step": 263 }, { "epoch": 0.5057692307692307, "eval_loss": 0.022153466939926147, "eval_runtime": 177.8706, "eval_samples_per_second": 259.295, "eval_steps_per_second": 2.03, "step": 263 }, { "epoch": 0.5076923076923077, "grad_norm": 1.3905123472213745, "learning_rate": 4.942307692307693e-05, "loss": 0.0796, "step": 264 }, { "epoch": 0.5076923076923077, "eval_loss": 0.02036619931459427, "eval_runtime": 179.1649, "eval_samples_per_second": 257.422, "eval_steps_per_second": 2.015, "step": 264 }, { "epoch": 0.5096153846153846, "grad_norm": 0.5677681565284729, "learning_rate": 4.923076923076924e-05, "loss": 0.0374, "step": 265 }, { "epoch": 0.5096153846153846, "eval_loss": 0.019072143360972404, "eval_runtime": 177.7811, "eval_samples_per_second": 259.426, "eval_steps_per_second": 2.031, "step": 265 }, { "epoch": 0.5115384615384615, "grad_norm": 2.8789877891540527, "learning_rate": 4.9038461538461536e-05, "loss": 0.0607, "step": 266 }, { "epoch": 0.5115384615384615, "eval_loss": 0.01831653155386448, "eval_runtime": 177.7329, "eval_samples_per_second": 259.496, "eval_steps_per_second": 2.031, "step": 266 }, { "epoch": 0.5134615384615384, "grad_norm": 2.489546060562134, "learning_rate": 4.884615384615385e-05, "loss": 0.0265, "step": 267 }, { "epoch": 0.5134615384615384, "eval_loss": 0.018156476318836212, "eval_runtime": 178.3037, "eval_samples_per_second": 258.665, "eval_steps_per_second": 2.025, "step": 267 }, { "epoch": 0.5153846153846153, "grad_norm": 0.33813557028770447, "learning_rate": 4.865384615384616e-05, "loss": 0.0081, "step": 268 }, { "epoch": 0.5153846153846153, "eval_loss": 0.018267083913087845, "eval_runtime": 178.0725, "eval_samples_per_second": 259.001, "eval_steps_per_second": 2.027, "step": 268 }, { "epoch": 0.5173076923076924, "grad_norm": 1.1769758462905884, "learning_rate": 4.846153846153846e-05, "loss": 0.0109, "step": 269 }, { "epoch": 0.5173076923076924, "eval_loss": 0.018560878932476044, "eval_runtime": 178.4098, "eval_samples_per_second": 258.512, "eval_steps_per_second": 2.023, "step": 269 }, { "epoch": 0.5192307692307693, "grad_norm": 1.614059329032898, "learning_rate": 4.826923076923077e-05, "loss": 0.0297, "step": 270 }, { "epoch": 0.5192307692307693, "eval_loss": 0.019179968163371086, "eval_runtime": 178.4615, "eval_samples_per_second": 258.437, "eval_steps_per_second": 2.023, "step": 270 }, { "epoch": 0.5211538461538462, "grad_norm": 2.350944757461548, "learning_rate": 4.8076923076923084e-05, "loss": 0.0431, "step": 271 }, { "epoch": 0.5211538461538462, "eval_loss": 0.019696904346346855, "eval_runtime": 179.3451, "eval_samples_per_second": 257.163, "eval_steps_per_second": 2.013, "step": 271 }, { "epoch": 0.5230769230769231, "grad_norm": 1.677561640739441, "learning_rate": 4.788461538461539e-05, "loss": 0.0622, "step": 272 }, { "epoch": 0.5230769230769231, "eval_loss": 0.019987458363175392, "eval_runtime": 178.1412, "eval_samples_per_second": 258.901, "eval_steps_per_second": 2.026, "step": 272 }, { "epoch": 0.525, "grad_norm": 0.178351029753685, "learning_rate": 4.76923076923077e-05, "loss": 0.0043, "step": 273 }, { "epoch": 0.525, "eval_loss": 0.02033291384577751, "eval_runtime": 178.408, "eval_samples_per_second": 258.514, "eval_steps_per_second": 2.023, "step": 273 }, { "epoch": 0.5269230769230769, "grad_norm": 2.8552165031433105, "learning_rate": 4.75e-05, "loss": 0.0395, "step": 274 }, { "epoch": 0.5269230769230769, "eval_loss": 0.020751679316163063, "eval_runtime": 178.8897, "eval_samples_per_second": 257.818, "eval_steps_per_second": 2.018, "step": 274 }, { "epoch": 0.5288461538461539, "grad_norm": 1.308340311050415, "learning_rate": 4.730769230769231e-05, "loss": 0.0139, "step": 275 }, { "epoch": 0.5288461538461539, "eval_loss": 0.02126193419098854, "eval_runtime": 179.0787, "eval_samples_per_second": 257.546, "eval_steps_per_second": 2.016, "step": 275 }, { "epoch": 0.5307692307692308, "grad_norm": 2.3591983318328857, "learning_rate": 4.711538461538462e-05, "loss": 0.0831, "step": 276 }, { "epoch": 0.5307692307692308, "eval_loss": 0.022048989310860634, "eval_runtime": 178.4149, "eval_samples_per_second": 258.504, "eval_steps_per_second": 2.023, "step": 276 }, { "epoch": 0.5326923076923077, "grad_norm": 1.4477872848510742, "learning_rate": 4.692307692307693e-05, "loss": 0.077, "step": 277 }, { "epoch": 0.5326923076923077, "eval_loss": 0.02285471186041832, "eval_runtime": 178.6672, "eval_samples_per_second": 258.139, "eval_steps_per_second": 2.021, "step": 277 }, { "epoch": 0.5346153846153846, "grad_norm": 1.7352992296218872, "learning_rate": 4.673076923076923e-05, "loss": 0.0475, "step": 278 }, { "epoch": 0.5346153846153846, "eval_loss": 0.023948505520820618, "eval_runtime": 182.3486, "eval_samples_per_second": 252.928, "eval_steps_per_second": 1.98, "step": 278 }, { "epoch": 0.5365384615384615, "grad_norm": 0.7470586895942688, "learning_rate": 4.653846153846154e-05, "loss": 0.0404, "step": 279 }, { "epoch": 0.5365384615384615, "eval_loss": 0.025173615664243698, "eval_runtime": 190.4732, "eval_samples_per_second": 242.139, "eval_steps_per_second": 1.895, "step": 279 }, { "epoch": 0.5384615384615384, "grad_norm": 1.587988257408142, "learning_rate": 4.634615384615385e-05, "loss": 0.0825, "step": 280 }, { "epoch": 0.5384615384615384, "eval_loss": 0.026492061093449593, "eval_runtime": 179.4359, "eval_samples_per_second": 257.033, "eval_steps_per_second": 2.012, "step": 280 }, { "epoch": 0.5403846153846154, "grad_norm": 1.2091609239578247, "learning_rate": 4.615384615384616e-05, "loss": 0.0158, "step": 281 }, { "epoch": 0.5403846153846154, "eval_loss": 0.02694498375058174, "eval_runtime": 180.0245, "eval_samples_per_second": 256.193, "eval_steps_per_second": 2.005, "step": 281 }, { "epoch": 0.5423076923076923, "grad_norm": 2.1175975799560547, "learning_rate": 4.596153846153846e-05, "loss": 0.0591, "step": 282 }, { "epoch": 0.5423076923076923, "eval_loss": 0.026078298687934875, "eval_runtime": 180.9972, "eval_samples_per_second": 254.816, "eval_steps_per_second": 1.995, "step": 282 }, { "epoch": 0.5442307692307692, "grad_norm": 1.398386836051941, "learning_rate": 4.576923076923077e-05, "loss": 0.0313, "step": 283 }, { "epoch": 0.5442307692307692, "eval_loss": 0.02475452423095703, "eval_runtime": 181.364, "eval_samples_per_second": 254.301, "eval_steps_per_second": 1.99, "step": 283 }, { "epoch": 0.5461538461538461, "grad_norm": 0.47163820266723633, "learning_rate": 4.557692307692308e-05, "loss": 0.0078, "step": 284 }, { "epoch": 0.5461538461538461, "eval_loss": 0.023184489458799362, "eval_runtime": 182.4892, "eval_samples_per_second": 252.733, "eval_steps_per_second": 1.978, "step": 284 }, { "epoch": 0.5480769230769231, "grad_norm": 1.4991772174835205, "learning_rate": 4.538461538461539e-05, "loss": 0.0266, "step": 285 }, { "epoch": 0.5480769230769231, "eval_loss": 0.021967096254229546, "eval_runtime": 184.1286, "eval_samples_per_second": 250.483, "eval_steps_per_second": 1.961, "step": 285 }, { "epoch": 0.55, "grad_norm": 0.7483875155448914, "learning_rate": 4.519230769230769e-05, "loss": 0.0084, "step": 286 }, { "epoch": 0.55, "eval_loss": 0.020483436062932014, "eval_runtime": 183.7344, "eval_samples_per_second": 251.02, "eval_steps_per_second": 1.965, "step": 286 }, { "epoch": 0.551923076923077, "grad_norm": 0.33013495802879333, "learning_rate": 4.5e-05, "loss": 0.0057, "step": 287 }, { "epoch": 0.551923076923077, "eval_loss": 0.019703133031725883, "eval_runtime": 181.8834, "eval_samples_per_second": 253.575, "eval_steps_per_second": 1.985, "step": 287 }, { "epoch": 0.5538461538461539, "grad_norm": 2.1860246658325195, "learning_rate": 4.4807692307692314e-05, "loss": 0.0241, "step": 288 }, { "epoch": 0.5538461538461539, "eval_loss": 0.018677791580557823, "eval_runtime": 182.9398, "eval_samples_per_second": 252.11, "eval_steps_per_second": 1.973, "step": 288 }, { "epoch": 0.5557692307692308, "grad_norm": 1.618175983428955, "learning_rate": 4.461538461538462e-05, "loss": 0.0187, "step": 289 }, { "epoch": 0.5557692307692308, "eval_loss": 0.018410805612802505, "eval_runtime": 184.3183, "eval_samples_per_second": 250.225, "eval_steps_per_second": 1.959, "step": 289 }, { "epoch": 0.5576923076923077, "grad_norm": 1.3602591753005981, "learning_rate": 4.442307692307692e-05, "loss": 0.0465, "step": 290 }, { "epoch": 0.5576923076923077, "eval_loss": 0.018333878368139267, "eval_runtime": 183.9716, "eval_samples_per_second": 250.696, "eval_steps_per_second": 1.962, "step": 290 }, { "epoch": 0.5596153846153846, "grad_norm": 2.3707096576690674, "learning_rate": 4.423076923076923e-05, "loss": 0.071, "step": 291 }, { "epoch": 0.5596153846153846, "eval_loss": 0.01808229647576809, "eval_runtime": 183.4525, "eval_samples_per_second": 251.406, "eval_steps_per_second": 1.968, "step": 291 }, { "epoch": 0.5615384615384615, "grad_norm": 1.8033769130706787, "learning_rate": 4.403846153846154e-05, "loss": 0.0206, "step": 292 }, { "epoch": 0.5615384615384615, "eval_loss": 0.017939355224370956, "eval_runtime": 184.5771, "eval_samples_per_second": 249.874, "eval_steps_per_second": 1.956, "step": 292 }, { "epoch": 0.5634615384615385, "grad_norm": 2.6258585453033447, "learning_rate": 4.384615384615385e-05, "loss": 0.0291, "step": 293 }, { "epoch": 0.5634615384615385, "eval_loss": 0.01776740886271, "eval_runtime": 184.1944, "eval_samples_per_second": 250.393, "eval_steps_per_second": 1.96, "step": 293 }, { "epoch": 0.5653846153846154, "grad_norm": 0.5941898226737976, "learning_rate": 4.365384615384616e-05, "loss": 0.0103, "step": 294 }, { "epoch": 0.5653846153846154, "eval_loss": 0.017663318663835526, "eval_runtime": 185.8504, "eval_samples_per_second": 248.162, "eval_steps_per_second": 1.942, "step": 294 }, { "epoch": 0.5673076923076923, "grad_norm": 1.4761334657669067, "learning_rate": 4.346153846153846e-05, "loss": 0.0336, "step": 295 }, { "epoch": 0.5673076923076923, "eval_loss": 0.017522111535072327, "eval_runtime": 185.2284, "eval_samples_per_second": 248.995, "eval_steps_per_second": 1.949, "step": 295 }, { "epoch": 0.5692307692307692, "grad_norm": 3.071438789367676, "learning_rate": 4.326923076923077e-05, "loss": 0.0481, "step": 296 }, { "epoch": 0.5692307692307692, "eval_loss": 0.01728859543800354, "eval_runtime": 182.7731, "eval_samples_per_second": 252.34, "eval_steps_per_second": 1.975, "step": 296 }, { "epoch": 0.5711538461538461, "grad_norm": 2.6939680576324463, "learning_rate": 4.3076923076923084e-05, "loss": 0.0361, "step": 297 }, { "epoch": 0.5711538461538461, "eval_loss": 0.01725272834300995, "eval_runtime": 183.1168, "eval_samples_per_second": 251.867, "eval_steps_per_second": 1.971, "step": 297 }, { "epoch": 0.573076923076923, "grad_norm": 1.5768874883651733, "learning_rate": 4.288461538461538e-05, "loss": 0.0165, "step": 298 }, { "epoch": 0.573076923076923, "eval_loss": 0.017209839075803757, "eval_runtime": 183.6691, "eval_samples_per_second": 251.109, "eval_steps_per_second": 1.965, "step": 298 }, { "epoch": 0.575, "grad_norm": 0.6393303871154785, "learning_rate": 4.269230769230769e-05, "loss": 0.0093, "step": 299 }, { "epoch": 0.575, "eval_loss": 0.017135918140411377, "eval_runtime": 187.5931, "eval_samples_per_second": 245.857, "eval_steps_per_second": 1.924, "step": 299 }, { "epoch": 0.5769230769230769, "grad_norm": 3.0353610515594482, "learning_rate": 4.25e-05, "loss": 0.0203, "step": 300 }, { "epoch": 0.5769230769230769, "eval_loss": 0.017029576003551483, "eval_runtime": 187.2345, "eval_samples_per_second": 246.327, "eval_steps_per_second": 1.928, "step": 300 }, { "epoch": 0.5788461538461539, "grad_norm": 1.7909221649169922, "learning_rate": 4.230769230769231e-05, "loss": 0.0302, "step": 301 }, { "epoch": 0.5788461538461539, "eval_loss": 0.01717795990407467, "eval_runtime": 187.1637, "eval_samples_per_second": 246.421, "eval_steps_per_second": 1.929, "step": 301 }, { "epoch": 0.5807692307692308, "grad_norm": 0.9146409034729004, "learning_rate": 4.211538461538462e-05, "loss": 0.0089, "step": 302 }, { "epoch": 0.5807692307692308, "eval_loss": 0.017274800688028336, "eval_runtime": 186.4945, "eval_samples_per_second": 247.305, "eval_steps_per_second": 1.936, "step": 302 }, { "epoch": 0.5826923076923077, "grad_norm": 0.9270340800285339, "learning_rate": 4.192307692307693e-05, "loss": 0.0118, "step": 303 }, { "epoch": 0.5826923076923077, "eval_loss": 0.01724671758711338, "eval_runtime": 186.0669, "eval_samples_per_second": 247.873, "eval_steps_per_second": 1.94, "step": 303 }, { "epoch": 0.5846153846153846, "grad_norm": 1.39608895778656, "learning_rate": 4.173076923076923e-05, "loss": 0.0338, "step": 304 }, { "epoch": 0.5846153846153846, "eval_loss": 0.017357762902975082, "eval_runtime": 187.0885, "eval_samples_per_second": 246.52, "eval_steps_per_second": 1.93, "step": 304 }, { "epoch": 0.5865384615384616, "grad_norm": 1.7433981895446777, "learning_rate": 4.1538461538461544e-05, "loss": 0.0175, "step": 305 }, { "epoch": 0.5865384615384616, "eval_loss": 0.017904143780469894, "eval_runtime": 186.9937, "eval_samples_per_second": 246.645, "eval_steps_per_second": 1.931, "step": 305 }, { "epoch": 0.5884615384615385, "grad_norm": 0.5012370944023132, "learning_rate": 4.134615384615385e-05, "loss": 0.0075, "step": 306 }, { "epoch": 0.5884615384615385, "eval_loss": 0.018834874033927917, "eval_runtime": 187.3986, "eval_samples_per_second": 246.112, "eval_steps_per_second": 1.926, "step": 306 }, { "epoch": 0.5903846153846154, "grad_norm": 2.3230066299438477, "learning_rate": 4.115384615384615e-05, "loss": 0.015, "step": 307 }, { "epoch": 0.5903846153846154, "eval_loss": 0.020419873297214508, "eval_runtime": 188.2229, "eval_samples_per_second": 245.034, "eval_steps_per_second": 1.918, "step": 307 }, { "epoch": 0.5923076923076923, "grad_norm": 1.2293483018875122, "learning_rate": 4.096153846153846e-05, "loss": 0.0147, "step": 308 }, { "epoch": 0.5923076923076923, "eval_loss": 0.0223609060049057, "eval_runtime": 186.8039, "eval_samples_per_second": 246.895, "eval_steps_per_second": 1.933, "step": 308 }, { "epoch": 0.5942307692307692, "grad_norm": 1.8343030214309692, "learning_rate": 4.0769230769230773e-05, "loss": 0.0256, "step": 309 }, { "epoch": 0.5942307692307692, "eval_loss": 0.02358538843691349, "eval_runtime": 185.2135, "eval_samples_per_second": 249.015, "eval_steps_per_second": 1.949, "step": 309 }, { "epoch": 0.5961538461538461, "grad_norm": 0.8146288990974426, "learning_rate": 4.057692307692308e-05, "loss": 0.0071, "step": 310 }, { "epoch": 0.5961538461538461, "eval_loss": 0.02386535331606865, "eval_runtime": 183.8932, "eval_samples_per_second": 250.803, "eval_steps_per_second": 1.963, "step": 310 }, { "epoch": 0.5980769230769231, "grad_norm": 4.294083595275879, "learning_rate": 4.038461538461539e-05, "loss": 0.0456, "step": 311 }, { "epoch": 0.5980769230769231, "eval_loss": 0.023020418360829353, "eval_runtime": 183.5896, "eval_samples_per_second": 251.218, "eval_steps_per_second": 1.966, "step": 311 }, { "epoch": 0.6, "grad_norm": 1.3458585739135742, "learning_rate": 4.019230769230769e-05, "loss": 0.0293, "step": 312 }, { "epoch": 0.6, "eval_loss": 0.021461069583892822, "eval_runtime": 183.1076, "eval_samples_per_second": 251.879, "eval_steps_per_second": 1.972, "step": 312 }, { "epoch": 0.6019230769230769, "grad_norm": 1.0707663297653198, "learning_rate": 4e-05, "loss": 0.0183, "step": 313 }, { "epoch": 0.6019230769230769, "eval_loss": 0.020740246400237083, "eval_runtime": 182.7591, "eval_samples_per_second": 252.36, "eval_steps_per_second": 1.975, "step": 313 }, { "epoch": 0.6038461538461538, "grad_norm": 0.22060979902744293, "learning_rate": 3.980769230769231e-05, "loss": 0.0057, "step": 314 }, { "epoch": 0.6038461538461538, "eval_loss": 0.020306937396526337, "eval_runtime": 181.6239, "eval_samples_per_second": 253.937, "eval_steps_per_second": 1.988, "step": 314 }, { "epoch": 0.6057692307692307, "grad_norm": 0.45676878094673157, "learning_rate": 3.961538461538462e-05, "loss": 0.0559, "step": 315 }, { "epoch": 0.6057692307692307, "eval_loss": 0.019902806729078293, "eval_runtime": 187.9515, "eval_samples_per_second": 245.388, "eval_steps_per_second": 1.921, "step": 315 }, { "epoch": 0.6076923076923076, "grad_norm": 1.1361058950424194, "learning_rate": 3.942307692307692e-05, "loss": 0.014, "step": 316 }, { "epoch": 0.6076923076923076, "eval_loss": 0.01993195153772831, "eval_runtime": 183.33, "eval_samples_per_second": 251.574, "eval_steps_per_second": 1.969, "step": 316 }, { "epoch": 0.6096153846153847, "grad_norm": 2.384164810180664, "learning_rate": 3.923076923076923e-05, "loss": 0.0516, "step": 317 }, { "epoch": 0.6096153846153847, "eval_loss": 0.019640127196907997, "eval_runtime": 181.9327, "eval_samples_per_second": 253.506, "eval_steps_per_second": 1.984, "step": 317 }, { "epoch": 0.6115384615384616, "grad_norm": 2.3011910915374756, "learning_rate": 3.903846153846154e-05, "loss": 0.0502, "step": 318 }, { "epoch": 0.6115384615384616, "eval_loss": 0.019097890704870224, "eval_runtime": 182.2105, "eval_samples_per_second": 253.119, "eval_steps_per_second": 1.981, "step": 318 }, { "epoch": 0.6134615384615385, "grad_norm": 2.6372344493865967, "learning_rate": 3.884615384615385e-05, "loss": 0.0871, "step": 319 }, { "epoch": 0.6134615384615385, "eval_loss": 0.018500829115509987, "eval_runtime": 184.0891, "eval_samples_per_second": 250.536, "eval_steps_per_second": 1.961, "step": 319 }, { "epoch": 0.6153846153846154, "grad_norm": 0.08721642941236496, "learning_rate": 3.865384615384616e-05, "loss": 0.0039, "step": 320 }, { "epoch": 0.6153846153846154, "eval_loss": 0.018185600638389587, "eval_runtime": 185.4703, "eval_samples_per_second": 248.671, "eval_steps_per_second": 1.946, "step": 320 }, { "epoch": 0.6173076923076923, "grad_norm": 2.712874174118042, "learning_rate": 3.846153846153846e-05, "loss": 0.0125, "step": 321 }, { "epoch": 0.6173076923076923, "eval_loss": 0.018078332766890526, "eval_runtime": 182.7576, "eval_samples_per_second": 252.362, "eval_steps_per_second": 1.975, "step": 321 }, { "epoch": 0.6192307692307693, "grad_norm": 2.1178512573242188, "learning_rate": 3.826923076923077e-05, "loss": 0.0438, "step": 322 }, { "epoch": 0.6192307692307693, "eval_loss": 0.01808401755988598, "eval_runtime": 180.7165, "eval_samples_per_second": 255.212, "eval_steps_per_second": 1.998, "step": 322 }, { "epoch": 0.6211538461538462, "grad_norm": 1.4352222681045532, "learning_rate": 3.807692307692308e-05, "loss": 0.0148, "step": 323 }, { "epoch": 0.6211538461538462, "eval_loss": 0.018278954550623894, "eval_runtime": 180.0406, "eval_samples_per_second": 256.17, "eval_steps_per_second": 2.005, "step": 323 }, { "epoch": 0.6230769230769231, "grad_norm": 0.1123296320438385, "learning_rate": 3.788461538461538e-05, "loss": 0.004, "step": 324 }, { "epoch": 0.6230769230769231, "eval_loss": 0.018484123051166534, "eval_runtime": 180.4878, "eval_samples_per_second": 255.535, "eval_steps_per_second": 2.0, "step": 324 }, { "epoch": 0.625, "grad_norm": 2.1391522884368896, "learning_rate": 3.769230769230769e-05, "loss": 0.02, "step": 325 }, { "epoch": 0.625, "eval_loss": 0.01890737935900688, "eval_runtime": 179.5465, "eval_samples_per_second": 256.875, "eval_steps_per_second": 2.011, "step": 325 }, { "epoch": 0.6269230769230769, "grad_norm": 0.7092868089675903, "learning_rate": 3.7500000000000003e-05, "loss": 0.0103, "step": 326 }, { "epoch": 0.6269230769230769, "eval_loss": 0.01960405707359314, "eval_runtime": 180.3603, "eval_samples_per_second": 255.716, "eval_steps_per_second": 2.002, "step": 326 }, { "epoch": 0.6288461538461538, "grad_norm": 1.3859364986419678, "learning_rate": 3.730769230769231e-05, "loss": 0.0121, "step": 327 }, { "epoch": 0.6288461538461538, "eval_loss": 0.02096397802233696, "eval_runtime": 180.3107, "eval_samples_per_second": 255.786, "eval_steps_per_second": 2.002, "step": 327 }, { "epoch": 0.6307692307692307, "grad_norm": 1.0377469062805176, "learning_rate": 3.711538461538462e-05, "loss": 0.0251, "step": 328 }, { "epoch": 0.6307692307692307, "eval_loss": 0.023392662405967712, "eval_runtime": 180.6358, "eval_samples_per_second": 255.326, "eval_steps_per_second": 1.998, "step": 328 }, { "epoch": 0.6326923076923077, "grad_norm": 0.7841097712516785, "learning_rate": 3.692307692307693e-05, "loss": 0.0381, "step": 329 }, { "epoch": 0.6326923076923077, "eval_loss": 0.02570357732474804, "eval_runtime": 180.4709, "eval_samples_per_second": 255.559, "eval_steps_per_second": 2.0, "step": 329 }, { "epoch": 0.6346153846153846, "grad_norm": 0.651456356048584, "learning_rate": 3.673076923076923e-05, "loss": 0.0058, "step": 330 }, { "epoch": 0.6346153846153846, "eval_loss": 0.027660323306918144, "eval_runtime": 180.74, "eval_samples_per_second": 255.179, "eval_steps_per_second": 1.997, "step": 330 }, { "epoch": 0.6365384615384615, "grad_norm": 1.0176434516906738, "learning_rate": 3.653846153846154e-05, "loss": 0.0062, "step": 331 }, { "epoch": 0.6365384615384615, "eval_loss": 0.028412554413080215, "eval_runtime": 180.2559, "eval_samples_per_second": 255.864, "eval_steps_per_second": 2.003, "step": 331 }, { "epoch": 0.6384615384615384, "grad_norm": 1.440993070602417, "learning_rate": 3.634615384615385e-05, "loss": 0.0134, "step": 332 }, { "epoch": 0.6384615384615384, "eval_loss": 0.02854098007082939, "eval_runtime": 179.5351, "eval_samples_per_second": 256.891, "eval_steps_per_second": 2.011, "step": 332 }, { "epoch": 0.6403846153846153, "grad_norm": 0.8128412961959839, "learning_rate": 3.615384615384615e-05, "loss": 0.006, "step": 333 }, { "epoch": 0.6403846153846153, "eval_loss": 0.027823466807603836, "eval_runtime": 181.214, "eval_samples_per_second": 254.511, "eval_steps_per_second": 1.992, "step": 333 }, { "epoch": 0.6423076923076924, "grad_norm": 0.12252137064933777, "learning_rate": 3.596153846153846e-05, "loss": 0.0037, "step": 334 }, { "epoch": 0.6423076923076924, "eval_loss": 0.02721521630883217, "eval_runtime": 178.9703, "eval_samples_per_second": 257.702, "eval_steps_per_second": 2.017, "step": 334 }, { "epoch": 0.6442307692307693, "grad_norm": 2.2254228591918945, "learning_rate": 3.5769230769230774e-05, "loss": 0.0551, "step": 335 }, { "epoch": 0.6442307692307693, "eval_loss": 0.025851406157016754, "eval_runtime": 181.1771, "eval_samples_per_second": 254.563, "eval_steps_per_second": 1.993, "step": 335 }, { "epoch": 0.6461538461538462, "grad_norm": 4.576323986053467, "learning_rate": 3.557692307692308e-05, "loss": 0.0756, "step": 336 }, { "epoch": 0.6461538461538462, "eval_loss": 0.024042945355176926, "eval_runtime": 180.2984, "eval_samples_per_second": 255.804, "eval_steps_per_second": 2.002, "step": 336 }, { "epoch": 0.6480769230769231, "grad_norm": 2.2256715297698975, "learning_rate": 3.538461538461539e-05, "loss": 0.036, "step": 337 }, { "epoch": 0.6480769230769231, "eval_loss": 0.022502081468701363, "eval_runtime": 179.5004, "eval_samples_per_second": 256.941, "eval_steps_per_second": 2.011, "step": 337 }, { "epoch": 0.65, "grad_norm": 0.19705277681350708, "learning_rate": 3.51923076923077e-05, "loss": 0.0045, "step": 338 }, { "epoch": 0.65, "eval_loss": 0.021358314901590347, "eval_runtime": 179.5403, "eval_samples_per_second": 256.884, "eval_steps_per_second": 2.011, "step": 338 }, { "epoch": 0.6519230769230769, "grad_norm": 2.603549003601074, "learning_rate": 3.5e-05, "loss": 0.0265, "step": 339 }, { "epoch": 0.6519230769230769, "eval_loss": 0.020234843716025352, "eval_runtime": 180.9278, "eval_samples_per_second": 254.914, "eval_steps_per_second": 1.995, "step": 339 }, { "epoch": 0.6538461538461539, "grad_norm": 0.22635461390018463, "learning_rate": 3.480769230769231e-05, "loss": 0.0055, "step": 340 }, { "epoch": 0.6538461538461539, "eval_loss": 0.0195136871188879, "eval_runtime": 181.0715, "eval_samples_per_second": 254.711, "eval_steps_per_second": 1.994, "step": 340 }, { "epoch": 0.6557692307692308, "grad_norm": 2.3919484615325928, "learning_rate": 3.461538461538462e-05, "loss": 0.0184, "step": 341 }, { "epoch": 0.6557692307692308, "eval_loss": 0.019222378730773926, "eval_runtime": 180.631, "eval_samples_per_second": 255.333, "eval_steps_per_second": 1.999, "step": 341 }, { "epoch": 0.6576923076923077, "grad_norm": 0.7615954875946045, "learning_rate": 3.442307692307692e-05, "loss": 0.0068, "step": 342 }, { "epoch": 0.6576923076923077, "eval_loss": 0.019121317192912102, "eval_runtime": 179.541, "eval_samples_per_second": 256.883, "eval_steps_per_second": 2.011, "step": 342 }, { "epoch": 0.6596153846153846, "grad_norm": 0.15638679265975952, "learning_rate": 3.4230769230769234e-05, "loss": 0.0037, "step": 343 }, { "epoch": 0.6596153846153846, "eval_loss": 0.01907580904662609, "eval_runtime": 180.9808, "eval_samples_per_second": 254.839, "eval_steps_per_second": 1.995, "step": 343 }, { "epoch": 0.6615384615384615, "grad_norm": 1.0095447301864624, "learning_rate": 3.4038461538461544e-05, "loss": 0.029, "step": 344 }, { "epoch": 0.6615384615384615, "eval_loss": 0.018844593316316605, "eval_runtime": 181.0563, "eval_samples_per_second": 254.733, "eval_steps_per_second": 1.994, "step": 344 }, { "epoch": 0.6634615384615384, "grad_norm": 1.0952856540679932, "learning_rate": 3.384615384615385e-05, "loss": 0.0253, "step": 345 }, { "epoch": 0.6634615384615384, "eval_loss": 0.01858992874622345, "eval_runtime": 180.445, "eval_samples_per_second": 255.596, "eval_steps_per_second": 2.001, "step": 345 }, { "epoch": 0.6653846153846154, "grad_norm": 3.664583683013916, "learning_rate": 3.365384615384616e-05, "loss": 0.0702, "step": 346 }, { "epoch": 0.6653846153846154, "eval_loss": 0.01833895593881607, "eval_runtime": 181.7517, "eval_samples_per_second": 253.758, "eval_steps_per_second": 1.986, "step": 346 }, { "epoch": 0.6673076923076923, "grad_norm": 2.9066598415374756, "learning_rate": 3.346153846153846e-05, "loss": 0.0194, "step": 347 }, { "epoch": 0.6673076923076923, "eval_loss": 0.018282251432538033, "eval_runtime": 179.7843, "eval_samples_per_second": 256.535, "eval_steps_per_second": 2.008, "step": 347 }, { "epoch": 0.6692307692307692, "grad_norm": 1.9174058437347412, "learning_rate": 3.326923076923077e-05, "loss": 0.0123, "step": 348 }, { "epoch": 0.6692307692307692, "eval_loss": 0.018363026902079582, "eval_runtime": 179.7461, "eval_samples_per_second": 256.59, "eval_steps_per_second": 2.008, "step": 348 }, { "epoch": 0.6711538461538461, "grad_norm": 0.2594242990016937, "learning_rate": 3.307692307692308e-05, "loss": 0.0039, "step": 349 }, { "epoch": 0.6711538461538461, "eval_loss": 0.018489746376872063, "eval_runtime": 179.7349, "eval_samples_per_second": 256.606, "eval_steps_per_second": 2.009, "step": 349 }, { "epoch": 0.6730769230769231, "grad_norm": 5.305335521697998, "learning_rate": 3.288461538461539e-05, "loss": 0.0745, "step": 350 }, { "epoch": 0.6730769230769231, "eval_loss": 0.01867184229195118, "eval_runtime": 181.4346, "eval_samples_per_second": 254.202, "eval_steps_per_second": 1.99, "step": 350 } ], "logging_steps": 1, "max_steps": 520, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 22569032908800.0, "train_batch_size": 128, "trial_name": null, "trial_params": null }