{
  "best_global_step": 300,
  "best_metric": 0.017029576003551483,
  "best_model_checkpoint": "./results/checkpoint-300",
  "epoch": 0.6730769230769231,
  "eval_steps": 1,
  "global_step": 350,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0019230769230769232,
      "grad_norm": 7.937829494476318,
      "learning_rate": 0.0001,
      "loss": 0.9513,
      "step": 1
    },
    {
      "epoch": 0.0019230769230769232,
      "eval_loss": 0.2997034788131714,
      "eval_runtime": 193.365,
      "eval_samples_per_second": 238.518,
      "eval_steps_per_second": 1.867,
      "step": 1
    },
    {
      "epoch": 0.0038461538461538464,
      "grad_norm": 7.157807350158691,
      "learning_rate": 9.980769230769231e-05,
      "loss": 0.5562,
      "step": 2
    },
    {
      "epoch": 0.0038461538461538464,
      "eval_loss": 0.2078278511762619,
      "eval_runtime": 194.5982,
      "eval_samples_per_second": 237.006,
      "eval_steps_per_second": 1.855,
      "step": 2
    },
    {
      "epoch": 0.0057692307692307696,
      "grad_norm": 8.241674423217773,
      "learning_rate": 9.961538461538463e-05,
      "loss": 0.439,
      "step": 3
    },
    {
      "epoch": 0.0057692307692307696,
      "eval_loss": 0.12207631766796112,
      "eval_runtime": 227.6594,
      "eval_samples_per_second": 202.588,
      "eval_steps_per_second": 1.586,
      "step": 3
    },
    {
      "epoch": 0.007692307692307693,
      "grad_norm": 5.947928428649902,
      "learning_rate": 9.942307692307693e-05,
      "loss": 0.3473,
      "step": 4
    },
    {
      "epoch": 0.007692307692307693,
      "eval_loss": 0.0955578088760376,
      "eval_runtime": 201.925,
      "eval_samples_per_second": 228.407,
      "eval_steps_per_second": 1.788,
      "step": 4
    },
    {
      "epoch": 0.009615384615384616,
      "grad_norm": 3.876149892807007,
      "learning_rate": 9.923076923076923e-05,
      "loss": 0.1832,
      "step": 5
    },
    {
      "epoch": 0.009615384615384616,
      "eval_loss": 0.07960505038499832,
      "eval_runtime": 194.1962,
      "eval_samples_per_second": 237.497,
      "eval_steps_per_second": 1.859,
      "step": 5
    },
    {
      "epoch": 0.011538461538461539,
      "grad_norm": 2.4056999683380127,
      "learning_rate": 9.903846153846155e-05,
      "loss": 0.145,
      "step": 6
    },
    {
      "epoch": 0.011538461538461539,
      "eval_loss": 0.07136845588684082,
      "eval_runtime": 190.7225,
      "eval_samples_per_second": 241.823,
      "eval_steps_per_second": 1.893,
      "step": 6
    },
    {
      "epoch": 0.013461538461538462,
      "grad_norm": 3.1581223011016846,
      "learning_rate": 9.884615384615386e-05,
      "loss": 0.15,
      "step": 7
    },
    {
      "epoch": 0.013461538461538462,
      "eval_loss": 0.0684390515089035,
      "eval_runtime": 195.3393,
      "eval_samples_per_second": 236.107,
      "eval_steps_per_second": 1.848,
      "step": 7
    },
    {
      "epoch": 0.015384615384615385,
      "grad_norm": 3.687472343444824,
      "learning_rate": 9.865384615384616e-05,
      "loss": 0.1821,
      "step": 8
    },
    {
      "epoch": 0.015384615384615385,
      "eval_loss": 0.07759178429841995,
      "eval_runtime": 192.748,
      "eval_samples_per_second": 239.281,
      "eval_steps_per_second": 1.873,
      "step": 8
    },
    {
      "epoch": 0.01730769230769231,
      "grad_norm": 1.418228268623352,
      "learning_rate": 9.846153846153848e-05,
      "loss": 0.0854,
      "step": 9
    },
    {
      "epoch": 0.01730769230769231,
      "eval_loss": 0.10003213584423065,
      "eval_runtime": 190.3642,
      "eval_samples_per_second": 242.278,
      "eval_steps_per_second": 1.896,
      "step": 9
    },
    {
      "epoch": 0.019230769230769232,
      "grad_norm": 4.458162784576416,
      "learning_rate": 9.826923076923077e-05,
      "loss": 0.1871,
      "step": 10
    },
    {
      "epoch": 0.019230769230769232,
      "eval_loss": 0.1014280766248703,
      "eval_runtime": 189.4477,
      "eval_samples_per_second": 243.45,
      "eval_steps_per_second": 1.906,
      "step": 10
    },
    {
      "epoch": 0.021153846153846155,
      "grad_norm": 5.193276882171631,
      "learning_rate": 9.807692307692307e-05,
      "loss": 0.1438,
      "step": 11
    },
    {
      "epoch": 0.021153846153846155,
      "eval_loss": 0.08028294146060944,
      "eval_runtime": 186.9296,
      "eval_samples_per_second": 246.729,
      "eval_steps_per_second": 1.931,
      "step": 11
    },
    {
      "epoch": 0.023076923076923078,
      "grad_norm": 2.2287089824676514,
      "learning_rate": 9.788461538461539e-05,
      "loss": 0.1486,
      "step": 12
    },
    {
      "epoch": 0.023076923076923078,
      "eval_loss": 0.07001757621765137,
      "eval_runtime": 184.0254,
      "eval_samples_per_second": 250.623,
      "eval_steps_per_second": 1.962,
      "step": 12
    },
    {
      "epoch": 0.025,
      "grad_norm": 1.668630838394165,
      "learning_rate": 9.76923076923077e-05,
      "loss": 0.0869,
      "step": 13
    },
    {
      "epoch": 0.025,
      "eval_loss": 0.059553906321525574,
      "eval_runtime": 181.9616,
      "eval_samples_per_second": 253.466,
      "eval_steps_per_second": 1.984,
      "step": 13
    },
    {
      "epoch": 0.026923076923076925,
      "grad_norm": 1.8616667985916138,
      "learning_rate": 9.75e-05,
      "loss": 0.1096,
      "step": 14
    },
    {
      "epoch": 0.026923076923076925,
      "eval_loss": 0.0529908612370491,
      "eval_runtime": 183.3273,
      "eval_samples_per_second": 251.577,
      "eval_steps_per_second": 1.969,
      "step": 14
    },
    {
      "epoch": 0.028846153846153848,
      "grad_norm": 2.9516372680664062,
      "learning_rate": 9.730769230769232e-05,
      "loss": 0.1237,
      "step": 15
    },
    {
      "epoch": 0.028846153846153848,
      "eval_loss": 0.052440524101257324,
      "eval_runtime": 184.1599,
      "eval_samples_per_second": 250.44,
      "eval_steps_per_second": 1.96,
      "step": 15
    },
    {
      "epoch": 0.03076923076923077,
      "grad_norm": 1.757940649986267,
      "learning_rate": 9.711538461538462e-05,
      "loss": 0.0928,
      "step": 16
    },
    {
      "epoch": 0.03076923076923077,
      "eval_loss": 0.05516982078552246,
      "eval_runtime": 184.2604,
      "eval_samples_per_second": 250.303,
      "eval_steps_per_second": 1.959,
      "step": 16
    },
    {
      "epoch": 0.032692307692307694,
      "grad_norm": 2.269965887069702,
      "learning_rate": 9.692307692307692e-05,
      "loss": 0.0939,
      "step": 17
    },
    {
      "epoch": 0.032692307692307694,
      "eval_loss": 0.058670658618211746,
      "eval_runtime": 184.3309,
      "eval_samples_per_second": 250.208,
      "eval_steps_per_second": 1.958,
      "step": 17
    },
    {
      "epoch": 0.03461538461538462,
      "grad_norm": 2.7135467529296875,
      "learning_rate": 9.673076923076924e-05,
      "loss": 0.0731,
      "step": 18
    },
    {
      "epoch": 0.03461538461538462,
      "eval_loss": 0.06141166388988495,
      "eval_runtime": 184.7185,
      "eval_samples_per_second": 249.683,
      "eval_steps_per_second": 1.954,
      "step": 18
    },
    {
      "epoch": 0.03653846153846154,
      "grad_norm": 1.853324294090271,
      "learning_rate": 9.653846153846155e-05,
      "loss": 0.0995,
      "step": 19
    },
    {
      "epoch": 0.03653846153846154,
      "eval_loss": 0.06201282888650894,
      "eval_runtime": 185.1901,
      "eval_samples_per_second": 249.047,
      "eval_steps_per_second": 1.949,
      "step": 19
    },
    {
      "epoch": 0.038461538461538464,
      "grad_norm": 3.1507911682128906,
      "learning_rate": 9.634615384615385e-05,
      "loss": 0.1082,
      "step": 20
    },
    {
      "epoch": 0.038461538461538464,
      "eval_loss": 0.054486844688653946,
      "eval_runtime": 186.7571,
      "eval_samples_per_second": 246.957,
      "eval_steps_per_second": 1.933,
      "step": 20
    },
    {
      "epoch": 0.04038461538461539,
      "grad_norm": 1.8622870445251465,
      "learning_rate": 9.615384615384617e-05,
      "loss": 0.1058,
      "step": 21
    },
    {
      "epoch": 0.04038461538461539,
      "eval_loss": 0.04959222301840782,
      "eval_runtime": 185.3392,
      "eval_samples_per_second": 248.846,
      "eval_steps_per_second": 1.948,
      "step": 21
    },
    {
      "epoch": 0.04230769230769231,
      "grad_norm": 2.0501298904418945,
      "learning_rate": 9.596153846153847e-05,
      "loss": 0.0793,
      "step": 22
    },
    {
      "epoch": 0.04230769230769231,
      "eval_loss": 0.04307747259736061,
      "eval_runtime": 186.5253,
      "eval_samples_per_second": 247.264,
      "eval_steps_per_second": 1.935,
      "step": 22
    },
    {
      "epoch": 0.04423076923076923,
      "grad_norm": 2.891623020172119,
      "learning_rate": 9.576923076923078e-05,
      "loss": 0.108,
      "step": 23
    },
    {
      "epoch": 0.04423076923076923,
      "eval_loss": 0.039741210639476776,
      "eval_runtime": 186.9639,
      "eval_samples_per_second": 246.684,
      "eval_steps_per_second": 1.931,
      "step": 23
    },
    {
      "epoch": 0.046153846153846156,
      "grad_norm": 3.0970637798309326,
      "learning_rate": 9.557692307692308e-05,
      "loss": 0.1315,
      "step": 24
    },
    {
      "epoch": 0.046153846153846156,
      "eval_loss": 0.03945652395486832,
      "eval_runtime": 184.7227,
      "eval_samples_per_second": 249.677,
      "eval_steps_per_second": 1.954,
      "step": 24
    },
    {
      "epoch": 0.04807692307692308,
      "grad_norm": 2.6016592979431152,
      "learning_rate": 9.53846153846154e-05,
      "loss": 0.0548,
      "step": 25
    },
    {
      "epoch": 0.04807692307692308,
      "eval_loss": 0.041967593133449554,
      "eval_runtime": 184.27,
      "eval_samples_per_second": 250.29,
      "eval_steps_per_second": 1.959,
      "step": 25
    },
    {
      "epoch": 0.05,
      "grad_norm": 2.6659207344055176,
      "learning_rate": 9.519230769230769e-05,
      "loss": 0.0879,
      "step": 26
    },
    {
      "epoch": 0.05,
      "eval_loss": 0.05009755492210388,
      "eval_runtime": 183.7036,
      "eval_samples_per_second": 251.062,
      "eval_steps_per_second": 1.965,
      "step": 26
    },
    {
      "epoch": 0.051923076923076926,
      "grad_norm": 1.676653265953064,
      "learning_rate": 9.5e-05,
      "loss": 0.0619,
      "step": 27
    },
    {
      "epoch": 0.051923076923076926,
      "eval_loss": 0.05302096903324127,
      "eval_runtime": 184.6409,
      "eval_samples_per_second": 249.788,
      "eval_steps_per_second": 1.955,
      "step": 27
    },
    {
      "epoch": 0.05384615384615385,
      "grad_norm": 1.7307082414627075,
      "learning_rate": 9.480769230769231e-05,
      "loss": 0.0603,
      "step": 28
    },
    {
      "epoch": 0.05384615384615385,
      "eval_loss": 0.050088509917259216,
      "eval_runtime": 185.2218,
      "eval_samples_per_second": 249.004,
      "eval_steps_per_second": 1.949,
      "step": 28
    },
    {
      "epoch": 0.05576923076923077,
      "grad_norm": 2.250020742416382,
      "learning_rate": 9.461538461538461e-05,
      "loss": 0.1022,
      "step": 29
    },
    {
      "epoch": 0.05576923076923077,
      "eval_loss": 0.043293166905641556,
      "eval_runtime": 184.2912,
      "eval_samples_per_second": 250.262,
      "eval_steps_per_second": 1.959,
      "step": 29
    },
    {
      "epoch": 0.057692307692307696,
      "grad_norm": 1.9879577159881592,
      "learning_rate": 9.442307692307693e-05,
      "loss": 0.0528,
      "step": 30
    },
    {
      "epoch": 0.057692307692307696,
      "eval_loss": 0.04137587174773216,
      "eval_runtime": 185.0258,
      "eval_samples_per_second": 249.268,
      "eval_steps_per_second": 1.951,
      "step": 30
    },
    {
      "epoch": 0.05961538461538462,
      "grad_norm": 1.5150986909866333,
      "learning_rate": 9.423076923076924e-05,
      "loss": 0.0844,
      "step": 31
    },
    {
      "epoch": 0.05961538461538462,
      "eval_loss": 0.0410715714097023,
      "eval_runtime": 183.2326,
      "eval_samples_per_second": 251.707,
      "eval_steps_per_second": 1.97,
      "step": 31
    },
    {
      "epoch": 0.06153846153846154,
      "grad_norm": 2.154343605041504,
      "learning_rate": 9.403846153846154e-05,
      "loss": 0.0756,
      "step": 32
    },
    {
      "epoch": 0.06153846153846154,
      "eval_loss": 0.04246753454208374,
      "eval_runtime": 185.129,
      "eval_samples_per_second": 249.129,
      "eval_steps_per_second": 1.95,
      "step": 32
    },
    {
      "epoch": 0.06346153846153846,
      "grad_norm": 2.0272536277770996,
      "learning_rate": 9.384615384615386e-05,
      "loss": 0.0892,
      "step": 33
    },
    {
      "epoch": 0.06346153846153846,
      "eval_loss": 0.04080929979681969,
      "eval_runtime": 182.0873,
      "eval_samples_per_second": 253.291,
      "eval_steps_per_second": 1.983,
      "step": 33
    },
    {
      "epoch": 0.06538461538461539,
      "grad_norm": 1.3116830587387085,
      "learning_rate": 9.365384615384616e-05,
      "loss": 0.0569,
      "step": 34
    },
    {
      "epoch": 0.06538461538461539,
      "eval_loss": 0.04051681235432625,
      "eval_runtime": 182.1351,
      "eval_samples_per_second": 253.224,
      "eval_steps_per_second": 1.982,
      "step": 34
    },
    {
      "epoch": 0.0673076923076923,
      "grad_norm": 1.3111695051193237,
      "learning_rate": 9.346153846153846e-05,
      "loss": 0.0717,
      "step": 35
    },
    {
      "epoch": 0.0673076923076923,
      "eval_loss": 0.0393197201192379,
      "eval_runtime": 184.6064,
      "eval_samples_per_second": 249.834,
      "eval_steps_per_second": 1.956,
      "step": 35
    },
    {
      "epoch": 0.06923076923076923,
      "grad_norm": 0.8019259572029114,
      "learning_rate": 9.326923076923077e-05,
      "loss": 0.0428,
      "step": 36
    },
    {
      "epoch": 0.06923076923076923,
      "eval_loss": 0.03660094365477562,
      "eval_runtime": 182.2104,
      "eval_samples_per_second": 253.12,
      "eval_steps_per_second": 1.981,
      "step": 36
    },
    {
      "epoch": 0.07115384615384615,
      "grad_norm": 2.83894419670105,
      "learning_rate": 9.307692307692309e-05,
      "loss": 0.105,
      "step": 37
    },
    {
      "epoch": 0.07115384615384615,
      "eval_loss": 0.03613066300749779,
      "eval_runtime": 184.1752,
      "eval_samples_per_second": 250.419,
      "eval_steps_per_second": 1.96,
      "step": 37
    },
    {
      "epoch": 0.07307692307692308,
      "grad_norm": 1.8959332704544067,
      "learning_rate": 9.288461538461539e-05,
      "loss": 0.0419,
      "step": 38
    },
    {
      "epoch": 0.07307692307692308,
      "eval_loss": 0.03350270912051201,
      "eval_runtime": 184.6287,
      "eval_samples_per_second": 249.804,
      "eval_steps_per_second": 1.955,
      "step": 38
    },
    {
      "epoch": 0.075,
      "grad_norm": 2.977259397506714,
      "learning_rate": 9.26923076923077e-05,
      "loss": 0.1345,
      "step": 39
    },
    {
      "epoch": 0.075,
      "eval_loss": 0.03360990434885025,
      "eval_runtime": 184.2045,
      "eval_samples_per_second": 250.379,
      "eval_steps_per_second": 1.96,
      "step": 39
    },
    {
      "epoch": 0.07692307692307693,
      "grad_norm": 1.0986078977584839,
      "learning_rate": 9.250000000000001e-05,
      "loss": 0.0586,
      "step": 40
    },
    {
      "epoch": 0.07692307692307693,
      "eval_loss": 0.03480533882975578,
      "eval_runtime": 185.2469,
      "eval_samples_per_second": 248.97,
      "eval_steps_per_second": 1.949,
      "step": 40
    },
    {
      "epoch": 0.07884615384615384,
      "grad_norm": 2.1644158363342285,
      "learning_rate": 9.230769230769232e-05,
      "loss": 0.048,
      "step": 41
    },
    {
      "epoch": 0.07884615384615384,
      "eval_loss": 0.03521328046917915,
      "eval_runtime": 183.8053,
      "eval_samples_per_second": 250.923,
      "eval_steps_per_second": 1.964,
      "step": 41
    },
    {
      "epoch": 0.08076923076923077,
      "grad_norm": 4.704512596130371,
      "learning_rate": 9.211538461538462e-05,
      "loss": 0.0967,
      "step": 42
    },
    {
      "epoch": 0.08076923076923077,
      "eval_loss": 0.039289865642786026,
      "eval_runtime": 183.4419,
      "eval_samples_per_second": 251.42,
      "eval_steps_per_second": 1.968,
      "step": 42
    },
    {
      "epoch": 0.08269230769230769,
      "grad_norm": 1.3928428888320923,
      "learning_rate": 9.192307692307692e-05,
      "loss": 0.0492,
      "step": 43
    },
    {
      "epoch": 0.08269230769230769,
      "eval_loss": 0.04399557411670685,
      "eval_runtime": 183.3842,
      "eval_samples_per_second": 251.499,
      "eval_steps_per_second": 1.969,
      "step": 43
    },
    {
      "epoch": 0.08461538461538462,
      "grad_norm": 1.3426079750061035,
      "learning_rate": 9.173076923076923e-05,
      "loss": 0.0756,
      "step": 44
    },
    {
      "epoch": 0.08461538461538462,
      "eval_loss": 0.04704272374510765,
      "eval_runtime": 184.0623,
      "eval_samples_per_second": 250.573,
      "eval_steps_per_second": 1.961,
      "step": 44
    },
    {
      "epoch": 0.08653846153846154,
      "grad_norm": 2.435299873352051,
      "learning_rate": 9.153846153846155e-05,
      "loss": 0.0769,
      "step": 45
    },
    {
      "epoch": 0.08653846153846154,
      "eval_loss": 0.04405398294329643,
      "eval_runtime": 186.5663,
      "eval_samples_per_second": 247.21,
      "eval_steps_per_second": 1.935,
      "step": 45
    },
    {
      "epoch": 0.08846153846153847,
      "grad_norm": 2.430859088897705,
      "learning_rate": 9.134615384615385e-05,
      "loss": 0.114,
      "step": 46
    },
    {
      "epoch": 0.08846153846153847,
      "eval_loss": 0.041341982781887054,
      "eval_runtime": 184.1332,
      "eval_samples_per_second": 250.476,
      "eval_steps_per_second": 1.961,
      "step": 46
    },
    {
      "epoch": 0.09038461538461538,
      "grad_norm": 2.121349334716797,
      "learning_rate": 9.115384615384615e-05,
      "loss": 0.0448,
      "step": 47
    },
    {
      "epoch": 0.09038461538461538,
      "eval_loss": 0.036010172218084335,
      "eval_runtime": 182.9905,
      "eval_samples_per_second": 252.04,
      "eval_steps_per_second": 1.973,
      "step": 47
    },
    {
      "epoch": 0.09230769230769231,
      "grad_norm": 1.8270937204360962,
      "learning_rate": 9.096153846153846e-05,
      "loss": 0.0642,
      "step": 48
    },
    {
      "epoch": 0.09230769230769231,
      "eval_loss": 0.034751046448946,
      "eval_runtime": 182.6922,
      "eval_samples_per_second": 252.452,
      "eval_steps_per_second": 1.976,
      "step": 48
    },
    {
      "epoch": 0.09423076923076923,
      "grad_norm": 1.234506368637085,
      "learning_rate": 9.076923076923078e-05,
      "loss": 0.0556,
      "step": 49
    },
    {
      "epoch": 0.09423076923076923,
      "eval_loss": 0.0333593524992466,
      "eval_runtime": 183.8994,
      "eval_samples_per_second": 250.795,
      "eval_steps_per_second": 1.963,
      "step": 49
    },
    {
      "epoch": 0.09615384615384616,
      "grad_norm": 1.265337347984314,
      "learning_rate": 9.057692307692308e-05,
      "loss": 0.0641,
      "step": 50
    },
    {
      "epoch": 0.09615384615384616,
      "eval_loss": 0.03294428065419197,
      "eval_runtime": 183.2342,
      "eval_samples_per_second": 251.705,
      "eval_steps_per_second": 1.97,
      "step": 50
    },
    {
      "epoch": 0.09807692307692308,
      "grad_norm": 1.0051465034484863,
      "learning_rate": 9.038461538461538e-05,
      "loss": 0.0402,
      "step": 51
    },
    {
      "epoch": 0.09807692307692308,
      "eval_loss": 0.03119943104684353,
      "eval_runtime": 183.9091,
      "eval_samples_per_second": 250.781,
      "eval_steps_per_second": 1.963,
      "step": 51
    },
    {
      "epoch": 0.1,
      "grad_norm": 1.3566607236862183,
      "learning_rate": 9.01923076923077e-05,
      "loss": 0.0447,
      "step": 52
    },
    {
      "epoch": 0.1,
      "eval_loss": 0.03182150423526764,
      "eval_runtime": 183.9643,
      "eval_samples_per_second": 250.706,
      "eval_steps_per_second": 1.962,
      "step": 52
    },
    {
      "epoch": 0.10192307692307692,
      "grad_norm": 3.3966872692108154,
      "learning_rate": 9e-05,
      "loss": 0.08,
      "step": 53
    },
    {
      "epoch": 0.10192307692307692,
      "eval_loss": 0.03427828475832939,
      "eval_runtime": 183.0367,
      "eval_samples_per_second": 251.977,
      "eval_steps_per_second": 1.972,
      "step": 53
    },
    {
      "epoch": 0.10384615384615385,
      "grad_norm": 1.4918586015701294,
      "learning_rate": 8.980769230769231e-05,
      "loss": 0.0785,
      "step": 54
    },
    {
      "epoch": 0.10384615384615385,
      "eval_loss": 0.03516368940472603,
      "eval_runtime": 182.9678,
      "eval_samples_per_second": 252.072,
      "eval_steps_per_second": 1.973,
      "step": 54
    },
    {
      "epoch": 0.10576923076923077,
      "grad_norm": 1.2416688203811646,
      "learning_rate": 8.961538461538463e-05,
      "loss": 0.0422,
      "step": 55
    },
    {
      "epoch": 0.10576923076923077,
      "eval_loss": 0.0376507006585598,
      "eval_runtime": 183.9714,
      "eval_samples_per_second": 250.697,
      "eval_steps_per_second": 1.962,
      "step": 55
    },
    {
      "epoch": 0.1076923076923077,
      "grad_norm": 2.5123279094696045,
      "learning_rate": 8.942307692307693e-05,
      "loss": 0.1594,
      "step": 56
    },
    {
      "epoch": 0.1076923076923077,
      "eval_loss": 0.038452692329883575,
      "eval_runtime": 182.5423,
      "eval_samples_per_second": 252.659,
      "eval_steps_per_second": 1.978,
      "step": 56
    },
    {
      "epoch": 0.10961538461538461,
      "grad_norm": 1.8067007064819336,
      "learning_rate": 8.923076923076924e-05,
      "loss": 0.0918,
      "step": 57
    },
    {
      "epoch": 0.10961538461538461,
      "eval_loss": 0.036170173436403275,
      "eval_runtime": 182.3949,
      "eval_samples_per_second": 252.863,
      "eval_steps_per_second": 1.979,
      "step": 57
    },
    {
      "epoch": 0.11153846153846154,
      "grad_norm": 2.269913911819458,
      "learning_rate": 8.903846153846154e-05,
      "loss": 0.0715,
      "step": 58
    },
    {
      "epoch": 0.11153846153846154,
      "eval_loss": 0.03197428211569786,
      "eval_runtime": 182.2905,
      "eval_samples_per_second": 253.008,
      "eval_steps_per_second": 1.98,
      "step": 58
    },
    {
      "epoch": 0.11346153846153846,
      "grad_norm": 1.736000418663025,
      "learning_rate": 8.884615384615384e-05,
      "loss": 0.0571,
      "step": 59
    },
    {
      "epoch": 0.11346153846153846,
      "eval_loss": 0.02888152375817299,
      "eval_runtime": 182.3969,
      "eval_samples_per_second": 252.861,
      "eval_steps_per_second": 1.979,
      "step": 59
    },
    {
      "epoch": 0.11538461538461539,
      "grad_norm": 1.3405442237854004,
      "learning_rate": 8.865384615384615e-05,
      "loss": 0.0856,
      "step": 60
    },
    {
      "epoch": 0.11538461538461539,
      "eval_loss": 0.026720238849520683,
      "eval_runtime": 182.7564,
      "eval_samples_per_second": 252.363,
      "eval_steps_per_second": 1.975,
      "step": 60
    },
    {
      "epoch": 0.11730769230769231,
      "grad_norm": 3.9098620414733887,
      "learning_rate": 8.846153846153847e-05,
      "loss": 0.0663,
      "step": 61
    },
    {
      "epoch": 0.11730769230769231,
      "eval_loss": 0.02634381875395775,
      "eval_runtime": 182.2338,
      "eval_samples_per_second": 253.087,
      "eval_steps_per_second": 1.981,
      "step": 61
    },
    {
      "epoch": 0.11923076923076924,
      "grad_norm": 2.598466634750366,
      "learning_rate": 8.826923076923077e-05,
      "loss": 0.0571,
      "step": 62
    },
    {
      "epoch": 0.11923076923076924,
      "eval_loss": 0.027414267882704735,
      "eval_runtime": 184.1358,
      "eval_samples_per_second": 250.473,
      "eval_steps_per_second": 1.961,
      "step": 62
    },
    {
      "epoch": 0.12115384615384615,
      "grad_norm": 2.4408273696899414,
      "learning_rate": 8.807692307692307e-05,
      "loss": 0.0538,
      "step": 63
    },
    {
      "epoch": 0.12115384615384615,
      "eval_loss": 0.029992803931236267,
      "eval_runtime": 183.9982,
      "eval_samples_per_second": 250.66,
      "eval_steps_per_second": 1.962,
      "step": 63
    },
    {
      "epoch": 0.12307692307692308,
      "grad_norm": 1.904429316520691,
      "learning_rate": 8.788461538461539e-05,
      "loss": 0.0478,
      "step": 64
    },
    {
      "epoch": 0.12307692307692308,
      "eval_loss": 0.03255239129066467,
      "eval_runtime": 183.9668,
      "eval_samples_per_second": 250.703,
      "eval_steps_per_second": 1.962,
      "step": 64
    },
    {
      "epoch": 0.125,
      "grad_norm": 2.515045166015625,
      "learning_rate": 8.76923076923077e-05,
      "loss": 0.0336,
      "step": 65
    },
    {
      "epoch": 0.125,
      "eval_loss": 0.03170439973473549,
      "eval_runtime": 184.8246,
      "eval_samples_per_second": 249.539,
      "eval_steps_per_second": 1.953,
      "step": 65
    },
    {
      "epoch": 0.12692307692307692,
      "grad_norm": 1.5397083759307861,
      "learning_rate": 8.75e-05,
      "loss": 0.0335,
      "step": 66
    },
    {
      "epoch": 0.12692307692307692,
      "eval_loss": 0.029536928981542587,
      "eval_runtime": 184.1714,
      "eval_samples_per_second": 250.424,
      "eval_steps_per_second": 1.96,
      "step": 66
    },
    {
      "epoch": 0.12884615384615383,
      "grad_norm": 1.85903000831604,
      "learning_rate": 8.730769230769232e-05,
      "loss": 0.0543,
      "step": 67
    },
    {
      "epoch": 0.12884615384615383,
      "eval_loss": 0.02915562316775322,
      "eval_runtime": 184.0688,
      "eval_samples_per_second": 250.564,
      "eval_steps_per_second": 1.961,
      "step": 67
    },
    {
      "epoch": 0.13076923076923078,
      "grad_norm": 2.114628791809082,
      "learning_rate": 8.711538461538462e-05,
      "loss": 0.0557,
      "step": 68
    },
    {
      "epoch": 0.13076923076923078,
      "eval_loss": 0.02956259623169899,
      "eval_runtime": 185.074,
      "eval_samples_per_second": 249.203,
      "eval_steps_per_second": 1.951,
      "step": 68
    },
    {
      "epoch": 0.1326923076923077,
      "grad_norm": 0.6971794962882996,
      "learning_rate": 8.692307692307692e-05,
      "loss": 0.0516,
      "step": 69
    },
    {
      "epoch": 0.1326923076923077,
      "eval_loss": 0.030795959755778313,
      "eval_runtime": 184.0759,
      "eval_samples_per_second": 250.554,
      "eval_steps_per_second": 1.961,
      "step": 69
    },
    {
      "epoch": 0.1346153846153846,
      "grad_norm": 0.9613048434257507,
      "learning_rate": 8.673076923076924e-05,
      "loss": 0.0615,
      "step": 70
    },
    {
      "epoch": 0.1346153846153846,
      "eval_loss": 0.032335903495550156,
      "eval_runtime": 183.9235,
      "eval_samples_per_second": 250.762,
      "eval_steps_per_second": 1.963,
      "step": 70
    },
    {
      "epoch": 0.13653846153846153,
      "grad_norm": 2.4344446659088135,
      "learning_rate": 8.653846153846155e-05,
      "loss": 0.0757,
      "step": 71
    },
    {
      "epoch": 0.13653846153846153,
      "eval_loss": 0.030952226370573044,
      "eval_runtime": 182.946,
      "eval_samples_per_second": 252.102,
      "eval_steps_per_second": 1.973,
      "step": 71
    },
    {
      "epoch": 0.13846153846153847,
      "grad_norm": 0.8824633955955505,
      "learning_rate": 8.634615384615385e-05,
      "loss": 0.0385,
      "step": 72
    },
    {
      "epoch": 0.13846153846153847,
      "eval_loss": 0.031768955290317535,
      "eval_runtime": 182.333,
      "eval_samples_per_second": 252.949,
      "eval_steps_per_second": 1.98,
      "step": 72
    },
    {
      "epoch": 0.14038461538461539,
      "grad_norm": 1.980722188949585,
      "learning_rate": 8.615384615384617e-05,
      "loss": 0.0624,
      "step": 73
    },
    {
      "epoch": 0.14038461538461539,
      "eval_loss": 0.03059910237789154,
      "eval_runtime": 182.0241,
      "eval_samples_per_second": 253.378,
      "eval_steps_per_second": 1.983,
      "step": 73
    },
    {
      "epoch": 0.1423076923076923,
      "grad_norm": 1.5362180471420288,
      "learning_rate": 8.596153846153847e-05,
      "loss": 0.0779,
      "step": 74
    },
    {
      "epoch": 0.1423076923076923,
      "eval_loss": 0.028434548527002335,
      "eval_runtime": 181.9839,
      "eval_samples_per_second": 253.435,
      "eval_steps_per_second": 1.984,
      "step": 74
    },
    {
      "epoch": 0.14423076923076922,
      "grad_norm": 4.635537147521973,
      "learning_rate": 8.576923076923076e-05,
      "loss": 0.093,
      "step": 75
    },
    {
      "epoch": 0.14423076923076922,
      "eval_loss": 0.029211917892098427,
      "eval_runtime": 182.7811,
      "eval_samples_per_second": 252.329,
      "eval_steps_per_second": 1.975,
      "step": 75
    },
    {
      "epoch": 0.14615384615384616,
      "grad_norm": 0.548228919506073,
      "learning_rate": 8.557692307692308e-05,
      "loss": 0.0144,
      "step": 76
    },
    {
      "epoch": 0.14615384615384616,
      "eval_loss": 0.02987842820584774,
      "eval_runtime": 183.1098,
      "eval_samples_per_second": 251.876,
      "eval_steps_per_second": 1.971,
      "step": 76
    },
    {
      "epoch": 0.14807692307692308,
      "grad_norm": 2.241633176803589,
      "learning_rate": 8.538461538461538e-05,
      "loss": 0.0331,
      "step": 77
    },
    {
      "epoch": 0.14807692307692308,
      "eval_loss": 0.029068879783153534,
      "eval_runtime": 182.0148,
      "eval_samples_per_second": 253.391,
      "eval_steps_per_second": 1.983,
      "step": 77
    },
    {
      "epoch": 0.15,
      "grad_norm": 2.180568218231201,
      "learning_rate": 8.519230769230769e-05,
      "loss": 0.0683,
      "step": 78
    },
    {
      "epoch": 0.15,
      "eval_loss": 0.02795836329460144,
      "eval_runtime": 183.975,
      "eval_samples_per_second": 250.692,
      "eval_steps_per_second": 1.962,
      "step": 78
    },
    {
      "epoch": 0.1519230769230769,
      "grad_norm": 2.788595199584961,
      "learning_rate": 8.5e-05,
      "loss": 0.0753,
      "step": 79
    },
    {
      "epoch": 0.1519230769230769,
      "eval_loss": 0.025826551020145416,
      "eval_runtime": 182.2724,
      "eval_samples_per_second": 253.033,
      "eval_steps_per_second": 1.981,
      "step": 79
    },
    {
      "epoch": 0.15384615384615385,
      "grad_norm": 3.0569944381713867,
      "learning_rate": 8.480769230769231e-05,
      "loss": 0.0658,
      "step": 80
    },
    {
      "epoch": 0.15384615384615385,
      "eval_loss": 0.023890940472483635,
      "eval_runtime": 183.5645,
      "eval_samples_per_second": 251.252,
      "eval_steps_per_second": 1.967,
      "step": 80
    },
    {
      "epoch": 0.15576923076923077,
      "grad_norm": 2.4523251056671143,
      "learning_rate": 8.461538461538461e-05,
      "loss": 0.0924,
      "step": 81
    },
    {
      "epoch": 0.15576923076923077,
      "eval_loss": 0.023335987702012062,
      "eval_runtime": 182.5326,
      "eval_samples_per_second": 252.673,
      "eval_steps_per_second": 1.978,
      "step": 81
    },
    {
      "epoch": 0.1576923076923077,
      "grad_norm": 2.6388676166534424,
      "learning_rate": 8.442307692307693e-05,
      "loss": 0.0725,
      "step": 82
    },
    {
      "epoch": 0.1576923076923077,
      "eval_loss": 0.023239314556121826,
      "eval_runtime": 183.2766,
      "eval_samples_per_second": 251.647,
      "eval_steps_per_second": 1.97,
      "step": 82
    },
    {
      "epoch": 0.1596153846153846,
      "grad_norm": 5.005173206329346,
      "learning_rate": 8.423076923076924e-05,
      "loss": 0.0981,
      "step": 83
    },
    {
      "epoch": 0.1596153846153846,
      "eval_loss": 0.023996710777282715,
      "eval_runtime": 182.5391,
      "eval_samples_per_second": 252.664,
      "eval_steps_per_second": 1.978,
      "step": 83
    },
    {
      "epoch": 0.16153846153846155,
      "grad_norm": 2.1891043186187744,
      "learning_rate": 8.403846153846154e-05,
      "loss": 0.0505,
      "step": 84
    },
    {
      "epoch": 0.16153846153846155,
      "eval_loss": 0.026021044701337814,
      "eval_runtime": 181.6905,
      "eval_samples_per_second": 253.844,
      "eval_steps_per_second": 1.987,
      "step": 84
    },
    {
      "epoch": 0.16346153846153846,
      "grad_norm": 1.2925877571105957,
      "learning_rate": 8.384615384615386e-05,
      "loss": 0.0465,
      "step": 85
    },
    {
      "epoch": 0.16346153846153846,
      "eval_loss": 0.029900116845965385,
      "eval_runtime": 183.3081,
      "eval_samples_per_second": 251.604,
      "eval_steps_per_second": 1.969,
      "step": 85
    },
    {
      "epoch": 0.16538461538461538,
      "grad_norm": 2.1117990016937256,
| "learning_rate": 8.365384615384616e-05, | |
| "loss": 0.0444, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.16538461538461538, | |
| "eval_loss": 0.03299647569656372, | |
| "eval_runtime": 182.6014, | |
| "eval_samples_per_second": 252.578, | |
| "eval_steps_per_second": 1.977, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.1673076923076923, | |
| "grad_norm": 2.243436813354492, | |
| "learning_rate": 8.346153846153847e-05, | |
| "loss": 0.0571, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.1673076923076923, | |
| "eval_loss": 0.034737322479486465, | |
| "eval_runtime": 181.6185, | |
| "eval_samples_per_second": 253.944, | |
| "eval_steps_per_second": 1.988, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.16923076923076924, | |
| "grad_norm": 2.0375781059265137, | |
| "learning_rate": 8.326923076923078e-05, | |
| "loss": 0.0529, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.16923076923076924, | |
| "eval_loss": 0.033401209861040115, | |
| "eval_runtime": 182.5847, | |
| "eval_samples_per_second": 252.601, | |
| "eval_steps_per_second": 1.977, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.17115384615384616, | |
| "grad_norm": 1.781933307647705, | |
| "learning_rate": 8.307692307692309e-05, | |
| "loss": 0.0823, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.17115384615384616, | |
| "eval_loss": 0.030983150005340576, | |
| "eval_runtime": 182.1974, | |
| "eval_samples_per_second": 253.137, | |
| "eval_steps_per_second": 1.981, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.17307692307692307, | |
| "grad_norm": 1.0016299486160278, | |
| "learning_rate": 8.288461538461539e-05, | |
| "loss": 0.0234, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.17307692307692307, | |
| "eval_loss": 0.02751440368592739, | |
| "eval_runtime": 182.5732, | |
| "eval_samples_per_second": 252.616, | |
| "eval_steps_per_second": 1.977, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.175, | |
| "grad_norm": 2.1391496658325195, | |
| "learning_rate": 8.26923076923077e-05, | |
| "loss": 0.0802, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.175, | |
| "eval_loss": 0.025574011728167534, | |
| "eval_runtime": 181.6703, | |
| "eval_samples_per_second": 253.872, | |
| "eval_steps_per_second": 1.987, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.17692307692307693, | |
| "grad_norm": 1.8795677423477173, | |
| "learning_rate": 8.25e-05, | |
| "loss": 0.041, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.17692307692307693, | |
| "eval_loss": 0.02361590415239334, | |
| "eval_runtime": 181.4712, | |
| "eval_samples_per_second": 254.151, | |
| "eval_steps_per_second": 1.989, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.17884615384615385, | |
| "grad_norm": 2.376096487045288, | |
| "learning_rate": 8.23076923076923e-05, | |
| "loss": 0.0387, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.17884615384615385, | |
| "eval_loss": 0.022694583982229233, | |
| "eval_runtime": 181.0427, | |
| "eval_samples_per_second": 254.752, | |
| "eval_steps_per_second": 1.994, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.18076923076923077, | |
| "grad_norm": 2.218397855758667, | |
| "learning_rate": 8.211538461538462e-05, | |
| "loss": 0.0409, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.18076923076923077, | |
| "eval_loss": 0.02242557518184185, | |
| "eval_runtime": 182.2902, | |
| "eval_samples_per_second": 253.009, | |
| "eval_steps_per_second": 1.98, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.18269230769230768, | |
| "grad_norm": 2.596670150756836, | |
| "learning_rate": 8.192307692307693e-05, | |
| "loss": 0.0595, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.18269230769230768, | |
| "eval_loss": 0.02277774177491665, | |
| "eval_runtime": 182.0412, | |
| "eval_samples_per_second": 253.355, | |
| "eval_steps_per_second": 1.983, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.18461538461538463, | |
| "grad_norm": 1.311848759651184, | |
| "learning_rate": 8.173076923076923e-05, | |
| "loss": 0.0389, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.18461538461538463, | |
| "eval_loss": 0.024142242968082428, | |
| "eval_runtime": 181.193, | |
| "eval_samples_per_second": 254.541, | |
| "eval_steps_per_second": 1.992, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.18653846153846154, | |
| "grad_norm": 0.9079859256744385, | |
| "learning_rate": 8.153846153846155e-05, | |
| "loss": 0.017, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.18653846153846154, | |
| "eval_loss": 0.027621854096651077, | |
| "eval_runtime": 181.9685, | |
| "eval_samples_per_second": 253.456, | |
| "eval_steps_per_second": 1.984, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.18846153846153846, | |
| "grad_norm": 1.0217198133468628, | |
| "learning_rate": 8.134615384615385e-05, | |
| "loss": 0.0186, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.18846153846153846, | |
| "eval_loss": 0.03133901208639145, | |
| "eval_runtime": 181.8849, | |
| "eval_samples_per_second": 253.572, | |
| "eval_steps_per_second": 1.985, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.19038461538461537, | |
| "grad_norm": 1.6041982173919678, | |
| "learning_rate": 8.115384615384616e-05, | |
| "loss": 0.0528, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.19038461538461537, | |
| "eval_loss": 0.03536279872059822, | |
| "eval_runtime": 181.8467, | |
| "eval_samples_per_second": 253.626, | |
| "eval_steps_per_second": 1.985, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.19230769230769232, | |
| "grad_norm": 2.190931558609009, | |
| "learning_rate": 8.096153846153847e-05, | |
| "loss": 0.0483, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.19230769230769232, | |
| "eval_loss": 0.03583548963069916, | |
| "eval_runtime": 181.8924, | |
| "eval_samples_per_second": 253.562, | |
| "eval_steps_per_second": 1.985, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.19423076923076923, | |
| "grad_norm": 1.281062126159668, | |
| "learning_rate": 8.076923076923078e-05, | |
| "loss": 0.021, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.19423076923076923, | |
| "eval_loss": 0.03581365570425987, | |
| "eval_runtime": 181.2306, | |
| "eval_samples_per_second": 254.488, | |
| "eval_steps_per_second": 1.992, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.19615384615384615, | |
| "grad_norm": 1.3938978910446167, | |
| "learning_rate": 8.057692307692308e-05, | |
| "loss": 0.0326, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.19615384615384615, | |
| "eval_loss": 0.03531914949417114, | |
| "eval_runtime": 182.1448, | |
| "eval_samples_per_second": 253.211, | |
| "eval_steps_per_second": 1.982, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.19807692307692307, | |
| "grad_norm": 3.451387643814087, | |
| "learning_rate": 8.038461538461538e-05, | |
| "loss": 0.1205, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.19807692307692307, | |
| "eval_loss": 0.03354247659444809, | |
| "eval_runtime": 182.8587, | |
| "eval_samples_per_second": 252.222, | |
| "eval_steps_per_second": 1.974, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 1.6962478160858154, | |
| "learning_rate": 8.01923076923077e-05, | |
| "loss": 0.0423, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "eval_loss": 0.031101541593670845, | |
| "eval_runtime": 183.5946, | |
| "eval_samples_per_second": 251.211, | |
| "eval_steps_per_second": 1.966, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.20192307692307693, | |
| "grad_norm": 1.4025405645370483, | |
| "learning_rate": 8e-05, | |
| "loss": 0.0895, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.20192307692307693, | |
| "eval_loss": 0.02922738529741764, | |
| "eval_runtime": 185.2005, | |
| "eval_samples_per_second": 249.033, | |
| "eval_steps_per_second": 1.949, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.20384615384615384, | |
| "grad_norm": 1.1363201141357422, | |
| "learning_rate": 7.980769230769231e-05, | |
| "loss": 0.0254, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.20384615384615384, | |
| "eval_loss": 0.027069460600614548, | |
| "eval_runtime": 183.2009, | |
| "eval_samples_per_second": 251.751, | |
| "eval_steps_per_second": 1.971, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.20576923076923076, | |
| "grad_norm": 0.6673398613929749, | |
| "learning_rate": 7.961538461538461e-05, | |
| "loss": 0.0149, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.20576923076923076, | |
| "eval_loss": 0.025280024856328964, | |
| "eval_runtime": 182.5339, | |
| "eval_samples_per_second": 252.671, | |
| "eval_steps_per_second": 1.978, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.2076923076923077, | |
| "grad_norm": 2.2569580078125, | |
| "learning_rate": 7.942307692307692e-05, | |
| "loss": 0.0917, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.2076923076923077, | |
| "eval_loss": 0.024966858327388763, | |
| "eval_runtime": 183.4982, | |
| "eval_samples_per_second": 251.343, | |
| "eval_steps_per_second": 1.967, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.20961538461538462, | |
| "grad_norm": 2.1165764331817627, | |
| "learning_rate": 7.923076923076924e-05, | |
| "loss": 0.0888, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.20961538461538462, | |
| "eval_loss": 0.02572954259812832, | |
| "eval_runtime": 182.2057, | |
| "eval_samples_per_second": 253.126, | |
| "eval_steps_per_second": 1.981, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.21153846153846154, | |
| "grad_norm": 1.5322438478469849, | |
| "learning_rate": 7.903846153846154e-05, | |
| "loss": 0.0378, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.21153846153846154, | |
| "eval_loss": 0.027774129062891006, | |
| "eval_runtime": 183.4476, | |
| "eval_samples_per_second": 251.412, | |
| "eval_steps_per_second": 1.968, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.21346153846153845, | |
| "grad_norm": 1.5025537014007568, | |
| "learning_rate": 7.884615384615384e-05, | |
| "loss": 0.089, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.21346153846153845, | |
| "eval_loss": 0.03216475620865822, | |
| "eval_runtime": 182.6688, | |
| "eval_samples_per_second": 252.484, | |
| "eval_steps_per_second": 1.976, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.2153846153846154, | |
| "grad_norm": 1.4053198099136353, | |
| "learning_rate": 7.865384615384616e-05, | |
| "loss": 0.0632, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.2153846153846154, | |
| "eval_loss": 0.03782927617430687, | |
| "eval_runtime": 183.2384, | |
| "eval_samples_per_second": 251.699, | |
| "eval_steps_per_second": 1.97, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.2173076923076923, | |
| "grad_norm": 5.42258882522583, | |
| "learning_rate": 7.846153846153847e-05, | |
| "loss": 0.1271, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.2173076923076923, | |
| "eval_loss": 0.040466830134391785, | |
| "eval_runtime": 181.9963, | |
| "eval_samples_per_second": 253.417, | |
| "eval_steps_per_second": 1.984, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.21923076923076923, | |
| "grad_norm": 2.1966440677642822, | |
| "learning_rate": 7.826923076923077e-05, | |
| "loss": 0.0392, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.21923076923076923, | |
| "eval_loss": 0.03886817768216133, | |
| "eval_runtime": 182.0201, | |
| "eval_samples_per_second": 253.384, | |
| "eval_steps_per_second": 1.983, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.22115384615384615, | |
| "grad_norm": 1.9059343338012695, | |
| "learning_rate": 7.807692307692307e-05, | |
| "loss": 0.026, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.22115384615384615, | |
| "eval_loss": 0.034843236207962036, | |
| "eval_runtime": 182.0596, | |
| "eval_samples_per_second": 253.329, | |
| "eval_steps_per_second": 1.983, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.2230769230769231, | |
| "grad_norm": 3.3110170364379883, | |
| "learning_rate": 7.788461538461539e-05, | |
| "loss": 0.0732, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.2230769230769231, | |
| "eval_loss": 0.03086087852716446, | |
| "eval_runtime": 181.1715, | |
| "eval_samples_per_second": 254.571, | |
| "eval_steps_per_second": 1.993, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.225, | |
| "grad_norm": 0.64942467212677, | |
| "learning_rate": 7.76923076923077e-05, | |
| "loss": 0.0175, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.225, | |
| "eval_loss": 0.028490141034126282, | |
| "eval_runtime": 181.1702, | |
| "eval_samples_per_second": 254.573, | |
| "eval_steps_per_second": 1.993, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.22692307692307692, | |
| "grad_norm": 1.8217730522155762, | |
| "learning_rate": 7.75e-05, | |
| "loss": 0.0407, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.22692307692307692, | |
| "eval_loss": 0.026924345642328262, | |
| "eval_runtime": 181.6397, | |
| "eval_samples_per_second": 253.915, | |
| "eval_steps_per_second": 1.987, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.22884615384615384, | |
| "grad_norm": 5.165238380432129, | |
| "learning_rate": 7.730769230769232e-05, | |
| "loss": 0.1066, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.22884615384615384, | |
| "eval_loss": 0.026207981631159782, | |
| "eval_runtime": 181.6267, | |
| "eval_samples_per_second": 253.933, | |
| "eval_steps_per_second": 1.988, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.23076923076923078, | |
| "grad_norm": 4.6868977546691895, | |
| "learning_rate": 7.711538461538462e-05, | |
| "loss": 0.0843, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.23076923076923078, | |
| "eval_loss": 0.02519201673567295, | |
| "eval_runtime": 181.6069, | |
| "eval_samples_per_second": 253.961, | |
| "eval_steps_per_second": 1.988, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.2326923076923077, | |
| "grad_norm": 3.41184139251709, | |
| "learning_rate": 7.692307692307693e-05, | |
| "loss": 0.0988, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.2326923076923077, | |
| "eval_loss": 0.023151233792304993, | |
| "eval_runtime": 181.4734, | |
| "eval_samples_per_second": 254.147, | |
| "eval_steps_per_second": 1.989, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.23461538461538461, | |
| "grad_norm": 4.652563095092773, | |
| "learning_rate": 7.673076923076924e-05, | |
| "loss": 0.1316, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.23461538461538461, | |
| "eval_loss": 0.02223026566207409, | |
| "eval_runtime": 181.4249, | |
| "eval_samples_per_second": 254.215, | |
| "eval_steps_per_second": 1.99, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.23653846153846153, | |
| "grad_norm": 3.011662721633911, | |
| "learning_rate": 7.653846153846153e-05, | |
| "loss": 0.0805, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.23653846153846153, | |
| "eval_loss": 0.0226932130753994, | |
| "eval_runtime": 182.1195, | |
| "eval_samples_per_second": 253.246, | |
| "eval_steps_per_second": 1.982, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.23846153846153847, | |
| "grad_norm": 1.1616426706314087, | |
| "learning_rate": 7.634615384615385e-05, | |
| "loss": 0.0182, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.23846153846153847, | |
| "eval_loss": 0.024565137922763824, | |
| "eval_runtime": 182.4427, | |
| "eval_samples_per_second": 252.797, | |
| "eval_steps_per_second": 1.979, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.2403846153846154, | |
| "grad_norm": 0.7270947098731995, | |
| "learning_rate": 7.615384615384616e-05, | |
| "loss": 0.0133, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.2403846153846154, | |
| "eval_loss": 0.026539519429206848, | |
| "eval_runtime": 183.1212, | |
| "eval_samples_per_second": 251.861, | |
| "eval_steps_per_second": 1.971, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.2423076923076923, | |
| "grad_norm": 1.8908319473266602, | |
| "learning_rate": 7.596153846153846e-05, | |
| "loss": 0.0293, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.2423076923076923, | |
| "eval_loss": 0.028904786333441734, | |
| "eval_runtime": 181.1513, | |
| "eval_samples_per_second": 254.599, | |
| "eval_steps_per_second": 1.993, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.24423076923076922, | |
| "grad_norm": 1.1141525506973267, | |
| "learning_rate": 7.576923076923076e-05, | |
| "loss": 0.0208, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.24423076923076922, | |
| "eval_loss": 0.030000800266861916, | |
| "eval_runtime": 181.2927, | |
| "eval_samples_per_second": 254.401, | |
| "eval_steps_per_second": 1.991, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.24615384615384617, | |
| "grad_norm": 1.4708863496780396, | |
| "learning_rate": 7.557692307692308e-05, | |
| "loss": 0.033, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.24615384615384617, | |
| "eval_loss": 0.03249819576740265, | |
| "eval_runtime": 181.507, | |
| "eval_samples_per_second": 254.1, | |
| "eval_steps_per_second": 1.989, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.24807692307692308, | |
| "grad_norm": 3.061389684677124, | |
| "learning_rate": 7.538461538461539e-05, | |
| "loss": 0.0634, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.24807692307692308, | |
| "eval_loss": 0.03363075479865074, | |
| "eval_runtime": 182.3989, | |
| "eval_samples_per_second": 252.858, | |
| "eval_steps_per_second": 1.979, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 2.663599729537964, | |
| "learning_rate": 7.519230769230769e-05, | |
| "loss": 0.0414, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "eval_loss": 0.03339559584856033, | |
| "eval_runtime": 182.3749, | |
| "eval_samples_per_second": 252.891, | |
| "eval_steps_per_second": 1.979, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.2519230769230769, | |
| "grad_norm": 0.3709014058113098, | |
| "learning_rate": 7.500000000000001e-05, | |
| "loss": 0.0081, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.2519230769230769, | |
| "eval_loss": 0.03324908763170242, | |
| "eval_runtime": 181.6302, | |
| "eval_samples_per_second": 253.928, | |
| "eval_steps_per_second": 1.988, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.25384615384615383, | |
| "grad_norm": 2.106776475906372, | |
| "learning_rate": 7.480769230769231e-05, | |
| "loss": 0.0534, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.25384615384615383, | |
| "eval_loss": 0.03283924236893654, | |
| "eval_runtime": 182.1872, | |
| "eval_samples_per_second": 253.152, | |
| "eval_steps_per_second": 1.981, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.25576923076923075, | |
| "grad_norm": 3.1101605892181396, | |
| "learning_rate": 7.461538461538462e-05, | |
| "loss": 0.1828, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.25576923076923075, | |
| "eval_loss": 0.03169732913374901, | |
| "eval_runtime": 181.8198, | |
| "eval_samples_per_second": 253.663, | |
| "eval_steps_per_second": 1.985, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.25769230769230766, | |
| "grad_norm": 3.3294451236724854, | |
| "learning_rate": 7.442307692307693e-05, | |
| "loss": 0.0441, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.25769230769230766, | |
| "eval_loss": 0.031509336084127426, | |
| "eval_runtime": 181.3532, | |
| "eval_samples_per_second": 254.316, | |
| "eval_steps_per_second": 1.991, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.25961538461538464, | |
| "grad_norm": 0.6319352984428406, | |
| "learning_rate": 7.423076923076924e-05, | |
| "loss": 0.0126, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.25961538461538464, | |
| "eval_loss": 0.03166291490197182, | |
| "eval_runtime": 180.5095, | |
| "eval_samples_per_second": 255.505, | |
| "eval_steps_per_second": 2.0, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.26153846153846155, | |
| "grad_norm": 3.598060131072998, | |
| "learning_rate": 7.403846153846154e-05, | |
| "loss": 0.0637, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.26153846153846155, | |
| "eval_loss": 0.032380297780036926, | |
| "eval_runtime": 180.7907, | |
| "eval_samples_per_second": 255.107, | |
| "eval_steps_per_second": 1.997, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.26346153846153847, | |
| "grad_norm": 2.8882503509521484, | |
| "learning_rate": 7.384615384615386e-05, | |
| "loss": 0.0655, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.26346153846153847, | |
| "eval_loss": 0.03351568803191185, | |
| "eval_runtime": 181.8359, | |
| "eval_samples_per_second": 253.641, | |
| "eval_steps_per_second": 1.985, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.2653846153846154, | |
| "grad_norm": 1.519407868385315, | |
| "learning_rate": 7.365384615384616e-05, | |
| "loss": 0.0199, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.2653846153846154, | |
| "eval_loss": 0.03430071100592613, | |
| "eval_runtime": 181.9857, | |
| "eval_samples_per_second": 253.432, | |
| "eval_steps_per_second": 1.984, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.2673076923076923, | |
| "grad_norm": 0.8318182229995728, | |
| "learning_rate": 7.346153846153847e-05, | |
| "loss": 0.0158, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.2673076923076923, | |
| "eval_loss": 0.03548838198184967, | |
| "eval_runtime": 180.9902, | |
| "eval_samples_per_second": 254.826, | |
| "eval_steps_per_second": 1.995, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.2692307692307692, | |
| "grad_norm": 2.658010721206665, | |
| "learning_rate": 7.326923076923077e-05, | |
| "loss": 0.0582, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.2692307692307692, | |
| "eval_loss": 0.03817412257194519, | |
| "eval_runtime": 181.1846, | |
| "eval_samples_per_second": 254.553, | |
| "eval_steps_per_second": 1.992, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.27115384615384613, | |
| "grad_norm": 3.6118085384368896, | |
| "learning_rate": 7.307692307692307e-05, | |
| "loss": 0.0506, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.27115384615384613, | |
| "eval_loss": 0.03875559940934181, | |
| "eval_runtime": 182.2632, | |
| "eval_samples_per_second": 253.046, | |
| "eval_steps_per_second": 1.981, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.27307692307692305, | |
| "grad_norm": 1.4768801927566528, | |
| "learning_rate": 7.288461538461538e-05, | |
| "loss": 0.0823, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.27307692307692305, | |
| "eval_loss": 0.03805309161543846, | |
| "eval_runtime": 181.8144, | |
| "eval_samples_per_second": 253.671, | |
| "eval_steps_per_second": 1.986, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.275, | |
| "grad_norm": 4.359960079193115, | |
| "learning_rate": 7.26923076923077e-05, | |
| "loss": 0.0601, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.275, | |
| "eval_loss": 0.03436999395489693, | |
| "eval_runtime": 182.7585, | |
| "eval_samples_per_second": 252.36, | |
| "eval_steps_per_second": 1.975, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.27692307692307694, | |
| "grad_norm": 3.49018931388855, | |
| "learning_rate": 7.25e-05, | |
| "loss": 0.0697, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.27692307692307694, | |
| "eval_loss": 0.029680771753191948, | |
| "eval_runtime": 182.5284, | |
| "eval_samples_per_second": 252.678, | |
| "eval_steps_per_second": 1.978, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.27884615384615385, | |
| "grad_norm": 1.6799707412719727, | |
| "learning_rate": 7.23076923076923e-05, | |
| "loss": 0.047, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.27884615384615385, | |
| "eval_loss": 0.027619585394859314, | |
| "eval_runtime": 181.8266, | |
| "eval_samples_per_second": 253.654, | |
| "eval_steps_per_second": 1.985, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.28076923076923077, | |
| "grad_norm": 1.9224464893341064, | |
| "learning_rate": 7.211538461538462e-05, | |
| "loss": 0.0265, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.28076923076923077, | |
| "eval_loss": 0.025914136320352554, | |
| "eval_runtime": 181.3461, | |
| "eval_samples_per_second": 254.326, | |
| "eval_steps_per_second": 1.991, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.2826923076923077, | |
| "grad_norm": 1.8353599309921265, | |
| "learning_rate": 7.192307692307693e-05, | |
| "loss": 0.0265, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.2826923076923077, | |
| "eval_loss": 0.024255190044641495, | |
| "eval_runtime": 180.3297, | |
| "eval_samples_per_second": 255.759, | |
| "eval_steps_per_second": 2.002, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.2846153846153846, | |
| "grad_norm": 1.9630978107452393, | |
| "learning_rate": 7.173076923076923e-05, | |
| "loss": 0.0514, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.2846153846153846, | |
| "eval_loss": 0.023349367082118988, | |
| "eval_runtime": 181.2689, | |
| "eval_samples_per_second": 254.434, | |
| "eval_steps_per_second": 1.992, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.2865384615384615, | |
| "grad_norm": 1.8774313926696777, | |
| "learning_rate": 7.153846153846155e-05, | |
| "loss": 0.0339, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.2865384615384615, | |
| "eval_loss": 0.022075794637203217, | |
| "eval_runtime": 182.4232, | |
| "eval_samples_per_second": 252.824, | |
| "eval_steps_per_second": 1.979, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.28846153846153844, | |
| "grad_norm": 0.6925719380378723, | |
| "learning_rate": 7.134615384615385e-05, | |
| "loss": 0.013, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.28846153846153844, | |
| "eval_loss": 0.021298719570040703, | |
| "eval_runtime": 182.5149, | |
| "eval_samples_per_second": 252.697, | |
| "eval_steps_per_second": 1.978, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.2903846153846154, | |
| "grad_norm": 1.8816715478897095, | |
| "learning_rate": 7.115384615384616e-05, | |
| "loss": 0.0645, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.2903846153846154, | |
| "eval_loss": 0.021134961396455765, | |
| "eval_runtime": 182.282, | |
| "eval_samples_per_second": 253.02, | |
| "eval_steps_per_second": 1.98, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.2923076923076923, | |
| "grad_norm": 0.7385954260826111, | |
| "learning_rate": 7.096153846153847e-05, | |
| "loss": 0.0175, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.2923076923076923, | |
| "eval_loss": 0.020952800288796425, | |
| "eval_runtime": 181.8302, | |
| "eval_samples_per_second": 253.649, | |
| "eval_steps_per_second": 1.985, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.29423076923076924, | |
| "grad_norm": 1.84195077419281, | |
| "learning_rate": 7.076923076923078e-05, | |
| "loss": 0.0512, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.29423076923076924, | |
| "eval_loss": 0.020642004907131195, | |
| "eval_runtime": 181.2698, | |
| "eval_samples_per_second": 254.433, | |
| "eval_steps_per_second": 1.992, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.29615384615384616, | |
| "grad_norm": 0.2837388813495636, | |
| "learning_rate": 7.057692307692308e-05, | |
| "loss": 0.0073, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.29615384615384616, | |
| "eval_loss": 0.020667677745223045, | |
| "eval_runtime": 181.7385, | |
| "eval_samples_per_second": 253.777, | |
| "eval_steps_per_second": 1.986, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.2980769230769231, | |
| "grad_norm": 2.8022420406341553, | |
| "learning_rate": 7.03846153846154e-05, | |
| "loss": 0.0436, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.2980769230769231, | |
| "eval_loss": 0.02192995697259903, | |
| "eval_runtime": 180.8781, | |
| "eval_samples_per_second": 254.984, | |
| "eval_steps_per_second": 1.996, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 1.9743694067001343, | |
| "learning_rate": 7.019230769230769e-05, | |
| "loss": 0.0238, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "eval_loss": 0.02315637096762657, | |
| "eval_runtime": 182.1362, | |
| "eval_samples_per_second": 253.223, | |
| "eval_steps_per_second": 1.982, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.3019230769230769, | |
| "grad_norm": 2.248816967010498, | |
| "learning_rate": 7e-05, | |
| "loss": 0.08, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.3019230769230769, | |
| "eval_loss": 0.02545558102428913, | |
| "eval_runtime": 181.4173, | |
| "eval_samples_per_second": 254.226, | |
| "eval_steps_per_second": 1.99, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.3038461538461538, | |
| "grad_norm": 2.220722198486328, | |
| "learning_rate": 6.980769230769231e-05, | |
| "loss": 0.0797, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.3038461538461538, | |
| "eval_loss": 0.02986188232898712, | |
| "eval_runtime": 180.9052, | |
| "eval_samples_per_second": 254.946, | |
| "eval_steps_per_second": 1.996, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.3057692307692308, | |
| "grad_norm": 3.1965675354003906, | |
| "learning_rate": 6.961538461538462e-05, | |
| "loss": 0.0801, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.3057692307692308, | |
| "eval_loss": 0.03733256086707115, | |
| "eval_runtime": 181.8686, | |
| "eval_samples_per_second": 253.595, | |
| "eval_steps_per_second": 1.985, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.3076923076923077, | |
| "grad_norm": 3.019501209259033, | |
| "learning_rate": 6.942307692307692e-05, | |
| "loss": 0.0599, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.3076923076923077, | |
| "eval_loss": 0.04430044814944267, | |
| "eval_runtime": 182.2711, | |
| "eval_samples_per_second": 253.035, | |
| "eval_steps_per_second": 1.981, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.3096153846153846, | |
| "grad_norm": 3.3681864738464355, | |
| "learning_rate": 6.923076923076924e-05, | |
| "loss": 0.0276, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.3096153846153846, | |
| "eval_loss": 0.04622860625386238, | |
| "eval_runtime": 182.9267, | |
| "eval_samples_per_second": 252.128, | |
| "eval_steps_per_second": 1.973, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.31153846153846154, | |
| "grad_norm": 1.2851427793502808, | |
| "learning_rate": 6.903846153846154e-05, | |
| "loss": 0.0426, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.31153846153846154, | |
| "eval_loss": 0.04630015045404434, | |
| "eval_runtime": 181.3811, | |
| "eval_samples_per_second": 254.277, | |
| "eval_steps_per_second": 1.99, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.31346153846153846, | |
| "grad_norm": 3.0502898693084717, | |
| "learning_rate": 6.884615384615385e-05, | |
| "loss": 0.045, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.31346153846153846, | |
| "eval_loss": 0.04202108457684517, | |
| "eval_runtime": 181.4928, | |
| "eval_samples_per_second": 254.12, | |
| "eval_steps_per_second": 1.989, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.3153846153846154, | |
| "grad_norm": 3.149498462677002, | |
| "learning_rate": 6.865384615384616e-05, | |
| "loss": 0.0708, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.3153846153846154, | |
| "eval_loss": 0.03506983816623688, | |
| "eval_runtime": 181.8089, | |
| "eval_samples_per_second": 253.678, | |
| "eval_steps_per_second": 1.986, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.3173076923076923, | |
| "grad_norm": 1.2693160772323608, | |
| "learning_rate": 6.846153846153847e-05, | |
| "loss": 0.0456, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.3173076923076923, | |
| "eval_loss": 0.028076015412807465, | |
| "eval_runtime": 182.1103, | |
| "eval_samples_per_second": 253.259, | |
| "eval_steps_per_second": 1.982, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.3192307692307692, | |
| "grad_norm": 1.5411460399627686, | |
| "learning_rate": 6.826923076923077e-05, | |
| "loss": 0.0261, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.3192307692307692, | |
| "eval_loss": 0.024974722415208817, | |
| "eval_runtime": 181.8849, | |
| "eval_samples_per_second": 253.572, | |
| "eval_steps_per_second": 1.985, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.3211538461538462, | |
| "grad_norm": 2.7267351150512695, | |
| "learning_rate": 6.807692307692309e-05, | |
| "loss": 0.0427, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.3211538461538462, | |
| "eval_loss": 0.022312704473733902, | |
| "eval_runtime": 181.8135, | |
| "eval_samples_per_second": 253.672, | |
| "eval_steps_per_second": 1.986, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.3230769230769231, | |
| "grad_norm": 1.551154613494873, | |
| "learning_rate": 6.788461538461539e-05, | |
| "loss": 0.038, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.3230769230769231, | |
| "eval_loss": 0.020363658666610718, | |
| "eval_runtime": 180.8456, | |
| "eval_samples_per_second": 255.03, | |
| "eval_steps_per_second": 1.996, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.325, | |
| "grad_norm": 1.9637490510940552, | |
| "learning_rate": 6.76923076923077e-05, | |
| "loss": 0.0275, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.325, | |
| "eval_loss": 0.019464140757918358, | |
| "eval_runtime": 181.1515, | |
| "eval_samples_per_second": 254.599, | |
| "eval_steps_per_second": 1.993, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.3269230769230769, | |
| "grad_norm": 0.9584169983863831, | |
| "learning_rate": 6.750000000000001e-05, | |
| "loss": 0.0351, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.3269230769230769, | |
| "eval_loss": 0.018735043704509735, | |
| "eval_runtime": 182.4595, | |
| "eval_samples_per_second": 252.774, | |
| "eval_steps_per_second": 1.979, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.32884615384615384, | |
| "grad_norm": 2.4790542125701904, | |
| "learning_rate": 6.730769230769232e-05, | |
| "loss": 0.0812, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.32884615384615384, | |
| "eval_loss": 0.01843821443617344, | |
| "eval_runtime": 184.1966, | |
| "eval_samples_per_second": 250.39, | |
| "eval_steps_per_second": 1.96, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.33076923076923076, | |
| "grad_norm": 1.239414095878601, | |
| "learning_rate": 6.711538461538461e-05, | |
| "loss": 0.0338, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.33076923076923076, | |
| "eval_loss": 0.018382636830210686, | |
| "eval_runtime": 182.3178, | |
| "eval_samples_per_second": 252.97, | |
| "eval_steps_per_second": 1.98, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.3326923076923077, | |
| "grad_norm": 2.3932952880859375, | |
| "learning_rate": 6.692307692307693e-05, | |
| "loss": 0.0716, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.3326923076923077, | |
| "eval_loss": 0.018522335216403008, | |
| "eval_runtime": 182.6222, | |
| "eval_samples_per_second": 252.549, | |
| "eval_steps_per_second": 1.977, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.3346153846153846, | |
| "grad_norm": 3.787052869796753, | |
| "learning_rate": 6.673076923076923e-05, | |
| "loss": 0.0869, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.3346153846153846, | |
| "eval_loss": 0.019300226122140884, | |
| "eval_runtime": 181.3346, | |
| "eval_samples_per_second": 254.342, | |
| "eval_steps_per_second": 1.991, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.33653846153846156, | |
| "grad_norm": 1.6532280445098877, | |
| "learning_rate": 6.653846153846153e-05, | |
| "loss": 0.0501, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.33653846153846156, | |
| "eval_loss": 0.021606482565402985, | |
| "eval_runtime": 182.1641, | |
| "eval_samples_per_second": 253.184, | |
| "eval_steps_per_second": 1.982, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.3384615384615385, | |
| "grad_norm": 5.240301132202148, | |
| "learning_rate": 6.634615384615385e-05, | |
| "loss": 0.0421, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.3384615384615385, | |
| "eval_loss": 0.02588193118572235, | |
| "eval_runtime": 182.0323, | |
| "eval_samples_per_second": 253.367, | |
| "eval_steps_per_second": 1.983, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.3403846153846154, | |
| "grad_norm": 2.0743260383605957, | |
| "learning_rate": 6.615384615384616e-05, | |
| "loss": 0.0653, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.3403846153846154, | |
| "eval_loss": 0.031672630459070206, | |
| "eval_runtime": 182.8966, | |
| "eval_samples_per_second": 252.17, | |
| "eval_steps_per_second": 1.974, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.3423076923076923, | |
| "grad_norm": 2.0226855278015137, | |
| "learning_rate": 6.596153846153846e-05, | |
| "loss": 0.0776, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.3423076923076923, | |
| "eval_loss": 0.03493071347475052, | |
| "eval_runtime": 179.3302, | |
| "eval_samples_per_second": 257.185, | |
| "eval_steps_per_second": 2.013, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.34423076923076923, | |
| "grad_norm": 1.797101378440857, | |
| "learning_rate": 6.576923076923078e-05, | |
| "loss": 0.0338, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.34423076923076923, | |
| "eval_loss": 0.03656415641307831, | |
| "eval_runtime": 178.9522, | |
| "eval_samples_per_second": 257.728, | |
| "eval_steps_per_second": 2.017, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.34615384615384615, | |
| "grad_norm": 3.3137032985687256, | |
| "learning_rate": 6.557692307692308e-05, | |
| "loss": 0.0469, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.34615384615384615, | |
| "eval_loss": 0.03430218622088432, | |
| "eval_runtime": 179.502, | |
| "eval_samples_per_second": 256.939, | |
| "eval_steps_per_second": 2.011, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.34807692307692306, | |
| "grad_norm": 0.8624777793884277, | |
| "learning_rate": 6.538461538461539e-05, | |
| "loss": 0.0218, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.34807692307692306, | |
| "eval_loss": 0.029889389872550964, | |
| "eval_runtime": 179.85, | |
| "eval_samples_per_second": 256.441, | |
| "eval_steps_per_second": 2.007, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 1.7499721050262451, | |
| "learning_rate": 6.519230769230769e-05, | |
| "loss": 0.0426, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "eval_loss": 0.02500992640852928, | |
| "eval_runtime": 179.0168, | |
| "eval_samples_per_second": 257.635, | |
| "eval_steps_per_second": 2.017, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.35192307692307695, | |
| "grad_norm": 0.598138689994812, | |
| "learning_rate": 6.500000000000001e-05, | |
| "loss": 0.0166, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.35192307692307695, | |
| "eval_loss": 0.021099161356687546, | |
| "eval_runtime": 179.3361, | |
| "eval_samples_per_second": 257.176, | |
| "eval_steps_per_second": 2.013, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.35384615384615387, | |
| "grad_norm": 1.0341280698776245, | |
| "learning_rate": 6.480769230769231e-05, | |
| "loss": 0.0245, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.35384615384615387, | |
| "eval_loss": 0.01857278123497963, | |
| "eval_runtime": 178.9082, | |
| "eval_samples_per_second": 257.791, | |
| "eval_steps_per_second": 2.018, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.3557692307692308, | |
| "grad_norm": 0.775191605091095, | |
| "learning_rate": 6.461538461538462e-05, | |
| "loss": 0.0173, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.3557692307692308, | |
| "eval_loss": 0.01758418418467045, | |
| "eval_runtime": 178.4482, | |
| "eval_samples_per_second": 258.456, | |
| "eval_steps_per_second": 2.023, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.3576923076923077, | |
| "grad_norm": 1.1633071899414062, | |
| "learning_rate": 6.442307692307693e-05, | |
| "loss": 0.0204, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.3576923076923077, | |
| "eval_loss": 0.017436116933822632, | |
| "eval_runtime": 177.889, | |
| "eval_samples_per_second": 259.268, | |
| "eval_steps_per_second": 2.029, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.3596153846153846, | |
| "grad_norm": 2.2689030170440674, | |
| "learning_rate": 6.423076923076924e-05, | |
| "loss": 0.0402, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.3596153846153846, | |
| "eval_loss": 0.017784688621759415, | |
| "eval_runtime": 179.0045, | |
| "eval_samples_per_second": 257.653, | |
| "eval_steps_per_second": 2.017, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.36153846153846153, | |
| "grad_norm": 1.899922490119934, | |
| "learning_rate": 6.403846153846154e-05, | |
| "loss": 0.0392, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.36153846153846153, | |
| "eval_loss": 0.018613914027810097, | |
| "eval_runtime": 178.4172, | |
| "eval_samples_per_second": 258.501, | |
| "eval_steps_per_second": 2.023, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.36346153846153845, | |
| "grad_norm": 1.4108256101608276, | |
| "learning_rate": 6.384615384615385e-05, | |
| "loss": 0.0481, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.36346153846153845, | |
| "eval_loss": 0.019706113263964653, | |
| "eval_runtime": 178.8412, | |
| "eval_samples_per_second": 257.888, | |
| "eval_steps_per_second": 2.019, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.36538461538461536, | |
| "grad_norm": 2.3156585693359375, | |
| "learning_rate": 6.365384615384615e-05, | |
| "loss": 0.0475, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.36538461538461536, | |
| "eval_loss": 0.02033383585512638, | |
| "eval_runtime": 179.2749, | |
| "eval_samples_per_second": 257.264, | |
| "eval_steps_per_second": 2.014, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.36730769230769234, | |
| "grad_norm": 1.200846552848816, | |
| "learning_rate": 6.346153846153847e-05, | |
| "loss": 0.0364, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.36730769230769234, | |
| "eval_loss": 0.02068951539695263, | |
| "eval_runtime": 179.023, | |
| "eval_samples_per_second": 257.626, | |
| "eval_steps_per_second": 2.017, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.36923076923076925, | |
| "grad_norm": 0.5756196975708008, | |
| "learning_rate": 6.326923076923077e-05, | |
| "loss": 0.0139, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.36923076923076925, | |
| "eval_loss": 0.020952697843313217, | |
| "eval_runtime": 178.3352, | |
| "eval_samples_per_second": 258.62, | |
| "eval_steps_per_second": 2.024, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.37115384615384617, | |
| "grad_norm": 0.9504797458648682, | |
| "learning_rate": 6.307692307692308e-05, | |
| "loss": 0.012, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.37115384615384617, | |
| "eval_loss": 0.021903619170188904, | |
| "eval_runtime": 179.2178, | |
| "eval_samples_per_second": 257.346, | |
| "eval_steps_per_second": 2.014, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.3730769230769231, | |
| "grad_norm": 0.8241443634033203, | |
| "learning_rate": 6.288461538461538e-05, | |
| "loss": 0.0095, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.3730769230769231, | |
| "eval_loss": 0.023630516603589058, | |
| "eval_runtime": 177.9821, | |
| "eval_samples_per_second": 259.133, | |
| "eval_steps_per_second": 2.028, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.375, | |
| "grad_norm": 3.045523166656494, | |
| "learning_rate": 6.26923076923077e-05, | |
| "loss": 0.0798, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.375, | |
| "eval_loss": 0.02465650625526905, | |
| "eval_runtime": 179.008, | |
| "eval_samples_per_second": 257.648, | |
| "eval_steps_per_second": 2.017, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.3769230769230769, | |
| "grad_norm": 3.0229413509368896, | |
| "learning_rate": 6.25e-05, | |
| "loss": 0.0548, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.3769230769230769, | |
| "eval_loss": 0.02465982548892498, | |
| "eval_runtime": 179.1942, | |
| "eval_samples_per_second": 257.38, | |
| "eval_steps_per_second": 2.015, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.37884615384615383, | |
| "grad_norm": 1.4179023504257202, | |
| "learning_rate": 6.23076923076923e-05, | |
| "loss": 0.0456, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.37884615384615383, | |
| "eval_loss": 0.02369517832994461, | |
| "eval_runtime": 179.1101, | |
| "eval_samples_per_second": 257.501, | |
| "eval_steps_per_second": 2.016, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.38076923076923075, | |
| "grad_norm": 1.207181692123413, | |
| "learning_rate": 6.211538461538462e-05, | |
| "loss": 0.0167, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.38076923076923075, | |
| "eval_loss": 0.02231222204864025, | |
| "eval_runtime": 178.044, | |
| "eval_samples_per_second": 259.043, | |
| "eval_steps_per_second": 2.028, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.38269230769230766, | |
| "grad_norm": 0.838901937007904, | |
| "learning_rate": 6.192307692307693e-05, | |
| "loss": 0.0182, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.38269230769230766, | |
| "eval_loss": 0.021893635392189026, | |
| "eval_runtime": 178.3295, | |
| "eval_samples_per_second": 258.628, | |
| "eval_steps_per_second": 2.024, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.38461538461538464, | |
| "grad_norm": 0.957854151725769, | |
| "learning_rate": 6.173076923076923e-05, | |
| "loss": 0.0156, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.38461538461538464, | |
| "eval_loss": 0.021150017157197, | |
| "eval_runtime": 178.9639, | |
| "eval_samples_per_second": 257.711, | |
| "eval_steps_per_second": 2.017, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.38653846153846155, | |
| "grad_norm": 4.174772262573242, | |
| "learning_rate": 6.153846153846155e-05, | |
| "loss": 0.0678, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.38653846153846155, | |
| "eval_loss": 0.02083815075457096, | |
| "eval_runtime": 179.5827, | |
| "eval_samples_per_second": 256.823, | |
| "eval_steps_per_second": 2.01, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.38846153846153847, | |
| "grad_norm": 1.1510279178619385, | |
| "learning_rate": 6.134615384615385e-05, | |
| "loss": 0.0575, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.38846153846153847, | |
| "eval_loss": 0.020570380613207817, | |
| "eval_runtime": 180.2019, | |
| "eval_samples_per_second": 255.941, | |
| "eval_steps_per_second": 2.003, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.3903846153846154, | |
| "grad_norm": 1.705539345741272, | |
| "learning_rate": 6.115384615384616e-05, | |
| "loss": 0.0345, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.3903846153846154, | |
| "eval_loss": 0.020152855664491653, | |
| "eval_runtime": 179.2612, | |
| "eval_samples_per_second": 257.284, | |
| "eval_steps_per_second": 2.014, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.3923076923076923, | |
| "grad_norm": 3.794814348220825, | |
| "learning_rate": 6.096153846153847e-05, | |
| "loss": 0.1773, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.3923076923076923, | |
| "eval_loss": 0.019848283380270004, | |
| "eval_runtime": 178.9587, | |
| "eval_samples_per_second": 257.719, | |
| "eval_steps_per_second": 2.017, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.3942307692307692, | |
| "grad_norm": 2.470301866531372, | |
| "learning_rate": 6.0769230769230765e-05, | |
| "loss": 0.0211, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.3942307692307692, | |
| "eval_loss": 0.019784526899456978, | |
| "eval_runtime": 179.1875, | |
| "eval_samples_per_second": 257.39, | |
| "eval_steps_per_second": 2.015, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.39615384615384613, | |
| "grad_norm": 3.789724349975586, | |
| "learning_rate": 6.0576923076923076e-05, | |
| "loss": 0.0377, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.39615384615384613, | |
| "eval_loss": 0.02005017176270485, | |
| "eval_runtime": 178.7582, | |
| "eval_samples_per_second": 258.008, | |
| "eval_steps_per_second": 2.019, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.39807692307692305, | |
| "grad_norm": 2.2620668411254883, | |
| "learning_rate": 6.038461538461539e-05, | |
| "loss": 0.0709, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.39807692307692305, | |
| "eval_loss": 0.020129531621932983, | |
| "eval_runtime": 179.2997, | |
| "eval_samples_per_second": 257.228, | |
| "eval_steps_per_second": 2.013, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 2.889387607574463, | |
| "learning_rate": 6.019230769230769e-05, | |
| "loss": 0.0436, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "eval_loss": 0.02092353254556656, | |
| "eval_runtime": 178.7779, | |
| "eval_samples_per_second": 257.979, | |
| "eval_steps_per_second": 2.019, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.40192307692307694, | |
| "grad_norm": 1.8757256269454956, | |
| "learning_rate": 6e-05, | |
| "loss": 0.0242, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.40192307692307694, | |
| "eval_loss": 0.022796517238020897, | |
| "eval_runtime": 179.275, | |
| "eval_samples_per_second": 257.264, | |
| "eval_steps_per_second": 2.014, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.40384615384615385, | |
| "grad_norm": 0.5746337175369263, | |
| "learning_rate": 5.980769230769231e-05, | |
| "loss": 0.0093, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.40384615384615385, | |
| "eval_loss": 0.026022404432296753, | |
| "eval_runtime": 179.3393, | |
| "eval_samples_per_second": 257.172, | |
| "eval_steps_per_second": 2.013, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.40576923076923077, | |
| "grad_norm": 0.6468233466148376, | |
| "learning_rate": 5.9615384615384616e-05, | |
| "loss": 0.0291, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.40576923076923077, | |
| "eval_loss": 0.028801048174500465, | |
| "eval_runtime": 180.0726, | |
| "eval_samples_per_second": 256.124, | |
| "eval_steps_per_second": 2.005, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.4076923076923077, | |
| "grad_norm": 3.5019261837005615, | |
| "learning_rate": 5.942307692307693e-05, | |
| "loss": 0.0955, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.4076923076923077, | |
| "eval_loss": 0.03047165460884571, | |
| "eval_runtime": 179.3877, | |
| "eval_samples_per_second": 257.102, | |
| "eval_steps_per_second": 2.012, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.4096153846153846, | |
| "grad_norm": 2.398637056350708, | |
| "learning_rate": 5.923076923076923e-05, | |
| "loss": 0.0332, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.4096153846153846, | |
| "eval_loss": 0.03037206083536148, | |
| "eval_runtime": 179.177, | |
| "eval_samples_per_second": 257.405, | |
| "eval_steps_per_second": 2.015, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.4115384615384615, | |
| "grad_norm": 0.37438809871673584, | |
| "learning_rate": 5.903846153846154e-05, | |
| "loss": 0.0086, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.4115384615384615, | |
| "eval_loss": 0.02970583364367485, | |
| "eval_runtime": 180.2418, | |
| "eval_samples_per_second": 255.884, | |
| "eval_steps_per_second": 2.003, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.41346153846153844, | |
| "grad_norm": 3.2650182247161865, | |
| "learning_rate": 5.884615384615385e-05, | |
| "loss": 0.0226, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.41346153846153844, | |
| "eval_loss": 0.027145324274897575, | |
| "eval_runtime": 179.8572, | |
| "eval_samples_per_second": 256.431, | |
| "eval_steps_per_second": 2.007, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.4153846153846154, | |
| "grad_norm": 3.719679832458496, | |
| "learning_rate": 5.865384615384616e-05, | |
| "loss": 0.088, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.4153846153846154, | |
| "eval_loss": 0.02398001216351986, | |
| "eval_runtime": 179.4786, | |
| "eval_samples_per_second": 256.972, | |
| "eval_steps_per_second": 2.011, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.4173076923076923, | |
| "grad_norm": 3.390564441680908, | |
| "learning_rate": 5.846153846153847e-05, | |
| "loss": 0.073, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.4173076923076923, | |
| "eval_loss": 0.022082313895225525, | |
| "eval_runtime": 178.9292, | |
| "eval_samples_per_second": 257.761, | |
| "eval_steps_per_second": 2.018, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.41923076923076924, | |
| "grad_norm": 0.4555812180042267, | |
| "learning_rate": 5.826923076923078e-05, | |
| "loss": 0.0085, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.41923076923076924, | |
| "eval_loss": 0.020960917696356773, | |
| "eval_runtime": 179.7744, | |
| "eval_samples_per_second": 256.549, | |
| "eval_steps_per_second": 2.008, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.42115384615384616, | |
| "grad_norm": 1.5771949291229248, | |
| "learning_rate": 5.807692307692308e-05, | |
| "loss": 0.0355, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.42115384615384616, | |
| "eval_loss": 0.020832329988479614, | |
| "eval_runtime": 179.0768, | |
| "eval_samples_per_second": 257.549, | |
| "eval_steps_per_second": 2.016, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.4230769230769231, | |
| "grad_norm": 1.863141417503357, | |
| "learning_rate": 5.7884615384615394e-05, | |
| "loss": 0.0319, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.4230769230769231, | |
| "eval_loss": 0.020400503650307655, | |
| "eval_runtime": 179.8136, | |
| "eval_samples_per_second": 256.493, | |
| "eval_steps_per_second": 2.008, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.425, | |
| "grad_norm": 2.5015952587127686, | |
| "learning_rate": 5.769230769230769e-05, | |
| "loss": 0.0297, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.425, | |
| "eval_loss": 0.019669881090521812, | |
| "eval_runtime": 179.629, | |
| "eval_samples_per_second": 256.757, | |
| "eval_steps_per_second": 2.01, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.4269230769230769, | |
| "grad_norm": 2.183319568634033, | |
| "learning_rate": 5.7499999999999995e-05, | |
| "loss": 0.041, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.4269230769230769, | |
| "eval_loss": 0.018956847488880157, | |
| "eval_runtime": 179.7634, | |
| "eval_samples_per_second": 256.565, | |
| "eval_steps_per_second": 2.008, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.4288461538461538, | |
| "grad_norm": 0.6641192436218262, | |
| "learning_rate": 5.7307692307692306e-05, | |
| "loss": 0.0103, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.4288461538461538, | |
| "eval_loss": 0.018505413085222244, | |
| "eval_runtime": 178.939, | |
| "eval_samples_per_second": 257.747, | |
| "eval_steps_per_second": 2.017, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.4307692307692308, | |
| "grad_norm": 2.056015968322754, | |
| "learning_rate": 5.711538461538462e-05, | |
| "loss": 0.0509, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.4307692307692308, | |
| "eval_loss": 0.018195876851677895, | |
| "eval_runtime": 178.7256, | |
| "eval_samples_per_second": 258.055, | |
| "eval_steps_per_second": 2.02, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.4326923076923077, | |
| "grad_norm": 3.368030071258545, | |
| "learning_rate": 5.692307692307692e-05, | |
| "loss": 0.0265, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.4326923076923077, | |
| "eval_loss": 0.01840364933013916, | |
| "eval_runtime": 178.4536, | |
| "eval_samples_per_second": 258.448, | |
| "eval_steps_per_second": 2.023, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.4346153846153846, | |
| "grad_norm": 2.7104804515838623, | |
| "learning_rate": 5.673076923076923e-05, | |
| "loss": 0.0607, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.4346153846153846, | |
| "eval_loss": 0.018871352076530457, | |
| "eval_runtime": 179.4064, | |
| "eval_samples_per_second": 257.076, | |
| "eval_steps_per_second": 2.012, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.43653846153846154, | |
| "grad_norm": 1.095210313796997, | |
| "learning_rate": 5.653846153846154e-05, | |
| "loss": 0.0148, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.43653846153846154, | |
| "eval_loss": 0.019451051950454712, | |
| "eval_runtime": 178.892, | |
| "eval_samples_per_second": 257.815, | |
| "eval_steps_per_second": 2.018, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.43846153846153846, | |
| "grad_norm": 1.179794192314148, | |
| "learning_rate": 5.6346153846153846e-05, | |
| "loss": 0.0146, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.43846153846153846, | |
| "eval_loss": 0.01976831443607807, | |
| "eval_runtime": 179.5309, | |
| "eval_samples_per_second": 256.897, | |
| "eval_steps_per_second": 2.011, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.4403846153846154, | |
| "grad_norm": 1.9117586612701416, | |
| "learning_rate": 5.615384615384616e-05, | |
| "loss": 0.029, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.4403846153846154, | |
| "eval_loss": 0.020064150914549828, | |
| "eval_runtime": 179.5303, | |
| "eval_samples_per_second": 256.898, | |
| "eval_steps_per_second": 2.011, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.4423076923076923, | |
| "grad_norm": 2.375997304916382, | |
| "learning_rate": 5.596153846153847e-05, | |
| "loss": 0.0324, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.4423076923076923, | |
| "eval_loss": 0.02067544311285019, | |
| "eval_runtime": 180.6883, | |
| "eval_samples_per_second": 255.252, | |
| "eval_steps_per_second": 1.998, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.4442307692307692, | |
| "grad_norm": 2.2388057708740234, | |
| "learning_rate": 5.576923076923077e-05, | |
| "loss": 0.0335, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.4442307692307692, | |
| "eval_loss": 0.021712226793169975, | |
| "eval_runtime": 178.3336, | |
| "eval_samples_per_second": 258.622, | |
| "eval_steps_per_second": 2.024, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.4461538461538462, | |
| "grad_norm": 2.2240569591522217, | |
| "learning_rate": 5.557692307692308e-05, | |
| "loss": 0.0512, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.4461538461538462, | |
| "eval_loss": 0.022264475002884865, | |
| "eval_runtime": 183.6952, | |
| "eval_samples_per_second": 251.074, | |
| "eval_steps_per_second": 1.965, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.4480769230769231, | |
| "grad_norm": 0.18751874566078186, | |
| "learning_rate": 5.538461538461539e-05, | |
| "loss": 0.006, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.4480769230769231, | |
| "eval_loss": 0.022908175364136696, | |
| "eval_runtime": 180.2674, | |
| "eval_samples_per_second": 255.848, | |
| "eval_steps_per_second": 2.003, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 1.9546294212341309, | |
| "learning_rate": 5.51923076923077e-05, | |
| "loss": 0.02, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "eval_loss": 0.023024283349514008, | |
| "eval_runtime": 178.9188, | |
| "eval_samples_per_second": 257.776, | |
| "eval_steps_per_second": 2.018, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.4519230769230769, | |
| "grad_norm": 2.1330456733703613, | |
| "learning_rate": 5.500000000000001e-05, | |
| "loss": 0.0322, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.4519230769230769, | |
| "eval_loss": 0.023195333778858185, | |
| "eval_runtime": 179.3369, | |
| "eval_samples_per_second": 257.175, | |
| "eval_steps_per_second": 2.013, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.45384615384615384, | |
| "grad_norm": 1.1413002014160156, | |
| "learning_rate": 5.480769230769231e-05, | |
| "loss": 0.0535, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.45384615384615384, | |
| "eval_loss": 0.02287602610886097, | |
| "eval_runtime": 180.2062, | |
| "eval_samples_per_second": 255.935, | |
| "eval_steps_per_second": 2.003, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.45576923076923076, | |
| "grad_norm": 1.1129988431930542, | |
| "learning_rate": 5.461538461538461e-05, | |
| "loss": 0.0301, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.45576923076923076, | |
| "eval_loss": 0.022405732423067093, | |
| "eval_runtime": 178.7035, | |
| "eval_samples_per_second": 258.087, | |
| "eval_steps_per_second": 2.02, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.4576923076923077, | |
| "grad_norm": 0.41665732860565186, | |
| "learning_rate": 5.442307692307692e-05, | |
| "loss": 0.0073, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.4576923076923077, | |
| "eval_loss": 0.022149918600916862, | |
| "eval_runtime": 177.8064, | |
| "eval_samples_per_second": 259.389, | |
| "eval_steps_per_second": 2.03, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.4596153846153846, | |
| "grad_norm": 1.4682824611663818, | |
| "learning_rate": 5.423076923076923e-05, | |
| "loss": 0.0206, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.4596153846153846, | |
| "eval_loss": 0.02222280018031597, | |
| "eval_runtime": 178.3278, | |
| "eval_samples_per_second": 258.63, | |
| "eval_steps_per_second": 2.024, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.46153846153846156, | |
| "grad_norm": 0.2500247657299042, | |
| "learning_rate": 5.4038461538461536e-05, | |
| "loss": 0.0058, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.46153846153846156, | |
| "eval_loss": 0.022345269098877907, | |
| "eval_runtime": 178.187, | |
| "eval_samples_per_second": 258.835, | |
| "eval_steps_per_second": 2.026, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.4634615384615385, | |
| "grad_norm": 1.4518128633499146, | |
| "learning_rate": 5.384615384615385e-05, | |
| "loss": 0.0333, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.4634615384615385, | |
| "eval_loss": 0.022659137845039368, | |
| "eval_runtime": 178.3156, | |
| "eval_samples_per_second": 258.648, | |
| "eval_steps_per_second": 2.025, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.4653846153846154, | |
| "grad_norm": 0.997244119644165, | |
| "learning_rate": 5.365384615384616e-05, | |
| "loss": 0.0407, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.4653846153846154, | |
| "eval_loss": 0.02270101197063923, | |
| "eval_runtime": 178.6996, | |
| "eval_samples_per_second": 258.092, | |
| "eval_steps_per_second": 2.02, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.4673076923076923, | |
| "grad_norm": 2.6662564277648926, | |
| "learning_rate": 5.346153846153846e-05, | |
| "loss": 0.0426, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.4673076923076923, | |
| "eval_loss": 0.02220647782087326, | |
| "eval_runtime": 179.6128, | |
| "eval_samples_per_second": 256.78, | |
| "eval_steps_per_second": 2.01, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.46923076923076923, | |
| "grad_norm": 0.8665458559989929, | |
| "learning_rate": 5.326923076923077e-05, | |
| "loss": 0.0114, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.46923076923076923, | |
| "eval_loss": 0.02131798304617405, | |
| "eval_runtime": 179.1555, | |
| "eval_samples_per_second": 257.436, | |
| "eval_steps_per_second": 2.015, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.47115384615384615, | |
| "grad_norm": 1.1648316383361816, | |
| "learning_rate": 5.3076923076923076e-05, | |
| "loss": 0.0122, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.47115384615384615, | |
| "eval_loss": 0.02076887898147106, | |
| "eval_runtime": 179.2242, | |
| "eval_samples_per_second": 257.337, | |
| "eval_steps_per_second": 2.014, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.47307692307692306, | |
| "grad_norm": 1.3646942377090454, | |
| "learning_rate": 5.288461538461539e-05, | |
| "loss": 0.0202, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.47307692307692306, | |
| "eval_loss": 0.0202037263661623, | |
| "eval_runtime": 179.1515, | |
| "eval_samples_per_second": 257.441, | |
| "eval_steps_per_second": 2.015, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.475, | |
| "grad_norm": 2.266969919204712, | |
| "learning_rate": 5.26923076923077e-05, | |
| "loss": 0.0168, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.475, | |
| "eval_loss": 0.02008403278887272, | |
| "eval_runtime": 179.642, | |
| "eval_samples_per_second": 256.738, | |
| "eval_steps_per_second": 2.01, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.47692307692307695, | |
| "grad_norm": 1.709193229675293, | |
| "learning_rate": 5.25e-05, | |
| "loss": 0.0222, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.47692307692307695, | |
| "eval_loss": 0.02022576704621315, | |
| "eval_runtime": 179.6915, | |
| "eval_samples_per_second": 256.668, | |
| "eval_steps_per_second": 2.009, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.47884615384615387, | |
| "grad_norm": 0.7432993054389954, | |
| "learning_rate": 5.230769230769231e-05, | |
| "loss": 0.0091, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.47884615384615387, | |
| "eval_loss": 0.0207006074488163, | |
| "eval_runtime": 178.7806, | |
| "eval_samples_per_second": 257.975, | |
| "eval_steps_per_second": 2.019, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.4807692307692308, | |
| "grad_norm": 1.0147693157196045, | |
| "learning_rate": 5.2115384615384624e-05, | |
| "loss": 0.0336, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.4807692307692308, | |
| "eval_loss": 0.020908081904053688, | |
| "eval_runtime": 179.032, | |
| "eval_samples_per_second": 257.613, | |
| "eval_steps_per_second": 2.016, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.4826923076923077, | |
| "grad_norm": 0.11277324706315994, | |
| "learning_rate": 5.192307692307693e-05, | |
| "loss": 0.0049, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.4826923076923077, | |
| "eval_loss": 0.021289991214871407, | |
| "eval_runtime": 178.7696, | |
| "eval_samples_per_second": 257.991, | |
| "eval_steps_per_second": 2.019, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.4846153846153846, | |
| "grad_norm": 1.4250966310501099, | |
| "learning_rate": 5.173076923076924e-05, | |
| "loss": 0.0725, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.4846153846153846, | |
| "eval_loss": 0.02207464724779129, | |
| "eval_runtime": 178.046, | |
| "eval_samples_per_second": 259.04, | |
| "eval_steps_per_second": 2.028, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.48653846153846153, | |
| "grad_norm": 0.6005804538726807, | |
| "learning_rate": 5.1538461538461536e-05, | |
| "loss": 0.0069, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.48653846153846153, | |
| "eval_loss": 0.02242344059050083, | |
| "eval_runtime": 178.0571, | |
| "eval_samples_per_second": 259.024, | |
| "eval_steps_per_second": 2.027, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.48846153846153845, | |
| "grad_norm": 2.921394109725952, | |
| "learning_rate": 5.134615384615385e-05, | |
| "loss": 0.0435, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.48846153846153845, | |
| "eval_loss": 0.022368701174855232, | |
| "eval_runtime": 177.6433, | |
| "eval_samples_per_second": 259.627, | |
| "eval_steps_per_second": 2.032, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.49038461538461536, | |
| "grad_norm": 2.527122974395752, | |
| "learning_rate": 5.115384615384615e-05, | |
| "loss": 0.0227, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.49038461538461536, | |
| "eval_loss": 0.023167185485363007, | |
| "eval_runtime": 178.2025, | |
| "eval_samples_per_second": 258.812, | |
| "eval_steps_per_second": 2.026, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.49230769230769234, | |
| "grad_norm": 1.260136604309082, | |
| "learning_rate": 5.096153846153846e-05, | |
| "loss": 0.0641, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.49230769230769234, | |
| "eval_loss": 0.02456340193748474, | |
| "eval_runtime": 179.2064, | |
| "eval_samples_per_second": 257.362, | |
| "eval_steps_per_second": 2.014, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.49423076923076925, | |
| "grad_norm": 4.960824489593506, | |
| "learning_rate": 5.0769230769230766e-05, | |
| "loss": 0.1312, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.49423076923076925, | |
| "eval_loss": 0.026521550491452217, | |
| "eval_runtime": 177.6317, | |
| "eval_samples_per_second": 259.644, | |
| "eval_steps_per_second": 2.032, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.49615384615384617, | |
| "grad_norm": 2.000896692276001, | |
| "learning_rate": 5.057692307692308e-05, | |
| "loss": 0.022, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.49615384615384617, | |
| "eval_loss": 0.027118589729070663, | |
| "eval_runtime": 179.2158, | |
| "eval_samples_per_second": 257.349, | |
| "eval_steps_per_second": 2.014, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.4980769230769231, | |
| "grad_norm": 2.8646159172058105, | |
| "learning_rate": 5.038461538461539e-05, | |
| "loss": 0.0257, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.4980769230769231, | |
| "eval_loss": 0.026049936190247536, | |
| "eval_runtime": 179.4408, | |
| "eval_samples_per_second": 257.026, | |
| "eval_steps_per_second": 2.012, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.943261981010437, | |
| "learning_rate": 5.019230769230769e-05, | |
| "loss": 0.0233, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "eval_loss": 0.025271113961935043, | |
| "eval_runtime": 178.2347, | |
| "eval_samples_per_second": 258.766, | |
| "eval_steps_per_second": 2.025, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.5019230769230769, | |
| "grad_norm": 0.20943577587604523, | |
| "learning_rate": 5e-05, | |
| "loss": 0.0049, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.5019230769230769, | |
| "eval_loss": 0.024457741528749466, | |
| "eval_runtime": 178.8024, | |
| "eval_samples_per_second": 257.944, | |
| "eval_steps_per_second": 2.019, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.5038461538461538, | |
| "grad_norm": 1.5787253379821777, | |
| "learning_rate": 4.980769230769231e-05, | |
| "loss": 0.0253, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.5038461538461538, | |
| "eval_loss": 0.023463794961571693, | |
| "eval_runtime": 179.4617, | |
| "eval_samples_per_second": 256.996, | |
| "eval_steps_per_second": 2.012, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.5057692307692307, | |
| "grad_norm": 1.8370299339294434, | |
| "learning_rate": 4.961538461538462e-05, | |
| "loss": 0.123, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.5057692307692307, | |
| "eval_loss": 0.022153466939926147, | |
| "eval_runtime": 177.8706, | |
| "eval_samples_per_second": 259.295, | |
| "eval_steps_per_second": 2.03, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.5076923076923077, | |
| "grad_norm": 1.3905123472213745, | |
| "learning_rate": 4.942307692307693e-05, | |
| "loss": 0.0796, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.5076923076923077, | |
| "eval_loss": 0.02036619931459427, | |
| "eval_runtime": 179.1649, | |
| "eval_samples_per_second": 257.422, | |
| "eval_steps_per_second": 2.015, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.5096153846153846, | |
| "grad_norm": 0.5677681565284729, | |
| "learning_rate": 4.923076923076924e-05, | |
| "loss": 0.0374, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.5096153846153846, | |
| "eval_loss": 0.019072143360972404, | |
| "eval_runtime": 177.7811, | |
| "eval_samples_per_second": 259.426, | |
| "eval_steps_per_second": 2.031, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.5115384615384615, | |
| "grad_norm": 2.8789877891540527, | |
| "learning_rate": 4.9038461538461536e-05, | |
| "loss": 0.0607, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.5115384615384615, | |
| "eval_loss": 0.01831653155386448, | |
| "eval_runtime": 177.7329, | |
| "eval_samples_per_second": 259.496, | |
| "eval_steps_per_second": 2.031, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.5134615384615384, | |
| "grad_norm": 2.489546060562134, | |
| "learning_rate": 4.884615384615385e-05, | |
| "loss": 0.0265, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.5134615384615384, | |
| "eval_loss": 0.018156476318836212, | |
| "eval_runtime": 178.3037, | |
| "eval_samples_per_second": 258.665, | |
| "eval_steps_per_second": 2.025, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.5153846153846153, | |
| "grad_norm": 0.33813557028770447, | |
| "learning_rate": 4.865384615384616e-05, | |
| "loss": 0.0081, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.5153846153846153, | |
| "eval_loss": 0.018267083913087845, | |
| "eval_runtime": 178.0725, | |
| "eval_samples_per_second": 259.001, | |
| "eval_steps_per_second": 2.027, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.5173076923076924, | |
| "grad_norm": 1.1769758462905884, | |
| "learning_rate": 4.846153846153846e-05, | |
| "loss": 0.0109, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.5173076923076924, | |
| "eval_loss": 0.018560878932476044, | |
| "eval_runtime": 178.4098, | |
| "eval_samples_per_second": 258.512, | |
| "eval_steps_per_second": 2.023, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.5192307692307693, | |
| "grad_norm": 1.614059329032898, | |
| "learning_rate": 4.826923076923077e-05, | |
| "loss": 0.0297, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.5192307692307693, | |
| "eval_loss": 0.019179968163371086, | |
| "eval_runtime": 178.4615, | |
| "eval_samples_per_second": 258.437, | |
| "eval_steps_per_second": 2.023, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.5211538461538462, | |
| "grad_norm": 2.350944757461548, | |
| "learning_rate": 4.8076923076923084e-05, | |
| "loss": 0.0431, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.5211538461538462, | |
| "eval_loss": 0.019696904346346855, | |
| "eval_runtime": 179.3451, | |
| "eval_samples_per_second": 257.163, | |
| "eval_steps_per_second": 2.013, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.5230769230769231, | |
| "grad_norm": 1.677561640739441, | |
| "learning_rate": 4.788461538461539e-05, | |
| "loss": 0.0622, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.5230769230769231, | |
| "eval_loss": 0.019987458363175392, | |
| "eval_runtime": 178.1412, | |
| "eval_samples_per_second": 258.901, | |
| "eval_steps_per_second": 2.026, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.525, | |
| "grad_norm": 0.178351029753685, | |
| "learning_rate": 4.76923076923077e-05, | |
| "loss": 0.0043, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.525, | |
| "eval_loss": 0.02033291384577751, | |
| "eval_runtime": 178.408, | |
| "eval_samples_per_second": 258.514, | |
| "eval_steps_per_second": 2.023, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.5269230769230769, | |
| "grad_norm": 2.8552165031433105, | |
| "learning_rate": 4.75e-05, | |
| "loss": 0.0395, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.5269230769230769, | |
| "eval_loss": 0.020751679316163063, | |
| "eval_runtime": 178.8897, | |
| "eval_samples_per_second": 257.818, | |
| "eval_steps_per_second": 2.018, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.5288461538461539, | |
| "grad_norm": 1.308340311050415, | |
| "learning_rate": 4.730769230769231e-05, | |
| "loss": 0.0139, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.5288461538461539, | |
| "eval_loss": 0.02126193419098854, | |
| "eval_runtime": 179.0787, | |
| "eval_samples_per_second": 257.546, | |
| "eval_steps_per_second": 2.016, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.5307692307692308, | |
| "grad_norm": 2.3591983318328857, | |
| "learning_rate": 4.711538461538462e-05, | |
| "loss": 0.0831, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.5307692307692308, | |
| "eval_loss": 0.022048989310860634, | |
| "eval_runtime": 178.4149, | |
| "eval_samples_per_second": 258.504, | |
| "eval_steps_per_second": 2.023, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.5326923076923077, | |
| "grad_norm": 1.4477872848510742, | |
| "learning_rate": 4.692307692307693e-05, | |
| "loss": 0.077, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.5326923076923077, | |
| "eval_loss": 0.02285471186041832, | |
| "eval_runtime": 178.6672, | |
| "eval_samples_per_second": 258.139, | |
| "eval_steps_per_second": 2.021, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.5346153846153846, | |
| "grad_norm": 1.7352992296218872, | |
| "learning_rate": 4.673076923076923e-05, | |
| "loss": 0.0475, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.5346153846153846, | |
| "eval_loss": 0.023948505520820618, | |
| "eval_runtime": 182.3486, | |
| "eval_samples_per_second": 252.928, | |
| "eval_steps_per_second": 1.98, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.5365384615384615, | |
| "grad_norm": 0.7470586895942688, | |
| "learning_rate": 4.653846153846154e-05, | |
| "loss": 0.0404, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.5365384615384615, | |
| "eval_loss": 0.025173615664243698, | |
| "eval_runtime": 190.4732, | |
| "eval_samples_per_second": 242.139, | |
| "eval_steps_per_second": 1.895, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.5384615384615384, | |
| "grad_norm": 1.587988257408142, | |
| "learning_rate": 4.634615384615385e-05, | |
| "loss": 0.0825, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.5384615384615384, | |
| "eval_loss": 0.026492061093449593, | |
| "eval_runtime": 179.4359, | |
| "eval_samples_per_second": 257.033, | |
| "eval_steps_per_second": 2.012, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.5403846153846154, | |
| "grad_norm": 1.2091609239578247, | |
| "learning_rate": 4.615384615384616e-05, | |
| "loss": 0.0158, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.5403846153846154, | |
| "eval_loss": 0.02694498375058174, | |
| "eval_runtime": 180.0245, | |
| "eval_samples_per_second": 256.193, | |
| "eval_steps_per_second": 2.005, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.5423076923076923, | |
| "grad_norm": 2.1175975799560547, | |
| "learning_rate": 4.596153846153846e-05, | |
| "loss": 0.0591, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.5423076923076923, | |
| "eval_loss": 0.026078298687934875, | |
| "eval_runtime": 180.9972, | |
| "eval_samples_per_second": 254.816, | |
| "eval_steps_per_second": 1.995, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.5442307692307692, | |
| "grad_norm": 1.398386836051941, | |
| "learning_rate": 4.576923076923077e-05, | |
| "loss": 0.0313, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.5442307692307692, | |
| "eval_loss": 0.02475452423095703, | |
| "eval_runtime": 181.364, | |
| "eval_samples_per_second": 254.301, | |
| "eval_steps_per_second": 1.99, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.5461538461538461, | |
| "grad_norm": 0.47163820266723633, | |
| "learning_rate": 4.557692307692308e-05, | |
| "loss": 0.0078, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.5461538461538461, | |
| "eval_loss": 0.023184489458799362, | |
| "eval_runtime": 182.4892, | |
| "eval_samples_per_second": 252.733, | |
| "eval_steps_per_second": 1.978, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.5480769230769231, | |
| "grad_norm": 1.4991772174835205, | |
| "learning_rate": 4.538461538461539e-05, | |
| "loss": 0.0266, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.5480769230769231, | |
| "eval_loss": 0.021967096254229546, | |
| "eval_runtime": 184.1286, | |
| "eval_samples_per_second": 250.483, | |
| "eval_steps_per_second": 1.961, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.7483875155448914, | |
| "learning_rate": 4.519230769230769e-05, | |
| "loss": 0.0084, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "eval_loss": 0.020483436062932014, | |
| "eval_runtime": 183.7344, | |
| "eval_samples_per_second": 251.02, | |
| "eval_steps_per_second": 1.965, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.551923076923077, | |
| "grad_norm": 0.33013495802879333, | |
| "learning_rate": 4.5e-05, | |
| "loss": 0.0057, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.551923076923077, | |
| "eval_loss": 0.019703133031725883, | |
| "eval_runtime": 181.8834, | |
| "eval_samples_per_second": 253.575, | |
| "eval_steps_per_second": 1.985, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.5538461538461539, | |
| "grad_norm": 2.1860246658325195, | |
| "learning_rate": 4.4807692307692314e-05, | |
| "loss": 0.0241, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.5538461538461539, | |
| "eval_loss": 0.018677791580557823, | |
| "eval_runtime": 182.9398, | |
| "eval_samples_per_second": 252.11, | |
| "eval_steps_per_second": 1.973, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.5557692307692308, | |
| "grad_norm": 1.618175983428955, | |
| "learning_rate": 4.461538461538462e-05, | |
| "loss": 0.0187, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.5557692307692308, | |
| "eval_loss": 0.018410805612802505, | |
| "eval_runtime": 184.3183, | |
| "eval_samples_per_second": 250.225, | |
| "eval_steps_per_second": 1.959, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.5576923076923077, | |
| "grad_norm": 1.3602591753005981, | |
| "learning_rate": 4.442307692307692e-05, | |
| "loss": 0.0465, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.5576923076923077, | |
| "eval_loss": 0.018333878368139267, | |
| "eval_runtime": 183.9716, | |
| "eval_samples_per_second": 250.696, | |
| "eval_steps_per_second": 1.962, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.5596153846153846, | |
| "grad_norm": 2.3707096576690674, | |
| "learning_rate": 4.423076923076923e-05, | |
| "loss": 0.071, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.5596153846153846, | |
| "eval_loss": 0.01808229647576809, | |
| "eval_runtime": 183.4525, | |
| "eval_samples_per_second": 251.406, | |
| "eval_steps_per_second": 1.968, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.5615384615384615, | |
| "grad_norm": 1.8033769130706787, | |
| "learning_rate": 4.403846153846154e-05, | |
| "loss": 0.0206, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.5615384615384615, | |
| "eval_loss": 0.017939355224370956, | |
| "eval_runtime": 184.5771, | |
| "eval_samples_per_second": 249.874, | |
| "eval_steps_per_second": 1.956, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.5634615384615385, | |
| "grad_norm": 2.6258585453033447, | |
| "learning_rate": 4.384615384615385e-05, | |
| "loss": 0.0291, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.5634615384615385, | |
| "eval_loss": 0.01776740886271, | |
| "eval_runtime": 184.1944, | |
| "eval_samples_per_second": 250.393, | |
| "eval_steps_per_second": 1.96, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.5653846153846154, | |
| "grad_norm": 0.5941898226737976, | |
| "learning_rate": 4.365384615384616e-05, | |
| "loss": 0.0103, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.5653846153846154, | |
| "eval_loss": 0.017663318663835526, | |
| "eval_runtime": 185.8504, | |
| "eval_samples_per_second": 248.162, | |
| "eval_steps_per_second": 1.942, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.5673076923076923, | |
| "grad_norm": 1.4761334657669067, | |
| "learning_rate": 4.346153846153846e-05, | |
| "loss": 0.0336, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.5673076923076923, | |
| "eval_loss": 0.017522111535072327, | |
| "eval_runtime": 185.2284, | |
| "eval_samples_per_second": 248.995, | |
| "eval_steps_per_second": 1.949, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.5692307692307692, | |
| "grad_norm": 3.071438789367676, | |
| "learning_rate": 4.326923076923077e-05, | |
| "loss": 0.0481, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.5692307692307692, | |
| "eval_loss": 0.01728859543800354, | |
| "eval_runtime": 182.7731, | |
| "eval_samples_per_second": 252.34, | |
| "eval_steps_per_second": 1.975, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.5711538461538461, | |
| "grad_norm": 2.6939680576324463, | |
| "learning_rate": 4.3076923076923084e-05, | |
| "loss": 0.0361, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.5711538461538461, | |
| "eval_loss": 0.01725272834300995, | |
| "eval_runtime": 183.1168, | |
| "eval_samples_per_second": 251.867, | |
| "eval_steps_per_second": 1.971, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.573076923076923, | |
| "grad_norm": 1.5768874883651733, | |
| "learning_rate": 4.288461538461538e-05, | |
| "loss": 0.0165, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.573076923076923, | |
| "eval_loss": 0.017209839075803757, | |
| "eval_runtime": 183.6691, | |
| "eval_samples_per_second": 251.109, | |
| "eval_steps_per_second": 1.965, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.575, | |
| "grad_norm": 0.6393303871154785, | |
| "learning_rate": 4.269230769230769e-05, | |
| "loss": 0.0093, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.575, | |
| "eval_loss": 0.017135918140411377, | |
| "eval_runtime": 187.5931, | |
| "eval_samples_per_second": 245.857, | |
| "eval_steps_per_second": 1.924, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.5769230769230769, | |
| "grad_norm": 3.0353610515594482, | |
| "learning_rate": 4.25e-05, | |
| "loss": 0.0203, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.5769230769230769, | |
| "eval_loss": 0.017029576003551483, | |
| "eval_runtime": 187.2345, | |
| "eval_samples_per_second": 246.327, | |
| "eval_steps_per_second": 1.928, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.5788461538461539, | |
| "grad_norm": 1.7909221649169922, | |
| "learning_rate": 4.230769230769231e-05, | |
| "loss": 0.0302, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.5788461538461539, | |
| "eval_loss": 0.01717795990407467, | |
| "eval_runtime": 187.1637, | |
| "eval_samples_per_second": 246.421, | |
| "eval_steps_per_second": 1.929, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.5807692307692308, | |
| "grad_norm": 0.9146409034729004, | |
| "learning_rate": 4.211538461538462e-05, | |
| "loss": 0.0089, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.5807692307692308, | |
| "eval_loss": 0.017274800688028336, | |
| "eval_runtime": 186.4945, | |
| "eval_samples_per_second": 247.305, | |
| "eval_steps_per_second": 1.936, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.5826923076923077, | |
| "grad_norm": 0.9270340800285339, | |
| "learning_rate": 4.192307692307693e-05, | |
| "loss": 0.0118, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.5826923076923077, | |
| "eval_loss": 0.01724671758711338, | |
| "eval_runtime": 186.0669, | |
| "eval_samples_per_second": 247.873, | |
| "eval_steps_per_second": 1.94, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.5846153846153846, | |
| "grad_norm": 1.39608895778656, | |
| "learning_rate": 4.173076923076923e-05, | |
| "loss": 0.0338, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.5846153846153846, | |
| "eval_loss": 0.017357762902975082, | |
| "eval_runtime": 187.0885, | |
| "eval_samples_per_second": 246.52, | |
| "eval_steps_per_second": 1.93, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.5865384615384616, | |
| "grad_norm": 1.7433981895446777, | |
| "learning_rate": 4.1538461538461544e-05, | |
| "loss": 0.0175, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.5865384615384616, | |
| "eval_loss": 0.017904143780469894, | |
| "eval_runtime": 186.9937, | |
| "eval_samples_per_second": 246.645, | |
| "eval_steps_per_second": 1.931, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.5884615384615385, | |
| "grad_norm": 0.5012370944023132, | |
| "learning_rate": 4.134615384615385e-05, | |
| "loss": 0.0075, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.5884615384615385, | |
| "eval_loss": 0.018834874033927917, | |
| "eval_runtime": 187.3986, | |
| "eval_samples_per_second": 246.112, | |
| "eval_steps_per_second": 1.926, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.5903846153846154, | |
| "grad_norm": 2.3230066299438477, | |
| "learning_rate": 4.115384615384615e-05, | |
| "loss": 0.015, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.5903846153846154, | |
| "eval_loss": 0.020419873297214508, | |
| "eval_runtime": 188.2229, | |
| "eval_samples_per_second": 245.034, | |
| "eval_steps_per_second": 1.918, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.5923076923076923, | |
| "grad_norm": 1.2293483018875122, | |
| "learning_rate": 4.096153846153846e-05, | |
| "loss": 0.0147, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.5923076923076923, | |
| "eval_loss": 0.0223609060049057, | |
| "eval_runtime": 186.8039, | |
| "eval_samples_per_second": 246.895, | |
| "eval_steps_per_second": 1.933, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.5942307692307692, | |
| "grad_norm": 1.8343030214309692, | |
| "learning_rate": 4.0769230769230773e-05, | |
| "loss": 0.0256, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.5942307692307692, | |
| "eval_loss": 0.02358538843691349, | |
| "eval_runtime": 185.2135, | |
| "eval_samples_per_second": 249.015, | |
| "eval_steps_per_second": 1.949, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.5961538461538461, | |
| "grad_norm": 0.8146288990974426, | |
| "learning_rate": 4.057692307692308e-05, | |
| "loss": 0.0071, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.5961538461538461, | |
| "eval_loss": 0.02386535331606865, | |
| "eval_runtime": 183.8932, | |
| "eval_samples_per_second": 250.803, | |
| "eval_steps_per_second": 1.963, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.5980769230769231, | |
| "grad_norm": 4.294083595275879, | |
| "learning_rate": 4.038461538461539e-05, | |
| "loss": 0.0456, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.5980769230769231, | |
| "eval_loss": 0.023020418360829353, | |
| "eval_runtime": 183.5896, | |
| "eval_samples_per_second": 251.218, | |
| "eval_steps_per_second": 1.966, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 1.3458585739135742, | |
| "learning_rate": 4.019230769230769e-05, | |
| "loss": 0.0293, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "eval_loss": 0.021461069583892822, | |
| "eval_runtime": 183.1076, | |
| "eval_samples_per_second": 251.879, | |
| "eval_steps_per_second": 1.972, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.6019230769230769, | |
| "grad_norm": 1.0707663297653198, | |
| "learning_rate": 4e-05, | |
| "loss": 0.0183, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.6019230769230769, | |
| "eval_loss": 0.020740246400237083, | |
| "eval_runtime": 182.7591, | |
| "eval_samples_per_second": 252.36, | |
| "eval_steps_per_second": 1.975, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.6038461538461538, | |
| "grad_norm": 0.22060979902744293, | |
| "learning_rate": 3.980769230769231e-05, | |
| "loss": 0.0057, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.6038461538461538, | |
| "eval_loss": 0.020306937396526337, | |
| "eval_runtime": 181.6239, | |
| "eval_samples_per_second": 253.937, | |
| "eval_steps_per_second": 1.988, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.6057692307692307, | |
| "grad_norm": 0.45676878094673157, | |
| "learning_rate": 3.961538461538462e-05, | |
| "loss": 0.0559, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.6057692307692307, | |
| "eval_loss": 0.019902806729078293, | |
| "eval_runtime": 187.9515, | |
| "eval_samples_per_second": 245.388, | |
| "eval_steps_per_second": 1.921, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.6076923076923076, | |
| "grad_norm": 1.1361058950424194, | |
| "learning_rate": 3.942307692307692e-05, | |
| "loss": 0.014, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.6076923076923076, | |
| "eval_loss": 0.01993195153772831, | |
| "eval_runtime": 183.33, | |
| "eval_samples_per_second": 251.574, | |
| "eval_steps_per_second": 1.969, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.6096153846153847, | |
| "grad_norm": 2.384164810180664, | |
| "learning_rate": 3.923076923076923e-05, | |
| "loss": 0.0516, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 0.6096153846153847, | |
| "eval_loss": 0.019640127196907997, | |
| "eval_runtime": 181.9327, | |
| "eval_samples_per_second": 253.506, | |
| "eval_steps_per_second": 1.984, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 0.6115384615384616, | |
| "grad_norm": 2.3011910915374756, | |
| "learning_rate": 3.903846153846154e-05, | |
| "loss": 0.0502, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.6115384615384616, | |
| "eval_loss": 0.019097890704870224, | |
| "eval_runtime": 182.2105, | |
| "eval_samples_per_second": 253.119, | |
| "eval_steps_per_second": 1.981, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.6134615384615385, | |
| "grad_norm": 2.6372344493865967, | |
| "learning_rate": 3.884615384615385e-05, | |
| "loss": 0.0871, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.6134615384615385, | |
| "eval_loss": 0.018500829115509987, | |
| "eval_runtime": 184.0891, | |
| "eval_samples_per_second": 250.536, | |
| "eval_steps_per_second": 1.961, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.6153846153846154, | |
| "grad_norm": 0.08721642941236496, | |
| "learning_rate": 3.865384615384616e-05, | |
| "loss": 0.0039, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.6153846153846154, | |
| "eval_loss": 0.018185600638389587, | |
| "eval_runtime": 185.4703, | |
| "eval_samples_per_second": 248.671, | |
| "eval_steps_per_second": 1.946, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.6173076923076923, | |
| "grad_norm": 2.712874174118042, | |
| "learning_rate": 3.846153846153846e-05, | |
| "loss": 0.0125, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.6173076923076923, | |
| "eval_loss": 0.018078332766890526, | |
| "eval_runtime": 182.7576, | |
| "eval_samples_per_second": 252.362, | |
| "eval_steps_per_second": 1.975, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.6192307692307693, | |
| "grad_norm": 2.1178512573242188, | |
| "learning_rate": 3.826923076923077e-05, | |
| "loss": 0.0438, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.6192307692307693, | |
| "eval_loss": 0.01808401755988598, | |
| "eval_runtime": 180.7165, | |
| "eval_samples_per_second": 255.212, | |
| "eval_steps_per_second": 1.998, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.6211538461538462, | |
| "grad_norm": 1.4352222681045532, | |
| "learning_rate": 3.807692307692308e-05, | |
| "loss": 0.0148, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.6211538461538462, | |
| "eval_loss": 0.018278954550623894, | |
| "eval_runtime": 180.0406, | |
| "eval_samples_per_second": 256.17, | |
| "eval_steps_per_second": 2.005, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.6230769230769231, | |
| "grad_norm": 0.1123296320438385, | |
| "learning_rate": 3.788461538461538e-05, | |
| "loss": 0.004, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.6230769230769231, | |
| "eval_loss": 0.018484123051166534, | |
| "eval_runtime": 180.4878, | |
| "eval_samples_per_second": 255.535, | |
| "eval_steps_per_second": 2.0, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.625, | |
| "grad_norm": 2.1391522884368896, | |
| "learning_rate": 3.769230769230769e-05, | |
| "loss": 0.02, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.625, | |
| "eval_loss": 0.01890737935900688, | |
| "eval_runtime": 179.5465, | |
| "eval_samples_per_second": 256.875, | |
| "eval_steps_per_second": 2.011, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.6269230769230769, | |
| "grad_norm": 0.7092868089675903, | |
| "learning_rate": 3.7500000000000003e-05, | |
| "loss": 0.0103, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.6269230769230769, | |
| "eval_loss": 0.01960405707359314, | |
| "eval_runtime": 180.3603, | |
| "eval_samples_per_second": 255.716, | |
| "eval_steps_per_second": 2.002, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.6288461538461538, | |
| "grad_norm": 1.3859364986419678, | |
| "learning_rate": 3.730769230769231e-05, | |
| "loss": 0.0121, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.6288461538461538, | |
| "eval_loss": 0.02096397802233696, | |
| "eval_runtime": 180.3107, | |
| "eval_samples_per_second": 255.786, | |
| "eval_steps_per_second": 2.002, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.6307692307692307, | |
| "grad_norm": 1.0377469062805176, | |
| "learning_rate": 3.711538461538462e-05, | |
| "loss": 0.0251, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.6307692307692307, | |
| "eval_loss": 0.023392662405967712, | |
| "eval_runtime": 180.6358, | |
| "eval_samples_per_second": 255.326, | |
| "eval_steps_per_second": 1.998, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.6326923076923077, | |
| "grad_norm": 0.7841097712516785, | |
| "learning_rate": 3.692307692307693e-05, | |
| "loss": 0.0381, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.6326923076923077, | |
| "eval_loss": 0.02570357732474804, | |
| "eval_runtime": 180.4709, | |
| "eval_samples_per_second": 255.559, | |
| "eval_steps_per_second": 2.0, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.6346153846153846, | |
| "grad_norm": 0.651456356048584, | |
| "learning_rate": 3.673076923076923e-05, | |
| "loss": 0.0058, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.6346153846153846, | |
| "eval_loss": 0.027660323306918144, | |
| "eval_runtime": 180.74, | |
| "eval_samples_per_second": 255.179, | |
| "eval_steps_per_second": 1.997, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.6365384615384615, | |
| "grad_norm": 1.0176434516906738, | |
| "learning_rate": 3.653846153846154e-05, | |
| "loss": 0.0062, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 0.6365384615384615, | |
| "eval_loss": 0.028412554413080215, | |
| "eval_runtime": 180.2559, | |
| "eval_samples_per_second": 255.864, | |
| "eval_steps_per_second": 2.003, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 0.6384615384615384, | |
| "grad_norm": 1.440993070602417, | |
| "learning_rate": 3.634615384615385e-05, | |
| "loss": 0.0134, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.6384615384615384, | |
| "eval_loss": 0.02854098007082939, | |
| "eval_runtime": 179.5351, | |
| "eval_samples_per_second": 256.891, | |
| "eval_steps_per_second": 2.011, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.6403846153846153, | |
| "grad_norm": 0.8128412961959839, | |
| "learning_rate": 3.615384615384615e-05, | |
| "loss": 0.006, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.6403846153846153, | |
| "eval_loss": 0.027823466807603836, | |
| "eval_runtime": 181.214, | |
| "eval_samples_per_second": 254.511, | |
| "eval_steps_per_second": 1.992, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.6423076923076924, | |
| "grad_norm": 0.12252137064933777, | |
| "learning_rate": 3.596153846153846e-05, | |
| "loss": 0.0037, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.6423076923076924, | |
| "eval_loss": 0.02721521630883217, | |
| "eval_runtime": 178.9703, | |
| "eval_samples_per_second": 257.702, | |
| "eval_steps_per_second": 2.017, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.6442307692307693, | |
| "grad_norm": 2.2254228591918945, | |
| "learning_rate": 3.5769230769230774e-05, | |
| "loss": 0.0551, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.6442307692307693, | |
| "eval_loss": 0.025851406157016754, | |
| "eval_runtime": 181.1771, | |
| "eval_samples_per_second": 254.563, | |
| "eval_steps_per_second": 1.993, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.6461538461538462, | |
| "grad_norm": 4.576323986053467, | |
| "learning_rate": 3.557692307692308e-05, | |
| "loss": 0.0756, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.6461538461538462, | |
| "eval_loss": 0.024042945355176926, | |
| "eval_runtime": 180.2984, | |
| "eval_samples_per_second": 255.804, | |
| "eval_steps_per_second": 2.002, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.6480769230769231, | |
| "grad_norm": 2.2256715297698975, | |
| "learning_rate": 3.538461538461539e-05, | |
| "loss": 0.036, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.6480769230769231, | |
| "eval_loss": 0.022502081468701363, | |
| "eval_runtime": 179.5004, | |
| "eval_samples_per_second": 256.941, | |
| "eval_steps_per_second": 2.011, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.19705277681350708, | |
| "learning_rate": 3.51923076923077e-05, | |
| "loss": 0.0045, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "eval_loss": 0.021358314901590347, | |
| "eval_runtime": 179.5403, | |
| "eval_samples_per_second": 256.884, | |
| "eval_steps_per_second": 2.011, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.6519230769230769, | |
| "grad_norm": 2.603549003601074, | |
| "learning_rate": 3.5e-05, | |
| "loss": 0.0265, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.6519230769230769, | |
| "eval_loss": 0.020234843716025352, | |
| "eval_runtime": 180.9278, | |
| "eval_samples_per_second": 254.914, | |
| "eval_steps_per_second": 1.995, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.6538461538461539, | |
| "grad_norm": 0.22635461390018463, | |
| "learning_rate": 3.480769230769231e-05, | |
| "loss": 0.0055, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.6538461538461539, | |
| "eval_loss": 0.0195136871188879, | |
| "eval_runtime": 181.0715, | |
| "eval_samples_per_second": 254.711, | |
| "eval_steps_per_second": 1.994, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.6557692307692308, | |
| "grad_norm": 2.3919484615325928, | |
| "learning_rate": 3.461538461538462e-05, | |
| "loss": 0.0184, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.6557692307692308, | |
| "eval_loss": 0.019222378730773926, | |
| "eval_runtime": 180.631, | |
| "eval_samples_per_second": 255.333, | |
| "eval_steps_per_second": 1.999, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.6576923076923077, | |
| "grad_norm": 0.7615954875946045, | |
| "learning_rate": 3.442307692307692e-05, | |
| "loss": 0.0068, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.6576923076923077, | |
| "eval_loss": 0.019121317192912102, | |
| "eval_runtime": 179.541, | |
| "eval_samples_per_second": 256.883, | |
| "eval_steps_per_second": 2.011, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.6596153846153846, | |
| "grad_norm": 0.15638679265975952, | |
| "learning_rate": 3.4230769230769234e-05, | |
| "loss": 0.0037, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.6596153846153846, | |
| "eval_loss": 0.01907580904662609, | |
| "eval_runtime": 180.9808, | |
| "eval_samples_per_second": 254.839, | |
| "eval_steps_per_second": 1.995, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.6615384615384615, | |
| "grad_norm": 1.0095447301864624, | |
| "learning_rate": 3.4038461538461544e-05, | |
| "loss": 0.029, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.6615384615384615, | |
| "eval_loss": 0.018844593316316605, | |
| "eval_runtime": 181.0563, | |
| "eval_samples_per_second": 254.733, | |
| "eval_steps_per_second": 1.994, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.6634615384615384, | |
| "grad_norm": 1.0952856540679932, | |
| "learning_rate": 3.384615384615385e-05, | |
| "loss": 0.0253, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.6634615384615384, | |
| "eval_loss": 0.01858992874622345, | |
| "eval_runtime": 180.445, | |
| "eval_samples_per_second": 255.596, | |
| "eval_steps_per_second": 2.001, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.6653846153846154, | |
| "grad_norm": 3.664583683013916, | |
| "learning_rate": 3.365384615384616e-05, | |
| "loss": 0.0702, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.6653846153846154, | |
| "eval_loss": 0.01833895593881607, | |
| "eval_runtime": 181.7517, | |
| "eval_samples_per_second": 253.758, | |
| "eval_steps_per_second": 1.986, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.6673076923076923, | |
| "grad_norm": 2.9066598415374756, | |
| "learning_rate": 3.346153846153846e-05, | |
| "loss": 0.0194, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.6673076923076923, | |
| "eval_loss": 0.018282251432538033, | |
| "eval_runtime": 179.7843, | |
| "eval_samples_per_second": 256.535, | |
| "eval_steps_per_second": 2.008, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.6692307692307692, | |
| "grad_norm": 1.9174058437347412, | |
| "learning_rate": 3.326923076923077e-05, | |
| "loss": 0.0123, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.6692307692307692, | |
| "eval_loss": 0.018363026902079582, | |
| "eval_runtime": 179.7461, | |
| "eval_samples_per_second": 256.59, | |
| "eval_steps_per_second": 2.008, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.6711538461538461, | |
| "grad_norm": 0.2594242990016937, | |
| "learning_rate": 3.307692307692308e-05, | |
| "loss": 0.0039, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.6711538461538461, | |
| "eval_loss": 0.018489746376872063, | |
| "eval_runtime": 179.7349, | |
| "eval_samples_per_second": 256.606, | |
| "eval_steps_per_second": 2.009, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.6730769230769231, | |
| "grad_norm": 5.305335521697998, | |
| "learning_rate": 3.288461538461539e-05, | |
| "loss": 0.0745, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.6730769230769231, | |
| "eval_loss": 0.01867184229195118, | |
| "eval_runtime": 181.4346, | |
| "eval_samples_per_second": 254.202, | |
| "eval_steps_per_second": 1.99, | |
| "step": 350 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 520, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 22569032908800.0, | |
| "train_batch_size": 128, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |