{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.988235294117647,
  "eval_steps": 15,
  "global_step": 296,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0,
      "eval_loss": 3.8581998348236084,
      "eval_runtime": 30.2305,
      "eval_samples_per_second": 41.647,
      "eval_steps_per_second": 5.227,
      "step": 0
    },
    {
      "epoch": 0.0067226890756302525,
      "grad_norm": 49.25,
      "learning_rate": 0.0,
      "loss": 3.8278,
      "step": 1
    },
    {
      "epoch": 0.013445378151260505,
      "grad_norm": 48.0,
      "learning_rate": 1.25e-06,
      "loss": 3.877,
      "step": 2
    },
    {
      "epoch": 0.020168067226890758,
      "grad_norm": 49.5,
      "learning_rate": 2.5e-06,
      "loss": 3.8607,
      "step": 3
    },
    {
      "epoch": 0.02689075630252101,
      "grad_norm": 52.0,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 3.7408,
      "step": 4
    },
    {
      "epoch": 0.03361344537815126,
      "grad_norm": 43.25,
      "learning_rate": 5e-06,
      "loss": 3.7054,
      "step": 5
    },
    {
      "epoch": 0.040336134453781515,
      "grad_norm": 44.75,
      "learning_rate": 6.25e-06,
      "loss": 3.7652,
      "step": 6
    },
    {
      "epoch": 0.047058823529411764,
      "grad_norm": 36.75,
      "learning_rate": 7.500000000000001e-06,
      "loss": 3.7236,
      "step": 7
    },
    {
      "epoch": 0.05378151260504202,
      "grad_norm": 35.75,
      "learning_rate": 8.750000000000001e-06,
      "loss": 3.7419,
      "step": 8
    },
    {
      "epoch": 0.06050420168067227,
      "grad_norm": 21.875,
      "learning_rate": 1e-05,
      "loss": 3.6896,
      "step": 9
    },
    {
      "epoch": 0.06722689075630252,
      "grad_norm": 17.5,
      "learning_rate": 9.99970252500075e-06,
      "loss": 3.5791,
      "step": 10
    },
    {
      "epoch": 0.07394957983193277,
      "grad_norm": 15.375,
      "learning_rate": 9.998810135399545e-06,
      "loss": 3.5491,
      "step": 11
    },
    {
      "epoch": 0.08067226890756303,
      "grad_norm": 12.4375,
      "learning_rate": 9.997322937381829e-06,
      "loss": 3.5476,
      "step": 12
    },
    {
      "epoch": 0.08739495798319327,
      "grad_norm": 11.8125,
      "learning_rate": 9.99524110790929e-06,
      "loss": 3.5258,
      "step": 13
    },
    {
      "epoch": 0.09411764705882353,
      "grad_norm": 10.25,
      "learning_rate": 9.992564894698816e-06,
      "loss": 3.5072,
      "step": 14
    },
    {
      "epoch": 0.10084033613445378,
      "grad_norm": 9.375,
      "learning_rate": 9.989294616193018e-06,
      "loss": 3.4802,
      "step": 15
    },
    {
      "epoch": 0.10084033613445378,
      "eval_loss": 3.5118489265441895,
      "eval_runtime": 30.2065,
      "eval_samples_per_second": 41.68,
      "eval_steps_per_second": 5.231,
      "step": 15
    },
    {
      "epoch": 0.10756302521008404,
      "grad_norm": 7.8125,
      "learning_rate": 9.985430661522333e-06,
      "loss": 3.4675,
      "step": 16
    },
    {
      "epoch": 0.11428571428571428,
      "grad_norm": 7.5,
      "learning_rate": 9.980973490458728e-06,
      "loss": 3.4914,
      "step": 17
    },
    {
      "epoch": 0.12100840336134454,
      "grad_norm": 7.65625,
      "learning_rate": 9.975923633360985e-06,
      "loss": 3.5515,
      "step": 18
    },
    {
      "epoch": 0.12773109243697478,
      "grad_norm": 7.25,
      "learning_rate": 9.970281691111598e-06,
      "loss": 3.4299,
      "step": 19
    },
    {
      "epoch": 0.13445378151260504,
      "grad_norm": 7.5625,
      "learning_rate": 9.964048335045276e-06,
      "loss": 3.4942,
      "step": 20
    },
    {
      "epoch": 0.1411764705882353,
      "grad_norm": 6.5625,
      "learning_rate": 9.957224306869053e-06,
      "loss": 3.4355,
      "step": 21
    },
    {
      "epoch": 0.14789915966386555,
      "grad_norm": 6.53125,
      "learning_rate": 9.94981041857404e-06,
      "loss": 3.4162,
      "step": 22
    },
    {
      "epoch": 0.1546218487394958,
      "grad_norm": 6.25,
      "learning_rate": 9.941807552338805e-06,
      "loss": 3.4885,
      "step": 23
    },
    {
      "epoch": 0.16134453781512606,
      "grad_norm": 6.03125,
      "learning_rate": 9.933216660424396e-06,
      "loss": 3.5423,
      "step": 24
    },
    {
      "epoch": 0.16806722689075632,
      "grad_norm": 5.84375,
      "learning_rate": 9.924038765061042e-06,
      "loss": 3.4046,
      "step": 25
    },
    {
      "epoch": 0.17478991596638654,
      "grad_norm": 6.1875,
      "learning_rate": 9.914274958326507e-06,
      "loss": 3.3982,
      "step": 26
    },
    {
      "epoch": 0.1815126050420168,
      "grad_norm": 5.6875,
      "learning_rate": 9.903926402016153e-06,
      "loss": 3.4689,
      "step": 27
    },
    {
      "epoch": 0.18823529411764706,
      "grad_norm": 7.03125,
      "learning_rate": 9.892994327504693e-06,
      "loss": 3.5937,
      "step": 28
    },
    {
      "epoch": 0.1949579831932773,
      "grad_norm": 6.15625,
      "learning_rate": 9.881480035599667e-06,
      "loss": 3.4518,
      "step": 29
    },
    {
      "epoch": 0.20168067226890757,
      "grad_norm": 5.9375,
      "learning_rate": 9.869384896386669e-06,
      "loss": 3.4608,
      "step": 30
    },
    {
      "epoch": 0.20168067226890757,
      "eval_loss": 3.489025354385376,
      "eval_runtime": 29.7531,
      "eval_samples_per_second": 42.315,
      "eval_steps_per_second": 5.31,
      "step": 30
    },
    {
      "epoch": 0.20840336134453782,
      "grad_norm": 5.40625,
      "learning_rate": 9.856710349066307e-06,
      "loss": 3.5205,
      "step": 31
    },
    {
      "epoch": 0.21512605042016808,
      "grad_norm": 5.0,
      "learning_rate": 9.843457901782967e-06,
      "loss": 3.4322,
      "step": 32
    },
    {
      "epoch": 0.2218487394957983,
      "grad_norm": 5.875,
      "learning_rate": 9.829629131445342e-06,
      "loss": 3.4981,
      "step": 33
    },
    {
      "epoch": 0.22857142857142856,
      "grad_norm": 5.875,
      "learning_rate": 9.815225683538814e-06,
      "loss": 3.3736,
      "step": 34
    },
    {
      "epoch": 0.23529411764705882,
      "grad_norm": 5.1875,
      "learning_rate": 9.800249271929645e-06,
      "loss": 3.4398,
      "step": 35
    },
    {
      "epoch": 0.24201680672268908,
      "grad_norm": 4.90625,
      "learning_rate": 9.784701678661045e-06,
      "loss": 3.4934,
      "step": 36
    },
    {
      "epoch": 0.24873949579831933,
      "grad_norm": 4.84375,
      "learning_rate": 9.768584753741134e-06,
      "loss": 3.4506,
      "step": 37
    },
    {
      "epoch": 0.25546218487394956,
      "grad_norm": 4.5,
      "learning_rate": 9.751900414922807e-06,
      "loss": 3.4764,
      "step": 38
    },
    {
      "epoch": 0.26218487394957984,
      "grad_norm": 4.8125,
      "learning_rate": 9.73465064747553e-06,
      "loss": 3.547,
      "step": 39
    },
    {
      "epoch": 0.2689075630252101,
      "grad_norm": 5.3125,
      "learning_rate": 9.716837503949128e-06,
      "loss": 3.4394,
      "step": 40
    },
    {
      "epoch": 0.27563025210084036,
      "grad_norm": 4.6875,
      "learning_rate": 9.698463103929542e-06,
      "loss": 3.4722,
      "step": 41
    },
    {
      "epoch": 0.2823529411764706,
      "grad_norm": 4.75,
      "learning_rate": 9.67952963378663e-06,
      "loss": 3.4639,
      "step": 42
    },
    {
      "epoch": 0.28907563025210087,
      "grad_norm": 5.0,
      "learning_rate": 9.660039346413994e-06,
      "loss": 3.4936,
      "step": 43
    },
    {
      "epoch": 0.2957983193277311,
      "grad_norm": 4.40625,
      "learning_rate": 9.639994560960923e-06,
      "loss": 3.5191,
      "step": 44
    },
    {
      "epoch": 0.3025210084033613,
      "grad_norm": 4.0625,
      "learning_rate": 9.619397662556434e-06,
      "loss": 3.5272,
      "step": 45
    },
    {
      "epoch": 0.3025210084033613,
      "eval_loss": 3.5188682079315186,
      "eval_runtime": 30.2246,
      "eval_samples_per_second": 41.655,
      "eval_steps_per_second": 5.228,
      "step": 45
    },
    {
      "epoch": 0.3092436974789916,
      "grad_norm": 4.59375,
      "learning_rate": 9.598251102025463e-06,
      "loss": 3.5391,
      "step": 46
    },
    {
      "epoch": 0.31596638655462184,
      "grad_norm": 4.53125,
      "learning_rate": 9.576557395597237e-06,
      "loss": 3.4851,
      "step": 47
    },
    {
      "epoch": 0.3226890756302521,
      "grad_norm": 5.125,
      "learning_rate": 9.55431912460588e-06,
      "loss": 3.5334,
      "step": 48
    },
    {
      "epoch": 0.32941176470588235,
      "grad_norm": 4.625,
      "learning_rate": 9.531538935183252e-06,
      "loss": 3.4687,
      "step": 49
    },
    {
      "epoch": 0.33613445378151263,
      "grad_norm": 5.53125,
      "learning_rate": 9.50821953794408e-06,
      "loss": 3.539,
      "step": 50
    },
    {
      "epoch": 0.34285714285714286,
      "grad_norm": 4.75,
      "learning_rate": 9.484363707663443e-06,
      "loss": 3.5205,
      "step": 51
    },
    {
      "epoch": 0.3495798319327731,
      "grad_norm": 4.84375,
      "learning_rate": 9.459974282946572e-06,
      "loss": 3.5856,
      "step": 52
    },
    {
      "epoch": 0.3563025210084034,
      "grad_norm": 4.78125,
      "learning_rate": 9.43505416589111e-06,
      "loss": 3.5938,
      "step": 53
    },
    {
      "epoch": 0.3630252100840336,
      "grad_norm": 5.0625,
      "learning_rate": 9.409606321741776e-06,
      "loss": 3.5446,
      "step": 54
    },
    {
      "epoch": 0.3697478991596639,
      "grad_norm": 4.625,
      "learning_rate": 9.38363377853754e-06,
      "loss": 3.5746,
      "step": 55
    },
    {
      "epoch": 0.3764705882352941,
      "grad_norm": 4.875,
      "learning_rate": 9.357139626751308e-06,
      "loss": 3.5536,
      "step": 56
    },
    {
      "epoch": 0.3831932773109244,
      "grad_norm": 4.53125,
      "learning_rate": 9.330127018922195e-06,
      "loss": 3.4815,
      "step": 57
    },
    {
      "epoch": 0.3899159663865546,
      "grad_norm": 4.5,
      "learning_rate": 9.302599169280395e-06,
      "loss": 3.5294,
      "step": 58
    },
    {
      "epoch": 0.39663865546218485,
      "grad_norm": 3.96875,
      "learning_rate": 9.274559353364734e-06,
      "loss": 3.476,
      "step": 59
    },
    {
      "epoch": 0.40336134453781514,
      "grad_norm": 5.09375,
      "learning_rate": 9.246010907632894e-06,
      "loss": 3.559,
      "step": 60
    },
    {
      "epoch": 0.40336134453781514,
      "eval_loss": 3.575310230255127,
      "eval_runtime": 29.7614,
      "eval_samples_per_second": 42.303,
      "eval_steps_per_second": 5.309,
      "step": 60
    },
    {
      "epoch": 0.41008403361344536,
      "grad_norm": 4.4375,
      "learning_rate": 9.21695722906443e-06,
      "loss": 3.5118,
      "step": 61
    },
    {
      "epoch": 0.41680672268907565,
      "grad_norm": 5.0625,
      "learning_rate": 9.18740177475654e-06,
      "loss": 3.5701,
      "step": 62
    },
    {
      "epoch": 0.4235294117647059,
      "grad_norm": 4.5,
      "learning_rate": 9.157348061512728e-06,
      "loss": 3.5679,
      "step": 63
    },
    {
      "epoch": 0.43025210084033616,
      "grad_norm": 4.0625,
      "learning_rate": 9.126799665424319e-06,
      "loss": 3.5001,
      "step": 64
    },
    {
      "epoch": 0.4369747899159664,
      "grad_norm": 4.28125,
      "learning_rate": 9.09576022144496e-06,
      "loss": 3.559,
      "step": 65
    },
    {
      "epoch": 0.4436974789915966,
      "grad_norm": 4.75,
      "learning_rate": 9.064233422958078e-06,
      "loss": 3.4816,
      "step": 66
    },
    {
      "epoch": 0.4504201680672269,
      "grad_norm": 3.953125,
      "learning_rate": 9.032223021337415e-06,
      "loss": 3.6286,
      "step": 67
    },
    {
      "epoch": 0.45714285714285713,
      "grad_norm": 4.5625,
      "learning_rate": 8.999732825500649e-06,
      "loss": 3.5596,
      "step": 68
    },
    {
      "epoch": 0.4638655462184874,
      "grad_norm": 4.46875,
      "learning_rate": 8.966766701456177e-06,
      "loss": 3.5409,
      "step": 69
    },
    {
      "epoch": 0.47058823529411764,
      "grad_norm": 6.9375,
      "learning_rate": 8.933328571843086e-06,
      "loss": 3.5449,
      "step": 70
    },
    {
      "epoch": 0.4773109243697479,
      "grad_norm": 5.8125,
      "learning_rate": 8.899422415464409e-06,
      "loss": 3.6107,
      "step": 71
    },
    {
      "epoch": 0.48403361344537815,
      "grad_norm": 5.21875,
      "learning_rate": 8.865052266813686e-06,
      "loss": 3.6243,
      "step": 72
    },
    {
      "epoch": 0.4907563025210084,
      "grad_norm": 4.59375,
      "learning_rate": 8.83022221559489e-06,
      "loss": 3.6119,
      "step": 73
    },
    {
      "epoch": 0.49747899159663866,
      "grad_norm": 5.0,
      "learning_rate": 8.79493640623581e-06,
      "loss": 3.563,
      "step": 74
    },
    {
      "epoch": 0.5042016806722689,
      "grad_norm": 4.875,
      "learning_rate": 8.759199037394888e-06,
      "loss": 3.5817,
      "step": 75
    },
    {
      "epoch": 0.5042016806722689,
      "eval_loss": 3.612149238586426,
      "eval_runtime": 30.2292,
      "eval_samples_per_second": 41.648,
      "eval_steps_per_second": 5.227,
      "step": 75
    },
    {
      "epoch": 0.5109243697478991,
      "grad_norm": 4.1875,
      "learning_rate": 8.723014361461633e-06,
      "loss": 3.5643,
      "step": 76
    },
    {
      "epoch": 0.5176470588235295,
      "grad_norm": 4.40625,
      "learning_rate": 8.68638668405062e-06,
      "loss": 3.5424,
      "step": 77
    },
    {
      "epoch": 0.5243697478991597,
      "grad_norm": 4.875,
      "learning_rate": 8.649320363489178e-06,
      "loss": 3.5679,
      "step": 78
    },
    {
      "epoch": 0.5310924369747899,
      "grad_norm": 5.4375,
      "learning_rate": 8.611819810298778e-06,
      "loss": 3.5269,
      "step": 79
    },
    {
      "epoch": 0.5378151260504201,
      "grad_norm": 4.46875,
      "learning_rate": 8.573889486670233e-06,
      "loss": 3.5913,
      "step": 80
    },
    {
      "epoch": 0.5445378151260504,
      "grad_norm": 4.21875,
      "learning_rate": 8.535533905932739e-06,
      "loss": 3.7066,
      "step": 81
    },
    {
      "epoch": 0.5512605042016807,
      "grad_norm": 4.40625,
      "learning_rate": 8.496757632016836e-06,
      "loss": 3.6143,
      "step": 82
    },
    {
      "epoch": 0.5579831932773109,
      "grad_norm": 4.5,
      "learning_rate": 8.457565278911349e-06,
      "loss": 3.6007,
      "step": 83
    },
    {
      "epoch": 0.5647058823529412,
      "grad_norm": 5.5,
      "learning_rate": 8.417961510114357e-06,
      "loss": 3.5805,
      "step": 84
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 4.15625,
      "learning_rate": 8.377951038078303e-06,
      "loss": 3.5255,
      "step": 85
    },
    {
      "epoch": 0.5781512605042017,
      "grad_norm": 4.21875,
      "learning_rate": 8.337538623649237e-06,
      "loss": 3.6272,
      "step": 86
    },
    {
      "epoch": 0.584873949579832,
      "grad_norm": 4.40625,
      "learning_rate": 8.296729075500345e-06,
      "loss": 3.4642,
      "step": 87
    },
    {
      "epoch": 0.5915966386554622,
      "grad_norm": 4.40625,
      "learning_rate": 8.255527249559747e-06,
      "loss": 3.6105,
      "step": 88
    },
    {
      "epoch": 0.5983193277310924,
      "grad_norm": 5.34375,
      "learning_rate": 8.213938048432697e-06,
      "loss": 3.7054,
      "step": 89
    },
    {
      "epoch": 0.6050420168067226,
      "grad_norm": 4.03125,
      "learning_rate": 8.171966420818227e-06,
      "loss": 3.6349,
      "step": 90
    },
    {
      "epoch": 0.6050420168067226,
      "eval_loss": 3.647097110748291,
      "eval_runtime": 29.7281,
      "eval_samples_per_second": 42.351,
      "eval_steps_per_second": 5.315,
      "step": 90
    },
    {
      "epoch": 0.611764705882353,
      "grad_norm": 4.46875,
      "learning_rate": 8.129617360920297e-06,
      "loss": 3.5585,
      "step": 91
    },
    {
      "epoch": 0.6184873949579832,
      "grad_norm": 4.09375,
      "learning_rate": 8.086895907853526e-06,
      "loss": 3.6065,
      "step": 92
    },
    {
      "epoch": 0.6252100840336134,
      "grad_norm": 4.0625,
      "learning_rate": 8.043807145043604e-06,
      "loss": 3.5808,
      "step": 93
    },
    {
      "epoch": 0.6319327731092437,
      "grad_norm": 4.25,
      "learning_rate": 8.000356199622406e-06,
      "loss": 3.6742,
      "step": 94
    },
    {
      "epoch": 0.6386554621848739,
      "grad_norm": 3.75,
      "learning_rate": 7.956548241817914e-06,
      "loss": 3.609,
      "step": 95
    },
    {
      "epoch": 0.6453781512605042,
      "grad_norm": 4.34375,
      "learning_rate": 7.912388484339012e-06,
      "loss": 3.5559,
      "step": 96
    },
    {
      "epoch": 0.6521008403361345,
      "grad_norm": 4.59375,
      "learning_rate": 7.86788218175523e-06,
      "loss": 3.6504,
      "step": 97
    },
    {
      "epoch": 0.6588235294117647,
      "grad_norm": 4.59375,
      "learning_rate": 7.823034629871503e-06,
      "loss": 3.5724,
      "step": 98
    },
    {
      "epoch": 0.6655462184873949,
      "grad_norm": 5.25,
      "learning_rate": 7.777851165098012e-06,
      "loss": 3.6483,
      "step": 99
    },
    {
      "epoch": 0.6722689075630253,
      "grad_norm": 5.40625,
      "learning_rate": 7.732337163815218e-06,
      "loss": 3.5782,
      "step": 100
    },
    {
      "epoch": 0.6789915966386555,
      "grad_norm": 4.15625,
      "learning_rate": 7.686498041734121e-06,
      "loss": 3.5653,
      "step": 101
    },
    {
      "epoch": 0.6857142857142857,
      "grad_norm": 4.125,
      "learning_rate": 7.64033925325184e-06,
      "loss": 3.6252,
      "step": 102
    },
    {
      "epoch": 0.692436974789916,
      "grad_norm": 4.5625,
      "learning_rate": 7.593866290802608e-06,
      "loss": 3.7141,
      "step": 103
    },
    {
      "epoch": 0.6991596638655462,
      "grad_norm": 4.3125,
      "learning_rate": 7.54708468420421e-06,
      "loss": 3.6884,
      "step": 104
    },
    {
      "epoch": 0.7058823529411765,
      "grad_norm": 4.21875,
      "learning_rate": 7.500000000000001e-06,
      "loss": 3.68,
      "step": 105
    },
    {
      "epoch": 0.7058823529411765,
      "eval_loss": 3.672091484069824,
      "eval_runtime": 30.2768,
      "eval_samples_per_second": 41.583,
      "eval_steps_per_second": 5.219,
      "step": 105
    },
    {
      "epoch": 0.7126050420168067,
      "grad_norm": 4.34375,
      "learning_rate": 7.4526178407965396e-06,
      "loss": 3.5934,
      "step": 106
    },
    {
      "epoch": 0.719327731092437,
      "grad_norm": 3.765625,
      "learning_rate": 7.404943844596939e-06,
      "loss": 3.5845,
      "step": 107
    },
    {
      "epoch": 0.7260504201680672,
      "grad_norm": 3.953125,
      "learning_rate": 7.3569836841299905e-06,
      "loss": 3.6421,
      "step": 108
    },
    {
      "epoch": 0.7327731092436974,
      "grad_norm": 4.53125,
      "learning_rate": 7.308743066175172e-06,
      "loss": 3.6617,
      "step": 109
    },
    {
      "epoch": 0.7394957983193278,
      "grad_norm": 4.71875,
      "learning_rate": 7.2602277308836e-06,
      "loss": 3.6388,
      "step": 110
    },
    {
      "epoch": 0.746218487394958,
      "grad_norm": 4.40625,
      "learning_rate": 7.211443451095007e-06,
      "loss": 3.6798,
      "step": 111
    },
    {
      "epoch": 0.7529411764705882,
      "grad_norm": 4.6875,
      "learning_rate": 7.162396031650831e-06,
      "loss": 3.8081,
      "step": 112
    },
    {
      "epoch": 0.7596638655462185,
      "grad_norm": 4.59375,
      "learning_rate": 7.113091308703498e-06,
      "loss": 3.762,
      "step": 113
    },
    {
      "epoch": 0.7663865546218488,
      "grad_norm": 4.25,
      "learning_rate": 7.063535149021974e-06,
      "loss": 3.5991,
      "step": 114
    },
    {
      "epoch": 0.773109243697479,
      "grad_norm": 4.09375,
      "learning_rate": 7.0137334492936875e-06,
      "loss": 3.6272,
      "step": 115
    },
    {
      "epoch": 0.7798319327731092,
      "grad_norm": 5.1875,
      "learning_rate": 6.963692135422872e-06,
      "loss": 3.7034,
      "step": 116
    },
    {
      "epoch": 0.7865546218487395,
      "grad_norm": 4.6875,
      "learning_rate": 6.913417161825449e-06,
      "loss": 3.6734,
      "step": 117
    },
    {
      "epoch": 0.7932773109243697,
      "grad_norm": 4.5625,
      "learning_rate": 6.862914510720515e-06,
      "loss": 3.6013,
      "step": 118
    },
    {
      "epoch": 0.8,
      "grad_norm": 4.4375,
      "learning_rate": 6.812190191418508e-06,
      "loss": 3.81,
      "step": 119
    },
    {
      "epoch": 0.8067226890756303,
      "grad_norm": 4.15625,
      "learning_rate": 6.7612502396061685e-06,
      "loss": 3.6597,
      "step": 120
    },
    {
      "epoch": 0.8067226890756303,
      "eval_loss": 3.696960926055908,
      "eval_runtime": 29.7615,
      "eval_samples_per_second": 42.303,
      "eval_steps_per_second": 5.309,
      "step": 120
    },
    {
      "epoch": 0.8134453781512605,
      "grad_norm": 4.125,
      "learning_rate": 6.710100716628345e-06,
      "loss": 3.7104,
      "step": 121
    },
    {
      "epoch": 0.8201680672268907,
      "grad_norm": 4.5,
      "learning_rate": 6.6587477087667615e-06,
      "loss": 3.639,
      "step": 122
    },
    {
      "epoch": 0.826890756302521,
      "grad_norm": 4.09375,
      "learning_rate": 6.607197326515808e-06,
      "loss": 3.6311,
      "step": 123
    },
    {
      "epoch": 0.8336134453781513,
      "grad_norm": 4.5,
      "learning_rate": 6.555455703855454e-06,
      "loss": 3.7333,
      "step": 124
    },
    {
      "epoch": 0.8403361344537815,
      "grad_norm": 4.375,
      "learning_rate": 6.503528997521365e-06,
      "loss": 3.7003,
      "step": 125
    },
    {
      "epoch": 0.8470588235294118,
      "grad_norm": 4.375,
      "learning_rate": 6.451423386272312e-06,
      "loss": 3.6759,
      "step": 126
    },
    {
      "epoch": 0.853781512605042,
      "grad_norm": 4.59375,
      "learning_rate": 6.399145070154962e-06,
      "loss": 3.6546,
      "step": 127
    },
    {
      "epoch": 0.8605042016806723,
      "grad_norm": 3.984375,
      "learning_rate": 6.346700269766132e-06,
      "loss": 3.7089,
      "step": 128
    },
    {
      "epoch": 0.8672268907563025,
      "grad_norm": 4.03125,
      "learning_rate": 6.294095225512604e-06,
      "loss": 3.5802,
      "step": 129
    },
    {
      "epoch": 0.8739495798319328,
      "grad_norm": 4.0,
      "learning_rate": 6.241336196868582e-06,
      "loss": 3.7225,
      "step": 130
    },
    {
      "epoch": 0.880672268907563,
      "grad_norm": 4.25,
      "learning_rate": 6.188429461630866e-06,
      "loss": 3.7397,
      "step": 131
    },
    {
      "epoch": 0.8873949579831932,
      "grad_norm": 4.0625,
      "learning_rate": 6.135381315171867e-06,
      "loss": 3.6903,
      "step": 132
    },
    {
      "epoch": 0.8941176470588236,
      "grad_norm": 4.28125,
      "learning_rate": 6.0821980696905145e-06,
      "loss": 3.6114,
      "step": 133
    },
    {
      "epoch": 0.9008403361344538,
      "grad_norm": 4.03125,
      "learning_rate": 6.028886053461175e-06,
      "loss": 3.7576,
      "step": 134
    },
    {
      "epoch": 0.907563025210084,
      "grad_norm": 3.890625,
      "learning_rate": 5.975451610080643e-06,
      "loss": 3.6462,
      "step": 135
    },
    {
      "epoch": 0.907563025210084,
      "eval_loss": 3.706806182861328,
      "eval_runtime": 30.2476,
      "eval_samples_per_second": 41.623,
      "eval_steps_per_second": 5.224,
      "step": 135
    },
    {
      "epoch": 0.9142857142857143,
      "grad_norm": 5.0,
      "learning_rate": 5.921901097713317e-06,
      "loss": 3.6685,
      "step": 136
    },
    {
      "epoch": 0.9210084033613445,
      "grad_norm": 4.9375,
      "learning_rate": 5.8682408883346535e-06,
      "loss": 3.6868,
      "step": 137
    },
    {
      "epoch": 0.9277310924369748,
      "grad_norm": 4.15625,
      "learning_rate": 5.814477366972945e-06,
      "loss": 3.5962,
      "step": 138
    },
    {
      "epoch": 0.934453781512605,
      "grad_norm": 4.0625,
      "learning_rate": 5.760616930949584e-06,
      "loss": 3.6538,
      "step": 139
    },
    {
      "epoch": 0.9411764705882353,
      "grad_norm": 4.25,
      "learning_rate": 5.7066659891178385e-06,
      "loss": 3.7465,
      "step": 140
    },
    {
      "epoch": 0.9478991596638655,
      "grad_norm": 4.0625,
      "learning_rate": 5.65263096110026e-06,
      "loss": 3.6044,
      "step": 141
    },
    {
      "epoch": 0.9546218487394958,
      "grad_norm": 4.40625,
      "learning_rate": 5.598518276524813e-06,
      "loss": 3.6922,
      "step": 142
    },
    {
      "epoch": 0.9613445378151261,
      "grad_norm": 4.6875,
      "learning_rate": 5.544334374259823e-06,
      "loss": 3.6808,
      "step": 143
    },
    {
      "epoch": 0.9680672268907563,
      "grad_norm": 4.5,
      "learning_rate": 5.490085701647805e-06,
      "loss": 3.6849,
      "step": 144
    },
    {
      "epoch": 0.9747899159663865,
      "grad_norm": 7.125,
      "learning_rate": 5.435778713738292e-06,
      "loss": 3.7327,
      "step": 145
    },
    {
      "epoch": 0.9815126050420168,
      "grad_norm": 4.40625,
      "learning_rate": 5.381419872519763e-06,
      "loss": 3.7792,
      "step": 146
    },
    {
      "epoch": 0.9882352941176471,
      "grad_norm": 4.59375,
      "learning_rate": 5.327015646150716e-06,
      "loss": 3.8095,
      "step": 147
    },
    {
      "epoch": 0.9949579831932773,
      "grad_norm": 4.09375,
      "learning_rate": 5.272572508190033e-06,
      "loss": 3.5693,
      "step": 148
    },
    {
      "epoch": 1.0,
      "grad_norm": 4.84375,
      "learning_rate": 5.218096936826681e-06,
      "loss": 3.7536,
      "step": 149
    },
    {
      "epoch": 1.0067226890756302,
      "grad_norm": 4.21875,
      "learning_rate": 5.1635954141088815e-06,
      "loss": 3.7009,
      "step": 150
    },
    {
      "epoch": 1.0067226890756302,
      "eval_loss": 3.721317768096924,
      "eval_runtime": 29.8173,
      "eval_samples_per_second": 42.224,
      "eval_steps_per_second": 5.299,
      "step": 150
    },
    {
      "epoch": 1.0134453781512605,
      "grad_norm": 4.15625,
      "learning_rate": 5.109074425172806e-06,
      "loss": 3.7465,
      "step": 151
    },
    {
      "epoch": 1.0201680672268907,
      "grad_norm": 3.890625,
      "learning_rate": 5.054540457470912e-06,
      "loss": 3.71,
      "step": 152
    },
    {
      "epoch": 1.026890756302521,
      "grad_norm": 4.625,
      "learning_rate": 5e-06,
      "loss": 3.5906,
      "step": 153
    },
    {
      "epoch": 1.0336134453781514,
      "grad_norm": 3.859375,
      "learning_rate": 4.945459542529089e-06,
      "loss": 3.6227,
      "step": 154
    },
    {
      "epoch": 1.0403361344537816,
      "grad_norm": 4.71875,
      "learning_rate": 4.890925574827195e-06,
      "loss": 3.6398,
      "step": 155
    },
    {
      "epoch": 1.0470588235294118,
      "grad_norm": 4.71875,
      "learning_rate": 4.83640458589112e-06,
      "loss": 3.6522,
      "step": 156
    },
    {
      "epoch": 1.053781512605042,
      "grad_norm": 4.9375,
      "learning_rate": 4.781903063173321e-06,
      "loss": 3.7183,
      "step": 157
    },
    {
      "epoch": 1.0605042016806723,
      "grad_norm": 4.125,
      "learning_rate": 4.727427491809968e-06,
      "loss": 3.765,
      "step": 158
    },
    {
      "epoch": 1.0672268907563025,
      "grad_norm": 4.65625,
      "learning_rate": 4.672984353849285e-06,
      "loss": 3.6848,
      "step": 159
    },
    {
      "epoch": 1.0739495798319327,
      "grad_norm": 5.28125,
      "learning_rate": 4.618580127480239e-06,
      "loss": 3.7065,
      "step": 160
    },
    {
      "epoch": 1.080672268907563,
      "grad_norm": 4.46875,
      "learning_rate": 4.564221286261709e-06,
      "loss": 3.7159,
      "step": 161
    },
    {
      "epoch": 1.0873949579831932,
      "grad_norm": 4.75,
      "learning_rate": 4.509914298352197e-06,
      "loss": 3.7166,
      "step": 162
    },
    {
      "epoch": 1.0941176470588236,
      "grad_norm": 4.40625,
      "learning_rate": 4.4556656257401786e-06,
      "loss": 3.7179,
      "step": 163
    },
    {
      "epoch": 1.1008403361344539,
      "grad_norm": 4.75,
      "learning_rate": 4.401481723475189e-06,
      "loss": 3.6981,
      "step": 164
    },
    {
      "epoch": 1.107563025210084,
      "grad_norm": 4.3125,
      "learning_rate": 4.347369038899744e-06,
      "loss": 3.6717,
      "step": 165
    },
    {
      "epoch": 1.107563025210084,
      "eval_loss": 3.731348752975464,
      "eval_runtime": 30.2759,
      "eval_samples_per_second": 41.584,
      "eval_steps_per_second": 5.219,
      "step": 165
    },
    {
      "epoch": 1.1142857142857143,
      "grad_norm": 4.15625,
      "learning_rate": 4.293334010882164e-06,
      "loss": 3.7169,
      "step": 166
    },
    {
      "epoch": 1.1210084033613446,
      "grad_norm": 4.5625,
      "learning_rate": 4.239383069050417e-06,
      "loss": 3.7929,
      "step": 167
    },
    {
      "epoch": 1.1277310924369748,
      "grad_norm": 4.25,
      "learning_rate": 4.185522633027057e-06,
      "loss": 3.663,
      "step": 168
    },
    {
      "epoch": 1.134453781512605,
      "grad_norm": 4.71875,
      "learning_rate": 4.131759111665349e-06,
      "loss": 3.7563,
      "step": 169
    },
    {
      "epoch": 1.1411764705882352,
      "grad_norm": 4.0,
      "learning_rate": 4.078098902286684e-06,
      "loss": 3.6651,
      "step": 170
    },
    {
      "epoch": 1.1478991596638655,
      "grad_norm": 4.15625,
      "learning_rate": 4.02454838991936e-06,
      "loss": 3.6607,
      "step": 171
    },
    {
      "epoch": 1.1546218487394957,
      "grad_norm": 4.4375,
      "learning_rate": 3.971113946538826e-06,
      "loss": 3.7405,
      "step": 172
    },
    {
      "epoch": 1.1613445378151261,
      "grad_norm": 4.46875,
      "learning_rate": 3.917801930309486e-06,
      "loss": 3.7962,
      "step": 173
    },
    {
      "epoch": 1.1680672268907564,
      "grad_norm": 4.34375,
      "learning_rate": 3.864618684828135e-06,
      "loss": 3.645,
      "step": 174
    },
    {
      "epoch": 1.1747899159663866,
      "grad_norm": 4.15625,
      "learning_rate": 3.8115705383691354e-06,
      "loss": 3.6461,
      "step": 175
    },
    {
      "epoch": 1.1815126050420168,
      "grad_norm": 4.03125,
      "learning_rate": 3.7586638031314182e-06,
      "loss": 3.71,
      "step": 176
    },
    {
      "epoch": 1.188235294117647,
      "grad_norm": 8.6875,
      "learning_rate": 3.705904774487396e-06,
      "loss": 3.8202,
      "step": 177
    },
    {
      "epoch": 1.1949579831932773,
      "grad_norm": 4.09375,
      "learning_rate": 3.6532997302338704e-06,
      "loss": 3.7077,
      "step": 178
    },
    {
      "epoch": 1.2016806722689075,
      "grad_norm": 4.78125,
      "learning_rate": 3.6008549298450403e-06,
      "loss": 3.7005,
      "step": 179
    },
    {
      "epoch": 1.2084033613445377,
      "grad_norm": 4.09375,
      "learning_rate": 3.5485766137276894e-06,
      "loss": 3.7631,
      "step": 180
    },
    {
      "epoch": 1.2084033613445377,
      "eval_loss": 3.7338194847106934,
      "eval_runtime": 29.8219,
      "eval_samples_per_second": 42.217,
      "eval_steps_per_second": 5.298,
      "step": 180
    },
    {
      "epoch": 1.2151260504201682,
      "grad_norm": 4.03125,
      "learning_rate": 3.4964710024786354e-06,
      "loss": 3.6634,
      "step": 181
    },
    {
      "epoch": 1.2218487394957984,
      "grad_norm": 4.5625,
      "learning_rate": 3.444544296144546e-06,
      "loss": 3.747,
      "step": 182
    },
    {
      "epoch": 1.2285714285714286,
      "grad_norm": 4.9375,
      "learning_rate": 3.3928026734841935e-06,
      "loss": 3.6196,
      "step": 183
    },
    {
      "epoch": 1.2352941176470589,
      "grad_norm": 3.90625,
      "learning_rate": 3.341252291233241e-06,
      "loss": 3.6693,
      "step": 184
    },
    {
      "epoch": 1.242016806722689,
      "grad_norm": 4.21875,
      "learning_rate": 3.289899283371657e-06,
      "loss": 3.7271,
      "step": 185
    },
    {
      "epoch": 1.2487394957983193,
      "grad_norm": 4.21875,
      "learning_rate": 3.2387497603938327e-06,
      "loss": 3.678,
      "step": 186
    },
    {
      "epoch": 1.2554621848739496,
      "grad_norm": 4.0625,
      "learning_rate": 3.1878098085814926e-06,
      "loss": 3.702,
      "step": 187
    },
    {
      "epoch": 1.2621848739495798,
      "grad_norm": 4.09375,
      "learning_rate": 3.1370854892794855e-06,
      "loss": 3.7787,
      "step": 188
    },
    {
      "epoch": 1.26890756302521,
      "grad_norm": 4.4375,
      "learning_rate": 3.0865828381745515e-06,
      "loss": 3.6845,
      "step": 189
    },
    {
      "epoch": 1.2756302521008402,
      "grad_norm": 4.125,
      "learning_rate": 3.0363078645771303e-06,
      "loss": 3.6905,
      "step": 190
    },
    {
      "epoch": 1.2823529411764705,
      "grad_norm": 4.28125,
      "learning_rate": 2.986266550706315e-06,
      "loss": 3.6823,
      "step": 191
    },
    {
      "epoch": 1.289075630252101,
      "grad_norm": 4.6875,
      "learning_rate": 2.936464850978027e-06,
      "loss": 3.7313,
      "step": 192
    },
    {
      "epoch": 1.2957983193277312,
      "grad_norm": 4.09375,
      "learning_rate": 2.886908691296504e-06,
      "loss": 3.7439,
      "step": 193
    },
    {
      "epoch": 1.3025210084033614,
      "grad_norm": 3.828125,
      "learning_rate": 2.8376039683491683e-06,
      "loss": 3.7323,
      "step": 194
    },
    {
      "epoch": 1.3092436974789916,
      "grad_norm": 4.0625,
      "learning_rate": 2.7885565489049948e-06,
      "loss": 3.7535,
      "step": 195
    },
    {
      "epoch": 1.3092436974789916,
      "eval_loss": 3.734619379043579,
      "eval_runtime": 30.2166,
      "eval_samples_per_second": 41.666,
      "eval_steps_per_second": 5.229,
      "step": 195
    },
    {
      "epoch": 1.3159663865546218,
      "grad_norm": 4.125,
      "learning_rate": 2.739772269116402e-06,
      "loss": 3.6891,
      "step": 196
    },
    {
      "epoch": 1.322689075630252,
      "grad_norm": 4.34375,
      "learning_rate": 2.6912569338248317e-06,
      "loss": 3.7449,
      "step": 197
    },
    {
      "epoch": 1.3294117647058823,
      "grad_norm": 3.96875,
      "learning_rate": 2.6430163158700116e-06,
      "loss": 3.6608,
      "step": 198
    },
    {
      "epoch": 1.3361344537815127,
      "grad_norm": 4.75,
      "learning_rate": 2.595056155403063e-06,
      "loss": 3.7449,
      "step": 199
    },
    {
      "epoch": 1.342857142857143,
      "grad_norm": 4.0,
      "learning_rate": 2.5473821592034604e-06,
      "loss": 3.7139,
      "step": 200
    },
    {
      "epoch": 1.3495798319327732,
      "grad_norm": 4.59375,
      "learning_rate": 2.5000000000000015e-06,
      "loss": 3.7823,
      "step": 201
    },
    {
      "epoch": 1.3563025210084034,
      "grad_norm": 4.21875,
      "learning_rate": 2.4529153157957913e-06,
      "loss": 3.7754,
      "step": 202
    },
    {
      "epoch": 1.3630252100840337,
      "grad_norm": 4.25,
      "learning_rate": 2.406133709197392e-06,
      "loss": 3.7373,
      "step": 203
    },
    {
      "epoch": 1.3697478991596639,
      "grad_norm": 4.125,
      "learning_rate": 2.3596607467481602e-06,
      "loss": 3.7617,
      "step": 204
    },
    {
      "epoch": 1.3764705882352941,
      "grad_norm": 4.15625,
      "learning_rate": 2.3135019582658803e-06,
      "loss": 3.7332,
      "step": 205
    },
    {
      "epoch": 1.3831932773109243,
      "grad_norm": 4.15625,
      "learning_rate": 2.2676628361847834e-06,
      "loss": 3.639,
      "step": 206
    },
    {
      "epoch": 1.3899159663865546,
      "grad_norm": 4.25,
      "learning_rate": 2.2221488349019903e-06,
      "loss": 3.6918,
      "step": 207
    },
    {
      "epoch": 1.3966386554621848,
      "grad_norm": 4.0625,
      "learning_rate": 2.1769653701284983e-06,
      "loss": 3.622,
      "step": 208
    },
    {
      "epoch": 1.403361344537815,
      "grad_norm": 4.53125,
      "learning_rate": 2.132117818244771e-06,
      "loss": 3.7286,
      "step": 209
    },
    {
      "epoch": 1.4100840336134453,
      "grad_norm": 4.0625,
      "learning_rate": 2.08761151566099e-06,
      "loss": 3.668,
      "step": 210
    },
    {
      "epoch": 1.4100840336134453,
      "eval_loss": 3.7375030517578125,
      "eval_runtime": 29.7795,
      "eval_samples_per_second": 42.277,
      "eval_steps_per_second": 5.306,
      "step": 210
    },
    {
      "epoch": 1.4168067226890757,
      "grad_norm": 4.21875,
      "learning_rate": 2.0434517581820893e-06,
      "loss": 3.7376,
      "step": 211
    },
    {
      "epoch": 1.423529411764706,
      "grad_norm": 3.984375,
      "learning_rate": 1.999643800377596e-06,
      "loss": 3.7108,
      "step": 212
    },
    {
      "epoch": 1.4302521008403362,
      "grad_norm": 3.734375,
      "learning_rate": 1.956192854956397e-06,
      "loss": 3.6391,
      "step": 213
    },
    {
      "epoch": 1.4369747899159664,
      "grad_norm": 4.0,
      "learning_rate": 1.913104092146476e-06,
      "loss": 3.6956,
      "step": 214
    },
    {
      "epoch": 1.4436974789915966,
      "grad_norm": 4.375,
      "learning_rate": 1.8703826390797047e-06,
      "loss": 3.6241,
      "step": 215
    },
    {
      "epoch": 1.4504201680672268,
      "grad_norm": 4.125,
      "learning_rate": 1.8280335791817733e-06,
      "loss": 3.7801,
      "step": 216
    },
    {
      "epoch": 1.457142857142857,
      "grad_norm": 4.0,
      "learning_rate": 1.7860619515673034e-06,
      "loss": 3.6977,
      "step": 217
    },
    {
      "epoch": 1.4638655462184875,
      "grad_norm": 4.71875,
      "learning_rate": 1.7444727504402554e-06,
      "loss": 3.6897,
      "step": 218
    },
    {
      "epoch": 1.4705882352941178,
      "grad_norm": 4.65625,
      "learning_rate": 1.7032709244996559e-06,
      "loss": 3.6878,
      "step": 219
    },
    {
      "epoch": 1.477310924369748,
      "grad_norm": 5.21875,
      "learning_rate": 1.662461376350764e-06,
      "loss": 3.7517,
      "step": 220
    },
    {
      "epoch": 1.4840336134453782,
      "grad_norm": 4.53125,
      "learning_rate": 1.6220489619216988e-06,
      "loss": 3.7621,
      "step": 221
    },
    {
      "epoch": 1.4907563025210084,
      "grad_norm": 3.984375,
      "learning_rate": 1.5820384898856433e-06,
      "loss": 3.7284,
      "step": 222
    },
    {
      "epoch": 1.4974789915966387,
      "grad_norm": 4.3125,
      "learning_rate": 1.5424347210886538e-06,
      "loss": 3.6888,
      "step": 223
    },
    {
      "epoch": 1.504201680672269,
      "grad_norm": 4.0625,
      "learning_rate": 1.5032423679831642e-06,
      "loss": 3.705,
      "step": 224
    },
    {
      "epoch": 1.5109243697478991,
      "grad_norm": 3.765625,
      "learning_rate": 1.4644660940672628e-06,
      "loss": 3.679,
      "step": 225
    },
    {
      "epoch": 1.5109243697478991,
      "eval_loss": 3.7383294105529785,
      "eval_runtime": 30.258,
      "eval_samples_per_second": 41.609,
      "eval_steps_per_second": 5.222,
      "step": 225
    },
    {
      "epoch": 1.5176470588235293,
      "grad_norm": 4.0625,
      "learning_rate": 1.4261105133297693e-06,
      "loss": 3.6644,
      "step": 226
    },
    {
      "epoch": 1.5243697478991596,
      "grad_norm": 4.28125,
      "learning_rate": 1.3881801897012225e-06,
      "loss": 3.6869,
      "step": 227
    },
    {
      "epoch": 1.5310924369747898,
      "grad_norm": 4.4375,
      "learning_rate": 1.3506796365108232e-06,
      "loss": 3.6292,
      "step": 228
    },
    {
      "epoch": 1.53781512605042,
      "grad_norm": 4.09375,
      "learning_rate": 1.3136133159493803e-06,
      "loss": 3.6962,
      "step": 229
    },
    {
      "epoch": 1.5445378151260503,
      "grad_norm": 4.125,
      "learning_rate": 1.2769856385383689e-06,
      "loss": 3.8197,
      "step": 230
    },
    {
      "epoch": 1.5512605042016807,
      "grad_norm": 4.21875,
      "learning_rate": 1.2408009626051137e-06,
      "loss": 3.7204,
      "step": 231
    },
    {
      "epoch": 1.557983193277311,
      "grad_norm": 4.28125,
      "learning_rate": 1.2050635937641909e-06,
      "loss": 3.7022,
      "step": 232
    },
    {
      "epoch": 1.5647058823529412,
      "grad_norm": 5.125,
      "learning_rate": 1.1697777844051105e-06,
      "loss": 3.6865,
      "step": 233
    },
    {
      "epoch": 1.5714285714285714,
      "grad_norm": 4.0625,
      "learning_rate": 1.134947733186315e-06,
      "loss": 3.6203,
      "step": 234
    },
    {
      "epoch": 1.5781512605042018,
      "grad_norm": 4.34375,
      "learning_rate": 1.100577584535592e-06,
      "loss": 3.7241,
      "step": 235
    },
    {
      "epoch": 1.584873949579832,
      "grad_norm": 3.84375,
      "learning_rate": 1.0666714281569152e-06,
      "loss": 3.5546,
      "step": 236
    },
    {
      "epoch": 1.5915966386554623,
      "grad_norm": 4.03125,
      "learning_rate": 1.0332332985438248e-06,
      "loss": 3.7072,
      "step": 237
    },
    {
      "epoch": 1.5983193277310925,
      "grad_norm": 4.96875,
      "learning_rate": 1.0002671744993519e-06,
      "loss": 3.8113,
      "step": 238
    },
    {
      "epoch": 1.6050420168067228,
      "grad_norm": 3.953125,
      "learning_rate": 9.677769786625869e-07,
      "loss": 3.7273,
      "step": 239
    },
    {
      "epoch": 1.611764705882353,
      "grad_norm": 4.3125,
      "learning_rate": 9.357665770419244e-07,
      "loss": 3.6539,
      "step": 240
    },
    {
      "epoch": 1.611764705882353,
      "eval_loss": 3.7385716438293457,
      "eval_runtime": 29.7733,
      "eval_samples_per_second": 42.286,
      "eval_steps_per_second": 5.307,
      "step": 240
    },
    {
      "epoch": 1.6184873949579832,
      "grad_norm": 4.0,
      "learning_rate": 9.042397785550405e-07,
      "loss": 3.6896,
      "step": 241
    },
    {
      "epoch": 1.6252100840336134,
      "grad_norm": 4.0,
      "learning_rate": 8.732003345756812e-07,
      "loss": 3.6619,
      "step": 242
    },
    {
      "epoch": 1.6319327731092437,
      "grad_norm": 4.09375,
      "learning_rate": 8.426519384872733e-07,
      "loss": 3.7638,
      "step": 243
    },
    {
      "epoch": 1.638655462184874,
      "grad_norm": 3.703125,
      "learning_rate": 8.125982252434611e-07,
      "loss": 3.685,
      "step": 244
    },
    {
      "epoch": 1.6453781512605041,
      "grad_norm": 4.21875,
      "learning_rate": 7.830427709355726e-07,
      "loss": 3.6325,
      "step": 245
    },
    {
      "epoch": 1.6521008403361344,
      "grad_norm": 4.28125,
      "learning_rate": 7.539890923671061e-07,
      "loss": 3.7295,
      "step": 246
    },
    {
      "epoch": 1.6588235294117646,
      "grad_norm": 4.09375,
      "learning_rate": 7.254406466352682e-07,
      "loss": 3.6432,
      "step": 247
    },
    {
      "epoch": 1.6655462184873948,
      "grad_norm": 5.09375,
      "learning_rate": 6.974008307196057e-07,
      "loss": 3.729,
      "step": 248
    },
    {
      "epoch": 1.6722689075630253,
      "grad_norm": 4.09375,
      "learning_rate": 6.698729810778065e-07,
      "loss": 3.6502,
      "step": 249
    },
    {
      "epoch": 1.6789915966386555,
      "grad_norm": 3.96875,
      "learning_rate": 6.428603732486938e-07,
      "loss": 3.6288,
      "step": 250
    },
    {
      "epoch": 1.6857142857142857,
      "grad_norm": 3.84375,
      "learning_rate": 6.163662214624616e-07,
      "loss": 3.6903,
      "step": 251
    },
    {
      "epoch": 1.692436974789916,
      "grad_norm": 4.34375,
      "learning_rate": 5.903936782582253e-07,
      "loss": 3.7859,
      "step": 252
    },
    {
      "epoch": 1.6991596638655462,
      "grad_norm": 4.125,
      "learning_rate": 5.649458341088915e-07,
      "loss": 3.7541,
      "step": 253
    },
    {
      "epoch": 1.7058823529411766,
      "grad_norm": 4.0625,
      "learning_rate": 5.400257170534296e-07,
      "loss": 3.7466,
      "step": 254
    },
    {
      "epoch": 1.7126050420168069,
      "grad_norm": 4.125,
      "learning_rate": 5.156362923365587e-07,
      "loss": 3.6547,
      "step": 255
    },
    {
      "epoch": 1.7126050420168069,
      "eval_loss": 3.738647222518921,
      "eval_runtime": 30.241,
      "eval_samples_per_second": 41.632,
      "eval_steps_per_second": 5.225,
      "step": 255
    },
    {
      "epoch": 1.719327731092437,
      "grad_norm": 3.78125,
      "learning_rate": 4.917804620559202e-07,
      "loss": 3.6395,
      "step": 256
    },
    {
      "epoch": 1.7260504201680673,
      "grad_norm": 4.25,
      "learning_rate": 4.6846106481675035e-07,
      "loss": 3.7057,
      "step": 257
    },
    {
      "epoch": 1.7327731092436975,
      "grad_norm": 4.125,
      "learning_rate": 4.456808753941205e-07,
      "loss": 3.7292,
      "step": 258
    },
    {
      "epoch": 1.7394957983193278,
      "grad_norm": 4.3125,
      "learning_rate": 4.2344260440276455e-07,
      "loss": 3.7007,
      "step": 259
    },
    {
      "epoch": 1.746218487394958,
      "grad_norm": 4.09375,
      "learning_rate": 4.0174889797453875e-07,
      "loss": 3.744,
      "step": 260
    },
    {
      "epoch": 1.7529411764705882,
      "grad_norm": 4.3125,
      "learning_rate": 3.8060233744356634e-07,
      "loss": 3.8662,
      "step": 261
    },
    {
      "epoch": 1.7596638655462185,
      "grad_norm": 4.3125,
      "learning_rate": 3.600054390390778e-07,
      "loss": 3.8242,
      "step": 262
    },
    {
      "epoch": 1.7663865546218487,
      "grad_norm": 3.921875,
      "learning_rate": 3.399606535860078e-07,
      "loss": 3.6502,
      "step": 263
    },
    {
      "epoch": 1.773109243697479,
      "grad_norm": 3.9375,
      "learning_rate": 3.204703662133724e-07,
      "loss": 3.6803,
      "step": 264
    },
    {
      "epoch": 1.7798319327731091,
      "grad_norm": 4.90625,
      "learning_rate": 3.015368960704584e-07,
      "loss": 3.7613,
      "step": 265
    },
    {
      "epoch": 1.7865546218487394,
      "grad_norm": 4.3125,
      "learning_rate": 2.8316249605087386e-07,
      "loss": 3.7316,
      "step": 266
    },
    {
      "epoch": 1.7932773109243696,
      "grad_norm": 4.125,
      "learning_rate": 2.653493525244721e-07,
      "loss": 3.6491,
      "step": 267
    },
    {
      "epoch": 1.8,
      "grad_norm": 4.15625,
      "learning_rate": 2.4809958507719444e-07,
      "loss": 3.8626,
      "step": 268
    },
    {
      "epoch": 1.8067226890756303,
      "grad_norm": 3.9375,
      "learning_rate": 2.314152462588659e-07,
      "loss": 3.7007,
      "step": 269
    },
    {
      "epoch": 1.8134453781512605,
      "grad_norm": 3.921875,
      "learning_rate": 2.152983213389559e-07,
      "loss": 3.7533,
      "step": 270
    },
    {
      "epoch": 1.8134453781512605,
      "eval_loss": 3.7400190830230713,
      "eval_runtime": 29.7815,
      "eval_samples_per_second": 42.275,
      "eval_steps_per_second": 5.305,
      "step": 270
    },
    {
      "epoch": 1.8201680672268907,
      "grad_norm": 4.15625,
      "learning_rate": 1.99750728070357e-07,
      "loss": 3.6811,
      "step": 271
    },
    {
      "epoch": 1.826890756302521,
      "grad_norm": 3.921875,
      "learning_rate": 1.8477431646118648e-07,
      "loss": 3.6697,
      "step": 272
    },
    {
      "epoch": 1.8336134453781514,
      "grad_norm": 4.375,
      "learning_rate": 1.7037086855465902e-07,
      "loss": 3.7755,
      "step": 273
    },
    {
      "epoch": 1.8403361344537816,
      "grad_norm": 4.34375,
      "learning_rate": 1.5654209821703458e-07,
      "loss": 3.7415,
      "step": 274
    },
    {
      "epoch": 1.8470588235294119,
      "grad_norm": 4.03125,
      "learning_rate": 1.4328965093369284e-07,
      "loss": 3.7171,
      "step": 275
    },
    {
      "epoch": 1.853781512605042,
      "grad_norm": 4.28125,
      "learning_rate": 1.3061510361333186e-07,
      "loss": 3.692,
      "step": 276
    },
    {
      "epoch": 1.8605042016806723,
      "grad_norm": 3.9375,
      "learning_rate": 1.185199644003332e-07,
      "loss": 3.7456,
      "step": 277
    },
    {
      "epoch": 1.8672268907563025,
      "grad_norm": 3.875,
      "learning_rate": 1.0700567249530835e-07,
      "loss": 3.6095,
      "step": 278
    },
    {
      "epoch": 1.8739495798319328,
      "grad_norm": 3.90625,
      "learning_rate": 9.607359798384785e-08,
      "loss": 3.759,
      "step": 279
    },
    {
      "epoch": 1.880672268907563,
      "grad_norm": 3.984375,
      "learning_rate": 8.572504167349449e-08,
      "loss": 3.776,
      "step": 280
    },
    {
      "epoch": 1.8873949579831932,
      "grad_norm": 4.0625,
      "learning_rate": 7.59612349389599e-08,
      "loss": 3.7251,
      "step": 281
    },
    {
      "epoch": 1.8941176470588235,
      "grad_norm": 4.0625,
      "learning_rate": 6.678333957560513e-08,
      "loss": 3.6457,
      "step": 282
    },
    {
      "epoch": 1.9008403361344537,
      "grad_norm": 3.984375,
      "learning_rate": 5.8192447661196694e-08,
      "loss": 3.7916,
      "step": 283
    },
    {
      "epoch": 1.907563025210084,
      "grad_norm": 3.84375,
      "learning_rate": 5.0189581425960644e-08,
      "loss": 3.6759,
      "step": 284
    },
    {
      "epoch": 1.9142857142857141,
      "grad_norm": 4.5625,
      "learning_rate": 4.2775693130948094e-08,
      "loss": 3.6983,
      "step": 285
    },
    {
      "epoch": 1.9142857142857141,
      "eval_loss": 3.7386996746063232,
      "eval_runtime": 30.2618,
      "eval_samples_per_second": 41.604,
      "eval_steps_per_second": 5.221,
      "step": 285
    },
    {
      "epoch": 1.9210084033613444,
      "grad_norm": 4.59375,
      "learning_rate": 3.59516649547248e-08,
      "loss": 3.7151,
      "step": 286
    },
    {
      "epoch": 1.9277310924369748,
      "grad_norm": 4.0,
      "learning_rate": 2.971830888840177e-08,
      "loss": 3.6223,
      "step": 287
    },
    {
      "epoch": 1.934453781512605,
      "grad_norm": 4.03125,
      "learning_rate": 2.4076366639015914e-08,
      "loss": 3.6781,
      "step": 288
    },
    {
      "epoch": 1.9411764705882353,
      "grad_norm": 4.21875,
      "learning_rate": 1.9026509541272276e-08,
      "loss": 3.7715,
      "step": 289
    },
    {
      "epoch": 1.9478991596638655,
      "grad_norm": 4.0,
      "learning_rate": 1.4569338477666838e-08,
      "loss": 3.6257,
      "step": 290
    },
    {
      "epoch": 1.954621848739496,
      "grad_norm": 4.125,
      "learning_rate": 1.0705383806982606e-08,
      "loss": 3.7141,
      "step": 291
    },
    {
      "epoch": 1.9613445378151262,
      "grad_norm": 4.34375,
      "learning_rate": 7.43510530118452e-09,
      "loss": 3.6997,
      "step": 292
    },
    {
      "epoch": 1.9680672268907564,
      "grad_norm": 4.21875,
      "learning_rate": 4.758892090711009e-09,
      "loss": 3.7004,
      "step": 293
    },
    {
      "epoch": 1.9747899159663866,
      "grad_norm": 6.25,
      "learning_rate": 2.6770626181715776e-09,
      "loss": 3.7454,
      "step": 294
    },
    {
      "epoch": 1.9815126050420169,
      "grad_norm": 4.5625,
      "learning_rate": 1.189864600454338e-09,
      "loss": 3.7919,
      "step": 295
    },
    {
      "epoch": 1.988235294117647,
      "grad_norm": 4.1875,
      "learning_rate": 2.974749992512571e-10,
      "loss": 3.8223,
      "step": 296
    }
  ],
  "logging_steps": 1,
  "max_steps": 296,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 15,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9.333328305140531e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}