| { |
| "best_global_step": 1260, |
| "best_metric": 0.10533556342124939, |
| "best_model_checkpoint": "saves_stability/prefix-tuning/llama-3-8b-instruct/train_svamp_1757340245/checkpoint-1260", |
| "epoch": 20.0, |
| "eval_steps": 315, |
| "global_step": 6300, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.015873015873015872, |
| "grad_norm": 89.08386993408203, |
| "learning_rate": 3.174603174603175e-07, |
| "loss": 5.7094, |
| "num_input_tokens_seen": 1056, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.031746031746031744, |
| "grad_norm": 83.91914367675781, |
| "learning_rate": 7.142857142857143e-07, |
| "loss": 5.3977, |
| "num_input_tokens_seen": 2112, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.047619047619047616, |
| "grad_norm": 61.71086883544922, |
| "learning_rate": 1.1111111111111112e-06, |
| "loss": 5.1387, |
| "num_input_tokens_seen": 3152, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.06349206349206349, |
| "grad_norm": 53.70893096923828, |
| "learning_rate": 1.507936507936508e-06, |
| "loss": 4.76, |
| "num_input_tokens_seen": 4272, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.07936507936507936, |
| "grad_norm": 62.33582305908203, |
| "learning_rate": 1.9047619047619051e-06, |
| "loss": 4.1702, |
| "num_input_tokens_seen": 5296, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.09523809523809523, |
| "grad_norm": 39.57314682006836, |
| "learning_rate": 2.301587301587302e-06, |
| "loss": 3.7174, |
| "num_input_tokens_seen": 6352, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.1111111111111111, |
| "grad_norm": 44.72786331176758, |
| "learning_rate": 2.6984126984126986e-06, |
| "loss": 3.4702, |
| "num_input_tokens_seen": 7472, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.12698412698412698, |
| "grad_norm": 44.22084426879883, |
| "learning_rate": 3.0952380952380953e-06, |
| "loss": 3.1998, |
| "num_input_tokens_seen": 8544, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.14285714285714285, |
| "grad_norm": 40.643310546875, |
| "learning_rate": 3.4920634920634924e-06, |
| "loss": 2.862, |
| "num_input_tokens_seen": 9648, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.15873015873015872, |
| "grad_norm": 47.437110900878906, |
| "learning_rate": 3.888888888888889e-06, |
| "loss": 2.2716, |
| "num_input_tokens_seen": 10720, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.1746031746031746, |
| "grad_norm": 50.03641128540039, |
| "learning_rate": 4.285714285714286e-06, |
| "loss": 2.1866, |
| "num_input_tokens_seen": 11792, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.19047619047619047, |
| "grad_norm": 36.45273971557617, |
| "learning_rate": 4.682539682539683e-06, |
| "loss": 1.8074, |
| "num_input_tokens_seen": 12864, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.20634920634920634, |
| "grad_norm": 47.23276901245117, |
| "learning_rate": 5.07936507936508e-06, |
| "loss": 1.5586, |
| "num_input_tokens_seen": 13920, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.2222222222222222, |
| "grad_norm": 25.937341690063477, |
| "learning_rate": 5.4761904761904765e-06, |
| "loss": 1.4778, |
| "num_input_tokens_seen": 15072, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.23809523809523808, |
| "grad_norm": 39.08681106567383, |
| "learning_rate": 5.873015873015873e-06, |
| "loss": 1.2204, |
| "num_input_tokens_seen": 16064, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.25396825396825395, |
| "grad_norm": 44.03461837768555, |
| "learning_rate": 6.26984126984127e-06, |
| "loss": 1.0868, |
| "num_input_tokens_seen": 17216, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.2698412698412698, |
| "grad_norm": 34.26078796386719, |
| "learning_rate": 6.666666666666667e-06, |
| "loss": 1.0351, |
| "num_input_tokens_seen": 18272, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.2857142857142857, |
| "grad_norm": 49.58864974975586, |
| "learning_rate": 7.063492063492063e-06, |
| "loss": 0.8208, |
| "num_input_tokens_seen": 19312, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.30158730158730157, |
| "grad_norm": 44.86882400512695, |
| "learning_rate": 7.460317460317461e-06, |
| "loss": 0.8872, |
| "num_input_tokens_seen": 20384, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.31746031746031744, |
| "grad_norm": 37.76105880737305, |
| "learning_rate": 7.857142857142858e-06, |
| "loss": 0.92, |
| "num_input_tokens_seen": 21424, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.3333333333333333, |
| "grad_norm": 28.7552490234375, |
| "learning_rate": 8.253968253968254e-06, |
| "loss": 0.7919, |
| "num_input_tokens_seen": 22480, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.3492063492063492, |
| "grad_norm": 47.71187973022461, |
| "learning_rate": 8.650793650793651e-06, |
| "loss": 0.8995, |
| "num_input_tokens_seen": 23568, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.36507936507936506, |
| "grad_norm": 44.39586639404297, |
| "learning_rate": 9.047619047619047e-06, |
| "loss": 0.714, |
| "num_input_tokens_seen": 24672, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.38095238095238093, |
| "grad_norm": 26.836511611938477, |
| "learning_rate": 9.444444444444445e-06, |
| "loss": 0.6963, |
| "num_input_tokens_seen": 25776, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.3968253968253968, |
| "grad_norm": 32.43899917602539, |
| "learning_rate": 9.841269841269842e-06, |
| "loss": 0.8225, |
| "num_input_tokens_seen": 26912, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.4126984126984127, |
| "grad_norm": 29.625259399414062, |
| "learning_rate": 1.0238095238095238e-05, |
| "loss": 0.7967, |
| "num_input_tokens_seen": 27936, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.42857142857142855, |
| "grad_norm": 51.56465530395508, |
| "learning_rate": 1.0634920634920636e-05, |
| "loss": 0.6416, |
| "num_input_tokens_seen": 29024, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.4444444444444444, |
| "grad_norm": 36.9901123046875, |
| "learning_rate": 1.1031746031746031e-05, |
| "loss": 0.6575, |
| "num_input_tokens_seen": 30128, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.4603174603174603, |
| "grad_norm": 32.11468505859375, |
| "learning_rate": 1.1428571428571429e-05, |
| "loss": 0.8359, |
| "num_input_tokens_seen": 31168, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.47619047619047616, |
| "grad_norm": 17.278911590576172, |
| "learning_rate": 1.1825396825396825e-05, |
| "loss": 0.6762, |
| "num_input_tokens_seen": 32256, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.49206349206349204, |
| "grad_norm": 28.327159881591797, |
| "learning_rate": 1.2222222222222222e-05, |
| "loss": 0.5471, |
| "num_input_tokens_seen": 33296, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.5079365079365079, |
| "grad_norm": 22.584383010864258, |
| "learning_rate": 1.261904761904762e-05, |
| "loss": 0.5509, |
| "num_input_tokens_seen": 34368, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.5238095238095238, |
| "grad_norm": 33.20157241821289, |
| "learning_rate": 1.3015873015873018e-05, |
| "loss": 1.0047, |
| "num_input_tokens_seen": 35504, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.5396825396825397, |
| "grad_norm": 29.51561737060547, |
| "learning_rate": 1.3412698412698413e-05, |
| "loss": 0.6524, |
| "num_input_tokens_seen": 36512, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.5555555555555556, |
| "grad_norm": 21.06121826171875, |
| "learning_rate": 1.3809523809523811e-05, |
| "loss": 0.5967, |
| "num_input_tokens_seen": 37600, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.5714285714285714, |
| "grad_norm": 22.211185455322266, |
| "learning_rate": 1.4206349206349207e-05, |
| "loss": 0.5267, |
| "num_input_tokens_seen": 38656, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.5873015873015873, |
| "grad_norm": 23.984838485717773, |
| "learning_rate": 1.4603174603174605e-05, |
| "loss": 0.4941, |
| "num_input_tokens_seen": 39712, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.6031746031746031, |
| "grad_norm": 31.058792114257812, |
| "learning_rate": 1.5e-05, |
| "loss": 0.7086, |
| "num_input_tokens_seen": 40784, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.6190476190476191, |
| "grad_norm": 21.773157119750977, |
| "learning_rate": 1.5396825396825398e-05, |
| "loss": 0.6814, |
| "num_input_tokens_seen": 41840, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.6349206349206349, |
| "grad_norm": 13.91670036315918, |
| "learning_rate": 1.5793650793650794e-05, |
| "loss": 0.6264, |
| "num_input_tokens_seen": 42880, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.6507936507936508, |
| "grad_norm": 24.21043586730957, |
| "learning_rate": 1.6190476190476193e-05, |
| "loss": 0.7654, |
| "num_input_tokens_seen": 44000, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.6666666666666666, |
| "grad_norm": 23.567092895507812, |
| "learning_rate": 1.658730158730159e-05, |
| "loss": 0.6436, |
| "num_input_tokens_seen": 45008, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.6825396825396826, |
| "grad_norm": 30.81178092956543, |
| "learning_rate": 1.6984126984126985e-05, |
| "loss": 0.7867, |
| "num_input_tokens_seen": 46064, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.6984126984126984, |
| "grad_norm": 21.90506935119629, |
| "learning_rate": 1.738095238095238e-05, |
| "loss": 0.7717, |
| "num_input_tokens_seen": 47152, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.7142857142857143, |
| "grad_norm": 39.183982849121094, |
| "learning_rate": 1.777777777777778e-05, |
| "loss": 0.9016, |
| "num_input_tokens_seen": 48176, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.7301587301587301, |
| "grad_norm": 12.529756546020508, |
| "learning_rate": 1.8174603174603176e-05, |
| "loss": 0.591, |
| "num_input_tokens_seen": 49232, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.746031746031746, |
| "grad_norm": 16.73536491394043, |
| "learning_rate": 1.8571428571428572e-05, |
| "loss": 0.7902, |
| "num_input_tokens_seen": 50288, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.7619047619047619, |
| "grad_norm": 12.837594985961914, |
| "learning_rate": 1.8968253968253968e-05, |
| "loss": 0.706, |
| "num_input_tokens_seen": 51376, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.7777777777777778, |
| "grad_norm": 31.631135940551758, |
| "learning_rate": 1.9365079365079367e-05, |
| "loss": 0.7042, |
| "num_input_tokens_seen": 52448, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.7936507936507936, |
| "grad_norm": 10.814046859741211, |
| "learning_rate": 1.9761904761904763e-05, |
| "loss": 0.6963, |
| "num_input_tokens_seen": 53520, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.8095238095238095, |
| "grad_norm": 16.586753845214844, |
| "learning_rate": 2.015873015873016e-05, |
| "loss": 1.092, |
| "num_input_tokens_seen": 54608, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.8253968253968254, |
| "grad_norm": 7.401493549346924, |
| "learning_rate": 2.0555555555555555e-05, |
| "loss": 0.582, |
| "num_input_tokens_seen": 55664, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.8412698412698413, |
| "grad_norm": 8.16193962097168, |
| "learning_rate": 2.0952380952380954e-05, |
| "loss": 0.6384, |
| "num_input_tokens_seen": 56832, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.8571428571428571, |
| "grad_norm": 14.588497161865234, |
| "learning_rate": 2.134920634920635e-05, |
| "loss": 0.5567, |
| "num_input_tokens_seen": 57968, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.873015873015873, |
| "grad_norm": 13.264519691467285, |
| "learning_rate": 2.174603174603175e-05, |
| "loss": 0.603, |
| "num_input_tokens_seen": 59072, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.8888888888888888, |
| "grad_norm": 19.701818466186523, |
| "learning_rate": 2.214285714285714e-05, |
| "loss": 0.7061, |
| "num_input_tokens_seen": 60112, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.9047619047619048, |
| "grad_norm": 15.928413391113281, |
| "learning_rate": 2.253968253968254e-05, |
| "loss": 0.5645, |
| "num_input_tokens_seen": 61184, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.9206349206349206, |
| "grad_norm": 20.14051055908203, |
| "learning_rate": 2.2936507936507937e-05, |
| "loss": 0.6211, |
| "num_input_tokens_seen": 62240, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.9365079365079365, |
| "grad_norm": 28.218061447143555, |
| "learning_rate": 2.3333333333333336e-05, |
| "loss": 0.6783, |
| "num_input_tokens_seen": 63312, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.9523809523809523, |
| "grad_norm": 15.398136138916016, |
| "learning_rate": 2.373015873015873e-05, |
| "loss": 0.6398, |
| "num_input_tokens_seen": 64384, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.9682539682539683, |
| "grad_norm": 6.9146246910095215, |
| "learning_rate": 2.4126984126984128e-05, |
| "loss": 0.6358, |
| "num_input_tokens_seen": 65392, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.9841269841269841, |
| "grad_norm": 6.8382062911987305, |
| "learning_rate": 2.4523809523809523e-05, |
| "loss": 0.5311, |
| "num_input_tokens_seen": 66448, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 10.719470024108887, |
| "learning_rate": 2.4920634920634923e-05, |
| "loss": 0.6612, |
| "num_input_tokens_seen": 67504, |
| "step": 315 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.6388102769851685, |
| "eval_runtime": 1.455, |
| "eval_samples_per_second": 48.109, |
| "eval_steps_per_second": 24.054, |
| "num_input_tokens_seen": 67504, |
| "step": 315 |
| }, |
| { |
| "epoch": 1.0158730158730158, |
| "grad_norm": 11.29240608215332, |
| "learning_rate": 2.531746031746032e-05, |
| "loss": 0.6203, |
| "num_input_tokens_seen": 68624, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.0317460317460316, |
| "grad_norm": 10.736639022827148, |
| "learning_rate": 2.5714285714285714e-05, |
| "loss": 0.5772, |
| "num_input_tokens_seen": 69632, |
| "step": 325 |
| }, |
| { |
| "epoch": 1.0476190476190477, |
| "grad_norm": 14.154836654663086, |
| "learning_rate": 2.6111111111111114e-05, |
| "loss": 0.5529, |
| "num_input_tokens_seen": 70736, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.0634920634920635, |
| "grad_norm": 10.680912971496582, |
| "learning_rate": 2.650793650793651e-05, |
| "loss": 0.3824, |
| "num_input_tokens_seen": 71760, |
| "step": 335 |
| }, |
| { |
| "epoch": 1.0793650793650793, |
| "grad_norm": 41.33735656738281, |
| "learning_rate": 2.6904761904761905e-05, |
| "loss": 0.9206, |
| "num_input_tokens_seen": 72880, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.0952380952380953, |
| "grad_norm": 7.261141777038574, |
| "learning_rate": 2.73015873015873e-05, |
| "loss": 0.6019, |
| "num_input_tokens_seen": 73952, |
| "step": 345 |
| }, |
| { |
| "epoch": 1.1111111111111112, |
| "grad_norm": 6.877208709716797, |
| "learning_rate": 2.76984126984127e-05, |
| "loss": 0.692, |
| "num_input_tokens_seen": 75072, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.126984126984127, |
| "grad_norm": 6.756972312927246, |
| "learning_rate": 2.8095238095238096e-05, |
| "loss": 0.7479, |
| "num_input_tokens_seen": 76144, |
| "step": 355 |
| }, |
| { |
| "epoch": 1.1428571428571428, |
| "grad_norm": 11.503968238830566, |
| "learning_rate": 2.8492063492063492e-05, |
| "loss": 0.6113, |
| "num_input_tokens_seen": 77248, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.1587301587301586, |
| "grad_norm": 7.765186309814453, |
| "learning_rate": 2.8888888888888888e-05, |
| "loss": 0.5137, |
| "num_input_tokens_seen": 78336, |
| "step": 365 |
| }, |
| { |
| "epoch": 1.1746031746031746, |
| "grad_norm": 15.50655460357666, |
| "learning_rate": 2.9285714285714288e-05, |
| "loss": 0.5522, |
| "num_input_tokens_seen": 79392, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.1904761904761905, |
| "grad_norm": 4.964603900909424, |
| "learning_rate": 2.9682539682539683e-05, |
| "loss": 0.4755, |
| "num_input_tokens_seen": 80480, |
| "step": 375 |
| }, |
| { |
| "epoch": 1.2063492063492063, |
| "grad_norm": 11.66644287109375, |
| "learning_rate": 3.007936507936508e-05, |
| "loss": 0.9369, |
| "num_input_tokens_seen": 81568, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.2222222222222223, |
| "grad_norm": 5.162360191345215, |
| "learning_rate": 3.0476190476190482e-05, |
| "loss": 0.3816, |
| "num_input_tokens_seen": 82608, |
| "step": 385 |
| }, |
| { |
| "epoch": 1.2380952380952381, |
| "grad_norm": 13.151936531066895, |
| "learning_rate": 3.0873015873015874e-05, |
| "loss": 0.6825, |
| "num_input_tokens_seen": 83680, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.253968253968254, |
| "grad_norm": 10.89046573638916, |
| "learning_rate": 3.1269841269841274e-05, |
| "loss": 0.5983, |
| "num_input_tokens_seen": 84784, |
| "step": 395 |
| }, |
| { |
| "epoch": 1.2698412698412698, |
| "grad_norm": 11.605130195617676, |
| "learning_rate": 3.1666666666666666e-05, |
| "loss": 0.527, |
| "num_input_tokens_seen": 85824, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.2857142857142856, |
| "grad_norm": 7.945504665374756, |
| "learning_rate": 3.2063492063492065e-05, |
| "loss": 0.5015, |
| "num_input_tokens_seen": 86928, |
| "step": 405 |
| }, |
| { |
| "epoch": 1.3015873015873016, |
| "grad_norm": 11.201025009155273, |
| "learning_rate": 3.2460317460317465e-05, |
| "loss": 0.6972, |
| "num_input_tokens_seen": 88032, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.3174603174603174, |
| "grad_norm": 4.484563827514648, |
| "learning_rate": 3.285714285714286e-05, |
| "loss": 0.424, |
| "num_input_tokens_seen": 89088, |
| "step": 415 |
| }, |
| { |
| "epoch": 1.3333333333333333, |
| "grad_norm": 11.90958023071289, |
| "learning_rate": 3.3253968253968256e-05, |
| "loss": 0.4407, |
| "num_input_tokens_seen": 90128, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.3492063492063493, |
| "grad_norm": 9.70373249053955, |
| "learning_rate": 3.3650793650793656e-05, |
| "loss": 0.5097, |
| "num_input_tokens_seen": 91184, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.3650793650793651, |
| "grad_norm": 8.781391143798828, |
| "learning_rate": 3.404761904761905e-05, |
| "loss": 0.6233, |
| "num_input_tokens_seen": 92256, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.380952380952381, |
| "grad_norm": 5.017049312591553, |
| "learning_rate": 3.444444444444445e-05, |
| "loss": 0.4957, |
| "num_input_tokens_seen": 93312, |
| "step": 435 |
| }, |
| { |
| "epoch": 1.3968253968253967, |
| "grad_norm": 7.375868320465088, |
| "learning_rate": 3.484126984126984e-05, |
| "loss": 0.5737, |
| "num_input_tokens_seen": 94384, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.4126984126984126, |
| "grad_norm": 6.288750648498535, |
| "learning_rate": 3.523809523809524e-05, |
| "loss": 0.7232, |
| "num_input_tokens_seen": 95504, |
| "step": 445 |
| }, |
| { |
| "epoch": 1.4285714285714286, |
| "grad_norm": 4.126834869384766, |
| "learning_rate": 3.563492063492064e-05, |
| "loss": 0.5578, |
| "num_input_tokens_seen": 96608, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.4444444444444444, |
| "grad_norm": 3.2642154693603516, |
| "learning_rate": 3.603174603174603e-05, |
| "loss": 0.4916, |
| "num_input_tokens_seen": 97632, |
| "step": 455 |
| }, |
| { |
| "epoch": 1.4603174603174602, |
| "grad_norm": 9.297268867492676, |
| "learning_rate": 3.642857142857143e-05, |
| "loss": 0.5753, |
| "num_input_tokens_seen": 98752, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.4761904761904763, |
| "grad_norm": 11.799019813537598, |
| "learning_rate": 3.682539682539683e-05, |
| "loss": 0.974, |
| "num_input_tokens_seen": 99904, |
| "step": 465 |
| }, |
| { |
| "epoch": 1.492063492063492, |
| "grad_norm": 4.892351150512695, |
| "learning_rate": 3.722222222222222e-05, |
| "loss": 0.5816, |
| "num_input_tokens_seen": 100960, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.507936507936508, |
| "grad_norm": 3.8664710521698, |
| "learning_rate": 3.761904761904762e-05, |
| "loss": 0.4051, |
| "num_input_tokens_seen": 102016, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.5238095238095237, |
| "grad_norm": 3.5097477436065674, |
| "learning_rate": 3.8015873015873014e-05, |
| "loss": 0.4769, |
| "num_input_tokens_seen": 103104, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.5396825396825395, |
| "grad_norm": 5.085179328918457, |
| "learning_rate": 3.841269841269842e-05, |
| "loss": 0.4481, |
| "num_input_tokens_seen": 104144, |
| "step": 485 |
| }, |
| { |
| "epoch": 1.5555555555555556, |
| "grad_norm": 9.254948616027832, |
| "learning_rate": 3.880952380952381e-05, |
| "loss": 0.7627, |
| "num_input_tokens_seen": 105152, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.5714285714285714, |
| "grad_norm": 5.68356466293335, |
| "learning_rate": 3.9206349206349205e-05, |
| "loss": 0.583, |
| "num_input_tokens_seen": 106192, |
| "step": 495 |
| }, |
| { |
| "epoch": 1.5873015873015874, |
| "grad_norm": 3.4472599029541016, |
| "learning_rate": 3.9603174603174604e-05, |
| "loss": 0.4961, |
| "num_input_tokens_seen": 107216, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.6031746031746033, |
| "grad_norm": 4.6199212074279785, |
| "learning_rate": 4e-05, |
| "loss": 0.4046, |
| "num_input_tokens_seen": 108368, |
| "step": 505 |
| }, |
| { |
| "epoch": 1.619047619047619, |
| "grad_norm": 22.77120590209961, |
| "learning_rate": 4.03968253968254e-05, |
| "loss": 0.6756, |
| "num_input_tokens_seen": 109440, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.6349206349206349, |
| "grad_norm": 5.753880500793457, |
| "learning_rate": 4.0793650793650795e-05, |
| "loss": 0.5926, |
| "num_input_tokens_seen": 110480, |
| "step": 515 |
| }, |
| { |
| "epoch": 1.6507936507936507, |
| "grad_norm": 4.747334003448486, |
| "learning_rate": 4.119047619047619e-05, |
| "loss": 0.4798, |
| "num_input_tokens_seen": 111488, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.6666666666666665, |
| "grad_norm": 4.838809967041016, |
| "learning_rate": 4.1587301587301594e-05, |
| "loss": 0.5422, |
| "num_input_tokens_seen": 112528, |
| "step": 525 |
| }, |
| { |
| "epoch": 1.6825396825396826, |
| "grad_norm": 6.677746295928955, |
| "learning_rate": 4.1984126984126986e-05, |
| "loss": 0.5234, |
| "num_input_tokens_seen": 113600, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.6984126984126984, |
| "grad_norm": 9.256402015686035, |
| "learning_rate": 4.2380952380952385e-05, |
| "loss": 0.5388, |
| "num_input_tokens_seen": 114720, |
| "step": 535 |
| }, |
| { |
| "epoch": 1.7142857142857144, |
| "grad_norm": 6.039902687072754, |
| "learning_rate": 4.277777777777778e-05, |
| "loss": 0.4843, |
| "num_input_tokens_seen": 115744, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.7301587301587302, |
| "grad_norm": 14.398706436157227, |
| "learning_rate": 4.317460317460318e-05, |
| "loss": 0.3626, |
| "num_input_tokens_seen": 116880, |
| "step": 545 |
| }, |
| { |
| "epoch": 1.746031746031746, |
| "grad_norm": 5.464714050292969, |
| "learning_rate": 4.3571428571428576e-05, |
| "loss": 0.6535, |
| "num_input_tokens_seen": 117984, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.7619047619047619, |
| "grad_norm": 4.07396125793457, |
| "learning_rate": 4.396825396825397e-05, |
| "loss": 0.37, |
| "num_input_tokens_seen": 119008, |
| "step": 555 |
| }, |
| { |
| "epoch": 1.7777777777777777, |
| "grad_norm": 9.14253044128418, |
| "learning_rate": 4.436507936507937e-05, |
| "loss": 0.6951, |
| "num_input_tokens_seen": 120080, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.7936507936507935, |
| "grad_norm": 3.3419647216796875, |
| "learning_rate": 4.476190476190477e-05, |
| "loss": 0.238, |
| "num_input_tokens_seen": 121136, |
| "step": 565 |
| }, |
| { |
| "epoch": 1.8095238095238095, |
| "grad_norm": 6.380158424377441, |
| "learning_rate": 4.515873015873016e-05, |
| "loss": 0.2852, |
| "num_input_tokens_seen": 122144, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.8253968253968254, |
| "grad_norm": 5.351354122161865, |
| "learning_rate": 4.555555555555556e-05, |
| "loss": 0.3958, |
| "num_input_tokens_seen": 123264, |
| "step": 575 |
| }, |
| { |
| "epoch": 1.8412698412698414, |
| "grad_norm": 2.423280954360962, |
| "learning_rate": 4.595238095238095e-05, |
| "loss": 0.2796, |
| "num_input_tokens_seen": 124304, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.8571428571428572, |
| "grad_norm": 3.0386385917663574, |
| "learning_rate": 4.634920634920635e-05, |
| "loss": 0.2782, |
| "num_input_tokens_seen": 125408, |
| "step": 585 |
| }, |
| { |
| "epoch": 1.873015873015873, |
| "grad_norm": 4.597468376159668, |
| "learning_rate": 4.674603174603175e-05, |
| "loss": 0.3238, |
| "num_input_tokens_seen": 126464, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.8888888888888888, |
| "grad_norm": 1.0736624002456665, |
| "learning_rate": 4.714285714285714e-05, |
| "loss": 0.2223, |
| "num_input_tokens_seen": 127600, |
| "step": 595 |
| }, |
| { |
| "epoch": 1.9047619047619047, |
| "grad_norm": 4.550055503845215, |
| "learning_rate": 4.753968253968254e-05, |
| "loss": 0.2229, |
| "num_input_tokens_seen": 128656, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.9206349206349205, |
| "grad_norm": 2.1782174110412598, |
| "learning_rate": 4.793650793650794e-05, |
| "loss": 0.2182, |
| "num_input_tokens_seen": 129712, |
| "step": 605 |
| }, |
| { |
| "epoch": 1.9365079365079365, |
| "grad_norm": 7.886104583740234, |
| "learning_rate": 4.8333333333333334e-05, |
| "loss": 0.4634, |
| "num_input_tokens_seen": 130720, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.9523809523809523, |
| "grad_norm": 1.8265987634658813, |
| "learning_rate": 4.873015873015873e-05, |
| "loss": 0.3124, |
| "num_input_tokens_seen": 131792, |
| "step": 615 |
| }, |
| { |
| "epoch": 1.9682539682539684, |
| "grad_norm": 1.8508260250091553, |
| "learning_rate": 4.9126984126984125e-05, |
| "loss": 0.2745, |
| "num_input_tokens_seen": 132896, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.9841269841269842, |
| "grad_norm": 2.103785753250122, |
| "learning_rate": 4.9523809523809525e-05, |
| "loss": 0.2197, |
| "num_input_tokens_seen": 133920, |
| "step": 625 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.9688537120819092, |
| "learning_rate": 4.9920634920634924e-05, |
| "loss": 0.2233, |
| "num_input_tokens_seen": 135040, |
| "step": 630 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.31226205825805664, |
| "eval_runtime": 1.4549, |
| "eval_samples_per_second": 48.114, |
| "eval_steps_per_second": 24.057, |
| "num_input_tokens_seen": 135040, |
| "step": 630 |
| }, |
| { |
| "epoch": 2.015873015873016, |
| "grad_norm": 1.8913108110427856, |
| "learning_rate": 4.9999938600696385e-05, |
| "loss": 0.2113, |
| "num_input_tokens_seen": 136096, |
| "step": 635 |
| }, |
| { |
| "epoch": 2.0317460317460316, |
| "grad_norm": 1.2923585176467896, |
| "learning_rate": 4.9999689166542295e-05, |
| "loss": 0.2567, |
| "num_input_tokens_seen": 137120, |
| "step": 640 |
| }, |
| { |
| "epoch": 2.0476190476190474, |
| "grad_norm": 1.235941767692566, |
| "learning_rate": 4.9999247861994194e-05, |
| "loss": 0.17, |
| "num_input_tokens_seen": 138176, |
| "step": 645 |
| }, |
| { |
| "epoch": 2.0634920634920633, |
| "grad_norm": 2.3454935550689697, |
| "learning_rate": 4.9998614690439037e-05, |
| "loss": 0.1193, |
| "num_input_tokens_seen": 139200, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.0793650793650795, |
| "grad_norm": 0.9585188031196594, |
| "learning_rate": 4.9997789656736365e-05, |
| "loss": 0.2354, |
| "num_input_tokens_seen": 140288, |
| "step": 655 |
| }, |
| { |
| "epoch": 2.0952380952380953, |
| "grad_norm": 9.852370262145996, |
| "learning_rate": 4.9996772767218244e-05, |
| "loss": 0.3074, |
| "num_input_tokens_seen": 141360, |
| "step": 660 |
| }, |
| { |
| "epoch": 2.111111111111111, |
| "grad_norm": 1.8546109199523926, |
| "learning_rate": 4.9995564029689204e-05, |
| "loss": 0.0918, |
| "num_input_tokens_seen": 142416, |
| "step": 665 |
| }, |
| { |
| "epoch": 2.126984126984127, |
| "grad_norm": 5.140466690063477, |
| "learning_rate": 4.999416345342619e-05, |
| "loss": 0.159, |
| "num_input_tokens_seen": 143552, |
| "step": 670 |
| }, |
| { |
| "epoch": 2.142857142857143, |
| "grad_norm": 4.833157062530518, |
| "learning_rate": 4.9992571049178516e-05, |
| "loss": 0.1179, |
| "num_input_tokens_seen": 144592, |
| "step": 675 |
| }, |
| { |
| "epoch": 2.1587301587301586, |
| "grad_norm": 3.79502272605896, |
| "learning_rate": 4.999078682916774e-05, |
| "loss": 0.1536, |
| "num_input_tokens_seen": 145696, |
| "step": 680 |
| }, |
| { |
| "epoch": 2.1746031746031744, |
| "grad_norm": 12.648764610290527, |
| "learning_rate": 4.9988810807087584e-05, |
| "loss": 0.2089, |
| "num_input_tokens_seen": 146784, |
| "step": 685 |
| }, |
| { |
| "epoch": 2.1904761904761907, |
| "grad_norm": 0.6488538980484009, |
| "learning_rate": 4.998664299810385e-05, |
| "loss": 0.1116, |
| "num_input_tokens_seen": 147840, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.2063492063492065, |
| "grad_norm": 3.367002487182617, |
| "learning_rate": 4.9984283418854284e-05, |
| "loss": 0.0847, |
| "num_input_tokens_seen": 148912, |
| "step": 695 |
| }, |
| { |
| "epoch": 2.2222222222222223, |
| "grad_norm": 2.9967145919799805, |
| "learning_rate": 4.998173208744843e-05, |
| "loss": 0.1363, |
| "num_input_tokens_seen": 149872, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.238095238095238, |
| "grad_norm": 0.24820205569267273, |
| "learning_rate": 4.9978989023467536e-05, |
| "loss": 0.1622, |
| "num_input_tokens_seen": 150976, |
| "step": 705 |
| }, |
| { |
| "epoch": 2.253968253968254, |
| "grad_norm": 2.1588690280914307, |
| "learning_rate": 4.997605424796439e-05, |
| "loss": 0.0558, |
| "num_input_tokens_seen": 152032, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.2698412698412698, |
| "grad_norm": 8.513988494873047, |
| "learning_rate": 4.997292778346312e-05, |
| "loss": 0.1075, |
| "num_input_tokens_seen": 153072, |
| "step": 715 |
| }, |
| { |
| "epoch": 2.2857142857142856, |
| "grad_norm": 5.093414306640625, |
| "learning_rate": 4.996960965395906e-05, |
| "loss": 0.2232, |
| "num_input_tokens_seen": 154080, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.3015873015873014, |
| "grad_norm": 0.1848161220550537, |
| "learning_rate": 4.996609988491856e-05, |
| "loss": 0.0631, |
| "num_input_tokens_seen": 155168, |
| "step": 725 |
| }, |
| { |
| "epoch": 2.317460317460317, |
| "grad_norm": 1.1659319400787354, |
| "learning_rate": 4.99623985032788e-05, |
| "loss": 0.1902, |
| "num_input_tokens_seen": 156192, |
| "step": 730 |
| }, |
| { |
| "epoch": 2.3333333333333335, |
| "grad_norm": 0.7289673686027527, |
| "learning_rate": 4.9958505537447535e-05, |
| "loss": 0.0223, |
| "num_input_tokens_seen": 157184, |
| "step": 735 |
| }, |
| { |
| "epoch": 2.3492063492063493, |
| "grad_norm": 5.103140354156494, |
| "learning_rate": 4.9954421017302947e-05, |
| "loss": 0.1893, |
| "num_input_tokens_seen": 158240, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.365079365079365, |
| "grad_norm": 4.727173328399658, |
| "learning_rate": 4.9950144974193364e-05, |
| "loss": 0.207, |
| "num_input_tokens_seen": 159312, |
| "step": 745 |
| }, |
| { |
| "epoch": 2.380952380952381, |
| "grad_norm": 1.829289197921753, |
| "learning_rate": 4.994567744093703e-05, |
| "loss": 0.0484, |
| "num_input_tokens_seen": 160304, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.3968253968253967, |
| "grad_norm": 2.2619996070861816, |
| "learning_rate": 4.9941018451821866e-05, |
| "loss": 0.1312, |
| "num_input_tokens_seen": 161424, |
| "step": 755 |
| }, |
| { |
| "epoch": 2.4126984126984126, |
| "grad_norm": 3.3826417922973633, |
| "learning_rate": 4.993616804260521e-05, |
| "loss": 0.1096, |
| "num_input_tokens_seen": 162480, |
| "step": 760 |
| }, |
| { |
| "epoch": 2.4285714285714284, |
| "grad_norm": 1.208579659461975, |
| "learning_rate": 4.9931126250513516e-05, |
| "loss": 0.0433, |
| "num_input_tokens_seen": 163568, |
| "step": 765 |
| }, |
| { |
| "epoch": 2.4444444444444446, |
| "grad_norm": 2.259871006011963, |
| "learning_rate": 4.992589311424208e-05, |
| "loss": 0.0786, |
| "num_input_tokens_seen": 164608, |
| "step": 770 |
| }, |
| { |
| "epoch": 2.4603174603174605, |
| "grad_norm": 1.2732272148132324, |
| "learning_rate": 4.992046867395478e-05, |
| "loss": 0.1996, |
| "num_input_tokens_seen": 165712, |
| "step": 775 |
| }, |
| { |
| "epoch": 2.4761904761904763, |
| "grad_norm": 5.162652492523193, |
| "learning_rate": 4.991485297128369e-05, |
| "loss": 0.1116, |
| "num_input_tokens_seen": 166736, |
| "step": 780 |
| }, |
| { |
| "epoch": 2.492063492063492, |
| "grad_norm": 3.201205253601074, |
| "learning_rate": 4.9909046049328846e-05, |
| "loss": 0.1202, |
| "num_input_tokens_seen": 167872, |
| "step": 785 |
| }, |
| { |
| "epoch": 2.507936507936508, |
| "grad_norm": 2.6469757556915283, |
| "learning_rate": 4.9903047952657856e-05, |
| "loss": 0.0957, |
| "num_input_tokens_seen": 168960, |
| "step": 790 |
| }, |
| { |
| "epoch": 2.5238095238095237, |
| "grad_norm": 1.523950219154358, |
| "learning_rate": 4.989685872730557e-05, |
| "loss": 0.0956, |
| "num_input_tokens_seen": 169968, |
| "step": 795 |
| }, |
| { |
| "epoch": 2.5396825396825395, |
| "grad_norm": 0.4077323377132416, |
| "learning_rate": 4.9890478420773746e-05, |
| "loss": 0.0856, |
| "num_input_tokens_seen": 171072, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.5555555555555554, |
| "grad_norm": 1.0244964361190796, |
| "learning_rate": 4.988390708203068e-05, |
| "loss": 0.0398, |
| "num_input_tokens_seen": 172144, |
| "step": 805 |
| }, |
| { |
| "epoch": 2.571428571428571, |
| "grad_norm": 0.2922608554363251, |
| "learning_rate": 4.9877144761510806e-05, |
| "loss": 0.1052, |
| "num_input_tokens_seen": 173264, |
| "step": 810 |
| }, |
| { |
| "epoch": 2.5873015873015874, |
| "grad_norm": 4.774709701538086, |
| "learning_rate": 4.987019151111433e-05, |
| "loss": 0.1851, |
| "num_input_tokens_seen": 174352, |
| "step": 815 |
| }, |
| { |
| "epoch": 2.6031746031746033, |
| "grad_norm": 4.259119510650635, |
| "learning_rate": 4.9863047384206835e-05, |
| "loss": 0.0609, |
| "num_input_tokens_seen": 175456, |
| "step": 820 |
| }, |
| { |
| "epoch": 2.619047619047619, |
| "grad_norm": 0.8433787226676941, |
| "learning_rate": 4.9855712435618864e-05, |
| "loss": 0.0427, |
| "num_input_tokens_seen": 176576, |
| "step": 825 |
| }, |
| { |
| "epoch": 2.634920634920635, |
| "grad_norm": 1.5698630809783936, |
| "learning_rate": 4.9848186721645484e-05, |
| "loss": 0.1784, |
| "num_input_tokens_seen": 177632, |
| "step": 830 |
| }, |
| { |
| "epoch": 2.6507936507936507, |
| "grad_norm": 5.571624279022217, |
| "learning_rate": 4.98404703000459e-05, |
| "loss": 0.1248, |
| "num_input_tokens_seen": 178784, |
| "step": 835 |
| }, |
| { |
| "epoch": 2.6666666666666665, |
| "grad_norm": 3.441417932510376, |
| "learning_rate": 4.983256323004295e-05, |
| "loss": 0.0175, |
| "num_input_tokens_seen": 179888, |
| "step": 840 |
| }, |
| { |
| "epoch": 2.682539682539683, |
| "grad_norm": 0.750073254108429, |
| "learning_rate": 4.982446557232269e-05, |
| "loss": 0.1023, |
| "num_input_tokens_seen": 180976, |
| "step": 845 |
| }, |
| { |
| "epoch": 2.6984126984126986, |
| "grad_norm": 15.302396774291992, |
| "learning_rate": 4.981617738903393e-05, |
| "loss": 0.1268, |
| "num_input_tokens_seen": 182064, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.7142857142857144, |
| "grad_norm": 3.7665250301361084, |
| "learning_rate": 4.9807698743787744e-05, |
| "loss": 0.1289, |
| "num_input_tokens_seen": 183168, |
| "step": 855 |
| }, |
| { |
| "epoch": 2.7301587301587302, |
| "grad_norm": 3.019672393798828, |
| "learning_rate": 4.9799029701656975e-05, |
| "loss": 0.0422, |
| "num_input_tokens_seen": 184192, |
| "step": 860 |
| }, |
| { |
| "epoch": 2.746031746031746, |
| "grad_norm": 0.010081956163048744, |
| "learning_rate": 4.9790170329175754e-05, |
| "loss": 0.0462, |
| "num_input_tokens_seen": 185248, |
| "step": 865 |
| }, |
| { |
| "epoch": 2.761904761904762, |
| "grad_norm": 3.496649980545044, |
| "learning_rate": 4.978112069433899e-05, |
| "loss": 0.106, |
| "num_input_tokens_seen": 186352, |
| "step": 870 |
| }, |
| { |
| "epoch": 2.7777777777777777, |
| "grad_norm": 0.3301718831062317, |
| "learning_rate": 4.97718808666018e-05, |
| "loss": 0.0115, |
| "num_input_tokens_seen": 187376, |
| "step": 875 |
| }, |
| { |
| "epoch": 2.7936507936507935, |
| "grad_norm": 0.4660927951335907, |
| "learning_rate": 4.976245091687906e-05, |
| "loss": 0.0609, |
| "num_input_tokens_seen": 188448, |
| "step": 880 |
| }, |
| { |
| "epoch": 2.8095238095238093, |
| "grad_norm": 2.0272302627563477, |
| "learning_rate": 4.975283091754479e-05, |
| "loss": 0.2057, |
| "num_input_tokens_seen": 189520, |
| "step": 885 |
| }, |
| { |
| "epoch": 2.825396825396825, |
| "grad_norm": 0.16836805641651154, |
| "learning_rate": 4.974302094243164e-05, |
| "loss": 0.0908, |
| "num_input_tokens_seen": 190608, |
| "step": 890 |
| }, |
| { |
| "epoch": 2.8412698412698414, |
| "grad_norm": 0.5413910150527954, |
| "learning_rate": 4.973302106683029e-05, |
| "loss": 0.1349, |
| "num_input_tokens_seen": 191760, |
| "step": 895 |
| }, |
| { |
| "epoch": 2.857142857142857, |
| "grad_norm": 3.522494077682495, |
| "learning_rate": 4.972283136748889e-05, |
| "loss": 0.0679, |
| "num_input_tokens_seen": 192784, |
| "step": 900 |
| }, |
| { |
| "epoch": 2.873015873015873, |
| "grad_norm": 1.0514668226242065, |
| "learning_rate": 4.971245192261249e-05, |
| "loss": 0.1376, |
| "num_input_tokens_seen": 193840, |
| "step": 905 |
| }, |
| { |
| "epoch": 2.888888888888889, |
| "grad_norm": 5.272775650024414, |
| "learning_rate": 4.970188281186241e-05, |
| "loss": 0.1013, |
| "num_input_tokens_seen": 194928, |
| "step": 910 |
| }, |
| { |
| "epoch": 2.9047619047619047, |
| "grad_norm": 0.7582864165306091, |
| "learning_rate": 4.9691124116355617e-05, |
| "loss": 0.0419, |
| "num_input_tokens_seen": 196016, |
| "step": 915 |
| }, |
| { |
| "epoch": 2.9206349206349205, |
| "grad_norm": 2.6769752502441406, |
| "learning_rate": 4.968017591866416e-05, |
| "loss": 0.2212, |
| "num_input_tokens_seen": 197152, |
| "step": 920 |
| }, |
| { |
| "epoch": 2.9365079365079367, |
| "grad_norm": 2.0376977920532227, |
| "learning_rate": 4.966903830281449e-05, |
| "loss": 0.0717, |
| "num_input_tokens_seen": 198208, |
| "step": 925 |
| }, |
| { |
| "epoch": 2.9523809523809526, |
| "grad_norm": 1.3439394235610962, |
| "learning_rate": 4.96577113542868e-05, |
| "loss": 0.0629, |
| "num_input_tokens_seen": 199296, |
| "step": 930 |
| }, |
| { |
| "epoch": 2.9682539682539684, |
| "grad_norm": 1.4719243049621582, |
| "learning_rate": 4.964619516001442e-05, |
| "loss": 0.0406, |
| "num_input_tokens_seen": 200352, |
| "step": 935 |
| }, |
| { |
| "epoch": 2.984126984126984, |
| "grad_norm": 0.41749292612075806, |
| "learning_rate": 4.963448980838312e-05, |
| "loss": 0.0661, |
| "num_input_tokens_seen": 201424, |
| "step": 940 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 0.5001729726791382, |
| "learning_rate": 4.9622595389230445e-05, |
| "loss": 0.1283, |
| "num_input_tokens_seen": 202528, |
| "step": 945 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.10918113589286804, |
| "eval_runtime": 1.4612, |
| "eval_samples_per_second": 47.905, |
| "eval_steps_per_second": 23.952, |
| "num_input_tokens_seen": 202528, |
| "step": 945 |
| }, |
| { |
| "epoch": 3.015873015873016, |
| "grad_norm": 3.957029104232788, |
| "learning_rate": 4.9610511993844986e-05, |
| "loss": 0.0249, |
| "num_input_tokens_seen": 203552, |
| "step": 950 |
| }, |
| { |
| "epoch": 3.0317460317460316, |
| "grad_norm": 2.8302228450775146, |
| "learning_rate": 4.959823971496574e-05, |
| "loss": 0.0556, |
| "num_input_tokens_seen": 204656, |
| "step": 955 |
| }, |
| { |
| "epoch": 3.0476190476190474, |
| "grad_norm": 0.05084440857172012, |
| "learning_rate": 4.9585778646781364e-05, |
| "loss": 0.0979, |
| "num_input_tokens_seen": 205744, |
| "step": 960 |
| }, |
| { |
| "epoch": 3.0634920634920633, |
| "grad_norm": 0.25631648302078247, |
| "learning_rate": 4.957312888492944e-05, |
| "loss": 0.0046, |
| "num_input_tokens_seen": 206800, |
| "step": 965 |
| }, |
| { |
| "epoch": 3.0793650793650795, |
| "grad_norm": 3.6322312355041504, |
| "learning_rate": 4.9560290526495764e-05, |
| "loss": 0.0374, |
| "num_input_tokens_seen": 207808, |
| "step": 970 |
| }, |
| { |
| "epoch": 3.0952380952380953, |
| "grad_norm": 5.5144171714782715, |
| "learning_rate": 4.954726367001361e-05, |
| "loss": 0.1074, |
| "num_input_tokens_seen": 208896, |
| "step": 975 |
| }, |
| { |
| "epoch": 3.111111111111111, |
| "grad_norm": 6.550605773925781, |
| "learning_rate": 4.9534048415462934e-05, |
| "loss": 0.0411, |
| "num_input_tokens_seen": 209920, |
| "step": 980 |
| }, |
| { |
| "epoch": 3.126984126984127, |
| "grad_norm": 0.2177072912454605, |
| "learning_rate": 4.952064486426965e-05, |
| "loss": 0.0858, |
| "num_input_tokens_seen": 210992, |
| "step": 985 |
| }, |
| { |
| "epoch": 3.142857142857143, |
| "grad_norm": 1.0022085905075073, |
| "learning_rate": 4.9507053119304805e-05, |
| "loss": 0.0535, |
| "num_input_tokens_seen": 212064, |
| "step": 990 |
| }, |
| { |
| "epoch": 3.1587301587301586, |
| "grad_norm": 1.5411404371261597, |
| "learning_rate": 4.9493273284883854e-05, |
| "loss": 0.0322, |
| "num_input_tokens_seen": 213152, |
| "step": 995 |
| }, |
| { |
| "epoch": 3.1746031746031744, |
| "grad_norm": 0.013450037688016891, |
| "learning_rate": 4.947930546676579e-05, |
| "loss": 0.0833, |
| "num_input_tokens_seen": 214192, |
| "step": 1000 |
| }, |
| { |
| "epoch": 3.1904761904761907, |
| "grad_norm": 2.3490381240844727, |
| "learning_rate": 4.946514977215238e-05, |
| "loss": 0.0799, |
| "num_input_tokens_seen": 215264, |
| "step": 1005 |
| }, |
| { |
| "epoch": 3.2063492063492065, |
| "grad_norm": 1.3413841724395752, |
| "learning_rate": 4.945080630968733e-05, |
| "loss": 0.0728, |
| "num_input_tokens_seen": 216320, |
| "step": 1010 |
| }, |
| { |
| "epoch": 3.2222222222222223, |
| "grad_norm": 2.792207956314087, |
| "learning_rate": 4.943627518945543e-05, |
| "loss": 0.0547, |
| "num_input_tokens_seen": 217376, |
| "step": 1015 |
| }, |
| { |
| "epoch": 3.238095238095238, |
| "grad_norm": 0.3178693652153015, |
| "learning_rate": 4.942155652298174e-05, |
| "loss": 0.0479, |
| "num_input_tokens_seen": 218512, |
| "step": 1020 |
| }, |
| { |
| "epoch": 3.253968253968254, |
| "grad_norm": 2.8839704990386963, |
| "learning_rate": 4.940665042323072e-05, |
| "loss": 0.0446, |
| "num_input_tokens_seen": 219552, |
| "step": 1025 |
| }, |
| { |
| "epoch": 3.2698412698412698, |
| "grad_norm": 3.0642411708831787, |
| "learning_rate": 4.939155700460536e-05, |
| "loss": 0.019, |
| "num_input_tokens_seen": 220720, |
| "step": 1030 |
| }, |
| { |
| "epoch": 3.2857142857142856, |
| "grad_norm": 2.056198835372925, |
| "learning_rate": 4.9376276382946304e-05, |
| "loss": 0.1516, |
| "num_input_tokens_seen": 221792, |
| "step": 1035 |
| }, |
| { |
| "epoch": 3.3015873015873014, |
| "grad_norm": 0.08110915124416351, |
| "learning_rate": 4.936080867553099e-05, |
| "loss": 0.0481, |
| "num_input_tokens_seen": 222880, |
| "step": 1040 |
| }, |
| { |
| "epoch": 3.317460317460317, |
| "grad_norm": 2.887831449508667, |
| "learning_rate": 4.934515400107266e-05, |
| "loss": 0.0852, |
| "num_input_tokens_seen": 223984, |
| "step": 1045 |
| }, |
| { |
| "epoch": 3.3333333333333335, |
| "grad_norm": 1.9948076009750366, |
| "learning_rate": 4.932931247971958e-05, |
| "loss": 0.0345, |
| "num_input_tokens_seen": 225056, |
| "step": 1050 |
| }, |
| { |
| "epoch": 3.3492063492063493, |
| "grad_norm": 4.810222625732422, |
| "learning_rate": 4.9313284233054004e-05, |
| "loss": 0.1105, |
| "num_input_tokens_seen": 226160, |
| "step": 1055 |
| }, |
| { |
| "epoch": 3.365079365079365, |
| "grad_norm": 0.02646580897271633, |
| "learning_rate": 4.9297069384091306e-05, |
| "loss": 0.0325, |
| "num_input_tokens_seen": 227232, |
| "step": 1060 |
| }, |
| { |
| "epoch": 3.380952380952381, |
| "grad_norm": 0.13596875965595245, |
| "learning_rate": 4.9280668057279014e-05, |
| "loss": 0.0125, |
| "num_input_tokens_seen": 228304, |
| "step": 1065 |
| }, |
| { |
| "epoch": 3.3968253968253967, |
| "grad_norm": 0.041388608515262604, |
| "learning_rate": 4.9264080378495846e-05, |
| "loss": 0.0326, |
| "num_input_tokens_seen": 229344, |
| "step": 1070 |
| }, |
| { |
| "epoch": 3.4126984126984126, |
| "grad_norm": 1.731412410736084, |
| "learning_rate": 4.924730647505078e-05, |
| "loss": 0.0411, |
| "num_input_tokens_seen": 230368, |
| "step": 1075 |
| }, |
| { |
| "epoch": 3.4285714285714284, |
| "grad_norm": 0.9932308793067932, |
| "learning_rate": 4.923034647568202e-05, |
| "loss": 0.0689, |
| "num_input_tokens_seen": 231408, |
| "step": 1080 |
| }, |
| { |
| "epoch": 3.4444444444444446, |
| "grad_norm": 4.429158687591553, |
| "learning_rate": 4.921320051055606e-05, |
| "loss": 0.0849, |
| "num_input_tokens_seen": 232464, |
| "step": 1085 |
| }, |
| { |
| "epoch": 3.4603174603174605, |
| "grad_norm": 0.05143646523356438, |
| "learning_rate": 4.919586871126667e-05, |
| "loss": 0.0295, |
| "num_input_tokens_seen": 233552, |
| "step": 1090 |
| }, |
| { |
| "epoch": 3.4761904761904763, |
| "grad_norm": 0.026359835639595985, |
| "learning_rate": 4.917835121083384e-05, |
| "loss": 0.0546, |
| "num_input_tokens_seen": 234624, |
| "step": 1095 |
| }, |
| { |
| "epoch": 3.492063492063492, |
| "grad_norm": 1.5680820941925049, |
| "learning_rate": 4.916064814370287e-05, |
| "loss": 0.0547, |
| "num_input_tokens_seen": 235696, |
| "step": 1100 |
| }, |
| { |
| "epoch": 3.507936507936508, |
| "grad_norm": 0.5346999168395996, |
| "learning_rate": 4.91427596457432e-05, |
| "loss": 0.0992, |
| "num_input_tokens_seen": 236704, |
| "step": 1105 |
| }, |
| { |
| "epoch": 3.5238095238095237, |
| "grad_norm": 3.054236888885498, |
| "learning_rate": 4.9124685854247465e-05, |
| "loss": 0.0986, |
| "num_input_tokens_seen": 237776, |
| "step": 1110 |
| }, |
| { |
| "epoch": 3.5396825396825395, |
| "grad_norm": 0.5600343942642212, |
| "learning_rate": 4.910642690793043e-05, |
| "loss": 0.048, |
| "num_input_tokens_seen": 238864, |
| "step": 1115 |
| }, |
| { |
| "epoch": 3.5555555555555554, |
| "grad_norm": 3.9517204761505127, |
| "learning_rate": 4.908798294692786e-05, |
| "loss": 0.1061, |
| "num_input_tokens_seen": 239856, |
| "step": 1120 |
| }, |
| { |
| "epoch": 3.571428571428571, |
| "grad_norm": 0.37156346440315247, |
| "learning_rate": 4.906935411279553e-05, |
| "loss": 0.011, |
| "num_input_tokens_seen": 240896, |
| "step": 1125 |
| }, |
| { |
| "epoch": 3.5873015873015874, |
| "grad_norm": 0.4936334192752838, |
| "learning_rate": 4.9050540548508094e-05, |
| "loss": 0.0569, |
| "num_input_tokens_seen": 242000, |
| "step": 1130 |
| }, |
| { |
| "epoch": 3.6031746031746033, |
| "grad_norm": 10.88504695892334, |
| "learning_rate": 4.9031542398457974e-05, |
| "loss": 0.0373, |
| "num_input_tokens_seen": 243056, |
| "step": 1135 |
| }, |
| { |
| "epoch": 3.619047619047619, |
| "grad_norm": 1.345165491104126, |
| "learning_rate": 4.901235980845429e-05, |
| "loss": 0.0429, |
| "num_input_tokens_seen": 244112, |
| "step": 1140 |
| }, |
| { |
| "epoch": 3.634920634920635, |
| "grad_norm": 0.43414610624313354, |
| "learning_rate": 4.899299292572172e-05, |
| "loss": 0.0253, |
| "num_input_tokens_seen": 245216, |
| "step": 1145 |
| }, |
| { |
| "epoch": 3.6507936507936507, |
| "grad_norm": 1.8910837173461914, |
| "learning_rate": 4.897344189889936e-05, |
| "loss": 0.0114, |
| "num_input_tokens_seen": 246272, |
| "step": 1150 |
| }, |
| { |
| "epoch": 3.6666666666666665, |
| "grad_norm": 0.01159939169883728, |
| "learning_rate": 4.895370687803962e-05, |
| "loss": 0.0804, |
| "num_input_tokens_seen": 247392, |
| "step": 1155 |
| }, |
| { |
| "epoch": 3.682539682539683, |
| "grad_norm": 6.754380702972412, |
| "learning_rate": 4.893378801460702e-05, |
| "loss": 0.067, |
| "num_input_tokens_seen": 248480, |
| "step": 1160 |
| }, |
| { |
| "epoch": 3.6984126984126986, |
| "grad_norm": 1.1904765367507935, |
| "learning_rate": 4.8913685461477066e-05, |
| "loss": 0.0089, |
| "num_input_tokens_seen": 249536, |
| "step": 1165 |
| }, |
| { |
| "epoch": 3.7142857142857144, |
| "grad_norm": 1.1461362838745117, |
| "learning_rate": 4.889339937293508e-05, |
| "loss": 0.0426, |
| "num_input_tokens_seen": 250608, |
| "step": 1170 |
| }, |
| { |
| "epoch": 3.7301587301587302, |
| "grad_norm": 1.271159052848816, |
| "learning_rate": 4.8872929904674966e-05, |
| "loss": 0.0752, |
| "num_input_tokens_seen": 251632, |
| "step": 1175 |
| }, |
| { |
| "epoch": 3.746031746031746, |
| "grad_norm": 3.5823814868927, |
| "learning_rate": 4.8852277213798106e-05, |
| "loss": 0.076, |
| "num_input_tokens_seen": 252752, |
| "step": 1180 |
| }, |
| { |
| "epoch": 3.761904761904762, |
| "grad_norm": 0.5149009823799133, |
| "learning_rate": 4.883144145881205e-05, |
| "loss": 0.0884, |
| "num_input_tokens_seen": 253808, |
| "step": 1185 |
| }, |
| { |
| "epoch": 3.7777777777777777, |
| "grad_norm": 1.5054774284362793, |
| "learning_rate": 4.8810422799629375e-05, |
| "loss": 0.1014, |
| "num_input_tokens_seen": 254864, |
| "step": 1190 |
| }, |
| { |
| "epoch": 3.7936507936507935, |
| "grad_norm": 2.613424301147461, |
| "learning_rate": 4.878922139756641e-05, |
| "loss": 0.0201, |
| "num_input_tokens_seen": 255904, |
| "step": 1195 |
| }, |
| { |
| "epoch": 3.8095238095238093, |
| "grad_norm": 0.6467133164405823, |
| "learning_rate": 4.876783741534204e-05, |
| "loss": 0.0327, |
| "num_input_tokens_seen": 256976, |
| "step": 1200 |
| }, |
| { |
| "epoch": 3.825396825396825, |
| "grad_norm": 0.5593597292900085, |
| "learning_rate": 4.874627101707644e-05, |
| "loss": 0.0307, |
| "num_input_tokens_seen": 258016, |
| "step": 1205 |
| }, |
| { |
| "epoch": 3.8412698412698414, |
| "grad_norm": 2.611604928970337, |
| "learning_rate": 4.872452236828979e-05, |
| "loss": 0.1157, |
| "num_input_tokens_seen": 259072, |
| "step": 1210 |
| }, |
| { |
| "epoch": 3.857142857142857, |
| "grad_norm": 8.799260139465332, |
| "learning_rate": 4.870259163590103e-05, |
| "loss": 0.1447, |
| "num_input_tokens_seen": 260224, |
| "step": 1215 |
| }, |
| { |
| "epoch": 3.873015873015873, |
| "grad_norm": 1.5195016860961914, |
| "learning_rate": 4.8680478988226606e-05, |
| "loss": 0.0436, |
| "num_input_tokens_seen": 261312, |
| "step": 1220 |
| }, |
| { |
| "epoch": 3.888888888888889, |
| "grad_norm": 7.93579626083374, |
| "learning_rate": 4.865818459497911e-05, |
| "loss": 0.113, |
| "num_input_tokens_seen": 262352, |
| "step": 1225 |
| }, |
| { |
| "epoch": 3.9047619047619047, |
| "grad_norm": 1.4610778093338013, |
| "learning_rate": 4.863570862726603e-05, |
| "loss": 0.3412, |
| "num_input_tokens_seen": 263440, |
| "step": 1230 |
| }, |
| { |
| "epoch": 3.9206349206349205, |
| "grad_norm": 0.9858604073524475, |
| "learning_rate": 4.861305125758842e-05, |
| "loss": 0.0656, |
| "num_input_tokens_seen": 264480, |
| "step": 1235 |
| }, |
| { |
| "epoch": 3.9365079365079367, |
| "grad_norm": 5.148216724395752, |
| "learning_rate": 4.859021265983959e-05, |
| "loss": 0.0796, |
| "num_input_tokens_seen": 265616, |
| "step": 1240 |
| }, |
| { |
| "epoch": 3.9523809523809526, |
| "grad_norm": 0.5790823698043823, |
| "learning_rate": 4.856719300930375e-05, |
| "loss": 0.0355, |
| "num_input_tokens_seen": 266720, |
| "step": 1245 |
| }, |
| { |
| "epoch": 3.9682539682539684, |
| "grad_norm": 0.20517916977405548, |
| "learning_rate": 4.854399248265465e-05, |
| "loss": 0.0258, |
| "num_input_tokens_seen": 267824, |
| "step": 1250 |
| }, |
| { |
| "epoch": 3.984126984126984, |
| "grad_norm": 0.5621991753578186, |
| "learning_rate": 4.852061125795431e-05, |
| "loss": 0.0208, |
| "num_input_tokens_seen": 268832, |
| "step": 1255 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 2.5780136585235596, |
| "learning_rate": 4.8497049514651514e-05, |
| "loss": 0.1397, |
| "num_input_tokens_seen": 269840, |
| "step": 1260 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 0.10533556342124939, |
| "eval_runtime": 1.4561, |
| "eval_samples_per_second": 48.074, |
| "eval_steps_per_second": 24.037, |
| "num_input_tokens_seen": 269840, |
| "step": 1260 |
| }, |
| { |
| "epoch": 4.015873015873016, |
| "grad_norm": 0.0031560775823891163, |
| "learning_rate": 4.8473307433580575e-05, |
| "loss": 0.0282, |
| "num_input_tokens_seen": 270944, |
| "step": 1265 |
| }, |
| { |
| "epoch": 4.031746031746032, |
| "grad_norm": 0.22770990431308746, |
| "learning_rate": 4.844938519695984e-05, |
| "loss": 0.0168, |
| "num_input_tokens_seen": 272032, |
| "step": 1270 |
| }, |
| { |
| "epoch": 4.0476190476190474, |
| "grad_norm": 1.012162208557129, |
| "learning_rate": 4.8425282988390376e-05, |
| "loss": 0.0079, |
| "num_input_tokens_seen": 273104, |
| "step": 1275 |
| }, |
| { |
| "epoch": 4.063492063492063, |
| "grad_norm": 1.5583217144012451, |
| "learning_rate": 4.840100099285446e-05, |
| "loss": 0.0461, |
| "num_input_tokens_seen": 274144, |
| "step": 1280 |
| }, |
| { |
| "epoch": 4.079365079365079, |
| "grad_norm": 1.6686698198318481, |
| "learning_rate": 4.837653939671427e-05, |
| "loss": 0.0068, |
| "num_input_tokens_seen": 275216, |
| "step": 1285 |
| }, |
| { |
| "epoch": 4.095238095238095, |
| "grad_norm": 2.978330612182617, |
| "learning_rate": 4.8351898387710394e-05, |
| "loss": 0.0831, |
| "num_input_tokens_seen": 276352, |
| "step": 1290 |
| }, |
| { |
| "epoch": 4.111111111111111, |
| "grad_norm": 0.0014791067223995924, |
| "learning_rate": 4.832707815496036e-05, |
| "loss": 0.001, |
| "num_input_tokens_seen": 277360, |
| "step": 1295 |
| }, |
| { |
| "epoch": 4.1269841269841265, |
| "grad_norm": 0.04002991318702698, |
| "learning_rate": 4.830207888895727e-05, |
| "loss": 0.0338, |
| "num_input_tokens_seen": 278448, |
| "step": 1300 |
| }, |
| { |
| "epoch": 4.142857142857143, |
| "grad_norm": 12.669215202331543, |
| "learning_rate": 4.827690078156826e-05, |
| "loss": 0.0323, |
| "num_input_tokens_seen": 279552, |
| "step": 1305 |
| }, |
| { |
| "epoch": 4.158730158730159, |
| "grad_norm": 2.2725536823272705, |
| "learning_rate": 4.825154402603308e-05, |
| "loss": 0.0369, |
| "num_input_tokens_seen": 280624, |
| "step": 1310 |
| }, |
| { |
| "epoch": 4.174603174603175, |
| "grad_norm": 0.0256810262799263, |
| "learning_rate": 4.822600881696256e-05, |
| "loss": 0.0193, |
| "num_input_tokens_seen": 281728, |
| "step": 1315 |
| }, |
| { |
| "epoch": 4.190476190476191, |
| "grad_norm": 2.8046810626983643, |
| "learning_rate": 4.820029535033719e-05, |
| "loss": 0.0931, |
| "num_input_tokens_seen": 282864, |
| "step": 1320 |
| }, |
| { |
| "epoch": 4.2063492063492065, |
| "grad_norm": 0.17485536634922028, |
| "learning_rate": 4.817440382350551e-05, |
| "loss": 0.0233, |
| "num_input_tokens_seen": 283952, |
| "step": 1325 |
| }, |
| { |
| "epoch": 4.222222222222222, |
| "grad_norm": 1.145687222480774, |
| "learning_rate": 4.814833443518271e-05, |
| "loss": 0.0168, |
| "num_input_tokens_seen": 284960, |
| "step": 1330 |
| }, |
| { |
| "epoch": 4.238095238095238, |
| "grad_norm": 0.14578741788864136, |
| "learning_rate": 4.812208738544901e-05, |
| "loss": 0.0517, |
| "num_input_tokens_seen": 286000, |
| "step": 1335 |
| }, |
| { |
| "epoch": 4.253968253968254, |
| "grad_norm": 0.21778497099876404, |
| "learning_rate": 4.809566287574821e-05, |
| "loss": 0.0311, |
| "num_input_tokens_seen": 287088, |
| "step": 1340 |
| }, |
| { |
| "epoch": 4.26984126984127, |
| "grad_norm": 2.5348594188690186, |
| "learning_rate": 4.806906110888606e-05, |
| "loss": 0.0074, |
| "num_input_tokens_seen": 288208, |
| "step": 1345 |
| }, |
| { |
| "epoch": 4.285714285714286, |
| "grad_norm": 0.19400948286056519, |
| "learning_rate": 4.804228228902876e-05, |
| "loss": 0.0265, |
| "num_input_tokens_seen": 289232, |
| "step": 1350 |
| }, |
| { |
| "epoch": 4.301587301587301, |
| "grad_norm": 0.38872888684272766, |
| "learning_rate": 4.8015326621701386e-05, |
| "loss": 0.0044, |
| "num_input_tokens_seen": 290288, |
| "step": 1355 |
| }, |
| { |
| "epoch": 4.317460317460317, |
| "grad_norm": 0.0075505380518734455, |
| "learning_rate": 4.7988194313786275e-05, |
| "loss": 0.0011, |
| "num_input_tokens_seen": 291328, |
| "step": 1360 |
| }, |
| { |
| "epoch": 4.333333333333333, |
| "grad_norm": 0.0007070943829603493, |
| "learning_rate": 4.796088557352148e-05, |
| "loss": 0.0647, |
| "num_input_tokens_seen": 292400, |
| "step": 1365 |
| }, |
| { |
| "epoch": 4.349206349206349, |
| "grad_norm": 2.9666683673858643, |
| "learning_rate": 4.7933400610499164e-05, |
| "loss": 0.104, |
| "num_input_tokens_seen": 293520, |
| "step": 1370 |
| }, |
| { |
| "epoch": 4.365079365079365, |
| "grad_norm": 0.0026558933313935995, |
| "learning_rate": 4.7905739635663984e-05, |
| "loss": 0.0021, |
| "num_input_tokens_seen": 294608, |
| "step": 1375 |
| }, |
| { |
| "epoch": 4.380952380952381, |
| "grad_norm": 9.051935195922852, |
| "learning_rate": 4.7877902861311446e-05, |
| "loss": 0.1086, |
| "num_input_tokens_seen": 295648, |
| "step": 1380 |
| }, |
| { |
| "epoch": 4.396825396825397, |
| "grad_norm": 0.06698460876941681, |
| "learning_rate": 4.784989050108634e-05, |
| "loss": 0.0073, |
| "num_input_tokens_seen": 296704, |
| "step": 1385 |
| }, |
| { |
| "epoch": 4.412698412698413, |
| "grad_norm": 0.30536431074142456, |
| "learning_rate": 4.782170276998104e-05, |
| "loss": 0.0017, |
| "num_input_tokens_seen": 297760, |
| "step": 1390 |
| }, |
| { |
| "epoch": 4.428571428571429, |
| "grad_norm": 0.0008031931356526911, |
| "learning_rate": 4.779333988433386e-05, |
| "loss": 0.0607, |
| "num_input_tokens_seen": 298848, |
| "step": 1395 |
| }, |
| { |
| "epoch": 4.444444444444445, |
| "grad_norm": 0.02665814757347107, |
| "learning_rate": 4.7764802061827455e-05, |
| "loss": 0.0705, |
| "num_input_tokens_seen": 299936, |
| "step": 1400 |
| }, |
| { |
| "epoch": 4.4603174603174605, |
| "grad_norm": 1.0428848266601562, |
| "learning_rate": 4.773608952148706e-05, |
| "loss": 0.0549, |
| "num_input_tokens_seen": 301056, |
| "step": 1405 |
| }, |
| { |
| "epoch": 4.476190476190476, |
| "grad_norm": 0.06535102427005768, |
| "learning_rate": 4.770720248367887e-05, |
| "loss": 0.0352, |
| "num_input_tokens_seen": 302080, |
| "step": 1410 |
| }, |
| { |
| "epoch": 4.492063492063492, |
| "grad_norm": 0.04799408093094826, |
| "learning_rate": 4.7678141170108345e-05, |
| "loss": 0.0048, |
| "num_input_tokens_seen": 303104, |
| "step": 1415 |
| }, |
| { |
| "epoch": 4.507936507936508, |
| "grad_norm": 6.913591384887695, |
| "learning_rate": 4.764890580381849e-05, |
| "loss": 0.1083, |
| "num_input_tokens_seen": 304240, |
| "step": 1420 |
| }, |
| { |
| "epoch": 4.523809523809524, |
| "grad_norm": 4.7112956047058105, |
| "learning_rate": 4.761949660918814e-05, |
| "loss": 0.037, |
| "num_input_tokens_seen": 305360, |
| "step": 1425 |
| }, |
| { |
| "epoch": 4.5396825396825395, |
| "grad_norm": 0.5090931057929993, |
| "learning_rate": 4.7589913811930234e-05, |
| "loss": 0.0063, |
| "num_input_tokens_seen": 306416, |
| "step": 1430 |
| }, |
| { |
| "epoch": 4.555555555555555, |
| "grad_norm": 0.002227133372798562, |
| "learning_rate": 4.756015763909014e-05, |
| "loss": 0.0192, |
| "num_input_tokens_seen": 307408, |
| "step": 1435 |
| }, |
| { |
| "epoch": 4.571428571428571, |
| "grad_norm": 0.0006507275975309312, |
| "learning_rate": 4.753022831904383e-05, |
| "loss": 0.0223, |
| "num_input_tokens_seen": 308432, |
| "step": 1440 |
| }, |
| { |
| "epoch": 4.587301587301587, |
| "grad_norm": 3.9856300354003906, |
| "learning_rate": 4.750012608149618e-05, |
| "loss": 0.0237, |
| "num_input_tokens_seen": 309472, |
| "step": 1445 |
| }, |
| { |
| "epoch": 4.603174603174603, |
| "grad_norm": 1.1103370189666748, |
| "learning_rate": 4.7469851157479177e-05, |
| "loss": 0.1615, |
| "num_input_tokens_seen": 310512, |
| "step": 1450 |
| }, |
| { |
| "epoch": 4.619047619047619, |
| "grad_norm": 1.3942312002182007, |
| "learning_rate": 4.743940377935019e-05, |
| "loss": 0.0413, |
| "num_input_tokens_seen": 311632, |
| "step": 1455 |
| }, |
| { |
| "epoch": 4.634920634920634, |
| "grad_norm": 0.05732967332005501, |
| "learning_rate": 4.740878418079014e-05, |
| "loss": 0.0197, |
| "num_input_tokens_seen": 312688, |
| "step": 1460 |
| }, |
| { |
| "epoch": 4.650793650793651, |
| "grad_norm": 7.539809226989746, |
| "learning_rate": 4.737799259680172e-05, |
| "loss": 0.119, |
| "num_input_tokens_seen": 313760, |
| "step": 1465 |
| }, |
| { |
| "epoch": 4.666666666666667, |
| "grad_norm": 1.541993260383606, |
| "learning_rate": 4.73470292637076e-05, |
| "loss": 0.043, |
| "num_input_tokens_seen": 314816, |
| "step": 1470 |
| }, |
| { |
| "epoch": 4.682539682539683, |
| "grad_norm": 0.03592051565647125, |
| "learning_rate": 4.731589441914862e-05, |
| "loss": 0.013, |
| "num_input_tokens_seen": 315840, |
| "step": 1475 |
| }, |
| { |
| "epoch": 4.698412698412699, |
| "grad_norm": 5.116149425506592, |
| "learning_rate": 4.7284588302081946e-05, |
| "loss": 0.0987, |
| "num_input_tokens_seen": 316960, |
| "step": 1480 |
| }, |
| { |
| "epoch": 4.714285714285714, |
| "grad_norm": 0.07333391159772873, |
| "learning_rate": 4.725311115277924e-05, |
| "loss": 0.0773, |
| "num_input_tokens_seen": 318016, |
| "step": 1485 |
| }, |
| { |
| "epoch": 4.73015873015873, |
| "grad_norm": 2.1563048362731934, |
| "learning_rate": 4.7221463212824835e-05, |
| "loss": 0.0783, |
| "num_input_tokens_seen": 319136, |
| "step": 1490 |
| }, |
| { |
| "epoch": 4.746031746031746, |
| "grad_norm": 0.4926490783691406, |
| "learning_rate": 4.718964472511386e-05, |
| "loss": 0.0329, |
| "num_input_tokens_seen": 320288, |
| "step": 1495 |
| }, |
| { |
| "epoch": 4.761904761904762, |
| "grad_norm": 0.00580169539898634, |
| "learning_rate": 4.715765593385036e-05, |
| "loss": 0.0217, |
| "num_input_tokens_seen": 321312, |
| "step": 1500 |
| }, |
| { |
| "epoch": 4.777777777777778, |
| "grad_norm": 0.00831407681107521, |
| "learning_rate": 4.71254970845455e-05, |
| "loss": 0.0561, |
| "num_input_tokens_seen": 322368, |
| "step": 1505 |
| }, |
| { |
| "epoch": 4.7936507936507935, |
| "grad_norm": 0.007532383315265179, |
| "learning_rate": 4.709316842401557e-05, |
| "loss": 0.036, |
| "num_input_tokens_seen": 323392, |
| "step": 1510 |
| }, |
| { |
| "epoch": 4.809523809523809, |
| "grad_norm": 0.6496375799179077, |
| "learning_rate": 4.706067020038017e-05, |
| "loss": 0.0461, |
| "num_input_tokens_seen": 324576, |
| "step": 1515 |
| }, |
| { |
| "epoch": 4.825396825396825, |
| "grad_norm": 0.14115196466445923, |
| "learning_rate": 4.70280026630603e-05, |
| "loss": 0.0024, |
| "num_input_tokens_seen": 325648, |
| "step": 1520 |
| }, |
| { |
| "epoch": 4.841269841269841, |
| "grad_norm": 0.0023193114902824163, |
| "learning_rate": 4.699516606277638e-05, |
| "loss": 0.0036, |
| "num_input_tokens_seen": 326720, |
| "step": 1525 |
| }, |
| { |
| "epoch": 4.857142857142857, |
| "grad_norm": 1.0476378202438354, |
| "learning_rate": 4.6962160651546416e-05, |
| "loss": 0.0138, |
| "num_input_tokens_seen": 327808, |
| "step": 1530 |
| }, |
| { |
| "epoch": 4.8730158730158735, |
| "grad_norm": 0.0014104668516665697, |
| "learning_rate": 4.6928986682684004e-05, |
| "loss": 0.0093, |
| "num_input_tokens_seen": 328912, |
| "step": 1535 |
| }, |
| { |
| "epoch": 4.888888888888889, |
| "grad_norm": 1.5215229988098145, |
| "learning_rate": 4.6895644410796416e-05, |
| "loss": 0.0204, |
| "num_input_tokens_seen": 329952, |
| "step": 1540 |
| }, |
| { |
| "epoch": 4.904761904761905, |
| "grad_norm": 5.0272603034973145, |
| "learning_rate": 4.686213409178262e-05, |
| "loss": 0.0087, |
| "num_input_tokens_seen": 331008, |
| "step": 1545 |
| }, |
| { |
| "epoch": 4.920634920634921, |
| "grad_norm": 0.038513634353876114, |
| "learning_rate": 4.6828455982831334e-05, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 332048, |
| "step": 1550 |
| }, |
| { |
| "epoch": 4.936507936507937, |
| "grad_norm": 0.07715779542922974, |
| "learning_rate": 4.679461034241906e-05, |
| "loss": 0.0687, |
| "num_input_tokens_seen": 333152, |
| "step": 1555 |
| }, |
| { |
| "epoch": 4.9523809523809526, |
| "grad_norm": 0.0013984109973534942, |
| "learning_rate": 4.6760597430308085e-05, |
| "loss": 0.0308, |
| "num_input_tokens_seen": 334160, |
| "step": 1560 |
| }, |
| { |
| "epoch": 4.968253968253968, |
| "grad_norm": 0.2820828855037689, |
| "learning_rate": 4.672641750754449e-05, |
| "loss": 0.0026, |
| "num_input_tokens_seen": 335184, |
| "step": 1565 |
| }, |
| { |
| "epoch": 4.984126984126984, |
| "grad_norm": 0.04478468745946884, |
| "learning_rate": 4.6692070836456126e-05, |
| "loss": 0.0442, |
| "num_input_tokens_seen": 336256, |
| "step": 1570 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 2.1595120429992676, |
| "learning_rate": 4.6657557680650666e-05, |
| "loss": 0.0351, |
| "num_input_tokens_seen": 337408, |
| "step": 1575 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 0.11039518564939499, |
| "eval_runtime": 1.4716, |
| "eval_samples_per_second": 47.568, |
| "eval_steps_per_second": 23.784, |
| "num_input_tokens_seen": 337408, |
| "step": 1575 |
| }, |
| { |
| "epoch": 5.015873015873016, |
| "grad_norm": 0.616256594657898, |
| "learning_rate": 4.6622878305013505e-05, |
| "loss": 0.0074, |
| "num_input_tokens_seen": 338496, |
| "step": 1580 |
| }, |
| { |
| "epoch": 5.031746031746032, |
| "grad_norm": 0.009507289156317711, |
| "learning_rate": 4.658803297570577e-05, |
| "loss": 0.0417, |
| "num_input_tokens_seen": 339568, |
| "step": 1585 |
| }, |
| { |
| "epoch": 5.0476190476190474, |
| "grad_norm": 2.117201089859009, |
| "learning_rate": 4.655302196016228e-05, |
| "loss": 0.0241, |
| "num_input_tokens_seen": 340608, |
| "step": 1590 |
| }, |
| { |
| "epoch": 5.063492063492063, |
| "grad_norm": 0.0259502362459898, |
| "learning_rate": 4.651784552708947e-05, |
| "loss": 0.0047, |
| "num_input_tokens_seen": 341648, |
| "step": 1595 |
| }, |
| { |
| "epoch": 5.079365079365079, |
| "grad_norm": 0.0022856765426695347, |
| "learning_rate": 4.6482503946463315e-05, |
| "loss": 0.0115, |
| "num_input_tokens_seen": 342768, |
| "step": 1600 |
| }, |
| { |
| "epoch": 5.095238095238095, |
| "grad_norm": 2.402535915374756, |
| "learning_rate": 4.644699748952733e-05, |
| "loss": 0.0139, |
| "num_input_tokens_seen": 343792, |
| "step": 1605 |
| }, |
| { |
| "epoch": 5.111111111111111, |
| "grad_norm": 0.044295862317085266, |
| "learning_rate": 4.641132642879041e-05, |
| "loss": 0.0184, |
| "num_input_tokens_seen": 344832, |
| "step": 1610 |
| }, |
| { |
| "epoch": 5.1269841269841265, |
| "grad_norm": 0.0057604555040597916, |
| "learning_rate": 4.6375491038024785e-05, |
| "loss": 0.0412, |
| "num_input_tokens_seen": 345904, |
| "step": 1615 |
| }, |
| { |
| "epoch": 5.142857142857143, |
| "grad_norm": 0.006179352756589651, |
| "learning_rate": 4.6339491592263896e-05, |
| "loss": 0.0031, |
| "num_input_tokens_seen": 346912, |
| "step": 1620 |
| }, |
| { |
| "epoch": 5.158730158730159, |
| "grad_norm": 0.049357738345861435, |
| "learning_rate": 4.6303328367800284e-05, |
| "loss": 0.0134, |
| "num_input_tokens_seen": 348000, |
| "step": 1625 |
| }, |
| { |
| "epoch": 5.174603174603175, |
| "grad_norm": 14.490152359008789, |
| "learning_rate": 4.6267001642183496e-05, |
| "loss": 0.1828, |
| "num_input_tokens_seen": 349104, |
| "step": 1630 |
| }, |
| { |
| "epoch": 5.190476190476191, |
| "grad_norm": 0.024869563058018684, |
| "learning_rate": 4.6230511694217904e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 350192, |
| "step": 1635 |
| }, |
| { |
| "epoch": 5.2063492063492065, |
| "grad_norm": 0.1924477219581604, |
| "learning_rate": 4.619385880396064e-05, |
| "loss": 0.0231, |
| "num_input_tokens_seen": 351312, |
| "step": 1640 |
| }, |
| { |
| "epoch": 5.222222222222222, |
| "grad_norm": 0.003086981363594532, |
| "learning_rate": 4.615704325271937e-05, |
| "loss": 0.0787, |
| "num_input_tokens_seen": 352368, |
| "step": 1645 |
| }, |
| { |
| "epoch": 5.238095238095238, |
| "grad_norm": 0.07954048365354538, |
| "learning_rate": 4.612006532305019e-05, |
| "loss": 0.0655, |
| "num_input_tokens_seen": 353408, |
| "step": 1650 |
| }, |
| { |
| "epoch": 5.253968253968254, |
| "grad_norm": 0.02726924791932106, |
| "learning_rate": 4.608292529875541e-05, |
| "loss": 0.0176, |
| "num_input_tokens_seen": 354464, |
| "step": 1655 |
| }, |
| { |
| "epoch": 5.26984126984127, |
| "grad_norm": 0.007141091860830784, |
| "learning_rate": 4.604562346488144e-05, |
| "loss": 0.0151, |
| "num_input_tokens_seen": 355488, |
| "step": 1660 |
| }, |
| { |
| "epoch": 5.285714285714286, |
| "grad_norm": 0.007293707691133022, |
| "learning_rate": 4.600816010771652e-05, |
| "loss": 0.0065, |
| "num_input_tokens_seen": 356544, |
| "step": 1665 |
| }, |
| { |
| "epoch": 5.301587301587301, |
| "grad_norm": 2.8001534938812256, |
| "learning_rate": 4.5970535514788596e-05, |
| "loss": 0.0226, |
| "num_input_tokens_seen": 357680, |
| "step": 1670 |
| }, |
| { |
| "epoch": 5.317460317460317, |
| "grad_norm": 0.003933146595954895, |
| "learning_rate": 4.593274997486309e-05, |
| "loss": 0.0061, |
| "num_input_tokens_seen": 358816, |
| "step": 1675 |
| }, |
| { |
| "epoch": 5.333333333333333, |
| "grad_norm": 0.19064919650554657, |
| "learning_rate": 4.589480377794064e-05, |
| "loss": 0.0082, |
| "num_input_tokens_seen": 359872, |
| "step": 1680 |
| }, |
| { |
| "epoch": 5.349206349206349, |
| "grad_norm": 3.8689186573028564, |
| "learning_rate": 4.585669721525496e-05, |
| "loss": 0.0432, |
| "num_input_tokens_seen": 360928, |
| "step": 1685 |
| }, |
| { |
| "epoch": 5.365079365079365, |
| "grad_norm": 1.693311333656311, |
| "learning_rate": 4.581843057927053e-05, |
| "loss": 0.0032, |
| "num_input_tokens_seen": 362032, |
| "step": 1690 |
| }, |
| { |
| "epoch": 5.380952380952381, |
| "grad_norm": 0.000897102989256382, |
| "learning_rate": 4.5780004163680365e-05, |
| "loss": 0.0865, |
| "num_input_tokens_seen": 363168, |
| "step": 1695 |
| }, |
| { |
| "epoch": 5.396825396825397, |
| "grad_norm": 0.0013103618985041976, |
| "learning_rate": 4.574141826340382e-05, |
| "loss": 0.0484, |
| "num_input_tokens_seen": 364224, |
| "step": 1700 |
| }, |
| { |
| "epoch": 5.412698412698413, |
| "grad_norm": 4.227046489715576, |
| "learning_rate": 4.570267317458423e-05, |
| "loss": 0.0092, |
| "num_input_tokens_seen": 365376, |
| "step": 1705 |
| }, |
| { |
| "epoch": 5.428571428571429, |
| "grad_norm": 8.825630187988281, |
| "learning_rate": 4.566376919458672e-05, |
| "loss": 0.1041, |
| "num_input_tokens_seen": 366448, |
| "step": 1710 |
| }, |
| { |
| "epoch": 5.444444444444445, |
| "grad_norm": 0.020684687420725822, |
| "learning_rate": 4.562470662199588e-05, |
| "loss": 0.0078, |
| "num_input_tokens_seen": 367552, |
| "step": 1715 |
| }, |
| { |
| "epoch": 5.4603174603174605, |
| "grad_norm": 0.7338157296180725, |
| "learning_rate": 4.5585485756613486e-05, |
| "loss": 0.0226, |
| "num_input_tokens_seen": 368672, |
| "step": 1720 |
| }, |
| { |
| "epoch": 5.476190476190476, |
| "grad_norm": 5.003768444061279, |
| "learning_rate": 4.5546106899456186e-05, |
| "loss": 0.0418, |
| "num_input_tokens_seen": 369744, |
| "step": 1725 |
| }, |
| { |
| "epoch": 5.492063492063492, |
| "grad_norm": 0.0029771197587251663, |
| "learning_rate": 4.550657035275323e-05, |
| "loss": 0.0051, |
| "num_input_tokens_seen": 370784, |
| "step": 1730 |
| }, |
| { |
| "epoch": 5.507936507936508, |
| "grad_norm": 2.141559600830078, |
| "learning_rate": 4.546687641994409e-05, |
| "loss": 0.0283, |
| "num_input_tokens_seen": 371872, |
| "step": 1735 |
| }, |
| { |
| "epoch": 5.523809523809524, |
| "grad_norm": 0.06480997800827026, |
| "learning_rate": 4.542702540567618e-05, |
| "loss": 0.0014, |
| "num_input_tokens_seen": 372912, |
| "step": 1740 |
| }, |
| { |
| "epoch": 5.5396825396825395, |
| "grad_norm": 5.961637020111084, |
| "learning_rate": 4.53870176158025e-05, |
| "loss": 0.0199, |
| "num_input_tokens_seen": 374080, |
| "step": 1745 |
| }, |
| { |
| "epoch": 5.555555555555555, |
| "grad_norm": 0.004954809322953224, |
| "learning_rate": 4.534685335737926e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 375120, |
| "step": 1750 |
| }, |
| { |
| "epoch": 5.571428571428571, |
| "grad_norm": 2.826611042022705, |
| "learning_rate": 4.530653293866361e-05, |
| "loss": 0.008, |
| "num_input_tokens_seen": 376224, |
| "step": 1755 |
| }, |
| { |
| "epoch": 5.587301587301587, |
| "grad_norm": 8.976028442382812, |
| "learning_rate": 4.526605666911116e-05, |
| "loss": 0.0135, |
| "num_input_tokens_seen": 377248, |
| "step": 1760 |
| }, |
| { |
| "epoch": 5.603174603174603, |
| "grad_norm": 1.2282873392105103, |
| "learning_rate": 4.522542485937369e-05, |
| "loss": 0.0212, |
| "num_input_tokens_seen": 378288, |
| "step": 1765 |
| }, |
| { |
| "epoch": 5.619047619047619, |
| "grad_norm": 1.1623200178146362, |
| "learning_rate": 4.518463782129673e-05, |
| "loss": 0.0388, |
| "num_input_tokens_seen": 379376, |
| "step": 1770 |
| }, |
| { |
| "epoch": 5.634920634920634, |
| "grad_norm": 7.183865547180176, |
| "learning_rate": 4.514369586791718e-05, |
| "loss": 0.3362, |
| "num_input_tokens_seen": 380480, |
| "step": 1775 |
| }, |
| { |
| "epoch": 5.650793650793651, |
| "grad_norm": 0.01554072555154562, |
| "learning_rate": 4.510259931346088e-05, |
| "loss": 0.004, |
| "num_input_tokens_seen": 381584, |
| "step": 1780 |
| }, |
| { |
| "epoch": 5.666666666666667, |
| "grad_norm": 0.002607325091958046, |
| "learning_rate": 4.506134847334026e-05, |
| "loss": 0.0069, |
| "num_input_tokens_seen": 382576, |
| "step": 1785 |
| }, |
| { |
| "epoch": 5.682539682539683, |
| "grad_norm": 0.003622877411544323, |
| "learning_rate": 4.5019943664151836e-05, |
| "loss": 0.003, |
| "num_input_tokens_seen": 383616, |
| "step": 1790 |
| }, |
| { |
| "epoch": 5.698412698412699, |
| "grad_norm": 0.0043928856030106544, |
| "learning_rate": 4.4978385203673845e-05, |
| "loss": 0.0011, |
| "num_input_tokens_seen": 384560, |
| "step": 1795 |
| }, |
| { |
| "epoch": 5.714285714285714, |
| "grad_norm": 0.11677252501249313, |
| "learning_rate": 4.493667341086379e-05, |
| "loss": 0.0065, |
| "num_input_tokens_seen": 385568, |
| "step": 1800 |
| }, |
| { |
| "epoch": 5.73015873015873, |
| "grad_norm": 0.7192163467407227, |
| "learning_rate": 4.4894808605855966e-05, |
| "loss": 0.0052, |
| "num_input_tokens_seen": 386672, |
| "step": 1805 |
| }, |
| { |
| "epoch": 5.746031746031746, |
| "grad_norm": 0.004437014926224947, |
| "learning_rate": 4.485279110995903e-05, |
| "loss": 0.0114, |
| "num_input_tokens_seen": 387712, |
| "step": 1810 |
| }, |
| { |
| "epoch": 5.761904761904762, |
| "grad_norm": 0.002189048333093524, |
| "learning_rate": 4.481062124565354e-05, |
| "loss": 0.0072, |
| "num_input_tokens_seen": 388752, |
| "step": 1815 |
| }, |
| { |
| "epoch": 5.777777777777778, |
| "grad_norm": 0.0018596505979076028, |
| "learning_rate": 4.476829933658946e-05, |
| "loss": 0.0228, |
| "num_input_tokens_seen": 389744, |
| "step": 1820 |
| }, |
| { |
| "epoch": 5.7936507936507935, |
| "grad_norm": 0.004972801543772221, |
| "learning_rate": 4.472582570758367e-05, |
| "loss": 0.0174, |
| "num_input_tokens_seen": 390800, |
| "step": 1825 |
| }, |
| { |
| "epoch": 5.809523809523809, |
| "grad_norm": 0.3761378228664398, |
| "learning_rate": 4.4683200684617516e-05, |
| "loss": 0.0444, |
| "num_input_tokens_seen": 391872, |
| "step": 1830 |
| }, |
| { |
| "epoch": 5.825396825396825, |
| "grad_norm": 3.0954763889312744, |
| "learning_rate": 4.464042459483425e-05, |
| "loss": 0.0032, |
| "num_input_tokens_seen": 393056, |
| "step": 1835 |
| }, |
| { |
| "epoch": 5.841269841269841, |
| "grad_norm": 0.12228795886039734, |
| "learning_rate": 4.459749776653658e-05, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 394096, |
| "step": 1840 |
| }, |
| { |
| "epoch": 5.857142857142857, |
| "grad_norm": 0.0031736583914607763, |
| "learning_rate": 4.455442052918408e-05, |
| "loss": 0.0168, |
| "num_input_tokens_seen": 395200, |
| "step": 1845 |
| }, |
| { |
| "epoch": 5.8730158730158735, |
| "grad_norm": 6.396153450012207, |
| "learning_rate": 4.4511193213390736e-05, |
| "loss": 0.0638, |
| "num_input_tokens_seen": 396288, |
| "step": 1850 |
| }, |
| { |
| "epoch": 5.888888888888889, |
| "grad_norm": 9.804662704467773, |
| "learning_rate": 4.446781615092235e-05, |
| "loss": 0.0354, |
| "num_input_tokens_seen": 397344, |
| "step": 1855 |
| }, |
| { |
| "epoch": 5.904761904761905, |
| "grad_norm": 0.2461097687482834, |
| "learning_rate": 4.442428967469403e-05, |
| "loss": 0.0198, |
| "num_input_tokens_seen": 398480, |
| "step": 1860 |
| }, |
| { |
| "epoch": 5.920634920634921, |
| "grad_norm": 0.31156083941459656, |
| "learning_rate": 4.4380614118767604e-05, |
| "loss": 0.001, |
| "num_input_tokens_seen": 399536, |
| "step": 1865 |
| }, |
| { |
| "epoch": 5.936507936507937, |
| "grad_norm": 1.9175351858139038, |
| "learning_rate": 4.43367898183491e-05, |
| "loss": 0.0077, |
| "num_input_tokens_seen": 400624, |
| "step": 1870 |
| }, |
| { |
| "epoch": 5.9523809523809526, |
| "grad_norm": 0.004446444101631641, |
| "learning_rate": 4.429281710978612e-05, |
| "loss": 0.0241, |
| "num_input_tokens_seen": 401696, |
| "step": 1875 |
| }, |
| { |
| "epoch": 5.968253968253968, |
| "grad_norm": 0.01167643815279007, |
| "learning_rate": 4.4248696330565305e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 402768, |
| "step": 1880 |
| }, |
| { |
| "epoch": 5.984126984126984, |
| "grad_norm": 0.02273648977279663, |
| "learning_rate": 4.42044278193097e-05, |
| "loss": 0.0213, |
| "num_input_tokens_seen": 403872, |
| "step": 1885 |
| }, |
| { |
| "epoch": 6.0, |
| "grad_norm": 0.07438544929027557, |
| "learning_rate": 4.4160011915776224e-05, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 404880, |
| "step": 1890 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_loss": 0.11757393181324005, |
| "eval_runtime": 1.4812, |
| "eval_samples_per_second": 47.26, |
| "eval_steps_per_second": 23.63, |
| "num_input_tokens_seen": 404880, |
| "step": 1890 |
| }, |
| { |
| "epoch": 6.015873015873016, |
| "grad_norm": 0.0006745086866430938, |
| "learning_rate": 4.4115448960852965e-05, |
| "loss": 0.0045, |
| "num_input_tokens_seen": 405952, |
| "step": 1895 |
| }, |
| { |
| "epoch": 6.031746031746032, |
| "grad_norm": 0.01215858943760395, |
| "learning_rate": 4.407073929655666e-05, |
| "loss": 0.0054, |
| "num_input_tokens_seen": 407040, |
| "step": 1900 |
| }, |
| { |
| "epoch": 6.0476190476190474, |
| "grad_norm": 0.0003449621726758778, |
| "learning_rate": 4.402588326603002e-05, |
| "loss": 0.003, |
| "num_input_tokens_seen": 408128, |
| "step": 1905 |
| }, |
| { |
| "epoch": 6.063492063492063, |
| "grad_norm": 0.0432710237801075, |
| "learning_rate": 4.398088121353907e-05, |
| "loss": 0.0013, |
| "num_input_tokens_seen": 409168, |
| "step": 1910 |
| }, |
| { |
| "epoch": 6.079365079365079, |
| "grad_norm": 0.01051880232989788, |
| "learning_rate": 4.393573348447059e-05, |
| "loss": 0.0091, |
| "num_input_tokens_seen": 410208, |
| "step": 1915 |
| }, |
| { |
| "epoch": 6.095238095238095, |
| "grad_norm": 0.29065069556236267, |
| "learning_rate": 4.3890440425329367e-05, |
| "loss": 0.0187, |
| "num_input_tokens_seen": 411264, |
| "step": 1920 |
| }, |
| { |
| "epoch": 6.111111111111111, |
| "grad_norm": 0.0008527844329364598, |
| "learning_rate": 4.384500238373563e-05, |
| "loss": 0.0015, |
| "num_input_tokens_seen": 412272, |
| "step": 1925 |
| }, |
| { |
| "epoch": 6.1269841269841265, |
| "grad_norm": 0.02228499948978424, |
| "learning_rate": 4.37994197084223e-05, |
| "loss": 0.0691, |
| "num_input_tokens_seen": 413312, |
| "step": 1930 |
| }, |
| { |
| "epoch": 6.142857142857143, |
| "grad_norm": 0.06491630524396896, |
| "learning_rate": 4.375369274923237e-05, |
| "loss": 0.0036, |
| "num_input_tokens_seen": 414352, |
| "step": 1935 |
| }, |
| { |
| "epoch": 6.158730158730159, |
| "grad_norm": 0.008396030403673649, |
| "learning_rate": 4.3707821857116176e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 415440, |
| "step": 1940 |
| }, |
| { |
| "epoch": 6.174603174603175, |
| "grad_norm": 0.4968828558921814, |
| "learning_rate": 4.366180738412876e-05, |
| "loss": 0.0009, |
| "num_input_tokens_seen": 416576, |
| "step": 1945 |
| }, |
| { |
| "epoch": 6.190476190476191, |
| "grad_norm": 6.394617557525635, |
| "learning_rate": 4.3615649683427094e-05, |
| "loss": 0.0095, |
| "num_input_tokens_seen": 417680, |
| "step": 1950 |
| }, |
| { |
| "epoch": 6.2063492063492065, |
| "grad_norm": 0.004320154897868633, |
| "learning_rate": 4.356934910926746e-05, |
| "loss": 0.0162, |
| "num_input_tokens_seen": 418784, |
| "step": 1955 |
| }, |
| { |
| "epoch": 6.222222222222222, |
| "grad_norm": 0.004840894602239132, |
| "learning_rate": 4.352290601700263e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 419824, |
| "step": 1960 |
| }, |
| { |
| "epoch": 6.238095238095238, |
| "grad_norm": 0.009394151158630848, |
| "learning_rate": 4.347632076307921e-05, |
| "loss": 0.0011, |
| "num_input_tokens_seen": 420912, |
| "step": 1965 |
| }, |
| { |
| "epoch": 6.253968253968254, |
| "grad_norm": 9.433398246765137, |
| "learning_rate": 4.3429593705034896e-05, |
| "loss": 0.0521, |
| "num_input_tokens_seen": 421968, |
| "step": 1970 |
| }, |
| { |
| "epoch": 6.26984126984127, |
| "grad_norm": 0.0032175371889024973, |
| "learning_rate": 4.3382725201495723e-05, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 423040, |
| "step": 1975 |
| }, |
| { |
| "epoch": 6.285714285714286, |
| "grad_norm": 0.002436830196529627, |
| "learning_rate": 4.333571561217326e-05, |
| "loss": 0.0234, |
| "num_input_tokens_seen": 424112, |
| "step": 1980 |
| }, |
| { |
| "epoch": 6.301587301587301, |
| "grad_norm": 0.001278755022212863, |
| "learning_rate": 4.328856529786196e-05, |
| "loss": 0.0071, |
| "num_input_tokens_seen": 425200, |
| "step": 1985 |
| }, |
| { |
| "epoch": 6.317460317460317, |
| "grad_norm": 0.0035431934520602226, |
| "learning_rate": 4.324127462043627e-05, |
| "loss": 0.0298, |
| "num_input_tokens_seen": 426240, |
| "step": 1990 |
| }, |
| { |
| "epoch": 6.333333333333333, |
| "grad_norm": 0.05192156881093979, |
| "learning_rate": 4.319384394284797e-05, |
| "loss": 0.0013, |
| "num_input_tokens_seen": 427328, |
| "step": 1995 |
| }, |
| { |
| "epoch": 6.349206349206349, |
| "grad_norm": 0.0009910666849464178, |
| "learning_rate": 4.314627362912327e-05, |
| "loss": 0.0145, |
| "num_input_tokens_seen": 428320, |
| "step": 2000 |
| }, |
| { |
| "epoch": 6.365079365079365, |
| "grad_norm": 0.0481291264295578, |
| "learning_rate": 4.309856404436012e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 429360, |
| "step": 2005 |
| }, |
| { |
| "epoch": 6.380952380952381, |
| "grad_norm": 1.888258934020996, |
| "learning_rate": 4.305071555472534e-05, |
| "loss": 0.0035, |
| "num_input_tokens_seen": 430432, |
| "step": 2010 |
| }, |
| { |
| "epoch": 6.396825396825397, |
| "grad_norm": 0.028912700712680817, |
| "learning_rate": 4.300272852745184e-05, |
| "loss": 0.0283, |
| "num_input_tokens_seen": 431488, |
| "step": 2015 |
| }, |
| { |
| "epoch": 6.412698412698413, |
| "grad_norm": 0.34435898065567017, |
| "learning_rate": 4.2954603330835794e-05, |
| "loss": 0.0064, |
| "num_input_tokens_seen": 432544, |
| "step": 2020 |
| }, |
| { |
| "epoch": 6.428571428571429, |
| "grad_norm": 0.0017212865641340613, |
| "learning_rate": 4.290634033423381e-05, |
| "loss": 0.0146, |
| "num_input_tokens_seen": 433664, |
| "step": 2025 |
| }, |
| { |
| "epoch": 6.444444444444445, |
| "grad_norm": 0.38410279154777527, |
| "learning_rate": 4.2857939908060094e-05, |
| "loss": 0.0066, |
| "num_input_tokens_seen": 434784, |
| "step": 2030 |
| }, |
| { |
| "epoch": 6.4603174603174605, |
| "grad_norm": 0.005071816500276327, |
| "learning_rate": 4.2809402423783624e-05, |
| "loss": 0.0048, |
| "num_input_tokens_seen": 435856, |
| "step": 2035 |
| }, |
| { |
| "epoch": 6.476190476190476, |
| "grad_norm": 0.0019863054621964693, |
| "learning_rate": 4.276072825392528e-05, |
| "loss": 0.0061, |
| "num_input_tokens_seen": 436912, |
| "step": 2040 |
| }, |
| { |
| "epoch": 6.492063492063492, |
| "grad_norm": 0.0015548643423244357, |
| "learning_rate": 4.2711917772055e-05, |
| "loss": 0.025, |
| "num_input_tokens_seen": 437984, |
| "step": 2045 |
| }, |
| { |
| "epoch": 6.507936507936508, |
| "grad_norm": 0.008908271789550781, |
| "learning_rate": 4.2662971352788886e-05, |
| "loss": 0.0144, |
| "num_input_tokens_seen": 439040, |
| "step": 2050 |
| }, |
| { |
| "epoch": 6.523809523809524, |
| "grad_norm": 5.70704984664917, |
| "learning_rate": 4.261388937178636e-05, |
| "loss": 0.0701, |
| "num_input_tokens_seen": 440160, |
| "step": 2055 |
| }, |
| { |
| "epoch": 6.5396825396825395, |
| "grad_norm": 0.027168529108166695, |
| "learning_rate": 4.256467220574728e-05, |
| "loss": 0.0221, |
| "num_input_tokens_seen": 441264, |
| "step": 2060 |
| }, |
| { |
| "epoch": 6.555555555555555, |
| "grad_norm": 0.1720827966928482, |
| "learning_rate": 4.251532023240901e-05, |
| "loss": 0.0012, |
| "num_input_tokens_seen": 442288, |
| "step": 2065 |
| }, |
| { |
| "epoch": 6.571428571428571, |
| "grad_norm": 0.4792938530445099, |
| "learning_rate": 4.246583383054357e-05, |
| "loss": 0.012, |
| "num_input_tokens_seen": 443344, |
| "step": 2070 |
| }, |
| { |
| "epoch": 6.587301587301587, |
| "grad_norm": 0.03523029014468193, |
| "learning_rate": 4.241621337995469e-05, |
| "loss": 0.0116, |
| "num_input_tokens_seen": 444480, |
| "step": 2075 |
| }, |
| { |
| "epoch": 6.603174603174603, |
| "grad_norm": 0.007401628885418177, |
| "learning_rate": 4.2366459261474933e-05, |
| "loss": 0.0536, |
| "num_input_tokens_seen": 445504, |
| "step": 2080 |
| }, |
| { |
| "epoch": 6.619047619047619, |
| "grad_norm": 0.007667475380003452, |
| "learning_rate": 4.2316571856962736e-05, |
| "loss": 0.0137, |
| "num_input_tokens_seen": 446544, |
| "step": 2085 |
| }, |
| { |
| "epoch": 6.634920634920634, |
| "grad_norm": 0.042317815124988556, |
| "learning_rate": 4.2266551549299496e-05, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 447632, |
| "step": 2090 |
| }, |
| { |
| "epoch": 6.650793650793651, |
| "grad_norm": 0.008923333138227463, |
| "learning_rate": 4.221639872238662e-05, |
| "loss": 0.0121, |
| "num_input_tokens_seen": 448672, |
| "step": 2095 |
| }, |
| { |
| "epoch": 6.666666666666667, |
| "grad_norm": 0.0037640526425093412, |
| "learning_rate": 4.2166113761142626e-05, |
| "loss": 0.0126, |
| "num_input_tokens_seen": 449792, |
| "step": 2100 |
| }, |
| { |
| "epoch": 6.682539682539683, |
| "grad_norm": 1.3072272539138794, |
| "learning_rate": 4.2115697051500104e-05, |
| "loss": 0.0179, |
| "num_input_tokens_seen": 450848, |
| "step": 2105 |
| }, |
| { |
| "epoch": 6.698412698412699, |
| "grad_norm": 0.001850523636676371, |
| "learning_rate": 4.2065148980402835e-05, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 451952, |
| "step": 2110 |
| }, |
| { |
| "epoch": 6.714285714285714, |
| "grad_norm": 0.004209038335829973, |
| "learning_rate": 4.201446993580276e-05, |
| "loss": 0.0066, |
| "num_input_tokens_seen": 453008, |
| "step": 2115 |
| }, |
| { |
| "epoch": 6.73015873015873, |
| "grad_norm": 0.32531654834747314, |
| "learning_rate": 4.1963660306657074e-05, |
| "loss": 0.053, |
| "num_input_tokens_seen": 454080, |
| "step": 2120 |
| }, |
| { |
| "epoch": 6.746031746031746, |
| "grad_norm": 0.0027206395752727985, |
| "learning_rate": 4.191272048292513e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 455168, |
| "step": 2125 |
| }, |
| { |
| "epoch": 6.761904761904762, |
| "grad_norm": 0.0008674302371218801, |
| "learning_rate": 4.186165085556558e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 456272, |
| "step": 2130 |
| }, |
| { |
| "epoch": 6.777777777777778, |
| "grad_norm": 0.019082186743617058, |
| "learning_rate": 4.181045181653327e-05, |
| "loss": 0.0741, |
| "num_input_tokens_seen": 457328, |
| "step": 2135 |
| }, |
| { |
| "epoch": 6.7936507936507935, |
| "grad_norm": 0.0038378050085157156, |
| "learning_rate": 4.175912375877628e-05, |
| "loss": 0.0443, |
| "num_input_tokens_seen": 458416, |
| "step": 2140 |
| }, |
| { |
| "epoch": 6.809523809523809, |
| "grad_norm": 0.00021153379930183291, |
| "learning_rate": 4.170766707623289e-05, |
| "loss": 0.0009, |
| "num_input_tokens_seen": 459520, |
| "step": 2145 |
| }, |
| { |
| "epoch": 6.825396825396825, |
| "grad_norm": 0.05300087854266167, |
| "learning_rate": 4.1656082163828566e-05, |
| "loss": 0.0099, |
| "num_input_tokens_seen": 460576, |
| "step": 2150 |
| }, |
| { |
| "epoch": 6.841269841269841, |
| "grad_norm": 0.0009460219880566001, |
| "learning_rate": 4.160436941747293e-05, |
| "loss": 0.0141, |
| "num_input_tokens_seen": 461648, |
| "step": 2155 |
| }, |
| { |
| "epoch": 6.857142857142857, |
| "grad_norm": 5.004072666168213, |
| "learning_rate": 4.155252923405672e-05, |
| "loss": 0.0033, |
| "num_input_tokens_seen": 462672, |
| "step": 2160 |
| }, |
| { |
| "epoch": 6.8730158730158735, |
| "grad_norm": 0.0005544586456380785, |
| "learning_rate": 4.1500562011448744e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 463760, |
| "step": 2165 |
| }, |
| { |
| "epoch": 6.888888888888889, |
| "grad_norm": 2.4104061126708984, |
| "learning_rate": 4.144846814849282e-05, |
| "loss": 0.0036, |
| "num_input_tokens_seen": 464784, |
| "step": 2170 |
| }, |
| { |
| "epoch": 6.904761904761905, |
| "grad_norm": 0.004637118428945541, |
| "learning_rate": 4.1396248045004703e-05, |
| "loss": 0.0017, |
| "num_input_tokens_seen": 465808, |
| "step": 2175 |
| }, |
| { |
| "epoch": 6.920634920634921, |
| "grad_norm": 0.05216735601425171, |
| "learning_rate": 4.134390210176907e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 466896, |
| "step": 2180 |
| }, |
| { |
| "epoch": 6.936507936507937, |
| "grad_norm": 0.0004108586290385574, |
| "learning_rate": 4.129143072053638e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 468000, |
| "step": 2185 |
| }, |
| { |
| "epoch": 6.9523809523809526, |
| "grad_norm": 0.009106731042265892, |
| "learning_rate": 4.1238834304019825e-05, |
| "loss": 0.0033, |
| "num_input_tokens_seen": 469056, |
| "step": 2190 |
| }, |
| { |
| "epoch": 6.968253968253968, |
| "grad_norm": 0.0018852164503186941, |
| "learning_rate": 4.118611325589222e-05, |
| "loss": 0.0151, |
| "num_input_tokens_seen": 470176, |
| "step": 2195 |
| }, |
| { |
| "epoch": 6.984126984126984, |
| "grad_norm": 0.1440330594778061, |
| "learning_rate": 4.113326798078294e-05, |
| "loss": 0.0058, |
| "num_input_tokens_seen": 471216, |
| "step": 2200 |
| }, |
| { |
| "epoch": 7.0, |
| "grad_norm": 0.00032442359952256083, |
| "learning_rate": 4.108029888427476e-05, |
| "loss": 0.0416, |
| "num_input_tokens_seen": 472240, |
| "step": 2205 |
| }, |
| { |
| "epoch": 7.0, |
| "eval_loss": 0.20159904658794403, |
| "eval_runtime": 1.4568, |
| "eval_samples_per_second": 48.05, |
| "eval_steps_per_second": 24.025, |
| "num_input_tokens_seen": 472240, |
| "step": 2205 |
| }, |
| { |
| "epoch": 7.015873015873016, |
| "grad_norm": 0.2510806918144226, |
| "learning_rate": 4.1027206372900816e-05, |
| "loss": 0.0579, |
| "num_input_tokens_seen": 473312, |
| "step": 2210 |
| }, |
| { |
| "epoch": 7.031746031746032, |
| "grad_norm": 10.269424438476562, |
| "learning_rate": 4.09739908541414e-05, |
| "loss": 0.0236, |
| "num_input_tokens_seen": 474416, |
| "step": 2215 |
| }, |
| { |
| "epoch": 7.0476190476190474, |
| "grad_norm": 7.632909774780273, |
| "learning_rate": 4.09206527364209e-05, |
| "loss": 0.0409, |
| "num_input_tokens_seen": 475504, |
| "step": 2220 |
| }, |
| { |
| "epoch": 7.063492063492063, |
| "grad_norm": 7.554219722747803, |
| "learning_rate": 4.0867192429104627e-05, |
| "loss": 0.0383, |
| "num_input_tokens_seen": 476512, |
| "step": 2225 |
| }, |
| { |
| "epoch": 7.079365079365079, |
| "grad_norm": 0.004015278071165085, |
| "learning_rate": 4.08136103424957e-05, |
| "loss": 0.0046, |
| "num_input_tokens_seen": 477648, |
| "step": 2230 |
| }, |
| { |
| "epoch": 7.095238095238095, |
| "grad_norm": 2.802708148956299, |
| "learning_rate": 4.075990688783185e-05, |
| "loss": 0.0081, |
| "num_input_tokens_seen": 478768, |
| "step": 2235 |
| }, |
| { |
| "epoch": 7.111111111111111, |
| "grad_norm": 0.0031349605415016413, |
| "learning_rate": 4.070608247728236e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 479840, |
| "step": 2240 |
| }, |
| { |
| "epoch": 7.1269841269841265, |
| "grad_norm": 0.017263107001781464, |
| "learning_rate": 4.065213752394478e-05, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 480928, |
| "step": 2245 |
| }, |
| { |
| "epoch": 7.142857142857143, |
| "grad_norm": 0.0022108752746134996, |
| "learning_rate": 4.059807244184183e-05, |
| "loss": 0.0026, |
| "num_input_tokens_seen": 481984, |
| "step": 2250 |
| }, |
| { |
| "epoch": 7.158730158730159, |
| "grad_norm": 0.007918480783700943, |
| "learning_rate": 4.054388764591822e-05, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 483104, |
| "step": 2255 |
| }, |
| { |
| "epoch": 7.174603174603175, |
| "grad_norm": 0.06933493167161942, |
| "learning_rate": 4.048958355203746e-05, |
| "loss": 0.0088, |
| "num_input_tokens_seen": 484240, |
| "step": 2260 |
| }, |
| { |
| "epoch": 7.190476190476191, |
| "grad_norm": 0.16983921825885773, |
| "learning_rate": 4.043516057697862e-05, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 485328, |
| "step": 2265 |
| }, |
| { |
| "epoch": 7.2063492063492065, |
| "grad_norm": 0.004488496109843254, |
| "learning_rate": 4.038061913843322e-05, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 486432, |
| "step": 2270 |
| }, |
| { |
| "epoch": 7.222222222222222, |
| "grad_norm": 0.0012461389414966106, |
| "learning_rate": 4.032595965500195e-05, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 487520, |
| "step": 2275 |
| }, |
| { |
| "epoch": 7.238095238095238, |
| "grad_norm": 0.0025354893878102303, |
| "learning_rate": 4.02711825461915e-05, |
| "loss": 0.0269, |
| "num_input_tokens_seen": 488576, |
| "step": 2280 |
| }, |
| { |
| "epoch": 7.253968253968254, |
| "grad_norm": 0.16530795395374298, |
| "learning_rate": 4.0216288232411296e-05, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 489648, |
| "step": 2285 |
| }, |
| { |
| "epoch": 7.26984126984127, |
| "grad_norm": 0.09864303469657898, |
| "learning_rate": 4.0161277134970345e-05, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 490736, |
| "step": 2290 |
| }, |
| { |
| "epoch": 7.285714285714286, |
| "grad_norm": 0.24059996008872986, |
| "learning_rate": 4.010614967607391e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 491792, |
| "step": 2295 |
| }, |
| { |
| "epoch": 7.301587301587301, |
| "grad_norm": 0.06463675945997238, |
| "learning_rate": 4.005090627882035e-05, |
| "loss": 0.0018, |
| "num_input_tokens_seen": 492832, |
| "step": 2300 |
| }, |
| { |
| "epoch": 7.317460317460317, |
| "grad_norm": 0.0006509863305836916, |
| "learning_rate": 3.9995547367197845e-05, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 493824, |
| "step": 2305 |
| }, |
| { |
| "epoch": 7.333333333333333, |
| "grad_norm": 3.077014684677124, |
| "learning_rate": 3.9940073366081114e-05, |
| "loss": 0.0173, |
| "num_input_tokens_seen": 494880, |
| "step": 2310 |
| }, |
| { |
| "epoch": 7.349206349206349, |
| "grad_norm": 0.0008826262201182544, |
| "learning_rate": 3.988448470122819e-05, |
| "loss": 0.0024, |
| "num_input_tokens_seen": 495952, |
| "step": 2315 |
| }, |
| { |
| "epoch": 7.365079365079365, |
| "grad_norm": 1.0985521078109741, |
| "learning_rate": 3.982878179927714e-05, |
| "loss": 0.0028, |
| "num_input_tokens_seen": 497024, |
| "step": 2320 |
| }, |
| { |
| "epoch": 7.380952380952381, |
| "grad_norm": 0.0007766135386191308, |
| "learning_rate": 3.977296508774278e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 498048, |
| "step": 2325 |
| }, |
| { |
| "epoch": 7.396825396825397, |
| "grad_norm": 0.001252246554940939, |
| "learning_rate": 3.971703499501344e-05, |
| "loss": 0.002, |
| "num_input_tokens_seen": 499120, |
| "step": 2330 |
| }, |
| { |
| "epoch": 7.412698412698413, |
| "grad_norm": 5.723186016082764, |
| "learning_rate": 3.9660991950347576e-05, |
| "loss": 0.0089, |
| "num_input_tokens_seen": 500160, |
| "step": 2335 |
| }, |
| { |
| "epoch": 7.428571428571429, |
| "grad_norm": 0.0003589835832826793, |
| "learning_rate": 3.960483638387061e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 501264, |
| "step": 2340 |
| }, |
| { |
| "epoch": 7.444444444444445, |
| "grad_norm": 5.556662082672119, |
| "learning_rate": 3.954856872657151e-05, |
| "loss": 0.0392, |
| "num_input_tokens_seen": 502384, |
| "step": 2345 |
| }, |
| { |
| "epoch": 7.4603174603174605, |
| "grad_norm": 0.05227256566286087, |
| "learning_rate": 3.9492189410299566e-05, |
| "loss": 0.0012, |
| "num_input_tokens_seen": 503456, |
| "step": 2350 |
| }, |
| { |
| "epoch": 7.476190476190476, |
| "grad_norm": 0.0018900517607107759, |
| "learning_rate": 3.9435698867760996e-05, |
| "loss": 0.0175, |
| "num_input_tokens_seen": 504480, |
| "step": 2355 |
| }, |
| { |
| "epoch": 7.492063492063492, |
| "grad_norm": 19.124439239501953, |
| "learning_rate": 3.9379097532515725e-05, |
| "loss": 0.0634, |
| "num_input_tokens_seen": 505584, |
| "step": 2360 |
| }, |
| { |
| "epoch": 7.507936507936508, |
| "grad_norm": 0.006673491094261408, |
| "learning_rate": 3.932238583897395e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 506640, |
| "step": 2365 |
| }, |
| { |
| "epoch": 7.523809523809524, |
| "grad_norm": 0.0003134336438961327, |
| "learning_rate": 3.9265564222392905e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 507696, |
| "step": 2370 |
| }, |
| { |
| "epoch": 7.5396825396825395, |
| "grad_norm": 0.0006025207112543285, |
| "learning_rate": 3.920863311887344e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 508688, |
| "step": 2375 |
| }, |
| { |
| "epoch": 7.555555555555555, |
| "grad_norm": 0.07202655076980591, |
| "learning_rate": 3.9151592965356705e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 509792, |
| "step": 2380 |
| }, |
| { |
| "epoch": 7.571428571428571, |
| "grad_norm": 0.012028384022414684, |
| "learning_rate": 3.909444419962083e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 510880, |
| "step": 2385 |
| }, |
| { |
| "epoch": 7.587301587301587, |
| "grad_norm": 0.031248344108462334, |
| "learning_rate": 3.9037187260277515e-05, |
| "loss": 0.0154, |
| "num_input_tokens_seen": 511888, |
| "step": 2390 |
| }, |
| { |
| "epoch": 7.603174603174603, |
| "grad_norm": 6.459189414978027, |
| "learning_rate": 3.897982258676867e-05, |
| "loss": 0.0308, |
| "num_input_tokens_seen": 513008, |
| "step": 2395 |
| }, |
| { |
| "epoch": 7.619047619047619, |
| "grad_norm": 4.979741096496582, |
| "learning_rate": 3.892235061936309e-05, |
| "loss": 0.0406, |
| "num_input_tokens_seen": 514048, |
| "step": 2400 |
| }, |
| { |
| "epoch": 7.634920634920634, |
| "grad_norm": 4.767922401428223, |
| "learning_rate": 3.886477179915301e-05, |
| "loss": 0.0185, |
| "num_input_tokens_seen": 515200, |
| "step": 2405 |
| }, |
| { |
| "epoch": 7.650793650793651, |
| "grad_norm": 0.14602185785770416, |
| "learning_rate": 3.880708656805075e-05, |
| "loss": 0.0118, |
| "num_input_tokens_seen": 516224, |
| "step": 2410 |
| }, |
| { |
| "epoch": 7.666666666666667, |
| "grad_norm": 0.0051375702023506165, |
| "learning_rate": 3.874929536878536e-05, |
| "loss": 0.0041, |
| "num_input_tokens_seen": 517392, |
| "step": 2415 |
| }, |
| { |
| "epoch": 7.682539682539683, |
| "grad_norm": 7.391900539398193, |
| "learning_rate": 3.869139864489915e-05, |
| "loss": 0.0216, |
| "num_input_tokens_seen": 518464, |
| "step": 2420 |
| }, |
| { |
| "epoch": 7.698412698412699, |
| "grad_norm": 7.830747604370117, |
| "learning_rate": 3.863339684074432e-05, |
| "loss": 0.0318, |
| "num_input_tokens_seen": 519520, |
| "step": 2425 |
| }, |
| { |
| "epoch": 7.714285714285714, |
| "grad_norm": 0.024749888107180595, |
| "learning_rate": 3.8575290401479586e-05, |
| "loss": 0.0207, |
| "num_input_tokens_seen": 520640, |
| "step": 2430 |
| }, |
| { |
| "epoch": 7.73015873015873, |
| "grad_norm": 0.36452969908714294, |
| "learning_rate": 3.85170797730667e-05, |
| "loss": 0.0031, |
| "num_input_tokens_seen": 521648, |
| "step": 2435 |
| }, |
| { |
| "epoch": 7.746031746031746, |
| "grad_norm": 0.03560644015669823, |
| "learning_rate": 3.845876540226706e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 522720, |
| "step": 2440 |
| }, |
| { |
| "epoch": 7.761904761904762, |
| "grad_norm": 0.005877711810171604, |
| "learning_rate": 3.840034773663829e-05, |
| "loss": 0.0177, |
| "num_input_tokens_seen": 523808, |
| "step": 2445 |
| }, |
| { |
| "epoch": 7.777777777777778, |
| "grad_norm": 0.0006981467013247311, |
| "learning_rate": 3.834182722453079e-05, |
| "loss": 0.0082, |
| "num_input_tokens_seen": 524864, |
| "step": 2450 |
| }, |
| { |
| "epoch": 7.7936507936507935, |
| "grad_norm": 0.06691333651542664, |
| "learning_rate": 3.828320431508429e-05, |
| "loss": 0.0113, |
| "num_input_tokens_seen": 525952, |
| "step": 2455 |
| }, |
| { |
| "epoch": 7.809523809523809, |
| "grad_norm": 0.7026005387306213, |
| "learning_rate": 3.8224479458224396e-05, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 527008, |
| "step": 2460 |
| }, |
| { |
| "epoch": 7.825396825396825, |
| "grad_norm": 0.0011852675816044211, |
| "learning_rate": 3.8165653104659185e-05, |
| "loss": 0.0187, |
| "num_input_tokens_seen": 528000, |
| "step": 2465 |
| }, |
| { |
| "epoch": 7.841269841269841, |
| "grad_norm": 0.0029516047798097134, |
| "learning_rate": 3.81067257058757e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 529088, |
| "step": 2470 |
| }, |
| { |
| "epoch": 7.857142857142857, |
| "grad_norm": 0.2371506243944168, |
| "learning_rate": 3.804769771413649e-05, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 530144, |
| "step": 2475 |
| }, |
| { |
| "epoch": 7.8730158730158735, |
| "grad_norm": 0.015095270238816738, |
| "learning_rate": 3.7988569582476144e-05, |
| "loss": 0.0227, |
| "num_input_tokens_seen": 531232, |
| "step": 2480 |
| }, |
| { |
| "epoch": 7.888888888888889, |
| "grad_norm": 0.0005210601957514882, |
| "learning_rate": 3.7929341764697816e-05, |
| "loss": 0.0028, |
| "num_input_tokens_seen": 532240, |
| "step": 2485 |
| }, |
| { |
| "epoch": 7.904761904761905, |
| "grad_norm": 0.07286439836025238, |
| "learning_rate": 3.787001471536976e-05, |
| "loss": 0.0031, |
| "num_input_tokens_seen": 533280, |
| "step": 2490 |
| }, |
| { |
| "epoch": 7.920634920634921, |
| "grad_norm": 0.004520993679761887, |
| "learning_rate": 3.78105888898218e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 534352, |
| "step": 2495 |
| }, |
| { |
| "epoch": 7.936507936507937, |
| "grad_norm": 0.013191165402531624, |
| "learning_rate": 3.775106474414188e-05, |
| "loss": 0.0012, |
| "num_input_tokens_seen": 535392, |
| "step": 2500 |
| }, |
| { |
| "epoch": 7.9523809523809526, |
| "grad_norm": 0.02418154664337635, |
| "learning_rate": 3.769144273517253e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 536496, |
| "step": 2505 |
| }, |
| { |
| "epoch": 7.968253968253968, |
| "grad_norm": 0.009249486960470676, |
| "learning_rate": 3.7631723320507364e-05, |
| "loss": 0.019, |
| "num_input_tokens_seen": 537552, |
| "step": 2510 |
| }, |
| { |
| "epoch": 7.984126984126984, |
| "grad_norm": 0.002061500446870923, |
| "learning_rate": 3.7571906958487584e-05, |
| "loss": 0.0246, |
| "num_input_tokens_seen": 538656, |
| "step": 2515 |
| }, |
| { |
| "epoch": 8.0, |
| "grad_norm": 0.028927626088261604, |
| "learning_rate": 3.751199410819847e-05, |
| "loss": 0.0112, |
| "num_input_tokens_seen": 539744, |
| "step": 2520 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_loss": 0.13668464124202728, |
| "eval_runtime": 1.4554, |
| "eval_samples_per_second": 48.095, |
| "eval_steps_per_second": 24.048, |
| "num_input_tokens_seen": 539744, |
| "step": 2520 |
| }, |
| { |
| "epoch": 8.015873015873016, |
| "grad_norm": 0.0014253915287554264, |
| "learning_rate": 3.745198522946582e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 540800, |
| "step": 2525 |
| }, |
| { |
| "epoch": 8.031746031746032, |
| "grad_norm": 0.07428203523159027, |
| "learning_rate": 3.739188078285244e-05, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 541936, |
| "step": 2530 |
| }, |
| { |
| "epoch": 8.047619047619047, |
| "grad_norm": 6.837933540344238, |
| "learning_rate": 3.7331681229654635e-05, |
| "loss": 0.01, |
| "num_input_tokens_seen": 543040, |
| "step": 2535 |
| }, |
| { |
| "epoch": 8.063492063492063, |
| "grad_norm": 0.0031517180614173412, |
| "learning_rate": 3.727138703189862e-05, |
| "loss": 0.0068, |
| "num_input_tokens_seen": 544016, |
| "step": 2540 |
| }, |
| { |
| "epoch": 8.079365079365079, |
| "grad_norm": 0.00036266801180318, |
| "learning_rate": 3.721099865233701e-05, |
| "loss": 0.0012, |
| "num_input_tokens_seen": 545120, |
| "step": 2545 |
| }, |
| { |
| "epoch": 8.095238095238095, |
| "grad_norm": 0.00027236941969022155, |
| "learning_rate": 3.7150516554445256e-05, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 546256, |
| "step": 2550 |
| }, |
| { |
| "epoch": 8.11111111111111, |
| "grad_norm": 0.00025823916075751185, |
| "learning_rate": 3.708994120241809e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 547344, |
| "step": 2555 |
| }, |
| { |
| "epoch": 8.126984126984127, |
| "grad_norm": 0.0003119041211903095, |
| "learning_rate": 3.702927306116595e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 548352, |
| "step": 2560 |
| }, |
| { |
| "epoch": 8.142857142857142, |
| "grad_norm": 0.00040353136137127876, |
| "learning_rate": 3.6968512596311435e-05, |
| "loss": 0.0102, |
| "num_input_tokens_seen": 549328, |
| "step": 2565 |
| }, |
| { |
| "epoch": 8.158730158730158, |
| "grad_norm": 0.0004562221292871982, |
| "learning_rate": 3.690766027418573e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 550416, |
| "step": 2570 |
| }, |
| { |
| "epoch": 8.174603174603174, |
| "grad_norm": 0.006335284095257521, |
| "learning_rate": 3.6846716561824965e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 551504, |
| "step": 2575 |
| }, |
| { |
| "epoch": 8.19047619047619, |
| "grad_norm": 0.0002137407718691975, |
| "learning_rate": 3.678568192696677e-05, |
| "loss": 0.0022, |
| "num_input_tokens_seen": 552640, |
| "step": 2580 |
| }, |
| { |
| "epoch": 8.206349206349206, |
| "grad_norm": 0.000498905370477587, |
| "learning_rate": 3.672455683804651e-05, |
| "loss": 0.0127, |
| "num_input_tokens_seen": 553712, |
| "step": 2585 |
| }, |
| { |
| "epoch": 8.222222222222221, |
| "grad_norm": 0.0003141605411656201, |
| "learning_rate": 3.6663341764193834e-05, |
| "loss": 0.0035, |
| "num_input_tokens_seen": 554800, |
| "step": 2590 |
| }, |
| { |
| "epoch": 8.238095238095237, |
| "grad_norm": 3.5881121158599854, |
| "learning_rate": 3.6602037175228986e-05, |
| "loss": 0.021, |
| "num_input_tokens_seen": 555824, |
| "step": 2595 |
| }, |
| { |
| "epoch": 8.253968253968253, |
| "grad_norm": 0.008271710947155952, |
| "learning_rate": 3.6540643541659245e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 556944, |
| "step": 2600 |
| }, |
| { |
| "epoch": 8.26984126984127, |
| "grad_norm": 0.0003118402964901179, |
| "learning_rate": 3.6479161334675296e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 557968, |
| "step": 2605 |
| }, |
| { |
| "epoch": 8.285714285714286, |
| "grad_norm": 0.010142582468688488, |
| "learning_rate": 3.641759102614761e-05, |
| "loss": 0.024, |
| "num_input_tokens_seen": 559104, |
| "step": 2610 |
| }, |
| { |
| "epoch": 8.301587301587302, |
| "grad_norm": 0.0003936065186280757, |
| "learning_rate": 3.6355933088622854e-05, |
| "loss": 0.0029, |
| "num_input_tokens_seen": 560128, |
| "step": 2615 |
| }, |
| { |
| "epoch": 8.317460317460318, |
| "grad_norm": 0.274812787771225, |
| "learning_rate": 3.6294187995320214e-05, |
| "loss": 0.0014, |
| "num_input_tokens_seen": 561248, |
| "step": 2620 |
| }, |
| { |
| "epoch": 8.333333333333334, |
| "grad_norm": 0.00024725758703425527, |
| "learning_rate": 3.6232356220127785e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 562320, |
| "step": 2625 |
| }, |
| { |
| "epoch": 8.34920634920635, |
| "grad_norm": 0.0018928394420072436, |
| "learning_rate": 3.617043823759897e-05, |
| "loss": 0.0205, |
| "num_input_tokens_seen": 563392, |
| "step": 2630 |
| }, |
| { |
| "epoch": 8.365079365079366, |
| "grad_norm": 0.03286740928888321, |
| "learning_rate": 3.610843452294877e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 564480, |
| "step": 2635 |
| }, |
| { |
| "epoch": 8.380952380952381, |
| "grad_norm": 4.7357635498046875, |
| "learning_rate": 3.60463455520502e-05, |
| "loss": 0.0121, |
| "num_input_tokens_seen": 565616, |
| "step": 2640 |
| }, |
| { |
| "epoch": 8.396825396825397, |
| "grad_norm": 0.004558645188808441, |
| "learning_rate": 3.598417180143058e-05, |
| "loss": 0.0031, |
| "num_input_tokens_seen": 566704, |
| "step": 2645 |
| }, |
| { |
| "epoch": 8.412698412698413, |
| "grad_norm": 0.14909231662750244, |
| "learning_rate": 3.5921913748267945e-05, |
| "loss": 0.0276, |
| "num_input_tokens_seen": 567744, |
| "step": 2650 |
| }, |
| { |
| "epoch": 8.428571428571429, |
| "grad_norm": 0.00042908909381367266, |
| "learning_rate": 3.5859571870387304e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 568816, |
| "step": 2655 |
| }, |
| { |
| "epoch": 8.444444444444445, |
| "grad_norm": 0.012058122083544731, |
| "learning_rate": 3.579714664625706e-05, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 569872, |
| "step": 2660 |
| }, |
| { |
| "epoch": 8.46031746031746, |
| "grad_norm": 0.11031365394592285, |
| "learning_rate": 3.5734638554985236e-05, |
| "loss": 0.0035, |
| "num_input_tokens_seen": 570976, |
| "step": 2665 |
| }, |
| { |
| "epoch": 8.476190476190476, |
| "grad_norm": 1.1275529861450195, |
| "learning_rate": 3.567204807631589e-05, |
| "loss": 0.0091, |
| "num_input_tokens_seen": 572064, |
| "step": 2670 |
| }, |
| { |
| "epoch": 8.492063492063492, |
| "grad_norm": 0.0019642009865492582, |
| "learning_rate": 3.560937569062538e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 573152, |
| "step": 2675 |
| }, |
| { |
| "epoch": 8.507936507936508, |
| "grad_norm": 0.027548756450414658, |
| "learning_rate": 3.554662187891873e-05, |
| "loss": 0.0057, |
| "num_input_tokens_seen": 574224, |
| "step": 2680 |
| }, |
| { |
| "epoch": 8.523809523809524, |
| "grad_norm": 0.0007244048174470663, |
| "learning_rate": 3.548378712282584e-05, |
| "loss": 0.0281, |
| "num_input_tokens_seen": 575360, |
| "step": 2685 |
| }, |
| { |
| "epoch": 8.53968253968254, |
| "grad_norm": 2.4872758388519287, |
| "learning_rate": 3.5420871904597895e-05, |
| "loss": 0.0024, |
| "num_input_tokens_seen": 576416, |
| "step": 2690 |
| }, |
| { |
| "epoch": 8.555555555555555, |
| "grad_norm": 0.05758047476410866, |
| "learning_rate": 3.5357876707103596e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 577440, |
| "step": 2695 |
| }, |
| { |
| "epoch": 8.571428571428571, |
| "grad_norm": 3.4211556911468506, |
| "learning_rate": 3.529480201382551e-05, |
| "loss": 0.0024, |
| "num_input_tokens_seen": 578560, |
| "step": 2700 |
| }, |
| { |
| "epoch": 8.587301587301587, |
| "grad_norm": 0.19005665183067322, |
| "learning_rate": 3.523164830885629e-05, |
| "loss": 0.0111, |
| "num_input_tokens_seen": 579600, |
| "step": 2705 |
| }, |
| { |
| "epoch": 8.603174603174603, |
| "grad_norm": 0.00690614664927125, |
| "learning_rate": 3.516841607689501e-05, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 580672, |
| "step": 2710 |
| }, |
| { |
| "epoch": 8.619047619047619, |
| "grad_norm": 0.0064396848902106285, |
| "learning_rate": 3.510510580324344e-05, |
| "loss": 0.0266, |
| "num_input_tokens_seen": 581728, |
| "step": 2715 |
| }, |
| { |
| "epoch": 8.634920634920634, |
| "grad_norm": 0.004690147936344147, |
| "learning_rate": 3.504171797380231e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 582752, |
| "step": 2720 |
| }, |
| { |
| "epoch": 8.65079365079365, |
| "grad_norm": 0.1530577689409256, |
| "learning_rate": 3.497825307506758e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 583856, |
| "step": 2725 |
| }, |
| { |
| "epoch": 8.666666666666666, |
| "grad_norm": 0.0008610127260908484, |
| "learning_rate": 3.491471159412672e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 584960, |
| "step": 2730 |
| }, |
| { |
| "epoch": 8.682539682539682, |
| "grad_norm": 0.001741685438901186, |
| "learning_rate": 3.485109401865493e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 586032, |
| "step": 2735 |
| }, |
| { |
| "epoch": 8.698412698412698, |
| "grad_norm": 1.3038556575775146, |
| "learning_rate": 3.478740083691147e-05, |
| "loss": 0.0104, |
| "num_input_tokens_seen": 587088, |
| "step": 2740 |
| }, |
| { |
| "epoch": 8.714285714285714, |
| "grad_norm": 0.11273951083421707, |
| "learning_rate": 3.4723632537735846e-05, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 588128, |
| "step": 2745 |
| }, |
| { |
| "epoch": 8.73015873015873, |
| "grad_norm": 0.0002958101104013622, |
| "learning_rate": 3.46597896105441e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 589152, |
| "step": 2750 |
| }, |
| { |
| "epoch": 8.746031746031747, |
| "grad_norm": 0.007939610630273819, |
| "learning_rate": 3.459587254532502e-05, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 590208, |
| "step": 2755 |
| }, |
| { |
| "epoch": 8.761904761904763, |
| "grad_norm": 0.0005917858798056841, |
| "learning_rate": 3.453188183263639e-05, |
| "loss": 0.001, |
| "num_input_tokens_seen": 591344, |
| "step": 2760 |
| }, |
| { |
| "epoch": 8.777777777777779, |
| "grad_norm": 0.0001375034626107663, |
| "learning_rate": 3.4467817963601264e-05, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 592384, |
| "step": 2765 |
| }, |
| { |
| "epoch": 8.793650793650794, |
| "grad_norm": 0.010883470997214317, |
| "learning_rate": 3.440368142990416e-05, |
| "loss": 0.0105, |
| "num_input_tokens_seen": 593472, |
| "step": 2770 |
| }, |
| { |
| "epoch": 8.80952380952381, |
| "grad_norm": 0.0002069953188765794, |
| "learning_rate": 3.433947272378726e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 594560, |
| "step": 2775 |
| }, |
| { |
| "epoch": 8.825396825396826, |
| "grad_norm": 0.003037715097889304, |
| "learning_rate": 3.427519233804667e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 595664, |
| "step": 2780 |
| }, |
| { |
| "epoch": 8.841269841269842, |
| "grad_norm": 0.002563037909567356, |
| "learning_rate": 3.421084076602867e-05, |
| "loss": 0.0096, |
| "num_input_tokens_seen": 596800, |
| "step": 2785 |
| }, |
| { |
| "epoch": 8.857142857142858, |
| "grad_norm": 0.007072729524224997, |
| "learning_rate": 3.414641850162584e-05, |
| "loss": 0.002, |
| "num_input_tokens_seen": 597904, |
| "step": 2790 |
| }, |
| { |
| "epoch": 8.873015873015873, |
| "grad_norm": 0.00026764694484882057, |
| "learning_rate": 3.408192603927334e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 598960, |
| "step": 2795 |
| }, |
| { |
| "epoch": 8.88888888888889, |
| "grad_norm": 0.04823247343301773, |
| "learning_rate": 3.40173638739451e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 600080, |
| "step": 2800 |
| }, |
| { |
| "epoch": 8.904761904761905, |
| "grad_norm": 0.00041813074494712055, |
| "learning_rate": 3.395273250114999e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 601104, |
| "step": 2805 |
| }, |
| { |
| "epoch": 8.920634920634921, |
| "grad_norm": 0.011840839870274067, |
| "learning_rate": 3.388803241692807e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 602160, |
| "step": 2810 |
| }, |
| { |
| "epoch": 8.936507936507937, |
| "grad_norm": 0.00030943809542804956, |
| "learning_rate": 3.382326411784672e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 603184, |
| "step": 2815 |
| }, |
| { |
| "epoch": 8.952380952380953, |
| "grad_norm": 0.0344560369849205, |
| "learning_rate": 3.375842810099692e-05, |
| "loss": 0.0322, |
| "num_input_tokens_seen": 604208, |
| "step": 2820 |
| }, |
| { |
| "epoch": 8.968253968253968, |
| "grad_norm": 0.12215026468038559, |
| "learning_rate": 3.36935248639893e-05, |
| "loss": 0.0157, |
| "num_input_tokens_seen": 605344, |
| "step": 2825 |
| }, |
| { |
| "epoch": 8.984126984126984, |
| "grad_norm": 0.009739014320075512, |
| "learning_rate": 3.362855490495047e-05, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 606416, |
| "step": 2830 |
| }, |
| { |
| "epoch": 9.0, |
| "grad_norm": 0.05252106115221977, |
| "learning_rate": 3.356351872251908e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 607456, |
| "step": 2835 |
| }, |
| { |
| "epoch": 9.0, |
| "eval_loss": 0.15105664730072021, |
| "eval_runtime": 1.4521, |
| "eval_samples_per_second": 48.206, |
| "eval_steps_per_second": 24.103, |
| "num_input_tokens_seen": 607456, |
| "step": 2835 |
| }, |
| { |
| "epoch": 9.015873015873016, |
| "grad_norm": 0.02688731625676155, |
| "learning_rate": 3.349841681584206e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 608528, |
| "step": 2840 |
| }, |
| { |
| "epoch": 9.031746031746032, |
| "grad_norm": 0.0004183761775493622, |
| "learning_rate": 3.343324968457076e-05, |
| "loss": 0.0049, |
| "num_input_tokens_seen": 609584, |
| "step": 2845 |
| }, |
| { |
| "epoch": 9.047619047619047, |
| "grad_norm": 0.000365029409294948, |
| "learning_rate": 3.336801782885712e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 610640, |
| "step": 2850 |
| }, |
| { |
| "epoch": 9.063492063492063, |
| "grad_norm": 0.00026122824056074023, |
| "learning_rate": 3.3302721749349834e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 611680, |
| "step": 2855 |
| }, |
| { |
| "epoch": 9.079365079365079, |
| "grad_norm": 0.00021482273587025702, |
| "learning_rate": 3.3237361947190536e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 612736, |
| "step": 2860 |
| }, |
| { |
| "epoch": 9.095238095238095, |
| "grad_norm": 0.00018885769532062113, |
| "learning_rate": 3.317193892400988e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 613808, |
| "step": 2865 |
| }, |
| { |
| "epoch": 9.11111111111111, |
| "grad_norm": 0.00014850927982479334, |
| "learning_rate": 3.310645318192378e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 614880, |
| "step": 2870 |
| }, |
| { |
| "epoch": 9.126984126984127, |
| "grad_norm": 0.1906156688928604, |
| "learning_rate": 3.304090522352946e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 616000, |
| "step": 2875 |
| }, |
| { |
| "epoch": 9.142857142857142, |
| "grad_norm": 0.14535604417324066, |
| "learning_rate": 3.2975295551901714e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 617072, |
| "step": 2880 |
| }, |
| { |
| "epoch": 9.158730158730158, |
| "grad_norm": 0.0005936230882070959, |
| "learning_rate": 3.290962467058891e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 618080, |
| "step": 2885 |
| }, |
| { |
| "epoch": 9.174603174603174, |
| "grad_norm": 0.00011047060252167284, |
| "learning_rate": 3.284389308360927e-05, |
| "loss": 0.0014, |
| "num_input_tokens_seen": 619152, |
| "step": 2890 |
| }, |
| { |
| "epoch": 9.19047619047619, |
| "grad_norm": 0.0350705049932003, |
| "learning_rate": 3.277810129544685e-05, |
| "loss": 0.0035, |
| "num_input_tokens_seen": 620224, |
| "step": 2895 |
| }, |
| { |
| "epoch": 9.206349206349206, |
| "grad_norm": 8.115587115753442e-05, |
| "learning_rate": 3.2712249811047785e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 621312, |
| "step": 2900 |
| }, |
| { |
| "epoch": 9.222222222222221, |
| "grad_norm": 0.00012116412835894153, |
| "learning_rate": 3.2646339135816386e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 622400, |
| "step": 2905 |
| }, |
| { |
| "epoch": 9.238095238095237, |
| "grad_norm": 0.0076861935667693615, |
| "learning_rate": 3.258036977561123e-05, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 623520, |
| "step": 2910 |
| }, |
| { |
| "epoch": 9.253968253968253, |
| "grad_norm": 0.0010363998590037227, |
| "learning_rate": 3.251434223674129e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 624624, |
| "step": 2915 |
| }, |
| { |
| "epoch": 9.26984126984127, |
| "grad_norm": 0.0008949214825406671, |
| "learning_rate": 3.244825702596205e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 625712, |
| "step": 2920 |
| }, |
| { |
| "epoch": 9.285714285714286, |
| "grad_norm": 0.002219531685113907, |
| "learning_rate": 3.238211465047166e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 626784, |
| "step": 2925 |
| }, |
| { |
| "epoch": 9.301587301587302, |
| "grad_norm": 0.00024516129633411765, |
| "learning_rate": 3.231591561790696e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 627872, |
| "step": 2930 |
| }, |
| { |
| "epoch": 9.317460317460318, |
| "grad_norm": 0.00031894820858724415, |
| "learning_rate": 3.224966043633966e-05, |
| "loss": 0.0014, |
| "num_input_tokens_seen": 628992, |
| "step": 2935 |
| }, |
| { |
| "epoch": 9.333333333333334, |
| "grad_norm": 0.0001425828959327191, |
| "learning_rate": 3.2183349614272374e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 630048, |
| "step": 2940 |
| }, |
| { |
| "epoch": 9.34920634920635, |
| "grad_norm": 0.00026056909700855613, |
| "learning_rate": 3.2116983660634787e-05, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 631120, |
| "step": 2945 |
| }, |
| { |
| "epoch": 9.365079365079366, |
| "grad_norm": 0.00025601257220841944, |
| "learning_rate": 3.205056308477969e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 632128, |
| "step": 2950 |
| }, |
| { |
| "epoch": 9.380952380952381, |
| "grad_norm": 9.780770051293075e-05, |
| "learning_rate": 3.198408839647911e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 633136, |
| "step": 2955 |
| }, |
| { |
| "epoch": 9.396825396825397, |
| "grad_norm": 0.00019243262067902833, |
| "learning_rate": 3.191756010592038e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 634208, |
| "step": 2960 |
| }, |
| { |
| "epoch": 9.412698412698413, |
| "grad_norm": 0.0024152263067662716, |
| "learning_rate": 3.185097872370221e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 635312, |
| "step": 2965 |
| }, |
| { |
| "epoch": 9.428571428571429, |
| "grad_norm": 0.0002794755273498595, |
| "learning_rate": 3.17843447608308e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 636336, |
| "step": 2970 |
| }, |
| { |
| "epoch": 9.444444444444445, |
| "grad_norm": 0.10732641071081161, |
| "learning_rate": 3.17176587287159e-05, |
| "loss": 0.0028, |
| "num_input_tokens_seen": 637472, |
| "step": 2975 |
| }, |
| { |
| "epoch": 9.46031746031746, |
| "grad_norm": 0.0006334662321023643, |
| "learning_rate": 3.165092113916688e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 638576, |
| "step": 2980 |
| }, |
| { |
| "epoch": 9.476190476190476, |
| "grad_norm": 0.000548696902114898, |
| "learning_rate": 3.158413250438882e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 639584, |
| "step": 2985 |
| }, |
| { |
| "epoch": 9.492063492063492, |
| "grad_norm": 0.00021050528448540717, |
| "learning_rate": 3.151729333697854e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 640640, |
| "step": 2990 |
| }, |
| { |
| "epoch": 9.507936507936508, |
| "grad_norm": 0.0002193010732298717, |
| "learning_rate": 3.1450404149920736e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 641696, |
| "step": 2995 |
| }, |
| { |
| "epoch": 9.523809523809524, |
| "grad_norm": 0.7367925643920898, |
| "learning_rate": 3.138346545658397e-05, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 642816, |
| "step": 3000 |
| }, |
| { |
| "epoch": 9.53968253968254, |
| "grad_norm": 0.11529196798801422, |
| "learning_rate": 3.131647777071677e-05, |
| "loss": 0.0137, |
| "num_input_tokens_seen": 643920, |
| "step": 3005 |
| }, |
| { |
| "epoch": 9.555555555555555, |
| "grad_norm": 8.780050120549276e-05, |
| "learning_rate": 3.1249441606443665e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 644944, |
| "step": 3010 |
| }, |
| { |
| "epoch": 9.571428571428571, |
| "grad_norm": 0.14377683401107788, |
| "learning_rate": 3.1182357478261274e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 645968, |
| "step": 3015 |
| }, |
| { |
| "epoch": 9.587301587301587, |
| "grad_norm": 0.00015247806732077152, |
| "learning_rate": 3.111522590103432e-05, |
| "loss": 0.0251, |
| "num_input_tokens_seen": 647040, |
| "step": 3020 |
| }, |
| { |
| "epoch": 9.603174603174603, |
| "grad_norm": 0.00019754045933950692, |
| "learning_rate": 3.104804738999169e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 648112, |
| "step": 3025 |
| }, |
| { |
| "epoch": 9.619047619047619, |
| "grad_norm": 0.00045227553346194327, |
| "learning_rate": 3.0980822460722504e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 649168, |
| "step": 3030 |
| }, |
| { |
| "epoch": 9.634920634920634, |
| "grad_norm": 0.14106620848178864, |
| "learning_rate": 3.091355162917211e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 650192, |
| "step": 3035 |
| }, |
| { |
| "epoch": 9.65079365079365, |
| "grad_norm": 6.160133361816406, |
| "learning_rate": 3.084623541163817e-05, |
| "loss": 0.0301, |
| "num_input_tokens_seen": 651280, |
| "step": 3040 |
| }, |
| { |
| "epoch": 9.666666666666666, |
| "grad_norm": 0.0001889690029202029, |
| "learning_rate": 3.0778874324766676e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 652288, |
| "step": 3045 |
| }, |
| { |
| "epoch": 9.682539682539682, |
| "grad_norm": 0.001083326991647482, |
| "learning_rate": 3.071146888554799e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 653408, |
| "step": 3050 |
| }, |
| { |
| "epoch": 9.698412698412698, |
| "grad_norm": 0.002526765689253807, |
| "learning_rate": 3.0644019611312865e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 654496, |
| "step": 3055 |
| }, |
| { |
| "epoch": 9.714285714285714, |
| "grad_norm": 0.0024127070792019367, |
| "learning_rate": 3.057652701972848e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 655632, |
| "step": 3060 |
| }, |
| { |
| "epoch": 9.73015873015873, |
| "grad_norm": 0.0006554791470989585, |
| "learning_rate": 3.050899162879451e-05, |
| "loss": 0.0202, |
| "num_input_tokens_seen": 656720, |
| "step": 3065 |
| }, |
| { |
| "epoch": 9.746031746031747, |
| "grad_norm": 0.008043559268116951, |
| "learning_rate": 3.044141395683906e-05, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 657824, |
| "step": 3070 |
| }, |
| { |
| "epoch": 9.761904761904763, |
| "grad_norm": 0.00044315162813290954, |
| "learning_rate": 3.037379452251477e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 658912, |
| "step": 3075 |
| }, |
| { |
| "epoch": 9.777777777777779, |
| "grad_norm": 0.0003389069461263716, |
| "learning_rate": 3.0306133844794783e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 660000, |
| "step": 3080 |
| }, |
| { |
| "epoch": 9.793650793650794, |
| "grad_norm": 0.0010802766773849726, |
| "learning_rate": 3.02384324429688e-05, |
| "loss": 0.0407, |
| "num_input_tokens_seen": 661040, |
| "step": 3085 |
| }, |
| { |
| "epoch": 9.80952380952381, |
| "grad_norm": 6.0305914878845215, |
| "learning_rate": 3.0170690836639065e-05, |
| "loss": 0.0093, |
| "num_input_tokens_seen": 662016, |
| "step": 3090 |
| }, |
| { |
| "epoch": 9.825396825396826, |
| "grad_norm": 0.0006042938912287354, |
| "learning_rate": 3.0102909545716396e-05, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 663040, |
| "step": 3095 |
| }, |
| { |
| "epoch": 9.841269841269842, |
| "grad_norm": 0.00038514367770403624, |
| "learning_rate": 3.003508909041617e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 664096, |
| "step": 3100 |
| }, |
| { |
| "epoch": 9.857142857142858, |
| "grad_norm": 0.0013175939675420523, |
| "learning_rate": 2.9967229991254363e-05, |
| "loss": 0.0024, |
| "num_input_tokens_seen": 665104, |
| "step": 3105 |
| }, |
| { |
| "epoch": 9.873015873015873, |
| "grad_norm": 0.0002575514663476497, |
| "learning_rate": 2.989933276904353e-05, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 666272, |
| "step": 3110 |
| }, |
| { |
| "epoch": 9.88888888888889, |
| "grad_norm": 0.00040172351873479784, |
| "learning_rate": 2.9831397944888833e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 667344, |
| "step": 3115 |
| }, |
| { |
| "epoch": 9.904761904761905, |
| "grad_norm": 0.0011199676664546132, |
| "learning_rate": 2.9763426040184007e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 668400, |
| "step": 3120 |
| }, |
| { |
| "epoch": 9.920634920634921, |
| "grad_norm": 0.0007416466833092272, |
| "learning_rate": 2.9695417576607376e-05, |
| "loss": 0.0012, |
| "num_input_tokens_seen": 669504, |
| "step": 3125 |
| }, |
| { |
| "epoch": 9.936507936507937, |
| "grad_norm": 0.00034860780579037964, |
| "learning_rate": 2.9627373076117863e-05, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 670608, |
| "step": 3130 |
| }, |
| { |
| "epoch": 9.952380952380953, |
| "grad_norm": 0.0011542192660272121, |
| "learning_rate": 2.9559293060950977e-05, |
| "loss": 0.0103, |
| "num_input_tokens_seen": 671632, |
| "step": 3135 |
| }, |
| { |
| "epoch": 9.968253968253968, |
| "grad_norm": 0.00018505194748286158, |
| "learning_rate": 2.9491178053614776e-05, |
| "loss": 0.0218, |
| "num_input_tokens_seen": 672736, |
| "step": 3140 |
| }, |
| { |
| "epoch": 9.984126984126984, |
| "grad_norm": 0.0003018953138962388, |
| "learning_rate": 2.9423028576885893e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 673760, |
| "step": 3145 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 0.0003000242286361754, |
| "learning_rate": 2.9354845153805505e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 674784, |
| "step": 3150 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_loss": 0.1815599650144577, |
| "eval_runtime": 1.4451, |
| "eval_samples_per_second": 48.439, |
| "eval_steps_per_second": 24.219, |
| "num_input_tokens_seen": 674784, |
| "step": 3150 |
| }, |
| { |
| "epoch": 10.015873015873016, |
| "grad_norm": 0.0002667237422429025, |
| "learning_rate": 2.928662830767534e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 675840, |
| "step": 3155 |
| }, |
| { |
| "epoch": 10.031746031746032, |
| "grad_norm": 1.0574944019317627, |
| "learning_rate": 2.9218378562053623e-05, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 676896, |
| "step": 3160 |
| }, |
| { |
| "epoch": 10.047619047619047, |
| "grad_norm": 0.00017189487698487937, |
| "learning_rate": 2.9150096440751107e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 677952, |
| "step": 3165 |
| }, |
| { |
| "epoch": 10.063492063492063, |
| "grad_norm": 0.0003118932945653796, |
| "learning_rate": 2.908178246782698e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 678976, |
| "step": 3170 |
| }, |
| { |
| "epoch": 10.079365079365079, |
| "grad_norm": 0.23004379868507385, |
| "learning_rate": 2.9013437167584944e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 680016, |
| "step": 3175 |
| }, |
| { |
| "epoch": 10.095238095238095, |
| "grad_norm": 0.0002921000123023987, |
| "learning_rate": 2.894506106456909e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 681088, |
| "step": 3180 |
| }, |
| { |
| "epoch": 10.11111111111111, |
| "grad_norm": 0.05020623281598091, |
| "learning_rate": 2.8876654683559944e-05, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 682160, |
| "step": 3185 |
| }, |
| { |
| "epoch": 10.126984126984127, |
| "grad_norm": 0.0664471909403801, |
| "learning_rate": 2.8808218549570408e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 683232, |
| "step": 3190 |
| }, |
| { |
| "epoch": 10.142857142857142, |
| "grad_norm": 0.05454224720597267, |
| "learning_rate": 2.8739753187841733e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 684304, |
| "step": 3195 |
| }, |
| { |
| "epoch": 10.158730158730158, |
| "grad_norm": 0.004288592375814915, |
| "learning_rate": 2.8671259123839472e-05, |
| "loss": 0.0078, |
| "num_input_tokens_seen": 685440, |
| "step": 3200 |
| }, |
| { |
| "epoch": 10.174603174603174, |
| "grad_norm": 0.0005515673547051847, |
| "learning_rate": 2.8602736883249503e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 686576, |
| "step": 3205 |
| }, |
| { |
| "epoch": 10.19047619047619, |
| "grad_norm": 0.005215165205299854, |
| "learning_rate": 2.8534186991973932e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 687632, |
| "step": 3210 |
| }, |
| { |
| "epoch": 10.206349206349206, |
| "grad_norm": 0.01046276930719614, |
| "learning_rate": 2.8465609976127082e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 688704, |
| "step": 3215 |
| }, |
| { |
| "epoch": 10.222222222222221, |
| "grad_norm": 0.000193351210327819, |
| "learning_rate": 2.839700636203146e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 689776, |
| "step": 3220 |
| }, |
| { |
| "epoch": 10.238095238095237, |
| "grad_norm": 0.014234524220228195, |
| "learning_rate": 2.8328376676213713e-05, |
| "loss": 0.0029, |
| "num_input_tokens_seen": 690864, |
| "step": 3225 |
| }, |
| { |
| "epoch": 10.253968253968253, |
| "grad_norm": 0.0002087104512611404, |
| "learning_rate": 2.8259721445400577e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 691904, |
| "step": 3230 |
| }, |
| { |
| "epoch": 10.26984126984127, |
| "grad_norm": 0.006072845309972763, |
| "learning_rate": 2.8191041196514873e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 692992, |
| "step": 3235 |
| }, |
| { |
| "epoch": 10.285714285714286, |
| "grad_norm": 0.059259865432977676, |
| "learning_rate": 2.8122336456671378e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 694016, |
| "step": 3240 |
| }, |
| { |
| "epoch": 10.301587301587302, |
| "grad_norm": 8.876676559448242, |
| "learning_rate": 2.8053607753172895e-05, |
| "loss": 0.0183, |
| "num_input_tokens_seen": 695152, |
| "step": 3245 |
| }, |
| { |
| "epoch": 10.317460317460318, |
| "grad_norm": 0.0069928658194839954, |
| "learning_rate": 2.7984855613506107e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 696176, |
| "step": 3250 |
| }, |
| { |
| "epoch": 10.333333333333334, |
| "grad_norm": 0.00021716665651183575, |
| "learning_rate": 2.791608056533759e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 697312, |
| "step": 3255 |
| }, |
| { |
| "epoch": 10.34920634920635, |
| "grad_norm": 0.00015753868501633406, |
| "learning_rate": 2.7847283136509717e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 698336, |
| "step": 3260 |
| }, |
| { |
| "epoch": 10.365079365079366, |
| "grad_norm": 0.00016610305465292186, |
| "learning_rate": 2.7778463855036657e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 699488, |
| "step": 3265 |
| }, |
| { |
| "epoch": 10.380952380952381, |
| "grad_norm": 0.00039039889816194773, |
| "learning_rate": 2.770962324910027e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 700512, |
| "step": 3270 |
| }, |
| { |
| "epoch": 10.396825396825397, |
| "grad_norm": 0.00014512175403069705, |
| "learning_rate": 2.7640761847046105e-05, |
| "loss": 0.005, |
| "num_input_tokens_seen": 701552, |
| "step": 3275 |
| }, |
| { |
| "epoch": 10.412698412698413, |
| "grad_norm": 0.0017348774708807468, |
| "learning_rate": 2.75718801773793e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 702672, |
| "step": 3280 |
| }, |
| { |
| "epoch": 10.428571428571429, |
| "grad_norm": 0.00017332640709355474, |
| "learning_rate": 2.750297876876055e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 703712, |
| "step": 3285 |
| }, |
| { |
| "epoch": 10.444444444444445, |
| "grad_norm": 0.00014672847464680672, |
| "learning_rate": 2.743405815000205e-05, |
| "loss": 0.0309, |
| "num_input_tokens_seen": 704816, |
| "step": 3290 |
| }, |
| { |
| "epoch": 10.46031746031746, |
| "grad_norm": 0.0005181178566999733, |
| "learning_rate": 2.736511885006343e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 705904, |
| "step": 3295 |
| }, |
| { |
| "epoch": 10.476190476190476, |
| "grad_norm": 0.0001759826991474256, |
| "learning_rate": 2.729616139804769e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 706944, |
| "step": 3300 |
| }, |
| { |
| "epoch": 10.492063492063492, |
| "grad_norm": 0.004806222394108772, |
| "learning_rate": 2.7227186323197162e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 708048, |
| "step": 3305 |
| }, |
| { |
| "epoch": 10.507936507936508, |
| "grad_norm": 0.00019694813818205148, |
| "learning_rate": 2.7158194154889394e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 709040, |
| "step": 3310 |
| }, |
| { |
| "epoch": 10.523809523809524, |
| "grad_norm": 0.005902845412492752, |
| "learning_rate": 2.7089185422633178e-05, |
| "loss": 0.0094, |
| "num_input_tokens_seen": 710112, |
| "step": 3315 |
| }, |
| { |
| "epoch": 10.53968253968254, |
| "grad_norm": 0.0002289386175107211, |
| "learning_rate": 2.7020160656064382e-05, |
| "loss": 0.0042, |
| "num_input_tokens_seen": 711120, |
| "step": 3320 |
| }, |
| { |
| "epoch": 10.555555555555555, |
| "grad_norm": 0.00042782543459907174, |
| "learning_rate": 2.695112038494198e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 712272, |
| "step": 3325 |
| }, |
| { |
| "epoch": 10.571428571428571, |
| "grad_norm": 0.014789941720664501, |
| "learning_rate": 2.6882065139143907e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 713360, |
| "step": 3330 |
| }, |
| { |
| "epoch": 10.587301587301587, |
| "grad_norm": 1.8601882457733154, |
| "learning_rate": 2.6812995448663047e-05, |
| "loss": 0.0011, |
| "num_input_tokens_seen": 714496, |
| "step": 3335 |
| }, |
| { |
| "epoch": 10.603174603174603, |
| "grad_norm": 0.0001709021016722545, |
| "learning_rate": 2.674391184360313e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 715568, |
| "step": 3340 |
| }, |
| { |
| "epoch": 10.619047619047619, |
| "grad_norm": 0.0006503509357571602, |
| "learning_rate": 2.6674814854174708e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 716688, |
| "step": 3345 |
| }, |
| { |
| "epoch": 10.634920634920634, |
| "grad_norm": 0.0003370628983248025, |
| "learning_rate": 2.6605705010691025e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 717664, |
| "step": 3350 |
| }, |
| { |
| "epoch": 10.65079365079365, |
| "grad_norm": 11.525466918945312, |
| "learning_rate": 2.6536582843563995e-05, |
| "loss": 0.0642, |
| "num_input_tokens_seen": 718784, |
| "step": 3355 |
| }, |
| { |
| "epoch": 10.666666666666666, |
| "grad_norm": 0.011019658297300339, |
| "learning_rate": 2.6467448883300104e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 719840, |
| "step": 3360 |
| }, |
| { |
| "epoch": 10.682539682539682, |
| "grad_norm": 0.0003383358125574887, |
| "learning_rate": 2.6398303660496376e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 720960, |
| "step": 3365 |
| }, |
| { |
| "epoch": 10.698412698412698, |
| "grad_norm": 0.0012447485933080316, |
| "learning_rate": 2.6329147705836238e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 722064, |
| "step": 3370 |
| }, |
| { |
| "epoch": 10.714285714285714, |
| "grad_norm": 0.00022383588657248765, |
| "learning_rate": 2.6259981550085504e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 723152, |
| "step": 3375 |
| }, |
| { |
| "epoch": 10.73015873015873, |
| "grad_norm": 0.00043943486525677145, |
| "learning_rate": 2.6190805724088274e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 724208, |
| "step": 3380 |
| }, |
| { |
| "epoch": 10.746031746031747, |
| "grad_norm": 0.019387539476156235, |
| "learning_rate": 2.6121620758762877e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 725296, |
| "step": 3385 |
| }, |
| { |
| "epoch": 10.761904761904763, |
| "grad_norm": 0.026385366916656494, |
| "learning_rate": 2.6052427185097765e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 726384, |
| "step": 3390 |
| }, |
| { |
| "epoch": 10.777777777777779, |
| "grad_norm": 0.0005208961665630341, |
| "learning_rate": 2.598322553414749e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 727424, |
| "step": 3395 |
| }, |
| { |
| "epoch": 10.793650793650794, |
| "grad_norm": 0.0025588928256183863, |
| "learning_rate": 2.591401633702856e-05, |
| "loss": 0.001, |
| "num_input_tokens_seen": 728528, |
| "step": 3400 |
| }, |
| { |
| "epoch": 10.80952380952381, |
| "grad_norm": 0.0004624544526450336, |
| "learning_rate": 2.584480012491542e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 729616, |
| "step": 3405 |
| }, |
| { |
| "epoch": 10.825396825396826, |
| "grad_norm": 0.016153214499354362, |
| "learning_rate": 2.5775577429036345e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 730640, |
| "step": 3410 |
| }, |
| { |
| "epoch": 10.841269841269842, |
| "grad_norm": 0.00031025870703160763, |
| "learning_rate": 2.5706348780669393e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 731712, |
| "step": 3415 |
| }, |
| { |
| "epoch": 10.857142857142858, |
| "grad_norm": 0.0006250953883863986, |
| "learning_rate": 2.5637114711138282e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 732720, |
| "step": 3420 |
| }, |
| { |
| "epoch": 10.873015873015873, |
| "grad_norm": 0.013909174129366875, |
| "learning_rate": 2.5567875751808353e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 733792, |
| "step": 3425 |
| }, |
| { |
| "epoch": 10.88888888888889, |
| "grad_norm": 0.0007239268743433058, |
| "learning_rate": 2.5498632434082452e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 734880, |
| "step": 3430 |
| }, |
| { |
| "epoch": 10.904761904761905, |
| "grad_norm": 0.0003793005016632378, |
| "learning_rate": 2.542938528939691e-05, |
| "loss": 0.0014, |
| "num_input_tokens_seen": 735936, |
| "step": 3435 |
| }, |
| { |
| "epoch": 10.920634920634921, |
| "grad_norm": 0.00018004873709287494, |
| "learning_rate": 2.5360134849217416e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 736976, |
| "step": 3440 |
| }, |
| { |
| "epoch": 10.936507936507937, |
| "grad_norm": 0.0006164236110635102, |
| "learning_rate": 2.5290881645034932e-05, |
| "loss": 0.0292, |
| "num_input_tokens_seen": 738064, |
| "step": 3445 |
| }, |
| { |
| "epoch": 10.952380952380953, |
| "grad_norm": 0.0004222550487611443, |
| "learning_rate": 2.5221626208361655e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 739152, |
| "step": 3450 |
| }, |
| { |
| "epoch": 10.968253968253968, |
| "grad_norm": 0.0003485404886305332, |
| "learning_rate": 2.515236907072691e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 740240, |
| "step": 3455 |
| }, |
| { |
| "epoch": 10.984126984126984, |
| "grad_norm": 0.0003257024218328297, |
| "learning_rate": 2.5083110763673085e-05, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 741328, |
| "step": 3460 |
| }, |
| { |
| "epoch": 11.0, |
| "grad_norm": 0.0008957489626482129, |
| "learning_rate": 2.5013851818751534e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 742336, |
| "step": 3465 |
| }, |
| { |
| "epoch": 11.0, |
| "eval_loss": 0.19778476655483246, |
| "eval_runtime": 1.4538, |
| "eval_samples_per_second": 48.149, |
| "eval_steps_per_second": 24.075, |
| "num_input_tokens_seen": 742336, |
| "step": 3465 |
| }, |
| { |
| "epoch": 11.015873015873016, |
| "grad_norm": 0.00035931816091760993, |
| "learning_rate": 2.4944592767518495e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 743456, |
| "step": 3470 |
| }, |
| { |
| "epoch": 11.031746031746032, |
| "grad_norm": 0.00027192573179490864, |
| "learning_rate": 2.4875334141531052e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 744528, |
| "step": 3475 |
| }, |
| { |
| "epoch": 11.047619047619047, |
| "grad_norm": 0.004379758145660162, |
| "learning_rate": 2.4806076472342997e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 745520, |
| "step": 3480 |
| }, |
| { |
| "epoch": 11.063492063492063, |
| "grad_norm": 0.0010909591801464558, |
| "learning_rate": 2.4736820291500793e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 746592, |
| "step": 3485 |
| }, |
| { |
| "epoch": 11.079365079365079, |
| "grad_norm": 0.0003718891239259392, |
| "learning_rate": 2.466756613053948e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 747696, |
| "step": 3490 |
| }, |
| { |
| "epoch": 11.095238095238095, |
| "grad_norm": 18.98329734802246, |
| "learning_rate": 2.459831452097859e-05, |
| "loss": 0.0252, |
| "num_input_tokens_seen": 748816, |
| "step": 3495 |
| }, |
| { |
| "epoch": 11.11111111111111, |
| "grad_norm": 0.003277825890108943, |
| "learning_rate": 2.4529065994318078e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 749840, |
| "step": 3500 |
| }, |
| { |
| "epoch": 11.126984126984127, |
| "grad_norm": 0.013009368441998959, |
| "learning_rate": 2.445982108203422e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 750944, |
| "step": 3505 |
| }, |
| { |
| "epoch": 11.142857142857142, |
| "grad_norm": 0.03715138137340546, |
| "learning_rate": 2.43905803155756e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 752000, |
| "step": 3510 |
| }, |
| { |
| "epoch": 11.158730158730158, |
| "grad_norm": 0.0052580940537154675, |
| "learning_rate": 2.432134422635893e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 753152, |
| "step": 3515 |
| }, |
| { |
| "epoch": 11.174603174603174, |
| "grad_norm": 0.0003532212576828897, |
| "learning_rate": 2.4252113345765046e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 754224, |
| "step": 3520 |
| }, |
| { |
| "epoch": 11.19047619047619, |
| "grad_norm": 0.0006754444329999387, |
| "learning_rate": 2.4182888205134797e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 755312, |
| "step": 3525 |
| }, |
| { |
| "epoch": 11.206349206349206, |
| "grad_norm": 0.00020079000387340784, |
| "learning_rate": 2.4113669335765017e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 756336, |
| "step": 3530 |
| }, |
| { |
| "epoch": 11.222222222222221, |
| "grad_norm": 0.00023593910736963153, |
| "learning_rate": 2.404445726890437e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 757360, |
| "step": 3535 |
| }, |
| { |
| "epoch": 11.238095238095237, |
| "grad_norm": 0.0003324486897327006, |
| "learning_rate": 2.397525253574931e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 758400, |
| "step": 3540 |
| }, |
| { |
| "epoch": 11.253968253968253, |
| "grad_norm": 0.003988176584243774, |
| "learning_rate": 2.390605566744002e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 759456, |
| "step": 3545 |
| }, |
| { |
| "epoch": 11.26984126984127, |
| "grad_norm": 0.0004338165163062513, |
| "learning_rate": 2.3836867195056335e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 760480, |
| "step": 3550 |
| }, |
| { |
| "epoch": 11.285714285714286, |
| "grad_norm": 0.0001763407635735348, |
| "learning_rate": 2.376768764961362e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 761552, |
| "step": 3555 |
| }, |
| { |
| "epoch": 11.301587301587302, |
| "grad_norm": 0.0491175577044487, |
| "learning_rate": 2.3698517562058758e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 762624, |
| "step": 3560 |
| }, |
| { |
| "epoch": 11.317460317460318, |
| "grad_norm": 0.00014863189426250756, |
| "learning_rate": 2.3629357463265995e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 763696, |
| "step": 3565 |
| }, |
| { |
| "epoch": 11.333333333333334, |
| "grad_norm": 0.004388798493891954, |
| "learning_rate": 2.3560207884032987e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 764800, |
| "step": 3570 |
| }, |
| { |
| "epoch": 11.34920634920635, |
| "grad_norm": 0.0027450949419289827, |
| "learning_rate": 2.349106935507659e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 765872, |
| "step": 3575 |
| }, |
| { |
| "epoch": 11.365079365079366, |
| "grad_norm": 0.0010159960947930813, |
| "learning_rate": 2.342194240702888e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 766928, |
| "step": 3580 |
| }, |
| { |
| "epoch": 11.380952380952381, |
| "grad_norm": 0.0001987464347621426, |
| "learning_rate": 2.3352827570433036e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 768064, |
| "step": 3585 |
| }, |
| { |
| "epoch": 11.396825396825397, |
| "grad_norm": 0.001201266422867775, |
| "learning_rate": 2.3283725375739303e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 769168, |
| "step": 3590 |
| }, |
| { |
| "epoch": 11.412698412698413, |
| "grad_norm": 0.03675874322652817, |
| "learning_rate": 2.321463635330088e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 770224, |
| "step": 3595 |
| }, |
| { |
| "epoch": 11.428571428571429, |
| "grad_norm": 0.000684223894495517, |
| "learning_rate": 2.3145561033369877e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 771296, |
| "step": 3600 |
| }, |
| { |
| "epoch": 11.444444444444445, |
| "grad_norm": 0.0001652841456234455, |
| "learning_rate": 2.3076499946093243e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 772400, |
| "step": 3605 |
| }, |
| { |
| "epoch": 11.46031746031746, |
| "grad_norm": 0.02017052099108696, |
| "learning_rate": 2.300745362150869e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 773456, |
| "step": 3610 |
| }, |
| { |
| "epoch": 11.476190476190476, |
| "grad_norm": 0.00025198451476171613, |
| "learning_rate": 2.2938422589540627e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 774432, |
| "step": 3615 |
| }, |
| { |
| "epoch": 11.492063492063492, |
| "grad_norm": 0.014424529857933521, |
| "learning_rate": 2.2869407379996088e-05, |
| "loss": 0.0071, |
| "num_input_tokens_seen": 775520, |
| "step": 3620 |
| }, |
| { |
| "epoch": 11.507936507936508, |
| "grad_norm": 0.0003219831851311028, |
| "learning_rate": 2.2800408522560678e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 776544, |
| "step": 3625 |
| }, |
| { |
| "epoch": 11.523809523809524, |
| "grad_norm": 0.0002450917090754956, |
| "learning_rate": 2.2731426546794508e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 777728, |
| "step": 3630 |
| }, |
| { |
| "epoch": 11.53968253968254, |
| "grad_norm": 0.00040673837065696716, |
| "learning_rate": 2.2662461982128108e-05, |
| "loss": 0.0198, |
| "num_input_tokens_seen": 778784, |
| "step": 3635 |
| }, |
| { |
| "epoch": 11.555555555555555, |
| "grad_norm": 0.00013897109602112323, |
| "learning_rate": 2.259351535785839e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 779888, |
| "step": 3640 |
| }, |
| { |
| "epoch": 11.571428571428571, |
| "grad_norm": 0.000246451236307621, |
| "learning_rate": 2.2524587203144565e-05, |
| "loss": 0.0178, |
| "num_input_tokens_seen": 780960, |
| "step": 3645 |
| }, |
| { |
| "epoch": 11.587301587301587, |
| "grad_norm": 0.00041408365359529853, |
| "learning_rate": 2.2455678047004107e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 782048, |
| "step": 3650 |
| }, |
| { |
| "epoch": 11.603174603174603, |
| "grad_norm": 0.0011768187396228313, |
| "learning_rate": 2.238678841830867e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 783104, |
| "step": 3655 |
| }, |
| { |
| "epoch": 11.619047619047619, |
| "grad_norm": 0.01867389678955078, |
| "learning_rate": 2.2317918845780027e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 784160, |
| "step": 3660 |
| }, |
| { |
| "epoch": 11.634920634920634, |
| "grad_norm": 0.00020991513156332076, |
| "learning_rate": 2.2249069857986027e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 785264, |
| "step": 3665 |
| }, |
| { |
| "epoch": 11.65079365079365, |
| "grad_norm": 0.00016364398470614105, |
| "learning_rate": 2.218024198333656e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 786288, |
| "step": 3670 |
| }, |
| { |
| "epoch": 11.666666666666666, |
| "grad_norm": 0.0001746386114973575, |
| "learning_rate": 2.2111435750079434e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 787360, |
| "step": 3675 |
| }, |
| { |
| "epoch": 11.682539682539682, |
| "grad_norm": 0.0014488694723695517, |
| "learning_rate": 2.2042651686296378e-05, |
| "loss": 0.0035, |
| "num_input_tokens_seen": 788400, |
| "step": 3680 |
| }, |
| { |
| "epoch": 11.698412698412698, |
| "grad_norm": 0.0005274708964861929, |
| "learning_rate": 2.1973890319898963e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 789472, |
| "step": 3685 |
| }, |
| { |
| "epoch": 11.714285714285714, |
| "grad_norm": 0.0021408062893897295, |
| "learning_rate": 2.1905152178624595e-05, |
| "loss": 0.002, |
| "num_input_tokens_seen": 790512, |
| "step": 3690 |
| }, |
| { |
| "epoch": 11.73015873015873, |
| "grad_norm": 0.001208798261359334, |
| "learning_rate": 2.183643779003239e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 791600, |
| "step": 3695 |
| }, |
| { |
| "epoch": 11.746031746031747, |
| "grad_norm": 0.0007551803719252348, |
| "learning_rate": 2.1767747681499176e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 792704, |
| "step": 3700 |
| }, |
| { |
| "epoch": 11.761904761904763, |
| "grad_norm": 0.00040706252912059426, |
| "learning_rate": 2.1699082380215425e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 793792, |
| "step": 3705 |
| }, |
| { |
| "epoch": 11.777777777777779, |
| "grad_norm": 0.0019093899754807353, |
| "learning_rate": 2.1630442413181246e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 794944, |
| "step": 3710 |
| }, |
| { |
| "epoch": 11.793650793650794, |
| "grad_norm": 0.0044355797581374645, |
| "learning_rate": 2.156182830720228e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 795968, |
| "step": 3715 |
| }, |
| { |
| "epoch": 11.80952380952381, |
| "grad_norm": 0.0013154816115275025, |
| "learning_rate": 2.14932405888857e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 797040, |
| "step": 3720 |
| }, |
| { |
| "epoch": 11.825396825396826, |
| "grad_norm": 0.05848317593336105, |
| "learning_rate": 2.1424679784636144e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 798064, |
| "step": 3725 |
| }, |
| { |
| "epoch": 11.841269841269842, |
| "grad_norm": 0.00017026295245159417, |
| "learning_rate": 2.1356146420651706e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 799104, |
| "step": 3730 |
| }, |
| { |
| "epoch": 11.857142857142858, |
| "grad_norm": 0.0005860523087903857, |
| "learning_rate": 2.1287641022919866e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 800240, |
| "step": 3735 |
| }, |
| { |
| "epoch": 11.873015873015873, |
| "grad_norm": 0.0011898577213287354, |
| "learning_rate": 2.121916411721346e-05, |
| "loss": 0.0344, |
| "num_input_tokens_seen": 801312, |
| "step": 3740 |
| }, |
| { |
| "epoch": 11.88888888888889, |
| "grad_norm": 1.2069417238235474, |
| "learning_rate": 2.115071622908666e-05, |
| "loss": 0.001, |
| "num_input_tokens_seen": 802352, |
| "step": 3745 |
| }, |
| { |
| "epoch": 11.904761904761905, |
| "grad_norm": 0.00797231961041689, |
| "learning_rate": 2.1082297883870937e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 803424, |
| "step": 3750 |
| }, |
| { |
| "epoch": 11.920634920634921, |
| "grad_norm": 0.0008888400625437498, |
| "learning_rate": 2.1013909606671004e-05, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 804512, |
| "step": 3755 |
| }, |
| { |
| "epoch": 11.936507936507937, |
| "grad_norm": 0.0006856803665868938, |
| "learning_rate": 2.0945551922360818e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 805584, |
| "step": 3760 |
| }, |
| { |
| "epoch": 11.952380952380953, |
| "grad_norm": 3.1141719818115234, |
| "learning_rate": 2.087722535557953e-05, |
| "loss": 0.0026, |
| "num_input_tokens_seen": 806608, |
| "step": 3765 |
| }, |
| { |
| "epoch": 11.968253968253968, |
| "grad_norm": 0.00021470033971127123, |
| "learning_rate": 2.0808930430727484e-05, |
| "loss": 0.0326, |
| "num_input_tokens_seen": 807680, |
| "step": 3770 |
| }, |
| { |
| "epoch": 11.984126984126984, |
| "grad_norm": 0.00046644502435810864, |
| "learning_rate": 2.0740667671962156e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 808720, |
| "step": 3775 |
| }, |
| { |
| "epoch": 12.0, |
| "grad_norm": 0.0005882336990907788, |
| "learning_rate": 2.067243760319415e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 809792, |
| "step": 3780 |
| }, |
| { |
| "epoch": 12.0, |
| "eval_loss": 0.17750194668769836, |
| "eval_runtime": 1.4455, |
| "eval_samples_per_second": 48.426, |
| "eval_steps_per_second": 24.213, |
| "num_input_tokens_seen": 809792, |
| "step": 3780 |
| }, |
| { |
| "epoch": 12.015873015873016, |
| "grad_norm": 0.002724026096984744, |
| "learning_rate": 2.060424074808319e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 810880, |
| "step": 3785 |
| }, |
| { |
| "epoch": 12.031746031746032, |
| "grad_norm": 0.002534243743866682, |
| "learning_rate": 2.0536077630034086e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 811920, |
| "step": 3790 |
| }, |
| { |
| "epoch": 12.047619047619047, |
| "grad_norm": 0.051372405141592026, |
| "learning_rate": 2.0467948772192713e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 812976, |
| "step": 3795 |
| }, |
| { |
| "epoch": 12.063492063492063, |
| "grad_norm": 0.0038024040404707193, |
| "learning_rate": 2.0399854697442e-05, |
| "loss": 0.007, |
| "num_input_tokens_seen": 814112, |
| "step": 3800 |
| }, |
| { |
| "epoch": 12.079365079365079, |
| "grad_norm": 0.1364995837211609, |
| "learning_rate": 2.0331795928397916e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 815152, |
| "step": 3805 |
| }, |
| { |
| "epoch": 12.095238095238095, |
| "grad_norm": 0.034437455236911774, |
| "learning_rate": 2.0263772987405494e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 816224, |
| "step": 3810 |
| }, |
| { |
| "epoch": 12.11111111111111, |
| "grad_norm": 0.00039293619920499623, |
| "learning_rate": 2.0195786396534743e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 817328, |
| "step": 3815 |
| }, |
| { |
| "epoch": 12.126984126984127, |
| "grad_norm": 0.00905533879995346, |
| "learning_rate": 2.0127836677576717e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 818416, |
| "step": 3820 |
| }, |
| { |
| "epoch": 12.142857142857142, |
| "grad_norm": 0.001928988378494978, |
| "learning_rate": 2.0059924352039463e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 819536, |
| "step": 3825 |
| }, |
| { |
| "epoch": 12.158730158730158, |
| "grad_norm": 0.000600858882535249, |
| "learning_rate": 1.9992049941144066e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 820608, |
| "step": 3830 |
| }, |
| { |
| "epoch": 12.174603174603174, |
| "grad_norm": 0.0003672520397230983, |
| "learning_rate": 1.99242139658206e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 821664, |
| "step": 3835 |
| }, |
| { |
| "epoch": 12.19047619047619, |
| "grad_norm": 0.00022892758715897799, |
| "learning_rate": 1.985641694670414e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 822768, |
| "step": 3840 |
| }, |
| { |
| "epoch": 12.206349206349206, |
| "grad_norm": 0.00028045850922353566, |
| "learning_rate": 1.9788659404130776e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 823792, |
| "step": 3845 |
| }, |
| { |
| "epoch": 12.222222222222221, |
| "grad_norm": 0.00015690652071498334, |
| "learning_rate": 1.9720941858133658e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 824928, |
| "step": 3850 |
| }, |
| { |
| "epoch": 12.238095238095237, |
| "grad_norm": 0.00023055235214997083, |
| "learning_rate": 1.9653264828438923e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 825952, |
| "step": 3855 |
| }, |
| { |
| "epoch": 12.253968253968253, |
| "grad_norm": 0.00021435992675833404, |
| "learning_rate": 1.9585628834461766e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 826960, |
| "step": 3860 |
| }, |
| { |
| "epoch": 12.26984126984127, |
| "grad_norm": 0.0001954881736310199, |
| "learning_rate": 1.9518034395302414e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 827968, |
| "step": 3865 |
| }, |
| { |
| "epoch": 12.285714285714286, |
| "grad_norm": 0.0007147770957089961, |
| "learning_rate": 1.9450482029742217e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 829040, |
| "step": 3870 |
| }, |
| { |
| "epoch": 12.301587301587302, |
| "grad_norm": 0.4170708954334259, |
| "learning_rate": 1.9382972256239563e-05, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 830080, |
| "step": 3875 |
| }, |
| { |
| "epoch": 12.317460317460318, |
| "grad_norm": 0.000531713361851871, |
| "learning_rate": 1.931550559292597e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 831104, |
| "step": 3880 |
| }, |
| { |
| "epoch": 12.333333333333334, |
| "grad_norm": 0.000578387756831944, |
| "learning_rate": 1.9248082557602078e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 832144, |
| "step": 3885 |
| }, |
| { |
| "epoch": 12.34920634920635, |
| "grad_norm": 0.00023974425857886672, |
| "learning_rate": 1.9180703667733713e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 833216, |
| "step": 3890 |
| }, |
| { |
| "epoch": 12.365079365079366, |
| "grad_norm": 0.013734663836658001, |
| "learning_rate": 1.911336944044786e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 834304, |
| "step": 3895 |
| }, |
| { |
| "epoch": 12.380952380952381, |
| "grad_norm": 0.00040066608926281333, |
| "learning_rate": 1.9046080392528735e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 835424, |
| "step": 3900 |
| }, |
| { |
| "epoch": 12.396825396825397, |
| "grad_norm": 0.0012625795789062977, |
| "learning_rate": 1.89788370404138e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 836496, |
| "step": 3905 |
| }, |
| { |
| "epoch": 12.412698412698413, |
| "grad_norm": 0.0016813823021948338, |
| "learning_rate": 1.8911639900189818e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 837568, |
| "step": 3910 |
| }, |
| { |
| "epoch": 12.428571428571429, |
| "grad_norm": 0.00017749129619915038, |
| "learning_rate": 1.8844489487588867e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 838640, |
| "step": 3915 |
| }, |
| { |
| "epoch": 12.444444444444445, |
| "grad_norm": 0.00817769207060337, |
| "learning_rate": 1.8777386317984404e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 839696, |
| "step": 3920 |
| }, |
| { |
| "epoch": 12.46031746031746, |
| "grad_norm": 0.00019275283557362854, |
| "learning_rate": 1.871033090638729e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 840704, |
| "step": 3925 |
| }, |
| { |
| "epoch": 12.476190476190476, |
| "grad_norm": 0.0051796757616102695, |
| "learning_rate": 1.864332376744186e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 841792, |
| "step": 3930 |
| }, |
| { |
| "epoch": 12.492063492063492, |
| "grad_norm": 0.00040796739631332457, |
| "learning_rate": 1.857636541542195e-05, |
| "loss": 0.0529, |
| "num_input_tokens_seen": 842896, |
| "step": 3935 |
| }, |
| { |
| "epoch": 12.507936507936508, |
| "grad_norm": 0.00011765657836804166, |
| "learning_rate": 1.850945636422697e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 843984, |
| "step": 3940 |
| }, |
| { |
| "epoch": 12.523809523809524, |
| "grad_norm": 0.00013000769831705838, |
| "learning_rate": 1.844259712737793e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 845056, |
| "step": 3945 |
| }, |
| { |
| "epoch": 12.53968253968254, |
| "grad_norm": 0.00014240505697671324, |
| "learning_rate": 1.8375788218013556e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 846128, |
| "step": 3950 |
| }, |
| { |
| "epoch": 12.555555555555555, |
| "grad_norm": 0.0013110644649714231, |
| "learning_rate": 1.8309030148886284e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 847152, |
| "step": 3955 |
| }, |
| { |
| "epoch": 12.571428571428571, |
| "grad_norm": 0.02542807348072529, |
| "learning_rate": 1.8242323432358365e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 848256, |
| "step": 3960 |
| }, |
| { |
| "epoch": 12.587301587301587, |
| "grad_norm": 0.009853278286755085, |
| "learning_rate": 1.8175668580397914e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 849328, |
| "step": 3965 |
| }, |
| { |
| "epoch": 12.603174603174603, |
| "grad_norm": 0.0005432798061519861, |
| "learning_rate": 1.8109066104575023e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 850400, |
| "step": 3970 |
| }, |
| { |
| "epoch": 12.619047619047619, |
| "grad_norm": 0.002314529847353697, |
| "learning_rate": 1.8042516516057763e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 851504, |
| "step": 3975 |
| }, |
| { |
| "epoch": 12.634920634920634, |
| "grad_norm": 0.0008332771249115467, |
| "learning_rate": 1.7976020325608318e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 852560, |
| "step": 3980 |
| }, |
| { |
| "epoch": 12.65079365079365, |
| "grad_norm": 0.013491770252585411, |
| "learning_rate": 1.7909578043579037e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 853632, |
| "step": 3985 |
| }, |
| { |
| "epoch": 12.666666666666666, |
| "grad_norm": 0.00032894726609811187, |
| "learning_rate": 1.784319017990855e-05, |
| "loss": 0.0028, |
| "num_input_tokens_seen": 854720, |
| "step": 3990 |
| }, |
| { |
| "epoch": 12.682539682539682, |
| "grad_norm": 1.5756915807724, |
| "learning_rate": 1.7776857244117807e-05, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 855792, |
| "step": 3995 |
| }, |
| { |
| "epoch": 12.698412698412698, |
| "grad_norm": 0.00010613162157824263, |
| "learning_rate": 1.7710579745306193e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 856896, |
| "step": 4000 |
| }, |
| { |
| "epoch": 12.714285714285714, |
| "grad_norm": 9.347883315058425e-05, |
| "learning_rate": 1.764435819214762e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 857968, |
| "step": 4005 |
| }, |
| { |
| "epoch": 12.73015873015873, |
| "grad_norm": 0.00020842064986936748, |
| "learning_rate": 1.7578193092886647e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 858992, |
| "step": 4010 |
| }, |
| { |
| "epoch": 12.746031746031747, |
| "grad_norm": 0.00023689692898187786, |
| "learning_rate": 1.751208495533452e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 860032, |
| "step": 4015 |
| }, |
| { |
| "epoch": 12.761904761904763, |
| "grad_norm": 0.0012503145262598991, |
| "learning_rate": 1.744603428686533e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 861136, |
| "step": 4020 |
| }, |
| { |
| "epoch": 12.777777777777779, |
| "grad_norm": 0.0003447967173997313, |
| "learning_rate": 1.7380041594412084e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 862272, |
| "step": 4025 |
| }, |
| { |
| "epoch": 12.793650793650794, |
| "grad_norm": 0.00016737495025154203, |
| "learning_rate": 1.731410738446284e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 863280, |
| "step": 4030 |
| }, |
| { |
| "epoch": 12.80952380952381, |
| "grad_norm": 0.0002947594039142132, |
| "learning_rate": 1.724823216305681e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 864432, |
| "step": 4035 |
| }, |
| { |
| "epoch": 12.825396825396826, |
| "grad_norm": 0.00010101118095917627, |
| "learning_rate": 1.7182416435780454e-05, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 865504, |
| "step": 4040 |
| }, |
| { |
| "epoch": 12.841269841269842, |
| "grad_norm": 0.00018932884267996997, |
| "learning_rate": 1.7116660707763636e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 866560, |
| "step": 4045 |
| }, |
| { |
| "epoch": 12.857142857142858, |
| "grad_norm": 0.00017417047638446093, |
| "learning_rate": 1.7050965483675743e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 867680, |
| "step": 4050 |
| }, |
| { |
| "epoch": 12.873015873015873, |
| "grad_norm": 0.00018292553431820124, |
| "learning_rate": 1.698533126772177e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 868800, |
| "step": 4055 |
| }, |
| { |
| "epoch": 12.88888888888889, |
| "grad_norm": 0.00012824099394492805, |
| "learning_rate": 1.6919758563638504e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 869824, |
| "step": 4060 |
| }, |
| { |
| "epoch": 12.904761904761905, |
| "grad_norm": 0.004027406685054302, |
| "learning_rate": 1.6854247874690617e-05, |
| "loss": 0.0015, |
| "num_input_tokens_seen": 870912, |
| "step": 4065 |
| }, |
| { |
| "epoch": 12.920634920634921, |
| "grad_norm": 0.00022077480389270931, |
| "learning_rate": 1.678879970366683e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 872000, |
| "step": 4070 |
| }, |
| { |
| "epoch": 12.936507936507937, |
| "grad_norm": 0.0001212661009049043, |
| "learning_rate": 1.672341455287605e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 872992, |
| "step": 4075 |
| }, |
| { |
| "epoch": 12.952380952380953, |
| "grad_norm": 0.00012811145279556513, |
| "learning_rate": 1.6658092924143497e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 874064, |
| "step": 4080 |
| }, |
| { |
| "epoch": 12.968253968253968, |
| "grad_norm": 0.00033971265656873584, |
| "learning_rate": 1.6592835318806868e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 875072, |
| "step": 4085 |
| }, |
| { |
| "epoch": 12.984126984126984, |
| "grad_norm": 0.00036233861465007067, |
| "learning_rate": 1.6527642237712494e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 876144, |
| "step": 4090 |
| }, |
| { |
| "epoch": 13.0, |
| "grad_norm": 0.00014307050150819123, |
| "learning_rate": 1.646251418121148e-05, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 877248, |
| "step": 4095 |
| }, |
| { |
| "epoch": 13.0, |
| "eval_loss": 0.19546957314014435, |
| "eval_runtime": 1.4542, |
| "eval_samples_per_second": 48.135, |
| "eval_steps_per_second": 24.068, |
| "num_input_tokens_seen": 877248, |
| "step": 4095 |
| }, |
| { |
| "epoch": 13.015873015873016, |
| "grad_norm": 0.00017348073015455157, |
| "learning_rate": 1.639745164915587e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 878256, |
| "step": 4100 |
| }, |
| { |
| "epoch": 13.031746031746032, |
| "grad_norm": 0.0034831962548196316, |
| "learning_rate": 1.633245514089482e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 879296, |
| "step": 4105 |
| }, |
| { |
| "epoch": 13.047619047619047, |
| "grad_norm": 0.0001327778008999303, |
| "learning_rate": 1.6267525155270773e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 880448, |
| "step": 4110 |
| }, |
| { |
| "epoch": 13.063492063492063, |
| "grad_norm": 0.000698502582963556, |
| "learning_rate": 1.6202662190615586e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 881568, |
| "step": 4115 |
| }, |
| { |
| "epoch": 13.079365079365079, |
| "grad_norm": 0.00014820935030002147, |
| "learning_rate": 1.6137866744746757e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 882592, |
| "step": 4120 |
| }, |
| { |
| "epoch": 13.095238095238095, |
| "grad_norm": 0.00021490654035005718, |
| "learning_rate": 1.607313931496357e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 883632, |
| "step": 4125 |
| }, |
| { |
| "epoch": 13.11111111111111, |
| "grad_norm": 0.00028194382321089506, |
| "learning_rate": 1.6008480398043313e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 884688, |
| "step": 4130 |
| }, |
| { |
| "epoch": 13.126984126984127, |
| "grad_norm": 0.013800282031297684, |
| "learning_rate": 1.5943890490237433e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 885776, |
| "step": 4135 |
| }, |
| { |
| "epoch": 13.142857142857142, |
| "grad_norm": 0.00010271323844790459, |
| "learning_rate": 1.5879370087267725e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 886832, |
| "step": 4140 |
| }, |
| { |
| "epoch": 13.158730158730158, |
| "grad_norm": 0.005160802509635687, |
| "learning_rate": 1.5814919684322545e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 887904, |
| "step": 4145 |
| }, |
| { |
| "epoch": 13.174603174603174, |
| "grad_norm": 0.00016589944425504655, |
| "learning_rate": 1.575053977605303e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 888976, |
| "step": 4150 |
| }, |
| { |
| "epoch": 13.19047619047619, |
| "grad_norm": 0.000149158135172911, |
| "learning_rate": 1.5686230856569252e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 890032, |
| "step": 4155 |
| }, |
| { |
| "epoch": 13.206349206349206, |
| "grad_norm": 0.0001676021929597482, |
| "learning_rate": 1.5621993419436453e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 891136, |
| "step": 4160 |
| }, |
| { |
| "epoch": 13.222222222222221, |
| "grad_norm": 0.00020368795958347619, |
| "learning_rate": 1.5557827957671248e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 892256, |
| "step": 4165 |
| }, |
| { |
| "epoch": 13.238095238095237, |
| "grad_norm": 0.00017661222955211997, |
| "learning_rate": 1.549373496373788e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 893376, |
| "step": 4170 |
| }, |
| { |
| "epoch": 13.253968253968253, |
| "grad_norm": 0.00016313417290803045, |
| "learning_rate": 1.542971492954437e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 894432, |
| "step": 4175 |
| }, |
| { |
| "epoch": 13.26984126984127, |
| "grad_norm": 0.00040456498390994966, |
| "learning_rate": 1.5365768346438797e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 895520, |
| "step": 4180 |
| }, |
| { |
| "epoch": 13.285714285714286, |
| "grad_norm": 0.0001557384239276871, |
| "learning_rate": 1.5301895705205503e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 896688, |
| "step": 4185 |
| }, |
| { |
| "epoch": 13.301587301587302, |
| "grad_norm": 0.00019953801529482007, |
| "learning_rate": 1.5238097496061348e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 897744, |
| "step": 4190 |
| }, |
| { |
| "epoch": 13.317460317460318, |
| "grad_norm": 0.00024302539532072842, |
| "learning_rate": 1.5174374208651912e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 898800, |
| "step": 4195 |
| }, |
| { |
| "epoch": 13.333333333333334, |
| "grad_norm": 0.00029446918051689863, |
| "learning_rate": 1.511072633204777e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 899904, |
| "step": 4200 |
| }, |
| { |
| "epoch": 13.34920634920635, |
| "grad_norm": 0.0001966770796570927, |
| "learning_rate": 1.5047154354740717e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 900928, |
| "step": 4205 |
| }, |
| { |
| "epoch": 13.365079365079366, |
| "grad_norm": 0.0003508214431349188, |
| "learning_rate": 1.4983658764640039e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 901984, |
| "step": 4210 |
| }, |
| { |
| "epoch": 13.380952380952381, |
| "grad_norm": 0.00016046679229475558, |
| "learning_rate": 1.4920240049068748e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 903008, |
| "step": 4215 |
| }, |
| { |
| "epoch": 13.396825396825397, |
| "grad_norm": 0.0002185764751629904, |
| "learning_rate": 1.4856898694759855e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 904032, |
| "step": 4220 |
| }, |
| { |
| "epoch": 13.412698412698413, |
| "grad_norm": 0.00029554381035268307, |
| "learning_rate": 1.4793635187852622e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 905040, |
| "step": 4225 |
| }, |
| { |
| "epoch": 13.428571428571429, |
| "grad_norm": 7.924468081910163e-05, |
| "learning_rate": 1.4730450013888857e-05, |
| "loss": 0.0133, |
| "num_input_tokens_seen": 906176, |
| "step": 4230 |
| }, |
| { |
| "epoch": 13.444444444444445, |
| "grad_norm": 0.00014417868806049228, |
| "learning_rate": 1.4667343657809152e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 907296, |
| "step": 4235 |
| }, |
| { |
| "epoch": 13.46031746031746, |
| "grad_norm": 0.0023033898323774338, |
| "learning_rate": 1.4604316603949186e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 908352, |
| "step": 4240 |
| }, |
| { |
| "epoch": 13.476190476190476, |
| "grad_norm": 0.00021005469898227602, |
| "learning_rate": 1.4541369336035988e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 909376, |
| "step": 4245 |
| }, |
| { |
| "epoch": 13.492063492063492, |
| "grad_norm": 0.0001569826272316277, |
| "learning_rate": 1.4478502337184274e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 910448, |
| "step": 4250 |
| }, |
| { |
| "epoch": 13.507936507936508, |
| "grad_norm": 0.003406015457585454, |
| "learning_rate": 1.4415716089892656e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 911488, |
| "step": 4255 |
| }, |
| { |
| "epoch": 13.523809523809524, |
| "grad_norm": 0.0002489403123036027, |
| "learning_rate": 1.4353011076040021e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 912528, |
| "step": 4260 |
| }, |
| { |
| "epoch": 13.53968253968254, |
| "grad_norm": 0.0004728223429992795, |
| "learning_rate": 1.4290387776881764e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 913568, |
| "step": 4265 |
| }, |
| { |
| "epoch": 13.555555555555555, |
| "grad_norm": 0.0011484220158308744, |
| "learning_rate": 1.422784667304615e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 914656, |
| "step": 4270 |
| }, |
| { |
| "epoch": 13.571428571428571, |
| "grad_norm": 0.00022116370382718742, |
| "learning_rate": 1.4165388244530608e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 915728, |
| "step": 4275 |
| }, |
| { |
| "epoch": 13.587301587301587, |
| "grad_norm": 0.0001274219830520451, |
| "learning_rate": 1.4103012970698016e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 916816, |
| "step": 4280 |
| }, |
| { |
| "epoch": 13.603174603174603, |
| "grad_norm": 0.00016120154759846628, |
| "learning_rate": 1.4040721330273062e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 917904, |
| "step": 4285 |
| }, |
| { |
| "epoch": 13.619047619047619, |
| "grad_norm": 0.0003869999200105667, |
| "learning_rate": 1.397851380133857e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 918960, |
| "step": 4290 |
| }, |
| { |
| "epoch": 13.634920634920634, |
| "grad_norm": 0.00033192671253345907, |
| "learning_rate": 1.3916390861331774e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 920064, |
| "step": 4295 |
| }, |
| { |
| "epoch": 13.65079365079365, |
| "grad_norm": 0.00014532014029100537, |
| "learning_rate": 1.3854352987040747e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 921152, |
| "step": 4300 |
| }, |
| { |
| "epoch": 13.666666666666666, |
| "grad_norm": 0.0001411033299518749, |
| "learning_rate": 1.379240065460064e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 922208, |
| "step": 4305 |
| }, |
| { |
| "epoch": 13.682539682539682, |
| "grad_norm": 0.006365750916302204, |
| "learning_rate": 1.3730534339490114e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 923312, |
| "step": 4310 |
| }, |
| { |
| "epoch": 13.698412698412698, |
| "grad_norm": 0.0001843793725129217, |
| "learning_rate": 1.3668754516527655e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 924400, |
| "step": 4315 |
| }, |
| { |
| "epoch": 13.714285714285714, |
| "grad_norm": 0.00045810375013388693, |
| "learning_rate": 1.3607061659867892e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 925472, |
| "step": 4320 |
| }, |
| { |
| "epoch": 13.73015873015873, |
| "grad_norm": 0.0009504028712399304, |
| "learning_rate": 1.3545456242998039e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 926592, |
| "step": 4325 |
| }, |
| { |
| "epoch": 13.746031746031747, |
| "grad_norm": 0.0018185972003266215, |
| "learning_rate": 1.3483938738734198e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 927680, |
| "step": 4330 |
| }, |
| { |
| "epoch": 13.761904761904763, |
| "grad_norm": 0.0010512792505323887, |
| "learning_rate": 1.3422509619217738e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 928784, |
| "step": 4335 |
| }, |
| { |
| "epoch": 13.777777777777779, |
| "grad_norm": 0.00015952046669553965, |
| "learning_rate": 1.3361169355911715e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 929776, |
| "step": 4340 |
| }, |
| { |
| "epoch": 13.793650793650794, |
| "grad_norm": 0.0001598140806891024, |
| "learning_rate": 1.3299918419597171e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 930912, |
| "step": 4345 |
| }, |
| { |
| "epoch": 13.80952380952381, |
| "grad_norm": 0.0001969587174244225, |
| "learning_rate": 1.323875728036964e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 932048, |
| "step": 4350 |
| }, |
| { |
| "epoch": 13.825396825396826, |
| "grad_norm": 0.0056450143456459045, |
| "learning_rate": 1.3177686407635417e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 933152, |
| "step": 4355 |
| }, |
| { |
| "epoch": 13.841269841269842, |
| "grad_norm": 0.00041293379035778344, |
| "learning_rate": 1.3116706270108015e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 934240, |
| "step": 4360 |
| }, |
| { |
| "epoch": 13.857142857142858, |
| "grad_norm": 0.00012529375089798123, |
| "learning_rate": 1.3055817335804582e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 935328, |
| "step": 4365 |
| }, |
| { |
| "epoch": 13.873015873015873, |
| "grad_norm": 0.0004058619379065931, |
| "learning_rate": 1.2995020072042285e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 936400, |
| "step": 4370 |
| }, |
| { |
| "epoch": 13.88888888888889, |
| "grad_norm": 0.0001662838039919734, |
| "learning_rate": 1.2934314945434734e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 937456, |
| "step": 4375 |
| }, |
| { |
| "epoch": 13.904761904761905, |
| "grad_norm": 0.0003785460430663079, |
| "learning_rate": 1.2873702421888365e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 938496, |
| "step": 4380 |
| }, |
| { |
| "epoch": 13.920634920634921, |
| "grad_norm": 0.0023042745888233185, |
| "learning_rate": 1.2813182966598902e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 939568, |
| "step": 4385 |
| }, |
| { |
| "epoch": 13.936507936507937, |
| "grad_norm": 0.00040539150359109044, |
| "learning_rate": 1.2752757044047827e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 940592, |
| "step": 4390 |
| }, |
| { |
| "epoch": 13.952380952380953, |
| "grad_norm": 0.0001386718067806214, |
| "learning_rate": 1.2692425117998699e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 941632, |
| "step": 4395 |
| }, |
| { |
| "epoch": 13.968253968253968, |
| "grad_norm": 0.00020760892948601395, |
| "learning_rate": 1.263218765149371e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 942656, |
| "step": 4400 |
| }, |
| { |
| "epoch": 13.984126984126984, |
| "grad_norm": 0.0005386440316215158, |
| "learning_rate": 1.257204510685005e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 943728, |
| "step": 4405 |
| }, |
| { |
| "epoch": 14.0, |
| "grad_norm": 9.081437747227028e-05, |
| "learning_rate": 1.2511997945656415e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 944752, |
| "step": 4410 |
| }, |
| { |
| "epoch": 14.0, |
| "eval_loss": 0.19688165187835693, |
| "eval_runtime": 1.4465, |
| "eval_samples_per_second": 48.392, |
| "eval_steps_per_second": 24.196, |
| "num_input_tokens_seen": 944752, |
| "step": 4410 |
| }, |
| { |
| "epoch": 14.015873015873016, |
| "grad_norm": 0.0016641680849716067, |
| "learning_rate": 1.2452046628769443e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 945872, |
| "step": 4415 |
| }, |
| { |
| "epoch": 14.031746031746032, |
| "grad_norm": 0.00016903673531487584, |
| "learning_rate": 1.2392191616310148e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 946928, |
| "step": 4420 |
| }, |
| { |
| "epoch": 14.047619047619047, |
| "grad_norm": 0.00014750863192602992, |
| "learning_rate": 1.2332433367660442e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 947952, |
| "step": 4425 |
| }, |
| { |
| "epoch": 14.063492063492063, |
| "grad_norm": 0.0013234770158305764, |
| "learning_rate": 1.227277234145959e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 949056, |
| "step": 4430 |
| }, |
| { |
| "epoch": 14.079365079365079, |
| "grad_norm": 0.00011105871817562729, |
| "learning_rate": 1.2213208995600648e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 950128, |
| "step": 4435 |
| }, |
| { |
| "epoch": 14.095238095238095, |
| "grad_norm": 0.00011416849883971736, |
| "learning_rate": 1.2153743787227023e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 951280, |
| "step": 4440 |
| }, |
| { |
| "epoch": 14.11111111111111, |
| "grad_norm": 0.00014661815657746047, |
| "learning_rate": 1.2094377172728891e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 952400, |
| "step": 4445 |
| }, |
| { |
| "epoch": 14.126984126984127, |
| "grad_norm": 0.00014966045273467898, |
| "learning_rate": 1.2035109607739755e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 953472, |
| "step": 4450 |
| }, |
| { |
| "epoch": 14.142857142857142, |
| "grad_norm": 0.00011174564860993996, |
| "learning_rate": 1.1975941547132922e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 954464, |
| "step": 4455 |
| }, |
| { |
| "epoch": 14.158730158730158, |
| "grad_norm": 0.0003684433759190142, |
| "learning_rate": 1.1916873445017982e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 955520, |
| "step": 4460 |
| }, |
| { |
| "epoch": 14.174603174603174, |
| "grad_norm": 0.0008453542250208557, |
| "learning_rate": 1.185790575473738e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 956624, |
| "step": 4465 |
| }, |
| { |
| "epoch": 14.19047619047619, |
| "grad_norm": 9.045572369359434e-05, |
| "learning_rate": 1.1799038928862919e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 957728, |
| "step": 4470 |
| }, |
| { |
| "epoch": 14.206349206349206, |
| "grad_norm": 0.0008998040575534105, |
| "learning_rate": 1.1740273419192233e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 958768, |
| "step": 4475 |
| }, |
| { |
| "epoch": 14.222222222222221, |
| "grad_norm": 0.00014305087097454816, |
| "learning_rate": 1.1681609676745411e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 959824, |
| "step": 4480 |
| }, |
| { |
| "epoch": 14.238095238095237, |
| "grad_norm": 0.00012867158511653543, |
| "learning_rate": 1.1623048151761436e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 960848, |
| "step": 4485 |
| }, |
| { |
| "epoch": 14.253968253968253, |
| "grad_norm": 8.349162089871243e-05, |
| "learning_rate": 1.1564589293694855e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 961984, |
| "step": 4490 |
| }, |
| { |
| "epoch": 14.26984126984127, |
| "grad_norm": 0.00010760652367025614, |
| "learning_rate": 1.1506233551212186e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 963072, |
| "step": 4495 |
| }, |
| { |
| "epoch": 14.285714285714286, |
| "grad_norm": 0.0005234842537902296, |
| "learning_rate": 1.1447981372188563e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 964176, |
| "step": 4500 |
| }, |
| { |
| "epoch": 14.301587301587302, |
| "grad_norm": 0.00019558188796509057, |
| "learning_rate": 1.1389833203704294e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 965232, |
| "step": 4505 |
| }, |
| { |
| "epoch": 14.317460317460318, |
| "grad_norm": 0.00011165087198605761, |
| "learning_rate": 1.133178949204141e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 966320, |
| "step": 4510 |
| }, |
| { |
| "epoch": 14.333333333333334, |
| "grad_norm": 0.00018003462173510343, |
| "learning_rate": 1.1273850682680252e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 967440, |
| "step": 4515 |
| }, |
| { |
| "epoch": 14.34920634920635, |
| "grad_norm": 0.005392876453697681, |
| "learning_rate": 1.1216017220296026e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 968480, |
| "step": 4520 |
| }, |
| { |
| "epoch": 14.365079365079366, |
| "grad_norm": 0.00011155071842949837, |
| "learning_rate": 1.1158289548755399e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 969536, |
| "step": 4525 |
| }, |
| { |
| "epoch": 14.380952380952381, |
| "grad_norm": 0.005166823975741863, |
| "learning_rate": 1.1100668111113166e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 970608, |
| "step": 4530 |
| }, |
| { |
| "epoch": 14.396825396825397, |
| "grad_norm": 0.008961636573076248, |
| "learning_rate": 1.104315334960871e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 971600, |
| "step": 4535 |
| }, |
| { |
| "epoch": 14.412698412698413, |
| "grad_norm": 6.83280813973397e-05, |
| "learning_rate": 1.0985745705662737e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 972688, |
| "step": 4540 |
| }, |
| { |
| "epoch": 14.428571428571429, |
| "grad_norm": 0.00025904824724420905, |
| "learning_rate": 1.0928445619873795e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 973760, |
| "step": 4545 |
| }, |
| { |
| "epoch": 14.444444444444445, |
| "grad_norm": 9.120917093241587e-05, |
| "learning_rate": 1.0871253532014969e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 974832, |
| "step": 4550 |
| }, |
| { |
| "epoch": 14.46031746031746, |
| "grad_norm": 0.0001160187239293009, |
| "learning_rate": 1.0814169881030459e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 975904, |
| "step": 4555 |
| }, |
| { |
| "epoch": 14.476190476190476, |
| "grad_norm": 0.00010269311314914376, |
| "learning_rate": 1.0757195105032198e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 977008, |
| "step": 4560 |
| }, |
| { |
| "epoch": 14.492063492063492, |
| "grad_norm": 9.548700472805649e-05, |
| "learning_rate": 1.0700329641296541e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 978080, |
| "step": 4565 |
| }, |
| { |
| "epoch": 14.507936507936508, |
| "grad_norm": 0.009873582050204277, |
| "learning_rate": 1.064357392626088e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 979184, |
| "step": 4570 |
| }, |
| { |
| "epoch": 14.523809523809524, |
| "grad_norm": 0.0005500807310454547, |
| "learning_rate": 1.0586928395520271e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 980272, |
| "step": 4575 |
| }, |
| { |
| "epoch": 14.53968253968254, |
| "grad_norm": 0.00010129127622349188, |
| "learning_rate": 1.053039348382415e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 981408, |
| "step": 4580 |
| }, |
| { |
| "epoch": 14.555555555555555, |
| "grad_norm": 0.0009250047733075917, |
| "learning_rate": 1.0473969625072922e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 982496, |
| "step": 4585 |
| }, |
| { |
| "epoch": 14.571428571428571, |
| "grad_norm": 9.661580406827852e-05, |
| "learning_rate": 1.0417657252314702e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 983536, |
| "step": 4590 |
| }, |
| { |
| "epoch": 14.587301587301587, |
| "grad_norm": 0.00022067976533435285, |
| "learning_rate": 1.0361456797741959e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 984576, |
| "step": 4595 |
| }, |
| { |
| "epoch": 14.603174603174603, |
| "grad_norm": 0.00020819882047362626, |
| "learning_rate": 1.0305368692688174e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 985584, |
| "step": 4600 |
| }, |
| { |
| "epoch": 14.619047619047619, |
| "grad_norm": 0.0002456950314808637, |
| "learning_rate": 1.0249393367624579e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 986640, |
| "step": 4605 |
| }, |
| { |
| "epoch": 14.634920634920634, |
| "grad_norm": 0.00010459975601406768, |
| "learning_rate": 1.0193531252156833e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 987648, |
| "step": 4610 |
| }, |
| { |
| "epoch": 14.65079365079365, |
| "grad_norm": 9.877282718662173e-05, |
| "learning_rate": 1.0137782775021686e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 988768, |
| "step": 4615 |
| }, |
| { |
| "epoch": 14.666666666666666, |
| "grad_norm": 0.00016360699373763055, |
| "learning_rate": 1.008214836408378e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 989856, |
| "step": 4620 |
| }, |
| { |
| "epoch": 14.682539682539682, |
| "grad_norm": 0.00014213789836503565, |
| "learning_rate": 1.0026628446332248e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 990896, |
| "step": 4625 |
| }, |
| { |
| "epoch": 14.698412698412698, |
| "grad_norm": 0.0007357973954640329, |
| "learning_rate": 9.97122344787754e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 991952, |
| "step": 4630 |
| }, |
| { |
| "epoch": 14.714285714285714, |
| "grad_norm": 0.00011429537698859349, |
| "learning_rate": 9.91593379394811e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 993008, |
| "step": 4635 |
| }, |
| { |
| "epoch": 14.73015873015873, |
| "grad_norm": 0.00015254374011419713, |
| "learning_rate": 9.860759908887122e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 994048, |
| "step": 4640 |
| }, |
| { |
| "epoch": 14.746031746031747, |
| "grad_norm": 0.00013117569324094802, |
| "learning_rate": 9.805702216149251e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 995104, |
| "step": 4645 |
| }, |
| { |
| "epoch": 14.761904761904763, |
| "grad_norm": 0.0011479692766442895, |
| "learning_rate": 9.75076113829741e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 996208, |
| "step": 4650 |
| }, |
| { |
| "epoch": 14.777777777777779, |
| "grad_norm": 0.00013996977941133082, |
| "learning_rate": 9.695937096999475e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 997280, |
| "step": 4655 |
| }, |
| { |
| "epoch": 14.793650793650794, |
| "grad_norm": 0.0001853140420280397, |
| "learning_rate": 9.641230513025107e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 998320, |
| "step": 4660 |
| }, |
| { |
| "epoch": 14.80952380952381, |
| "grad_norm": 0.000127577266539447, |
| "learning_rate": 9.586641806242457e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 999504, |
| "step": 4665 |
| }, |
| { |
| "epoch": 14.825396825396826, |
| "grad_norm": 0.002337446203455329, |
| "learning_rate": 9.532171395615036e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1000576, |
| "step": 4670 |
| }, |
| { |
| "epoch": 14.841269841269842, |
| "grad_norm": 0.00014514128270093352, |
| "learning_rate": 9.477819699198379e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 1001616, |
| "step": 4675 |
| }, |
| { |
| "epoch": 14.857142857142858, |
| "grad_norm": 0.00025934414588846266, |
| "learning_rate": 9.423587134136949e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1002704, |
| "step": 4680 |
| }, |
| { |
| "epoch": 14.873015873015873, |
| "grad_norm": 0.0001345455675618723, |
| "learning_rate": 9.369474116660848e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1003776, |
| "step": 4685 |
| }, |
| { |
| "epoch": 14.88888888888889, |
| "grad_norm": 0.00010907748946920037, |
| "learning_rate": 9.315481062082687e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1004816, |
| "step": 4690 |
| }, |
| { |
| "epoch": 14.904761904761905, |
| "grad_norm": 0.013357513584196568, |
| "learning_rate": 9.261608384794374e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1005888, |
| "step": 4695 |
| }, |
| { |
| "epoch": 14.920634920634921, |
| "grad_norm": 9.384198347106576e-05, |
| "learning_rate": 9.207856498263902e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1006976, |
| "step": 4700 |
| }, |
| { |
| "epoch": 14.936507936507937, |
| "grad_norm": 0.007067098747938871, |
| "learning_rate": 9.154225815032242e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1008064, |
| "step": 4705 |
| }, |
| { |
| "epoch": 14.952380952380953, |
| "grad_norm": 0.0002748421102296561, |
| "learning_rate": 9.100716746710126e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1009120, |
| "step": 4710 |
| }, |
| { |
| "epoch": 14.968253968253968, |
| "grad_norm": 0.00014957235543988645, |
| "learning_rate": 9.047329703974888e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1010192, |
| "step": 4715 |
| }, |
| { |
| "epoch": 14.984126984126984, |
| "grad_norm": 0.00050693703815341, |
| "learning_rate": 8.994065096567355e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1011248, |
| "step": 4720 |
| }, |
| { |
| "epoch": 15.0, |
| "grad_norm": 0.00014735362492501736, |
| "learning_rate": 8.940923333288643e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1012272, |
| "step": 4725 |
| }, |
| { |
| "epoch": 15.0, |
| "eval_loss": 0.19820746779441833, |
| "eval_runtime": 1.444, |
| "eval_samples_per_second": 48.478, |
| "eval_steps_per_second": 24.239, |
| "num_input_tokens_seen": 1012272, |
| "step": 4725 |
| }, |
| { |
| "epoch": 15.015873015873016, |
| "grad_norm": 0.00021902845764998347, |
| "learning_rate": 8.88790482199707e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1013296, |
| "step": 4730 |
| }, |
| { |
| "epoch": 15.031746031746032, |
| "grad_norm": 0.00099784170743078, |
| "learning_rate": 8.835009969605012e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1014384, |
| "step": 4735 |
| }, |
| { |
| "epoch": 15.047619047619047, |
| "grad_norm": 0.000178317932295613, |
| "learning_rate": 8.78223918207575e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1015472, |
| "step": 4740 |
| }, |
| { |
| "epoch": 15.063492063492063, |
| "grad_norm": 0.00011670400999719277, |
| "learning_rate": 8.729592864420394e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1016544, |
| "step": 4745 |
| }, |
| { |
| "epoch": 15.079365079365079, |
| "grad_norm": 0.003956442698836327, |
| "learning_rate": 8.677071420694769e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1017664, |
| "step": 4750 |
| }, |
| { |
| "epoch": 15.095238095238095, |
| "grad_norm": 0.000169846520293504, |
| "learning_rate": 8.62467525399627e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1018784, |
| "step": 4755 |
| }, |
| { |
| "epoch": 15.11111111111111, |
| "grad_norm": 0.00024720613146200776, |
| "learning_rate": 8.572404766460846e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1019808, |
| "step": 4760 |
| }, |
| { |
| "epoch": 15.126984126984127, |
| "grad_norm": 0.00011122092109872028, |
| "learning_rate": 8.520260359259822e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1020896, |
| "step": 4765 |
| }, |
| { |
| "epoch": 15.142857142857142, |
| "grad_norm": 0.0001617560046724975, |
| "learning_rate": 8.468242432596904e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1022000, |
| "step": 4770 |
| }, |
| { |
| "epoch": 15.158730158730158, |
| "grad_norm": 0.00025247674784623086, |
| "learning_rate": 8.41635138570507e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1023088, |
| "step": 4775 |
| }, |
| { |
| "epoch": 15.174603174603174, |
| "grad_norm": 0.0001408685347996652, |
| "learning_rate": 8.364587616843477e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1024192, |
| "step": 4780 |
| }, |
| { |
| "epoch": 15.19047619047619, |
| "grad_norm": 0.00010588414443191141, |
| "learning_rate": 8.312951523294462e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1025232, |
| "step": 4785 |
| }, |
| { |
| "epoch": 15.206349206349206, |
| "grad_norm": 7.913958688732237e-05, |
| "learning_rate": 8.261443501360466e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1026304, |
| "step": 4790 |
| }, |
| { |
| "epoch": 15.222222222222221, |
| "grad_norm": 0.00014240090968087316, |
| "learning_rate": 8.210063946360964e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1027424, |
| "step": 4795 |
| }, |
| { |
| "epoch": 15.238095238095237, |
| "grad_norm": 0.00010943754023173824, |
| "learning_rate": 8.158813252629497e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1028496, |
| "step": 4800 |
| }, |
| { |
| "epoch": 15.253968253968253, |
| "grad_norm": 0.0002636691788211465, |
| "learning_rate": 8.107691813510562e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1029584, |
| "step": 4805 |
| }, |
| { |
| "epoch": 15.26984126984127, |
| "grad_norm": 0.00046621993533335626, |
| "learning_rate": 8.056700021356694e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1030672, |
| "step": 4810 |
| }, |
| { |
| "epoch": 15.285714285714286, |
| "grad_norm": 0.00011222544708289206, |
| "learning_rate": 8.005838267525356e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1031680, |
| "step": 4815 |
| }, |
| { |
| "epoch": 15.301587301587302, |
| "grad_norm": 0.00013111262524034828, |
| "learning_rate": 7.955106942375985e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1032720, |
| "step": 4820 |
| }, |
| { |
| "epoch": 15.317460317460318, |
| "grad_norm": 0.0001799971651053056, |
| "learning_rate": 7.904506435266998e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1033728, |
| "step": 4825 |
| }, |
| { |
| "epoch": 15.333333333333334, |
| "grad_norm": 0.007164331618696451, |
| "learning_rate": 7.854037134552797e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1034784, |
| "step": 4830 |
| }, |
| { |
| "epoch": 15.34920634920635, |
| "grad_norm": 0.00014734613068867475, |
| "learning_rate": 7.803699427580789e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1035872, |
| "step": 4835 |
| }, |
| { |
| "epoch": 15.365079365079366, |
| "grad_norm": 0.003119000233709812, |
| "learning_rate": 7.753493700688397e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1036960, |
| "step": 4840 |
| }, |
| { |
| "epoch": 15.380952380952381, |
| "grad_norm": 0.00016122111992444843, |
| "learning_rate": 7.703420339200101e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1038064, |
| "step": 4845 |
| }, |
| { |
| "epoch": 15.396825396825397, |
| "grad_norm": 0.00026882500969804823, |
| "learning_rate": 7.653479727424534e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1039152, |
| "step": 4850 |
| }, |
| { |
| "epoch": 15.412698412698413, |
| "grad_norm": 9.242565283784643e-05, |
| "learning_rate": 7.603672248651431e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1040240, |
| "step": 4855 |
| }, |
| { |
| "epoch": 15.428571428571429, |
| "grad_norm": 0.003781526582315564, |
| "learning_rate": 7.553998285148786e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1041264, |
| "step": 4860 |
| }, |
| { |
| "epoch": 15.444444444444445, |
| "grad_norm": 0.00021264157840050757, |
| "learning_rate": 7.504458218159841e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1042288, |
| "step": 4865 |
| }, |
| { |
| "epoch": 15.46031746031746, |
| "grad_norm": 0.0001260324497707188, |
| "learning_rate": 7.455052427900213e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1043392, |
| "step": 4870 |
| }, |
| { |
| "epoch": 15.476190476190476, |
| "grad_norm": 0.00015318683290388435, |
| "learning_rate": 7.405781293554973e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1044496, |
| "step": 4875 |
| }, |
| { |
| "epoch": 15.492063492063492, |
| "grad_norm": 0.0002370552538195625, |
| "learning_rate": 7.3566451932756744e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1045504, |
| "step": 4880 |
| }, |
| { |
| "epoch": 15.507936507936508, |
| "grad_norm": 0.0016014057910069823, |
| "learning_rate": 7.307644504177538e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1046592, |
| "step": 4885 |
| }, |
| { |
| "epoch": 15.523809523809524, |
| "grad_norm": 0.00011496651131892577, |
| "learning_rate": 7.258779602336504e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1047728, |
| "step": 4890 |
| }, |
| { |
| "epoch": 15.53968253968254, |
| "grad_norm": 0.0014618715504184365, |
| "learning_rate": 7.210050862786341e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1048848, |
| "step": 4895 |
| }, |
| { |
| "epoch": 15.555555555555555, |
| "grad_norm": 0.00013625272549688816, |
| "learning_rate": 7.161458659515813e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1049872, |
| "step": 4900 |
| }, |
| { |
| "epoch": 15.571428571428571, |
| "grad_norm": 0.00021962377650197595, |
| "learning_rate": 7.113003365465745e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1050944, |
| "step": 4905 |
| }, |
| { |
| "epoch": 15.587301587301587, |
| "grad_norm": 0.006843153852969408, |
| "learning_rate": 7.064685352526229e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1051968, |
| "step": 4910 |
| }, |
| { |
| "epoch": 15.603174603174603, |
| "grad_norm": 0.00014317109889816493, |
| "learning_rate": 7.016504991533726e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1053104, |
| "step": 4915 |
| }, |
| { |
| "epoch": 15.619047619047619, |
| "grad_norm": 0.00021036296675447375, |
| "learning_rate": 6.9684626522682154e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1054144, |
| "step": 4920 |
| }, |
| { |
| "epoch": 15.634920634920634, |
| "grad_norm": 0.0006890058284625411, |
| "learning_rate": 6.920558703450389e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1055168, |
| "step": 4925 |
| }, |
| { |
| "epoch": 15.65079365079365, |
| "grad_norm": 7.351509702857584e-05, |
| "learning_rate": 6.872793512738809e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1056208, |
| "step": 4930 |
| }, |
| { |
| "epoch": 15.666666666666666, |
| "grad_norm": 0.00024063632008619606, |
| "learning_rate": 6.825167446727057e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1057280, |
| "step": 4935 |
| }, |
| { |
| "epoch": 15.682539682539682, |
| "grad_norm": 0.006952513474971056, |
| "learning_rate": 6.777680870940972e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1058352, |
| "step": 4940 |
| }, |
| { |
| "epoch": 15.698412698412698, |
| "grad_norm": 0.0001248103944817558, |
| "learning_rate": 6.730334149835788e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1059408, |
| "step": 4945 |
| }, |
| { |
| "epoch": 15.714285714285714, |
| "grad_norm": 9.666664118412882e-05, |
| "learning_rate": 6.683127646793411e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1060528, |
| "step": 4950 |
| }, |
| { |
| "epoch": 15.73015873015873, |
| "grad_norm": 0.0012612607097253203, |
| "learning_rate": 6.636061724119541e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1061600, |
| "step": 4955 |
| }, |
| { |
| "epoch": 15.746031746031747, |
| "grad_norm": 0.00018189875117968768, |
| "learning_rate": 6.589136743040955e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1062640, |
| "step": 4960 |
| }, |
| { |
| "epoch": 15.761904761904763, |
| "grad_norm": 0.00012195859744679183, |
| "learning_rate": 6.542353063702716e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1063648, |
| "step": 4965 |
| }, |
| { |
| "epoch": 15.777777777777779, |
| "grad_norm": 0.005103013478219509, |
| "learning_rate": 6.495711045165412e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 1064752, |
| "step": 4970 |
| }, |
| { |
| "epoch": 15.793650793650794, |
| "grad_norm": 0.00019367771164979786, |
| "learning_rate": 6.449211045402395e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1065808, |
| "step": 4975 |
| }, |
| { |
| "epoch": 15.80952380952381, |
| "grad_norm": 0.00011932419874938205, |
| "learning_rate": 6.402853421297034e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1066848, |
| "step": 4980 |
| }, |
| { |
| "epoch": 15.825396825396826, |
| "grad_norm": 9.995235450332984e-05, |
| "learning_rate": 6.356638528639955e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1067808, |
| "step": 4985 |
| }, |
| { |
| "epoch": 15.841269841269842, |
| "grad_norm": 0.00012767007865477353, |
| "learning_rate": 6.3105667221263845e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1068912, |
| "step": 4990 |
| }, |
| { |
| "epoch": 15.857142857142858, |
| "grad_norm": 0.0002976842224597931, |
| "learning_rate": 6.2646383553533275e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1070000, |
| "step": 4995 |
| }, |
| { |
| "epoch": 15.873015873015873, |
| "grad_norm": 0.00011814333993243054, |
| "learning_rate": 6.218853780816933e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1071040, |
| "step": 5000 |
| }, |
| { |
| "epoch": 15.88888888888889, |
| "grad_norm": 0.00021936133271083236, |
| "learning_rate": 6.173213349909729e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1072144, |
| "step": 5005 |
| }, |
| { |
| "epoch": 15.904761904761905, |
| "grad_norm": 0.00013133411994203925, |
| "learning_rate": 6.127717412917977e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1073216, |
| "step": 5010 |
| }, |
| { |
| "epoch": 15.920634920634921, |
| "grad_norm": 0.00015381992852780968, |
| "learning_rate": 6.082366319018959e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1074272, |
| "step": 5015 |
| }, |
| { |
| "epoch": 15.936507936507937, |
| "grad_norm": 0.0039579374715685844, |
| "learning_rate": 6.037160416278278e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1075408, |
| "step": 5020 |
| }, |
| { |
| "epoch": 15.952380952380953, |
| "grad_norm": 6.8703229771927e-05, |
| "learning_rate": 5.9921000516472315e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1076496, |
| "step": 5025 |
| }, |
| { |
| "epoch": 15.968253968253968, |
| "grad_norm": 0.00011043099220842123, |
| "learning_rate": 5.947185570960123e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1077600, |
| "step": 5030 |
| }, |
| { |
| "epoch": 15.984126984126984, |
| "grad_norm": 0.00043672084575518966, |
| "learning_rate": 5.902417318931589e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1078736, |
| "step": 5035 |
| }, |
| { |
| "epoch": 16.0, |
| "grad_norm": 0.00018926948541775346, |
| "learning_rate": 5.857795639153998e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1079744, |
| "step": 5040 |
| }, |
| { |
| "epoch": 16.0, |
| "eval_loss": 0.20095133781433105, |
| "eval_runtime": 1.4576, |
| "eval_samples_per_second": 48.023, |
| "eval_steps_per_second": 24.011, |
| "num_input_tokens_seen": 1079744, |
| "step": 5040 |
| }, |
| { |
| "epoch": 16.015873015873016, |
| "grad_norm": 0.00011885230196639895, |
| "learning_rate": 5.813320874094771e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1080784, |
| "step": 5045 |
| }, |
| { |
| "epoch": 16.03174603174603, |
| "grad_norm": 0.0002344203821849078, |
| "learning_rate": 5.768993365093783e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1081808, |
| "step": 5050 |
| }, |
| { |
| "epoch": 16.047619047619047, |
| "grad_norm": 9.413900988874957e-05, |
| "learning_rate": 5.724813452360736e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1082864, |
| "step": 5055 |
| }, |
| { |
| "epoch": 16.063492063492063, |
| "grad_norm": 0.0001468745176680386, |
| "learning_rate": 5.6807814749725245e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1083888, |
| "step": 5060 |
| }, |
| { |
| "epoch": 16.07936507936508, |
| "grad_norm": 0.0007511080475524068, |
| "learning_rate": 5.636897770870666e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1085008, |
| "step": 5065 |
| }, |
| { |
| "epoch": 16.095238095238095, |
| "grad_norm": 0.0004616309597622603, |
| "learning_rate": 5.593162676858707e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1086112, |
| "step": 5070 |
| }, |
| { |
| "epoch": 16.11111111111111, |
| "grad_norm": 0.00021627935348078609, |
| "learning_rate": 5.54957652859959e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1087168, |
| "step": 5075 |
| }, |
| { |
| "epoch": 16.126984126984127, |
| "grad_norm": 0.00010006018419517204, |
| "learning_rate": 5.506139660613147e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1088272, |
| "step": 5080 |
| }, |
| { |
| "epoch": 16.142857142857142, |
| "grad_norm": 0.0001312010281253606, |
| "learning_rate": 5.462852406273464e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1089344, |
| "step": 5085 |
| }, |
| { |
| "epoch": 16.158730158730158, |
| "grad_norm": 0.00013077599578537047, |
| "learning_rate": 5.4197150978063965e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1090368, |
| "step": 5090 |
| }, |
| { |
| "epoch": 16.174603174603174, |
| "grad_norm": 0.00010001740884035826, |
| "learning_rate": 5.376728066286943e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1091440, |
| "step": 5095 |
| }, |
| { |
| "epoch": 16.19047619047619, |
| "grad_norm": 0.00011268608795944601, |
| "learning_rate": 5.333891641636748e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1092496, |
| "step": 5100 |
| }, |
| { |
| "epoch": 16.206349206349206, |
| "grad_norm": 0.0007534271571785212, |
| "learning_rate": 5.291206152621572e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1093520, |
| "step": 5105 |
| }, |
| { |
| "epoch": 16.22222222222222, |
| "grad_norm": 0.004776482004672289, |
| "learning_rate": 5.248671926848753e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1094608, |
| "step": 5110 |
| }, |
| { |
| "epoch": 16.238095238095237, |
| "grad_norm": 0.00011033907503588125, |
| "learning_rate": 5.206289290764702e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1095664, |
| "step": 5115 |
| }, |
| { |
| "epoch": 16.253968253968253, |
| "grad_norm": 0.00014399575593415648, |
| "learning_rate": 5.164058569652377e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1096720, |
| "step": 5120 |
| }, |
| { |
| "epoch": 16.26984126984127, |
| "grad_norm": 0.0005759440246038139, |
| "learning_rate": 5.121980087628803e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1097744, |
| "step": 5125 |
| }, |
| { |
| "epoch": 16.285714285714285, |
| "grad_norm": 9.312896872870624e-05, |
| "learning_rate": 5.080054167642617e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1098816, |
| "step": 5130 |
| }, |
| { |
| "epoch": 16.3015873015873, |
| "grad_norm": 0.00014533007924910635, |
| "learning_rate": 5.038281131471514e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1099904, |
| "step": 5135 |
| }, |
| { |
| "epoch": 16.317460317460316, |
| "grad_norm": 0.00013747379125561565, |
| "learning_rate": 4.996661299719846e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1100944, |
| "step": 5140 |
| }, |
| { |
| "epoch": 16.333333333333332, |
| "grad_norm": 0.002072168281301856, |
| "learning_rate": 4.955194991816114e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1102000, |
| "step": 5145 |
| }, |
| { |
| "epoch": 16.349206349206348, |
| "grad_norm": 0.00013336709525901824, |
| "learning_rate": 4.913882526010555e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1103056, |
| "step": 5150 |
| }, |
| { |
| "epoch": 16.365079365079364, |
| "grad_norm": 0.00018689218268264085, |
| "learning_rate": 4.872724219372679e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1104192, |
| "step": 5155 |
| }, |
| { |
| "epoch": 16.38095238095238, |
| "grad_norm": 0.00011830328730866313, |
| "learning_rate": 4.831720387788827e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1105232, |
| "step": 5160 |
| }, |
| { |
| "epoch": 16.396825396825395, |
| "grad_norm": 0.00030842830892652273, |
| "learning_rate": 4.790871345959764e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1106224, |
| "step": 5165 |
| }, |
| { |
| "epoch": 16.41269841269841, |
| "grad_norm": 0.0011698472080752254, |
| "learning_rate": 4.750177407398268e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1107264, |
| "step": 5170 |
| }, |
| { |
| "epoch": 16.428571428571427, |
| "grad_norm": 0.0031321838032454252, |
| "learning_rate": 4.70963888442669e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1108352, |
| "step": 5175 |
| }, |
| { |
| "epoch": 16.444444444444443, |
| "grad_norm": 0.0015201118076220155, |
| "learning_rate": 4.669256088174606e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1109504, |
| "step": 5180 |
| }, |
| { |
| "epoch": 16.46031746031746, |
| "grad_norm": 0.000647184147965163, |
| "learning_rate": 4.629029328576381e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1110608, |
| "step": 5185 |
| }, |
| { |
| "epoch": 16.476190476190474, |
| "grad_norm": 0.0013111300067976117, |
| "learning_rate": 4.588958914368824e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1111696, |
| "step": 5190 |
| }, |
| { |
| "epoch": 16.49206349206349, |
| "grad_norm": 0.0003529720415826887, |
| "learning_rate": 4.549045153088813e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1112800, |
| "step": 5195 |
| }, |
| { |
| "epoch": 16.507936507936506, |
| "grad_norm": 0.00014995434321463108, |
| "learning_rate": 4.5092883510709085e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1113840, |
| "step": 5200 |
| }, |
| { |
| "epoch": 16.523809523809526, |
| "grad_norm": 6.491740350611508e-05, |
| "learning_rate": 4.469688813445042e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1114928, |
| "step": 5205 |
| }, |
| { |
| "epoch": 16.53968253968254, |
| "grad_norm": 0.0002835427294485271, |
| "learning_rate": 4.4302468441341536e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1115968, |
| "step": 5210 |
| }, |
| { |
| "epoch": 16.555555555555557, |
| "grad_norm": 0.00012973738193977624, |
| "learning_rate": 4.39096274585184e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1117040, |
| "step": 5215 |
| }, |
| { |
| "epoch": 16.571428571428573, |
| "grad_norm": 0.0009239883511327207, |
| "learning_rate": 4.3518368201000834e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1118032, |
| "step": 5220 |
| }, |
| { |
| "epoch": 16.58730158730159, |
| "grad_norm": 0.00019911407434847206, |
| "learning_rate": 4.312869367166875e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1119152, |
| "step": 5225 |
| }, |
| { |
| "epoch": 16.603174603174605, |
| "grad_norm": 0.0006706177373416722, |
| "learning_rate": 4.274060686123959e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1120192, |
| "step": 5230 |
| }, |
| { |
| "epoch": 16.61904761904762, |
| "grad_norm": 0.00014612732047680765, |
| "learning_rate": 4.235411074824524e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1121280, |
| "step": 5235 |
| }, |
| { |
| "epoch": 16.634920634920636, |
| "grad_norm": 0.0002357373887207359, |
| "learning_rate": 4.196920829900891e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1122352, |
| "step": 5240 |
| }, |
| { |
| "epoch": 16.650793650793652, |
| "grad_norm": 0.001285207225009799, |
| "learning_rate": 4.158590246762279e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1123376, |
| "step": 5245 |
| }, |
| { |
| "epoch": 16.666666666666668, |
| "grad_norm": 8.80981533555314e-05, |
| "learning_rate": 4.120419619592511e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1124400, |
| "step": 5250 |
| }, |
| { |
| "epoch": 16.682539682539684, |
| "grad_norm": 0.00017228191427420825, |
| "learning_rate": 4.082409241347754e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1125440, |
| "step": 5255 |
| }, |
| { |
| "epoch": 16.6984126984127, |
| "grad_norm": 0.0001255661336472258, |
| "learning_rate": 4.044559403754294e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1126464, |
| "step": 5260 |
| }, |
| { |
| "epoch": 16.714285714285715, |
| "grad_norm": 0.009000623598694801, |
| "learning_rate": 4.006870397306256e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1127552, |
| "step": 5265 |
| }, |
| { |
| "epoch": 16.73015873015873, |
| "grad_norm": 0.000503276998642832, |
| "learning_rate": 3.969342511263441e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1128640, |
| "step": 5270 |
| }, |
| { |
| "epoch": 16.746031746031747, |
| "grad_norm": 0.00038995477370917797, |
| "learning_rate": 3.931976033649021e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1129712, |
| "step": 5275 |
| }, |
| { |
| "epoch": 16.761904761904763, |
| "grad_norm": 0.00010485357779543847, |
| "learning_rate": 3.8947712512474085e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1130800, |
| "step": 5280 |
| }, |
| { |
| "epoch": 16.77777777777778, |
| "grad_norm": 7.115237531252205e-05, |
| "learning_rate": 3.857728449601991e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1131888, |
| "step": 5285 |
| }, |
| { |
| "epoch": 16.793650793650794, |
| "grad_norm": 0.0003514425188768655, |
| "learning_rate": 3.820847913012987e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1132960, |
| "step": 5290 |
| }, |
| { |
| "epoch": 16.80952380952381, |
| "grad_norm": 0.00010385180939920247, |
| "learning_rate": 3.784129924535243e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1133984, |
| "step": 5295 |
| }, |
| { |
| "epoch": 16.825396825396826, |
| "grad_norm": 0.00014585713506676257, |
| "learning_rate": 3.7475747659760502e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1135088, |
| "step": 5300 |
| }, |
| { |
| "epoch": 16.841269841269842, |
| "grad_norm": 9.117177978623658e-05, |
| "learning_rate": 3.7111827178930108e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1136176, |
| "step": 5305 |
| }, |
| { |
| "epoch": 16.857142857142858, |
| "grad_norm": 0.00011191629891982302, |
| "learning_rate": 3.6749540595918675e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1137280, |
| "step": 5310 |
| }, |
| { |
| "epoch": 16.873015873015873, |
| "grad_norm": 0.00021030911011621356, |
| "learning_rate": 3.6388890691243403e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1138320, |
| "step": 5315 |
| }, |
| { |
| "epoch": 16.88888888888889, |
| "grad_norm": 0.004643842577934265, |
| "learning_rate": 3.6029880232860413e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1139360, |
| "step": 5320 |
| }, |
| { |
| "epoch": 16.904761904761905, |
| "grad_norm": 0.00018325127894058824, |
| "learning_rate": 3.5672511976142963e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1140448, |
| "step": 5325 |
| }, |
| { |
| "epoch": 16.92063492063492, |
| "grad_norm": 0.00018302863463759422, |
| "learning_rate": 3.531678866386076e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1141488, |
| "step": 5330 |
| }, |
| { |
| "epoch": 16.936507936507937, |
| "grad_norm": 0.0017416487680748105, |
| "learning_rate": 3.4962713026158694e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1142656, |
| "step": 5335 |
| }, |
| { |
| "epoch": 16.952380952380953, |
| "grad_norm": 0.0002722022181842476, |
| "learning_rate": 3.461028778053571e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1143728, |
| "step": 5340 |
| }, |
| { |
| "epoch": 16.96825396825397, |
| "grad_norm": 0.0025406088680028915, |
| "learning_rate": 3.4259515631824306e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1144880, |
| "step": 5345 |
| }, |
| { |
| "epoch": 16.984126984126984, |
| "grad_norm": 0.000209580481168814, |
| "learning_rate": 3.3910399272169657e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1146032, |
| "step": 5350 |
| }, |
| { |
| "epoch": 17.0, |
| "grad_norm": 0.0001166212823591195, |
| "learning_rate": 3.356294138100868e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1147088, |
| "step": 5355 |
| }, |
| { |
| "epoch": 17.0, |
| "eval_loss": 0.20359259843826294, |
| "eval_runtime": 1.4479, |
| "eval_samples_per_second": 48.345, |
| "eval_steps_per_second": 24.173, |
| "num_input_tokens_seen": 1147088, |
| "step": 5355 |
| }, |
| { |
| "epoch": 17.015873015873016, |
| "grad_norm": 0.00018003687728196383, |
| "learning_rate": 3.321714462504999e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1148144, |
| "step": 5360 |
| }, |
| { |
| "epoch": 17.03174603174603, |
| "grad_norm": 0.0001391788391629234, |
| "learning_rate": 3.2873011658252796e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1149248, |
| "step": 5365 |
| }, |
| { |
| "epoch": 17.047619047619047, |
| "grad_norm": 0.00013040899648331106, |
| "learning_rate": 3.2530545121807145e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1150384, |
| "step": 5370 |
| }, |
| { |
| "epoch": 17.063492063492063, |
| "grad_norm": 0.00015444679593201727, |
| "learning_rate": 3.2189747644113365e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1151424, |
| "step": 5375 |
| }, |
| { |
| "epoch": 17.07936507936508, |
| "grad_norm": 6.173488509375602e-05, |
| "learning_rate": 3.185062184076168e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1152512, |
| "step": 5380 |
| }, |
| { |
| "epoch": 17.095238095238095, |
| "grad_norm": 0.003296441398561001, |
| "learning_rate": 3.151317031451259e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1153568, |
| "step": 5385 |
| }, |
| { |
| "epoch": 17.11111111111111, |
| "grad_norm": 0.00010462482168804854, |
| "learning_rate": 3.1177395655276635e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1154672, |
| "step": 5390 |
| }, |
| { |
| "epoch": 17.126984126984127, |
| "grad_norm": 0.003144120331853628, |
| "learning_rate": 3.0843300440094397e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1155776, |
| "step": 5395 |
| }, |
| { |
| "epoch": 17.142857142857142, |
| "grad_norm": 0.0001563921687193215, |
| "learning_rate": 3.0510887233117096e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1156784, |
| "step": 5400 |
| }, |
| { |
| "epoch": 17.158730158730158, |
| "grad_norm": 0.00010624121932778507, |
| "learning_rate": 3.0180158585586397e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1157888, |
| "step": 5405 |
| }, |
| { |
| "epoch": 17.174603174603174, |
| "grad_norm": 8.494222856825218e-05, |
| "learning_rate": 2.98511170358155e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1158912, |
| "step": 5410 |
| }, |
| { |
| "epoch": 17.19047619047619, |
| "grad_norm": 0.00045355354086495936, |
| "learning_rate": 2.9523765109169017e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1159968, |
| "step": 5415 |
| }, |
| { |
| "epoch": 17.206349206349206, |
| "grad_norm": 9.295267227571458e-05, |
| "learning_rate": 2.9198105318043816e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1161008, |
| "step": 5420 |
| }, |
| { |
| "epoch": 17.22222222222222, |
| "grad_norm": 0.0004320423468016088, |
| "learning_rate": 2.8874140161849917e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1162064, |
| "step": 5425 |
| }, |
| { |
| "epoch": 17.238095238095237, |
| "grad_norm": 0.00011025634739780799, |
| "learning_rate": 2.8551872126991147e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1163104, |
| "step": 5430 |
| }, |
| { |
| "epoch": 17.253968253968253, |
| "grad_norm": 0.00024053626111708581, |
| "learning_rate": 2.8231303686846124e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1164160, |
| "step": 5435 |
| }, |
| { |
| "epoch": 17.26984126984127, |
| "grad_norm": 0.0006347851594910026, |
| "learning_rate": 2.7912437301749026e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1165216, |
| "step": 5440 |
| }, |
| { |
| "epoch": 17.285714285714285, |
| "grad_norm": 0.004288922995328903, |
| "learning_rate": 2.759527541897103e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1166240, |
| "step": 5445 |
| }, |
| { |
| "epoch": 17.3015873015873, |
| "grad_norm": 0.00011301678750896826, |
| "learning_rate": 2.7279820472701554e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1167296, |
| "step": 5450 |
| }, |
| { |
| "epoch": 17.317460317460316, |
| "grad_norm": 0.0005049300380051136, |
| "learning_rate": 2.6966074884029164e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1168384, |
| "step": 5455 |
| }, |
| { |
| "epoch": 17.333333333333332, |
| "grad_norm": 0.00013295926328282803, |
| "learning_rate": 2.665404106092348e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1169520, |
| "step": 5460 |
| }, |
| { |
| "epoch": 17.349206349206348, |
| "grad_norm": 0.006194146350026131, |
| "learning_rate": 2.634372139821631e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1170608, |
| "step": 5465 |
| }, |
| { |
| "epoch": 17.365079365079364, |
| "grad_norm": 0.00011388435086701065, |
| "learning_rate": 2.603511827758351e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1171696, |
| "step": 5470 |
| }, |
| { |
| "epoch": 17.38095238095238, |
| "grad_norm": 0.0018900822615250945, |
| "learning_rate": 2.57282340675267e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1172752, |
| "step": 5475 |
| }, |
| { |
| "epoch": 17.396825396825395, |
| "grad_norm": 0.0002572809753473848, |
| "learning_rate": 2.5423071123354845e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1173792, |
| "step": 5480 |
| }, |
| { |
| "epoch": 17.41269841269841, |
| "grad_norm": 0.0001357399160042405, |
| "learning_rate": 2.5119631787166474e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1174848, |
| "step": 5485 |
| }, |
| { |
| "epoch": 17.428571428571427, |
| "grad_norm": 0.007136253640055656, |
| "learning_rate": 2.4817918387831594e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1175920, |
| "step": 5490 |
| }, |
| { |
| "epoch": 17.444444444444443, |
| "grad_norm": 0.001479033729992807, |
| "learning_rate": 2.451793324097365e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1177024, |
| "step": 5495 |
| }, |
| { |
| "epoch": 17.46031746031746, |
| "grad_norm": 0.00012026441982015967, |
| "learning_rate": 2.421967864895211e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1178016, |
| "step": 5500 |
| }, |
| { |
| "epoch": 17.476190476190474, |
| "grad_norm": 8.03721122792922e-05, |
| "learning_rate": 2.3923156900844372e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1179136, |
| "step": 5505 |
| }, |
| { |
| "epoch": 17.49206349206349, |
| "grad_norm": 7.77265740907751e-05, |
| "learning_rate": 2.3628370272428564e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1180224, |
| "step": 5510 |
| }, |
| { |
| "epoch": 17.507936507936506, |
| "grad_norm": 0.002345818327739835, |
| "learning_rate": 2.3335321026165895e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1181312, |
| "step": 5515 |
| }, |
| { |
| "epoch": 17.523809523809526, |
| "grad_norm": 0.00011413685570005327, |
| "learning_rate": 2.304401141118326e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1182352, |
| "step": 5520 |
| }, |
| { |
| "epoch": 17.53968253968254, |
| "grad_norm": 0.00014744508371222764, |
| "learning_rate": 2.275444366325613e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1183504, |
| "step": 5525 |
| }, |
| { |
| "epoch": 17.555555555555557, |
| "grad_norm": 0.0006693506147712469, |
| "learning_rate": 2.2466620004791244e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1184608, |
| "step": 5530 |
| }, |
| { |
| "epoch": 17.571428571428573, |
| "grad_norm": 0.0002359232894377783, |
| "learning_rate": 2.2180542644809564e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1185648, |
| "step": 5535 |
| }, |
| { |
| "epoch": 17.58730158730159, |
| "grad_norm": 0.00016359401342924684, |
| "learning_rate": 2.1896213778929533e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1186704, |
| "step": 5540 |
| }, |
| { |
| "epoch": 17.603174603174605, |
| "grad_norm": 0.0006329666939564049, |
| "learning_rate": 2.1613635589349756e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1187744, |
| "step": 5545 |
| }, |
| { |
| "epoch": 17.61904761904762, |
| "grad_norm": 0.004720310214906931, |
| "learning_rate": 2.133281024483297e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1188784, |
| "step": 5550 |
| }, |
| { |
| "epoch": 17.634920634920636, |
| "grad_norm": 0.001770343049429357, |
| "learning_rate": 2.105373990068862e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1189808, |
| "step": 5555 |
| }, |
| { |
| "epoch": 17.650793650793652, |
| "grad_norm": 0.0001479105558246374, |
| "learning_rate": 2.077642669875679e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1190880, |
| "step": 5560 |
| }, |
| { |
| "epoch": 17.666666666666668, |
| "grad_norm": 9.104243508772925e-05, |
| "learning_rate": 2.050087276739171e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1192032, |
| "step": 5565 |
| }, |
| { |
| "epoch": 17.682539682539684, |
| "grad_norm": 0.00013047012907918543, |
| "learning_rate": 2.0227080221445345e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1193136, |
| "step": 5570 |
| }, |
| { |
| "epoch": 17.6984126984127, |
| "grad_norm": 0.00011812873708549887, |
| "learning_rate": 1.9955051162251216e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1194208, |
| "step": 5575 |
| }, |
| { |
| "epoch": 17.714285714285715, |
| "grad_norm": 0.003984814044088125, |
| "learning_rate": 1.968478767760812e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1195216, |
| "step": 5580 |
| }, |
| { |
| "epoch": 17.73015873015873, |
| "grad_norm": 0.0007169300224632025, |
| "learning_rate": 1.941629184176422e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1196272, |
| "step": 5585 |
| }, |
| { |
| "epoch": 17.746031746031747, |
| "grad_norm": 0.0008787508704699576, |
| "learning_rate": 1.9149565715401415e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1197328, |
| "step": 5590 |
| }, |
| { |
| "epoch": 17.761904761904763, |
| "grad_norm": 0.00010687024041544646, |
| "learning_rate": 1.8884611345618863e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1198368, |
| "step": 5595 |
| }, |
| { |
| "epoch": 17.77777777777778, |
| "grad_norm": 0.00012383893772494048, |
| "learning_rate": 1.8621430765917964e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1199488, |
| "step": 5600 |
| }, |
| { |
| "epoch": 17.793650793650794, |
| "grad_norm": 0.00018226118118036538, |
| "learning_rate": 1.8360025996186137e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1200592, |
| "step": 5605 |
| }, |
| { |
| "epoch": 17.80952380952381, |
| "grad_norm": 9.4871676992625e-05, |
| "learning_rate": 1.8100399042681848e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1201584, |
| "step": 5610 |
| }, |
| { |
| "epoch": 17.825396825396826, |
| "grad_norm": 0.00025049285613931715, |
| "learning_rate": 1.784255189801895e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1202656, |
| "step": 5615 |
| }, |
| { |
| "epoch": 17.841269841269842, |
| "grad_norm": 0.0001846710656536743, |
| "learning_rate": 1.7586486541151303e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1203744, |
| "step": 5620 |
| }, |
| { |
| "epoch": 17.857142857142858, |
| "grad_norm": 0.0005062821437604725, |
| "learning_rate": 1.7332204937357793e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1204864, |
| "step": 5625 |
| }, |
| { |
| "epoch": 17.873015873015873, |
| "grad_norm": 8.464482380077243e-05, |
| "learning_rate": 1.7079709038227227e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1205920, |
| "step": 5630 |
| }, |
| { |
| "epoch": 17.88888888888889, |
| "grad_norm": 8.669216913403943e-05, |
| "learning_rate": 1.6829000781643094e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1206992, |
| "step": 5635 |
| }, |
| { |
| "epoch": 17.904761904761905, |
| "grad_norm": 0.00011535276280483231, |
| "learning_rate": 1.6580082091769088e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1208048, |
| "step": 5640 |
| }, |
| { |
| "epoch": 17.92063492063492, |
| "grad_norm": 0.0006412939983420074, |
| "learning_rate": 1.633295487903394e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1209104, |
| "step": 5645 |
| }, |
| { |
| "epoch": 17.936507936507937, |
| "grad_norm": 0.00021313075558282435, |
| "learning_rate": 1.6087621040117157e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1210160, |
| "step": 5650 |
| }, |
| { |
| "epoch": 17.952380952380953, |
| "grad_norm": 0.0002721626660786569, |
| "learning_rate": 1.5844082457934145e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1211264, |
| "step": 5655 |
| }, |
| { |
| "epoch": 17.96825396825397, |
| "grad_norm": 9.288136789109558e-05, |
| "learning_rate": 1.5602341001621834e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1212288, |
| "step": 5660 |
| }, |
| { |
| "epoch": 17.984126984126984, |
| "grad_norm": 0.00033692075521685183, |
| "learning_rate": 1.5362398526524463e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1213360, |
| "step": 5665 |
| }, |
| { |
| "epoch": 18.0, |
| "grad_norm": 0.00020394432067405432, |
| "learning_rate": 1.5124256874179288e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1214432, |
| "step": 5670 |
| }, |
| { |
| "epoch": 18.0, |
| "eval_loss": 0.20379270613193512, |
| "eval_runtime": 1.4505, |
| "eval_samples_per_second": 48.26, |
| "eval_steps_per_second": 24.13, |
| "num_input_tokens_seen": 1214432, |
| "step": 5670 |
| }, |
| { |
| "epoch": 18.015873015873016, |
| "grad_norm": 0.00013542307715397328, |
| "learning_rate": 1.4887917872302231e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1215504, |
| "step": 5675 |
| }, |
| { |
| "epoch": 18.03174603174603, |
| "grad_norm": 0.000211275095352903, |
| "learning_rate": 1.465338333477423e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1216560, |
| "step": 5680 |
| }, |
| { |
| "epoch": 18.047619047619047, |
| "grad_norm": 0.0007974806358106434, |
| "learning_rate": 1.4420655061626932e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1217584, |
| "step": 5685 |
| }, |
| { |
| "epoch": 18.063492063492063, |
| "grad_norm": 0.0027897171676158905, |
| "learning_rate": 1.4189734839029273e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1218720, |
| "step": 5690 |
| }, |
| { |
| "epoch": 18.07936507936508, |
| "grad_norm": 7.086082769092172e-05, |
| "learning_rate": 1.3960624439273428e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1219744, |
| "step": 5695 |
| }, |
| { |
| "epoch": 18.095238095238095, |
| "grad_norm": 6.145464431028813e-05, |
| "learning_rate": 1.3733325620761294e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1220912, |
| "step": 5700 |
| }, |
| { |
| "epoch": 18.11111111111111, |
| "grad_norm": 0.0001662890863372013, |
| "learning_rate": 1.3507840127991138e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1221904, |
| "step": 5705 |
| }, |
| { |
| "epoch": 18.126984126984127, |
| "grad_norm": 9.012035297928378e-05, |
| "learning_rate": 1.328416969154414e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1222944, |
| "step": 5710 |
| }, |
| { |
| "epoch": 18.142857142857142, |
| "grad_norm": 0.0004921794170513749, |
| "learning_rate": 1.3062316028071065e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1224016, |
| "step": 5715 |
| }, |
| { |
| "epoch": 18.158730158730158, |
| "grad_norm": 0.00010398301674285904, |
| "learning_rate": 1.2842280840278997e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1225120, |
| "step": 5720 |
| }, |
| { |
| "epoch": 18.174603174603174, |
| "grad_norm": 7.405834185192361e-05, |
| "learning_rate": 1.2624065816918413e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1226160, |
| "step": 5725 |
| }, |
| { |
| "epoch": 18.19047619047619, |
| "grad_norm": 0.0009600871126167476, |
| "learning_rate": 1.2407672632770374e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1227296, |
| "step": 5730 |
| }, |
| { |
| "epoch": 18.206349206349206, |
| "grad_norm": 0.00020508236775640398, |
| "learning_rate": 1.219310294863324e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1228352, |
| "step": 5735 |
| }, |
| { |
| "epoch": 18.22222222222222, |
| "grad_norm": 7.430932600982487e-05, |
| "learning_rate": 1.1980358411310344e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1229344, |
| "step": 5740 |
| }, |
| { |
| "epoch": 18.238095238095237, |
| "grad_norm": 0.00010048592957900837, |
| "learning_rate": 1.1769440653597141e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1230416, |
| "step": 5745 |
| }, |
| { |
| "epoch": 18.253968253968253, |
| "grad_norm": 9.934873378369957e-05, |
| "learning_rate": 1.1560351294268579e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1231488, |
| "step": 5750 |
| }, |
| { |
| "epoch": 18.26984126984127, |
| "grad_norm": 0.0001546377025078982, |
| "learning_rate": 1.1353091938067023e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1232512, |
| "step": 5755 |
| }, |
| { |
| "epoch": 18.285714285714285, |
| "grad_norm": 6.141650374047458e-05, |
| "learning_rate": 1.1147664175689577e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1233568, |
| "step": 5760 |
| }, |
| { |
| "epoch": 18.3015873015873, |
| "grad_norm": 0.00037183158565312624, |
| "learning_rate": 1.0944069583776057e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1234656, |
| "step": 5765 |
| }, |
| { |
| "epoch": 18.317460317460316, |
| "grad_norm": 9.08130532479845e-05, |
| "learning_rate": 1.0742309724896925e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1235760, |
| "step": 5770 |
| }, |
| { |
| "epoch": 18.333333333333332, |
| "grad_norm": 0.0001254588714800775, |
| "learning_rate": 1.0542386147541133e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1236816, |
| "step": 5775 |
| }, |
| { |
| "epoch": 18.349206349206348, |
| "grad_norm": 0.0001581174583407119, |
| "learning_rate": 1.03443003861044e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1237920, |
| "step": 5780 |
| }, |
| { |
| "epoch": 18.365079365079364, |
| "grad_norm": 0.0005327375256456435, |
| "learning_rate": 1.0148053960877396e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1238960, |
| "step": 5785 |
| }, |
| { |
| "epoch": 18.38095238095238, |
| "grad_norm": 0.00010738349374150857, |
| "learning_rate": 9.95364837803392e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1239984, |
| "step": 5790 |
| }, |
| { |
| "epoch": 18.396825396825395, |
| "grad_norm": 0.00017572024080436677, |
| "learning_rate": 9.761085129619597e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1241104, |
| "step": 5795 |
| }, |
| { |
| "epoch": 18.41269841269841, |
| "grad_norm": 0.00010885854135267437, |
| "learning_rate": 9.570365693540251e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1242192, |
| "step": 5800 |
| }, |
| { |
| "epoch": 18.428571428571427, |
| "grad_norm": 0.005107260309159756, |
| "learning_rate": 9.381491533550612e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1243248, |
| "step": 5805 |
| }, |
| { |
| "epoch": 18.444444444444443, |
| "grad_norm": 0.0001170175164588727, |
| "learning_rate": 9.194464099243128e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1244304, |
| "step": 5810 |
| }, |
| { |
| "epoch": 18.46031746031746, |
| "grad_norm": 0.00015847616305109113, |
| "learning_rate": 9.009284826036691e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1245344, |
| "step": 5815 |
| }, |
| { |
| "epoch": 18.476190476190474, |
| "grad_norm": 8.687510126037523e-05, |
| "learning_rate": 8.825955135165764e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1246400, |
| "step": 5820 |
| }, |
| { |
| "epoch": 18.49206349206349, |
| "grad_norm": 0.0011609104694798589, |
| "learning_rate": 8.64447643366953e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1247488, |
| "step": 5825 |
| }, |
| { |
| "epoch": 18.507936507936506, |
| "grad_norm": 0.004330518189817667, |
| "learning_rate": 8.464850114380807e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1248576, |
| "step": 5830 |
| }, |
| { |
| "epoch": 18.523809523809526, |
| "grad_norm": 0.00015605830412823707, |
| "learning_rate": 8.287077555915706e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1249648, |
| "step": 5835 |
| }, |
| { |
| "epoch": 18.53968253968254, |
| "grad_norm": 0.0001275492977583781, |
| "learning_rate": 8.111160122662748e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1250704, |
| "step": 5840 |
| }, |
| { |
| "epoch": 18.555555555555557, |
| "grad_norm": 0.00016683620924595743, |
| "learning_rate": 7.937099164772699e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1251728, |
| "step": 5845 |
| }, |
| { |
| "epoch": 18.571428571428573, |
| "grad_norm": 0.0003593855944927782, |
| "learning_rate": 7.764896018147921e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1252816, |
| "step": 5850 |
| }, |
| { |
| "epoch": 18.58730158730159, |
| "grad_norm": 0.0005089179612696171, |
| "learning_rate": 7.594552004432265e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1253920, |
| "step": 5855 |
| }, |
| { |
| "epoch": 18.603174603174605, |
| "grad_norm": 0.006525584030896425, |
| "learning_rate": 7.426068431000882e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1254976, |
| "step": 5860 |
| }, |
| { |
| "epoch": 18.61904761904762, |
| "grad_norm": 0.00029004551470279694, |
| "learning_rate": 7.259446590950264e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1256128, |
| "step": 5865 |
| }, |
| { |
| "epoch": 18.634920634920636, |
| "grad_norm": 0.0001299941068282351, |
| "learning_rate": 7.094687763088248e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1257200, |
| "step": 5870 |
| }, |
| { |
| "epoch": 18.650793650793652, |
| "grad_norm": 0.00023286275973077863, |
| "learning_rate": 6.931793211924192e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1258256, |
| "step": 5875 |
| }, |
| { |
| "epoch": 18.666666666666668, |
| "grad_norm": 0.00012554308341350406, |
| "learning_rate": 6.770764187659262e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1259312, |
| "step": 5880 |
| }, |
| { |
| "epoch": 18.682539682539684, |
| "grad_norm": 0.002065224340185523, |
| "learning_rate": 6.611601926177019e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1260352, |
| "step": 5885 |
| }, |
| { |
| "epoch": 18.6984126984127, |
| "grad_norm": 6.723072146996856e-05, |
| "learning_rate": 6.454307649033569e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1261456, |
| "step": 5890 |
| }, |
| { |
| "epoch": 18.714285714285715, |
| "grad_norm": 0.00010569453297648579, |
| "learning_rate": 6.298882563448599e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1262560, |
| "step": 5895 |
| }, |
| { |
| "epoch": 18.73015873015873, |
| "grad_norm": 0.0001583786215633154, |
| "learning_rate": 6.145327862295824e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1263648, |
| "step": 5900 |
| }, |
| { |
| "epoch": 18.746031746031747, |
| "grad_norm": 0.00013246589514892548, |
| "learning_rate": 5.993644724093888e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1264768, |
| "step": 5905 |
| }, |
| { |
| "epoch": 18.761904761904763, |
| "grad_norm": 8.11208738014102e-05, |
| "learning_rate": 5.843834312997481e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1265888, |
| "step": 5910 |
| }, |
| { |
| "epoch": 18.77777777777778, |
| "grad_norm": 8.87354981387034e-05, |
| "learning_rate": 5.695897778788151e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1266976, |
| "step": 5915 |
| }, |
| { |
| "epoch": 18.793650793650794, |
| "grad_norm": 0.00012595752195920795, |
| "learning_rate": 5.549836256865642e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1268064, |
| "step": 5920 |
| }, |
| { |
| "epoch": 18.80952380952381, |
| "grad_norm": 0.0027428085450083017, |
| "learning_rate": 5.405650868239242e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1269104, |
| "step": 5925 |
| }, |
| { |
| "epoch": 18.825396825396826, |
| "grad_norm": 0.008050281554460526, |
| "learning_rate": 5.263342719518921e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1270192, |
| "step": 5930 |
| }, |
| { |
| "epoch": 18.841269841269842, |
| "grad_norm": 0.0020299924071878195, |
| "learning_rate": 5.122912902907145e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1271264, |
| "step": 5935 |
| }, |
| { |
| "epoch": 18.857142857142858, |
| "grad_norm": 0.00032130113686434925, |
| "learning_rate": 4.98436249619022e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1272368, |
| "step": 5940 |
| }, |
| { |
| "epoch": 18.873015873015873, |
| "grad_norm": 0.002536200685426593, |
| "learning_rate": 4.847692562730238e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1273440, |
| "step": 5945 |
| }, |
| { |
| "epoch": 18.88888888888889, |
| "grad_norm": 0.000218776855035685, |
| "learning_rate": 4.712904151456865e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1274528, |
| "step": 5950 |
| }, |
| { |
| "epoch": 18.904761904761905, |
| "grad_norm": 0.00021643297804985195, |
| "learning_rate": 4.579998296859067e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1275584, |
| "step": 5955 |
| }, |
| { |
| "epoch": 18.92063492063492, |
| "grad_norm": 0.00038543707341887057, |
| "learning_rate": 4.448976018977563e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1276672, |
| "step": 5960 |
| }, |
| { |
| "epoch": 18.936507936507937, |
| "grad_norm": 0.0002801400551106781, |
| "learning_rate": 4.319838323396691e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1277792, |
| "step": 5965 |
| }, |
| { |
| "epoch": 18.952380952380953, |
| "grad_norm": 0.00012454042735043913, |
| "learning_rate": 4.192586201236748e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1278832, |
| "step": 5970 |
| }, |
| { |
| "epoch": 18.96825396825397, |
| "grad_norm": 0.0001259175915038213, |
| "learning_rate": 4.067220629146523e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1279888, |
| "step": 5975 |
| }, |
| { |
| "epoch": 18.984126984126984, |
| "grad_norm": 0.00011188061034772545, |
| "learning_rate": 3.943742569295583e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1281008, |
| "step": 5980 |
| }, |
| { |
| "epoch": 19.0, |
| "grad_norm": 0.0001576906506670639, |
| "learning_rate": 3.8221529693671375e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1282000, |
| "step": 5985 |
| }, |
| { |
| "epoch": 19.0, |
| "eval_loss": 0.2037855088710785, |
| "eval_runtime": 1.4442, |
| "eval_samples_per_second": 48.469, |
| "eval_steps_per_second": 24.234, |
| "num_input_tokens_seen": 1282000, |
| "step": 5985 |
| }, |
| { |
| "epoch": 19.015873015873016, |
| "grad_norm": 0.00013823146582581103, |
| "learning_rate": 3.702452762550546e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1283104, |
| "step": 5990 |
| }, |
| { |
| "epoch": 19.03174603174603, |
| "grad_norm": 9.556670556776226e-05, |
| "learning_rate": 3.5846428675342657e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1284096, |
| "step": 5995 |
| }, |
| { |
| "epoch": 19.047619047619047, |
| "grad_norm": 0.0001208999747177586, |
| "learning_rate": 3.468724188498751e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1285152, |
| "step": 6000 |
| }, |
| { |
| "epoch": 19.063492063492063, |
| "grad_norm": 0.0001655189407756552, |
| "learning_rate": 3.3546976151095924e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1286240, |
| "step": 6005 |
| }, |
| { |
| "epoch": 19.07936507936508, |
| "grad_norm": 0.0003586008388083428, |
| "learning_rate": 3.242564022510608e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1287328, |
| "step": 6010 |
| }, |
| { |
| "epoch": 19.095238095238095, |
| "grad_norm": 0.00048587197670713067, |
| "learning_rate": 3.132324271317183e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1288448, |
| "step": 6015 |
| }, |
| { |
| "epoch": 19.11111111111111, |
| "grad_norm": 0.00016730540664866567, |
| "learning_rate": 3.0239792076095506e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1289456, |
| "step": 6020 |
| }, |
| { |
| "epoch": 19.126984126984127, |
| "grad_norm": 0.0003036040288861841, |
| "learning_rate": 2.9175296629265493e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1290512, |
| "step": 6025 |
| }, |
| { |
| "epoch": 19.142857142857142, |
| "grad_norm": 0.0002358776400797069, |
| "learning_rate": 2.8129764542589033e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1291584, |
| "step": 6030 |
| }, |
| { |
| "epoch": 19.158730158730158, |
| "grad_norm": 0.007324682082980871, |
| "learning_rate": 2.71032038404323e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1292624, |
| "step": 6035 |
| }, |
| { |
| "epoch": 19.174603174603174, |
| "grad_norm": 0.00022556190378963947, |
| "learning_rate": 2.609562240155766e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1293680, |
| "step": 6040 |
| }, |
| { |
| "epoch": 19.19047619047619, |
| "grad_norm": 0.0002528807381168008, |
| "learning_rate": 2.510702795906289e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1294800, |
| "step": 6045 |
| }, |
| { |
| "epoch": 19.206349206349206, |
| "grad_norm": 0.00019078988407272846, |
| "learning_rate": 2.413742810032288e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1295888, |
| "step": 6050 |
| }, |
| { |
| "epoch": 19.22222222222222, |
| "grad_norm": 0.00042369638686068356, |
| "learning_rate": 2.318683026692997e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1296976, |
| "step": 6055 |
| }, |
| { |
| "epoch": 19.238095238095237, |
| "grad_norm": 0.00010991137969540432, |
| "learning_rate": 2.2255241754638167e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1298048, |
| "step": 6060 |
| }, |
| { |
| "epoch": 19.253968253968253, |
| "grad_norm": 0.00027539842994883657, |
| "learning_rate": 2.1342669713307063e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1299088, |
| "step": 6065 |
| }, |
| { |
| "epoch": 19.26984126984127, |
| "grad_norm": 9.853248775471002e-05, |
| "learning_rate": 2.0449121146845774e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1300144, |
| "step": 6070 |
| }, |
| { |
| "epoch": 19.285714285714285, |
| "grad_norm": 0.00015365192666649818, |
| "learning_rate": 1.9574602913159934e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1301296, |
| "step": 6075 |
| }, |
| { |
| "epoch": 19.3015873015873, |
| "grad_norm": 0.00011225073831155896, |
| "learning_rate": 1.8719121724099508e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1302320, |
| "step": 6080 |
| }, |
| { |
| "epoch": 19.317460317460316, |
| "grad_norm": 0.000277652230579406, |
| "learning_rate": 1.7882684145406614e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1303408, |
| "step": 6085 |
| }, |
| { |
| "epoch": 19.333333333333332, |
| "grad_norm": 9.394479275215417e-05, |
| "learning_rate": 1.706529659666556e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1304448, |
| "step": 6090 |
| }, |
| { |
| "epoch": 19.349206349206348, |
| "grad_norm": 0.004892684053629637, |
| "learning_rate": 1.6266965351252884e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1305552, |
| "step": 6095 |
| }, |
| { |
| "epoch": 19.365079365079364, |
| "grad_norm": 0.00015211277059279382, |
| "learning_rate": 1.5487696536290176e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1306592, |
| "step": 6100 |
| }, |
| { |
| "epoch": 19.38095238095238, |
| "grad_norm": 0.00013414997374638915, |
| "learning_rate": 1.472749613259661e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1307632, |
| "step": 6105 |
| }, |
| { |
| "epoch": 19.396825396825395, |
| "grad_norm": 0.00033985593472607434, |
| "learning_rate": 1.398636997464231e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1308720, |
| "step": 6110 |
| }, |
| { |
| "epoch": 19.41269841269841, |
| "grad_norm": 8.913094643503428e-05, |
| "learning_rate": 1.326432375050479e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1309824, |
| "step": 6115 |
| }, |
| { |
| "epoch": 19.428571428571427, |
| "grad_norm": 0.00010829928942257538, |
| "learning_rate": 1.2561363001824812e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1310832, |
| "step": 6120 |
| }, |
| { |
| "epoch": 19.444444444444443, |
| "grad_norm": 0.00025828013895079494, |
| "learning_rate": 1.1877493123763905e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1311936, |
| "step": 6125 |
| }, |
| { |
| "epoch": 19.46031746031746, |
| "grad_norm": 0.0003705843409989029, |
| "learning_rate": 1.1212719364962209e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1312992, |
| "step": 6130 |
| }, |
| { |
| "epoch": 19.476190476190474, |
| "grad_norm": 0.00011419185466365889, |
| "learning_rate": 1.0567046827499594e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1314080, |
| "step": 6135 |
| }, |
| { |
| "epoch": 19.49206349206349, |
| "grad_norm": 8.912238990888e-05, |
| "learning_rate": 9.940480466855417e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1315200, |
| "step": 6140 |
| }, |
| { |
| "epoch": 19.507936507936506, |
| "grad_norm": 0.00011151758371852338, |
| "learning_rate": 9.333025091870506e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1316256, |
| "step": 6145 |
| }, |
| { |
| "epoch": 19.523809523809526, |
| "grad_norm": 0.00011681997420964763, |
| "learning_rate": 8.744685364711624e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1317328, |
| "step": 6150 |
| }, |
| { |
| "epoch": 19.53968253968254, |
| "grad_norm": 0.000633031188044697, |
| "learning_rate": 8.17546580083317e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1318368, |
| "step": 6155 |
| }, |
| { |
| "epoch": 19.555555555555557, |
| "grad_norm": 0.0011906184954568744, |
| "learning_rate": 7.625370768944984e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1319440, |
| "step": 6160 |
| }, |
| { |
| "epoch": 19.571428571428573, |
| "grad_norm": 9.492343087913468e-05, |
| "learning_rate": 7.094404490977923e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1320464, |
| "step": 6165 |
| }, |
| { |
| "epoch": 19.58730158730159, |
| "grad_norm": 8.97967693163082e-05, |
| "learning_rate": 6.582571042050567e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1321552, |
| "step": 6170 |
| }, |
| { |
| "epoch": 19.603174603174605, |
| "grad_norm": 0.00012746088032145053, |
| "learning_rate": 6.089874350439506e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1322576, |
| "step": 6175 |
| }, |
| { |
| "epoch": 19.61904761904762, |
| "grad_norm": 0.001789805362932384, |
| "learning_rate": 5.6163181975477096e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1323632, |
| "step": 6180 |
| }, |
| { |
| "epoch": 19.634920634920636, |
| "grad_norm": 0.00017200844013132155, |
| "learning_rate": 5.161906217877044e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1324704, |
| "step": 6185 |
| }, |
| { |
| "epoch": 19.650793650793652, |
| "grad_norm": 0.0007840655161999166, |
| "learning_rate": 4.726641898998574e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1325840, |
| "step": 6190 |
| }, |
| { |
| "epoch": 19.666666666666668, |
| "grad_norm": 0.005137111991643906, |
| "learning_rate": 4.310528581527862e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1326896, |
| "step": 6195 |
| }, |
| { |
| "epoch": 19.682539682539684, |
| "grad_norm": 0.000191491621080786, |
| "learning_rate": 3.9135694590972104e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1328000, |
| "step": 6200 |
| }, |
| { |
| "epoch": 19.6984126984127, |
| "grad_norm": 7.77663808548823e-05, |
| "learning_rate": 3.5357675783331825e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1329056, |
| "step": 6205 |
| }, |
| { |
| "epoch": 19.714285714285715, |
| "grad_norm": 0.00060555204981938, |
| "learning_rate": 3.177125838830786e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1330144, |
| "step": 6210 |
| }, |
| { |
| "epoch": 19.73015873015873, |
| "grad_norm": 0.00011826303671114147, |
| "learning_rate": 2.837646993134324e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1331168, |
| "step": 6215 |
| }, |
| { |
| "epoch": 19.746031746031747, |
| "grad_norm": 0.00021305291738826782, |
| "learning_rate": 2.5173336467135267e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1332288, |
| "step": 6220 |
| }, |
| { |
| "epoch": 19.761904761904763, |
| "grad_norm": 0.00013271687203086913, |
| "learning_rate": 2.2161882579446735e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1333408, |
| "step": 6225 |
| }, |
| { |
| "epoch": 19.77777777777778, |
| "grad_norm": 0.0009385115699842572, |
| "learning_rate": 1.9342131380920005e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1334464, |
| "step": 6230 |
| }, |
| { |
| "epoch": 19.793650793650794, |
| "grad_norm": 0.00010659523832146078, |
| "learning_rate": 1.6714104512896568e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1335520, |
| "step": 6235 |
| }, |
| { |
| "epoch": 19.80952380952381, |
| "grad_norm": 0.0001576508511789143, |
| "learning_rate": 1.427782214524498e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1336624, |
| "step": 6240 |
| }, |
| { |
| "epoch": 19.825396825396826, |
| "grad_norm": 6.760272663086653e-05, |
| "learning_rate": 1.2033302976222071e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1337776, |
| "step": 6245 |
| }, |
| { |
| "epoch": 19.841269841269842, |
| "grad_norm": 0.000145541358506307, |
| "learning_rate": 9.980564232311973e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1338848, |
| "step": 6250 |
| }, |
| { |
| "epoch": 19.857142857142858, |
| "grad_norm": 0.0001003668294288218, |
| "learning_rate": 8.11962166809843e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1339920, |
| "step": 6255 |
| }, |
| { |
| "epoch": 19.873015873015873, |
| "grad_norm": 0.0012003045994788408, |
| "learning_rate": 6.450489566151019e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1341072, |
| "step": 6260 |
| }, |
| { |
| "epoch": 19.88888888888889, |
| "grad_norm": 0.00013508858683053404, |
| "learning_rate": 4.9731807369113316e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1342096, |
| "step": 6265 |
| }, |
| { |
| "epoch": 19.904761904761905, |
| "grad_norm": 0.00233248807489872, |
| "learning_rate": 3.687706518595846e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1343168, |
| "step": 6270 |
| }, |
| { |
| "epoch": 19.92063492063492, |
| "grad_norm": 0.00012182630598545074, |
| "learning_rate": 2.594076777104326e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1344208, |
| "step": 6275 |
| }, |
| { |
| "epoch": 19.936507936507937, |
| "grad_norm": 0.0001995829225052148, |
| "learning_rate": 1.692299905944883e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1345216, |
| "step": 6280 |
| }, |
| { |
| "epoch": 19.952380952380953, |
| "grad_norm": 7.580827514175326e-05, |
| "learning_rate": 9.823828261756873e-10, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1346240, |
| "step": 6285 |
| }, |
| { |
| "epoch": 19.96825396825397, |
| "grad_norm": 0.00014859516522847116, |
| "learning_rate": 4.643309863494594e-10, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1347312, |
| "step": 6290 |
| }, |
| { |
| "epoch": 19.984126984126984, |
| "grad_norm": 9.452983067603782e-05, |
| "learning_rate": 1.381483624662838e-10, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1348352, |
| "step": 6295 |
| }, |
| { |
| "epoch": 20.0, |
| "grad_norm": 6.697617936879396e-05, |
| "learning_rate": 3.837457948629997e-12, |
| "loss": 0.0, |
| "num_input_tokens_seen": 1349424, |
| "step": 6300 |
| }, |
| { |
| "epoch": 20.0, |
| "eval_loss": 0.20304246246814728, |
| "eval_runtime": 1.458, |
| "eval_samples_per_second": 48.011, |
| "eval_steps_per_second": 24.005, |
| "num_input_tokens_seen": 1349424, |
| "step": 6300 |
| }, |
| { |
| "epoch": 20.0, |
| "num_input_tokens_seen": 1349424, |
| "step": 6300, |
| "total_flos": 6.076395282353357e+16, |
| "train_loss": 0.10612090195964623, |
| "train_runtime": 586.1962, |
| "train_samples_per_second": 21.495, |
| "train_steps_per_second": 10.747 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 6300, |
| "num_input_tokens_seen": 1349424, |
| "num_train_epochs": 20, |
| "save_steps": 315, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 6.076395282353357e+16, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|