| { | |
| "best_global_step": 180, | |
| "best_metric": 0.23078909516334534, | |
| "best_model_checkpoint": "saves_multiple/prompt-tuning/llama-3-8b-instruct/train_copa_789_1760637873/checkpoint-180", | |
| "epoch": 20.0, | |
| "eval_steps": 90, | |
| "global_step": 1800, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.05555555555555555, | |
| "grad_norm": 0.97265625, | |
| "learning_rate": 0.0006666666666666666, | |
| "loss": 0.2431, | |
| "num_input_tokens_seen": 1632, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.1111111111111111, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 0.0015, | |
| "loss": 0.1204, | |
| "num_input_tokens_seen": 3232, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.16666666666666666, | |
| "grad_norm": 25.25, | |
| "learning_rate": 0.002333333333333333, | |
| "loss": 0.416, | |
| "num_input_tokens_seen": 4832, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.2222222222222222, | |
| "grad_norm": 448.0, | |
| "learning_rate": 0.0031666666666666666, | |
| "loss": 4.6587, | |
| "num_input_tokens_seen": 6432, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.2777777777777778, | |
| "grad_norm": 32.0, | |
| "learning_rate": 0.004, | |
| "loss": 2.6983, | |
| "num_input_tokens_seen": 7968, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.3333333333333333, | |
| "grad_norm": 80.0, | |
| "learning_rate": 0.004833333333333334, | |
| "loss": 1.4806, | |
| "num_input_tokens_seen": 9504, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.3888888888888889, | |
| "grad_norm": 25.0, | |
| "learning_rate": 0.005666666666666666, | |
| "loss": 1.5501, | |
| "num_input_tokens_seen": 11104, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.4444444444444444, | |
| "grad_norm": 3.265625, | |
| "learning_rate": 0.0065, | |
| "loss": 0.3955, | |
| "num_input_tokens_seen": 12704, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 0.007333333333333333, | |
| "loss": 0.5341, | |
| "num_input_tokens_seen": 14240, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.5555555555555556, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 0.008166666666666666, | |
| "loss": 0.3637, | |
| "num_input_tokens_seen": 15808, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.6111111111111112, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 0.009, | |
| "loss": 0.3436, | |
| "num_input_tokens_seen": 17344, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 34.0, | |
| "learning_rate": 0.009833333333333333, | |
| "loss": 0.4661, | |
| "num_input_tokens_seen": 18912, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.7222222222222222, | |
| "grad_norm": 10.5, | |
| "learning_rate": 0.010666666666666666, | |
| "loss": 2.1202, | |
| "num_input_tokens_seen": 20448, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.7777777777777778, | |
| "grad_norm": 13.3125, | |
| "learning_rate": 0.0115, | |
| "loss": 4.2431, | |
| "num_input_tokens_seen": 21984, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.8333333333333334, | |
| "grad_norm": 9.9375, | |
| "learning_rate": 0.012333333333333332, | |
| "loss": 1.4771, | |
| "num_input_tokens_seen": 23552, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.8888888888888888, | |
| "grad_norm": 0.4609375, | |
| "learning_rate": 0.013166666666666667, | |
| "loss": 0.4494, | |
| "num_input_tokens_seen": 25120, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.9444444444444444, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 0.014, | |
| "loss": 0.2653, | |
| "num_input_tokens_seen": 26656, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.1484375, | |
| "learning_rate": 0.014833333333333334, | |
| "loss": 0.2715, | |
| "num_input_tokens_seen": 28192, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.27601727843284607, | |
| "eval_runtime": 0.8043, | |
| "eval_samples_per_second": 49.735, | |
| "eval_steps_per_second": 12.434, | |
| "num_input_tokens_seen": 28192, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.0555555555555556, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 0.015666666666666666, | |
| "loss": 0.8311, | |
| "num_input_tokens_seen": 29792, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 1.1111111111111112, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 0.0165, | |
| "loss": 0.2497, | |
| "num_input_tokens_seen": 31328, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.1666666666666667, | |
| "grad_norm": 0.12255859375, | |
| "learning_rate": 0.017333333333333333, | |
| "loss": 0.2812, | |
| "num_input_tokens_seen": 32832, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 1.2222222222222223, | |
| "grad_norm": 0.10205078125, | |
| "learning_rate": 0.018166666666666664, | |
| "loss": 0.258, | |
| "num_input_tokens_seen": 34304, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.2777777777777777, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 0.019, | |
| "loss": 0.2346, | |
| "num_input_tokens_seen": 35840, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 1.3333333333333333, | |
| "grad_norm": 0.056640625, | |
| "learning_rate": 0.01983333333333333, | |
| "loss": 1.16, | |
| "num_input_tokens_seen": 37376, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.3888888888888888, | |
| "grad_norm": 0.019775390625, | |
| "learning_rate": 0.020666666666666667, | |
| "loss": 0.2455, | |
| "num_input_tokens_seen": 38944, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 1.4444444444444444, | |
| "grad_norm": 0.1103515625, | |
| "learning_rate": 0.0215, | |
| "loss": 0.2391, | |
| "num_input_tokens_seen": 40512, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 0.007568359375, | |
| "learning_rate": 0.022333333333333334, | |
| "loss": 0.2597, | |
| "num_input_tokens_seen": 42080, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 1.5555555555555556, | |
| "grad_norm": 0.04296875, | |
| "learning_rate": 0.023166666666666665, | |
| "loss": 0.244, | |
| "num_input_tokens_seen": 43680, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.6111111111111112, | |
| "grad_norm": 0.0257568359375, | |
| "learning_rate": 0.024, | |
| "loss": 0.235, | |
| "num_input_tokens_seen": 45216, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 1.6666666666666665, | |
| "grad_norm": 0.040283203125, | |
| "learning_rate": 0.024833333333333332, | |
| "loss": 0.2389, | |
| "num_input_tokens_seen": 46720, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.7222222222222223, | |
| "grad_norm": 0.00457763671875, | |
| "learning_rate": 0.025666666666666664, | |
| "loss": 0.2523, | |
| "num_input_tokens_seen": 48288, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 1.7777777777777777, | |
| "grad_norm": 0.01251220703125, | |
| "learning_rate": 0.0265, | |
| "loss": 0.2306, | |
| "num_input_tokens_seen": 49888, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.8333333333333335, | |
| "grad_norm": 0.0074462890625, | |
| "learning_rate": 0.02733333333333333, | |
| "loss": 0.2492, | |
| "num_input_tokens_seen": 51424, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 1.8888888888888888, | |
| "grad_norm": 0.00616455078125, | |
| "learning_rate": 0.028166666666666666, | |
| "loss": 0.2366, | |
| "num_input_tokens_seen": 52992, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.9444444444444444, | |
| "grad_norm": 0.0211181640625, | |
| "learning_rate": 0.028999999999999998, | |
| "loss": 0.2344, | |
| "num_input_tokens_seen": 54592, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.04443359375, | |
| "learning_rate": 0.029833333333333333, | |
| "loss": 0.2409, | |
| "num_input_tokens_seen": 56192, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.23078909516334534, | |
| "eval_runtime": 0.8102, | |
| "eval_samples_per_second": 49.373, | |
| "eval_steps_per_second": 12.343, | |
| "num_input_tokens_seen": 56192, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 2.0555555555555554, | |
| "grad_norm": 0.006591796875, | |
| "learning_rate": 0.02999954871719651, | |
| "loss": 0.2389, | |
| "num_input_tokens_seen": 57760, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 2.111111111111111, | |
| "grad_norm": 0.0233154296875, | |
| "learning_rate": 0.029997715427345868, | |
| "loss": 0.2297, | |
| "num_input_tokens_seen": 59264, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 2.1666666666666665, | |
| "grad_norm": 0.04296875, | |
| "learning_rate": 0.02999447209750064, | |
| "loss": 0.2269, | |
| "num_input_tokens_seen": 60768, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 2.2222222222222223, | |
| "grad_norm": 0.010009765625, | |
| "learning_rate": 0.02998981903258893, | |
| "loss": 0.2491, | |
| "num_input_tokens_seen": 62272, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 2.2777777777777777, | |
| "grad_norm": 0.0107421875, | |
| "learning_rate": 0.02998375667007787, | |
| "loss": 0.2373, | |
| "num_input_tokens_seen": 63808, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 2.3333333333333335, | |
| "grad_norm": 0.00897216796875, | |
| "learning_rate": 0.029976285579932503, | |
| "loss": 0.2353, | |
| "num_input_tokens_seen": 65344, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 2.388888888888889, | |
| "grad_norm": 0.0169677734375, | |
| "learning_rate": 0.029967406464562214, | |
| "loss": 0.2293, | |
| "num_input_tokens_seen": 66912, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 2.4444444444444446, | |
| "grad_norm": 0.02685546875, | |
| "learning_rate": 0.02995712015875466, | |
| "loss": 0.2325, | |
| "num_input_tokens_seen": 68480, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.0272216796875, | |
| "learning_rate": 0.029945427629597305, | |
| "loss": 0.2306, | |
| "num_input_tokens_seen": 70048, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 2.5555555555555554, | |
| "grad_norm": 0.0069580078125, | |
| "learning_rate": 0.029932329976386493, | |
| "loss": 0.2294, | |
| "num_input_tokens_seen": 71680, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 2.611111111111111, | |
| "grad_norm": 0.005126953125, | |
| "learning_rate": 0.0299178284305241, | |
| "loss": 0.2408, | |
| "num_input_tokens_seen": 73280, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 2.6666666666666665, | |
| "grad_norm": 28.5, | |
| "learning_rate": 0.02990192435540175, | |
| "loss": 1.2081, | |
| "num_input_tokens_seen": 74752, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 2.7222222222222223, | |
| "grad_norm": 0.0556640625, | |
| "learning_rate": 0.029884619246272646, | |
| "loss": 0.5992, | |
| "num_input_tokens_seen": 76288, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 2.7777777777777777, | |
| "grad_norm": 0.138671875, | |
| "learning_rate": 0.02986591473011098, | |
| "loss": 0.2334, | |
| "num_input_tokens_seen": 77888, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.8333333333333335, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 0.02984581256545898, | |
| "loss": 0.3293, | |
| "num_input_tokens_seen": 79456, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 2.888888888888889, | |
| "grad_norm": 0.08984375, | |
| "learning_rate": 0.02982431464226157, | |
| "loss": 0.3808, | |
| "num_input_tokens_seen": 81088, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.9444444444444446, | |
| "grad_norm": 0.014892578125, | |
| "learning_rate": 0.02980142298168869, | |
| "loss": 0.2639, | |
| "num_input_tokens_seen": 82592, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.02978515625, | |
| "learning_rate": 0.029777139735945243, | |
| "loss": 0.2493, | |
| "num_input_tokens_seen": 84192, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.24166898429393768, | |
| "eval_runtime": 0.8029, | |
| "eval_samples_per_second": 49.817, | |
| "eval_steps_per_second": 12.454, | |
| "num_input_tokens_seen": 84192, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 3.0555555555555554, | |
| "grad_norm": 0.0224609375, | |
| "learning_rate": 0.029751467188068818, | |
| "loss": 0.2136, | |
| "num_input_tokens_seen": 85696, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 3.111111111111111, | |
| "grad_norm": 0.0216064453125, | |
| "learning_rate": 0.02972440775171496, | |
| "loss": 0.2619, | |
| "num_input_tokens_seen": 87296, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 3.1666666666666665, | |
| "grad_norm": 0.010498046875, | |
| "learning_rate": 0.029695963970930307, | |
| "loss": 0.2313, | |
| "num_input_tokens_seen": 88832, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 3.2222222222222223, | |
| "grad_norm": 0.021728515625, | |
| "learning_rate": 0.029666138519913395, | |
| "loss": 0.236, | |
| "num_input_tokens_seen": 90368, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 3.2777777777777777, | |
| "grad_norm": 0.05224609375, | |
| "learning_rate": 0.029634934202763214, | |
| "loss": 0.2376, | |
| "num_input_tokens_seen": 91968, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 3.3333333333333335, | |
| "grad_norm": 0.1708984375, | |
| "learning_rate": 0.0296023539532156, | |
| "loss": 0.2345, | |
| "num_input_tokens_seen": 93568, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 3.388888888888889, | |
| "grad_norm": 0.007354736328125, | |
| "learning_rate": 0.029568400834367403, | |
| "loss": 0.2343, | |
| "num_input_tokens_seen": 95168, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 3.4444444444444446, | |
| "grad_norm": 0.0201416015625, | |
| "learning_rate": 0.02953307803838851, | |
| "loss": 0.2429, | |
| "num_input_tokens_seen": 96704, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 0.007080078125, | |
| "learning_rate": 0.02949638888622172, | |
| "loss": 0.2387, | |
| "num_input_tokens_seen": 98304, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 3.5555555555555554, | |
| "grad_norm": 0.01434326171875, | |
| "learning_rate": 0.029458336827270518, | |
| "loss": 0.2367, | |
| "num_input_tokens_seen": 99840, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 3.611111111111111, | |
| "grad_norm": 0.004425048828125, | |
| "learning_rate": 0.029418925439074782, | |
| "loss": 0.2289, | |
| "num_input_tokens_seen": 101376, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 3.6666666666666665, | |
| "grad_norm": 0.00457763671875, | |
| "learning_rate": 0.029378158426974426, | |
| "loss": 0.2319, | |
| "num_input_tokens_seen": 102976, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 3.7222222222222223, | |
| "grad_norm": 0.032958984375, | |
| "learning_rate": 0.029336039623761044, | |
| "loss": 0.2383, | |
| "num_input_tokens_seen": 104576, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 3.7777777777777777, | |
| "grad_norm": 0.0159912109375, | |
| "learning_rate": 0.02929257298931754, | |
| "loss": 0.2305, | |
| "num_input_tokens_seen": 106112, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 3.8333333333333335, | |
| "grad_norm": 0.005157470703125, | |
| "learning_rate": 0.02924776261024586, | |
| "loss": 0.2307, | |
| "num_input_tokens_seen": 107712, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 3.888888888888889, | |
| "grad_norm": 0.01953125, | |
| "learning_rate": 0.02920161269948277, | |
| "loss": 0.2298, | |
| "num_input_tokens_seen": 109344, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 3.9444444444444446, | |
| "grad_norm": 0.05078125, | |
| "learning_rate": 0.029154127595903752, | |
| "loss": 0.2425, | |
| "num_input_tokens_seen": 110944, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.006561279296875, | |
| "learning_rate": 0.029105311763915113, | |
| "loss": 0.2453, | |
| "num_input_tokens_seen": 112544, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 0.23456093668937683, | |
| "eval_runtime": 0.8072, | |
| "eval_samples_per_second": 49.552, | |
| "eval_steps_per_second": 12.388, | |
| "num_input_tokens_seen": 112544, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 4.055555555555555, | |
| "grad_norm": 0.0140380859375, | |
| "learning_rate": 0.029055169793034224, | |
| "loss": 0.2495, | |
| "num_input_tokens_seen": 114144, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 4.111111111111111, | |
| "grad_norm": 0.06396484375, | |
| "learning_rate": 0.029003706397458022, | |
| "loss": 0.2307, | |
| "num_input_tokens_seen": 115712, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 4.166666666666667, | |
| "grad_norm": 0.0169677734375, | |
| "learning_rate": 0.028950926415619846, | |
| "loss": 0.2327, | |
| "num_input_tokens_seen": 117312, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 4.222222222222222, | |
| "grad_norm": 0.013427734375, | |
| "learning_rate": 0.028896834809734474, | |
| "loss": 0.4716, | |
| "num_input_tokens_seen": 118848, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 4.277777777777778, | |
| "grad_norm": 0.007720947265625, | |
| "learning_rate": 0.028841436665331635, | |
| "loss": 0.2248, | |
| "num_input_tokens_seen": 120416, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 4.333333333333333, | |
| "grad_norm": 0.94140625, | |
| "learning_rate": 0.02878473719077787, | |
| "loss": 0.2557, | |
| "num_input_tokens_seen": 122016, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 4.388888888888889, | |
| "grad_norm": 0.04248046875, | |
| "learning_rate": 0.028726741716786866, | |
| "loss": 0.3798, | |
| "num_input_tokens_seen": 123584, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 4.444444444444445, | |
| "grad_norm": 0.0255126953125, | |
| "learning_rate": 0.02866745569591825, | |
| "loss": 0.2471, | |
| "num_input_tokens_seen": 125216, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "grad_norm": 27.375, | |
| "learning_rate": 0.028606884702065006, | |
| "loss": 0.3568, | |
| "num_input_tokens_seen": 126784, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 4.555555555555555, | |
| "grad_norm": 0.20703125, | |
| "learning_rate": 0.028545034429929377, | |
| "loss": 0.2359, | |
| "num_input_tokens_seen": 128352, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 4.611111111111111, | |
| "grad_norm": 0.019287109375, | |
| "learning_rate": 0.028481910694487505, | |
| "loss": 0.2658, | |
| "num_input_tokens_seen": 129920, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 4.666666666666667, | |
| "grad_norm": 0.057373046875, | |
| "learning_rate": 0.02841751943044271, | |
| "loss": 0.2446, | |
| "num_input_tokens_seen": 131488, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 4.722222222222222, | |
| "grad_norm": 0.06201171875, | |
| "learning_rate": 0.028351866691667543, | |
| "loss": 0.2816, | |
| "num_input_tokens_seen": 133056, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 4.777777777777778, | |
| "grad_norm": 0.02685546875, | |
| "learning_rate": 0.02828495865063459, | |
| "loss": 0.2347, | |
| "num_input_tokens_seen": 134624, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 4.833333333333333, | |
| "grad_norm": 0.049560546875, | |
| "learning_rate": 0.028216801597836176, | |
| "loss": 0.2373, | |
| "num_input_tokens_seen": 136192, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 4.888888888888889, | |
| "grad_norm": 0.0166015625, | |
| "learning_rate": 0.028147401941192952, | |
| "loss": 0.2371, | |
| "num_input_tokens_seen": 137824, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 4.944444444444445, | |
| "grad_norm": 0.03955078125, | |
| "learning_rate": 0.028076766205451433, | |
| "loss": 0.2381, | |
| "num_input_tokens_seen": 139392, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.03125, | |
| "learning_rate": 0.028004901031570568, | |
| "loss": 0.2262, | |
| "num_input_tokens_seen": 140960, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 0.24221666157245636, | |
| "eval_runtime": 0.803, | |
| "eval_samples_per_second": 49.812, | |
| "eval_steps_per_second": 12.453, | |
| "num_input_tokens_seen": 140960, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 5.055555555555555, | |
| "grad_norm": 0.03466796875, | |
| "learning_rate": 0.027931813176097366, | |
| "loss": 0.229, | |
| "num_input_tokens_seen": 142560, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 5.111111111111111, | |
| "grad_norm": 0.0140380859375, | |
| "learning_rate": 0.027857509510531685, | |
| "loss": 0.2298, | |
| "num_input_tokens_seen": 144096, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 5.166666666666667, | |
| "grad_norm": 0.0186767578125, | |
| "learning_rate": 0.02778199702068017, | |
| "loss": 0.248, | |
| "num_input_tokens_seen": 145632, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 5.222222222222222, | |
| "grad_norm": 0.0133056640625, | |
| "learning_rate": 0.02770528280599949, | |
| "loss": 0.2321, | |
| "num_input_tokens_seen": 147232, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 5.277777777777778, | |
| "grad_norm": 0.028076171875, | |
| "learning_rate": 0.02762737407892886, | |
| "loss": 0.2383, | |
| "num_input_tokens_seen": 148864, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 5.333333333333333, | |
| "grad_norm": 0.0030517578125, | |
| "learning_rate": 0.02754827816421195, | |
| "loss": 0.2343, | |
| "num_input_tokens_seen": 150432, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 5.388888888888889, | |
| "grad_norm": 0.0135498046875, | |
| "learning_rate": 0.02746800249820822, | |
| "loss": 0.2442, | |
| "num_input_tokens_seen": 152032, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 5.444444444444445, | |
| "grad_norm": 0.01458740234375, | |
| "learning_rate": 0.027386554628193813, | |
| "loss": 0.2354, | |
| "num_input_tokens_seen": 153600, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 5.5, | |
| "grad_norm": 0.01361083984375, | |
| "learning_rate": 0.027303942211651937, | |
| "loss": 0.2317, | |
| "num_input_tokens_seen": 155200, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 5.555555555555555, | |
| "grad_norm": 0.002471923828125, | |
| "learning_rate": 0.02722017301555297, | |
| "loss": 0.2403, | |
| "num_input_tokens_seen": 156800, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 5.611111111111111, | |
| "grad_norm": 0.021728515625, | |
| "learning_rate": 0.02713525491562421, | |
| "loss": 0.227, | |
| "num_input_tokens_seen": 158336, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 5.666666666666667, | |
| "grad_norm": 0.0029144287109375, | |
| "learning_rate": 0.027049195895609432, | |
| "loss": 0.2365, | |
| "num_input_tokens_seen": 159904, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 5.722222222222222, | |
| "grad_norm": 0.0128173828125, | |
| "learning_rate": 0.026962004046518273, | |
| "loss": 0.2318, | |
| "num_input_tokens_seen": 161504, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 5.777777777777778, | |
| "grad_norm": 0.01275634765625, | |
| "learning_rate": 0.02687368756586555, | |
| "loss": 0.2341, | |
| "num_input_tokens_seen": 163008, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 5.833333333333333, | |
| "grad_norm": 0.00151824951171875, | |
| "learning_rate": 0.02678425475690055, | |
| "loss": 0.232, | |
| "num_input_tokens_seen": 164512, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 5.888888888888889, | |
| "grad_norm": 0.0021820068359375, | |
| "learning_rate": 0.02669371402782638, | |
| "loss": 0.2309, | |
| "num_input_tokens_seen": 166080, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 5.944444444444445, | |
| "grad_norm": 0.0031585693359375, | |
| "learning_rate": 0.026602073891009458, | |
| "loss": 0.2297, | |
| "num_input_tokens_seen": 167680, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "grad_norm": 0.0013580322265625, | |
| "learning_rate": 0.0265093429621792, | |
| "loss": 0.2318, | |
| "num_input_tokens_seen": 169216, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 0.2318514883518219, | |
| "eval_runtime": 0.8055, | |
| "eval_samples_per_second": 49.658, | |
| "eval_steps_per_second": 12.415, | |
| "num_input_tokens_seen": 169216, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 6.055555555555555, | |
| "grad_norm": 0.0014495849609375, | |
| "learning_rate": 0.026415529959618007, | |
| "loss": 0.2299, | |
| "num_input_tokens_seen": 170784, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 6.111111111111111, | |
| "grad_norm": 0.00958251953125, | |
| "learning_rate": 0.02632064370334158, | |
| "loss": 0.2342, | |
| "num_input_tokens_seen": 172352, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 6.166666666666667, | |
| "grad_norm": 0.0021209716796875, | |
| "learning_rate": 0.026224693114269705, | |
| "loss": 0.2279, | |
| "num_input_tokens_seen": 173920, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 6.222222222222222, | |
| "grad_norm": 0.01953125, | |
| "learning_rate": 0.02612768721338753, | |
| "loss": 0.2325, | |
| "num_input_tokens_seen": 175488, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 6.277777777777778, | |
| "grad_norm": 0.02294921875, | |
| "learning_rate": 0.02602963512089743, | |
| "loss": 0.2337, | |
| "num_input_tokens_seen": 177024, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 6.333333333333333, | |
| "grad_norm": 0.00262451171875, | |
| "learning_rate": 0.025930546055361575, | |
| "loss": 0.2277, | |
| "num_input_tokens_seen": 178624, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 6.388888888888889, | |
| "grad_norm": 0.01239013671875, | |
| "learning_rate": 0.025830429332835202, | |
| "loss": 0.2345, | |
| "num_input_tokens_seen": 180096, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 6.444444444444445, | |
| "grad_norm": 0.0205078125, | |
| "learning_rate": 0.025729294365990772, | |
| "loss": 0.227, | |
| "num_input_tokens_seen": 181600, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 6.5, | |
| "grad_norm": 0.0037078857421875, | |
| "learning_rate": 0.025627150663232998, | |
| "loss": 0.2245, | |
| "num_input_tokens_seen": 183232, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 6.555555555555555, | |
| "grad_norm": 0.00811767578125, | |
| "learning_rate": 0.025524007827804902, | |
| "loss": 0.235, | |
| "num_input_tokens_seen": 184768, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 6.611111111111111, | |
| "grad_norm": 0.01385498046875, | |
| "learning_rate": 0.025419875556884956, | |
| "loss": 0.233, | |
| "num_input_tokens_seen": 186208, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 6.666666666666667, | |
| "grad_norm": 0.0078125, | |
| "learning_rate": 0.025314763640675374, | |
| "loss": 0.2374, | |
| "num_input_tokens_seen": 187776, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 6.722222222222222, | |
| "grad_norm": 0.0030364990234375, | |
| "learning_rate": 0.025208681961481655, | |
| "loss": 0.2207, | |
| "num_input_tokens_seen": 189344, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 6.777777777777778, | |
| "grad_norm": 0.002960205078125, | |
| "learning_rate": 0.025101640492783503, | |
| "loss": 0.237, | |
| "num_input_tokens_seen": 190912, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 6.833333333333333, | |
| "grad_norm": 0.01025390625, | |
| "learning_rate": 0.024993649298297137, | |
| "loss": 0.2397, | |
| "num_input_tokens_seen": 192512, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 6.888888888888889, | |
| "grad_norm": 0.01104736328125, | |
| "learning_rate": 0.02488471853102912, | |
| "loss": 0.2295, | |
| "num_input_tokens_seen": 194112, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 6.944444444444445, | |
| "grad_norm": 0.005401611328125, | |
| "learning_rate": 0.024774858432321828, | |
| "loss": 0.2345, | |
| "num_input_tokens_seen": 195712, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "grad_norm": 0.02001953125, | |
| "learning_rate": 0.024664079330890574, | |
| "loss": 0.2352, | |
| "num_input_tokens_seen": 197248, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_loss": 0.23624002933502197, | |
| "eval_runtime": 0.8061, | |
| "eval_samples_per_second": 49.621, | |
| "eval_steps_per_second": 12.405, | |
| "num_input_tokens_seen": 197248, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 7.055555555555555, | |
| "grad_norm": 0.0026397705078125, | |
| "learning_rate": 0.02455239164185254, | |
| "loss": 0.2522, | |
| "num_input_tokens_seen": 198784, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 7.111111111111111, | |
| "grad_norm": 0.010498046875, | |
| "learning_rate": 0.024439805865747562, | |
| "loss": 0.2401, | |
| "num_input_tokens_seen": 200352, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 7.166666666666667, | |
| "grad_norm": 0.0023193359375, | |
| "learning_rate": 0.02432633258755093, | |
| "loss": 0.2367, | |
| "num_input_tokens_seen": 201856, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 7.222222222222222, | |
| "grad_norm": 0.0036468505859375, | |
| "learning_rate": 0.024211982475678205, | |
| "loss": 0.2218, | |
| "num_input_tokens_seen": 203488, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 7.277777777777778, | |
| "grad_norm": 0.01275634765625, | |
| "learning_rate": 0.024096766280982205, | |
| "loss": 0.2316, | |
| "num_input_tokens_seen": 205056, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 7.333333333333333, | |
| "grad_norm": 0.004058837890625, | |
| "learning_rate": 0.023980694835742226, | |
| "loss": 0.209, | |
| "num_input_tokens_seen": 206656, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 7.388888888888889, | |
| "grad_norm": 0.00640869140625, | |
| "learning_rate": 0.023863779052645667, | |
| "loss": 0.2328, | |
| "num_input_tokens_seen": 208288, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 7.444444444444445, | |
| "grad_norm": 0.00592041015625, | |
| "learning_rate": 0.02374602992376202, | |
| "loss": 0.2224, | |
| "num_input_tokens_seen": 209856, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 7.5, | |
| "grad_norm": 0.01422119140625, | |
| "learning_rate": 0.023627458519509432, | |
| "loss": 0.2686, | |
| "num_input_tokens_seen": 211424, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 7.555555555555555, | |
| "grad_norm": 0.007476806640625, | |
| "learning_rate": 0.023508075987613904, | |
| "loss": 0.2426, | |
| "num_input_tokens_seen": 212928, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 7.611111111111111, | |
| "grad_norm": 0.001861572265625, | |
| "learning_rate": 0.023387893552061202, | |
| "loss": 0.2263, | |
| "num_input_tokens_seen": 214496, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 7.666666666666667, | |
| "grad_norm": 0.00811767578125, | |
| "learning_rate": 0.023266922512041644, | |
| "loss": 0.2251, | |
| "num_input_tokens_seen": 216064, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 7.722222222222222, | |
| "grad_norm": 0.002838134765625, | |
| "learning_rate": 0.023145174240887748, | |
| "loss": 0.2313, | |
| "num_input_tokens_seen": 217600, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 7.777777777777778, | |
| "grad_norm": 0.0018463134765625, | |
| "learning_rate": 0.023022660185004967, | |
| "loss": 0.2406, | |
| "num_input_tokens_seen": 219136, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 7.833333333333333, | |
| "grad_norm": 0.0020599365234375, | |
| "learning_rate": 0.02289939186279551, | |
| "loss": 0.2329, | |
| "num_input_tokens_seen": 220704, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 7.888888888888889, | |
| "grad_norm": 0.0031890869140625, | |
| "learning_rate": 0.022775380863575456, | |
| "loss": 0.2296, | |
| "num_input_tokens_seen": 222272, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 7.944444444444445, | |
| "grad_norm": 0.003631591796875, | |
| "learning_rate": 0.02265063884648513, | |
| "loss": 0.2198, | |
| "num_input_tokens_seen": 223872, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 0.00946044921875, | |
| "learning_rate": 0.022525177539392937, | |
| "loss": 0.2421, | |
| "num_input_tokens_seen": 225440, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_loss": 0.2336316853761673, | |
| "eval_runtime": 0.8069, | |
| "eval_samples_per_second": 49.574, | |
| "eval_steps_per_second": 12.393, | |
| "num_input_tokens_seen": 225440, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 8.055555555555555, | |
| "grad_norm": 0.00506591796875, | |
| "learning_rate": 0.02239900873779278, | |
| "loss": 0.2322, | |
| "num_input_tokens_seen": 226912, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 8.11111111111111, | |
| "grad_norm": 0.0031280517578125, | |
| "learning_rate": 0.022272144303695056, | |
| "loss": 0.2425, | |
| "num_input_tokens_seen": 228512, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 8.166666666666666, | |
| "grad_norm": 0.00860595703125, | |
| "learning_rate": 0.02214459616451143, | |
| "loss": 0.2358, | |
| "num_input_tokens_seen": 230080, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 8.222222222222221, | |
| "grad_norm": 0.018310546875, | |
| "learning_rate": 0.02201637631193346, | |
| "loss": 0.2319, | |
| "num_input_tokens_seen": 231616, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 8.277777777777779, | |
| "grad_norm": 0.009033203125, | |
| "learning_rate": 0.021887496800805175, | |
| "loss": 0.2295, | |
| "num_input_tokens_seen": 233184, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 8.333333333333334, | |
| "grad_norm": 0.00286865234375, | |
| "learning_rate": 0.021757969747989707, | |
| "loss": 0.2325, | |
| "num_input_tokens_seen": 234688, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 8.38888888888889, | |
| "grad_norm": 0.0017547607421875, | |
| "learning_rate": 0.02162780733123012, | |
| "loss": 0.2348, | |
| "num_input_tokens_seen": 236256, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 8.444444444444445, | |
| "grad_norm": 0.00836181640625, | |
| "learning_rate": 0.021497021788004445, | |
| "loss": 0.2315, | |
| "num_input_tokens_seen": 237856, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 8.5, | |
| "grad_norm": 0.01708984375, | |
| "learning_rate": 0.021365625414375228, | |
| "loss": 0.2304, | |
| "num_input_tokens_seen": 239392, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 8.555555555555555, | |
| "grad_norm": 0.0167236328125, | |
| "learning_rate": 0.021233630563833435, | |
| "loss": 0.2295, | |
| "num_input_tokens_seen": 240960, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 8.61111111111111, | |
| "grad_norm": 0.009765625, | |
| "learning_rate": 0.021101049646137005, | |
| "loss": 0.2349, | |
| "num_input_tokens_seen": 242496, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 8.666666666666666, | |
| "grad_norm": 0.001983642578125, | |
| "learning_rate": 0.02096789512614417, | |
| "loss": 0.2362, | |
| "num_input_tokens_seen": 244064, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 8.722222222222221, | |
| "grad_norm": 0.00860595703125, | |
| "learning_rate": 0.020834179522641504, | |
| "loss": 0.2379, | |
| "num_input_tokens_seen": 245664, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 8.777777777777779, | |
| "grad_norm": 0.008056640625, | |
| "learning_rate": 0.020699915407166987, | |
| "loss": 0.2315, | |
| "num_input_tokens_seen": 247264, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 8.833333333333334, | |
| "grad_norm": 0.00116729736328125, | |
| "learning_rate": 0.020565115402828002, | |
| "loss": 0.237, | |
| "num_input_tokens_seen": 248832, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 8.88888888888889, | |
| "grad_norm": 0.0087890625, | |
| "learning_rate": 0.02042979218311462, | |
| "loss": 0.2287, | |
| "num_input_tokens_seen": 250432, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 8.944444444444445, | |
| "grad_norm": 0.00173187255859375, | |
| "learning_rate": 0.02029395847070803, | |
| "loss": 0.2215, | |
| "num_input_tokens_seen": 252064, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "grad_norm": 0.009521484375, | |
| "learning_rate": 0.020157627036284417, | |
| "loss": 0.2379, | |
| "num_input_tokens_seen": 253632, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_loss": 0.23554234206676483, | |
| "eval_runtime": 0.8051, | |
| "eval_samples_per_second": 49.685, | |
| "eval_steps_per_second": 12.421, | |
| "num_input_tokens_seen": 253632, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 9.055555555555555, | |
| "grad_norm": 0.00127410888671875, | |
| "learning_rate": 0.02002081069731427, | |
| "loss": 0.2313, | |
| "num_input_tokens_seen": 255232, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 9.11111111111111, | |
| "grad_norm": 0.001953125, | |
| "learning_rate": 0.01988352231685735, | |
| "loss": 0.23, | |
| "num_input_tokens_seen": 256800, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 9.166666666666666, | |
| "grad_norm": 0.00193023681640625, | |
| "learning_rate": 0.019745774802353344, | |
| "loss": 0.2374, | |
| "num_input_tokens_seen": 258368, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 9.222222222222221, | |
| "grad_norm": 0.008544921875, | |
| "learning_rate": 0.019607581104408342, | |
| "loss": 0.2308, | |
| "num_input_tokens_seen": 259872, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 9.277777777777779, | |
| "grad_norm": 0.00860595703125, | |
| "learning_rate": 0.019468954215577226, | |
| "loss": 0.2389, | |
| "num_input_tokens_seen": 261376, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 9.333333333333334, | |
| "grad_norm": 0.007110595703125, | |
| "learning_rate": 0.01932990716914222, | |
| "loss": 0.229, | |
| "num_input_tokens_seen": 262944, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 9.38888888888889, | |
| "grad_norm": 0.0169677734375, | |
| "learning_rate": 0.019190453037887464, | |
| "loss": 0.2404, | |
| "num_input_tokens_seen": 264544, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 9.444444444444445, | |
| "grad_norm": 0.001708984375, | |
| "learning_rate": 0.019050604932870013, | |
| "loss": 0.2351, | |
| "num_input_tokens_seen": 266080, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 9.5, | |
| "grad_norm": 0.007568359375, | |
| "learning_rate": 0.01891037600218712, | |
| "loss": 0.2255, | |
| "num_input_tokens_seen": 267680, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 9.555555555555555, | |
| "grad_norm": 0.00799560546875, | |
| "learning_rate": 0.018769779429740154, | |
| "loss": 0.2328, | |
| "num_input_tokens_seen": 269280, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 9.61111111111111, | |
| "grad_norm": 0.001983642578125, | |
| "learning_rate": 0.018628828433995014, | |
| "loss": 0.2285, | |
| "num_input_tokens_seen": 270912, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 9.666666666666666, | |
| "grad_norm": 0.0076904296875, | |
| "learning_rate": 0.018487536266739445, | |
| "loss": 0.2305, | |
| "num_input_tokens_seen": 272512, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 9.722222222222221, | |
| "grad_norm": 0.008056640625, | |
| "learning_rate": 0.01834591621183709, | |
| "loss": 0.2337, | |
| "num_input_tokens_seen": 274048, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 9.777777777777779, | |
| "grad_norm": 0.0152587890625, | |
| "learning_rate": 0.018203981583978603, | |
| "loss": 0.2296, | |
| "num_input_tokens_seen": 275648, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 9.833333333333334, | |
| "grad_norm": 0.00185394287109375, | |
| "learning_rate": 0.018061745727429836, | |
| "loss": 0.237, | |
| "num_input_tokens_seen": 277248, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 9.88888888888889, | |
| "grad_norm": 0.007781982421875, | |
| "learning_rate": 0.017919222014777265, | |
| "loss": 0.2317, | |
| "num_input_tokens_seen": 278816, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 9.944444444444445, | |
| "grad_norm": 0.0078125, | |
| "learning_rate": 0.017776423845670717, | |
| "loss": 0.2285, | |
| "num_input_tokens_seen": 280416, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 0.014892578125, | |
| "learning_rate": 0.0176333646455636, | |
| "loss": 0.2244, | |
| "num_input_tokens_seen": 281984, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_loss": 0.2313733547925949, | |
| "eval_runtime": 0.8045, | |
| "eval_samples_per_second": 49.72, | |
| "eval_steps_per_second": 12.43, | |
| "num_input_tokens_seen": 281984, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 10.055555555555555, | |
| "grad_norm": 0.01513671875, | |
| "learning_rate": 0.017490057864450664, | |
| "loss": 0.2371, | |
| "num_input_tokens_seen": 283552, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 10.11111111111111, | |
| "grad_norm": 0.001739501953125, | |
| "learning_rate": 0.017346516975603462, | |
| "loss": 0.2296, | |
| "num_input_tokens_seen": 285120, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 10.166666666666666, | |
| "grad_norm": 0.0164794921875, | |
| "learning_rate": 0.017202755474303683, | |
| "loss": 0.234, | |
| "num_input_tokens_seen": 286720, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 10.222222222222221, | |
| "grad_norm": 0.00173187255859375, | |
| "learning_rate": 0.017058786876574313, | |
| "loss": 0.233, | |
| "num_input_tokens_seen": 288288, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 10.277777777777779, | |
| "grad_norm": 0.009033203125, | |
| "learning_rate": 0.016914624717908923, | |
| "loss": 0.2319, | |
| "num_input_tokens_seen": 289856, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 10.333333333333334, | |
| "grad_norm": 0.01513671875, | |
| "learning_rate": 0.016770282551999093, | |
| "loss": 0.2349, | |
| "num_input_tokens_seen": 291392, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 10.38888888888889, | |
| "grad_norm": 0.01513671875, | |
| "learning_rate": 0.01662577394946016, | |
| "loss": 0.2285, | |
| "num_input_tokens_seen": 292864, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 10.444444444444445, | |
| "grad_norm": 0.00830078125, | |
| "learning_rate": 0.016481112496555317, | |
| "loss": 0.2297, | |
| "num_input_tokens_seen": 294496, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 10.5, | |
| "grad_norm": 0.0078125, | |
| "learning_rate": 0.016336311793918295, | |
| "loss": 0.2277, | |
| "num_input_tokens_seen": 296096, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 10.555555555555555, | |
| "grad_norm": 0.0093994140625, | |
| "learning_rate": 0.016191385455274654, | |
| "loss": 0.2309, | |
| "num_input_tokens_seen": 297632, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 10.61111111111111, | |
| "grad_norm": 0.007080078125, | |
| "learning_rate": 0.016046347106161877, | |
| "loss": 0.232, | |
| "num_input_tokens_seen": 299168, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 10.666666666666666, | |
| "grad_norm": 0.00750732421875, | |
| "learning_rate": 0.01590121038264835, | |
| "loss": 0.229, | |
| "num_input_tokens_seen": 300736, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 10.722222222222221, | |
| "grad_norm": 0.007598876953125, | |
| "learning_rate": 0.015755988930051302, | |
| "loss": 0.2351, | |
| "num_input_tokens_seen": 302304, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 10.777777777777779, | |
| "grad_norm": 0.007415771484375, | |
| "learning_rate": 0.01561069640165394, | |
| "loss": 0.2219, | |
| "num_input_tokens_seen": 303936, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 10.833333333333334, | |
| "grad_norm": 0.007110595703125, | |
| "learning_rate": 0.015465346457421807, | |
| "loss": 0.2315, | |
| "num_input_tokens_seen": 305504, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 10.88888888888889, | |
| "grad_norm": 0.006927490234375, | |
| "learning_rate": 0.015319952762718515, | |
| "loss": 0.2366, | |
| "num_input_tokens_seen": 307040, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 10.944444444444445, | |
| "grad_norm": 0.0081787109375, | |
| "learning_rate": 0.015174528987020958, | |
| "loss": 0.2237, | |
| "num_input_tokens_seen": 308608, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "grad_norm": 0.018310546875, | |
| "learning_rate": 0.015029088802634146, | |
| "loss": 0.2442, | |
| "num_input_tokens_seen": 310176, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "eval_loss": 0.23302462697029114, | |
| "eval_runtime": 0.8104, | |
| "eval_samples_per_second": 49.357, | |
| "eval_steps_per_second": 12.339, | |
| "num_input_tokens_seen": 310176, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 11.055555555555555, | |
| "grad_norm": 0.0155029296875, | |
| "learning_rate": 0.014883645883405797, | |
| "loss": 0.2271, | |
| "num_input_tokens_seen": 311712, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 11.11111111111111, | |
| "grad_norm": 0.0023345947265625, | |
| "learning_rate": 0.014738213903440746, | |
| "loss": 0.232, | |
| "num_input_tokens_seen": 313280, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 11.166666666666666, | |
| "grad_norm": 0.007110595703125, | |
| "learning_rate": 0.014592806535815357, | |
| "loss": 0.2219, | |
| "num_input_tokens_seen": 314816, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 11.222222222222221, | |
| "grad_norm": 0.002410888671875, | |
| "learning_rate": 0.014447437451291999, | |
| "loss": 0.2366, | |
| "num_input_tokens_seen": 316384, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 11.277777777777779, | |
| "grad_norm": 0.009521484375, | |
| "learning_rate": 0.014302120317033798, | |
| "loss": 0.2472, | |
| "num_input_tokens_seen": 317984, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 11.333333333333334, | |
| "grad_norm": 0.002471923828125, | |
| "learning_rate": 0.014156868795319669, | |
| "loss": 0.2308, | |
| "num_input_tokens_seen": 319552, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 11.38888888888889, | |
| "grad_norm": 0.0164794921875, | |
| "learning_rate": 0.014011696542259821, | |
| "loss": 0.2307, | |
| "num_input_tokens_seen": 321184, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 11.444444444444445, | |
| "grad_norm": 0.00836181640625, | |
| "learning_rate": 0.013866617206511882, | |
| "loss": 0.2295, | |
| "num_input_tokens_seen": 322688, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 11.5, | |
| "grad_norm": 0.016357421875, | |
| "learning_rate": 0.013721644427997651, | |
| "loss": 0.2316, | |
| "num_input_tokens_seen": 324224, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 11.555555555555555, | |
| "grad_norm": 0.00165557861328125, | |
| "learning_rate": 0.01357679183662076, | |
| "loss": 0.2326, | |
| "num_input_tokens_seen": 325760, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 11.61111111111111, | |
| "grad_norm": 0.0084228515625, | |
| "learning_rate": 0.0134320730509852, | |
| "loss": 0.2336, | |
| "num_input_tokens_seen": 327264, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 11.666666666666666, | |
| "grad_norm": 0.00238037109375, | |
| "learning_rate": 0.01328750167711494, | |
| "loss": 0.2294, | |
| "num_input_tokens_seen": 328832, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 11.722222222222221, | |
| "grad_norm": 0.0011749267578125, | |
| "learning_rate": 0.013143091307174755, | |
| "loss": 0.2315, | |
| "num_input_tokens_seen": 330464, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 11.777777777777779, | |
| "grad_norm": 0.0018310546875, | |
| "learning_rate": 0.012998855518192309, | |
| "loss": 0.2326, | |
| "num_input_tokens_seen": 332064, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 11.833333333333334, | |
| "grad_norm": 0.00927734375, | |
| "learning_rate": 0.012854807870781686, | |
| "loss": 0.2317, | |
| "num_input_tokens_seen": 333664, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 11.88888888888889, | |
| "grad_norm": 0.002288818359375, | |
| "learning_rate": 0.012710961907868478, | |
| "loss": 0.2297, | |
| "num_input_tokens_seen": 335232, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 11.944444444444445, | |
| "grad_norm": 0.0027008056640625, | |
| "learning_rate": 0.012567331153416489, | |
| "loss": 0.2336, | |
| "num_input_tokens_seen": 336800, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "grad_norm": 0.0078125, | |
| "learning_rate": 0.012423929111156296, | |
| "loss": 0.2306, | |
| "num_input_tokens_seen": 338400, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_loss": 0.23600880801677704, | |
| "eval_runtime": 0.8054, | |
| "eval_samples_per_second": 49.667, | |
| "eval_steps_per_second": 12.417, | |
| "num_input_tokens_seen": 338400, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 12.055555555555555, | |
| "grad_norm": 0.00958251953125, | |
| "learning_rate": 0.012280769263315627, | |
| "loss": 0.2316, | |
| "num_input_tokens_seen": 339968, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 12.11111111111111, | |
| "grad_norm": 0.0030059814453125, | |
| "learning_rate": 0.012137865069351828, | |
| "loss": 0.2295, | |
| "num_input_tokens_seen": 341536, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 12.166666666666666, | |
| "grad_norm": 0.00159454345703125, | |
| "learning_rate": 0.01199522996468644, | |
| "loss": 0.238, | |
| "num_input_tokens_seen": 343104, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 12.222222222222221, | |
| "grad_norm": 0.00897216796875, | |
| "learning_rate": 0.01185287735944204, | |
| "loss": 0.2306, | |
| "num_input_tokens_seen": 344640, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 12.277777777777779, | |
| "grad_norm": 0.00897216796875, | |
| "learning_rate": 0.011710820637181448, | |
| "loss": 0.2294, | |
| "num_input_tokens_seen": 346272, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 12.333333333333334, | |
| "grad_norm": 0.00174713134765625, | |
| "learning_rate": 0.011569073153649483, | |
| "loss": 0.2285, | |
| "num_input_tokens_seen": 347808, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 12.38888888888889, | |
| "grad_norm": 0.0087890625, | |
| "learning_rate": 0.01142764823551724, | |
| "loss": 0.2316, | |
| "num_input_tokens_seen": 349344, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 12.444444444444445, | |
| "grad_norm": 0.0025177001953125, | |
| "learning_rate": 0.011286559179129213, | |
| "loss": 0.2326, | |
| "num_input_tokens_seen": 350912, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 12.5, | |
| "grad_norm": 0.0084228515625, | |
| "learning_rate": 0.01114581924925317, | |
| "loss": 0.2275, | |
| "num_input_tokens_seen": 352480, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 12.555555555555555, | |
| "grad_norm": 0.0086669921875, | |
| "learning_rate": 0.011005441677833067, | |
| "loss": 0.2275, | |
| "num_input_tokens_seen": 354048, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 12.61111111111111, | |
| "grad_norm": 0.008544921875, | |
| "learning_rate": 0.010865439662745013, | |
| "loss": 0.2255, | |
| "num_input_tokens_seen": 355648, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 12.666666666666666, | |
| "grad_norm": 0.0103759765625, | |
| "learning_rate": 0.01072582636655643, | |
| "loss": 0.2352, | |
| "num_input_tokens_seen": 357248, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 12.722222222222221, | |
| "grad_norm": 0.015869140625, | |
| "learning_rate": 0.010586614915288572, | |
| "loss": 0.2242, | |
| "num_input_tokens_seen": 358752, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 12.777777777777779, | |
| "grad_norm": 0.009521484375, | |
| "learning_rate": 0.010447818397182444, | |
| "loss": 0.2341, | |
| "num_input_tokens_seen": 360288, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 12.833333333333334, | |
| "grad_norm": 0.00848388671875, | |
| "learning_rate": 0.010309449861468272, | |
| "loss": 0.2381, | |
| "num_input_tokens_seen": 361920, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 12.88888888888889, | |
| "grad_norm": 0.00982666015625, | |
| "learning_rate": 0.010171522317138689, | |
| "loss": 0.2308, | |
| "num_input_tokens_seen": 363520, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 12.944444444444445, | |
| "grad_norm": 0.00885009765625, | |
| "learning_rate": 0.01003404873172563, | |
| "loss": 0.2317, | |
| "num_input_tokens_seen": 365088, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "grad_norm": 0.00244140625, | |
| "learning_rate": 0.009897042030081191, | |
| "loss": 0.2275, | |
| "num_input_tokens_seen": 366688, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "eval_loss": 0.23383350670337677, | |
| "eval_runtime": 0.8057, | |
| "eval_samples_per_second": 49.649, | |
| "eval_steps_per_second": 12.412, | |
| "num_input_tokens_seen": 366688, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 13.055555555555555, | |
| "grad_norm": 0.00872802734375, | |
| "learning_rate": 0.009760515093162463, | |
| "loss": 0.2327, | |
| "num_input_tokens_seen": 368192, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 13.11111111111111, | |
| "grad_norm": 0.003997802734375, | |
| "learning_rate": 0.009624480756820496, | |
| "loss": 0.2225, | |
| "num_input_tokens_seen": 369792, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 13.166666666666666, | |
| "grad_norm": 0.0022735595703125, | |
| "learning_rate": 0.009488951810593525, | |
| "loss": 0.2328, | |
| "num_input_tokens_seen": 371360, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 13.222222222222221, | |
| "grad_norm": 0.0159912109375, | |
| "learning_rate": 0.009353940996504537, | |
| "loss": 0.2255, | |
| "num_input_tokens_seen": 372928, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 13.277777777777779, | |
| "grad_norm": 0.009033203125, | |
| "learning_rate": 0.009219461007863278, | |
| "loss": 0.237, | |
| "num_input_tokens_seen": 374528, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 13.333333333333334, | |
| "grad_norm": 0.002655029296875, | |
| "learning_rate": 0.009085524488072901, | |
| "loss": 0.2319, | |
| "num_input_tokens_seen": 376000, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 13.38888888888889, | |
| "grad_norm": 0.0174560546875, | |
| "learning_rate": 0.008952144029441248, | |
| "loss": 0.2349, | |
| "num_input_tokens_seen": 377600, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 13.444444444444445, | |
| "grad_norm": 0.0035400390625, | |
| "learning_rate": 0.008819332171996975, | |
| "loss": 0.2266, | |
| "num_input_tokens_seen": 379200, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 13.5, | |
| "grad_norm": 0.00933837890625, | |
| "learning_rate": 0.008687101402310564, | |
| "loss": 0.2307, | |
| "num_input_tokens_seen": 380704, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 13.555555555555555, | |
| "grad_norm": 0.0159912109375, | |
| "learning_rate": 0.008555464152320372, | |
| "loss": 0.2276, | |
| "num_input_tokens_seen": 382272, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 13.61111111111111, | |
| "grad_norm": 0.00823974609375, | |
| "learning_rate": 0.008424432798163836, | |
| "loss": 0.2257, | |
| "num_input_tokens_seen": 383776, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 13.666666666666666, | |
| "grad_norm": 0.009521484375, | |
| "learning_rate": 0.008294019659013892, | |
| "loss": 0.2371, | |
| "num_input_tokens_seen": 385344, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 13.722222222222221, | |
| "grad_norm": 0.00897216796875, | |
| "learning_rate": 0.008164236995920735, | |
| "loss": 0.2267, | |
| "num_input_tokens_seen": 386912, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 13.777777777777779, | |
| "grad_norm": 0.00811767578125, | |
| "learning_rate": 0.008035097010659147, | |
| "loss": 0.2246, | |
| "num_input_tokens_seen": 388480, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 13.833333333333334, | |
| "grad_norm": 0.00909423828125, | |
| "learning_rate": 0.00790661184458125, | |
| "loss": 0.2339, | |
| "num_input_tokens_seen": 390048, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 13.88888888888889, | |
| "grad_norm": 0.004547119140625, | |
| "learning_rate": 0.007778793577475039, | |
| "loss": 0.2322, | |
| "num_input_tokens_seen": 391584, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 13.944444444444445, | |
| "grad_norm": 0.017822265625, | |
| "learning_rate": 0.007651654226428696, | |
| "loss": 0.231, | |
| "num_input_tokens_seen": 393184, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "grad_norm": 0.003631591796875, | |
| "learning_rate": 0.0075252057447007465, | |
| "loss": 0.2299, | |
| "num_input_tokens_seen": 394752, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "eval_loss": 0.23513850569725037, | |
| "eval_runtime": 0.8092, | |
| "eval_samples_per_second": 49.43, | |
| "eval_steps_per_second": 12.357, | |
| "num_input_tokens_seen": 394752, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 14.055555555555555, | |
| "grad_norm": 0.0093994140625, | |
| "learning_rate": 0.007399460020596265, | |
| "loss": 0.2278, | |
| "num_input_tokens_seen": 396320, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 14.11111111111111, | |
| "grad_norm": 0.003448486328125, | |
| "learning_rate": 0.007274428876349185, | |
| "loss": 0.2363, | |
| "num_input_tokens_seen": 397920, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 14.166666666666666, | |
| "grad_norm": 0.01708984375, | |
| "learning_rate": 0.007150124067010788, | |
| "loss": 0.2246, | |
| "num_input_tokens_seen": 399552, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 14.222222222222221, | |
| "grad_norm": 0.00994873046875, | |
| "learning_rate": 0.007026557279344533, | |
| "loss": 0.2319, | |
| "num_input_tokens_seen": 401120, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 14.277777777777779, | |
| "grad_norm": 0.0166015625, | |
| "learning_rate": 0.006903740130727311, | |
| "loss": 0.2298, | |
| "num_input_tokens_seen": 402720, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 14.333333333333334, | |
| "grad_norm": 0.00131988525390625, | |
| "learning_rate": 0.0067816841680572015, | |
| "loss": 0.2299, | |
| "num_input_tokens_seen": 404320, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 14.38888888888889, | |
| "grad_norm": 0.0096435546875, | |
| "learning_rate": 0.006660400866667899, | |
| "loss": 0.2266, | |
| "num_input_tokens_seen": 405824, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 14.444444444444445, | |
| "grad_norm": 0.0086669921875, | |
| "learning_rate": 0.006539901629249787, | |
| "loss": 0.2329, | |
| "num_input_tokens_seen": 407360, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 14.5, | |
| "grad_norm": 0.016357421875, | |
| "learning_rate": 0.006420197784777924, | |
| "loss": 0.2246, | |
| "num_input_tokens_seen": 408928, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 14.555555555555555, | |
| "grad_norm": 0.0027313232421875, | |
| "learning_rate": 0.006301300587446937, | |
| "loss": 0.2318, | |
| "num_input_tokens_seen": 410496, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 14.61111111111111, | |
| "grad_norm": 0.00830078125, | |
| "learning_rate": 0.006183221215612904, | |
| "loss": 0.2298, | |
| "num_input_tokens_seen": 412032, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 14.666666666666666, | |
| "grad_norm": 0.00323486328125, | |
| "learning_rate": 0.00606597077074242, | |
| "loss": 0.229, | |
| "num_input_tokens_seen": 413536, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 14.722222222222221, | |
| "grad_norm": 0.010986328125, | |
| "learning_rate": 0.005949560276368865, | |
| "loss": 0.2322, | |
| "num_input_tokens_seen": 415136, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 14.777777777777779, | |
| "grad_norm": 0.018310546875, | |
| "learning_rate": 0.005834000677056003, | |
| "loss": 0.2215, | |
| "num_input_tokens_seen": 416704, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 14.833333333333334, | |
| "grad_norm": 0.00885009765625, | |
| "learning_rate": 0.005719302837369021, | |
| "loss": 0.2482, | |
| "num_input_tokens_seen": 418272, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 14.88888888888889, | |
| "grad_norm": 0.009033203125, | |
| "learning_rate": 0.00560547754085305, | |
| "loss": 0.2246, | |
| "num_input_tokens_seen": 419808, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 14.944444444444445, | |
| "grad_norm": 0.0106201171875, | |
| "learning_rate": 0.005492535489019344, | |
| "loss": 0.2321, | |
| "num_input_tokens_seen": 421344, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "grad_norm": 0.0021820068359375, | |
| "learning_rate": 0.005380487300339167, | |
| "loss": 0.2361, | |
| "num_input_tokens_seen": 422912, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "eval_loss": 0.2367170751094818, | |
| "eval_runtime": 0.8091, | |
| "eval_samples_per_second": 49.436, | |
| "eval_steps_per_second": 12.359, | |
| "num_input_tokens_seen": 422912, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 15.055555555555555, | |
| "grad_norm": 0.00811767578125, | |
| "learning_rate": 0.005269343509245449, | |
| "loss": 0.2278, | |
| "num_input_tokens_seen": 424480, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 15.11111111111111, | |
| "grad_norm": 0.018798828125, | |
| "learning_rate": 0.005159114565142392, | |
| "loss": 0.2361, | |
| "num_input_tokens_seen": 426048, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 15.166666666666666, | |
| "grad_norm": 0.004302978515625, | |
| "learning_rate": 0.0050498108314230425, | |
| "loss": 0.2246, | |
| "num_input_tokens_seen": 427648, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 15.222222222222221, | |
| "grad_norm": 0.00830078125, | |
| "learning_rate": 0.0049414425844949445, | |
| "loss": 0.2276, | |
| "num_input_tokens_seen": 429216, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 15.277777777777779, | |
| "grad_norm": 0.0169677734375, | |
| "learning_rate": 0.004834020012814016, | |
| "loss": 0.2309, | |
| "num_input_tokens_seen": 430720, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 15.333333333333334, | |
| "grad_norm": 0.0030975341796875, | |
| "learning_rate": 0.004727553215926623, | |
| "loss": 0.233, | |
| "num_input_tokens_seen": 432320, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 15.38888888888889, | |
| "grad_norm": 0.0087890625, | |
| "learning_rate": 0.004622052203520061, | |
| "loss": 0.2246, | |
| "num_input_tokens_seen": 433888, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 15.444444444444445, | |
| "grad_norm": 0.00360107421875, | |
| "learning_rate": 0.004517526894481498, | |
| "loss": 0.2215, | |
| "num_input_tokens_seen": 435456, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 15.5, | |
| "grad_norm": 0.009033203125, | |
| "learning_rate": 0.004413987115965404, | |
| "loss": 0.2298, | |
| "num_input_tokens_seen": 437024, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 15.555555555555555, | |
| "grad_norm": 0.0189208984375, | |
| "learning_rate": 0.004311442602469636, | |
| "loss": 0.2425, | |
| "num_input_tokens_seen": 438528, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 15.61111111111111, | |
| "grad_norm": 0.010009765625, | |
| "learning_rate": 0.004209902994920235, | |
| "loss": 0.234, | |
| "num_input_tokens_seen": 440096, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 15.666666666666666, | |
| "grad_norm": 0.00958251953125, | |
| "learning_rate": 0.004109377839765016, | |
| "loss": 0.2339, | |
| "num_input_tokens_seen": 441632, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 15.722222222222221, | |
| "grad_norm": 0.00897216796875, | |
| "learning_rate": 0.004009876588076046, | |
| "loss": 0.2317, | |
| "num_input_tokens_seen": 443136, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 15.777777777777779, | |
| "grad_norm": 0.0042724609375, | |
| "learning_rate": 0.003911408594661061, | |
| "loss": 0.2339, | |
| "num_input_tokens_seen": 444736, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 15.833333333333334, | |
| "grad_norm": 0.0025787353515625, | |
| "learning_rate": 0.0038139831171839726, | |
| "loss": 0.2274, | |
| "num_input_tokens_seen": 446336, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 15.88888888888889, | |
| "grad_norm": 0.0028076171875, | |
| "learning_rate": 0.0037176093152944947, | |
| "loss": 0.2297, | |
| "num_input_tokens_seen": 447936, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 15.944444444444445, | |
| "grad_norm": 0.0172119140625, | |
| "learning_rate": 0.0036222962497669668, | |
| "loss": 0.2266, | |
| "num_input_tokens_seen": 449440, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "grad_norm": 0.00147247314453125, | |
| "learning_rate": 0.003528052881648488, | |
| "loss": 0.236, | |
| "num_input_tokens_seen": 451008, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "eval_loss": 0.2349519431591034, | |
| "eval_runtime": 0.8071, | |
| "eval_samples_per_second": 49.558, | |
| "eval_steps_per_second": 12.389, | |
| "num_input_tokens_seen": 451008, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 16.055555555555557, | |
| "grad_norm": 0.017822265625, | |
| "learning_rate": 0.0034348880714164414, | |
| "loss": 0.2306, | |
| "num_input_tokens_seen": 452576, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 16.11111111111111, | |
| "grad_norm": 0.0025634765625, | |
| "learning_rate": 0.0033428105781454364, | |
| "loss": 0.2297, | |
| "num_input_tokens_seen": 454112, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 16.166666666666668, | |
| "grad_norm": 0.008544921875, | |
| "learning_rate": 0.0032518290586838377, | |
| "loss": 0.2307, | |
| "num_input_tokens_seen": 455680, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 16.22222222222222, | |
| "grad_norm": 0.00994873046875, | |
| "learning_rate": 0.0031619520668398388, | |
| "loss": 0.2369, | |
| "num_input_tokens_seen": 457280, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 16.27777777777778, | |
| "grad_norm": 0.00124359130859375, | |
| "learning_rate": 0.003073188052577281, | |
| "loss": 0.2285, | |
| "num_input_tokens_seen": 458848, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 16.333333333333332, | |
| "grad_norm": 0.018798828125, | |
| "learning_rate": 0.00298554536122122, | |
| "loss": 0.235, | |
| "num_input_tokens_seen": 460416, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 16.38888888888889, | |
| "grad_norm": 0.01025390625, | |
| "learning_rate": 0.0028990322326732957, | |
| "loss": 0.2286, | |
| "num_input_tokens_seen": 462048, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 16.444444444444443, | |
| "grad_norm": 0.009765625, | |
| "learning_rate": 0.0028136568006370643, | |
| "loss": 0.2328, | |
| "num_input_tokens_seen": 463584, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 16.5, | |
| "grad_norm": 0.00994873046875, | |
| "learning_rate": 0.0027294270918532875, | |
| "loss": 0.2274, | |
| "num_input_tokens_seen": 465152, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 16.555555555555557, | |
| "grad_norm": 0.01019287109375, | |
| "learning_rate": 0.0026463510253452744, | |
| "loss": 0.2277, | |
| "num_input_tokens_seen": 466720, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 16.61111111111111, | |
| "grad_norm": 0.00164794921875, | |
| "learning_rate": 0.0025644364116743754, | |
| "loss": 0.2298, | |
| "num_input_tokens_seen": 468256, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 16.666666666666668, | |
| "grad_norm": 0.0089111328125, | |
| "learning_rate": 0.002483690952205637, | |
| "loss": 0.2214, | |
| "num_input_tokens_seen": 469792, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 16.72222222222222, | |
| "grad_norm": 0.01007080078125, | |
| "learning_rate": 0.0024041222383837536, | |
| "loss": 0.2412, | |
| "num_input_tokens_seen": 471360, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 16.77777777777778, | |
| "grad_norm": 0.0177001953125, | |
| "learning_rate": 0.002325737751019347, | |
| "loss": 0.2266, | |
| "num_input_tokens_seen": 472896, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 16.833333333333332, | |
| "grad_norm": 0.010009765625, | |
| "learning_rate": 0.00224854485958563, | |
| "loss": 0.2267, | |
| "num_input_tokens_seen": 474496, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 16.88888888888889, | |
| "grad_norm": 0.017333984375, | |
| "learning_rate": 0.0021725508215255634, | |
| "loss": 0.2277, | |
| "num_input_tokens_seen": 476064, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 16.944444444444443, | |
| "grad_norm": 0.010498046875, | |
| "learning_rate": 0.0020977627815695213, | |
| "loss": 0.2328, | |
| "num_input_tokens_seen": 477600, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "grad_norm": 0.0023040771484375, | |
| "learning_rate": 0.0020241877710635747, | |
| "loss": 0.2338, | |
| "num_input_tokens_seen": 479104, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "eval_loss": 0.2349575012922287, | |
| "eval_runtime": 0.8088, | |
| "eval_samples_per_second": 49.454, | |
| "eval_steps_per_second": 12.363, | |
| "num_input_tokens_seen": 479104, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 17.055555555555557, | |
| "grad_norm": 0.003997802734375, | |
| "learning_rate": 0.0019518327073084285, | |
| "loss": 0.2276, | |
| "num_input_tokens_seen": 480736, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 17.11111111111111, | |
| "grad_norm": 0.017578125, | |
| "learning_rate": 0.0018807043929090638, | |
| "loss": 0.2244, | |
| "num_input_tokens_seen": 482304, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 17.166666666666668, | |
| "grad_norm": 0.00141143798828125, | |
| "learning_rate": 0.0018108095151351837, | |
| "loss": 0.2287, | |
| "num_input_tokens_seen": 483872, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 17.22222222222222, | |
| "grad_norm": 0.003997802734375, | |
| "learning_rate": 0.001742154645292508, | |
| "loss": 0.2224, | |
| "num_input_tokens_seen": 485408, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 17.27777777777778, | |
| "grad_norm": 0.00396728515625, | |
| "learning_rate": 0.0016747462381049415, | |
| "loss": 0.2329, | |
| "num_input_tokens_seen": 486944, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 17.333333333333332, | |
| "grad_norm": 0.0032196044921875, | |
| "learning_rate": 0.0016085906311077212, | |
| "loss": 0.2298, | |
| "num_input_tokens_seen": 488544, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 17.38888888888889, | |
| "grad_norm": 0.00311279296875, | |
| "learning_rate": 0.0015436940440516017, | |
| "loss": 0.235, | |
| "num_input_tokens_seen": 490144, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 17.444444444444443, | |
| "grad_norm": 0.0020751953125, | |
| "learning_rate": 0.0014800625783180658, | |
| "loss": 0.2287, | |
| "num_input_tokens_seen": 491680, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 17.5, | |
| "grad_norm": 0.016845703125, | |
| "learning_rate": 0.0014177022163457135, | |
| "loss": 0.236, | |
| "num_input_tokens_seen": 493248, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 17.555555555555557, | |
| "grad_norm": 0.00933837890625, | |
| "learning_rate": 0.0013566188210677903, | |
| "loss": 0.2318, | |
| "num_input_tokens_seen": 494816, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 17.61111111111111, | |
| "grad_norm": 0.0037841796875, | |
| "learning_rate": 0.0012968181353609854, | |
| "loss": 0.2286, | |
| "num_input_tokens_seen": 496416, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 17.666666666666668, | |
| "grad_norm": 0.0089111328125, | |
| "learning_rate": 0.0012383057815055082, | |
| "loss": 0.2318, | |
| "num_input_tokens_seen": 497952, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 17.72222222222222, | |
| "grad_norm": 0.00390625, | |
| "learning_rate": 0.001181087260656487, | |
| "loss": 0.2328, | |
| "num_input_tokens_seen": 499520, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 17.77777777777778, | |
| "grad_norm": 0.009033203125, | |
| "learning_rate": 0.0011251679523267587, | |
| "loss": 0.2277, | |
| "num_input_tokens_seen": 501120, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 17.833333333333332, | |
| "grad_norm": 0.00958251953125, | |
| "learning_rate": 0.0010705531138811369, | |
| "loss": 0.2267, | |
| "num_input_tokens_seen": 502688, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 17.88888888888889, | |
| "grad_norm": 0.00958251953125, | |
| "learning_rate": 0.0010172478800420954, | |
| "loss": 0.2328, | |
| "num_input_tokens_seen": 504224, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 17.944444444444443, | |
| "grad_norm": 0.011474609375, | |
| "learning_rate": 0.0009652572624070293, | |
| "loss": 0.2288, | |
| "num_input_tokens_seen": 505824, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "grad_norm": 0.00885009765625, | |
| "learning_rate": 0.0009145861489770912, | |
| "loss": 0.2308, | |
| "num_input_tokens_seen": 507392, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "eval_loss": 0.23599882423877716, | |
| "eval_runtime": 0.8074, | |
| "eval_samples_per_second": 49.543, | |
| "eval_steps_per_second": 12.386, | |
| "num_input_tokens_seen": 507392, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 18.055555555555557, | |
| "grad_norm": 0.003173828125, | |
| "learning_rate": 0.0008652393036976157, | |
| "loss": 0.2287, | |
| "num_input_tokens_seen": 508928, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 18.11111111111111, | |
| "grad_norm": 0.0107421875, | |
| "learning_rate": 0.0008172213660102473, | |
| "loss": 0.2287, | |
| "num_input_tokens_seen": 510496, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 18.166666666666668, | |
| "grad_norm": 0.0101318359375, | |
| "learning_rate": 0.0007705368504167398, | |
| "loss": 0.2298, | |
| "num_input_tokens_seen": 512032, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 18.22222222222222, | |
| "grad_norm": 0.0108642578125, | |
| "learning_rate": 0.0007251901460545118, | |
| "loss": 0.2402, | |
| "num_input_tokens_seen": 513504, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 18.27777777777778, | |
| "grad_norm": 0.0027923583984375, | |
| "learning_rate": 0.0006811855162840213, | |
| "loss": 0.2307, | |
| "num_input_tokens_seen": 515040, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 18.333333333333332, | |
| "grad_norm": 0.0181884765625, | |
| "learning_rate": 0.0006385270982879065, | |
| "loss": 0.2306, | |
| "num_input_tokens_seen": 516640, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 18.38888888888889, | |
| "grad_norm": 0.00872802734375, | |
| "learning_rate": 0.0005972189026820351, | |
| "loss": 0.2297, | |
| "num_input_tokens_seen": 518208, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 18.444444444444443, | |
| "grad_norm": 0.0033416748046875, | |
| "learning_rate": 0.0005572648131384361, | |
| "loss": 0.2309, | |
| "num_input_tokens_seen": 519776, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 18.5, | |
| "grad_norm": 0.0106201171875, | |
| "learning_rate": 0.0005186685860201717, | |
| "loss": 0.2308, | |
| "num_input_tokens_seen": 521344, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 18.555555555555557, | |
| "grad_norm": 0.0189208984375, | |
| "learning_rate": 0.0004814338500281634, | |
| "loss": 0.2308, | |
| "num_input_tokens_seen": 522976, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 18.61111111111111, | |
| "grad_norm": 0.003662109375, | |
| "learning_rate": 0.0004455641058600529, | |
| "loss": 0.2309, | |
| "num_input_tokens_seen": 524544, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 18.666666666666668, | |
| "grad_norm": 0.0093994140625, | |
| "learning_rate": 0.00041106272588105564, | |
| "loss": 0.2287, | |
| "num_input_tokens_seen": 526112, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 18.72222222222222, | |
| "grad_norm": 0.0186767578125, | |
| "learning_rate": 0.0003779329538069159, | |
| "loss": 0.2265, | |
| "num_input_tokens_seen": 527712, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 18.77777777777778, | |
| "grad_norm": 0.00439453125, | |
| "learning_rate": 0.00034617790439893603, | |
| "loss": 0.2277, | |
| "num_input_tokens_seen": 529280, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 18.833333333333332, | |
| "grad_norm": 0.01080322265625, | |
| "learning_rate": 0.00031580056317113525, | |
| "loss": 0.2328, | |
| "num_input_tokens_seen": 530816, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 18.88888888888889, | |
| "grad_norm": 0.002960205078125, | |
| "learning_rate": 0.00028680378610956793, | |
| "loss": 0.2266, | |
| "num_input_tokens_seen": 532448, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 18.944444444444443, | |
| "grad_norm": 0.017578125, | |
| "learning_rate": 0.00025919029940380146, | |
| "loss": 0.2277, | |
| "num_input_tokens_seen": 534016, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "grad_norm": 0.00897216796875, | |
| "learning_rate": 0.0002329626991906164, | |
| "loss": 0.2297, | |
| "num_input_tokens_seen": 535584, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "eval_loss": 0.23606657981872559, | |
| "eval_runtime": 0.8079, | |
| "eval_samples_per_second": 49.514, | |
| "eval_steps_per_second": 12.378, | |
| "num_input_tokens_seen": 535584, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 19.055555555555557, | |
| "grad_norm": 0.0032806396484375, | |
| "learning_rate": 0.00020812345130992503, | |
| "loss": 0.234, | |
| "num_input_tokens_seen": 537152, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 19.11111111111111, | |
| "grad_norm": 0.01019287109375, | |
| "learning_rate": 0.0001846748910729351, | |
| "loss": 0.2361, | |
| "num_input_tokens_seen": 538720, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 19.166666666666668, | |
| "grad_norm": 0.00909423828125, | |
| "learning_rate": 0.0001626192230425938, | |
| "loss": 0.2287, | |
| "num_input_tokens_seen": 540256, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 19.22222222222222, | |
| "grad_norm": 0.009033203125, | |
| "learning_rate": 0.00014195852082632686, | |
| "loss": 0.2297, | |
| "num_input_tokens_seen": 541824, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 19.27777777777778, | |
| "grad_norm": 0.0035400390625, | |
| "learning_rate": 0.00012269472688107463, | |
| "loss": 0.2287, | |
| "num_input_tokens_seen": 543424, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 19.333333333333332, | |
| "grad_norm": 0.0113525390625, | |
| "learning_rate": 0.00010482965233067298, | |
| "loss": 0.2317, | |
| "num_input_tokens_seen": 545024, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 19.38888888888889, | |
| "grad_norm": 0.0089111328125, | |
| "learning_rate": 8.836497679557964e-05, | |
| "loss": 0.2308, | |
| "num_input_tokens_seen": 546560, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 19.444444444444443, | |
| "grad_norm": 0.0031280517578125, | |
| "learning_rate": 7.330224823495379e-05, | |
| "loss": 0.2277, | |
| "num_input_tokens_seen": 548096, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 19.5, | |
| "grad_norm": 0.00909423828125, | |
| "learning_rate": 5.96428828011325e-05, | |
| "loss": 0.2235, | |
| "num_input_tokens_seen": 549664, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 19.555555555555557, | |
| "grad_norm": 0.01019287109375, | |
| "learning_rate": 4.738816470647389e-05, | |
| "loss": 0.236, | |
| "num_input_tokens_seen": 551264, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 19.61111111111111, | |
| "grad_norm": 0.004150390625, | |
| "learning_rate": 3.653924610263703e-05, | |
| "loss": 0.2319, | |
| "num_input_tokens_seen": 552800, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 19.666666666666668, | |
| "grad_norm": 0.01104736328125, | |
| "learning_rate": 2.7097146972240305e-05, | |
| "loss": 0.2287, | |
| "num_input_tokens_seen": 554336, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 19.72222222222222, | |
| "grad_norm": 0.0034942626953125, | |
| "learning_rate": 1.9062755032984713e-05, | |
| "loss": 0.2297, | |
| "num_input_tokens_seen": 555872, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 19.77777777777778, | |
| "grad_norm": 0.0103759765625, | |
| "learning_rate": 1.2436825654180693e-05, | |
| "loss": 0.2296, | |
| "num_input_tokens_seen": 557504, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 19.833333333333332, | |
| "grad_norm": 0.00445556640625, | |
| "learning_rate": 7.219981785733242e-06, | |
| "loss": 0.2339, | |
| "num_input_tokens_seen": 559040, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 19.88888888888889, | |
| "grad_norm": 0.0038604736328125, | |
| "learning_rate": 3.4127138995787565e-06, | |
| "loss": 0.2328, | |
| "num_input_tokens_seen": 560640, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 19.944444444444443, | |
| "grad_norm": 0.004425048828125, | |
| "learning_rate": 1.0153799435669298e-06, | |
| "loss": 0.2349, | |
| "num_input_tokens_seen": 562176, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 0.0027618408203125, | |
| "learning_rate": 2.820530780767161e-08, | |
| "loss": 0.2245, | |
| "num_input_tokens_seen": 563744, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_loss": 0.23757648468017578, | |
| "eval_runtime": 0.8085, | |
| "eval_samples_per_second": 49.475, | |
| "eval_steps_per_second": 12.369, | |
| "num_input_tokens_seen": 563744, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "num_input_tokens_seen": 563744, | |
| "step": 1800, | |
| "total_flos": 2.538513752575181e+16, | |
| "train_loss": 0.29290440910392335, | |
| "train_runtime": 315.2013, | |
| "train_samples_per_second": 22.843, | |
| "train_steps_per_second": 5.711 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1800, | |
| "num_input_tokens_seen": 563744, | |
| "num_train_epochs": 20, | |
| "save_steps": 90, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.538513752575181e+16, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |