| { | |
| "best_global_step": 3700, | |
| "best_metric": 0.03170738369226456, | |
| "best_model_checkpoint": "trained/ltg/norbert3-small/checkpoint-3700", | |
| "epoch": 7.2265625, | |
| "eval_steps": 100, | |
| "global_step": 3700, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.01953125, | |
| "grad_norm": 7.950218677520752, | |
| "learning_rate": 8.789062500000001e-06, | |
| "loss": 22.0208, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0390625, | |
| "grad_norm": 6.9703545570373535, | |
| "learning_rate": 1.85546875e-05, | |
| "loss": 21.3936, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.05859375, | |
| "grad_norm": 5.963114261627197, | |
| "learning_rate": 2.83203125e-05, | |
| "loss": 20.3377, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.078125, | |
| "grad_norm": 5.306910514831543, | |
| "learning_rate": 3.80859375e-05, | |
| "loss": 19.0723, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.09765625, | |
| "grad_norm": 5.844285488128662, | |
| "learning_rate": 4.78515625e-05, | |
| "loss": 17.5429, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.1171875, | |
| "grad_norm": 5.66201639175415, | |
| "learning_rate": 5.7617187500000004e-05, | |
| "loss": 16.1115, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.13671875, | |
| "grad_norm": 5.144637584686279, | |
| "learning_rate": 6.73828125e-05, | |
| "loss": 14.6307, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.15625, | |
| "grad_norm": 5.500018119812012, | |
| "learning_rate": 7.71484375e-05, | |
| "loss": 13.097, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.17578125, | |
| "grad_norm": 5.008663654327393, | |
| "learning_rate": 8.69140625e-05, | |
| "loss": 11.5314, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.1953125, | |
| "grad_norm": 4.854718208312988, | |
| "learning_rate": 9.66796875e-05, | |
| "loss": 10.3316, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.1953125, | |
| "eval_loss": 4.540501594543457, | |
| "eval_runtime": 34.552, | |
| "eval_samples_per_second": 118.546, | |
| "eval_steps_per_second": 7.409, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.21484375, | |
| "grad_norm": 4.839401721954346, | |
| "learning_rate": 0.0001064453125, | |
| "loss": 8.8709, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.234375, | |
| "grad_norm": 4.494856357574463, | |
| "learning_rate": 0.00011621093750000001, | |
| "loss": 7.8886, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.25390625, | |
| "grad_norm": 4.555051326751709, | |
| "learning_rate": 0.0001259765625, | |
| "loss": 6.8872, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.2734375, | |
| "grad_norm": 4.32931661605835, | |
| "learning_rate": 0.0001357421875, | |
| "loss": 5.8908, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.29296875, | |
| "grad_norm": 3.7109575271606445, | |
| "learning_rate": 0.0001455078125, | |
| "loss": 5.2308, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.3125, | |
| "grad_norm": 4.085555076599121, | |
| "learning_rate": 0.0001552734375, | |
| "loss": 4.5431, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.33203125, | |
| "grad_norm": 3.3191423416137695, | |
| "learning_rate": 0.0001650390625, | |
| "loss": 4.0313, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.3515625, | |
| "grad_norm": 4.100945949554443, | |
| "learning_rate": 0.0001748046875, | |
| "loss": 3.7451, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.37109375, | |
| "grad_norm": 4.261106014251709, | |
| "learning_rate": 0.0001845703125, | |
| "loss": 3.4457, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.390625, | |
| "grad_norm": 3.0237832069396973, | |
| "learning_rate": 0.0001943359375, | |
| "loss": 3.1331, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.390625, | |
| "eval_loss": 1.426753282546997, | |
| "eval_runtime": 34.579, | |
| "eval_samples_per_second": 118.453, | |
| "eval_steps_per_second": 7.403, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.41015625, | |
| "grad_norm": 3.6745340824127197, | |
| "learning_rate": 0.0002041015625, | |
| "loss": 2.8997, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.4296875, | |
| "grad_norm": 4.417777061462402, | |
| "learning_rate": 0.0002138671875, | |
| "loss": 2.81, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.44921875, | |
| "grad_norm": 5.513803482055664, | |
| "learning_rate": 0.0002236328125, | |
| "loss": 2.5776, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.46875, | |
| "grad_norm": 3.619678020477295, | |
| "learning_rate": 0.00023339843750000002, | |
| "loss": 2.4229, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.48828125, | |
| "grad_norm": 3.636021137237549, | |
| "learning_rate": 0.0002431640625, | |
| "loss": 2.3517, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.5078125, | |
| "grad_norm": 5.008062839508057, | |
| "learning_rate": 0.0002529296875, | |
| "loss": 2.1649, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.52734375, | |
| "grad_norm": 4.469094276428223, | |
| "learning_rate": 0.0002626953125, | |
| "loss": 1.9232, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.546875, | |
| "grad_norm": 3.6506378650665283, | |
| "learning_rate": 0.0002724609375, | |
| "loss": 1.922, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.56640625, | |
| "grad_norm": 2.868321418762207, | |
| "learning_rate": 0.0002822265625, | |
| "loss": 1.9761, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.5859375, | |
| "grad_norm": 3.0979578495025635, | |
| "learning_rate": 0.0002919921875, | |
| "loss": 1.6869, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.5859375, | |
| "eval_loss": 0.7856405973434448, | |
| "eval_runtime": 34.54, | |
| "eval_samples_per_second": 118.587, | |
| "eval_steps_per_second": 7.412, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.60546875, | |
| "grad_norm": 4.681755542755127, | |
| "learning_rate": 0.0003017578125, | |
| "loss": 1.7185, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.625, | |
| "grad_norm": 2.84851336479187, | |
| "learning_rate": 0.0003115234375, | |
| "loss": 1.6253, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.64453125, | |
| "grad_norm": 2.8802149295806885, | |
| "learning_rate": 0.0003212890625, | |
| "loss": 1.4382, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.6640625, | |
| "grad_norm": 3.137683391571045, | |
| "learning_rate": 0.0003310546875, | |
| "loss": 1.4573, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.68359375, | |
| "grad_norm": 2.84382963180542, | |
| "learning_rate": 0.00034082031250000003, | |
| "loss": 1.2982, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.703125, | |
| "grad_norm": 2.7262027263641357, | |
| "learning_rate": 0.0003505859375, | |
| "loss": 1.3976, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.72265625, | |
| "grad_norm": 2.5905797481536865, | |
| "learning_rate": 0.0003603515625, | |
| "loss": 1.3751, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.7421875, | |
| "grad_norm": 2.3549230098724365, | |
| "learning_rate": 0.0003701171875, | |
| "loss": 1.3441, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.76171875, | |
| "grad_norm": 3.1322712898254395, | |
| "learning_rate": 0.0003798828125, | |
| "loss": 1.2995, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.78125, | |
| "grad_norm": 3.1575980186462402, | |
| "learning_rate": 0.0003896484375, | |
| "loss": 1.2435, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.78125, | |
| "eval_loss": 0.5356224179267883, | |
| "eval_runtime": 34.903, | |
| "eval_samples_per_second": 117.354, | |
| "eval_steps_per_second": 7.335, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.80078125, | |
| "grad_norm": 2.9619956016540527, | |
| "learning_rate": 0.00039941406250000003, | |
| "loss": 1.158, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.8203125, | |
| "grad_norm": 2.5613064765930176, | |
| "learning_rate": 0.0004091796875, | |
| "loss": 1.0989, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.83984375, | |
| "grad_norm": 3.0814015865325928, | |
| "learning_rate": 0.0004189453125, | |
| "loss": 1.0123, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.859375, | |
| "grad_norm": 2.1889028549194336, | |
| "learning_rate": 0.0004287109375, | |
| "loss": 1.109, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.87890625, | |
| "grad_norm": 2.4778764247894287, | |
| "learning_rate": 0.0004384765625, | |
| "loss": 1.0337, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.8984375, | |
| "grad_norm": 2.1767776012420654, | |
| "learning_rate": 0.0004482421875, | |
| "loss": 1.0537, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.91796875, | |
| "grad_norm": 2.518540382385254, | |
| "learning_rate": 0.00045800781250000003, | |
| "loss": 1.0118, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.9375, | |
| "grad_norm": 2.429670810699463, | |
| "learning_rate": 0.00046777343750000004, | |
| "loss": 0.9931, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.95703125, | |
| "grad_norm": 2.2620222568511963, | |
| "learning_rate": 0.0004775390625, | |
| "loss": 0.9221, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.9765625, | |
| "grad_norm": 2.0658576488494873, | |
| "learning_rate": 0.0004873046875, | |
| "loss": 0.9128, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.9765625, | |
| "eval_loss": 0.42379331588745117, | |
| "eval_runtime": 34.733, | |
| "eval_samples_per_second": 117.928, | |
| "eval_steps_per_second": 7.371, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.99609375, | |
| "grad_norm": 2.622636079788208, | |
| "learning_rate": 0.0004970703125, | |
| "loss": 0.9074, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.015625, | |
| "grad_norm": 2.956885576248169, | |
| "learning_rate": 0.0004999971530484696, | |
| "loss": 0.9474, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.03515625, | |
| "grad_norm": 2.8692688941955566, | |
| "learning_rate": 0.0004999832089521691, | |
| "loss": 0.9159, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.0546875, | |
| "grad_norm": 2.4971773624420166, | |
| "learning_rate": 0.0004999576454489559, | |
| "loss": 0.8008, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.07421875, | |
| "grad_norm": 1.7604514360427856, | |
| "learning_rate": 0.0004999204637270404, | |
| "loss": 0.8161, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.09375, | |
| "grad_norm": 2.117351531982422, | |
| "learning_rate": 0.0004998716655146573, | |
| "loss": 0.9061, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.11328125, | |
| "grad_norm": 1.9122625589370728, | |
| "learning_rate": 0.0004998112530799839, | |
| "loss": 0.8696, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.1328125, | |
| "grad_norm": 2.0043485164642334, | |
| "learning_rate": 0.0004997392292310354, | |
| "loss": 0.7629, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.15234375, | |
| "grad_norm": 1.8798938989639282, | |
| "learning_rate": 0.0004996555973155344, | |
| "loss": 0.7976, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.171875, | |
| "grad_norm": 2.1096367835998535, | |
| "learning_rate": 0.0004995603612207548, | |
| "loss": 0.7728, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.171875, | |
| "eval_loss": 0.34791266918182373, | |
| "eval_runtime": 34.889, | |
| "eval_samples_per_second": 117.401, | |
| "eval_steps_per_second": 7.338, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.19140625, | |
| "grad_norm": 1.8750674724578857, | |
| "learning_rate": 0.000499453525373342, | |
| "loss": 0.7348, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.2109375, | |
| "grad_norm": 1.9086616039276123, | |
| "learning_rate": 0.0004993350947391059, | |
| "loss": 0.7259, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.23046875, | |
| "grad_norm": 2.0031375885009766, | |
| "learning_rate": 0.0004992050748227915, | |
| "loss": 0.6752, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 2.394740104675293, | |
| "learning_rate": 0.0004990634716678217, | |
| "loss": 0.7237, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.26953125, | |
| "grad_norm": 1.9420517683029175, | |
| "learning_rate": 0.0004989102918560172, | |
| "loss": 0.6634, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.2890625, | |
| "grad_norm": 2.5375707149505615, | |
| "learning_rate": 0.0004987455425072907, | |
| "loss": 0.866, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.30859375, | |
| "grad_norm": 2.0575597286224365, | |
| "learning_rate": 0.0004985692312793153, | |
| "loss": 0.5999, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.328125, | |
| "grad_norm": 1.7922422885894775, | |
| "learning_rate": 0.000498381366367169, | |
| "loss": 0.5935, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.34765625, | |
| "grad_norm": 2.0383851528167725, | |
| "learning_rate": 0.0004981819565029539, | |
| "loss": 0.6418, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.3671875, | |
| "grad_norm": 1.6255093812942505, | |
| "learning_rate": 0.0004979710109553896, | |
| "loss": 0.6444, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.3671875, | |
| "eval_loss": 0.2871336340904236, | |
| "eval_runtime": 34.917, | |
| "eval_samples_per_second": 117.307, | |
| "eval_steps_per_second": 7.332, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.38671875, | |
| "grad_norm": 1.7046557664871216, | |
| "learning_rate": 0.0004977485395293836, | |
| "loss": 0.6661, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.40625, | |
| "grad_norm": 2.547611951828003, | |
| "learning_rate": 0.0004975145525655744, | |
| "loss": 0.5595, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.42578125, | |
| "grad_norm": 1.8859753608703613, | |
| "learning_rate": 0.0004972690609398512, | |
| "loss": 0.5831, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.4453125, | |
| "grad_norm": 1.7141295671463013, | |
| "learning_rate": 0.0004970120760628492, | |
| "loss": 0.5981, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.46484375, | |
| "grad_norm": 1.5137165784835815, | |
| "learning_rate": 0.0004967436098794177, | |
| "loss": 0.6364, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.484375, | |
| "grad_norm": 2.534536600112915, | |
| "learning_rate": 0.0004964636748680664, | |
| "loss": 0.54, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.50390625, | |
| "grad_norm": 1.795462727546692, | |
| "learning_rate": 0.0004961722840403843, | |
| "loss": 0.533, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.5234375, | |
| "grad_norm": 2.0635874271392822, | |
| "learning_rate": 0.0004958694509404355, | |
| "loss": 0.593, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.54296875, | |
| "grad_norm": 1.7885998487472534, | |
| "learning_rate": 0.0004955551896441295, | |
| "loss": 0.6076, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.5625, | |
| "grad_norm": 2.2840216159820557, | |
| "learning_rate": 0.0004952295147585667, | |
| "loss": 0.5232, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.5625, | |
| "eval_loss": 0.24195311963558197, | |
| "eval_runtime": 34.654, | |
| "eval_samples_per_second": 118.197, | |
| "eval_steps_per_second": 7.387, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.58203125, | |
| "grad_norm": 1.3189480304718018, | |
| "learning_rate": 0.0004948924414213601, | |
| "loss": 0.4159, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.6015625, | |
| "grad_norm": 1.859397530555725, | |
| "learning_rate": 0.000494543985299931, | |
| "loss": 0.4731, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.62109375, | |
| "grad_norm": 1.8662714958190918, | |
| "learning_rate": 0.0004941841625907811, | |
| "loss": 0.5586, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.640625, | |
| "grad_norm": 1.637846827507019, | |
| "learning_rate": 0.0004938129900187393, | |
| "loss": 0.4765, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.66015625, | |
| "grad_norm": 1.739134430885315, | |
| "learning_rate": 0.0004934304848361855, | |
| "loss": 0.492, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.6796875, | |
| "grad_norm": 1.5579233169555664, | |
| "learning_rate": 0.0004930366648222467, | |
| "loss": 0.5734, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.69921875, | |
| "grad_norm": 1.7096999883651733, | |
| "learning_rate": 0.0004926315482819728, | |
| "loss": 0.55, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.71875, | |
| "grad_norm": 1.415420651435852, | |
| "learning_rate": 0.0004922151540454839, | |
| "loss": 0.4812, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.73828125, | |
| "grad_norm": 1.2610892057418823, | |
| "learning_rate": 0.0004917875014670963, | |
| "loss": 0.4289, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.7578125, | |
| "grad_norm": 1.3743246793746948, | |
| "learning_rate": 0.0004913486104244223, | |
| "loss": 0.4662, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.7578125, | |
| "eval_loss": 0.21815192699432373, | |
| "eval_runtime": 34.764, | |
| "eval_samples_per_second": 117.823, | |
| "eval_steps_per_second": 7.364, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.77734375, | |
| "grad_norm": 1.5094037055969238, | |
| "learning_rate": 0.0004908985013174468, | |
| "loss": 0.4558, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.796875, | |
| "grad_norm": 1.7575606107711792, | |
| "learning_rate": 0.000490437195067578, | |
| "loss": 0.4556, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.81640625, | |
| "grad_norm": 1.3195754289627075, | |
| "learning_rate": 0.0004899647131166763, | |
| "loss": 0.4727, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.8359375, | |
| "grad_norm": 1.6985130310058594, | |
| "learning_rate": 0.0004894810774260572, | |
| "loss": 0.5443, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.85546875, | |
| "grad_norm": 1.709730625152588, | |
| "learning_rate": 0.0004889863104754697, | |
| "loss": 0.556, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.875, | |
| "grad_norm": 1.666263461112976, | |
| "learning_rate": 0.0004884804352620526, | |
| "loss": 0.4635, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.89453125, | |
| "grad_norm": 1.3932074308395386, | |
| "learning_rate": 0.00048796347529926517, | |
| "loss": 0.4252, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.9140625, | |
| "grad_norm": 2.161454677581787, | |
| "learning_rate": 0.0004874354546157936, | |
| "loss": 0.4579, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.93359375, | |
| "grad_norm": 1.7341850996017456, | |
| "learning_rate": 0.0004868963977544353, | |
| "loss": 0.4311, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.953125, | |
| "grad_norm": 1.5829185247421265, | |
| "learning_rate": 0.00048634632977095704, | |
| "loss": 0.3864, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.953125, | |
| "eval_loss": 0.19052913784980774, | |
| "eval_runtime": 34.761, | |
| "eval_samples_per_second": 117.833, | |
| "eval_steps_per_second": 7.365, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.97265625, | |
| "grad_norm": 1.101081132888794, | |
| "learning_rate": 0.000485785276232931, | |
| "loss": 0.3874, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.9921875, | |
| "grad_norm": 1.6454037427902222, | |
| "learning_rate": 0.0004852132632185461, | |
| "loss": 0.4326, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 2.01171875, | |
| "grad_norm": 1.2393088340759277, | |
| "learning_rate": 0.000484630317315396, | |
| "loss": 0.4382, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 2.03125, | |
| "grad_norm": 1.658333420753479, | |
| "learning_rate": 0.0004840364656192433, | |
| "loss": 0.4206, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 2.05078125, | |
| "grad_norm": 1.313991665840149, | |
| "learning_rate": 0.0004834317357327597, | |
| "loss": 0.3975, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 2.0703125, | |
| "grad_norm": 1.3297566175460815, | |
| "learning_rate": 0.00048281615576424374, | |
| "loss": 0.3958, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 2.08984375, | |
| "grad_norm": 1.1647272109985352, | |
| "learning_rate": 0.00048218975432631365, | |
| "loss": 0.377, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 2.109375, | |
| "grad_norm": 1.5755267143249512, | |
| "learning_rate": 0.00048155256053457785, | |
| "loss": 0.3417, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 2.12890625, | |
| "grad_norm": 2.3393445014953613, | |
| "learning_rate": 0.00048090460400628123, | |
| "loss": 0.4214, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 2.1484375, | |
| "grad_norm": 1.5548704862594604, | |
| "learning_rate": 0.0004802459148589289, | |
| "loss": 0.4284, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 2.1484375, | |
| "eval_loss": 0.18148231506347656, | |
| "eval_runtime": 34.931, | |
| "eval_samples_per_second": 117.26, | |
| "eval_steps_per_second": 7.329, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 2.16796875, | |
| "grad_norm": 1.8194355964660645, | |
| "learning_rate": 0.00047957652370888616, | |
| "loss": 0.4066, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 2.1875, | |
| "grad_norm": 1.922125220298767, | |
| "learning_rate": 0.0004788964616699554, | |
| "loss": 0.3621, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 2.20703125, | |
| "grad_norm": 1.7716784477233887, | |
| "learning_rate": 0.0004782057603519297, | |
| "loss": 0.3802, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 2.2265625, | |
| "grad_norm": 1.233404278755188, | |
| "learning_rate": 0.0004775044518591242, | |
| "loss": 0.3952, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 2.24609375, | |
| "grad_norm": 1.3415420055389404, | |
| "learning_rate": 0.00047679256878888315, | |
| "loss": 0.4005, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 2.265625, | |
| "grad_norm": 1.4483531713485718, | |
| "learning_rate": 0.00047607014423006527, | |
| "loss": 0.3756, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 2.28515625, | |
| "grad_norm": 1.1628267765045166, | |
| "learning_rate": 0.0004753372117615055, | |
| "loss": 0.3908, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 2.3046875, | |
| "grad_norm": 1.5845067501068115, | |
| "learning_rate": 0.00047459380545045426, | |
| "loss": 0.3727, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 2.32421875, | |
| "grad_norm": 1.2387616634368896, | |
| "learning_rate": 0.00047383995985099414, | |
| "loss": 0.3489, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 2.34375, | |
| "grad_norm": 1.4392259120941162, | |
| "learning_rate": 0.0004730757100024336, | |
| "loss": 0.3804, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.34375, | |
| "eval_loss": 0.1616286039352417, | |
| "eval_runtime": 34.8, | |
| "eval_samples_per_second": 117.701, | |
| "eval_steps_per_second": 7.356, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.36328125, | |
| "grad_norm": 1.6787796020507812, | |
| "learning_rate": 0.0004723010914276783, | |
| "loss": 0.3817, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 2.3828125, | |
| "grad_norm": 1.2978942394256592, | |
| "learning_rate": 0.0004715161401315803, | |
| "loss": 0.3436, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 2.40234375, | |
| "grad_norm": 1.933402180671692, | |
| "learning_rate": 0.000470720892599264, | |
| "loss": 0.3104, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 2.421875, | |
| "grad_norm": 1.8381954431533813, | |
| "learning_rate": 0.00046991538579443096, | |
| "loss": 0.3381, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 2.44140625, | |
| "grad_norm": 1.3875645399093628, | |
| "learning_rate": 0.0004690996571576409, | |
| "loss": 0.3334, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 2.4609375, | |
| "grad_norm": 1.373177170753479, | |
| "learning_rate": 0.0004682737446045725, | |
| "loss": 0.2988, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 2.48046875, | |
| "grad_norm": 1.131691336631775, | |
| "learning_rate": 0.00046743768652426015, | |
| "loss": 0.323, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 1.4642337560653687, | |
| "learning_rate": 0.00046659152177731003, | |
| "loss": 0.3457, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 2.51953125, | |
| "grad_norm": 1.3850147724151611, | |
| "learning_rate": 0.00046573528969409374, | |
| "loss": 0.3136, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 2.5390625, | |
| "grad_norm": 1.4759999513626099, | |
| "learning_rate": 0.0004648690300729203, | |
| "loss": 0.3233, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.5390625, | |
| "eval_loss": 0.1541527956724167, | |
| "eval_runtime": 34.809, | |
| "eval_samples_per_second": 117.671, | |
| "eval_steps_per_second": 7.354, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.55859375, | |
| "grad_norm": 1.1843773126602173, | |
| "learning_rate": 0.0004639927831781862, | |
| "loss": 0.3037, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 2.578125, | |
| "grad_norm": 1.3902676105499268, | |
| "learning_rate": 0.0004631065897385037, | |
| "loss": 0.313, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 2.59765625, | |
| "grad_norm": 1.2758233547210693, | |
| "learning_rate": 0.0004622104909448082, | |
| "loss": 0.3129, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 2.6171875, | |
| "grad_norm": 1.7758818864822388, | |
| "learning_rate": 0.0004613045284484432, | |
| "loss": 0.3269, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 2.63671875, | |
| "grad_norm": 1.3369204998016357, | |
| "learning_rate": 0.00046038874435922465, | |
| "loss": 0.3199, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.65625, | |
| "grad_norm": 1.5773146152496338, | |
| "learning_rate": 0.0004594631812434832, | |
| "loss": 0.3204, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 2.67578125, | |
| "grad_norm": 1.4648780822753906, | |
| "learning_rate": 0.0004585278821220863, | |
| "loss": 0.2864, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 2.6953125, | |
| "grad_norm": 1.6221363544464111, | |
| "learning_rate": 0.00045758289046843813, | |
| "loss": 0.34, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 2.71484375, | |
| "grad_norm": 1.1161458492279053, | |
| "learning_rate": 0.00045662825020645895, | |
| "loss": 0.2723, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 2.734375, | |
| "grad_norm": 1.5751299858093262, | |
| "learning_rate": 0.0004556640057085436, | |
| "loss": 0.3149, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.734375, | |
| "eval_loss": 0.14088299870491028, | |
| "eval_runtime": 34.861, | |
| "eval_samples_per_second": 117.495, | |
| "eval_steps_per_second": 7.343, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.75390625, | |
| "grad_norm": 1.2796870470046997, | |
| "learning_rate": 0.00045469020179349917, | |
| "loss": 0.3107, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 2.7734375, | |
| "grad_norm": 1.2906869649887085, | |
| "learning_rate": 0.00045370688372446146, | |
| "loss": 0.3022, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 2.79296875, | |
| "grad_norm": 1.2449394464492798, | |
| "learning_rate": 0.0004527140972067911, | |
| "loss": 0.271, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 2.8125, | |
| "grad_norm": 0.9955422282218933, | |
| "learning_rate": 0.00045171188838594986, | |
| "loss": 0.2902, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 2.83203125, | |
| "grad_norm": 1.6944818496704102, | |
| "learning_rate": 0.0004507003038453546, | |
| "loss": 0.288, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 2.8515625, | |
| "grad_norm": 1.4964483976364136, | |
| "learning_rate": 0.00044967939060421307, | |
| "loss": 0.3125, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 2.87109375, | |
| "grad_norm": 1.1946748495101929, | |
| "learning_rate": 0.0004486491961153379, | |
| "loss": 0.3261, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 2.890625, | |
| "grad_norm": 1.3839339017868042, | |
| "learning_rate": 0.00044760976826294097, | |
| "loss": 0.3375, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 2.91015625, | |
| "grad_norm": 1.3183151483535767, | |
| "learning_rate": 0.00044656115536040797, | |
| "loss": 0.3032, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 2.9296875, | |
| "grad_norm": 1.4515591859817505, | |
| "learning_rate": 0.00044550340614805256, | |
| "loss": 0.2747, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.9296875, | |
| "eval_loss": 0.12650033831596375, | |
| "eval_runtime": 34.752, | |
| "eval_samples_per_second": 117.864, | |
| "eval_steps_per_second": 7.366, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.94921875, | |
| "grad_norm": 1.1999162435531616, | |
| "learning_rate": 0.0004444365697908509, | |
| "loss": 0.2887, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 2.96875, | |
| "grad_norm": 1.201749563217163, | |
| "learning_rate": 0.00044336069587615635, | |
| "loss": 0.2875, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 2.98828125, | |
| "grad_norm": 1.2322083711624146, | |
| "learning_rate": 0.00044227583441139496, | |
| "loss": 0.2716, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 3.0078125, | |
| "grad_norm": 1.1172407865524292, | |
| "learning_rate": 0.00044118203582174057, | |
| "loss": 0.2435, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 3.02734375, | |
| "grad_norm": 1.0245709419250488, | |
| "learning_rate": 0.00044007935094777156, | |
| "loss": 0.2701, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 3.046875, | |
| "grad_norm": 1.523379921913147, | |
| "learning_rate": 0.00043896783104310734, | |
| "loss": 0.2365, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 3.06640625, | |
| "grad_norm": 0.9000476598739624, | |
| "learning_rate": 0.00043784752777202595, | |
| "loss": 0.2712, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 3.0859375, | |
| "grad_norm": 1.0866985321044922, | |
| "learning_rate": 0.00043671849320706335, | |
| "loss": 0.3082, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 3.10546875, | |
| "grad_norm": 1.2548413276672363, | |
| "learning_rate": 0.00043558077982659216, | |
| "loss": 0.207, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 3.125, | |
| "grad_norm": 1.4354357719421387, | |
| "learning_rate": 0.000434434440512383, | |
| "loss": 0.239, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 3.125, | |
| "eval_loss": 0.12245145440101624, | |
| "eval_runtime": 34.447, | |
| "eval_samples_per_second": 118.907, | |
| "eval_steps_per_second": 7.432, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 3.14453125, | |
| "grad_norm": 1.0551234483718872, | |
| "learning_rate": 0.0004332795285471465, | |
| "loss": 0.2292, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 3.1640625, | |
| "grad_norm": 1.2726539373397827, | |
| "learning_rate": 0.00043211609761205626, | |
| "loss": 0.2304, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 3.18359375, | |
| "grad_norm": 1.4743865728378296, | |
| "learning_rate": 0.0004309442017842543, | |
| "loss": 0.2784, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 3.203125, | |
| "grad_norm": 1.5050513744354248, | |
| "learning_rate": 0.0004297638955343368, | |
| "loss": 0.3108, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 3.22265625, | |
| "grad_norm": 1.5667752027511597, | |
| "learning_rate": 0.0004285752337238231, | |
| "loss": 0.2638, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 3.2421875, | |
| "grad_norm": 1.187193751335144, | |
| "learning_rate": 0.0004273782716026049, | |
| "loss": 0.2541, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 3.26171875, | |
| "grad_norm": 1.0397928953170776, | |
| "learning_rate": 0.0004261730648063788, | |
| "loss": 0.2217, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 3.28125, | |
| "grad_norm": 0.8323363065719604, | |
| "learning_rate": 0.00042495966935405995, | |
| "loss": 0.2599, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 3.30078125, | |
| "grad_norm": 1.1350178718566895, | |
| "learning_rate": 0.00042373814164517833, | |
| "loss": 0.2313, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 3.3203125, | |
| "grad_norm": 1.0103367567062378, | |
| "learning_rate": 0.00042250853845725745, | |
| "loss": 0.2296, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 3.3203125, | |
| "eval_loss": 0.11100158095359802, | |
| "eval_runtime": 34.453, | |
| "eval_samples_per_second": 118.887, | |
| "eval_steps_per_second": 7.43, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 3.33984375, | |
| "grad_norm": 1.268911361694336, | |
| "learning_rate": 0.0004212709169431751, | |
| "loss": 0.2437, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 3.359375, | |
| "grad_norm": 1.5205929279327393, | |
| "learning_rate": 0.0004200253346285068, | |
| "loss": 0.2649, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 3.37890625, | |
| "grad_norm": 1.3827295303344727, | |
| "learning_rate": 0.0004187718494088521, | |
| "loss": 0.2552, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 3.3984375, | |
| "grad_norm": 0.8661313652992249, | |
| "learning_rate": 0.0004175105195471435, | |
| "loss": 0.2318, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 3.41796875, | |
| "grad_norm": 1.2733081579208374, | |
| "learning_rate": 0.0004162414036709383, | |
| "loss": 0.2276, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 3.4375, | |
| "grad_norm": 1.0363296270370483, | |
| "learning_rate": 0.0004149645607696936, | |
| "loss": 0.1892, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 3.45703125, | |
| "grad_norm": 0.9911412000656128, | |
| "learning_rate": 0.0004136800501920245, | |
| "loss": 0.2341, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 3.4765625, | |
| "grad_norm": 1.0639911890029907, | |
| "learning_rate": 0.00041238793164294536, | |
| "loss": 0.2346, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 3.49609375, | |
| "grad_norm": 1.0140990018844604, | |
| "learning_rate": 0.0004110882651810948, | |
| "loss": 0.2557, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 3.515625, | |
| "grad_norm": 1.5262864828109741, | |
| "learning_rate": 0.00040978111121594396, | |
| "loss": 0.2314, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 3.515625, | |
| "eval_loss": 0.10740732401609421, | |
| "eval_runtime": 34.468, | |
| "eval_samples_per_second": 118.835, | |
| "eval_steps_per_second": 7.427, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 3.53515625, | |
| "grad_norm": 1.5407931804656982, | |
| "learning_rate": 0.00040846653050498897, | |
| "loss": 0.2093, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 3.5546875, | |
| "grad_norm": 0.7513458728790283, | |
| "learning_rate": 0.0004071445841509264, | |
| "loss": 0.2161, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 3.57421875, | |
| "grad_norm": 1.2236207723617554, | |
| "learning_rate": 0.00040581533359881374, | |
| "loss": 0.2536, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 3.59375, | |
| "grad_norm": 1.3843364715576172, | |
| "learning_rate": 0.0004044788406332128, | |
| "loss": 0.2095, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 3.61328125, | |
| "grad_norm": 1.372381329536438, | |
| "learning_rate": 0.0004031351673753184, | |
| "loss": 0.2136, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 3.6328125, | |
| "grad_norm": 1.251421570777893, | |
| "learning_rate": 0.00040178437628007055, | |
| "loss": 0.2403, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 3.65234375, | |
| "grad_norm": 1.3957655429840088, | |
| "learning_rate": 0.0004004265301332518, | |
| "loss": 0.2149, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 3.671875, | |
| "grad_norm": 1.109490990638733, | |
| "learning_rate": 0.00039906169204856877, | |
| "loss": 0.2212, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 3.69140625, | |
| "grad_norm": 1.0680170059204102, | |
| "learning_rate": 0.0003976899254647186, | |
| "loss": 0.1957, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 3.7109375, | |
| "grad_norm": 1.177930235862732, | |
| "learning_rate": 0.00039631129414244016, | |
| "loss": 0.2181, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.7109375, | |
| "eval_loss": 0.09738427400588989, | |
| "eval_runtime": 34.465, | |
| "eval_samples_per_second": 118.845, | |
| "eval_steps_per_second": 7.428, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.73046875, | |
| "grad_norm": 1.2667044401168823, | |
| "learning_rate": 0.00039492586216155056, | |
| "loss": 0.2459, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "grad_norm": 0.9396888017654419, | |
| "learning_rate": 0.0003935336939179668, | |
| "loss": 0.1969, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 3.76953125, | |
| "grad_norm": 1.1412925720214844, | |
| "learning_rate": 0.0003921348541207122, | |
| "loss": 0.1768, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 3.7890625, | |
| "grad_norm": 1.208653211593628, | |
| "learning_rate": 0.0003907294077889089, | |
| "loss": 0.197, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 3.80859375, | |
| "grad_norm": 1.0788354873657227, | |
| "learning_rate": 0.00038931742024875585, | |
| "loss": 0.1923, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 3.828125, | |
| "grad_norm": 1.0677030086517334, | |
| "learning_rate": 0.00038789895713049207, | |
| "loss": 0.2269, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 3.84765625, | |
| "grad_norm": 1.1341768503189087, | |
| "learning_rate": 0.00038647408436534646, | |
| "loss": 0.1879, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 3.8671875, | |
| "grad_norm": 0.8455312848091125, | |
| "learning_rate": 0.0003850428681824732, | |
| "loss": 0.2181, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 3.88671875, | |
| "grad_norm": 1.1865947246551514, | |
| "learning_rate": 0.00038360537510587315, | |
| "loss": 0.2105, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 3.90625, | |
| "grad_norm": 1.2523059844970703, | |
| "learning_rate": 0.0003821616719513017, | |
| "loss": 0.1949, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.90625, | |
| "eval_loss": 0.08832964301109314, | |
| "eval_runtime": 34.454, | |
| "eval_samples_per_second": 118.883, | |
| "eval_steps_per_second": 7.43, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.92578125, | |
| "grad_norm": 1.1952605247497559, | |
| "learning_rate": 0.00038071182582316364, | |
| "loss": 0.2248, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 3.9453125, | |
| "grad_norm": 1.3617719411849976, | |
| "learning_rate": 0.00037925590411139377, | |
| "loss": 0.2039, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 3.96484375, | |
| "grad_norm": 1.457497000694275, | |
| "learning_rate": 0.0003777939744883243, | |
| "loss": 0.2065, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 3.984375, | |
| "grad_norm": 1.2239354848861694, | |
| "learning_rate": 0.0003763261049055399, | |
| "loss": 0.1916, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 4.00390625, | |
| "grad_norm": 0.7479121088981628, | |
| "learning_rate": 0.00037485236359071885, | |
| "loss": 0.1767, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 4.0234375, | |
| "grad_norm": 0.7983143329620361, | |
| "learning_rate": 0.0003733728190444621, | |
| "loss": 0.1865, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 4.04296875, | |
| "grad_norm": 0.8446422219276428, | |
| "learning_rate": 0.000371887540037109, | |
| "loss": 0.1498, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 4.0625, | |
| "grad_norm": 1.9428411722183228, | |
| "learning_rate": 0.0003703965956055411, | |
| "loss": 0.236, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 4.08203125, | |
| "grad_norm": 1.1161248683929443, | |
| "learning_rate": 0.00036890005504997296, | |
| "loss": 0.2052, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 4.1015625, | |
| "grad_norm": 1.3190343379974365, | |
| "learning_rate": 0.0003673979879307314, | |
| "loss": 0.1805, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 4.1015625, | |
| "eval_loss": 0.08478689938783646, | |
| "eval_runtime": 34.46, | |
| "eval_samples_per_second": 118.862, | |
| "eval_steps_per_second": 7.429, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 4.12109375, | |
| "grad_norm": 0.820060670375824, | |
| "learning_rate": 0.00036589046406502166, | |
| "loss": 0.212, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 4.140625, | |
| "grad_norm": 1.2379734516143799, | |
| "learning_rate": 0.0003643775535236832, | |
| "loss": 0.1904, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 4.16015625, | |
| "grad_norm": 0.763465940952301, | |
| "learning_rate": 0.0003628593266279316, | |
| "loss": 0.1798, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 4.1796875, | |
| "grad_norm": 1.0594629049301147, | |
| "learning_rate": 0.00036133585394609104, | |
| "loss": 0.2282, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 4.19921875, | |
| "grad_norm": 1.0510274171829224, | |
| "learning_rate": 0.0003598072062903137, | |
| "loss": 0.1604, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 4.21875, | |
| "grad_norm": 0.7347229719161987, | |
| "learning_rate": 0.000358273454713288, | |
| "loss": 0.1874, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 4.23828125, | |
| "grad_norm": 1.2081701755523682, | |
| "learning_rate": 0.0003567346705049371, | |
| "loss": 0.1761, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 4.2578125, | |
| "grad_norm": 0.8199229836463928, | |
| "learning_rate": 0.0003551909251891041, | |
| "loss": 0.1742, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 4.27734375, | |
| "grad_norm": 0.9021660089492798, | |
| "learning_rate": 0.0003536422905202286, | |
| "loss": 0.1469, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 4.296875, | |
| "grad_norm": 1.1995556354522705, | |
| "learning_rate": 0.00035208883848001027, | |
| "loss": 0.1564, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 4.296875, | |
| "eval_loss": 0.08129081130027771, | |
| "eval_runtime": 34.451, | |
| "eval_samples_per_second": 118.893, | |
| "eval_steps_per_second": 7.431, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 4.31640625, | |
| "grad_norm": 1.4361320734024048, | |
| "learning_rate": 0.00035053064127406466, | |
| "loss": 0.1818, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 4.3359375, | |
| "grad_norm": 1.2441810369491577, | |
| "learning_rate": 0.0003489677713285655, | |
| "loss": 0.1815, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 4.35546875, | |
| "grad_norm": 1.0656682252883911, | |
| "learning_rate": 0.0003474003012868793, | |
| "loss": 0.1546, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 4.375, | |
| "grad_norm": 0.9471537470817566, | |
| "learning_rate": 0.00034582830400618834, | |
| "loss": 0.1666, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 4.39453125, | |
| "grad_norm": 0.9355669617652893, | |
| "learning_rate": 0.0003442518525541046, | |
| "loss": 0.1624, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 4.4140625, | |
| "grad_norm": 0.8041836619377136, | |
| "learning_rate": 0.0003426710202052729, | |
| "loss": 0.1739, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 4.43359375, | |
| "grad_norm": 0.9458361864089966, | |
| "learning_rate": 0.0003410858804379658, | |
| "loss": 0.1664, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 4.453125, | |
| "grad_norm": 0.9826671481132507, | |
| "learning_rate": 0.0003394965069306677, | |
| "loss": 0.1644, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 4.47265625, | |
| "grad_norm": 0.9595081210136414, | |
| "learning_rate": 0.00033790297355865037, | |
| "loss": 0.1791, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 4.4921875, | |
| "grad_norm": 1.212173342704773, | |
| "learning_rate": 0.00033630535439053933, | |
| "loss": 0.1888, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 4.4921875, | |
| "eval_loss": 0.07625420391559601, | |
| "eval_runtime": 34.457, | |
| "eval_samples_per_second": 118.873, | |
| "eval_steps_per_second": 7.43, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 4.51171875, | |
| "grad_norm": 1.1109684705734253, | |
| "learning_rate": 0.0003347037236848709, | |
| "loss": 0.175, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 4.53125, | |
| "grad_norm": 0.680024266242981, | |
| "learning_rate": 0.00033309815588664077, | |
| "loss": 0.1532, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 4.55078125, | |
| "grad_norm": 1.1742795705795288, | |
| "learning_rate": 0.0003314887256238435, | |
| "loss": 0.1887, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 4.5703125, | |
| "grad_norm": 0.9640453457832336, | |
| "learning_rate": 0.00032987550770400393, | |
| "loss": 0.1294, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 4.58984375, | |
| "grad_norm": 1.4085016250610352, | |
| "learning_rate": 0.0003282585771107001, | |
| "loss": 0.1663, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 4.609375, | |
| "grad_norm": 0.9533945322036743, | |
| "learning_rate": 0.0003266380090000779, | |
| "loss": 0.1543, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 4.62890625, | |
| "grad_norm": 1.1523878574371338, | |
| "learning_rate": 0.00032501387869735774, | |
| "loss": 0.1616, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 4.6484375, | |
| "grad_norm": 1.5500876903533936, | |
| "learning_rate": 0.0003233862616933333, | |
| "loss": 0.1601, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 4.66796875, | |
| "grad_norm": 0.9692633152008057, | |
| "learning_rate": 0.0003217552336408628, | |
| "loss": 0.1531, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 4.6875, | |
| "grad_norm": 0.9365313649177551, | |
| "learning_rate": 0.00032012087035135264, | |
| "loss": 0.1531, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 4.6875, | |
| "eval_loss": 0.07005032151937485, | |
| "eval_runtime": 34.454, | |
| "eval_samples_per_second": 118.883, | |
| "eval_steps_per_second": 7.43, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 4.70703125, | |
| "grad_norm": 0.956684410572052, | |
| "learning_rate": 0.0003184832477912334, | |
| "loss": 0.1503, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 4.7265625, | |
| "grad_norm": 1.1487888097763062, | |
| "learning_rate": 0.00031684244207842905, | |
| "loss": 0.1415, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 4.74609375, | |
| "grad_norm": 1.2333308458328247, | |
| "learning_rate": 0.0003151985294788189, | |
| "loss": 0.1753, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 4.765625, | |
| "grad_norm": 1.0322210788726807, | |
| "learning_rate": 0.0003135515864026927, | |
| "loss": 0.1398, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 4.78515625, | |
| "grad_norm": 1.2911465167999268, | |
| "learning_rate": 0.0003119016894011991, | |
| "loss": 0.1412, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 4.8046875, | |
| "grad_norm": 1.3067231178283691, | |
| "learning_rate": 0.00031024891516278713, | |
| "loss": 0.1636, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 4.82421875, | |
| "grad_norm": 0.6816452741622925, | |
| "learning_rate": 0.00030859334050964226, | |
| "loss": 0.1645, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 4.84375, | |
| "grad_norm": 1.6802979707717896, | |
| "learning_rate": 0.0003069350423941152, | |
| "loss": 0.1606, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 4.86328125, | |
| "grad_norm": 0.8826921582221985, | |
| "learning_rate": 0.00030527409789514524, | |
| "loss": 0.1364, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 4.8828125, | |
| "grad_norm": 0.714592456817627, | |
| "learning_rate": 0.0003036105842146775, | |
| "loss": 0.1444, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 4.8828125, | |
| "eval_loss": 0.06955922394990921, | |
| "eval_runtime": 34.453, | |
| "eval_samples_per_second": 118.887, | |
| "eval_steps_per_second": 7.43, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 4.90234375, | |
| "grad_norm": 0.8907870650291443, | |
| "learning_rate": 0.0003019445786740747, | |
| "loss": 0.1408, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 4.921875, | |
| "grad_norm": 0.8849253058433533, | |
| "learning_rate": 0.000300276158710523, | |
| "loss": 0.1243, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 4.94140625, | |
| "grad_norm": 1.1217460632324219, | |
| "learning_rate": 0.00029860540187343277, | |
| "loss": 0.1611, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 4.9609375, | |
| "grad_norm": 0.7618215084075928, | |
| "learning_rate": 0.00029693238582083407, | |
| "loss": 0.1344, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 4.98046875, | |
| "grad_norm": 0.904474675655365, | |
| "learning_rate": 0.0002952571883157669, | |
| "loss": 0.1409, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 1.0269676446914673, | |
| "learning_rate": 0.0002935798872226668, | |
| "loss": 0.1477, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 5.01953125, | |
| "grad_norm": 0.8338671326637268, | |
| "learning_rate": 0.0002919005605037458, | |
| "loss": 0.1469, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 5.0390625, | |
| "grad_norm": 0.5822446346282959, | |
| "learning_rate": 0.00029021928621536834, | |
| "loss": 0.1225, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 5.05859375, | |
| "grad_norm": 1.279576063156128, | |
| "learning_rate": 0.00028853614250442356, | |
| "loss": 0.1375, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 5.078125, | |
| "grad_norm": 0.9247716069221497, | |
| "learning_rate": 0.0002868512076046925, | |
| "loss": 0.126, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 5.078125, | |
| "eval_loss": 0.06358367204666138, | |
| "eval_runtime": 34.464, | |
| "eval_samples_per_second": 118.849, | |
| "eval_steps_per_second": 7.428, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 5.09765625, | |
| "grad_norm": 0.9483351111412048, | |
| "learning_rate": 0.0002851645598332123, | |
| "loss": 0.1334, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 5.1171875, | |
| "grad_norm": 0.7646822929382324, | |
| "learning_rate": 0.00028347627758663543, | |
| "loss": 0.1175, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 5.13671875, | |
| "grad_norm": 0.5134221315383911, | |
| "learning_rate": 0.00028178643933758613, | |
| "loss": 0.1182, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 5.15625, | |
| "grad_norm": 1.063459873199463, | |
| "learning_rate": 0.00028009512363101266, | |
| "loss": 0.1415, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 5.17578125, | |
| "grad_norm": 0.9747726917266846, | |
| "learning_rate": 0.0002784024090805367, | |
| "loss": 0.1591, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 5.1953125, | |
| "grad_norm": 1.0984761714935303, | |
| "learning_rate": 0.00027670837436479927, | |
| "loss": 0.1461, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 5.21484375, | |
| "grad_norm": 0.5246152877807617, | |
| "learning_rate": 0.0002750130982238036, | |
| "loss": 0.1102, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 5.234375, | |
| "grad_norm": 1.721552848815918, | |
| "learning_rate": 0.0002733166594552554, | |
| "loss": 0.1362, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 5.25390625, | |
| "grad_norm": 0.5377854108810425, | |
| "learning_rate": 0.0002716191369109, | |
| "loss": 0.1305, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 5.2734375, | |
| "grad_norm": 1.294967532157898, | |
| "learning_rate": 0.00026992060949285754, | |
| "loss": 0.1604, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 5.2734375, | |
| "eval_loss": 0.05843832343816757, | |
| "eval_runtime": 34.482, | |
| "eval_samples_per_second": 118.787, | |
| "eval_steps_per_second": 7.424, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 5.29296875, | |
| "grad_norm": 1.295292854309082, | |
| "learning_rate": 0.0002682211561499555, | |
| "loss": 0.1355, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 5.3125, | |
| "grad_norm": 0.5737702250480652, | |
| "learning_rate": 0.000266520855874059, | |
| "loss": 0.1295, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 5.33203125, | |
| "grad_norm": 1.1641753911972046, | |
| "learning_rate": 0.00026481978769639917, | |
| "loss": 0.1261, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 5.3515625, | |
| "grad_norm": 0.966760516166687, | |
| "learning_rate": 0.0002631180306838999, | |
| "loss": 0.1168, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 5.37109375, | |
| "grad_norm": 1.2680935859680176, | |
| "learning_rate": 0.0002614156639355026, | |
| "loss": 0.1511, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 5.390625, | |
| "grad_norm": 1.0339317321777344, | |
| "learning_rate": 0.00025971276657848965, | |
| "loss": 0.1239, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 5.41015625, | |
| "grad_norm": 0.655948281288147, | |
| "learning_rate": 0.0002580094177648064, | |
| "loss": 0.1134, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 5.4296875, | |
| "grad_norm": 0.9367031455039978, | |
| "learning_rate": 0.00025630569666738233, | |
| "loss": 0.12, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 5.44921875, | |
| "grad_norm": 1.1288447380065918, | |
| "learning_rate": 0.0002546016824764512, | |
| "loss": 0.1317, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 5.46875, | |
| "grad_norm": 1.4184249639511108, | |
| "learning_rate": 0.0002528974543958697, | |
| "loss": 0.1453, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 5.46875, | |
| "eval_loss": 0.054754838347435, | |
| "eval_runtime": 34.455, | |
| "eval_samples_per_second": 118.88, | |
| "eval_steps_per_second": 7.43, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 5.48828125, | |
| "grad_norm": 1.135767936706543, | |
| "learning_rate": 0.00025119309163943614, | |
| "loss": 0.0988, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 5.5078125, | |
| "grad_norm": 0.49979278445243835, | |
| "learning_rate": 0.00024948867342720904, | |
| "loss": 0.1069, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 5.52734375, | |
| "grad_norm": 1.0691176652908325, | |
| "learning_rate": 0.00024778427898182416, | |
| "loss": 0.1215, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 5.546875, | |
| "grad_norm": 1.043531894683838, | |
| "learning_rate": 0.000246079987524813, | |
| "loss": 0.1227, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 5.56640625, | |
| "grad_norm": 0.6733863949775696, | |
| "learning_rate": 0.00024437587827291963, | |
| "loss": 0.1079, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 5.5859375, | |
| "grad_norm": 0.5758414268493652, | |
| "learning_rate": 0.00024267203043441945, | |
| "loss": 0.1132, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 5.60546875, | |
| "grad_norm": 0.5270745158195496, | |
| "learning_rate": 0.00024096852320543686, | |
| "loss": 0.115, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 5.625, | |
| "grad_norm": 0.5830528140068054, | |
| "learning_rate": 0.0002392654357662648, | |
| "loss": 0.0911, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 5.64453125, | |
| "grad_norm": 1.0041881799697876, | |
| "learning_rate": 0.0002375628472776838, | |
| "loss": 0.1322, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 5.6640625, | |
| "grad_norm": 0.8648679256439209, | |
| "learning_rate": 0.00023586083687728284, | |
| "loss": 0.1263, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 5.6640625, | |
| "eval_loss": 0.04941609501838684, | |
| "eval_runtime": 34.47, | |
| "eval_samples_per_second": 118.828, | |
| "eval_steps_per_second": 7.427, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 5.68359375, | |
| "grad_norm": 0.8996643424034119, | |
| "learning_rate": 0.0002341594836757811, | |
| "loss": 0.1196, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 5.703125, | |
| "grad_norm": 0.7420564293861389, | |
| "learning_rate": 0.00023245886675335038, | |
| "loss": 0.1106, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 5.72265625, | |
| "grad_norm": 0.639359176158905, | |
| "learning_rate": 0.00023075906515594003, | |
| "loss": 0.1063, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 5.7421875, | |
| "grad_norm": 0.8854117393493652, | |
| "learning_rate": 0.00022906015789160212, | |
| "loss": 0.1121, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 5.76171875, | |
| "grad_norm": 1.648057222366333, | |
| "learning_rate": 0.0002273622239268197, | |
| "loss": 0.105, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 5.78125, | |
| "grad_norm": 0.8455070853233337, | |
| "learning_rate": 0.000225665342182836, | |
| "loss": 0.1343, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 5.80078125, | |
| "grad_norm": 0.6193717122077942, | |
| "learning_rate": 0.00022396959153198634, | |
| "loss": 0.0955, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 5.8203125, | |
| "grad_norm": 1.3201427459716797, | |
| "learning_rate": 0.00022227505079403193, | |
| "loss": 0.1114, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 5.83984375, | |
| "grad_norm": 1.1538575887680054, | |
| "learning_rate": 0.00022058179873249623, | |
| "loss": 0.1, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 5.859375, | |
| "grad_norm": 0.9225913286209106, | |
| "learning_rate": 0.00021888991405100426, | |
| "loss": 0.109, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 5.859375, | |
| "eval_loss": 0.04830887168645859, | |
| "eval_runtime": 34.463, | |
| "eval_samples_per_second": 118.852, | |
| "eval_steps_per_second": 7.428, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 5.87890625, | |
| "grad_norm": 1.0470666885375977, | |
| "learning_rate": 0.00021719947538962386, | |
| "loss": 0.1331, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 5.8984375, | |
| "grad_norm": 1.0696215629577637, | |
| "learning_rate": 0.00021551056132121125, | |
| "loss": 0.0929, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 5.91796875, | |
| "grad_norm": 0.6320193409919739, | |
| "learning_rate": 0.000213823250347758, | |
| "loss": 0.0871, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 5.9375, | |
| "grad_norm": 0.5821089148521423, | |
| "learning_rate": 0.0002121376208967428, | |
| "loss": 0.0957, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 5.95703125, | |
| "grad_norm": 0.6666136980056763, | |
| "learning_rate": 0.00021045375131748589, | |
| "loss": 0.1102, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 5.9765625, | |
| "grad_norm": 0.5259461998939514, | |
| "learning_rate": 0.00020877171987750752, | |
| "loss": 0.093, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 5.99609375, | |
| "grad_norm": 1.2559717893600464, | |
| "learning_rate": 0.0002070916047588896, | |
| "loss": 0.0905, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 6.015625, | |
| "grad_norm": 0.9847208261489868, | |
| "learning_rate": 0.00020541348405464185, | |
| "loss": 0.112, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 6.03515625, | |
| "grad_norm": 0.6805605292320251, | |
| "learning_rate": 0.00020373743576507269, | |
| "loss": 0.0977, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 6.0546875, | |
| "grad_norm": 1.0333281755447388, | |
| "learning_rate": 0.00020206353779416252, | |
| "loss": 0.1017, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 6.0546875, | |
| "eval_loss": 0.04791799187660217, | |
| "eval_runtime": 34.46, | |
| "eval_samples_per_second": 118.862, | |
| "eval_steps_per_second": 7.429, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 6.07421875, | |
| "grad_norm": 0.6116499900817871, | |
| "learning_rate": 0.00020039186794594394, | |
| "loss": 0.1053, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 6.09375, | |
| "grad_norm": 0.6878781318664551, | |
| "learning_rate": 0.00019872250392088402, | |
| "loss": 0.1066, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 6.11328125, | |
| "grad_norm": 1.0378636121749878, | |
| "learning_rate": 0.00019705552331227412, | |
| "loss": 0.0992, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 6.1328125, | |
| "grad_norm": 0.9008749127388, | |
| "learning_rate": 0.00019539100360262208, | |
| "loss": 0.1091, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 6.15234375, | |
| "grad_norm": 0.7476337552070618, | |
| "learning_rate": 0.00019372902216005183, | |
| "loss": 0.1359, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 6.171875, | |
| "grad_norm": 0.4607301354408264, | |
| "learning_rate": 0.00019206965623470626, | |
| "loss": 0.1225, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 6.19140625, | |
| "grad_norm": 0.7025051712989807, | |
| "learning_rate": 0.0001904129829551572, | |
| "loss": 0.0941, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 6.2109375, | |
| "grad_norm": 0.9943326711654663, | |
| "learning_rate": 0.00018875907932482062, | |
| "loss": 0.0909, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 6.23046875, | |
| "grad_norm": 1.1952718496322632, | |
| "learning_rate": 0.0001871080222183766, | |
| "loss": 0.1054, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 6.25, | |
| "grad_norm": 1.0064172744750977, | |
| "learning_rate": 0.00018545988837819703, | |
| "loss": 0.0698, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 6.25, | |
| "eval_loss": 0.04147997871041298, | |
| "eval_runtime": 34.463, | |
| "eval_samples_per_second": 118.852, | |
| "eval_steps_per_second": 7.428, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 6.26953125, | |
| "grad_norm": 0.7319508790969849, | |
| "learning_rate": 0.00018381475441077793, | |
| "loss": 0.0826, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 6.2890625, | |
| "grad_norm": 0.6343052387237549, | |
| "learning_rate": 0.00018217269678317936, | |
| "loss": 0.0748, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 6.30859375, | |
| "grad_norm": 1.0118874311447144, | |
| "learning_rate": 0.00018053379181947032, | |
| "loss": 0.085, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 6.328125, | |
| "grad_norm": 0.6920039653778076, | |
| "learning_rate": 0.00017889811569718207, | |
| "loss": 0.0818, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 6.34765625, | |
| "grad_norm": 0.9968111515045166, | |
| "learning_rate": 0.0001772657444437666, | |
| "loss": 0.0892, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 6.3671875, | |
| "grad_norm": 0.7734740376472473, | |
| "learning_rate": 0.00017563675393306313, | |
| "loss": 0.0868, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 6.38671875, | |
| "grad_norm": 0.8593881130218506, | |
| "learning_rate": 0.0001740112198817717, | |
| "loss": 0.0842, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 6.40625, | |
| "grad_norm": 0.5908093452453613, | |
| "learning_rate": 0.00017238921784593325, | |
| "loss": 0.0905, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 6.42578125, | |
| "grad_norm": 0.8824208378791809, | |
| "learning_rate": 0.0001707708232174181, | |
| "loss": 0.068, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 6.4453125, | |
| "grad_norm": 1.5253270864486694, | |
| "learning_rate": 0.0001691561112204215, | |
| "loss": 0.1058, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 6.4453125, | |
| "eval_loss": 0.03964650630950928, | |
| "eval_runtime": 34.469, | |
| "eval_samples_per_second": 118.831, | |
| "eval_steps_per_second": 7.427, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 6.46484375, | |
| "grad_norm": 0.8700875043869019, | |
| "learning_rate": 0.0001675451569079674, | |
| "loss": 0.084, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 6.484375, | |
| "grad_norm": 0.4273681938648224, | |
| "learning_rate": 0.00016593803515841955, | |
| "loss": 0.0958, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 6.50390625, | |
| "grad_norm": 0.6970112919807434, | |
| "learning_rate": 0.00016433482067200144, | |
| "loss": 0.0838, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 6.5234375, | |
| "grad_norm": 0.7514935731887817, | |
| "learning_rate": 0.000162735587967324, | |
| "loss": 0.0901, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 6.54296875, | |
| "grad_norm": 0.5686305165290833, | |
| "learning_rate": 0.00016114041137792185, | |
| "loss": 0.0874, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 6.5625, | |
| "grad_norm": 0.7676323056221008, | |
| "learning_rate": 0.00015954936504879863, | |
| "loss": 0.0815, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 6.58203125, | |
| "grad_norm": 0.7747044563293457, | |
| "learning_rate": 0.00015796252293298006, | |
| "loss": 0.0805, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 6.6015625, | |
| "grad_norm": 0.5802006125450134, | |
| "learning_rate": 0.0001563799587880771, | |
| "loss": 0.0759, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 6.62109375, | |
| "grad_norm": 0.6963436603546143, | |
| "learning_rate": 0.00015480174617285713, | |
| "loss": 0.0883, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 6.640625, | |
| "grad_norm": 0.6564157009124756, | |
| "learning_rate": 0.00015322795844382566, | |
| "loss": 0.082, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 6.640625, | |
| "eval_loss": 0.03754296153783798, | |
| "eval_runtime": 34.623, | |
| "eval_samples_per_second": 118.303, | |
| "eval_steps_per_second": 7.394, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 6.66015625, | |
| "grad_norm": 0.6527138352394104, | |
| "learning_rate": 0.00015165866875181566, | |
| "loss": 0.0966, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 6.6796875, | |
| "grad_norm": 0.7239379286766052, | |
| "learning_rate": 0.00015009395003858834, | |
| "loss": 0.1032, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 6.69921875, | |
| "grad_norm": 0.9023746252059937, | |
| "learning_rate": 0.00014853387503344212, | |
| "loss": 0.0896, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 6.71875, | |
| "grad_norm": 0.7207731604576111, | |
| "learning_rate": 0.00014697851624983243, | |
| "loss": 0.0858, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 6.73828125, | |
| "grad_norm": 0.8441604971885681, | |
| "learning_rate": 0.00014542794598200147, | |
| "loss": 0.0794, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 6.7578125, | |
| "grad_norm": 1.3690321445465088, | |
| "learning_rate": 0.00014388223630161734, | |
| "loss": 0.0774, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 6.77734375, | |
| "grad_norm": 0.8731696009635925, | |
| "learning_rate": 0.00014234145905442445, | |
| "loss": 0.0784, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 6.796875, | |
| "grad_norm": 0.5190788507461548, | |
| "learning_rate": 0.00014080568585690407, | |
| "loss": 0.0767, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 6.81640625, | |
| "grad_norm": 0.9499125480651855, | |
| "learning_rate": 0.0001392749880929456, | |
| "loss": 0.0922, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 6.8359375, | |
| "grad_norm": 0.5663052201271057, | |
| "learning_rate": 0.000137749436910528, | |
| "loss": 0.082, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 6.8359375, | |
| "eval_loss": 0.03372948616743088, | |
| "eval_runtime": 34.565, | |
| "eval_samples_per_second": 118.501, | |
| "eval_steps_per_second": 7.406, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 6.85546875, | |
| "grad_norm": 0.8701801896095276, | |
| "learning_rate": 0.0001362291032184141, | |
| "loss": 0.0773, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 6.875, | |
| "grad_norm": 0.4705177843570709, | |
| "learning_rate": 0.00013471405768285299, | |
| "loss": 0.0755, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 6.89453125, | |
| "grad_norm": 0.5040585398674011, | |
| "learning_rate": 0.0001332043707242969, | |
| "loss": 0.0727, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 6.9140625, | |
| "grad_norm": 0.6496409773826599, | |
| "learning_rate": 0.00013170011251412717, | |
| "loss": 0.0904, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 6.93359375, | |
| "grad_norm": 1.0118383169174194, | |
| "learning_rate": 0.00013020135297139257, | |
| "loss": 0.0832, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 6.953125, | |
| "grad_norm": 0.7414892911911011, | |
| "learning_rate": 0.00012870816175956034, | |
| "loss": 0.0859, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 6.97265625, | |
| "grad_norm": 0.41224896907806396, | |
| "learning_rate": 0.00012722060828327693, | |
| "loss": 0.0825, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 6.9921875, | |
| "grad_norm": 1.4074021577835083, | |
| "learning_rate": 0.00012573876168514282, | |
| "loss": 0.0875, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 7.01171875, | |
| "grad_norm": 0.7603104710578918, | |
| "learning_rate": 0.00012426269084249856, | |
| "loss": 0.0635, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 7.03125, | |
| "grad_norm": 0.3152506649494171, | |
| "learning_rate": 0.00012279246436422322, | |
| "loss": 0.0763, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 7.03125, | |
| "eval_loss": 0.032879240810871124, | |
| "eval_runtime": 34.561, | |
| "eval_samples_per_second": 118.515, | |
| "eval_steps_per_second": 7.407, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 7.05078125, | |
| "grad_norm": 0.8169571161270142, | |
| "learning_rate": 0.00012132815058754557, | |
| "loss": 0.0681, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 7.0703125, | |
| "grad_norm": 0.9279233813285828, | |
| "learning_rate": 0.00011986981757486717, | |
| "loss": 0.0829, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 7.08984375, | |
| "grad_norm": 0.7519627809524536, | |
| "learning_rate": 0.00011841753311059967, | |
| "loss": 0.0613, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 7.109375, | |
| "grad_norm": 0.5586460828781128, | |
| "learning_rate": 0.00011697136469801334, | |
| "loss": 0.0702, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 7.12890625, | |
| "grad_norm": 0.3905629813671112, | |
| "learning_rate": 0.00011553137955609993, | |
| "loss": 0.0711, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 7.1484375, | |
| "grad_norm": 1.6302764415740967, | |
| "learning_rate": 0.00011409764461644814, | |
| "loss": 0.0971, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 7.16796875, | |
| "grad_norm": 0.31183406710624695, | |
| "learning_rate": 0.00011267022652013256, | |
| "loss": 0.0612, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 7.1875, | |
| "grad_norm": 0.8089606761932373, | |
| "learning_rate": 0.00011124919161461592, | |
| "loss": 0.0608, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 7.20703125, | |
| "grad_norm": 0.4437573552131653, | |
| "learning_rate": 0.00010983460595066602, | |
| "loss": 0.0639, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 7.2265625, | |
| "grad_norm": 0.5053586363792419, | |
| "learning_rate": 0.00010842653527928447, | |
| "loss": 0.0811, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 7.2265625, | |
| "eval_loss": 0.03170738369226456, | |
| "eval_runtime": 35.205, | |
| "eval_samples_per_second": 116.347, | |
| "eval_steps_per_second": 7.272, | |
| "step": 3700 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 5120, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 2, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 0 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.4810141121097728e+16, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |