{
  "best_global_step": 2000,
  "best_metric": 0.3999578198909343,
  "best_model_checkpoint": "./SALAMA_NEWMEDTTTT/checkpoint-2000",
  "epoch": 1.0976948408342482,
  "eval_steps": 2000,
  "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005488474204171241,
      "grad_norm": 1.0394881963729858,
      "learning_rate": 1.8e-07,
      "loss": 0.0043,
      "step": 10
    },
    {
      "epoch": 0.010976948408342482,
      "grad_norm": 0.25432130694389343,
      "learning_rate": 3.8e-07,
      "loss": 0.0027,
      "step": 20
    },
    {
      "epoch": 0.01646542261251372,
      "grad_norm": 0.6018465161323547,
      "learning_rate": 5.800000000000001e-07,
      "loss": 0.0037,
      "step": 30
    },
    {
      "epoch": 0.021953896816684963,
      "grad_norm": 0.07274393737316132,
      "learning_rate": 7.8e-07,
      "loss": 0.0034,
      "step": 40
    },
    {
      "epoch": 0.027442371020856202,
      "grad_norm": 1.1111565828323364,
      "learning_rate": 9.800000000000001e-07,
      "loss": 0.0032,
      "step": 50
    },
    {
      "epoch": 0.03293084522502744,
      "grad_norm": 2.1740646362304688,
      "learning_rate": 1.1800000000000001e-06,
      "loss": 0.0086,
      "step": 60
    },
    {
      "epoch": 0.038419319429198684,
      "grad_norm": 1.1649271249771118,
      "learning_rate": 1.3800000000000001e-06,
      "loss": 0.0049,
      "step": 70
    },
    {
      "epoch": 0.043907793633369926,
      "grad_norm": 1.0835011005401611,
      "learning_rate": 1.5800000000000001e-06,
      "loss": 0.0036,
      "step": 80
    },
    {
      "epoch": 0.04939626783754116,
      "grad_norm": 2.567765474319458,
      "learning_rate": 1.7800000000000001e-06,
      "loss": 0.0072,
      "step": 90
    },
    {
      "epoch": 0.054884742041712405,
      "grad_norm": 0.5648300647735596,
      "learning_rate": 1.98e-06,
      "loss": 0.0033,
      "step": 100
    },
    {
      "epoch": 0.06037321624588365,
      "grad_norm": 0.5851211547851562,
      "learning_rate": 2.1800000000000003e-06,
      "loss": 0.0042,
      "step": 110
    },
    {
      "epoch": 0.06586169045005488,
      "grad_norm": 0.40879732370376587,
      "learning_rate": 2.38e-06,
      "loss": 0.004,
      "step": 120
    },
    {
      "epoch": 0.07135016465422613,
      "grad_norm": 0.36008283495903015,
      "learning_rate": 2.5800000000000003e-06,
      "loss": 0.0037,
      "step": 130
    },
    {
      "epoch": 0.07683863885839737,
      "grad_norm": 0.07423322647809982,
      "learning_rate": 2.7800000000000005e-06,
      "loss": 0.0075,
      "step": 140
    },
    {
      "epoch": 0.08232711306256861,
      "grad_norm": 1.0768777132034302,
      "learning_rate": 2.9800000000000003e-06,
      "loss": 0.0067,
      "step": 150
    },
    {
      "epoch": 0.08781558726673985,
      "grad_norm": 0.29102134704589844,
      "learning_rate": 3.1800000000000005e-06,
      "loss": 0.0035,
      "step": 160
    },
    {
      "epoch": 0.09330406147091108,
      "grad_norm": 0.5590409636497498,
      "learning_rate": 3.3800000000000007e-06,
      "loss": 0.0031,
      "step": 170
    },
    {
      "epoch": 0.09879253567508232,
      "grad_norm": 0.4114173948764801,
      "learning_rate": 3.58e-06,
      "loss": 0.0038,
      "step": 180
    },
    {
      "epoch": 0.10428100987925357,
      "grad_norm": 0.9015783667564392,
      "learning_rate": 3.7800000000000002e-06,
      "loss": 0.004,
      "step": 190
    },
    {
      "epoch": 0.10976948408342481,
      "grad_norm": 0.26067736744880676,
      "learning_rate": 3.980000000000001e-06,
      "loss": 0.0101,
      "step": 200
    },
    {
      "epoch": 0.11525795828759605,
      "grad_norm": 0.819459080696106,
      "learning_rate": 4.18e-06,
      "loss": 0.0043,
      "step": 210
    },
    {
      "epoch": 0.1207464324917673,
      "grad_norm": 0.9547446966171265,
      "learning_rate": 4.38e-06,
      "loss": 0.0078,
      "step": 220
    },
    {
      "epoch": 0.12623490669593854,
      "grad_norm": 0.6792054772377014,
      "learning_rate": 4.58e-06,
      "loss": 0.0058,
      "step": 230
    },
    {
      "epoch": 0.13172338090010977,
      "grad_norm": 0.04598504304885864,
      "learning_rate": 4.78e-06,
      "loss": 0.0058,
      "step": 240
    },
    {
      "epoch": 0.13721185510428102,
      "grad_norm": 0.977815568447113,
      "learning_rate": 4.980000000000001e-06,
      "loss": 0.0065,
      "step": 250
    },
    {
      "epoch": 0.14270032930845225,
      "grad_norm": 1.0802408456802368,
      "learning_rate": 5.18e-06,
      "loss": 0.0086,
      "step": 260
    },
    {
      "epoch": 0.14818880351262348,
      "grad_norm": 0.30211061239242554,
      "learning_rate": 5.380000000000001e-06,
      "loss": 0.0045,
      "step": 270
    },
    {
      "epoch": 0.15367727771679474,
      "grad_norm": 1.0189473628997803,
      "learning_rate": 5.580000000000001e-06,
      "loss": 0.0035,
      "step": 280
    },
    {
      "epoch": 0.15916575192096596,
      "grad_norm": 1.2080388069152832,
      "learning_rate": 5.78e-06,
      "loss": 0.0054,
      "step": 290
    },
    {
      "epoch": 0.16465422612513722,
      "grad_norm": 0.7697501182556152,
      "learning_rate": 5.98e-06,
      "loss": 0.0074,
      "step": 300
    },
    {
      "epoch": 0.17014270032930845,
      "grad_norm": 0.23319111764431,
      "learning_rate": 6.18e-06,
      "loss": 0.0075,
      "step": 310
    },
    {
      "epoch": 0.1756311745334797,
      "grad_norm": 1.1132267713546753,
      "learning_rate": 6.380000000000001e-06,
      "loss": 0.006,
      "step": 320
    },
    {
      "epoch": 0.18111964873765093,
      "grad_norm": 0.9462475776672363,
      "learning_rate": 6.5800000000000005e-06,
      "loss": 0.0082,
      "step": 330
    },
    {
      "epoch": 0.18660812294182216,
      "grad_norm": 0.6547773480415344,
      "learning_rate": 6.780000000000001e-06,
      "loss": 0.0064,
      "step": 340
    },
    {
      "epoch": 0.19209659714599342,
      "grad_norm": 1.4683443307876587,
      "learning_rate": 6.98e-06,
      "loss": 0.0059,
      "step": 350
    },
    {
      "epoch": 0.19758507135016465,
      "grad_norm": 0.6405034065246582,
      "learning_rate": 7.180000000000001e-06,
      "loss": 0.0096,
      "step": 360
    },
    {
      "epoch": 0.2030735455543359,
      "grad_norm": 1.1234091520309448,
      "learning_rate": 7.3800000000000005e-06,
      "loss": 0.0099,
      "step": 370
    },
    {
      "epoch": 0.20856201975850713,
      "grad_norm": 0.9663105607032776,
      "learning_rate": 7.58e-06,
      "loss": 0.0092,
      "step": 380
    },
    {
      "epoch": 0.21405049396267836,
      "grad_norm": 0.7793697714805603,
      "learning_rate": 7.78e-06,
      "loss": 0.0081,
      "step": 390
    },
    {
      "epoch": 0.21953896816684962,
      "grad_norm": 0.7131162285804749,
      "learning_rate": 7.980000000000002e-06,
      "loss": 0.0084,
      "step": 400
    },
    {
      "epoch": 0.22502744237102085,
      "grad_norm": 1.2374234199523926,
      "learning_rate": 8.18e-06,
      "loss": 0.0106,
      "step": 410
    },
    {
      "epoch": 0.2305159165751921,
      "grad_norm": 1.7101589441299438,
      "learning_rate": 8.380000000000001e-06,
      "loss": 0.0106,
      "step": 420
    },
    {
      "epoch": 0.23600439077936333,
      "grad_norm": 1.1548316478729248,
      "learning_rate": 8.580000000000001e-06,
      "loss": 0.0078,
      "step": 430
    },
    {
      "epoch": 0.2414928649835346,
      "grad_norm": 0.6724960803985596,
      "learning_rate": 8.78e-06,
      "loss": 0.0069,
      "step": 440
    },
    {
      "epoch": 0.24698133918770582,
      "grad_norm": 1.403664469718933,
      "learning_rate": 8.98e-06,
      "loss": 0.0094,
      "step": 450
    },
    {
      "epoch": 0.2524698133918771,
      "grad_norm": 1.1001019477844238,
      "learning_rate": 9.180000000000002e-06,
      "loss": 0.0107,
      "step": 460
    },
    {
      "epoch": 0.2579582875960483,
      "grad_norm": 1.0355250835418701,
      "learning_rate": 9.38e-06,
      "loss": 0.0081,
      "step": 470
    },
    {
      "epoch": 0.26344676180021953,
      "grad_norm": 1.619025707244873,
      "learning_rate": 9.58e-06,
      "loss": 0.0134,
      "step": 480
    },
    {
      "epoch": 0.2689352360043908,
      "grad_norm": 1.4473015069961548,
      "learning_rate": 9.780000000000001e-06,
      "loss": 0.0119,
      "step": 490
    },
    {
      "epoch": 0.27442371020856204,
      "grad_norm": 1.3764768838882446,
      "learning_rate": 9.980000000000001e-06,
      "loss": 0.0086,
      "step": 500
    },
    {
      "epoch": 0.27991218441273324,
      "grad_norm": 1.75978422164917,
      "learning_rate": 9.971374045801527e-06,
      "loss": 0.0106,
      "step": 510
    },
    {
      "epoch": 0.2854006586169045,
      "grad_norm": 2.658644914627075,
      "learning_rate": 9.939567430025446e-06,
      "loss": 0.0146,
      "step": 520
    },
    {
      "epoch": 0.29088913282107576,
      "grad_norm": 0.3355913758277893,
      "learning_rate": 9.907760814249365e-06,
      "loss": 0.0134,
      "step": 530
    },
    {
      "epoch": 0.29637760702524696,
      "grad_norm": 1.7025257349014282,
      "learning_rate": 9.875954198473283e-06,
      "loss": 0.0151,
      "step": 540
    },
    {
      "epoch": 0.3018660812294182,
      "grad_norm": 1.6538467407226562,
      "learning_rate": 9.844147582697202e-06,
      "loss": 0.0147,
      "step": 550
    },
    {
      "epoch": 0.30735455543358947,
      "grad_norm": 1.4546349048614502,
      "learning_rate": 9.81234096692112e-06,
      "loss": 0.0181,
      "step": 560
    },
    {
      "epoch": 0.31284302963776073,
      "grad_norm": 1.5585579872131348,
      "learning_rate": 9.780534351145039e-06,
      "loss": 0.0163,
      "step": 570
    },
    {
      "epoch": 0.31833150384193193,
      "grad_norm": 1.1905714273452759,
      "learning_rate": 9.748727735368957e-06,
      "loss": 0.0158,
      "step": 580
    },
    {
      "epoch": 0.3238199780461032,
      "grad_norm": 1.6334969997406006,
      "learning_rate": 9.716921119592876e-06,
      "loss": 0.0128,
      "step": 590
    },
    {
      "epoch": 0.32930845225027444,
      "grad_norm": 1.060271143913269,
      "learning_rate": 9.685114503816794e-06,
      "loss": 0.018,
      "step": 600
    },
    {
      "epoch": 0.33479692645444564,
      "grad_norm": 1.6735498905181885,
      "learning_rate": 9.653307888040713e-06,
      "loss": 0.0114,
      "step": 610
    },
    {
      "epoch": 0.3402854006586169,
      "grad_norm": 1.7198753356933594,
      "learning_rate": 9.621501272264631e-06,
      "loss": 0.0156,
      "step": 620
    },
    {
      "epoch": 0.34577387486278816,
      "grad_norm": 0.7011512517929077,
      "learning_rate": 9.58969465648855e-06,
      "loss": 0.0124,
      "step": 630
    },
    {
      "epoch": 0.3512623490669594,
      "grad_norm": 1.9055498838424683,
      "learning_rate": 9.557888040712468e-06,
      "loss": 0.0177,
      "step": 640
    },
    {
      "epoch": 0.3567508232711306,
      "grad_norm": 1.77641761302948,
      "learning_rate": 9.526081424936387e-06,
      "loss": 0.0114,
      "step": 650
    },
    {
      "epoch": 0.36223929747530187,
      "grad_norm": 2.173353910446167,
      "learning_rate": 9.494274809160307e-06,
      "loss": 0.0187,
      "step": 660
    },
    {
      "epoch": 0.3677277716794731,
      "grad_norm": 1.061390995979309,
      "learning_rate": 9.462468193384224e-06,
      "loss": 0.0132,
      "step": 670
    },
    {
      "epoch": 0.3732162458836443,
      "grad_norm": 0.8496463298797607,
      "learning_rate": 9.430661577608143e-06,
      "loss": 0.0136,
      "step": 680
    },
    {
      "epoch": 0.3787047200878156,
      "grad_norm": 1.2099004983901978,
      "learning_rate": 9.398854961832063e-06,
      "loss": 0.0109,
      "step": 690
    },
    {
      "epoch": 0.38419319429198684,
      "grad_norm": 1.3495599031448364,
      "learning_rate": 9.36704834605598e-06,
      "loss": 0.0153,
      "step": 700
    },
    {
      "epoch": 0.3896816684961581,
      "grad_norm": 0.764531135559082,
      "learning_rate": 9.3352417302799e-06,
      "loss": 0.0073,
      "step": 710
    },
    {
      "epoch": 0.3951701427003293,
      "grad_norm": 2.1928865909576416,
      "learning_rate": 9.303435114503817e-06,
      "loss": 0.0135,
      "step": 720
    },
    {
      "epoch": 0.40065861690450055,
      "grad_norm": 1.8005603551864624,
      "learning_rate": 9.271628498727735e-06,
      "loss": 0.0187,
      "step": 730
    },
    {
      "epoch": 0.4061470911086718,
      "grad_norm": 1.2742944955825806,
      "learning_rate": 9.239821882951655e-06,
      "loss": 0.0089,
      "step": 740
    },
    {
      "epoch": 0.411635565312843,
      "grad_norm": 1.6193122863769531,
      "learning_rate": 9.208015267175572e-06,
      "loss": 0.0152,
      "step": 750
    },
    {
      "epoch": 0.41712403951701427,
      "grad_norm": 1.4442307949066162,
      "learning_rate": 9.176208651399493e-06,
      "loss": 0.0162,
      "step": 760
    },
    {
      "epoch": 0.4226125137211855,
      "grad_norm": 0.9129316806793213,
      "learning_rate": 9.144402035623411e-06,
      "loss": 0.0151,
      "step": 770
    },
    {
      "epoch": 0.4281009879253567,
      "grad_norm": 1.479588270187378,
      "learning_rate": 9.112595419847328e-06,
      "loss": 0.014,
      "step": 780
    },
    {
      "epoch": 0.433589462129528,
      "grad_norm": 1.5315167903900146,
      "learning_rate": 9.080788804071248e-06,
      "loss": 0.0123,
      "step": 790
    },
    {
      "epoch": 0.43907793633369924,
      "grad_norm": 2.470548391342163,
      "learning_rate": 9.048982188295165e-06,
      "loss": 0.0112,
      "step": 800
    },
    {
      "epoch": 0.4445664105378705,
      "grad_norm": 1.5762847661972046,
      "learning_rate": 9.017175572519085e-06,
      "loss": 0.0163,
      "step": 810
    },
    {
      "epoch": 0.4500548847420417,
      "grad_norm": 1.4822980165481567,
      "learning_rate": 8.985368956743004e-06,
      "loss": 0.0145,
      "step": 820
    },
    {
      "epoch": 0.45554335894621295,
      "grad_norm": 2.682856798171997,
      "learning_rate": 8.95356234096692e-06,
      "loss": 0.0141,
      "step": 830
    },
    {
      "epoch": 0.4610318331503842,
      "grad_norm": 1.2349945306777954,
      "learning_rate": 8.92175572519084e-06,
      "loss": 0.0148,
      "step": 840
    },
    {
      "epoch": 0.4665203073545554,
      "grad_norm": 3.259676694869995,
      "learning_rate": 8.88994910941476e-06,
      "loss": 0.0212,
      "step": 850
    },
    {
      "epoch": 0.47200878155872666,
      "grad_norm": 1.4975826740264893,
      "learning_rate": 8.858142493638678e-06,
      "loss": 0.0112,
      "step": 860
    },
    {
      "epoch": 0.4774972557628979,
      "grad_norm": 2.8876535892486572,
      "learning_rate": 8.826335877862596e-06,
      "loss": 0.0168,
      "step": 870
    },
    {
      "epoch": 0.4829857299670692,
      "grad_norm": 2.306791305541992,
      "learning_rate": 8.794529262086515e-06,
      "loss": 0.0192,
      "step": 880
    },
    {
      "epoch": 0.4884742041712404,
      "grad_norm": 0.6873131394386292,
      "learning_rate": 8.762722646310434e-06,
      "loss": 0.0144,
      "step": 890
    },
    {
      "epoch": 0.49396267837541163,
      "grad_norm": 3.158386468887329,
      "learning_rate": 8.730916030534352e-06,
      "loss": 0.0131,
      "step": 900
    },
    {
      "epoch": 0.4994511525795829,
      "grad_norm": 0.8878953456878662,
      "learning_rate": 8.69910941475827e-06,
      "loss": 0.0173,
      "step": 910
    },
    {
      "epoch": 0.5049396267837541,
      "grad_norm": 1.9014732837677002,
      "learning_rate": 8.667302798982189e-06,
      "loss": 0.0112,
      "step": 920
    },
    {
      "epoch": 0.5104281009879253,
      "grad_norm": 1.7305513620376587,
      "learning_rate": 8.635496183206108e-06,
      "loss": 0.0137,
      "step": 930
    },
    {
      "epoch": 0.5159165751920965,
      "grad_norm": 1.7590184211730957,
      "learning_rate": 8.603689567430026e-06,
      "loss": 0.0126,
      "step": 940
    },
    {
      "epoch": 0.5214050493962679,
      "grad_norm": 1.3747210502624512,
      "learning_rate": 8.571882951653945e-06,
      "loss": 0.0156,
      "step": 950
    },
    {
      "epoch": 0.5268935236004391,
      "grad_norm": 1.0799747705459595,
      "learning_rate": 8.540076335877863e-06,
      "loss": 0.0101,
      "step": 960
    },
    {
      "epoch": 0.5323819978046103,
      "grad_norm": 0.8307255506515503,
      "learning_rate": 8.508269720101782e-06,
      "loss": 0.0145,
      "step": 970
    },
    {
      "epoch": 0.5378704720087816,
      "grad_norm": 1.852042317390442,
      "learning_rate": 8.4764631043257e-06,
      "loss": 0.0115,
      "step": 980
    },
    {
      "epoch": 0.5433589462129528,
      "grad_norm": 2.150557279586792,
      "learning_rate": 8.444656488549619e-06,
      "loss": 0.0107,
      "step": 990
    },
    {
      "epoch": 0.5488474204171241,
      "grad_norm": 0.7547608613967896,
      "learning_rate": 8.412849872773537e-06,
      "loss": 0.0119,
      "step": 1000
    },
    {
      "epoch": 0.5543358946212953,
      "grad_norm": 1.4302098751068115,
      "learning_rate": 8.381043256997456e-06,
      "loss": 0.0134,
      "step": 1010
    },
    {
      "epoch": 0.5598243688254665,
      "grad_norm": 2.210999011993408,
      "learning_rate": 8.349236641221374e-06,
      "loss": 0.0109,
      "step": 1020
    },
    {
      "epoch": 0.5653128430296378,
      "grad_norm": 3.0575549602508545,
      "learning_rate": 8.317430025445293e-06,
      "loss": 0.0288,
      "step": 1030
    },
    {
      "epoch": 0.570801317233809,
      "grad_norm": 1.2066882848739624,
      "learning_rate": 8.285623409669212e-06,
      "loss": 0.0109,
      "step": 1040
    },
    {
      "epoch": 0.5762897914379802,
      "grad_norm": 0.9596546292304993,
      "learning_rate": 8.25381679389313e-06,
      "loss": 0.0154,
      "step": 1050
    },
    {
      "epoch": 0.5817782656421515,
      "grad_norm": 1.2375856637954712,
      "learning_rate": 8.222010178117049e-06,
      "loss": 0.0117,
      "step": 1060
    },
    {
      "epoch": 0.5872667398463227,
      "grad_norm": 1.287665605545044,
      "learning_rate": 8.190203562340969e-06,
      "loss": 0.0113,
      "step": 1070
    },
    {
      "epoch": 0.5927552140504939,
      "grad_norm": 1.2491388320922852,
      "learning_rate": 8.158396946564886e-06,
      "loss": 0.0131,
      "step": 1080
    },
    {
      "epoch": 0.5982436882546652,
      "grad_norm": 1.8166123628616333,
      "learning_rate": 8.126590330788804e-06,
      "loss": 0.0135,
      "step": 1090
    },
    {
      "epoch": 0.6037321624588364,
      "grad_norm": 0.9061824679374695,
      "learning_rate": 8.094783715012723e-06,
      "loss": 0.0123,
      "step": 1100
    },
    {
      "epoch": 0.6092206366630076,
      "grad_norm": 1.2774139642715454,
      "learning_rate": 8.062977099236641e-06,
      "loss": 0.0118,
      "step": 1110
    },
    {
      "epoch": 0.6147091108671789,
      "grad_norm": 1.7925004959106445,
      "learning_rate": 8.031170483460562e-06,
      "loss": 0.014,
      "step": 1120
    },
    {
      "epoch": 0.6201975850713501,
      "grad_norm": 1.256042242050171,
      "learning_rate": 7.999363867684478e-06,
      "loss": 0.0174,
      "step": 1130
    },
    {
      "epoch": 0.6256860592755215,
      "grad_norm": 1.2440769672393799,
      "learning_rate": 7.967557251908397e-06,
      "loss": 0.0114,
      "step": 1140
    },
    {
      "epoch": 0.6311745334796927,
      "grad_norm": 1.6593252420425415,
      "learning_rate": 7.935750636132317e-06,
      "loss": 0.0119,
      "step": 1150
    },
    {
      "epoch": 0.6366630076838639,
      "grad_norm": 1.7107939720153809,
      "learning_rate": 7.903944020356234e-06,
      "loss": 0.014,
      "step": 1160
    },
    {
      "epoch": 0.6421514818880352,
      "grad_norm": 1.2454367876052856,
      "learning_rate": 7.872137404580154e-06,
      "loss": 0.0126,
      "step": 1170
    },
    {
      "epoch": 0.6476399560922064,
      "grad_norm": 1.0048370361328125,
      "learning_rate": 7.840330788804071e-06,
      "loss": 0.0113,
      "step": 1180
    },
    {
      "epoch": 0.6531284302963776,
      "grad_norm": 4.3503098487854,
      "learning_rate": 7.80852417302799e-06,
      "loss": 0.01,
      "step": 1190
    },
    {
      "epoch": 0.6586169045005489,
      "grad_norm": 2.078575611114502,
      "learning_rate": 7.77671755725191e-06,
      "loss": 0.0131,
      "step": 1200
    },
    {
      "epoch": 0.6641053787047201,
      "grad_norm": 2.2236897945404053,
      "learning_rate": 7.744910941475827e-06,
      "loss": 0.0143,
      "step": 1210
    },
    {
      "epoch": 0.6695938529088913,
      "grad_norm": 2.2201192378997803,
      "learning_rate": 7.713104325699747e-06,
      "loss": 0.0098,
      "step": 1220
    },
    {
      "epoch": 0.6750823271130626,
      "grad_norm": 1.5262202024459839,
      "learning_rate": 7.681297709923665e-06,
      "loss": 0.0163,
      "step": 1230
    },
    {
      "epoch": 0.6805708013172338,
      "grad_norm": 0.6526926755905151,
      "learning_rate": 7.649491094147582e-06,
      "loss": 0.0093,
      "step": 1240
    },
    {
      "epoch": 0.686059275521405,
      "grad_norm": 0.6294535994529724,
      "learning_rate": 7.6176844783715025e-06,
      "loss": 0.013,
      "step": 1250
    },
    {
      "epoch": 0.6915477497255763,
      "grad_norm": 0.6937686800956726,
      "learning_rate": 7.58587786259542e-06,
      "loss": 0.0121,
      "step": 1260
    },
    {
      "epoch": 0.6970362239297475,
      "grad_norm": 1.6241185665130615,
      "learning_rate": 7.554071246819339e-06,
      "loss": 0.0146,
      "step": 1270
    },
    {
      "epoch": 0.7025246981339188,
      "grad_norm": 1.467155933380127,
      "learning_rate": 7.522264631043258e-06,
      "loss": 0.0131,
      "step": 1280
    },
    {
      "epoch": 0.70801317233809,
      "grad_norm": 1.753973126411438,
      "learning_rate": 7.490458015267176e-06,
      "loss": 0.014,
      "step": 1290
    },
    {
      "epoch": 0.7135016465422612,
      "grad_norm": 1.4710702896118164,
      "learning_rate": 7.458651399491095e-06,
      "loss": 0.0103,
      "step": 1300
    },
    {
      "epoch": 0.7189901207464325,
      "grad_norm": 2.0423262119293213,
      "learning_rate": 7.426844783715014e-06,
      "loss": 0.0107,
      "step": 1310
    },
    {
      "epoch": 0.7244785949506037,
      "grad_norm": 1.1584227085113525,
      "learning_rate": 7.395038167938931e-06,
      "loss": 0.0099,
      "step": 1320
    },
    {
      "epoch": 0.7299670691547749,
      "grad_norm": 1.1535860300064087,
      "learning_rate": 7.363231552162851e-06,
      "loss": 0.0113,
      "step": 1330
    },
    {
      "epoch": 0.7354555433589463,
      "grad_norm": 0.7290008664131165,
      "learning_rate": 7.331424936386769e-06,
      "loss": 0.0111,
      "step": 1340
    },
    {
      "epoch": 0.7409440175631175,
      "grad_norm": 0.7790582776069641,
      "learning_rate": 7.299618320610688e-06,
      "loss": 0.0067,
      "step": 1350
    },
    {
      "epoch": 0.7464324917672887,
      "grad_norm": 1.8725967407226562,
      "learning_rate": 7.267811704834606e-06,
      "loss": 0.0132,
      "step": 1360
    },
    {
      "epoch": 0.75192096597146,
      "grad_norm": 2.039541721343994,
      "learning_rate": 7.236005089058524e-06,
      "loss": 0.0186,
      "step": 1370
    },
    {
      "epoch": 0.7574094401756312,
      "grad_norm": 1.802741527557373,
      "learning_rate": 7.204198473282443e-06,
      "loss": 0.0127,
      "step": 1380
    },
    {
      "epoch": 0.7628979143798024,
      "grad_norm": 1.0849511623382568,
      "learning_rate": 7.172391857506362e-06,
      "loss": 0.0156,
      "step": 1390
    },
    {
      "epoch": 0.7683863885839737,
      "grad_norm": 1.2373745441436768,
      "learning_rate": 7.1405852417302805e-06,
      "loss": 0.0168,
      "step": 1400
    },
    {
      "epoch": 0.7738748627881449,
      "grad_norm": 1.8411822319030762,
      "learning_rate": 7.108778625954199e-06,
      "loss": 0.0139,
      "step": 1410
    },
    {
      "epoch": 0.7793633369923162,
      "grad_norm": 2.8104448318481445,
      "learning_rate": 7.076972010178118e-06,
      "loss": 0.0206,
      "step": 1420
    },
    {
      "epoch": 0.7848518111964874,
      "grad_norm": 0.9695596098899841,
      "learning_rate": 7.045165394402036e-06,
      "loss": 0.0123,
      "step": 1430
    },
    {
      "epoch": 0.7903402854006586,
      "grad_norm": 1.6235179901123047,
      "learning_rate": 7.013358778625955e-06,
      "loss": 0.0115,
      "step": 1440
    },
    {
      "epoch": 0.7958287596048299,
      "grad_norm": 1.1207462549209595,
      "learning_rate": 6.981552162849873e-06,
      "loss": 0.0097,
      "step": 1450
    },
    {
      "epoch": 0.8013172338090011,
      "grad_norm": 1.1788724660873413,
      "learning_rate": 6.949745547073792e-06,
      "loss": 0.0095,
      "step": 1460
    },
    {
      "epoch": 0.8068057080131723,
      "grad_norm": 2.085524320602417,
      "learning_rate": 6.917938931297711e-06,
      "loss": 0.0136,
      "step": 1470
    },
    {
      "epoch": 0.8122941822173436,
      "grad_norm": 1.6332577466964722,
      "learning_rate": 6.886132315521629e-06,
      "loss": 0.0102,
      "step": 1480
    },
    {
      "epoch": 0.8177826564215148,
      "grad_norm": 1.769086241722107,
      "learning_rate": 6.854325699745547e-06,
      "loss": 0.0118,
      "step": 1490
    },
    {
      "epoch": 0.823271130625686,
      "grad_norm": 1.046510934829712,
      "learning_rate": 6.822519083969467e-06,
      "loss": 0.0094,
      "step": 1500
    },
    {
      "epoch": 0.8287596048298573,
      "grad_norm": 1.5111862421035767,
      "learning_rate": 6.790712468193384e-06,
      "loss": 0.0143,
      "step": 1510
    },
    {
      "epoch": 0.8342480790340285,
      "grad_norm": 1.3604211807250977,
      "learning_rate": 6.758905852417304e-06,
      "loss": 0.0138,
      "step": 1520
    },
    {
      "epoch": 0.8397365532381997,
      "grad_norm": 0.9713101387023926,
      "learning_rate": 6.727099236641222e-06,
      "loss": 0.01,
      "step": 1530
    },
    {
      "epoch": 0.845225027442371,
      "grad_norm": 1.2814525365829468,
      "learning_rate": 6.69529262086514e-06,
      "loss": 0.0084,
      "step": 1540
    },
    {
      "epoch": 0.8507135016465422,
      "grad_norm": 0.9360769391059875,
      "learning_rate": 6.663486005089059e-06,
      "loss": 0.0095,
      "step": 1550
    },
    {
      "epoch": 0.8562019758507134,
      "grad_norm": 2.029505491256714,
      "learning_rate": 6.631679389312977e-06,
      "loss": 0.012,
      "step": 1560
    },
    {
      "epoch": 0.8616904500548848,
      "grad_norm": 1.2836129665374756,
      "learning_rate": 6.599872773536896e-06,
      "loss": 0.0178,
      "step": 1570
    },
    {
      "epoch": 0.867178924259056,
      "grad_norm": 1.5491465330123901,
      "learning_rate": 6.568066157760815e-06,
      "loss": 0.0121,
      "step": 1580
    },
    {
      "epoch": 0.8726673984632273,
      "grad_norm": 1.215768575668335,
      "learning_rate": 6.536259541984733e-06,
      "loss": 0.0167,
      "step": 1590
    },
    {
      "epoch": 0.8781558726673985,
      "grad_norm": 1.0636669397354126,
      "learning_rate": 6.504452926208652e-06,
      "loss": 0.0094,
      "step": 1600
    },
    {
      "epoch": 0.8836443468715697,
      "grad_norm": 1.4701627492904663,
      "learning_rate": 6.4726463104325706e-06,
      "loss": 0.0124,
      "step": 1610
    },
    {
      "epoch": 0.889132821075741,
      "grad_norm": 1.176419734954834,
      "learning_rate": 6.440839694656489e-06,
      "loss": 0.0123,
      "step": 1620
    },
    {
      "epoch": 0.8946212952799122,
      "grad_norm": 2.032910108566284,
      "learning_rate": 6.409033078880408e-06,
      "loss": 0.0114,
      "step": 1630
    },
    {
      "epoch": 0.9001097694840834,
      "grad_norm": 1.0917820930480957,
      "learning_rate": 6.377226463104325e-06,
      "loss": 0.0107,
      "step": 1640
    },
    {
      "epoch": 0.9055982436882547,
      "grad_norm": 1.4592185020446777,
      "learning_rate": 6.345419847328245e-06,
      "loss": 0.0128,
      "step": 1650
    },
    {
      "epoch": 0.9110867178924259,
      "grad_norm": 1.2474491596221924,
      "learning_rate": 6.313613231552164e-06,
      "loss": 0.0122,
      "step": 1660
    },
    {
      "epoch": 0.9165751920965971,
      "grad_norm": 1.5561631917953491,
      "learning_rate": 6.281806615776082e-06,
      "loss": 0.0107,
      "step": 1670
    },
    {
      "epoch": 0.9220636663007684,
      "grad_norm": 0.8761013746261597,
      "learning_rate": 6.25e-06,
      "loss": 0.0068,
      "step": 1680
    },
    {
      "epoch": 0.9275521405049396,
      "grad_norm": 2.1419386863708496,
      "learning_rate": 6.21819338422392e-06,
      "loss": 0.0147,
      "step": 1690
    },
    {
      "epoch": 0.9330406147091108,
      "grad_norm": 1.0107790231704712,
      "learning_rate": 6.186386768447837e-06,
      "loss": 0.0075,
      "step": 1700
    },
    {
      "epoch": 0.9385290889132821,
      "grad_norm": 0.9932330846786499,
      "learning_rate": 6.154580152671757e-06,
      "loss": 0.0079,
      "step": 1710
    },
    {
      "epoch": 0.9440175631174533,
      "grad_norm": 1.2500951290130615,
      "learning_rate": 6.122773536895675e-06,
      "loss": 0.0108,
      "step": 1720
    },
    {
      "epoch": 0.9495060373216246,
      "grad_norm": 1.5545804500579834,
      "learning_rate": 6.090966921119593e-06,
      "loss": 0.0104,
      "step": 1730
    },
    {
      "epoch": 0.9549945115257958,
      "grad_norm": 1.4742019176483154,
      "learning_rate": 6.059160305343512e-06,
      "loss": 0.0139,
      "step": 1740
    },
    {
      "epoch": 0.960482985729967,
      "grad_norm": 0.8499981760978699,
      "learning_rate": 6.02735368956743e-06,
      "loss": 0.0073,
      "step": 1750
    },
    {
      "epoch": 0.9659714599341384,
      "grad_norm": 0.7065290808677673,
      "learning_rate": 5.9955470737913494e-06,
      "loss": 0.0074,
      "step": 1760
    },
    {
      "epoch": 0.9714599341383096,
      "grad_norm": 1.6678274869918823,
      "learning_rate": 5.963740458015268e-06,
      "loss": 0.0098,
      "step": 1770
    },
    {
      "epoch": 0.9769484083424808,
      "grad_norm": 1.185567855834961,
      "learning_rate": 5.931933842239186e-06,
      "loss": 0.0118,
      "step": 1780
    },
    {
      "epoch": 0.9824368825466521,
      "grad_norm": 1.7147798538208008,
      "learning_rate": 5.900127226463105e-06,
      "loss": 0.012,
      "step": 1790
    },
    {
      "epoch": 0.9879253567508233,
      "grad_norm": 2.5320818424224854,
      "learning_rate": 5.8683206106870236e-06,
      "loss": 0.0059,
      "step": 1800
    },
    {
      "epoch": 0.9934138309549945,
      "grad_norm": 1.0351759195327759,
      "learning_rate": 5.836513994910942e-06,
      "loss": 0.0087,
      "step": 1810
    },
    {
      "epoch": 0.9989023051591658,
      "grad_norm": 1.2726657390594482,
      "learning_rate": 5.804707379134861e-06,
      "loss": 0.0133,
      "step": 1820
    },
    {
      "epoch": 1.004390779363337,
      "grad_norm": 0.4543689489364624,
      "learning_rate": 5.772900763358778e-06,
      "loss": 0.0043,
      "step": 1830
    },
    {
      "epoch": 1.0098792535675083,
      "grad_norm": 2.0367791652679443,
      "learning_rate": 5.741094147582698e-06,
      "loss": 0.0044,
      "step": 1840
    },
    {
      "epoch": 1.0153677277716795,
      "grad_norm": 0.6520805358886719,
      "learning_rate": 5.709287531806616e-06,
      "loss": 0.004,
      "step": 1850
    },
    {
      "epoch": 1.0208562019758507,
      "grad_norm": 0.8149614930152893,
      "learning_rate": 5.677480916030535e-06,
      "loss": 0.0032,
      "step": 1860
    },
    {
      "epoch": 1.026344676180022,
      "grad_norm": 0.4136104881763458,
      "learning_rate": 5.645674300254453e-06,
      "loss": 0.0036,
      "step": 1870
    },
    {
      "epoch": 1.031833150384193,
      "grad_norm": 1.050353765487671,
      "learning_rate": 5.613867684478373e-06,
      "loss": 0.0045,
      "step": 1880
    },
    {
      "epoch": 1.0373216245883645,
      "grad_norm": 2.067906379699707,
      "learning_rate": 5.58206106870229e-06,
      "loss": 0.0037,
      "step": 1890
    },
    {
      "epoch": 1.0428100987925357,
      "grad_norm": 0.31829890608787537,
      "learning_rate": 5.550254452926209e-06,
      "loss": 0.0044,
      "step": 1900
    },
    {
      "epoch": 1.048298572996707,
      "grad_norm": 0.434925377368927,
      "learning_rate": 5.518447837150128e-06,
      "loss": 0.0027,
      "step": 1910
    },
    {
      "epoch": 1.0537870472008781,
      "grad_norm": 1.5393106937408447,
      "learning_rate": 5.486641221374046e-06,
      "loss": 0.0043,
      "step": 1920
    },
    {
      "epoch": 1.0592755214050493,
      "grad_norm": 0.3788773715496063,
      "learning_rate": 5.454834605597965e-06,
      "loss": 0.002,
      "step": 1930
    },
    {
      "epoch": 1.0647639956092205,
      "grad_norm": 0.29814398288726807,
      "learning_rate": 5.423027989821883e-06,
      "loss": 0.0042,
      "step": 1940
    },
    {
      "epoch": 1.070252469813392,
      "grad_norm": 0.24681848287582397,
      "learning_rate": 5.391221374045802e-06,
      "loss": 0.0049,
      "step": 1950
    },
    {
      "epoch": 1.0757409440175631,
      "grad_norm": 0.11974932998418808,
      "learning_rate": 5.359414758269721e-06,
      "loss": 0.0032,
      "step": 1960
    },
    {
      "epoch": 1.0812294182217344,
      "grad_norm": 1.4361236095428467,
      "learning_rate": 5.327608142493639e-06,
      "loss": 0.0028,
      "step": 1970
    },
    {
      "epoch": 1.0867178924259056,
      "grad_norm": 0.645820140838623,
      "learning_rate": 5.295801526717558e-06,
      "loss": 0.0024,
      "step": 1980
    },
    {
      "epoch": 1.0922063666300768,
      "grad_norm": 0.14708861708641052,
      "learning_rate": 5.2639949109414766e-06,
      "loss": 0.0017,
      "step": 1990
    },
    {
      "epoch": 1.0976948408342482,
      "grad_norm": 0.40531185269355774,
      "learning_rate": 5.232188295165394e-06,
      "loss": 0.0032,
      "step": 2000
    },
    {
      "epoch": 1.0976948408342482,
      "eval_loss": 0.0047075627371668816,
      "eval_runtime": 10648.9323,
      "eval_samples_per_second": 1.369,
      "eval_steps_per_second": 0.171,
      "eval_wer": 0.3999578198909343,
      "step": 2000
    }
  ],
  "logging_steps": 10,
  "max_steps": 3644,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 6.531565226655744e+19,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}