{ "best_global_step": 2000, "best_metric": 0.3999578198909343, "best_model_checkpoint": "./SALAMA_NEWMEDTTTT/checkpoint-2000", "epoch": 1.0976948408342482, "eval_steps": 2000, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005488474204171241, "grad_norm": 1.0394881963729858, "learning_rate": 1.8e-07, "loss": 0.0043, "step": 10 }, { "epoch": 0.010976948408342482, "grad_norm": 0.25432130694389343, "learning_rate": 3.8e-07, "loss": 0.0027, "step": 20 }, { "epoch": 0.01646542261251372, "grad_norm": 0.6018465161323547, "learning_rate": 5.800000000000001e-07, "loss": 0.0037, "step": 30 }, { "epoch": 0.021953896816684963, "grad_norm": 0.07274393737316132, "learning_rate": 7.8e-07, "loss": 0.0034, "step": 40 }, { "epoch": 0.027442371020856202, "grad_norm": 1.1111565828323364, "learning_rate": 9.800000000000001e-07, "loss": 0.0032, "step": 50 }, { "epoch": 0.03293084522502744, "grad_norm": 2.1740646362304688, "learning_rate": 1.1800000000000001e-06, "loss": 0.0086, "step": 60 }, { "epoch": 0.038419319429198684, "grad_norm": 1.1649271249771118, "learning_rate": 1.3800000000000001e-06, "loss": 0.0049, "step": 70 }, { "epoch": 0.043907793633369926, "grad_norm": 1.0835011005401611, "learning_rate": 1.5800000000000001e-06, "loss": 0.0036, "step": 80 }, { "epoch": 0.04939626783754116, "grad_norm": 2.567765474319458, "learning_rate": 1.7800000000000001e-06, "loss": 0.0072, "step": 90 }, { "epoch": 0.054884742041712405, "grad_norm": 0.5648300647735596, "learning_rate": 1.98e-06, "loss": 0.0033, "step": 100 }, { "epoch": 0.06037321624588365, "grad_norm": 0.5851211547851562, "learning_rate": 2.1800000000000003e-06, "loss": 0.0042, "step": 110 }, { "epoch": 0.06586169045005488, "grad_norm": 0.40879732370376587, "learning_rate": 2.38e-06, "loss": 0.004, "step": 120 }, { "epoch": 0.07135016465422613, "grad_norm": 0.36008283495903015, "learning_rate": 2.5800000000000003e-06, "loss": 0.0037, "step": 130 }, { "epoch": 0.07683863885839737, "grad_norm": 0.07423322647809982, "learning_rate": 2.7800000000000005e-06, "loss": 0.0075, "step": 140 }, { "epoch": 0.08232711306256861, "grad_norm": 1.0768777132034302, "learning_rate": 2.9800000000000003e-06, "loss": 0.0067, "step": 150 }, { "epoch": 0.08781558726673985, "grad_norm": 0.29102134704589844, "learning_rate": 3.1800000000000005e-06, "loss": 0.0035, "step": 160 }, { "epoch": 0.09330406147091108, "grad_norm": 0.5590409636497498, "learning_rate": 3.3800000000000007e-06, "loss": 0.0031, "step": 170 }, { "epoch": 0.09879253567508232, "grad_norm": 0.4114173948764801, "learning_rate": 3.58e-06, "loss": 0.0038, "step": 180 }, { "epoch": 0.10428100987925357, "grad_norm": 0.9015783667564392, "learning_rate": 3.7800000000000002e-06, "loss": 0.004, "step": 190 }, { "epoch": 0.10976948408342481, "grad_norm": 0.26067736744880676, "learning_rate": 3.980000000000001e-06, "loss": 0.0101, "step": 200 }, { "epoch": 0.11525795828759605, "grad_norm": 0.819459080696106, "learning_rate": 4.18e-06, "loss": 0.0043, "step": 210 }, { "epoch": 0.1207464324917673, "grad_norm": 0.9547446966171265, "learning_rate": 4.38e-06, "loss": 0.0078, "step": 220 }, { "epoch": 0.12623490669593854, "grad_norm": 0.6792054772377014, "learning_rate": 4.58e-06, "loss": 0.0058, "step": 230 }, { "epoch": 0.13172338090010977, "grad_norm": 0.04598504304885864, "learning_rate": 4.78e-06, "loss": 0.0058, "step": 240 }, { "epoch": 0.13721185510428102, "grad_norm": 0.977815568447113, "learning_rate": 4.980000000000001e-06, "loss": 0.0065, "step": 250 }, { "epoch": 0.14270032930845225, "grad_norm": 1.0802408456802368, "learning_rate": 5.18e-06, "loss": 0.0086, "step": 260 }, { "epoch": 0.14818880351262348, "grad_norm": 0.30211061239242554, "learning_rate": 5.380000000000001e-06, "loss": 0.0045, "step": 270 }, { "epoch": 0.15367727771679474, "grad_norm": 1.0189473628997803, "learning_rate": 5.580000000000001e-06, "loss": 0.0035, "step": 280 }, { "epoch": 0.15916575192096596, "grad_norm": 1.2080388069152832, "learning_rate": 5.78e-06, "loss": 0.0054, "step": 290 }, { "epoch": 0.16465422612513722, "grad_norm": 0.7697501182556152, "learning_rate": 5.98e-06, "loss": 0.0074, "step": 300 }, { "epoch": 0.17014270032930845, "grad_norm": 0.23319111764431, "learning_rate": 6.18e-06, "loss": 0.0075, "step": 310 }, { "epoch": 0.1756311745334797, "grad_norm": 1.1132267713546753, "learning_rate": 6.380000000000001e-06, "loss": 0.006, "step": 320 }, { "epoch": 0.18111964873765093, "grad_norm": 0.9462475776672363, "learning_rate": 6.5800000000000005e-06, "loss": 0.0082, "step": 330 }, { "epoch": 0.18660812294182216, "grad_norm": 0.6547773480415344, "learning_rate": 6.780000000000001e-06, "loss": 0.0064, "step": 340 }, { "epoch": 0.19209659714599342, "grad_norm": 1.4683443307876587, "learning_rate": 6.98e-06, "loss": 0.0059, "step": 350 }, { "epoch": 0.19758507135016465, "grad_norm": 0.6405034065246582, "learning_rate": 7.180000000000001e-06, "loss": 0.0096, "step": 360 }, { "epoch": 0.2030735455543359, "grad_norm": 1.1234091520309448, "learning_rate": 7.3800000000000005e-06, "loss": 0.0099, "step": 370 }, { "epoch": 0.20856201975850713, "grad_norm": 0.9663105607032776, "learning_rate": 7.58e-06, "loss": 0.0092, "step": 380 }, { "epoch": 0.21405049396267836, "grad_norm": 0.7793697714805603, "learning_rate": 7.78e-06, "loss": 0.0081, "step": 390 }, { "epoch": 0.21953896816684962, "grad_norm": 0.7131162285804749, "learning_rate": 7.980000000000002e-06, "loss": 0.0084, "step": 400 }, { "epoch": 0.22502744237102085, "grad_norm": 1.2374234199523926, "learning_rate": 8.18e-06, "loss": 0.0106, "step": 410 }, { "epoch": 0.2305159165751921, "grad_norm": 1.7101589441299438, "learning_rate": 8.380000000000001e-06, "loss": 0.0106, "step": 420 }, { "epoch": 0.23600439077936333, "grad_norm": 1.1548316478729248, "learning_rate": 8.580000000000001e-06, "loss": 0.0078, "step": 430 }, { "epoch": 0.2414928649835346, "grad_norm": 0.6724960803985596, "learning_rate": 8.78e-06, "loss": 0.0069, "step": 440 }, { "epoch": 0.24698133918770582, "grad_norm": 1.403664469718933, "learning_rate": 8.98e-06, "loss": 0.0094, "step": 450 }, { "epoch": 0.2524698133918771, "grad_norm": 1.1001019477844238, "learning_rate": 9.180000000000002e-06, "loss": 0.0107, "step": 460 }, { "epoch": 0.2579582875960483, "grad_norm": 1.0355250835418701, "learning_rate": 9.38e-06, "loss": 0.0081, "step": 470 }, { "epoch": 0.26344676180021953, "grad_norm": 1.619025707244873, "learning_rate": 9.58e-06, "loss": 0.0134, "step": 480 }, { "epoch": 0.2689352360043908, "grad_norm": 1.4473015069961548, "learning_rate": 9.780000000000001e-06, "loss": 0.0119, "step": 490 }, { "epoch": 0.27442371020856204, "grad_norm": 1.3764768838882446, "learning_rate": 9.980000000000001e-06, "loss": 0.0086, "step": 500 }, { "epoch": 0.27991218441273324, "grad_norm": 1.75978422164917, "learning_rate": 9.971374045801527e-06, "loss": 0.0106, "step": 510 }, { "epoch": 0.2854006586169045, "grad_norm": 2.658644914627075, "learning_rate": 9.939567430025446e-06, "loss": 0.0146, "step": 520 }, { "epoch": 0.29088913282107576, "grad_norm": 0.3355913758277893, "learning_rate": 9.907760814249365e-06, "loss": 0.0134, "step": 530 }, { "epoch": 0.29637760702524696, "grad_norm": 1.7025257349014282, "learning_rate": 9.875954198473283e-06, "loss": 0.0151, "step": 540 }, { "epoch": 0.3018660812294182, "grad_norm": 1.6538467407226562, "learning_rate": 9.844147582697202e-06, "loss": 0.0147, "step": 550 }, { "epoch": 0.30735455543358947, "grad_norm": 1.4546349048614502, "learning_rate": 9.81234096692112e-06, "loss": 0.0181, "step": 560 }, { "epoch": 0.31284302963776073, "grad_norm": 1.5585579872131348, "learning_rate": 9.780534351145039e-06, "loss": 0.0163, "step": 570 }, { "epoch": 0.31833150384193193, "grad_norm": 1.1905714273452759, "learning_rate": 9.748727735368957e-06, "loss": 0.0158, "step": 580 }, { "epoch": 0.3238199780461032, "grad_norm": 1.6334969997406006, "learning_rate": 9.716921119592876e-06, "loss": 0.0128, "step": 590 }, { "epoch": 0.32930845225027444, "grad_norm": 1.060271143913269, "learning_rate": 9.685114503816794e-06, "loss": 0.018, "step": 600 }, { "epoch": 0.33479692645444564, "grad_norm": 1.6735498905181885, "learning_rate": 9.653307888040713e-06, "loss": 0.0114, "step": 610 }, { "epoch": 0.3402854006586169, "grad_norm": 1.7198753356933594, "learning_rate": 9.621501272264631e-06, "loss": 0.0156, "step": 620 }, { "epoch": 0.34577387486278816, "grad_norm": 0.7011512517929077, "learning_rate": 9.58969465648855e-06, "loss": 0.0124, "step": 630 }, { "epoch": 0.3512623490669594, "grad_norm": 1.9055498838424683, "learning_rate": 9.557888040712468e-06, "loss": 0.0177, "step": 640 }, { "epoch": 0.3567508232711306, "grad_norm": 1.77641761302948, "learning_rate": 9.526081424936387e-06, "loss": 0.0114, "step": 650 }, { "epoch": 0.36223929747530187, "grad_norm": 2.173353910446167, "learning_rate": 9.494274809160307e-06, "loss": 0.0187, "step": 660 }, { "epoch": 0.3677277716794731, "grad_norm": 1.061390995979309, "learning_rate": 9.462468193384224e-06, "loss": 0.0132, "step": 670 }, { "epoch": 0.3732162458836443, "grad_norm": 0.8496463298797607, "learning_rate": 9.430661577608143e-06, "loss": 0.0136, "step": 680 }, { "epoch": 0.3787047200878156, "grad_norm": 1.2099004983901978, "learning_rate": 9.398854961832063e-06, "loss": 0.0109, "step": 690 }, { "epoch": 0.38419319429198684, "grad_norm": 1.3495599031448364, "learning_rate": 9.36704834605598e-06, "loss": 0.0153, "step": 700 }, { "epoch": 0.3896816684961581, "grad_norm": 0.764531135559082, "learning_rate": 9.3352417302799e-06, "loss": 0.0073, "step": 710 }, { "epoch": 0.3951701427003293, "grad_norm": 2.1928865909576416, "learning_rate": 9.303435114503817e-06, "loss": 0.0135, "step": 720 }, { "epoch": 0.40065861690450055, "grad_norm": 1.8005603551864624, "learning_rate": 9.271628498727735e-06, "loss": 0.0187, "step": 730 }, { "epoch": 0.4061470911086718, "grad_norm": 1.2742944955825806, "learning_rate": 9.239821882951655e-06, "loss": 0.0089, "step": 740 }, { "epoch": 0.411635565312843, "grad_norm": 1.6193122863769531, "learning_rate": 9.208015267175572e-06, "loss": 0.0152, "step": 750 }, { "epoch": 0.41712403951701427, "grad_norm": 1.4442307949066162, "learning_rate": 9.176208651399493e-06, "loss": 0.0162, "step": 760 }, { "epoch": 0.4226125137211855, "grad_norm": 0.9129316806793213, "learning_rate": 9.144402035623411e-06, "loss": 0.0151, "step": 770 }, { "epoch": 0.4281009879253567, "grad_norm": 1.479588270187378, "learning_rate": 9.112595419847328e-06, "loss": 0.014, "step": 780 }, { "epoch": 0.433589462129528, "grad_norm": 1.5315167903900146, "learning_rate": 9.080788804071248e-06, "loss": 0.0123, "step": 790 }, { "epoch": 0.43907793633369924, "grad_norm": 2.470548391342163, "learning_rate": 9.048982188295165e-06, "loss": 0.0112, "step": 800 }, { "epoch": 0.4445664105378705, "grad_norm": 1.5762847661972046, "learning_rate": 9.017175572519085e-06, "loss": 0.0163, "step": 810 }, { "epoch": 0.4500548847420417, "grad_norm": 1.4822980165481567, "learning_rate": 8.985368956743004e-06, "loss": 0.0145, "step": 820 }, { "epoch": 0.45554335894621295, "grad_norm": 2.682856798171997, "learning_rate": 8.95356234096692e-06, "loss": 0.0141, "step": 830 }, { "epoch": 0.4610318331503842, "grad_norm": 1.2349945306777954, "learning_rate": 8.92175572519084e-06, "loss": 0.0148, "step": 840 }, { "epoch": 0.4665203073545554, "grad_norm": 3.259676694869995, "learning_rate": 8.88994910941476e-06, "loss": 0.0212, "step": 850 }, { "epoch": 0.47200878155872666, "grad_norm": 1.4975826740264893, "learning_rate": 8.858142493638678e-06, "loss": 0.0112, "step": 860 }, { "epoch": 0.4774972557628979, "grad_norm": 2.8876535892486572, "learning_rate": 8.826335877862596e-06, "loss": 0.0168, "step": 870 }, { "epoch": 0.4829857299670692, "grad_norm": 2.306791305541992, "learning_rate": 8.794529262086515e-06, "loss": 0.0192, "step": 880 }, { "epoch": 0.4884742041712404, "grad_norm": 0.6873131394386292, "learning_rate": 8.762722646310434e-06, "loss": 0.0144, "step": 890 }, { "epoch": 0.49396267837541163, "grad_norm": 3.158386468887329, "learning_rate": 8.730916030534352e-06, "loss": 0.0131, "step": 900 }, { "epoch": 0.4994511525795829, "grad_norm": 0.8878953456878662, "learning_rate": 8.69910941475827e-06, "loss": 0.0173, "step": 910 }, { "epoch": 0.5049396267837541, "grad_norm": 1.9014732837677002, "learning_rate": 8.667302798982189e-06, "loss": 0.0112, "step": 920 }, { "epoch": 0.5104281009879253, "grad_norm": 1.7305513620376587, "learning_rate": 8.635496183206108e-06, "loss": 0.0137, "step": 930 }, { "epoch": 0.5159165751920965, "grad_norm": 1.7590184211730957, "learning_rate": 8.603689567430026e-06, "loss": 0.0126, "step": 940 }, { "epoch": 0.5214050493962679, "grad_norm": 1.3747210502624512, "learning_rate": 8.571882951653945e-06, "loss": 0.0156, "step": 950 }, { "epoch": 0.5268935236004391, "grad_norm": 1.0799747705459595, "learning_rate": 8.540076335877863e-06, "loss": 0.0101, "step": 960 }, { "epoch": 0.5323819978046103, "grad_norm": 0.8307255506515503, "learning_rate": 8.508269720101782e-06, "loss": 0.0145, "step": 970 }, { "epoch": 0.5378704720087816, "grad_norm": 1.852042317390442, "learning_rate": 8.4764631043257e-06, "loss": 0.0115, "step": 980 }, { "epoch": 0.5433589462129528, "grad_norm": 2.150557279586792, "learning_rate": 8.444656488549619e-06, "loss": 0.0107, "step": 990 }, { "epoch": 0.5488474204171241, "grad_norm": 0.7547608613967896, "learning_rate": 8.412849872773537e-06, "loss": 0.0119, "step": 1000 }, { "epoch": 0.5543358946212953, "grad_norm": 1.4302098751068115, "learning_rate": 8.381043256997456e-06, "loss": 0.0134, "step": 1010 }, { "epoch": 0.5598243688254665, "grad_norm": 2.210999011993408, "learning_rate": 8.349236641221374e-06, "loss": 0.0109, "step": 1020 }, { "epoch": 0.5653128430296378, "grad_norm": 3.0575549602508545, "learning_rate": 8.317430025445293e-06, "loss": 0.0288, "step": 1030 }, { "epoch": 0.570801317233809, "grad_norm": 1.2066882848739624, "learning_rate": 8.285623409669212e-06, "loss": 0.0109, "step": 1040 }, { "epoch": 0.5762897914379802, "grad_norm": 0.9596546292304993, "learning_rate": 8.25381679389313e-06, "loss": 0.0154, "step": 1050 }, { "epoch": 0.5817782656421515, "grad_norm": 1.2375856637954712, "learning_rate": 8.222010178117049e-06, "loss": 0.0117, "step": 1060 }, { "epoch": 0.5872667398463227, "grad_norm": 1.287665605545044, "learning_rate": 8.190203562340969e-06, "loss": 0.0113, "step": 1070 }, { "epoch": 0.5927552140504939, "grad_norm": 1.2491388320922852, "learning_rate": 8.158396946564886e-06, "loss": 0.0131, "step": 1080 }, { "epoch": 0.5982436882546652, "grad_norm": 1.8166123628616333, "learning_rate": 8.126590330788804e-06, "loss": 0.0135, "step": 1090 }, { "epoch": 0.6037321624588364, "grad_norm": 0.9061824679374695, "learning_rate": 8.094783715012723e-06, "loss": 0.0123, "step": 1100 }, { "epoch": 0.6092206366630076, "grad_norm": 1.2774139642715454, "learning_rate": 8.062977099236641e-06, "loss": 0.0118, "step": 1110 }, { "epoch": 0.6147091108671789, "grad_norm": 1.7925004959106445, "learning_rate": 8.031170483460562e-06, "loss": 0.014, "step": 1120 }, { "epoch": 0.6201975850713501, "grad_norm": 1.256042242050171, "learning_rate": 7.999363867684478e-06, "loss": 0.0174, "step": 1130 }, { "epoch": 0.6256860592755215, "grad_norm": 1.2440769672393799, "learning_rate": 7.967557251908397e-06, "loss": 0.0114, "step": 1140 }, { "epoch": 0.6311745334796927, "grad_norm": 1.6593252420425415, "learning_rate": 7.935750636132317e-06, "loss": 0.0119, "step": 1150 }, { "epoch": 0.6366630076838639, "grad_norm": 1.7107939720153809, "learning_rate": 7.903944020356234e-06, "loss": 0.014, "step": 1160 }, { "epoch": 0.6421514818880352, "grad_norm": 1.2454367876052856, "learning_rate": 7.872137404580154e-06, "loss": 0.0126, "step": 1170 }, { "epoch": 0.6476399560922064, "grad_norm": 1.0048370361328125, "learning_rate": 7.840330788804071e-06, "loss": 0.0113, "step": 1180 }, { "epoch": 0.6531284302963776, "grad_norm": 4.3503098487854, "learning_rate": 7.80852417302799e-06, "loss": 0.01, "step": 1190 }, { "epoch": 0.6586169045005489, "grad_norm": 2.078575611114502, "learning_rate": 7.77671755725191e-06, "loss": 0.0131, "step": 1200 }, { "epoch": 0.6641053787047201, "grad_norm": 2.2236897945404053, "learning_rate": 7.744910941475827e-06, "loss": 0.0143, "step": 1210 }, { "epoch": 0.6695938529088913, "grad_norm": 2.2201192378997803, "learning_rate": 7.713104325699747e-06, "loss": 0.0098, "step": 1220 }, { "epoch": 0.6750823271130626, "grad_norm": 1.5262202024459839, "learning_rate": 7.681297709923665e-06, "loss": 0.0163, "step": 1230 }, { "epoch": 0.6805708013172338, "grad_norm": 0.6526926755905151, "learning_rate": 7.649491094147582e-06, "loss": 0.0093, "step": 1240 }, { "epoch": 0.686059275521405, "grad_norm": 0.6294535994529724, "learning_rate": 7.6176844783715025e-06, "loss": 0.013, "step": 1250 }, { "epoch": 0.6915477497255763, "grad_norm": 0.6937686800956726, "learning_rate": 7.58587786259542e-06, "loss": 0.0121, "step": 1260 }, { "epoch": 0.6970362239297475, "grad_norm": 1.6241185665130615, "learning_rate": 7.554071246819339e-06, "loss": 0.0146, "step": 1270 }, { "epoch": 0.7025246981339188, "grad_norm": 1.467155933380127, "learning_rate": 7.522264631043258e-06, "loss": 0.0131, "step": 1280 }, { "epoch": 0.70801317233809, "grad_norm": 1.753973126411438, "learning_rate": 7.490458015267176e-06, "loss": 0.014, "step": 1290 }, { "epoch": 0.7135016465422612, "grad_norm": 1.4710702896118164, "learning_rate": 7.458651399491095e-06, "loss": 0.0103, "step": 1300 }, { "epoch": 0.7189901207464325, "grad_norm": 2.0423262119293213, "learning_rate": 7.426844783715014e-06, "loss": 0.0107, "step": 1310 }, { "epoch": 0.7244785949506037, "grad_norm": 1.1584227085113525, "learning_rate": 7.395038167938931e-06, "loss": 0.0099, "step": 1320 }, { "epoch": 0.7299670691547749, "grad_norm": 1.1535860300064087, "learning_rate": 7.363231552162851e-06, "loss": 0.0113, "step": 1330 }, { "epoch": 0.7354555433589463, "grad_norm": 0.7290008664131165, "learning_rate": 7.331424936386769e-06, "loss": 0.0111, "step": 1340 }, { "epoch": 0.7409440175631175, "grad_norm": 0.7790582776069641, "learning_rate": 7.299618320610688e-06, "loss": 0.0067, "step": 1350 }, { "epoch": 0.7464324917672887, "grad_norm": 1.8725967407226562, "learning_rate": 7.267811704834606e-06, "loss": 0.0132, "step": 1360 }, { "epoch": 0.75192096597146, "grad_norm": 2.039541721343994, "learning_rate": 7.236005089058524e-06, "loss": 0.0186, "step": 1370 }, { "epoch": 0.7574094401756312, "grad_norm": 1.802741527557373, "learning_rate": 7.204198473282443e-06, "loss": 0.0127, "step": 1380 }, { "epoch": 0.7628979143798024, "grad_norm": 1.0849511623382568, "learning_rate": 7.172391857506362e-06, "loss": 0.0156, "step": 1390 }, { "epoch": 0.7683863885839737, "grad_norm": 1.2373745441436768, "learning_rate": 7.1405852417302805e-06, "loss": 0.0168, "step": 1400 }, { "epoch": 0.7738748627881449, "grad_norm": 1.8411822319030762, "learning_rate": 7.108778625954199e-06, "loss": 0.0139, "step": 1410 }, { "epoch": 0.7793633369923162, "grad_norm": 2.8104448318481445, "learning_rate": 7.076972010178118e-06, "loss": 0.0206, "step": 1420 }, { "epoch": 0.7848518111964874, "grad_norm": 0.9695596098899841, "learning_rate": 7.045165394402036e-06, "loss": 0.0123, "step": 1430 }, { "epoch": 0.7903402854006586, "grad_norm": 1.6235179901123047, "learning_rate": 7.013358778625955e-06, "loss": 0.0115, "step": 1440 }, { "epoch": 0.7958287596048299, "grad_norm": 1.1207462549209595, "learning_rate": 6.981552162849873e-06, "loss": 0.0097, "step": 1450 }, { "epoch": 0.8013172338090011, "grad_norm": 1.1788724660873413, "learning_rate": 6.949745547073792e-06, "loss": 0.0095, "step": 1460 }, { "epoch": 0.8068057080131723, "grad_norm": 2.085524320602417, "learning_rate": 6.917938931297711e-06, "loss": 0.0136, "step": 1470 }, { "epoch": 0.8122941822173436, "grad_norm": 1.6332577466964722, "learning_rate": 6.886132315521629e-06, "loss": 0.0102, "step": 1480 }, { "epoch": 0.8177826564215148, "grad_norm": 1.769086241722107, "learning_rate": 6.854325699745547e-06, "loss": 0.0118, "step": 1490 }, { "epoch": 0.823271130625686, "grad_norm": 1.046510934829712, "learning_rate": 6.822519083969467e-06, "loss": 0.0094, "step": 1500 }, { "epoch": 0.8287596048298573, "grad_norm": 1.5111862421035767, "learning_rate": 6.790712468193384e-06, "loss": 0.0143, "step": 1510 }, { "epoch": 0.8342480790340285, "grad_norm": 1.3604211807250977, "learning_rate": 6.758905852417304e-06, "loss": 0.0138, "step": 1520 }, { "epoch": 0.8397365532381997, "grad_norm": 0.9713101387023926, "learning_rate": 6.727099236641222e-06, "loss": 0.01, "step": 1530 }, { "epoch": 0.845225027442371, "grad_norm": 1.2814525365829468, "learning_rate": 6.69529262086514e-06, "loss": 0.0084, "step": 1540 }, { "epoch": 0.8507135016465422, "grad_norm": 0.9360769391059875, "learning_rate": 6.663486005089059e-06, "loss": 0.0095, "step": 1550 }, { "epoch": 0.8562019758507134, "grad_norm": 2.029505491256714, "learning_rate": 6.631679389312977e-06, "loss": 0.012, "step": 1560 }, { "epoch": 0.8616904500548848, "grad_norm": 1.2836129665374756, "learning_rate": 6.599872773536896e-06, "loss": 0.0178, "step": 1570 }, { "epoch": 0.867178924259056, "grad_norm": 1.5491465330123901, "learning_rate": 6.568066157760815e-06, "loss": 0.0121, "step": 1580 }, { "epoch": 0.8726673984632273, "grad_norm": 1.215768575668335, "learning_rate": 6.536259541984733e-06, "loss": 0.0167, "step": 1590 }, { "epoch": 0.8781558726673985, "grad_norm": 1.0636669397354126, "learning_rate": 6.504452926208652e-06, "loss": 0.0094, "step": 1600 }, { "epoch": 0.8836443468715697, "grad_norm": 1.4701627492904663, "learning_rate": 6.4726463104325706e-06, "loss": 0.0124, "step": 1610 }, { "epoch": 0.889132821075741, "grad_norm": 1.176419734954834, "learning_rate": 6.440839694656489e-06, "loss": 0.0123, "step": 1620 }, { "epoch": 0.8946212952799122, "grad_norm": 2.032910108566284, "learning_rate": 6.409033078880408e-06, "loss": 0.0114, "step": 1630 }, { "epoch": 0.9001097694840834, "grad_norm": 1.0917820930480957, "learning_rate": 6.377226463104325e-06, "loss": 0.0107, "step": 1640 }, { "epoch": 0.9055982436882547, "grad_norm": 1.4592185020446777, "learning_rate": 6.345419847328245e-06, "loss": 0.0128, "step": 1650 }, { "epoch": 0.9110867178924259, "grad_norm": 1.2474491596221924, "learning_rate": 6.313613231552164e-06, "loss": 0.0122, "step": 1660 }, { "epoch": 0.9165751920965971, "grad_norm": 1.5561631917953491, "learning_rate": 6.281806615776082e-06, "loss": 0.0107, "step": 1670 }, { "epoch": 0.9220636663007684, "grad_norm": 0.8761013746261597, "learning_rate": 6.25e-06, "loss": 0.0068, "step": 1680 }, { "epoch": 0.9275521405049396, "grad_norm": 2.1419386863708496, "learning_rate": 6.21819338422392e-06, "loss": 0.0147, "step": 1690 }, { "epoch": 0.9330406147091108, "grad_norm": 1.0107790231704712, "learning_rate": 6.186386768447837e-06, "loss": 0.0075, "step": 1700 }, { "epoch": 0.9385290889132821, "grad_norm": 0.9932330846786499, "learning_rate": 6.154580152671757e-06, "loss": 0.0079, "step": 1710 }, { "epoch": 0.9440175631174533, "grad_norm": 1.2500951290130615, "learning_rate": 6.122773536895675e-06, "loss": 0.0108, "step": 1720 }, { "epoch": 0.9495060373216246, "grad_norm": 1.5545804500579834, "learning_rate": 6.090966921119593e-06, "loss": 0.0104, "step": 1730 }, { "epoch": 0.9549945115257958, "grad_norm": 1.4742019176483154, "learning_rate": 6.059160305343512e-06, "loss": 0.0139, "step": 1740 }, { "epoch": 0.960482985729967, "grad_norm": 0.8499981760978699, "learning_rate": 6.02735368956743e-06, "loss": 0.0073, "step": 1750 }, { "epoch": 0.9659714599341384, "grad_norm": 0.7065290808677673, "learning_rate": 5.9955470737913494e-06, "loss": 0.0074, "step": 1760 }, { "epoch": 0.9714599341383096, "grad_norm": 1.6678274869918823, "learning_rate": 5.963740458015268e-06, "loss": 0.0098, "step": 1770 }, { "epoch": 0.9769484083424808, "grad_norm": 1.185567855834961, "learning_rate": 5.931933842239186e-06, "loss": 0.0118, "step": 1780 }, { "epoch": 0.9824368825466521, "grad_norm": 1.7147798538208008, "learning_rate": 5.900127226463105e-06, "loss": 0.012, "step": 1790 }, { "epoch": 0.9879253567508233, "grad_norm": 2.5320818424224854, "learning_rate": 5.8683206106870236e-06, "loss": 0.0059, "step": 1800 }, { "epoch": 0.9934138309549945, "grad_norm": 1.0351759195327759, "learning_rate": 5.836513994910942e-06, "loss": 0.0087, "step": 1810 }, { "epoch": 0.9989023051591658, "grad_norm": 1.2726657390594482, "learning_rate": 5.804707379134861e-06, "loss": 0.0133, "step": 1820 }, { "epoch": 1.004390779363337, "grad_norm": 0.4543689489364624, "learning_rate": 5.772900763358778e-06, "loss": 0.0043, "step": 1830 }, { "epoch": 1.0098792535675083, "grad_norm": 2.0367791652679443, "learning_rate": 5.741094147582698e-06, "loss": 0.0044, "step": 1840 }, { "epoch": 1.0153677277716795, "grad_norm": 0.6520805358886719, "learning_rate": 5.709287531806616e-06, "loss": 0.004, "step": 1850 }, { "epoch": 1.0208562019758507, "grad_norm": 0.8149614930152893, "learning_rate": 5.677480916030535e-06, "loss": 0.0032, "step": 1860 }, { "epoch": 1.026344676180022, "grad_norm": 0.4136104881763458, "learning_rate": 5.645674300254453e-06, "loss": 0.0036, "step": 1870 }, { "epoch": 1.031833150384193, "grad_norm": 1.050353765487671, "learning_rate": 5.613867684478373e-06, "loss": 0.0045, "step": 1880 }, { "epoch": 1.0373216245883645, "grad_norm": 2.067906379699707, "learning_rate": 5.58206106870229e-06, "loss": 0.0037, "step": 1890 }, { "epoch": 1.0428100987925357, "grad_norm": 0.31829890608787537, "learning_rate": 5.550254452926209e-06, "loss": 0.0044, "step": 1900 }, { "epoch": 1.048298572996707, "grad_norm": 0.434925377368927, "learning_rate": 5.518447837150128e-06, "loss": 0.0027, "step": 1910 }, { "epoch": 1.0537870472008781, "grad_norm": 1.5393106937408447, "learning_rate": 5.486641221374046e-06, "loss": 0.0043, "step": 1920 }, { "epoch": 1.0592755214050493, "grad_norm": 0.3788773715496063, "learning_rate": 5.454834605597965e-06, "loss": 0.002, "step": 1930 }, { "epoch": 1.0647639956092205, "grad_norm": 0.29814398288726807, "learning_rate": 5.423027989821883e-06, "loss": 0.0042, "step": 1940 }, { "epoch": 1.070252469813392, "grad_norm": 0.24681848287582397, "learning_rate": 5.391221374045802e-06, "loss": 0.0049, "step": 1950 }, { "epoch": 1.0757409440175631, "grad_norm": 0.11974932998418808, "learning_rate": 5.359414758269721e-06, "loss": 0.0032, "step": 1960 }, { "epoch": 1.0812294182217344, "grad_norm": 1.4361236095428467, "learning_rate": 5.327608142493639e-06, "loss": 0.0028, "step": 1970 }, { "epoch": 1.0867178924259056, "grad_norm": 0.645820140838623, "learning_rate": 5.295801526717558e-06, "loss": 0.0024, "step": 1980 }, { "epoch": 1.0922063666300768, "grad_norm": 0.14708861708641052, "learning_rate": 5.2639949109414766e-06, "loss": 0.0017, "step": 1990 }, { "epoch": 1.0976948408342482, "grad_norm": 0.40531185269355774, "learning_rate": 5.232188295165394e-06, "loss": 0.0032, "step": 2000 }, { "epoch": 1.0976948408342482, "eval_loss": 0.0047075627371668816, "eval_runtime": 10648.9323, "eval_samples_per_second": 1.369, "eval_steps_per_second": 0.171, "eval_wer": 0.3999578198909343, "step": 2000 } ], "logging_steps": 10, "max_steps": 3644, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.531565226655744e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }