{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.108401684770561,
  "eval_steps": 500,
  "global_step": 5000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005542008423852805,
      "grad_norm": 0.07243233174085617,
      "learning_rate": 1.2315270935960592e-05,
      "loss": 1.4594,
      "step": 25
    },
    {
      "epoch": 0.01108401684770561,
      "grad_norm": 0.40484485030174255,
      "learning_rate": 2.4630541871921184e-05,
      "loss": 2.2032,
      "step": 50
    },
    {
      "epoch": 0.016626025271558414,
      "grad_norm": 0.06850667297840118,
      "learning_rate": 3.694581280788178e-05,
      "loss": 1.2931,
      "step": 75
    },
    {
      "epoch": 0.02216803369541122,
      "grad_norm": 0.4395073354244232,
      "learning_rate": 4.926108374384237e-05,
      "loss": 1.5698,
      "step": 100
    },
    {
      "epoch": 0.027710042119264023,
      "grad_norm": 0.077068030834198,
      "learning_rate": 6.157635467980296e-05,
      "loss": 1.0537,
      "step": 125
    },
    {
      "epoch": 0.03325205054311683,
      "grad_norm": 0.3282291293144226,
      "learning_rate": 7.389162561576355e-05,
      "loss": 0.9749,
      "step": 150
    },
    {
      "epoch": 0.03879405896696963,
      "grad_norm": 0.0593000203371048,
      "learning_rate": 8.620689655172413e-05,
      "loss": 0.9349,
      "step": 175
    },
    {
      "epoch": 0.04433606739082244,
      "grad_norm": 0.25612473487854004,
      "learning_rate": 9.852216748768474e-05,
      "loss": 0.8974,
      "step": 200
    },
    {
      "epoch": 0.04987807581467524,
      "grad_norm": 0.0757347121834755,
      "learning_rate": 0.00011083743842364534,
      "loss": 0.9081,
      "step": 225
    },
    {
      "epoch": 0.055420084238528046,
      "grad_norm": 0.14145499467849731,
      "learning_rate": 0.00012315270935960593,
      "loss": 0.8607,
      "step": 250
    },
    {
      "epoch": 0.06096209266238085,
      "grad_norm": 0.07710155844688416,
      "learning_rate": 0.00013546798029556652,
      "loss": 0.8973,
      "step": 275
    },
    {
      "epoch": 0.06650410108623366,
      "grad_norm": 0.14791467785835266,
      "learning_rate": 0.0001477832512315271,
      "loss": 0.7924,
      "step": 300
    },
    {
      "epoch": 0.07204610951008646,
      "grad_norm": 0.07742594182491302,
      "learning_rate": 0.00016009852216748767,
      "loss": 0.8698,
      "step": 325
    },
    {
      "epoch": 0.07758811793393926,
      "grad_norm": 0.14303487539291382,
      "learning_rate": 0.00017241379310344826,
      "loss": 0.786,
      "step": 350
    },
    {
      "epoch": 0.08313012635779206,
      "grad_norm": 0.0865108072757721,
      "learning_rate": 0.00018472906403940888,
      "loss": 0.8606,
      "step": 375
    },
    {
      "epoch": 0.08867213478164487,
      "grad_norm": 0.7533164024353027,
      "learning_rate": 0.00019704433497536947,
      "loss": 0.807,
      "step": 400
    },
    {
      "epoch": 0.09421414320549767,
      "grad_norm": 0.08325570821762085,
      "learning_rate": 0.00019999896617927833,
      "loss": 0.8635,
      "step": 425
    },
    {
      "epoch": 0.09975615162935048,
      "grad_norm": 0.1043543666601181,
      "learning_rate": 0.0001999944557842899,
      "loss": 0.7825,
      "step": 450
    },
    {
      "epoch": 0.10529816005320328,
      "grad_norm": 0.07949995994567871,
      "learning_rate": 0.0001999863658806385,
      "loss": 0.8379,
      "step": 475
    },
    {
      "epoch": 0.11084016847705609,
      "grad_norm": 0.12020070850849152,
      "learning_rate": 0.00019997469675791905,
      "loss": 0.768,
      "step": 500
    },
    {
      "epoch": 0.11638217690090889,
      "grad_norm": 0.0803595781326294,
      "learning_rate": 0.00019995944883385196,
      "loss": 0.8487,
      "step": 525
    },
    {
      "epoch": 0.1219241853247617,
      "grad_norm": 0.11509452760219574,
      "learning_rate": 0.0001999406226542682,
      "loss": 0.7787,
      "step": 550
    },
    {
      "epoch": 0.1274661937486145,
      "grad_norm": 0.07928384840488434,
      "learning_rate": 0.00019991821889308987,
      "loss": 0.8357,
      "step": 575
    },
    {
      "epoch": 0.1330082021724673,
      "grad_norm": 0.09423446655273438,
      "learning_rate": 0.00019989223835230606,
      "loss": 0.7564,
      "step": 600
    },
    {
      "epoch": 0.1385502105963201,
      "grad_norm": 0.0835939422249794,
      "learning_rate": 0.000199862681961944,
      "loss": 0.8568,
      "step": 625
    },
    {
      "epoch": 0.1440922190201729,
      "grad_norm": 0.09292898327112198,
      "learning_rate": 0.0001998295507800359,
      "loss": 0.7612,
      "step": 650
    },
    {
      "epoch": 0.1496342274440257,
      "grad_norm": 0.07704215496778488,
      "learning_rate": 0.00019979284599258107,
      "loss": 0.8263,
      "step": 675
    },
    {
      "epoch": 0.15517623586787851,
      "grad_norm": 0.10980474948883057,
      "learning_rate": 0.0001997525689135034,
      "loss": 0.7677,
      "step": 700
    },
    {
      "epoch": 0.16071824429173132,
      "grad_norm": 0.08016064018011093,
      "learning_rate": 0.0001997087209846043,
      "loss": 0.8344,
      "step": 725
    },
    {
      "epoch": 0.16626025271558412,
      "grad_norm": 0.0950881615281105,
      "learning_rate": 0.0001996613037755113,
      "loss": 0.769,
      "step": 750
    },
    {
      "epoch": 0.17180226113943692,
      "grad_norm": 0.07932984828948975,
      "learning_rate": 0.00019961031898362152,
      "loss": 0.8156,
      "step": 775
    },
    {
      "epoch": 0.17734426956328975,
      "grad_norm": 0.09336528927087784,
      "learning_rate": 0.00019955576843404128,
      "loss": 0.7767,
      "step": 800
    },
    {
      "epoch": 0.18288627798714255,
      "grad_norm": 0.08560346812009811,
      "learning_rate": 0.00019949765407952042,
      "loss": 0.8228,
      "step": 825
    },
    {
      "epoch": 0.18842828641099535,
      "grad_norm": 0.08475169539451599,
      "learning_rate": 0.00019943597800038267,
      "loss": 0.7669,
      "step": 850
    },
    {
      "epoch": 0.19397029483484815,
      "grad_norm": 0.09038034081459045,
      "learning_rate": 0.00019937074240445105,
      "loss": 0.8182,
      "step": 875
    },
    {
      "epoch": 0.19951230325870095,
      "grad_norm": 0.09195873886346817,
      "learning_rate": 0.0001993019496269688,
      "loss": 0.7598,
      "step": 900
    },
    {
      "epoch": 0.20505431168255375,
      "grad_norm": 0.08655796200037003,
      "learning_rate": 0.0001992296021305159,
      "loss": 0.8167,
      "step": 925
    },
    {
      "epoch": 0.21059632010640655,
      "grad_norm": 0.08353498578071594,
      "learning_rate": 0.00019915370250492084,
      "loss": 0.7486,
      "step": 950
    },
    {
      "epoch": 0.21613832853025935,
      "grad_norm": 0.09225723147392273,
      "learning_rate": 0.0001990742534671679,
      "loss": 0.8138,
      "step": 975
    },
    {
      "epoch": 0.22168033695411218,
      "grad_norm": 0.12104763090610504,
      "learning_rate": 0.00019899125786129997,
      "loss": 0.7153,
      "step": 1000
    },
    {
      "epoch": 0.22722234537796498,
      "grad_norm": 0.0815986767411232,
      "learning_rate": 0.00019890471865831669,
      "loss": 0.7983,
      "step": 1025
    },
    {
      "epoch": 0.23276435380181779,
      "grad_norm": 0.08845670521259308,
      "learning_rate": 0.00019881463895606805,
      "loss": 0.7187,
      "step": 1050
    },
    {
      "epoch": 0.2383063622256706,
      "grad_norm": 0.0821809321641922,
      "learning_rate": 0.00019872102197914359,
      "loss": 0.804,
      "step": 1075
    },
    {
      "epoch": 0.2438483706495234,
      "grad_norm": 0.08711609989404678,
      "learning_rate": 0.00019862387107875688,
      "loss": 0.7795,
      "step": 1100
    },
    {
      "epoch": 0.2493903790733762,
      "grad_norm": 0.08517508953809738,
      "learning_rate": 0.00019852318973262567,
      "loss": 0.7937,
      "step": 1125
    },
    {
      "epoch": 0.254932387497229,
      "grad_norm": 0.10830071568489075,
      "learning_rate": 0.00019841898154484726,
      "loss": 0.7458,
      "step": 1150
    },
    {
      "epoch": 0.2604743959210818,
      "grad_norm": 0.08541836589574814,
      "learning_rate": 0.0001983112502457696,
      "loss": 0.8131,
      "step": 1175
    },
    {
      "epoch": 0.2660164043449346,
      "grad_norm": 0.08794037252664566,
      "learning_rate": 0.00019819999969185762,
      "loss": 0.7577,
      "step": 1200
    },
    {
      "epoch": 0.2715584127687874,
      "grad_norm": 0.08078176528215408,
      "learning_rate": 0.00019808523386555542,
      "loss": 0.812,
      "step": 1225
    },
    {
      "epoch": 0.2771004211926402,
      "grad_norm": 0.09263130277395248,
      "learning_rate": 0.0001979669568751434,
      "loss": 0.7582,
      "step": 1250
    },
    {
      "epoch": 0.282642429616493,
      "grad_norm": 0.08198932558298111,
      "learning_rate": 0.00019784517295459147,
      "loss": 0.7958,
      "step": 1275
    },
    {
      "epoch": 0.2881844380403458,
      "grad_norm": 0.07858102023601532,
      "learning_rate": 0.00019771988646340725,
      "loss": 0.7744,
      "step": 1300
    },
    {
      "epoch": 0.2937264464641986,
      "grad_norm": 0.0851408839225769,
      "learning_rate": 0.00019759110188648026,
      "loss": 0.7913,
      "step": 1325
    },
    {
      "epoch": 0.2992684548880514,
      "grad_norm": 0.09252189099788666,
      "learning_rate": 0.00019745882383392116,
      "loss": 0.7675,
      "step": 1350
    },
    {
      "epoch": 0.30481046331190426,
      "grad_norm": 0.08306555449962616,
      "learning_rate": 0.0001973230570408968,
      "loss": 0.8059,
      "step": 1375
    },
    {
      "epoch": 0.31035247173575703,
      "grad_norm": 0.0797729641199112,
      "learning_rate": 0.0001971838063674608,
      "loss": 0.7424,
      "step": 1400
    },
    {
      "epoch": 0.31589448015960986,
      "grad_norm": 0.08266165107488632,
      "learning_rate": 0.0001970410767983794,
      "loss": 0.7847,
      "step": 1425
    },
    {
      "epoch": 0.32143648858346263,
      "grad_norm": 0.09364205598831177,
      "learning_rate": 0.00019689487344295322,
      "loss": 0.6924,
      "step": 1450
    },
    {
      "epoch": 0.32697849700731546,
      "grad_norm": 0.08461842685937881,
      "learning_rate": 0.00019674520153483414,
      "loss": 0.8007,
      "step": 1475
    },
    {
      "epoch": 0.33252050543116823,
      "grad_norm": 0.0840207040309906,
      "learning_rate": 0.00019659206643183813,
      "loss": 0.7139,
      "step": 1500
    },
    {
      "epoch": 0.33806251385502106,
      "grad_norm": 0.08344192802906036,
      "learning_rate": 0.00019643547361575343,
      "loss": 0.7982,
      "step": 1525
    },
    {
      "epoch": 0.34360452227887384,
      "grad_norm": 0.07934779673814774,
      "learning_rate": 0.0001962754286921442,
      "loss": 0.7164,
      "step": 1550
    },
    {
      "epoch": 0.34914653070272667,
      "grad_norm": 0.08716201782226562,
      "learning_rate": 0.00019611193739015,
      "loss": 0.7846,
      "step": 1575
    },
    {
      "epoch": 0.3546885391265795,
      "grad_norm": 0.08384064584970474,
      "learning_rate": 0.0001959450055622806,
      "loss": 0.7416,
      "step": 1600
    },
    {
      "epoch": 0.36023054755043227,
      "grad_norm": 0.08661937713623047,
      "learning_rate": 0.0001957746391842066,
      "loss": 0.8075,
      "step": 1625
    },
    {
      "epoch": 0.3657725559742851,
      "grad_norm": 0.09327207505702972,
      "learning_rate": 0.00019560084435454536,
      "loss": 0.7596,
      "step": 1650
    },
    {
      "epoch": 0.37131456439813787,
      "grad_norm": 0.08391096442937851,
      "learning_rate": 0.00019542362729464273,
      "loss": 0.7794,
      "step": 1675
    },
    {
      "epoch": 0.3768565728219907,
      "grad_norm": 0.07694080471992493,
      "learning_rate": 0.00019524299434835052,
      "loss": 0.7424,
      "step": 1700
    },
    {
      "epoch": 0.38239858124584347,
      "grad_norm": 0.08567491173744202,
      "learning_rate": 0.00019505895198179912,
      "loss": 0.7996,
      "step": 1725
    },
    {
      "epoch": 0.3879405896696963,
      "grad_norm": 0.08828684687614441,
      "learning_rate": 0.0001948715067831663,
      "loss": 0.7394,
      "step": 1750
    },
    {
      "epoch": 0.39348259809354913,
      "grad_norm": 0.08347714692354202,
      "learning_rate": 0.00019468066546244117,
      "loss": 0.7734,
      "step": 1775
    },
    {
      "epoch": 0.3990246065174019,
      "grad_norm": 0.07736373692750931,
      "learning_rate": 0.00019448643485118412,
      "loss": 0.7134,
      "step": 1800
    },
    {
      "epoch": 0.40456661494125473,
      "grad_norm": 0.0840897262096405,
      "learning_rate": 0.00019428882190228216,
      "loss": 0.787,
      "step": 1825
    },
    {
      "epoch": 0.4101086233651075,
      "grad_norm": 0.08665871620178223,
      "learning_rate": 0.0001940878336897001,
      "loss": 0.7151,
      "step": 1850
    },
    {
      "epoch": 0.41565063178896033,
      "grad_norm": 0.08358912914991379,
      "learning_rate": 0.0001938834774082274,
      "loss": 0.7982,
      "step": 1875
    },
    {
      "epoch": 0.4211926402128131,
      "grad_norm": 0.07928963005542755,
      "learning_rate": 0.0001936757603732203,
      "loss": 0.7195,
      "step": 1900
    },
    {
      "epoch": 0.42673464863666594,
      "grad_norm": 0.08886470645666122,
      "learning_rate": 0.00019346469002034042,
      "loss": 0.7762,
      "step": 1925
    },
    {
      "epoch": 0.4322766570605187,
      "grad_norm": 0.1071886494755745,
      "learning_rate": 0.00019325027390528822,
      "loss": 0.7453,
      "step": 1950
    },
    {
      "epoch": 0.43781866548437154,
      "grad_norm": 0.08474262803792953,
      "learning_rate": 0.00019303251970353261,
      "loss": 0.7839,
      "step": 1975
    },
    {
      "epoch": 0.44336067390822437,
      "grad_norm": 0.08803894370794296,
      "learning_rate": 0.0001928114352100363,
      "loss": 0.7171,
      "step": 2000
    },
    {
      "epoch": 0.44890268233207714,
      "grad_norm": 0.08429575711488724,
      "learning_rate": 0.00019258702833897665,
      "loss": 0.7781,
      "step": 2025
    },
    {
      "epoch": 0.45444469075592997,
      "grad_norm": 0.08510231226682663,
      "learning_rate": 0.00019235930712346248,
      "loss": 0.6949,
      "step": 2050
    },
    {
      "epoch": 0.45998669917978274,
      "grad_norm": 0.08167176693677902,
      "learning_rate": 0.00019212827971524634,
      "loss": 0.7722,
      "step": 2075
    },
    {
      "epoch": 0.46552870760363557,
      "grad_norm": 0.06542418897151947,
      "learning_rate": 0.00019189395438443278,
      "loss": 0.7203,
      "step": 2100
    },
    {
      "epoch": 0.47107071602748835,
      "grad_norm": 0.08293402194976807,
      "learning_rate": 0.00019165633951918247,
      "loss": 0.7735,
      "step": 2125
    },
    {
      "epoch": 0.4766127244513412,
      "grad_norm": 0.0809284895658493,
      "learning_rate": 0.00019141544362541162,
      "loss": 0.7412,
      "step": 2150
    },
    {
      "epoch": 0.48215473287519395,
      "grad_norm": 0.08212891221046448,
      "learning_rate": 0.00019117127532648773,
      "loss": 0.7629,
      "step": 2175
    },
    {
      "epoch": 0.4876967412990468,
      "grad_norm": 0.08602219074964523,
      "learning_rate": 0.0001909238433629208,
      "loss": 0.6935,
      "step": 2200
    },
    {
      "epoch": 0.4932387497228996,
      "grad_norm": 0.08529417216777802,
      "learning_rate": 0.0001906731565920505,
      "loss": 0.7915,
      "step": 2225
    },
    {
      "epoch": 0.4987807581467524,
      "grad_norm": 0.08774964511394501,
      "learning_rate": 0.00019041922398772897,
      "loss": 0.7359,
      "step": 2250
    },
    {
      "epoch": 0.5043227665706052,
      "grad_norm": 0.08649475872516632,
      "learning_rate": 0.00019016205463999984,
      "loss": 0.7696,
      "step": 2275
    },
    {
      "epoch": 0.509864774994458,
      "grad_norm": 0.0878506749868393,
      "learning_rate": 0.00018990165775477252,
      "loss": 0.7365,
      "step": 2300
    },
    {
      "epoch": 0.5154067834183108,
      "grad_norm": 0.09131711721420288,
      "learning_rate": 0.0001896380426534929,
      "loss": 0.7809,
      "step": 2325
    },
    {
      "epoch": 0.5209487918421636,
      "grad_norm": 0.07379825413227081,
      "learning_rate": 0.00018937121877280957,
      "loss": 0.7029,
      "step": 2350
    },
    {
      "epoch": 0.5264908002660164,
      "grad_norm": 0.08535836637020111,
      "learning_rate": 0.00018910119566423598,
      "loss": 0.7679,
      "step": 2375
    },
    {
      "epoch": 0.5320328086898692,
      "grad_norm": 0.06719771772623062,
      "learning_rate": 0.00018882798299380864,
      "loss": 0.7121,
      "step": 2400
    },
    {
      "epoch": 0.537574817113722,
      "grad_norm": 0.09019796550273895,
      "learning_rate": 0.00018855159054174093,
      "loss": 0.7754,
      "step": 2425
    },
    {
      "epoch": 0.5431168255375748,
      "grad_norm": 0.08144286274909973,
      "learning_rate": 0.0001882720282020732,
      "loss": 0.7255,
      "step": 2450
    },
    {
      "epoch": 0.5486588339614277,
      "grad_norm": 0.08412271738052368,
      "learning_rate": 0.0001879893059823185,
      "loss": 0.7722,
      "step": 2475
    },
    {
      "epoch": 0.5542008423852804,
      "grad_norm": 0.09016039222478867,
      "learning_rate": 0.0001877034340031042,
      "loss": 0.7275,
      "step": 2500
    },
    {
      "epoch": 0.5597428508091332,
      "grad_norm": 0.08850298821926117,
      "learning_rate": 0.00018741442249781,
      "loss": 0.7828,
      "step": 2525
    },
    {
      "epoch": 0.565284859232986,
      "grad_norm": 0.06989564746618271,
      "learning_rate": 0.00018712228181220128,
      "loss": 0.7111,
      "step": 2550
    },
    {
      "epoch": 0.5708268676568389,
      "grad_norm": 0.09214618802070618,
      "learning_rate": 0.00018682702240405906,
      "loss": 0.7752,
      "step": 2575
    },
    {
      "epoch": 0.5763688760806917,
      "grad_norm": 0.07766986638307571,
      "learning_rate": 0.0001865286548428054,
      "loss": 0.7108,
      "step": 2600
    },
    {
      "epoch": 0.5819108845045444,
      "grad_norm": 0.07919591665267944,
      "learning_rate": 0.00018622718980912514,
      "loss": 0.775,
      "step": 2625
    },
    {
      "epoch": 0.5874528929283972,
      "grad_norm": 0.07524783164262772,
      "learning_rate": 0.00018592263809458361,
      "loss": 0.6941,
      "step": 2650
    },
    {
      "epoch": 0.5929949013522501,
      "grad_norm": 0.08549198508262634,
      "learning_rate": 0.00018561501060124024,
      "loss": 0.7718,
      "step": 2675
    },
    {
      "epoch": 0.5985369097761029,
      "grad_norm": 0.08182788640260696,
      "learning_rate": 0.0001853043183412584,
      "loss": 0.7072,
      "step": 2700
    },
    {
      "epoch": 0.6040789181999556,
      "grad_norm": 0.084741972386837,
      "learning_rate": 0.00018499057243651096,
      "loss": 0.7478,
      "step": 2725
    },
    {
      "epoch": 0.6096209266238085,
      "grad_norm": 0.06824459880590439,
      "learning_rate": 0.0001846737841181825,
      "loss": 0.7238,
      "step": 2750
    },
    {
      "epoch": 0.6151629350476613,
      "grad_norm": 0.08315033465623856,
      "learning_rate": 0.00018435396472636704,
      "loss": 0.7597,
      "step": 2775
    },
    {
      "epoch": 0.6207049434715141,
      "grad_norm": 0.07116558402776718,
      "learning_rate": 0.00018403112570966216,
      "loss": 0.7096,
      "step": 2800
    },
    {
      "epoch": 0.6262469518953668,
      "grad_norm": 0.08500215411186218,
      "learning_rate": 0.00018370527862475916,
      "loss": 0.756,
      "step": 2825
    },
    {
      "epoch": 0.6317889603192197,
      "grad_norm": 0.07979004830121994,
      "learning_rate": 0.00018337643513602933,
      "loss": 0.6886,
      "step": 2850
    },
    {
      "epoch": 0.6373309687430725,
      "grad_norm": 0.08140358328819275,
      "learning_rate": 0.00018304460701510652,
      "loss": 0.7648,
      "step": 2875
    },
    {
      "epoch": 0.6428729771669253,
      "grad_norm": 0.07779423147439957,
      "learning_rate": 0.0001827098061404656,
      "loss": 0.7222,
      "step": 2900
    },
    {
      "epoch": 0.6484149855907781,
      "grad_norm": 0.08853591978549957,
      "learning_rate": 0.0001823720444969974,
      "loss": 0.7736,
      "step": 2925
    },
    {
      "epoch": 0.6539569940146309,
      "grad_norm": 0.07350102066993713,
      "learning_rate": 0.0001820313341755795,
      "loss": 0.7256,
      "step": 2950
    },
    {
      "epoch": 0.6594990024384837,
      "grad_norm": 0.08152145147323608,
      "learning_rate": 0.0001816876873726436,
      "loss": 0.7598,
      "step": 2975
    },
    {
      "epoch": 0.6650410108623365,
      "grad_norm": 0.08045897632837296,
      "learning_rate": 0.00018134111638973876,
      "loss": 0.7275,
      "step": 3000
    },
    {
      "epoch": 0.6705830192861894,
      "grad_norm": 0.08514434099197388,
      "learning_rate": 0.00018099163363309123,
      "loss": 0.7688,
      "step": 3025
    },
    {
      "epoch": 0.6761250277100421,
      "grad_norm": 0.060850344598293304,
      "learning_rate": 0.00018063925161316012,
      "loss": 0.7019,
      "step": 3050
    },
    {
      "epoch": 0.6816670361338949,
      "grad_norm": 0.08471492677927017,
      "learning_rate": 0.00018028398294418977,
      "loss": 0.7573,
      "step": 3075
    },
    {
      "epoch": 0.6872090445577477,
      "grad_norm": 0.0642291009426117,
      "learning_rate": 0.00017992584034375798,
      "loss": 0.7108,
      "step": 3100
    },
    {
      "epoch": 0.6927510529816006,
      "grad_norm": 0.09357668459415436,
      "learning_rate": 0.000179564836632321,
      "loss": 0.7478,
      "step": 3125
    },
    {
      "epoch": 0.6982930614054533,
      "grad_norm": 0.07198700308799744,
      "learning_rate": 0.00017920098473275445,
      "loss": 0.6973,
      "step": 3150
    },
    {
      "epoch": 0.7038350698293061,
      "grad_norm": 0.08420095592737198,
      "learning_rate": 0.00017883429766989064,
      "loss": 0.7487,
      "step": 3175
    },
    {
      "epoch": 0.709377078253159,
      "grad_norm": 0.06639819592237473,
      "learning_rate": 0.00017846478857005255,
      "loss": 0.6741,
      "step": 3200
    },
    {
      "epoch": 0.7149190866770118,
      "grad_norm": 0.08200914412736893,
      "learning_rate": 0.00017809247066058378,
      "loss": 0.7526,
      "step": 3225
    },
    {
      "epoch": 0.7204610951008645,
      "grad_norm": 0.07311141490936279,
      "learning_rate": 0.0001777173572693751,
      "loss": 0.677,
      "step": 3250
    },
    {
      "epoch": 0.7260031035247173,
      "grad_norm": 0.08722089231014252,
      "learning_rate": 0.00017733946182438726,
      "loss": 0.7585,
      "step": 3275
    },
    {
      "epoch": 0.7315451119485702,
      "grad_norm": 0.06589449942111969,
      "learning_rate": 0.00017695879785317048,
      "loss": 0.708,
      "step": 3300
    },
    {
      "epoch": 0.737087120372423,
      "grad_norm": 0.08262074738740921,
      "learning_rate": 0.0001765753789823801,
      "loss": 0.749,
      "step": 3325
    },
    {
      "epoch": 0.7426291287962757,
      "grad_norm": 0.07514823973178864,
      "learning_rate": 0.00017618921893728867,
      "loss": 0.6918,
      "step": 3350
    },
    {
      "epoch": 0.7481711372201286,
      "grad_norm": 0.08757175505161285,
      "learning_rate": 0.00017580033154129503,
      "loss": 0.7445,
      "step": 3375
    },
    {
      "epoch": 0.7537131456439814,
      "grad_norm": 0.0716458335518837,
      "learning_rate": 0.0001754087307154289,
      "loss": 0.7122,
      "step": 3400
    },
    {
      "epoch": 0.7592551540678342,
      "grad_norm": 0.08453212678432465,
      "learning_rate": 0.00017501443047785296,
      "loss": 0.7656,
      "step": 3425
    },
    {
      "epoch": 0.7647971624916869,
      "grad_norm": 0.06761575490236282,
      "learning_rate": 0.00017461744494336098,
      "loss": 0.6673,
      "step": 3450
    },
    {
      "epoch": 0.7703391709155398,
      "grad_norm": 0.08577297627925873,
      "learning_rate": 0.0001742177883228724,
      "loss": 0.7494,
      "step": 3475
    },
    {
      "epoch": 0.7758811793393926,
      "grad_norm": 0.05691730976104736,
      "learning_rate": 0.00017381547492292376,
      "loss": 0.6972,
      "step": 3500
    },
    {
      "epoch": 0.7814231877632454,
      "grad_norm": 0.09115194529294968,
      "learning_rate": 0.00017341051914515656,
      "loss": 0.7706,
      "step": 3525
    },
    {
      "epoch": 0.7869651961870983,
      "grad_norm": 0.07214304804801941,
      "learning_rate": 0.00017300293548580162,
      "loss": 0.6807,
      "step": 3550
    },
    {
      "epoch": 0.792507204610951,
      "grad_norm": 0.08448139578104019,
      "learning_rate": 0.00017259273853516028,
      "loss": 0.7661,
      "step": 3575
    },
    {
      "epoch": 0.7980492130348038,
      "grad_norm": 0.08282499015331268,
      "learning_rate": 0.00017217994297708195,
      "loss": 0.7391,
      "step": 3600
    },
    {
      "epoch": 0.8035912214586566,
      "grad_norm": 0.0804004818201065,
      "learning_rate": 0.00017176456358843875,
      "loss": 0.7402,
      "step": 3625
    },
    {
      "epoch": 0.8091332298825095,
      "grad_norm": 0.07265755534172058,
      "learning_rate": 0.00017134661523859622,
      "loss": 0.7019,
      "step": 3650
    },
    {
      "epoch": 0.8146752383063622,
      "grad_norm": 0.08803457766771317,
      "learning_rate": 0.00017092611288888125,
      "loss": 0.7572,
      "step": 3675
    },
    {
      "epoch": 0.820217246730215,
      "grad_norm": 0.0652441680431366,
      "learning_rate": 0.0001705030715920464,
      "loss": 0.706,
      "step": 3700
    },
    {
      "epoch": 0.8257592551540678,
      "grad_norm": 0.08185753971338272,
      "learning_rate": 0.0001700775064917312,
      "loss": 0.764,
      "step": 3725
    },
    {
      "epoch": 0.8313012635779207,
      "grad_norm": 0.0859500914812088,
      "learning_rate": 0.00016964943282191984,
      "loss": 0.6927,
      "step": 3750
    },
    {
      "epoch": 0.8368432720017734,
      "grad_norm": 0.09176376461982727,
      "learning_rate": 0.00016921886590639602,
      "loss": 0.7567,
      "step": 3775
    },
    {
      "epoch": 0.8423852804256262,
      "grad_norm": 0.0646485984325409,
      "learning_rate": 0.0001687858211581943,
      "loss": 0.6848,
      "step": 3800
    },
    {
      "epoch": 0.8479272888494791,
      "grad_norm": 0.08545655012130737,
      "learning_rate": 0.00016835031407904839,
      "loss": 0.7546,
      "step": 3825
    },
    {
      "epoch": 0.8534692972733319,
      "grad_norm": 0.06338818371295929,
      "learning_rate": 0.00016791236025883626,
      "loss": 0.6655,
      "step": 3850
    },
    {
      "epoch": 0.8590113056971846,
      "grad_norm": 0.08781229704618454,
      "learning_rate": 0.00016747197537502205,
      "loss": 0.7441,
      "step": 3875
    },
    {
      "epoch": 0.8645533141210374,
      "grad_norm": 0.06220358610153198,
      "learning_rate": 0.00016702917519209487,
      "loss": 0.6795,
      "step": 3900
    },
    {
      "epoch": 0.8700953225448903,
      "grad_norm": 0.08917712420225143,
      "learning_rate": 0.0001665839755610044,
      "loss": 0.7552,
      "step": 3925
    },
    {
      "epoch": 0.8756373309687431,
      "grad_norm": 0.06624036282300949,
      "learning_rate": 0.00016613639241859355,
      "loss": 0.6632,
      "step": 3950
    },
    {
      "epoch": 0.8811793393925959,
      "grad_norm": 0.08898719400167465,
      "learning_rate": 0.00016568644178702803,
      "loss": 0.757,
      "step": 3975
    },
    {
      "epoch": 0.8867213478164487,
      "grad_norm": 0.05095354840159416,
      "learning_rate": 0.0001652341397732227,
      "loss": 0.6992,
      "step": 4000
    },
    {
      "epoch": 0.8922633562403015,
      "grad_norm": 0.08842916786670685,
      "learning_rate": 0.0001647795025682649,
      "loss": 0.7504,
      "step": 4025
    },
    {
      "epoch": 0.8978053646641543,
      "grad_norm": 0.0758206844329834,
      "learning_rate": 0.00016432254644683516,
      "loss": 0.7081,
      "step": 4050
    },
    {
      "epoch": 0.903347373088007,
      "grad_norm": 0.0940496176481247,
      "learning_rate": 0.0001638632877666243,
      "loss": 0.746,
      "step": 4075
    },
    {
      "epoch": 0.9088893815118599,
      "grad_norm": 0.06626766920089722,
      "learning_rate": 0.00016340174296774804,
      "loss": 0.6647,
      "step": 4100
    },
    {
      "epoch": 0.9144313899357127,
      "grad_norm": 0.08919317275285721,
      "learning_rate": 0.00016293792857215844,
      "loss": 0.7516,
      "step": 4125
    },
    {
      "epoch": 0.9199733983595655,
      "grad_norm": 0.06990760564804077,
      "learning_rate": 0.00016247186118305252,
      "loss": 0.7011,
      "step": 4150
    },
    {
      "epoch": 0.9255154067834183,
      "grad_norm": 0.0870794802904129,
      "learning_rate": 0.00016200355748427782,
      "loss": 0.7529,
      "step": 4175
    },
    {
      "epoch": 0.9310574152072711,
      "grad_norm": 0.06882854551076889,
      "learning_rate": 0.00016153303423973526,
      "loss": 0.7005,
      "step": 4200
    },
    {
      "epoch": 0.9365994236311239,
      "grad_norm": 0.084992416203022,
      "learning_rate": 0.0001610603082927789,
      "loss": 0.7519,
      "step": 4225
    },
    {
      "epoch": 0.9421414320549767,
      "grad_norm": 0.0638299211859703,
      "learning_rate": 0.00016058539656561323,
      "loss": 0.716,
      "step": 4250
    },
    {
      "epoch": 0.9476834404788296,
      "grad_norm": 0.08899606764316559,
      "learning_rate": 0.00016010831605868715,
      "loss": 0.7257,
      "step": 4275
    },
    {
      "epoch": 0.9532254489026823,
      "grad_norm": 0.06550378352403641,
      "learning_rate": 0.00015962908385008565,
      "loss": 0.7174,
      "step": 4300
    },
    {
      "epoch": 0.9587674573265351,
      "grad_norm": 0.09001540392637253,
      "learning_rate": 0.00015914771709491828,
      "loss": 0.7271,
      "step": 4325
    },
    {
      "epoch": 0.9643094657503879,
      "grad_norm": 0.06641615182161331,
      "learning_rate": 0.000158664233024705,
      "loss": 0.69,
      "step": 4350
    },
    {
      "epoch": 0.9698514741742408,
      "grad_norm": 0.08917039632797241,
      "learning_rate": 0.0001581786489467596,
      "loss": 0.7483,
      "step": 4375
    },
    {
      "epoch": 0.9753934825980936,
      "grad_norm": 0.05995697155594826,
      "learning_rate": 0.00015769098224356992,
      "loss": 0.7033,
      "step": 4400
    },
    {
      "epoch": 0.9809354910219463,
      "grad_norm": 0.08998765051364899,
      "learning_rate": 0.00015720125037217572,
      "loss": 0.7462,
      "step": 4425
    },
    {
      "epoch": 0.9864774994457992,
      "grad_norm": 0.05868702754378319,
      "learning_rate": 0.00015670947086354376,
      "loss": 0.6654,
      "step": 4450
    },
    {
      "epoch": 0.992019507869652,
      "grad_norm": 0.0880926102399826,
      "learning_rate": 0.00015621566132194005,
      "loss": 0.752,
      "step": 4475
    },
    {
      "epoch": 0.9975615162935048,
      "grad_norm": 0.08538970351219177,
      "learning_rate": 0.00015571983942430005,
      "loss": 0.7338,
      "step": 4500
    },
    {
      "epoch": 1.0031035247173576,
      "grad_norm": 0.0827050730586052,
      "learning_rate": 0.0001552220229195956,
      "loss": 0.7174,
      "step": 4525
    },
    {
      "epoch": 1.0086455331412103,
      "grad_norm": 0.10867294669151306,
      "learning_rate": 0.00015472222962819955,
      "loss": 0.7637,
      "step": 4550
    },
    {
      "epoch": 1.0141875415650632,
      "grad_norm": 0.08738269656896591,
      "learning_rate": 0.00015422047744124802,
      "loss": 0.6247,
      "step": 4575
    },
    {
      "epoch": 1.019729549988916,
      "grad_norm": 0.12865987420082092,
      "learning_rate": 0.0001537167843199998,
      "loss": 0.7424,
      "step": 4600
    },
    {
      "epoch": 1.0252715584127687,
      "grad_norm": 0.08619695156812668,
      "learning_rate": 0.00015321116829519345,
      "loss": 0.6461,
      "step": 4625
    },
    {
      "epoch": 1.0308135668366216,
      "grad_norm": 0.11726492643356323,
      "learning_rate": 0.0001527036474664019,
      "loss": 0.7433,
      "step": 4650
    },
    {
      "epoch": 1.0363555752604743,
      "grad_norm": 0.08198727667331696,
      "learning_rate": 0.0001521942400013844,
      "loss": 0.6086,
      "step": 4675
    },
    {
      "epoch": 1.0418975836843272,
      "grad_norm": 0.11951526254415512,
      "learning_rate": 0.00015168296413543635,
      "loss": 0.7521,
      "step": 4700
    },
    {
      "epoch": 1.04743959210818,
      "grad_norm": 0.08714735507965088,
      "learning_rate": 0.0001511698381707363,
      "loss": 0.631,
      "step": 4725
    },
    {
      "epoch": 1.0529816005320327,
      "grad_norm": 0.13869455456733704,
      "learning_rate": 0.00015065488047569107,
      "loss": 0.7524,
      "step": 4750
    },
    {
      "epoch": 1.0585236089558856,
      "grad_norm": 0.08524268865585327,
      "learning_rate": 0.00015013810948427794,
      "loss": 0.6617,
      "step": 4775
    },
    {
      "epoch": 1.0640656173797385,
      "grad_norm": 0.11017199605703354,
      "learning_rate": 0.00014961954369538494,
      "loss": 0.7598,
      "step": 4800
    },
    {
      "epoch": 1.0696076258035911,
      "grad_norm": 0.0834374874830246,
      "learning_rate": 0.00014909920167214858,
      "loss": 0.627,
      "step": 4825
    },
    {
      "epoch": 1.075149634227444,
      "grad_norm": 0.1357167363166809,
      "learning_rate": 0.0001485771020412894,
      "loss": 0.7466,
      "step": 4850
    },
    {
      "epoch": 1.080691642651297,
      "grad_norm": 0.08910629153251648,
      "learning_rate": 0.00014805326349244503,
      "loss": 0.6238,
      "step": 4875
    },
    {
      "epoch": 1.0862336510751496,
      "grad_norm": 0.10706546157598495,
      "learning_rate": 0.00014752770477750144,
      "loss": 0.7533,
      "step": 4900
    },
    {
      "epoch": 1.0917756594990025,
      "grad_norm": 0.09201759845018387,
      "learning_rate": 0.00014700044470992136,
      "loss": 0.6521,
      "step": 4925
    },
    {
      "epoch": 1.0973176679228553,
      "grad_norm": 0.14048361778259277,
      "learning_rate": 0.00014647150216407106,
      "loss": 0.7412,
      "step": 4950
    },
    {
      "epoch": 1.102859676346708,
      "grad_norm": 0.08308299630880356,
      "learning_rate": 0.00014594089607454454,
      "loss": 0.6333,
      "step": 4975
    },
    {
      "epoch": 1.108401684770561,
      "grad_norm": 0.12057497352361679,
      "learning_rate": 0.00014540864543548582,
      "loss": 0.7538,
      "step": 5000
    }
  ],
  "logging_steps": 25,
  "max_steps": 13533,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.4028324505774285e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}