| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.0046453183320519995, | |
| "eval_steps": 2000, | |
| "global_step": 2000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 2.3226591660259998e-05, | |
| "grad_norm": 0.43654176592826843, | |
| "learning_rate": 0.0009999930320225019, | |
| "loss": 1.0703, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 4.6453183320519996e-05, | |
| "grad_norm": 0.29478368163108826, | |
| "learning_rate": 0.0009999852898252817, | |
| "loss": 1.1726, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 6.967977498078e-05, | |
| "grad_norm": 0.30410653352737427, | |
| "learning_rate": 0.0009999775476280618, | |
| "loss": 1.106, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 9.290636664103999e-05, | |
| "grad_norm": 0.3648824989795685, | |
| "learning_rate": 0.0009999698054308417, | |
| "loss": 1.1939, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.00011613295830129999, | |
| "grad_norm": 0.430895060300827, | |
| "learning_rate": 0.0009999620632336215, | |
| "loss": 1.2002, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.00013935954996156, | |
| "grad_norm": 0.3720713257789612, | |
| "learning_rate": 0.0009999543210364014, | |
| "loss": 1.0248, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.00016258614162182, | |
| "grad_norm": 0.354899138212204, | |
| "learning_rate": 0.0009999465788391815, | |
| "loss": 1.1271, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.00018581273328207998, | |
| "grad_norm": 0.35504820942878723, | |
| "learning_rate": 0.0009999388366419614, | |
| "loss": 1.1396, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.00020903932494234, | |
| "grad_norm": 0.4521724581718445, | |
| "learning_rate": 0.0009999310944447412, | |
| "loss": 1.1032, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.00023226591660259997, | |
| "grad_norm": 0.2742864787578583, | |
| "learning_rate": 0.000999923352247521, | |
| "loss": 1.1479, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.00025549250826286, | |
| "grad_norm": 0.41575589776039124, | |
| "learning_rate": 0.0009999156100503012, | |
| "loss": 1.1837, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.00027871909992312, | |
| "grad_norm": 0.27715566754341125, | |
| "learning_rate": 0.000999907867853081, | |
| "loss": 1.1597, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.00030194569158338, | |
| "grad_norm": 0.4537408649921417, | |
| "learning_rate": 0.000999900125655861, | |
| "loss": 1.1183, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.00032517228324364, | |
| "grad_norm": 0.2952319383621216, | |
| "learning_rate": 0.0009998923834586408, | |
| "loss": 1.1501, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.00034839887490389996, | |
| "grad_norm": 0.38295623660087585, | |
| "learning_rate": 0.0009998846412614208, | |
| "loss": 1.1381, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.00037162546656415997, | |
| "grad_norm": 0.3845287561416626, | |
| "learning_rate": 0.0009998768990642007, | |
| "loss": 1.0968, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.00039485205822442, | |
| "grad_norm": 0.25896570086479187, | |
| "learning_rate": 0.0009998691568669806, | |
| "loss": 1.0684, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.00041807864988468, | |
| "grad_norm": 0.2440153807401657, | |
| "learning_rate": 0.0009998614146697604, | |
| "loss": 1.1281, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.00044130524154494, | |
| "grad_norm": 0.3060740530490875, | |
| "learning_rate": 0.0009998536724725403, | |
| "loss": 1.1285, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.00046453183320519995, | |
| "grad_norm": 0.2703372538089752, | |
| "learning_rate": 0.0009998459302753204, | |
| "loss": 1.2085, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.00048775842486545995, | |
| "grad_norm": 0.3993639647960663, | |
| "learning_rate": 0.0009998381880781003, | |
| "loss": 1.2365, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.00051098501652572, | |
| "grad_norm": 0.41694164276123047, | |
| "learning_rate": 0.0009998304458808801, | |
| "loss": 1.161, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.0005342116081859799, | |
| "grad_norm": 0.2720717191696167, | |
| "learning_rate": 0.00099982270368366, | |
| "loss": 1.0553, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.00055743819984624, | |
| "grad_norm": 0.3238905072212219, | |
| "learning_rate": 0.0009998149614864399, | |
| "loss": 1.0825, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.0005806647915064999, | |
| "grad_norm": 0.39301878213882446, | |
| "learning_rate": 0.00099980721928922, | |
| "loss": 1.1421, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.00060389138316676, | |
| "grad_norm": 0.25302958488464355, | |
| "learning_rate": 0.0009997994770919998, | |
| "loss": 1.0533, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.00062711797482702, | |
| "grad_norm": 0.29384830594062805, | |
| "learning_rate": 0.0009997917348947797, | |
| "loss": 1.1011, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.00065034456648728, | |
| "grad_norm": 0.35217076539993286, | |
| "learning_rate": 0.0009997839926975595, | |
| "loss": 1.0289, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.00067357115814754, | |
| "grad_norm": 0.3412124216556549, | |
| "learning_rate": 0.0009997762505003394, | |
| "loss": 1.0974, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.0006967977498077999, | |
| "grad_norm": 0.2988780736923218, | |
| "learning_rate": 0.0009997685083031195, | |
| "loss": 1.1618, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.00072002434146806, | |
| "grad_norm": 0.43221724033355713, | |
| "learning_rate": 0.0009997607661058994, | |
| "loss": 1.1023, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.0007432509331283199, | |
| "grad_norm": 0.2644006013870239, | |
| "learning_rate": 0.0009997530239086792, | |
| "loss": 1.1548, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.00076647752478858, | |
| "grad_norm": 0.2950528860092163, | |
| "learning_rate": 0.000999745281711459, | |
| "loss": 1.1203, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.00078970411644884, | |
| "grad_norm": 0.20538517832756042, | |
| "learning_rate": 0.0009997375395142392, | |
| "loss": 1.0904, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.0008129307081090999, | |
| "grad_norm": 0.3531719446182251, | |
| "learning_rate": 0.000999729797317019, | |
| "loss": 1.0951, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.00083615729976936, | |
| "grad_norm": 0.3661258816719055, | |
| "learning_rate": 0.000999722055119799, | |
| "loss": 1.0885, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.0008593838914296199, | |
| "grad_norm": 0.4355231523513794, | |
| "learning_rate": 0.0009997143129225788, | |
| "loss": 1.1301, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.00088261048308988, | |
| "grad_norm": 0.3286990225315094, | |
| "learning_rate": 0.0009997065707253588, | |
| "loss": 1.0705, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.0009058370747501399, | |
| "grad_norm": 0.31140822172164917, | |
| "learning_rate": 0.0009996988285281387, | |
| "loss": 1.1873, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.0009290636664103999, | |
| "grad_norm": 0.2582302689552307, | |
| "learning_rate": 0.0009996910863309186, | |
| "loss": 1.1567, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.00095229025807066, | |
| "grad_norm": 0.36799147725105286, | |
| "learning_rate": 0.0009996833441336984, | |
| "loss": 1.2273, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.0009755168497309199, | |
| "grad_norm": 0.28618550300598145, | |
| "learning_rate": 0.0009996756019364785, | |
| "loss": 1.0851, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.00099874344139118, | |
| "grad_norm": 0.3006650507450104, | |
| "learning_rate": 0.0009996678597392584, | |
| "loss": 1.0341, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.00102197003305144, | |
| "grad_norm": 0.3651888072490692, | |
| "learning_rate": 0.0009996601175420383, | |
| "loss": 1.0212, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.0010451966247116999, | |
| "grad_norm": 0.32596904039382935, | |
| "learning_rate": 0.0009996523753448181, | |
| "loss": 1.0919, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.0010684232163719598, | |
| "grad_norm": 0.30658453702926636, | |
| "learning_rate": 0.000999644633147598, | |
| "loss": 1.0934, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.00109164980803222, | |
| "grad_norm": 0.49543142318725586, | |
| "learning_rate": 0.0009996368909503779, | |
| "loss": 1.1603, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.00111487639969248, | |
| "grad_norm": 0.24394716322422028, | |
| "learning_rate": 0.000999629148753158, | |
| "loss": 1.1455, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.00113810299135274, | |
| "grad_norm": 0.38373667001724243, | |
| "learning_rate": 0.0009996214065559378, | |
| "loss": 1.1498, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.0011613295830129999, | |
| "grad_norm": 0.5020566582679749, | |
| "learning_rate": 0.0009996136643587177, | |
| "loss": 1.076, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.0011845561746732598, | |
| "grad_norm": 0.3413016200065613, | |
| "learning_rate": 0.0009996059221614975, | |
| "loss": 1.1747, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.00120778276633352, | |
| "grad_norm": 0.3450530171394348, | |
| "learning_rate": 0.0009995981799642774, | |
| "loss": 1.1441, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.00123100935799378, | |
| "grad_norm": 0.3582036793231964, | |
| "learning_rate": 0.0009995904377670575, | |
| "loss": 1.1679, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.00125423594965404, | |
| "grad_norm": 0.30296868085861206, | |
| "learning_rate": 0.0009995826955698373, | |
| "loss": 1.0446, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.0012774625413142999, | |
| "grad_norm": 0.3772015869617462, | |
| "learning_rate": 0.0009995749533726172, | |
| "loss": 1.1239, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.00130068913297456, | |
| "grad_norm": 0.3441556692123413, | |
| "learning_rate": 0.000999567211175397, | |
| "loss": 1.112, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.00132391572463482, | |
| "grad_norm": 0.3211918771266937, | |
| "learning_rate": 0.0009995594689781772, | |
| "loss": 1.1344, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.00134714231629508, | |
| "grad_norm": 0.2808244824409485, | |
| "learning_rate": 0.000999551726780957, | |
| "loss": 1.1398, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.0013703689079553399, | |
| "grad_norm": 0.32571667432785034, | |
| "learning_rate": 0.000999543984583737, | |
| "loss": 1.1455, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.0013935954996155998, | |
| "grad_norm": 0.3554767668247223, | |
| "learning_rate": 0.0009995362423865168, | |
| "loss": 0.991, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.00141682209127586, | |
| "grad_norm": 0.253456711769104, | |
| "learning_rate": 0.0009995285001892968, | |
| "loss": 1.1686, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.00144004868293612, | |
| "grad_norm": 0.31393057107925415, | |
| "learning_rate": 0.0009995207579920767, | |
| "loss": 1.1034, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.00146327527459638, | |
| "grad_norm": 0.3797680735588074, | |
| "learning_rate": 0.0009995130157948566, | |
| "loss": 1.1224, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.0014865018662566399, | |
| "grad_norm": 0.3667146563529968, | |
| "learning_rate": 0.0009995052735976364, | |
| "loss": 1.1484, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.0015097284579168998, | |
| "grad_norm": 0.28348517417907715, | |
| "learning_rate": 0.0009994975314004165, | |
| "loss": 1.2004, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.00153295504957716, | |
| "grad_norm": 0.4176248610019684, | |
| "learning_rate": 0.0009994897892031964, | |
| "loss": 1.1415, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.00155618164123742, | |
| "grad_norm": 0.3170236647129059, | |
| "learning_rate": 0.0009994820470059763, | |
| "loss": 1.0853, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.00157940823289768, | |
| "grad_norm": 0.31185317039489746, | |
| "learning_rate": 0.0009994743048087561, | |
| "loss": 1.1353, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.0016026348245579399, | |
| "grad_norm": 0.33214762806892395, | |
| "learning_rate": 0.000999466562611536, | |
| "loss": 1.1504, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.0016258614162181998, | |
| "grad_norm": 0.3761586844921112, | |
| "learning_rate": 0.000999458820414316, | |
| "loss": 1.0549, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.00164908800787846, | |
| "grad_norm": 0.2806662619113922, | |
| "learning_rate": 0.000999451078217096, | |
| "loss": 1.1859, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.00167231459953872, | |
| "grad_norm": 0.39696329832077026, | |
| "learning_rate": 0.0009994433360198758, | |
| "loss": 1.1716, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.0016955411911989799, | |
| "grad_norm": 0.28009161353111267, | |
| "learning_rate": 0.0009994355938226557, | |
| "loss": 1.1932, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.0017187677828592398, | |
| "grad_norm": 0.2747149169445038, | |
| "learning_rate": 0.0009994278516254355, | |
| "loss": 1.0847, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.0017419943745194998, | |
| "grad_norm": 0.30023542046546936, | |
| "learning_rate": 0.0009994201094282154, | |
| "loss": 1.0696, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.00176522096617976, | |
| "grad_norm": 0.3453909158706665, | |
| "learning_rate": 0.0009994123672309955, | |
| "loss": 1.0967, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.00178844755784002, | |
| "grad_norm": 0.49272191524505615, | |
| "learning_rate": 0.0009994046250337753, | |
| "loss": 1.0573, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.0018116741495002799, | |
| "grad_norm": 0.2652382254600525, | |
| "learning_rate": 0.0009993968828365552, | |
| "loss": 1.1404, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.0018349007411605398, | |
| "grad_norm": 0.25675663352012634, | |
| "learning_rate": 0.000999389140639335, | |
| "loss": 1.0459, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.0018581273328207998, | |
| "grad_norm": 0.3685920834541321, | |
| "learning_rate": 0.0009993813984421152, | |
| "loss": 1.0117, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.00188135392448106, | |
| "grad_norm": 0.3216955363750458, | |
| "learning_rate": 0.000999373656244895, | |
| "loss": 1.1672, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.00190458051614132, | |
| "grad_norm": 0.4081834852695465, | |
| "learning_rate": 0.000999365914047675, | |
| "loss": 1.1555, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.0019278071078015799, | |
| "grad_norm": 0.3144775927066803, | |
| "learning_rate": 0.0009993581718504548, | |
| "loss": 1.2002, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.0019510336994618398, | |
| "grad_norm": 0.3642594814300537, | |
| "learning_rate": 0.0009993504296532348, | |
| "loss": 1.0547, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.0019742602911220998, | |
| "grad_norm": 0.3856127858161926, | |
| "learning_rate": 0.0009993426874560147, | |
| "loss": 1.2028, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.00199748688278236, | |
| "grad_norm": 0.41429170966148376, | |
| "learning_rate": 0.0009993349452587946, | |
| "loss": 1.0857, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.0020207134744426197, | |
| "grad_norm": 0.4278993606567383, | |
| "learning_rate": 0.0009993272030615744, | |
| "loss": 1.0574, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.00204394006610288, | |
| "grad_norm": 0.26868101954460144, | |
| "learning_rate": 0.0009993194608643545, | |
| "loss": 1.0538, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.00206716665776314, | |
| "grad_norm": 0.8726014494895935, | |
| "learning_rate": 0.0009993117186671344, | |
| "loss": 1.2263, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.0020903932494233998, | |
| "grad_norm": 0.39568719267845154, | |
| "learning_rate": 0.0009993039764699143, | |
| "loss": 1.1606, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.00211361984108366, | |
| "grad_norm": 0.3933831751346588, | |
| "learning_rate": 0.0009992962342726941, | |
| "loss": 1.1263, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.0021368464327439197, | |
| "grad_norm": 0.4326261579990387, | |
| "learning_rate": 0.000999288492075474, | |
| "loss": 1.0729, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.00216007302440418, | |
| "grad_norm": 0.3416406810283661, | |
| "learning_rate": 0.000999280749878254, | |
| "loss": 1.1538, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.00218329961606444, | |
| "grad_norm": 0.338379830121994, | |
| "learning_rate": 0.000999273007681034, | |
| "loss": 1.0347, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.0022065262077246997, | |
| "grad_norm": 0.34776318073272705, | |
| "learning_rate": 0.0009992652654838138, | |
| "loss": 1.1322, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.00222975279938496, | |
| "grad_norm": 0.23187178373336792, | |
| "learning_rate": 0.0009992575232865937, | |
| "loss": 1.0574, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.0022529793910452196, | |
| "grad_norm": 0.3015563189983368, | |
| "learning_rate": 0.0009992497810893735, | |
| "loss": 1.0911, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.00227620598270548, | |
| "grad_norm": 0.31411874294281006, | |
| "learning_rate": 0.0009992420388921534, | |
| "loss": 1.1008, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.00229943257436574, | |
| "grad_norm": 0.4988269805908203, | |
| "learning_rate": 0.0009992342966949335, | |
| "loss": 1.1292, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.0023226591660259997, | |
| "grad_norm": 0.3398004472255707, | |
| "learning_rate": 0.0009992265544977133, | |
| "loss": 1.1665, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.00234588575768626, | |
| "grad_norm": 0.32879185676574707, | |
| "learning_rate": 0.0009992188123004932, | |
| "loss": 1.1131, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.0023691123493465196, | |
| "grad_norm": 0.40583041310310364, | |
| "learning_rate": 0.000999211070103273, | |
| "loss": 1.0571, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.00239233894100678, | |
| "grad_norm": 0.3514922559261322, | |
| "learning_rate": 0.0009992033279060532, | |
| "loss": 1.1166, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.00241556553266704, | |
| "grad_norm": 1.3851335048675537, | |
| "learning_rate": 0.000999195585708833, | |
| "loss": 1.0532, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.0024387921243272997, | |
| "grad_norm": 0.5054768919944763, | |
| "learning_rate": 0.000999187843511613, | |
| "loss": 1.16, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.00246201871598756, | |
| "grad_norm": 0.37074124813079834, | |
| "learning_rate": 0.0009991801013143928, | |
| "loss": 1.2028, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.0024852453076478196, | |
| "grad_norm": 0.3337225615978241, | |
| "learning_rate": 0.0009991723591171728, | |
| "loss": 1.1109, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.00250847189930808, | |
| "grad_norm": 0.283372163772583, | |
| "learning_rate": 0.0009991646169199527, | |
| "loss": 1.063, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.00253169849096834, | |
| "grad_norm": 0.3113659620285034, | |
| "learning_rate": 0.0009991568747227326, | |
| "loss": 1.1027, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.0025549250826285997, | |
| "grad_norm": 0.43556565046310425, | |
| "learning_rate": 0.0009991491325255124, | |
| "loss": 1.1181, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.00257815167428886, | |
| "grad_norm": 0.3736826479434967, | |
| "learning_rate": 0.0009991413903282925, | |
| "loss": 1.1035, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.00260137826594912, | |
| "grad_norm": 0.3376559913158417, | |
| "learning_rate": 0.0009991336481310724, | |
| "loss": 1.0149, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.0026246048576093798, | |
| "grad_norm": 0.3545368015766144, | |
| "learning_rate": 0.0009991259059338523, | |
| "loss": 1.1472, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.00264783144926964, | |
| "grad_norm": 0.2400045394897461, | |
| "learning_rate": 0.0009991181637366321, | |
| "loss": 1.1423, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.0026710580409298997, | |
| "grad_norm": 0.37132346630096436, | |
| "learning_rate": 0.0009991104215394122, | |
| "loss": 1.1802, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.00269428463259016, | |
| "grad_norm": 0.26770955324172974, | |
| "learning_rate": 0.000999102679342192, | |
| "loss": 1.0859, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.00271751122425042, | |
| "grad_norm": 0.3567134439945221, | |
| "learning_rate": 0.000999094937144972, | |
| "loss": 1.1699, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.0027407378159106798, | |
| "grad_norm": 0.3370940387248993, | |
| "learning_rate": 0.0009990871949477518, | |
| "loss": 1.2679, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.00276396440757094, | |
| "grad_norm": 0.3533010184764862, | |
| "learning_rate": 0.0009990794527505317, | |
| "loss": 1.1444, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.0027871909992311997, | |
| "grad_norm": 0.227728933095932, | |
| "learning_rate": 0.0009990717105533115, | |
| "loss": 1.1105, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.00281041759089146, | |
| "grad_norm": 0.39945659041404724, | |
| "learning_rate": 0.0009990639683560916, | |
| "loss": 1.0122, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.00283364418255172, | |
| "grad_norm": 0.38961905241012573, | |
| "learning_rate": 0.0009990562261588715, | |
| "loss": 1.1677, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.0028568707742119798, | |
| "grad_norm": 0.35965076088905334, | |
| "learning_rate": 0.0009990484839616513, | |
| "loss": 1.2045, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.00288009736587224, | |
| "grad_norm": 0.3876691460609436, | |
| "learning_rate": 0.0009990407417644312, | |
| "loss": 1.1577, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.0029033239575324997, | |
| "grad_norm": 0.3059842586517334, | |
| "learning_rate": 0.000999032999567211, | |
| "loss": 1.1294, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.00292655054919276, | |
| "grad_norm": 0.31481969356536865, | |
| "learning_rate": 0.0009990252573699912, | |
| "loss": 1.1202, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.00294977714085302, | |
| "grad_norm": 0.3077446222305298, | |
| "learning_rate": 0.000999017515172771, | |
| "loss": 1.0893, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.0029730037325132797, | |
| "grad_norm": 0.30285683274269104, | |
| "learning_rate": 0.000999009772975551, | |
| "loss": 1.0844, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.00299623032417354, | |
| "grad_norm": 0.32145956158638, | |
| "learning_rate": 0.0009990020307783308, | |
| "loss": 1.1524, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.0030194569158337996, | |
| "grad_norm": 0.3908081352710724, | |
| "learning_rate": 0.0009989942885811108, | |
| "loss": 1.104, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.00304268350749406, | |
| "grad_norm": 0.32902881503105164, | |
| "learning_rate": 0.0009989865463838907, | |
| "loss": 1.1161, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.00306591009915432, | |
| "grad_norm": 0.3777260184288025, | |
| "learning_rate": 0.0009989788041866706, | |
| "loss": 1.1623, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.0030891366908145797, | |
| "grad_norm": 0.4204845130443573, | |
| "learning_rate": 0.0009989710619894504, | |
| "loss": 1.1284, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.00311236328247484, | |
| "grad_norm": 0.3189554810523987, | |
| "learning_rate": 0.0009989633197922305, | |
| "loss": 1.104, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.0031355898741350996, | |
| "grad_norm": 0.30896514654159546, | |
| "learning_rate": 0.0009989555775950104, | |
| "loss": 1.1221, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.00315881646579536, | |
| "grad_norm": 1.2486257553100586, | |
| "learning_rate": 0.0009989478353977903, | |
| "loss": 1.2578, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.00318204305745562, | |
| "grad_norm": 0.433830201625824, | |
| "learning_rate": 0.0009989400932005701, | |
| "loss": 1.101, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.0032052696491158797, | |
| "grad_norm": 0.3873724341392517, | |
| "learning_rate": 0.0009989323510033502, | |
| "loss": 1.1509, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.00322849624077614, | |
| "grad_norm": 0.238771453499794, | |
| "learning_rate": 0.00099892460880613, | |
| "loss": 1.171, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.0032517228324363996, | |
| "grad_norm": 0.3480624258518219, | |
| "learning_rate": 0.00099891686660891, | |
| "loss": 1.2122, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.00327494942409666, | |
| "grad_norm": 0.35760608315467834, | |
| "learning_rate": 0.0009989091244116898, | |
| "loss": 1.0479, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.00329817601575692, | |
| "grad_norm": 0.3133438527584076, | |
| "learning_rate": 0.0009989013822144697, | |
| "loss": 1.1176, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.0033214026074171797, | |
| "grad_norm": 0.2956129014492035, | |
| "learning_rate": 0.0009988936400172495, | |
| "loss": 1.12, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.00334462919907744, | |
| "grad_norm": 0.2697290778160095, | |
| "learning_rate": 0.0009988858978200296, | |
| "loss": 1.0247, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.0033678557907376996, | |
| "grad_norm": 0.34495481848716736, | |
| "learning_rate": 0.0009988781556228095, | |
| "loss": 1.0775, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.0033910823823979598, | |
| "grad_norm": 0.29800111055374146, | |
| "learning_rate": 0.0009988704134255893, | |
| "loss": 1.1489, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.00341430897405822, | |
| "grad_norm": 0.29650014638900757, | |
| "learning_rate": 0.0009988626712283692, | |
| "loss": 1.0565, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.0034375355657184797, | |
| "grad_norm": 0.35248780250549316, | |
| "learning_rate": 0.000998854929031149, | |
| "loss": 1.1121, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.00346076215737874, | |
| "grad_norm": 0.2716731131076813, | |
| "learning_rate": 0.0009988471868339292, | |
| "loss": 1.0923, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.0034839887490389996, | |
| "grad_norm": 0.4371800422668457, | |
| "learning_rate": 0.000998839444636709, | |
| "loss": 1.0155, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.0035072153406992598, | |
| "grad_norm": 0.2633199691772461, | |
| "learning_rate": 0.0009988317024394889, | |
| "loss": 1.1037, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.00353044193235952, | |
| "grad_norm": 0.2944166362285614, | |
| "learning_rate": 0.0009988239602422688, | |
| "loss": 1.0995, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.0035536685240197797, | |
| "grad_norm": 0.2786024212837219, | |
| "learning_rate": 0.0009988162180450488, | |
| "loss": 1.0641, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.00357689511568004, | |
| "grad_norm": 0.31116756796836853, | |
| "learning_rate": 0.0009988084758478287, | |
| "loss": 1.1015, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.0036001217073402996, | |
| "grad_norm": 0.31829699873924255, | |
| "learning_rate": 0.0009988007336506086, | |
| "loss": 1.0519, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.0036233482990005597, | |
| "grad_norm": 0.4150811433792114, | |
| "learning_rate": 0.0009987929914533884, | |
| "loss": 1.1509, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.00364657489066082, | |
| "grad_norm": 0.2690746784210205, | |
| "learning_rate": 0.0009987852492561685, | |
| "loss": 1.0517, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.0036698014823210797, | |
| "grad_norm": 0.3126815855503082, | |
| "learning_rate": 0.0009987775070589484, | |
| "loss": 1.1398, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.00369302807398134, | |
| "grad_norm": 0.34572452306747437, | |
| "learning_rate": 0.0009987697648617283, | |
| "loss": 1.0342, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.0037162546656415996, | |
| "grad_norm": 0.30171483755111694, | |
| "learning_rate": 0.0009987620226645081, | |
| "loss": 1.0517, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.0037394812573018597, | |
| "grad_norm": 0.2483634054660797, | |
| "learning_rate": 0.0009987542804672882, | |
| "loss": 1.1146, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.00376270784896212, | |
| "grad_norm": 0.41606566309928894, | |
| "learning_rate": 0.000998746538270068, | |
| "loss": 1.0997, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.0037859344406223796, | |
| "grad_norm": 0.3014843761920929, | |
| "learning_rate": 0.000998738796072848, | |
| "loss": 1.0975, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.00380916103228264, | |
| "grad_norm": 0.31974515318870544, | |
| "learning_rate": 0.0009987310538756278, | |
| "loss": 1.0963, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.0038323876239428996, | |
| "grad_norm": 0.3185972273349762, | |
| "learning_rate": 0.0009987233116784077, | |
| "loss": 1.1598, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.0038556142156031597, | |
| "grad_norm": 0.3430216908454895, | |
| "learning_rate": 0.0009987155694811877, | |
| "loss": 0.9476, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.00387884080726342, | |
| "grad_norm": 0.4456688165664673, | |
| "learning_rate": 0.0009987078272839676, | |
| "loss": 1.1319, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.0039020673989236796, | |
| "grad_norm": 0.4243941605091095, | |
| "learning_rate": 0.0009987000850867475, | |
| "loss": 1.1765, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.003925293990583939, | |
| "grad_norm": 0.22148986160755157, | |
| "learning_rate": 0.0009986923428895273, | |
| "loss": 1.1305, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.0039485205822441995, | |
| "grad_norm": 0.44649383425712585, | |
| "learning_rate": 0.0009986846006923072, | |
| "loss": 1.1282, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.00397174717390446, | |
| "grad_norm": 0.35965171456336975, | |
| "learning_rate": 0.000998676858495087, | |
| "loss": 1.0997, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.00399497376556472, | |
| "grad_norm": 0.4147953987121582, | |
| "learning_rate": 0.0009986691162978672, | |
| "loss": 1.0682, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.00401820035722498, | |
| "grad_norm": 0.47538864612579346, | |
| "learning_rate": 0.000998661374100647, | |
| "loss": 1.1625, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.004041426948885239, | |
| "grad_norm": 0.3181823194026947, | |
| "learning_rate": 0.0009986536319034269, | |
| "loss": 1.1683, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.0040646535405454995, | |
| "grad_norm": 0.32929712533950806, | |
| "learning_rate": 0.0009986458897062068, | |
| "loss": 1.1306, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.00408788013220576, | |
| "grad_norm": 0.34377196431159973, | |
| "learning_rate": 0.0009986381475089868, | |
| "loss": 1.1267, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.00411110672386602, | |
| "grad_norm": 0.3156042695045471, | |
| "learning_rate": 0.0009986304053117667, | |
| "loss": 1.0523, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.00413433331552628, | |
| "grad_norm": 0.35088011622428894, | |
| "learning_rate": 0.0009986226631145466, | |
| "loss": 1.0075, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.004157559907186539, | |
| "grad_norm": 0.3740438222885132, | |
| "learning_rate": 0.0009986149209173264, | |
| "loss": 1.1788, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.0041807864988467995, | |
| "grad_norm": 0.28393882513046265, | |
| "learning_rate": 0.0009986071787201065, | |
| "loss": 1.0374, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.00420401309050706, | |
| "grad_norm": 0.2916342318058014, | |
| "learning_rate": 0.0009985994365228864, | |
| "loss": 1.0783, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.00422723968216732, | |
| "grad_norm": 0.3398910462856293, | |
| "learning_rate": 0.0009985916943256663, | |
| "loss": 1.129, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.00425046627382758, | |
| "grad_norm": 0.3244156837463379, | |
| "learning_rate": 0.0009985839521284461, | |
| "loss": 1.1812, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.004273692865487839, | |
| "grad_norm": 0.5498040318489075, | |
| "learning_rate": 0.0009985762099312262, | |
| "loss": 1.1812, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.0042969194571480995, | |
| "grad_norm": 0.27574270963668823, | |
| "learning_rate": 0.000998568467734006, | |
| "loss": 1.1414, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.00432014604880836, | |
| "grad_norm": 0.3610564172267914, | |
| "learning_rate": 0.000998560725536786, | |
| "loss": 1.0763, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.00434337264046862, | |
| "grad_norm": 0.33828043937683105, | |
| "learning_rate": 0.0009985529833395658, | |
| "loss": 1.0169, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.00436659923212888, | |
| "grad_norm": 0.22078180313110352, | |
| "learning_rate": 0.0009985452411423457, | |
| "loss": 1.0513, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.004389825823789139, | |
| "grad_norm": 0.4355666935443878, | |
| "learning_rate": 0.0009985374989451257, | |
| "loss": 1.1245, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.0044130524154493995, | |
| "grad_norm": 0.3071712851524353, | |
| "learning_rate": 0.0009985297567479056, | |
| "loss": 1.1669, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.00443627900710966, | |
| "grad_norm": 0.3043074905872345, | |
| "learning_rate": 0.0009985220145506855, | |
| "loss": 1.1917, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.00445950559876992, | |
| "grad_norm": 0.33084383606910706, | |
| "learning_rate": 0.0009985142723534653, | |
| "loss": 1.0819, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.00448273219043018, | |
| "grad_norm": 0.32064658403396606, | |
| "learning_rate": 0.0009985065301562452, | |
| "loss": 1.1362, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.004505958782090439, | |
| "grad_norm": 0.34291279315948486, | |
| "learning_rate": 0.0009984987879590253, | |
| "loss": 1.0888, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.0045291853737506995, | |
| "grad_norm": 0.4338567852973938, | |
| "learning_rate": 0.0009984910457618052, | |
| "loss": 1.0783, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.00455241196541096, | |
| "grad_norm": 0.33047792315483093, | |
| "learning_rate": 0.000998483303564585, | |
| "loss": 1.0977, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.00457563855707122, | |
| "grad_norm": 0.33728134632110596, | |
| "learning_rate": 0.0009984755613673649, | |
| "loss": 1.137, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.00459886514873148, | |
| "grad_norm": 0.27301332354545593, | |
| "learning_rate": 0.0009984678191701448, | |
| "loss": 1.1413, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.004622091740391739, | |
| "grad_norm": 0.2804515063762665, | |
| "learning_rate": 0.0009984600769729248, | |
| "loss": 1.0865, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.0046453183320519995, | |
| "grad_norm": 0.33448469638824463, | |
| "learning_rate": 0.0009984523347757047, | |
| "loss": 1.1526, | |
| "step": 2000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1291623, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.5412177861554176e+16, | |
| "train_batch_size": 7, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |