{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0046453183320519995, "eval_steps": 2000, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 2.3226591660259998e-05, "grad_norm": 0.43654176592826843, "learning_rate": 0.0009999930320225019, "loss": 1.0703, "step": 10 }, { "epoch": 4.6453183320519996e-05, "grad_norm": 0.29478368163108826, "learning_rate": 0.0009999852898252817, "loss": 1.1726, "step": 20 }, { "epoch": 6.967977498078e-05, "grad_norm": 0.30410653352737427, "learning_rate": 0.0009999775476280618, "loss": 1.106, "step": 30 }, { "epoch": 9.290636664103999e-05, "grad_norm": 0.3648824989795685, "learning_rate": 0.0009999698054308417, "loss": 1.1939, "step": 40 }, { "epoch": 0.00011613295830129999, "grad_norm": 0.430895060300827, "learning_rate": 0.0009999620632336215, "loss": 1.2002, "step": 50 }, { "epoch": 0.00013935954996156, "grad_norm": 0.3720713257789612, "learning_rate": 0.0009999543210364014, "loss": 1.0248, "step": 60 }, { "epoch": 0.00016258614162182, "grad_norm": 0.354899138212204, "learning_rate": 0.0009999465788391815, "loss": 1.1271, "step": 70 }, { "epoch": 0.00018581273328207998, "grad_norm": 0.35504820942878723, "learning_rate": 0.0009999388366419614, "loss": 1.1396, "step": 80 }, { "epoch": 0.00020903932494234, "grad_norm": 0.4521724581718445, "learning_rate": 0.0009999310944447412, "loss": 1.1032, "step": 90 }, { "epoch": 0.00023226591660259997, "grad_norm": 0.2742864787578583, "learning_rate": 0.000999923352247521, "loss": 1.1479, "step": 100 }, { "epoch": 0.00025549250826286, "grad_norm": 0.41575589776039124, "learning_rate": 0.0009999156100503012, "loss": 1.1837, "step": 110 }, { "epoch": 0.00027871909992312, "grad_norm": 0.27715566754341125, "learning_rate": 0.000999907867853081, "loss": 1.1597, "step": 120 }, { "epoch": 0.00030194569158338, "grad_norm": 0.4537408649921417, "learning_rate": 0.000999900125655861, "loss": 1.1183, "step": 130 }, { "epoch": 0.00032517228324364, "grad_norm": 0.2952319383621216, "learning_rate": 0.0009998923834586408, "loss": 1.1501, "step": 140 }, { "epoch": 0.00034839887490389996, "grad_norm": 0.38295623660087585, "learning_rate": 0.0009998846412614208, "loss": 1.1381, "step": 150 }, { "epoch": 0.00037162546656415997, "grad_norm": 0.3845287561416626, "learning_rate": 0.0009998768990642007, "loss": 1.0968, "step": 160 }, { "epoch": 0.00039485205822442, "grad_norm": 0.25896570086479187, "learning_rate": 0.0009998691568669806, "loss": 1.0684, "step": 170 }, { "epoch": 0.00041807864988468, "grad_norm": 0.2440153807401657, "learning_rate": 0.0009998614146697604, "loss": 1.1281, "step": 180 }, { "epoch": 0.00044130524154494, "grad_norm": 0.3060740530490875, "learning_rate": 0.0009998536724725403, "loss": 1.1285, "step": 190 }, { "epoch": 0.00046453183320519995, "grad_norm": 0.2703372538089752, "learning_rate": 0.0009998459302753204, "loss": 1.2085, "step": 200 }, { "epoch": 0.00048775842486545995, "grad_norm": 0.3993639647960663, "learning_rate": 0.0009998381880781003, "loss": 1.2365, "step": 210 }, { "epoch": 0.00051098501652572, "grad_norm": 0.41694164276123047, "learning_rate": 0.0009998304458808801, "loss": 1.161, "step": 220 }, { "epoch": 0.0005342116081859799, "grad_norm": 0.2720717191696167, "learning_rate": 0.00099982270368366, "loss": 1.0553, "step": 230 }, { "epoch": 0.00055743819984624, "grad_norm": 0.3238905072212219, "learning_rate": 0.0009998149614864399, "loss": 1.0825, "step": 240 }, { "epoch": 0.0005806647915064999, "grad_norm": 0.39301878213882446, "learning_rate": 0.00099980721928922, "loss": 1.1421, "step": 250 }, { "epoch": 0.00060389138316676, "grad_norm": 0.25302958488464355, "learning_rate": 0.0009997994770919998, "loss": 1.0533, "step": 260 }, { "epoch": 0.00062711797482702, "grad_norm": 0.29384830594062805, "learning_rate": 0.0009997917348947797, "loss": 1.1011, "step": 270 }, { "epoch": 0.00065034456648728, "grad_norm": 0.35217076539993286, "learning_rate": 0.0009997839926975595, "loss": 1.0289, "step": 280 }, { "epoch": 0.00067357115814754, "grad_norm": 0.3412124216556549, "learning_rate": 0.0009997762505003394, "loss": 1.0974, "step": 290 }, { "epoch": 0.0006967977498077999, "grad_norm": 0.2988780736923218, "learning_rate": 0.0009997685083031195, "loss": 1.1618, "step": 300 }, { "epoch": 0.00072002434146806, "grad_norm": 0.43221724033355713, "learning_rate": 0.0009997607661058994, "loss": 1.1023, "step": 310 }, { "epoch": 0.0007432509331283199, "grad_norm": 0.2644006013870239, "learning_rate": 0.0009997530239086792, "loss": 1.1548, "step": 320 }, { "epoch": 0.00076647752478858, "grad_norm": 0.2950528860092163, "learning_rate": 0.000999745281711459, "loss": 1.1203, "step": 330 }, { "epoch": 0.00078970411644884, "grad_norm": 0.20538517832756042, "learning_rate": 0.0009997375395142392, "loss": 1.0904, "step": 340 }, { "epoch": 0.0008129307081090999, "grad_norm": 0.3531719446182251, "learning_rate": 0.000999729797317019, "loss": 1.0951, "step": 350 }, { "epoch": 0.00083615729976936, "grad_norm": 0.3661258816719055, "learning_rate": 0.000999722055119799, "loss": 1.0885, "step": 360 }, { "epoch": 0.0008593838914296199, "grad_norm": 0.4355231523513794, "learning_rate": 0.0009997143129225788, "loss": 1.1301, "step": 370 }, { "epoch": 0.00088261048308988, "grad_norm": 0.3286990225315094, "learning_rate": 0.0009997065707253588, "loss": 1.0705, "step": 380 }, { "epoch": 0.0009058370747501399, "grad_norm": 0.31140822172164917, "learning_rate": 0.0009996988285281387, "loss": 1.1873, "step": 390 }, { "epoch": 0.0009290636664103999, "grad_norm": 0.2582302689552307, "learning_rate": 0.0009996910863309186, "loss": 1.1567, "step": 400 }, { "epoch": 0.00095229025807066, "grad_norm": 0.36799147725105286, "learning_rate": 0.0009996833441336984, "loss": 1.2273, "step": 410 }, { "epoch": 0.0009755168497309199, "grad_norm": 0.28618550300598145, "learning_rate": 0.0009996756019364785, "loss": 1.0851, "step": 420 }, { "epoch": 0.00099874344139118, "grad_norm": 0.3006650507450104, "learning_rate": 0.0009996678597392584, "loss": 1.0341, "step": 430 }, { "epoch": 0.00102197003305144, "grad_norm": 0.3651888072490692, "learning_rate": 0.0009996601175420383, "loss": 1.0212, "step": 440 }, { "epoch": 0.0010451966247116999, "grad_norm": 0.32596904039382935, "learning_rate": 0.0009996523753448181, "loss": 1.0919, "step": 450 }, { "epoch": 0.0010684232163719598, "grad_norm": 0.30658453702926636, "learning_rate": 0.000999644633147598, "loss": 1.0934, "step": 460 }, { "epoch": 0.00109164980803222, "grad_norm": 0.49543142318725586, "learning_rate": 0.0009996368909503779, "loss": 1.1603, "step": 470 }, { "epoch": 0.00111487639969248, "grad_norm": 0.24394716322422028, "learning_rate": 0.000999629148753158, "loss": 1.1455, "step": 480 }, { "epoch": 0.00113810299135274, "grad_norm": 0.38373667001724243, "learning_rate": 0.0009996214065559378, "loss": 1.1498, "step": 490 }, { "epoch": 0.0011613295830129999, "grad_norm": 0.5020566582679749, "learning_rate": 0.0009996136643587177, "loss": 1.076, "step": 500 }, { "epoch": 0.0011845561746732598, "grad_norm": 0.3413016200065613, "learning_rate": 0.0009996059221614975, "loss": 1.1747, "step": 510 }, { "epoch": 0.00120778276633352, "grad_norm": 0.3450530171394348, "learning_rate": 0.0009995981799642774, "loss": 1.1441, "step": 520 }, { "epoch": 0.00123100935799378, "grad_norm": 0.3582036793231964, "learning_rate": 0.0009995904377670575, "loss": 1.1679, "step": 530 }, { "epoch": 0.00125423594965404, "grad_norm": 0.30296868085861206, "learning_rate": 0.0009995826955698373, "loss": 1.0446, "step": 540 }, { "epoch": 0.0012774625413142999, "grad_norm": 0.3772015869617462, "learning_rate": 0.0009995749533726172, "loss": 1.1239, "step": 550 }, { "epoch": 0.00130068913297456, "grad_norm": 0.3441556692123413, "learning_rate": 0.000999567211175397, "loss": 1.112, "step": 560 }, { "epoch": 0.00132391572463482, "grad_norm": 0.3211918771266937, "learning_rate": 0.0009995594689781772, "loss": 1.1344, "step": 570 }, { "epoch": 0.00134714231629508, "grad_norm": 0.2808244824409485, "learning_rate": 0.000999551726780957, "loss": 1.1398, "step": 580 }, { "epoch": 0.0013703689079553399, "grad_norm": 0.32571667432785034, "learning_rate": 0.000999543984583737, "loss": 1.1455, "step": 590 }, { "epoch": 0.0013935954996155998, "grad_norm": 0.3554767668247223, "learning_rate": 0.0009995362423865168, "loss": 0.991, "step": 600 }, { "epoch": 0.00141682209127586, "grad_norm": 0.253456711769104, "learning_rate": 0.0009995285001892968, "loss": 1.1686, "step": 610 }, { "epoch": 0.00144004868293612, "grad_norm": 0.31393057107925415, "learning_rate": 0.0009995207579920767, "loss": 1.1034, "step": 620 }, { "epoch": 0.00146327527459638, "grad_norm": 0.3797680735588074, "learning_rate": 0.0009995130157948566, "loss": 1.1224, "step": 630 }, { "epoch": 0.0014865018662566399, "grad_norm": 0.3667146563529968, "learning_rate": 0.0009995052735976364, "loss": 1.1484, "step": 640 }, { "epoch": 0.0015097284579168998, "grad_norm": 0.28348517417907715, "learning_rate": 0.0009994975314004165, "loss": 1.2004, "step": 650 }, { "epoch": 0.00153295504957716, "grad_norm": 0.4176248610019684, "learning_rate": 0.0009994897892031964, "loss": 1.1415, "step": 660 }, { "epoch": 0.00155618164123742, "grad_norm": 0.3170236647129059, "learning_rate": 0.0009994820470059763, "loss": 1.0853, "step": 670 }, { "epoch": 0.00157940823289768, "grad_norm": 0.31185317039489746, "learning_rate": 0.0009994743048087561, "loss": 1.1353, "step": 680 }, { "epoch": 0.0016026348245579399, "grad_norm": 0.33214762806892395, "learning_rate": 0.000999466562611536, "loss": 1.1504, "step": 690 }, { "epoch": 0.0016258614162181998, "grad_norm": 0.3761586844921112, "learning_rate": 0.000999458820414316, "loss": 1.0549, "step": 700 }, { "epoch": 0.00164908800787846, "grad_norm": 0.2806662619113922, "learning_rate": 0.000999451078217096, "loss": 1.1859, "step": 710 }, { "epoch": 0.00167231459953872, "grad_norm": 0.39696329832077026, "learning_rate": 0.0009994433360198758, "loss": 1.1716, "step": 720 }, { "epoch": 0.0016955411911989799, "grad_norm": 0.28009161353111267, "learning_rate": 0.0009994355938226557, "loss": 1.1932, "step": 730 }, { "epoch": 0.0017187677828592398, "grad_norm": 0.2747149169445038, "learning_rate": 0.0009994278516254355, "loss": 1.0847, "step": 740 }, { "epoch": 0.0017419943745194998, "grad_norm": 0.30023542046546936, "learning_rate": 0.0009994201094282154, "loss": 1.0696, "step": 750 }, { "epoch": 0.00176522096617976, "grad_norm": 0.3453909158706665, "learning_rate": 0.0009994123672309955, "loss": 1.0967, "step": 760 }, { "epoch": 0.00178844755784002, "grad_norm": 0.49272191524505615, "learning_rate": 0.0009994046250337753, "loss": 1.0573, "step": 770 }, { "epoch": 0.0018116741495002799, "grad_norm": 0.2652382254600525, "learning_rate": 0.0009993968828365552, "loss": 1.1404, "step": 780 }, { "epoch": 0.0018349007411605398, "grad_norm": 0.25675663352012634, "learning_rate": 0.000999389140639335, "loss": 1.0459, "step": 790 }, { "epoch": 0.0018581273328207998, "grad_norm": 0.3685920834541321, "learning_rate": 0.0009993813984421152, "loss": 1.0117, "step": 800 }, { "epoch": 0.00188135392448106, "grad_norm": 0.3216955363750458, "learning_rate": 0.000999373656244895, "loss": 1.1672, "step": 810 }, { "epoch": 0.00190458051614132, "grad_norm": 0.4081834852695465, "learning_rate": 0.000999365914047675, "loss": 1.1555, "step": 820 }, { "epoch": 0.0019278071078015799, "grad_norm": 0.3144775927066803, "learning_rate": 0.0009993581718504548, "loss": 1.2002, "step": 830 }, { "epoch": 0.0019510336994618398, "grad_norm": 0.3642594814300537, "learning_rate": 0.0009993504296532348, "loss": 1.0547, "step": 840 }, { "epoch": 0.0019742602911220998, "grad_norm": 0.3856127858161926, "learning_rate": 0.0009993426874560147, "loss": 1.2028, "step": 850 }, { "epoch": 0.00199748688278236, "grad_norm": 0.41429170966148376, "learning_rate": 0.0009993349452587946, "loss": 1.0857, "step": 860 }, { "epoch": 0.0020207134744426197, "grad_norm": 0.4278993606567383, "learning_rate": 0.0009993272030615744, "loss": 1.0574, "step": 870 }, { "epoch": 0.00204394006610288, "grad_norm": 0.26868101954460144, "learning_rate": 0.0009993194608643545, "loss": 1.0538, "step": 880 }, { "epoch": 0.00206716665776314, "grad_norm": 0.8726014494895935, "learning_rate": 0.0009993117186671344, "loss": 1.2263, "step": 890 }, { "epoch": 0.0020903932494233998, "grad_norm": 0.39568719267845154, "learning_rate": 0.0009993039764699143, "loss": 1.1606, "step": 900 }, { "epoch": 0.00211361984108366, "grad_norm": 0.3933831751346588, "learning_rate": 0.0009992962342726941, "loss": 1.1263, "step": 910 }, { "epoch": 0.0021368464327439197, "grad_norm": 0.4326261579990387, "learning_rate": 0.000999288492075474, "loss": 1.0729, "step": 920 }, { "epoch": 0.00216007302440418, "grad_norm": 0.3416406810283661, "learning_rate": 0.000999280749878254, "loss": 1.1538, "step": 930 }, { "epoch": 0.00218329961606444, "grad_norm": 0.338379830121994, "learning_rate": 0.000999273007681034, "loss": 1.0347, "step": 940 }, { "epoch": 0.0022065262077246997, "grad_norm": 0.34776318073272705, "learning_rate": 0.0009992652654838138, "loss": 1.1322, "step": 950 }, { "epoch": 0.00222975279938496, "grad_norm": 0.23187178373336792, "learning_rate": 0.0009992575232865937, "loss": 1.0574, "step": 960 }, { "epoch": 0.0022529793910452196, "grad_norm": 0.3015563189983368, "learning_rate": 0.0009992497810893735, "loss": 1.0911, "step": 970 }, { "epoch": 0.00227620598270548, "grad_norm": 0.31411874294281006, "learning_rate": 0.0009992420388921534, "loss": 1.1008, "step": 980 }, { "epoch": 0.00229943257436574, "grad_norm": 0.4988269805908203, "learning_rate": 0.0009992342966949335, "loss": 1.1292, "step": 990 }, { "epoch": 0.0023226591660259997, "grad_norm": 0.3398004472255707, "learning_rate": 0.0009992265544977133, "loss": 1.1665, "step": 1000 }, { "epoch": 0.00234588575768626, "grad_norm": 0.32879185676574707, "learning_rate": 0.0009992188123004932, "loss": 1.1131, "step": 1010 }, { "epoch": 0.0023691123493465196, "grad_norm": 0.40583041310310364, "learning_rate": 0.000999211070103273, "loss": 1.0571, "step": 1020 }, { "epoch": 0.00239233894100678, "grad_norm": 0.3514922559261322, "learning_rate": 0.0009992033279060532, "loss": 1.1166, "step": 1030 }, { "epoch": 0.00241556553266704, "grad_norm": 1.3851335048675537, "learning_rate": 0.000999195585708833, "loss": 1.0532, "step": 1040 }, { "epoch": 0.0024387921243272997, "grad_norm": 0.5054768919944763, "learning_rate": 0.000999187843511613, "loss": 1.16, "step": 1050 }, { "epoch": 0.00246201871598756, "grad_norm": 0.37074124813079834, "learning_rate": 0.0009991801013143928, "loss": 1.2028, "step": 1060 }, { "epoch": 0.0024852453076478196, "grad_norm": 0.3337225615978241, "learning_rate": 0.0009991723591171728, "loss": 1.1109, "step": 1070 }, { "epoch": 0.00250847189930808, "grad_norm": 0.283372163772583, "learning_rate": 0.0009991646169199527, "loss": 1.063, "step": 1080 }, { "epoch": 0.00253169849096834, "grad_norm": 0.3113659620285034, "learning_rate": 0.0009991568747227326, "loss": 1.1027, "step": 1090 }, { "epoch": 0.0025549250826285997, "grad_norm": 0.43556565046310425, "learning_rate": 0.0009991491325255124, "loss": 1.1181, "step": 1100 }, { "epoch": 0.00257815167428886, "grad_norm": 0.3736826479434967, "learning_rate": 0.0009991413903282925, "loss": 1.1035, "step": 1110 }, { "epoch": 0.00260137826594912, "grad_norm": 0.3376559913158417, "learning_rate": 0.0009991336481310724, "loss": 1.0149, "step": 1120 }, { "epoch": 0.0026246048576093798, "grad_norm": 0.3545368015766144, "learning_rate": 0.0009991259059338523, "loss": 1.1472, "step": 1130 }, { "epoch": 0.00264783144926964, "grad_norm": 0.2400045394897461, "learning_rate": 0.0009991181637366321, "loss": 1.1423, "step": 1140 }, { "epoch": 0.0026710580409298997, "grad_norm": 0.37132346630096436, "learning_rate": 0.0009991104215394122, "loss": 1.1802, "step": 1150 }, { "epoch": 0.00269428463259016, "grad_norm": 0.26770955324172974, "learning_rate": 0.000999102679342192, "loss": 1.0859, "step": 1160 }, { "epoch": 0.00271751122425042, "grad_norm": 0.3567134439945221, "learning_rate": 0.000999094937144972, "loss": 1.1699, "step": 1170 }, { "epoch": 0.0027407378159106798, "grad_norm": 0.3370940387248993, "learning_rate": 0.0009990871949477518, "loss": 1.2679, "step": 1180 }, { "epoch": 0.00276396440757094, "grad_norm": 0.3533010184764862, "learning_rate": 0.0009990794527505317, "loss": 1.1444, "step": 1190 }, { "epoch": 0.0027871909992311997, "grad_norm": 0.227728933095932, "learning_rate": 0.0009990717105533115, "loss": 1.1105, "step": 1200 }, { "epoch": 0.00281041759089146, "grad_norm": 0.39945659041404724, "learning_rate": 0.0009990639683560916, "loss": 1.0122, "step": 1210 }, { "epoch": 0.00283364418255172, "grad_norm": 0.38961905241012573, "learning_rate": 0.0009990562261588715, "loss": 1.1677, "step": 1220 }, { "epoch": 0.0028568707742119798, "grad_norm": 0.35965076088905334, "learning_rate": 0.0009990484839616513, "loss": 1.2045, "step": 1230 }, { "epoch": 0.00288009736587224, "grad_norm": 0.3876691460609436, "learning_rate": 0.0009990407417644312, "loss": 1.1577, "step": 1240 }, { "epoch": 0.0029033239575324997, "grad_norm": 0.3059842586517334, "learning_rate": 0.000999032999567211, "loss": 1.1294, "step": 1250 }, { "epoch": 0.00292655054919276, "grad_norm": 0.31481969356536865, "learning_rate": 0.0009990252573699912, "loss": 1.1202, "step": 1260 }, { "epoch": 0.00294977714085302, "grad_norm": 0.3077446222305298, "learning_rate": 0.000999017515172771, "loss": 1.0893, "step": 1270 }, { "epoch": 0.0029730037325132797, "grad_norm": 0.30285683274269104, "learning_rate": 0.000999009772975551, "loss": 1.0844, "step": 1280 }, { "epoch": 0.00299623032417354, "grad_norm": 0.32145956158638, "learning_rate": 0.0009990020307783308, "loss": 1.1524, "step": 1290 }, { "epoch": 0.0030194569158337996, "grad_norm": 0.3908081352710724, "learning_rate": 0.0009989942885811108, "loss": 1.104, "step": 1300 }, { "epoch": 0.00304268350749406, "grad_norm": 0.32902881503105164, "learning_rate": 0.0009989865463838907, "loss": 1.1161, "step": 1310 }, { "epoch": 0.00306591009915432, "grad_norm": 0.3777260184288025, "learning_rate": 0.0009989788041866706, "loss": 1.1623, "step": 1320 }, { "epoch": 0.0030891366908145797, "grad_norm": 0.4204845130443573, "learning_rate": 0.0009989710619894504, "loss": 1.1284, "step": 1330 }, { "epoch": 0.00311236328247484, "grad_norm": 0.3189554810523987, "learning_rate": 0.0009989633197922305, "loss": 1.104, "step": 1340 }, { "epoch": 0.0031355898741350996, "grad_norm": 0.30896514654159546, "learning_rate": 0.0009989555775950104, "loss": 1.1221, "step": 1350 }, { "epoch": 0.00315881646579536, "grad_norm": 1.2486257553100586, "learning_rate": 0.0009989478353977903, "loss": 1.2578, "step": 1360 }, { "epoch": 0.00318204305745562, "grad_norm": 0.433830201625824, "learning_rate": 0.0009989400932005701, "loss": 1.101, "step": 1370 }, { "epoch": 0.0032052696491158797, "grad_norm": 0.3873724341392517, "learning_rate": 0.0009989323510033502, "loss": 1.1509, "step": 1380 }, { "epoch": 0.00322849624077614, "grad_norm": 0.238771453499794, "learning_rate": 0.00099892460880613, "loss": 1.171, "step": 1390 }, { "epoch": 0.0032517228324363996, "grad_norm": 0.3480624258518219, "learning_rate": 0.00099891686660891, "loss": 1.2122, "step": 1400 }, { "epoch": 0.00327494942409666, "grad_norm": 0.35760608315467834, "learning_rate": 0.0009989091244116898, "loss": 1.0479, "step": 1410 }, { "epoch": 0.00329817601575692, "grad_norm": 0.3133438527584076, "learning_rate": 0.0009989013822144697, "loss": 1.1176, "step": 1420 }, { "epoch": 0.0033214026074171797, "grad_norm": 0.2956129014492035, "learning_rate": 0.0009988936400172495, "loss": 1.12, "step": 1430 }, { "epoch": 0.00334462919907744, "grad_norm": 0.2697290778160095, "learning_rate": 0.0009988858978200296, "loss": 1.0247, "step": 1440 }, { "epoch": 0.0033678557907376996, "grad_norm": 0.34495481848716736, "learning_rate": 0.0009988781556228095, "loss": 1.0775, "step": 1450 }, { "epoch": 0.0033910823823979598, "grad_norm": 0.29800111055374146, "learning_rate": 0.0009988704134255893, "loss": 1.1489, "step": 1460 }, { "epoch": 0.00341430897405822, "grad_norm": 0.29650014638900757, "learning_rate": 0.0009988626712283692, "loss": 1.0565, "step": 1470 }, { "epoch": 0.0034375355657184797, "grad_norm": 0.35248780250549316, "learning_rate": 0.000998854929031149, "loss": 1.1121, "step": 1480 }, { "epoch": 0.00346076215737874, "grad_norm": 0.2716731131076813, "learning_rate": 0.0009988471868339292, "loss": 1.0923, "step": 1490 }, { "epoch": 0.0034839887490389996, "grad_norm": 0.4371800422668457, "learning_rate": 0.000998839444636709, "loss": 1.0155, "step": 1500 }, { "epoch": 0.0035072153406992598, "grad_norm": 0.2633199691772461, "learning_rate": 0.0009988317024394889, "loss": 1.1037, "step": 1510 }, { "epoch": 0.00353044193235952, "grad_norm": 0.2944166362285614, "learning_rate": 0.0009988239602422688, "loss": 1.0995, "step": 1520 }, { "epoch": 0.0035536685240197797, "grad_norm": 0.2786024212837219, "learning_rate": 0.0009988162180450488, "loss": 1.0641, "step": 1530 }, { "epoch": 0.00357689511568004, "grad_norm": 0.31116756796836853, "learning_rate": 0.0009988084758478287, "loss": 1.1015, "step": 1540 }, { "epoch": 0.0036001217073402996, "grad_norm": 0.31829699873924255, "learning_rate": 0.0009988007336506086, "loss": 1.0519, "step": 1550 }, { "epoch": 0.0036233482990005597, "grad_norm": 0.4150811433792114, "learning_rate": 0.0009987929914533884, "loss": 1.1509, "step": 1560 }, { "epoch": 0.00364657489066082, "grad_norm": 0.2690746784210205, "learning_rate": 0.0009987852492561685, "loss": 1.0517, "step": 1570 }, { "epoch": 0.0036698014823210797, "grad_norm": 0.3126815855503082, "learning_rate": 0.0009987775070589484, "loss": 1.1398, "step": 1580 }, { "epoch": 0.00369302807398134, "grad_norm": 0.34572452306747437, "learning_rate": 0.0009987697648617283, "loss": 1.0342, "step": 1590 }, { "epoch": 0.0037162546656415996, "grad_norm": 0.30171483755111694, "learning_rate": 0.0009987620226645081, "loss": 1.0517, "step": 1600 }, { "epoch": 0.0037394812573018597, "grad_norm": 0.2483634054660797, "learning_rate": 0.0009987542804672882, "loss": 1.1146, "step": 1610 }, { "epoch": 0.00376270784896212, "grad_norm": 0.41606566309928894, "learning_rate": 0.000998746538270068, "loss": 1.0997, "step": 1620 }, { "epoch": 0.0037859344406223796, "grad_norm": 0.3014843761920929, "learning_rate": 0.000998738796072848, "loss": 1.0975, "step": 1630 }, { "epoch": 0.00380916103228264, "grad_norm": 0.31974515318870544, "learning_rate": 0.0009987310538756278, "loss": 1.0963, "step": 1640 }, { "epoch": 0.0038323876239428996, "grad_norm": 0.3185972273349762, "learning_rate": 0.0009987233116784077, "loss": 1.1598, "step": 1650 }, { "epoch": 0.0038556142156031597, "grad_norm": 0.3430216908454895, "learning_rate": 0.0009987155694811877, "loss": 0.9476, "step": 1660 }, { "epoch": 0.00387884080726342, "grad_norm": 0.4456688165664673, "learning_rate": 0.0009987078272839676, "loss": 1.1319, "step": 1670 }, { "epoch": 0.0039020673989236796, "grad_norm": 0.4243941605091095, "learning_rate": 0.0009987000850867475, "loss": 1.1765, "step": 1680 }, { "epoch": 0.003925293990583939, "grad_norm": 0.22148986160755157, "learning_rate": 0.0009986923428895273, "loss": 1.1305, "step": 1690 }, { "epoch": 0.0039485205822441995, "grad_norm": 0.44649383425712585, "learning_rate": 0.0009986846006923072, "loss": 1.1282, "step": 1700 }, { "epoch": 0.00397174717390446, "grad_norm": 0.35965171456336975, "learning_rate": 0.000998676858495087, "loss": 1.0997, "step": 1710 }, { "epoch": 0.00399497376556472, "grad_norm": 0.4147953987121582, "learning_rate": 0.0009986691162978672, "loss": 1.0682, "step": 1720 }, { "epoch": 0.00401820035722498, "grad_norm": 0.47538864612579346, "learning_rate": 0.000998661374100647, "loss": 1.1625, "step": 1730 }, { "epoch": 0.004041426948885239, "grad_norm": 0.3181823194026947, "learning_rate": 0.0009986536319034269, "loss": 1.1683, "step": 1740 }, { "epoch": 0.0040646535405454995, "grad_norm": 0.32929712533950806, "learning_rate": 0.0009986458897062068, "loss": 1.1306, "step": 1750 }, { "epoch": 0.00408788013220576, "grad_norm": 0.34377196431159973, "learning_rate": 0.0009986381475089868, "loss": 1.1267, "step": 1760 }, { "epoch": 0.00411110672386602, "grad_norm": 0.3156042695045471, "learning_rate": 0.0009986304053117667, "loss": 1.0523, "step": 1770 }, { "epoch": 0.00413433331552628, "grad_norm": 0.35088011622428894, "learning_rate": 0.0009986226631145466, "loss": 1.0075, "step": 1780 }, { "epoch": 0.004157559907186539, "grad_norm": 0.3740438222885132, "learning_rate": 0.0009986149209173264, "loss": 1.1788, "step": 1790 }, { "epoch": 0.0041807864988467995, "grad_norm": 0.28393882513046265, "learning_rate": 0.0009986071787201065, "loss": 1.0374, "step": 1800 }, { "epoch": 0.00420401309050706, "grad_norm": 0.2916342318058014, "learning_rate": 0.0009985994365228864, "loss": 1.0783, "step": 1810 }, { "epoch": 0.00422723968216732, "grad_norm": 0.3398910462856293, "learning_rate": 0.0009985916943256663, "loss": 1.129, "step": 1820 }, { "epoch": 0.00425046627382758, "grad_norm": 0.3244156837463379, "learning_rate": 0.0009985839521284461, "loss": 1.1812, "step": 1830 }, { "epoch": 0.004273692865487839, "grad_norm": 0.5498040318489075, "learning_rate": 0.0009985762099312262, "loss": 1.1812, "step": 1840 }, { "epoch": 0.0042969194571480995, "grad_norm": 0.27574270963668823, "learning_rate": 0.000998568467734006, "loss": 1.1414, "step": 1850 }, { "epoch": 0.00432014604880836, "grad_norm": 0.3610564172267914, "learning_rate": 0.000998560725536786, "loss": 1.0763, "step": 1860 }, { "epoch": 0.00434337264046862, "grad_norm": 0.33828043937683105, "learning_rate": 0.0009985529833395658, "loss": 1.0169, "step": 1870 }, { "epoch": 0.00436659923212888, "grad_norm": 0.22078180313110352, "learning_rate": 0.0009985452411423457, "loss": 1.0513, "step": 1880 }, { "epoch": 0.004389825823789139, "grad_norm": 0.4355666935443878, "learning_rate": 0.0009985374989451257, "loss": 1.1245, "step": 1890 }, { "epoch": 0.0044130524154493995, "grad_norm": 0.3071712851524353, "learning_rate": 0.0009985297567479056, "loss": 1.1669, "step": 1900 }, { "epoch": 0.00443627900710966, "grad_norm": 0.3043074905872345, "learning_rate": 0.0009985220145506855, "loss": 1.1917, "step": 1910 }, { "epoch": 0.00445950559876992, "grad_norm": 0.33084383606910706, "learning_rate": 0.0009985142723534653, "loss": 1.0819, "step": 1920 }, { "epoch": 0.00448273219043018, "grad_norm": 0.32064658403396606, "learning_rate": 0.0009985065301562452, "loss": 1.1362, "step": 1930 }, { "epoch": 0.004505958782090439, "grad_norm": 0.34291279315948486, "learning_rate": 0.0009984987879590253, "loss": 1.0888, "step": 1940 }, { "epoch": 0.0045291853737506995, "grad_norm": 0.4338567852973938, "learning_rate": 0.0009984910457618052, "loss": 1.0783, "step": 1950 }, { "epoch": 0.00455241196541096, "grad_norm": 0.33047792315483093, "learning_rate": 0.000998483303564585, "loss": 1.0977, "step": 1960 }, { "epoch": 0.00457563855707122, "grad_norm": 0.33728134632110596, "learning_rate": 0.0009984755613673649, "loss": 1.137, "step": 1970 }, { "epoch": 0.00459886514873148, "grad_norm": 0.27301332354545593, "learning_rate": 0.0009984678191701448, "loss": 1.1413, "step": 1980 }, { "epoch": 0.004622091740391739, "grad_norm": 0.2804515063762665, "learning_rate": 0.0009984600769729248, "loss": 1.0865, "step": 1990 }, { "epoch": 0.0046453183320519995, "grad_norm": 0.33448469638824463, "learning_rate": 0.0009984523347757047, "loss": 1.1526, "step": 2000 } ], "logging_steps": 10, "max_steps": 1291623, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.5412177861554176e+16, "train_batch_size": 7, "trial_name": null, "trial_params": null }