| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0110567051018797, |
| "eval_steps": 500, |
| "global_step": 3200, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.00315905860053704, |
| "grad_norm": 4.247729183572734, |
| "learning_rate": 5.685407454200885e-08, |
| "loss": 0.6057, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.00631811720107408, |
| "grad_norm": 4.531449453000221, |
| "learning_rate": 1.2002526847757423e-07, |
| "loss": 0.6101, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.00947717580161112, |
| "grad_norm": 4.104292927990717, |
| "learning_rate": 1.831964624131396e-07, |
| "loss": 0.5947, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.01263623440214816, |
| "grad_norm": 3.1655981231206747, |
| "learning_rate": 2.46367656348705e-07, |
| "loss": 0.5609, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.0157952930026852, |
| "grad_norm": 1.6081249047797948, |
| "learning_rate": 3.095388502842704e-07, |
| "loss": 0.5128, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.01895435160322224, |
| "grad_norm": 1.0896969313030245, |
| "learning_rate": 3.727100442198358e-07, |
| "loss": 0.4618, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.02211341020375928, |
| "grad_norm": 0.720186068205468, |
| "learning_rate": 4.3588123815540116e-07, |
| "loss": 0.437, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.02527246880429632, |
| "grad_norm": 0.5712683718673494, |
| "learning_rate": 4.990524320909665e-07, |
| "loss": 0.4145, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.02843152740483336, |
| "grad_norm": 0.41839125700043645, |
| "learning_rate": 5.62223626026532e-07, |
| "loss": 0.3887, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.0315905860053704, |
| "grad_norm": 0.33539615628708846, |
| "learning_rate": 6.253948199620974e-07, |
| "loss": 0.3661, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.03474964460590744, |
| "grad_norm": 0.25866894040378646, |
| "learning_rate": 6.885660138976627e-07, |
| "loss": 0.352, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.03790870320644448, |
| "grad_norm": 0.24295153980385867, |
| "learning_rate": 7.517372078332281e-07, |
| "loss": 0.3457, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.04106776180698152, |
| "grad_norm": 0.28237511864322024, |
| "learning_rate": 8.149084017687935e-07, |
| "loss": 0.3345, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.04422682040751856, |
| "grad_norm": 0.21156218948831834, |
| "learning_rate": 8.780795957043589e-07, |
| "loss": 0.3281, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.0473858790080556, |
| "grad_norm": 0.20112450248918143, |
| "learning_rate": 9.412507896399242e-07, |
| "loss": 0.3267, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.05054493760859264, |
| "grad_norm": 0.19763198530161658, |
| "learning_rate": 1.0044219835754897e-06, |
| "loss": 0.3206, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.05370399620912968, |
| "grad_norm": 0.21467283595984932, |
| "learning_rate": 1.067593177511055e-06, |
| "loss": 0.3136, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.05686305480966672, |
| "grad_norm": 0.18660741459520747, |
| "learning_rate": 1.1307643714466204e-06, |
| "loss": 0.3087, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.06002211341020376, |
| "grad_norm": 0.21253735556196968, |
| "learning_rate": 1.1939355653821858e-06, |
| "loss": 0.3049, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.0631811720107408, |
| "grad_norm": 0.237093634634235, |
| "learning_rate": 1.2571067593177513e-06, |
| "loss": 0.3005, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.06634023061127783, |
| "grad_norm": 0.2231876630708564, |
| "learning_rate": 1.3202779532533167e-06, |
| "loss": 0.2972, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.06949928921181488, |
| "grad_norm": 0.2017070855177483, |
| "learning_rate": 1.383449147188882e-06, |
| "loss": 0.2935, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.07265834781235192, |
| "grad_norm": 0.21789091247744216, |
| "learning_rate": 1.4466203411244472e-06, |
| "loss": 0.29, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.07581740641288896, |
| "grad_norm": 0.24210256044037484, |
| "learning_rate": 1.509791535060013e-06, |
| "loss": 0.289, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.07897646501342599, |
| "grad_norm": 0.21139319424520517, |
| "learning_rate": 1.572962728995578e-06, |
| "loss": 0.2895, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.08213552361396304, |
| "grad_norm": 0.20081643445857786, |
| "learning_rate": 1.6361339229311434e-06, |
| "loss": 0.2848, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.08529458221450008, |
| "grad_norm": 0.21867950206393189, |
| "learning_rate": 1.6993051168667088e-06, |
| "loss": 0.2842, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.08845364081503712, |
| "grad_norm": 0.1996646570779461, |
| "learning_rate": 1.7624763108022743e-06, |
| "loss": 0.2785, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.09161269941557416, |
| "grad_norm": 0.1991696950635834, |
| "learning_rate": 1.8256475047378397e-06, |
| "loss": 0.2747, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.0947717580161112, |
| "grad_norm": 0.21382346420276455, |
| "learning_rate": 1.888818698673405e-06, |
| "loss": 0.2778, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.09793081661664824, |
| "grad_norm": 0.22163335945192605, |
| "learning_rate": 1.9519898926089704e-06, |
| "loss": 0.2747, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.10108987521718528, |
| "grad_norm": 0.2338176849443747, |
| "learning_rate": 2.0151610865445357e-06, |
| "loss": 0.2719, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.10424893381772232, |
| "grad_norm": 0.18002603770625134, |
| "learning_rate": 2.078332280480101e-06, |
| "loss": 0.2713, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.10740799241825937, |
| "grad_norm": 0.19849866049815784, |
| "learning_rate": 2.1415034744156664e-06, |
| "loss": 0.2671, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.1105670510187964, |
| "grad_norm": 0.18726127817800872, |
| "learning_rate": 2.2046746683512322e-06, |
| "loss": 0.2679, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.11372610961933344, |
| "grad_norm": 0.18164666420060355, |
| "learning_rate": 2.2678458622867976e-06, |
| "loss": 0.2696, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.11688516821987048, |
| "grad_norm": 0.188419554987382, |
| "learning_rate": 2.331017056222363e-06, |
| "loss": 0.2657, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.12004422682040752, |
| "grad_norm": 0.21408765943095384, |
| "learning_rate": 2.394188250157928e-06, |
| "loss": 0.2631, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.12320328542094455, |
| "grad_norm": 0.1728133517187042, |
| "learning_rate": 2.4573594440934936e-06, |
| "loss": 0.2627, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.1263623440214816, |
| "grad_norm": 0.20446817453058305, |
| "learning_rate": 2.520530638029059e-06, |
| "loss": 0.2633, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.12952140262201864, |
| "grad_norm": 0.2867541071966156, |
| "learning_rate": 2.5837018319646247e-06, |
| "loss": 0.2606, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.13268046122255567, |
| "grad_norm": 0.19615186721360886, |
| "learning_rate": 2.6468730259001897e-06, |
| "loss": 0.2589, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.13583951982309272, |
| "grad_norm": 0.17250843271916932, |
| "learning_rate": 2.710044219835755e-06, |
| "loss": 0.2578, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.13899857842362975, |
| "grad_norm": 0.21061094083012397, |
| "learning_rate": 2.7732154137713208e-06, |
| "loss": 0.256, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.1421576370241668, |
| "grad_norm": 0.16796761706106636, |
| "learning_rate": 2.8363866077068857e-06, |
| "loss": 0.254, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.14531669562470384, |
| "grad_norm": 0.20087509437202522, |
| "learning_rate": 2.8995578016424515e-06, |
| "loss": 0.2554, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.14847575422524087, |
| "grad_norm": 0.20431208429215614, |
| "learning_rate": 2.9627289955780164e-06, |
| "loss": 0.2524, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.15163481282577793, |
| "grad_norm": 0.20436652455880414, |
| "learning_rate": 3.025900189513582e-06, |
| "loss": 0.2557, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.15479387142631496, |
| "grad_norm": 0.19923295760265683, |
| "learning_rate": 3.0890713834491475e-06, |
| "loss": 0.252, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.15795293002685198, |
| "grad_norm": 0.1925405706241213, |
| "learning_rate": 3.1522425773847125e-06, |
| "loss": 0.2505, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.16111198862738904, |
| "grad_norm": 0.20206673704458114, |
| "learning_rate": 3.2154137713202782e-06, |
| "loss": 0.2501, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.16427104722792607, |
| "grad_norm": 0.24333700245600473, |
| "learning_rate": 3.278584965255844e-06, |
| "loss": 0.2464, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.16743010582846313, |
| "grad_norm": 0.18205244482695793, |
| "learning_rate": 3.341756159191409e-06, |
| "loss": 0.2485, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.17058916442900016, |
| "grad_norm": 0.18264545298219137, |
| "learning_rate": 3.4049273531269743e-06, |
| "loss": 0.2456, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.17374822302953719, |
| "grad_norm": 0.17873652388985004, |
| "learning_rate": 3.46809854706254e-06, |
| "loss": 0.248, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.17690728163007424, |
| "grad_norm": 0.17062239190578055, |
| "learning_rate": 3.531269740998105e-06, |
| "loss": 0.2449, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.18006634023061127, |
| "grad_norm": 0.19143037375033525, |
| "learning_rate": 3.5944409349336708e-06, |
| "loss": 0.2464, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.18322539883114833, |
| "grad_norm": 0.1861722136519532, |
| "learning_rate": 3.6576121288692357e-06, |
| "loss": 0.2489, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.18638445743168536, |
| "grad_norm": 0.1680077751454427, |
| "learning_rate": 3.7207833228048014e-06, |
| "loss": 0.2433, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.1895435160322224, |
| "grad_norm": 0.1827874982767101, |
| "learning_rate": 3.783954516740367e-06, |
| "loss": 0.2414, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.19270257463275944, |
| "grad_norm": 0.17739754432609572, |
| "learning_rate": 3.847125710675932e-06, |
| "loss": 0.2439, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.19586163323329647, |
| "grad_norm": 0.18166974193276042, |
| "learning_rate": 3.910296904611497e-06, |
| "loss": 0.2427, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.19902069183383353, |
| "grad_norm": 0.23535305540149423, |
| "learning_rate": 3.973468098547063e-06, |
| "loss": 0.2397, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.20217975043437056, |
| "grad_norm": 0.17950832267537836, |
| "learning_rate": 4.036639292482628e-06, |
| "loss": 0.2413, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.2053388090349076, |
| "grad_norm": 0.1871436271310335, |
| "learning_rate": 4.099810486418194e-06, |
| "loss": 0.2392, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.20849786763544464, |
| "grad_norm": 0.18940758631882895, |
| "learning_rate": 4.162981680353759e-06, |
| "loss": 0.239, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.21165692623598167, |
| "grad_norm": 0.18679899786034626, |
| "learning_rate": 4.226152874289325e-06, |
| "loss": 0.2394, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.21481598483651873, |
| "grad_norm": 0.2005980889421409, |
| "learning_rate": 4.28932406822489e-06, |
| "loss": 0.2369, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.21797504343705576, |
| "grad_norm": 0.18654971719873092, |
| "learning_rate": 4.3524952621604545e-06, |
| "loss": 0.2372, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.2211341020375928, |
| "grad_norm": 0.19896341390367112, |
| "learning_rate": 4.415666456096021e-06, |
| "loss": 0.2384, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.22429316063812985, |
| "grad_norm": 0.1824518487303919, |
| "learning_rate": 4.478837650031586e-06, |
| "loss": 0.2322, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.22745221923866688, |
| "grad_norm": 0.19030275242513275, |
| "learning_rate": 4.542008843967151e-06, |
| "loss": 0.2373, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.2306112778392039, |
| "grad_norm": 0.18760298890701887, |
| "learning_rate": 4.605180037902717e-06, |
| "loss": 0.2345, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.23377033643974096, |
| "grad_norm": 0.15845613302953646, |
| "learning_rate": 4.668351231838282e-06, |
| "loss": 0.2343, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.236929395040278, |
| "grad_norm": 0.1710181463737659, |
| "learning_rate": 4.7315224257738475e-06, |
| "loss": 0.2318, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.24008845364081505, |
| "grad_norm": 0.23880301915969937, |
| "learning_rate": 4.794693619709413e-06, |
| "loss": 0.2339, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.24324751224135208, |
| "grad_norm": 0.1678386308387805, |
| "learning_rate": 4.857864813644978e-06, |
| "loss": 0.2306, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.2464065708418891, |
| "grad_norm": 0.1671346172529239, |
| "learning_rate": 4.9210360075805435e-06, |
| "loss": 0.2333, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.24956562944242616, |
| "grad_norm": 0.19944596405981949, |
| "learning_rate": 4.984207201516109e-06, |
| "loss": 0.2313, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.2527246880429632, |
| "grad_norm": 0.18321786541799437, |
| "learning_rate": 5.047378395451674e-06, |
| "loss": 0.2321, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.2558837466435002, |
| "grad_norm": 0.16781254221044775, |
| "learning_rate": 5.11054958938724e-06, |
| "loss": 0.2313, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.2590428052440373, |
| "grad_norm": 0.17492747555934146, |
| "learning_rate": 5.173720783322806e-06, |
| "loss": 0.2312, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.26220186384457433, |
| "grad_norm": 0.17965819326676355, |
| "learning_rate": 5.23689197725837e-06, |
| "loss": 0.2285, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.26536092244511134, |
| "grad_norm": 0.16804749386309387, |
| "learning_rate": 5.3000631711939365e-06, |
| "loss": 0.2282, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.2685199810456484, |
| "grad_norm": 0.1723473524041935, |
| "learning_rate": 5.363234365129502e-06, |
| "loss": 0.2296, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.27167903964618545, |
| "grad_norm": 0.20773827836779976, |
| "learning_rate": 5.426405559065066e-06, |
| "loss": 0.2273, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.27483809824672245, |
| "grad_norm": 0.1641196545454829, |
| "learning_rate": 5.489576753000632e-06, |
| "loss": 0.2301, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.2779971568472595, |
| "grad_norm": 0.19323437206655858, |
| "learning_rate": 5.552747946936198e-06, |
| "loss": 0.2249, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.28115621544779656, |
| "grad_norm": 0.18267311407058162, |
| "learning_rate": 5.615919140871763e-06, |
| "loss": 0.227, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.2843152740483336, |
| "grad_norm": 0.19801209807316134, |
| "learning_rate": 5.679090334807328e-06, |
| "loss": 0.2273, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.2874743326488706, |
| "grad_norm": 0.17413734456766244, |
| "learning_rate": 5.742261528742894e-06, |
| "loss": 0.2267, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.2906333912494077, |
| "grad_norm": 0.21136423644297928, |
| "learning_rate": 5.805432722678459e-06, |
| "loss": 0.2263, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.29379244984994474, |
| "grad_norm": 0.20233679461777548, |
| "learning_rate": 5.868603916614024e-06, |
| "loss": 0.2263, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.29695150845048174, |
| "grad_norm": 0.19521629273254087, |
| "learning_rate": 5.93177511054959e-06, |
| "loss": 0.2269, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.3001105670510188, |
| "grad_norm": 0.1696883052949813, |
| "learning_rate": 5.994946304485155e-06, |
| "loss": 0.2265, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.30326962565155585, |
| "grad_norm": 0.18758477411326932, |
| "learning_rate": 6.058117498420721e-06, |
| "loss": 0.2247, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.30642868425209285, |
| "grad_norm": 0.1897879092670083, |
| "learning_rate": 6.121288692356287e-06, |
| "loss": 0.2282, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.3095877428526299, |
| "grad_norm": 0.18304573332433055, |
| "learning_rate": 6.184459886291851e-06, |
| "loss": 0.2223, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.31274680145316697, |
| "grad_norm": 0.1782561802917822, |
| "learning_rate": 6.247631080227417e-06, |
| "loss": 0.224, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.31590586005370397, |
| "grad_norm": 0.20741862373031386, |
| "learning_rate": 6.310802274162983e-06, |
| "loss": 0.2219, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.319064918654241, |
| "grad_norm": 0.1539256973132976, |
| "learning_rate": 6.373973468098547e-06, |
| "loss": 0.2207, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.3222239772547781, |
| "grad_norm": 0.16875500034138524, |
| "learning_rate": 6.437144662034113e-06, |
| "loss": 0.2225, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.32538303585531514, |
| "grad_norm": 0.17168781652376466, |
| "learning_rate": 6.500315855969679e-06, |
| "loss": 0.22, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.32854209445585214, |
| "grad_norm": 0.18011584935641983, |
| "learning_rate": 6.563487049905244e-06, |
| "loss": 0.2216, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.3317011530563892, |
| "grad_norm": 0.19414711454837585, |
| "learning_rate": 6.626658243840809e-06, |
| "loss": 0.2249, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.33486021165692625, |
| "grad_norm": 0.2103163638452648, |
| "learning_rate": 6.689829437776375e-06, |
| "loss": 0.2242, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.33801927025746326, |
| "grad_norm": 0.17555158548112104, |
| "learning_rate": 6.75300063171194e-06, |
| "loss": 0.2206, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.3411783288580003, |
| "grad_norm": 0.16163792223112092, |
| "learning_rate": 6.816171825647505e-06, |
| "loss": 0.221, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.34433738745853737, |
| "grad_norm": 0.17047037019662084, |
| "learning_rate": 6.87934301958307e-06, |
| "loss": 0.2222, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.34749644605907437, |
| "grad_norm": 0.18118959670707843, |
| "learning_rate": 6.942514213518636e-06, |
| "loss": 0.226, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.35065550465961143, |
| "grad_norm": 0.18596851555170507, |
| "learning_rate": 7.005685407454202e-06, |
| "loss": 0.2208, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.3538145632601485, |
| "grad_norm": 0.17540680145260182, |
| "learning_rate": 7.068856601389766e-06, |
| "loss": 0.2228, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.35697362186068554, |
| "grad_norm": 0.16362413537591447, |
| "learning_rate": 7.1320277953253324e-06, |
| "loss": 0.2199, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.36013268046122254, |
| "grad_norm": 0.16388409560788866, |
| "learning_rate": 7.195198989260898e-06, |
| "loss": 0.2183, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.3632917390617596, |
| "grad_norm": 0.16780920971825364, |
| "learning_rate": 7.258370183196462e-06, |
| "loss": 0.2223, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.36645079766229666, |
| "grad_norm": 0.16629698944531449, |
| "learning_rate": 7.3215413771320285e-06, |
| "loss": 0.2184, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.36960985626283366, |
| "grad_norm": 0.1760306249090938, |
| "learning_rate": 7.384712571067594e-06, |
| "loss": 0.2219, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.3727689148633707, |
| "grad_norm": 0.1790776173967007, |
| "learning_rate": 7.447883765003159e-06, |
| "loss": 0.2198, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.37592797346390777, |
| "grad_norm": 0.1857393797937426, |
| "learning_rate": 7.5110549589387245e-06, |
| "loss": 0.2183, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.3790870320644448, |
| "grad_norm": 0.18072716423697788, |
| "learning_rate": 7.57422615287429e-06, |
| "loss": 0.2198, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.38224609066498183, |
| "grad_norm": 0.1627749965589205, |
| "learning_rate": 7.637397346809855e-06, |
| "loss": 0.2204, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.3854051492655189, |
| "grad_norm": 0.23081430880637033, |
| "learning_rate": 7.700568540745421e-06, |
| "loss": 0.2179, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.3885642078660559, |
| "grad_norm": 0.14668683659878062, |
| "learning_rate": 7.763739734680986e-06, |
| "loss": 0.218, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.39172326646659295, |
| "grad_norm": 0.17314703270798587, |
| "learning_rate": 7.82691092861655e-06, |
| "loss": 0.2172, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.39488232506713, |
| "grad_norm": 0.1587830451358659, |
| "learning_rate": 7.890082122552117e-06, |
| "loss": 0.2183, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.39804138366766706, |
| "grad_norm": 0.1581230238900689, |
| "learning_rate": 7.953253316487683e-06, |
| "loss": 0.2157, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.40120044226820406, |
| "grad_norm": 0.15808321097279437, |
| "learning_rate": 8.016424510423247e-06, |
| "loss": 0.2152, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.4043595008687411, |
| "grad_norm": 0.17727435096583632, |
| "learning_rate": 8.079595704358814e-06, |
| "loss": 0.2169, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.4075185594692782, |
| "grad_norm": 0.16825167940141256, |
| "learning_rate": 8.142766898294378e-06, |
| "loss": 0.2143, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.4106776180698152, |
| "grad_norm": 0.1623055799684783, |
| "learning_rate": 8.205938092229944e-06, |
| "loss": 0.214, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.41383667667035223, |
| "grad_norm": 0.16866092597564897, |
| "learning_rate": 8.269109286165509e-06, |
| "loss": 0.2189, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.4169957352708893, |
| "grad_norm": 0.15469945071292018, |
| "learning_rate": 8.332280480101075e-06, |
| "loss": 0.2147, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.4201547938714263, |
| "grad_norm": 0.15496524048072485, |
| "learning_rate": 8.39545167403664e-06, |
| "loss": 0.2155, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.42331385247196335, |
| "grad_norm": 0.1607478129687131, |
| "learning_rate": 8.458622867972206e-06, |
| "loss": 0.2142, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.4264729110725004, |
| "grad_norm": 0.16297866728767108, |
| "learning_rate": 8.521794061907772e-06, |
| "loss": 0.2148, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.42963196967303746, |
| "grad_norm": 0.1549484893694436, |
| "learning_rate": 8.584965255843336e-06, |
| "loss": 0.2162, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.43279102827357446, |
| "grad_norm": 0.15790917490616427, |
| "learning_rate": 8.6481364497789e-06, |
| "loss": 0.2142, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.4359500868741115, |
| "grad_norm": 0.17471621371832, |
| "learning_rate": 8.711307643714467e-06, |
| "loss": 0.215, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.4391091454746486, |
| "grad_norm": 0.1704061630987402, |
| "learning_rate": 8.774478837650032e-06, |
| "loss": 0.2145, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.4422682040751856, |
| "grad_norm": 0.17423241802858616, |
| "learning_rate": 8.837650031585598e-06, |
| "loss": 0.2132, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.44542726267572263, |
| "grad_norm": 0.16758619433784536, |
| "learning_rate": 8.900821225521164e-06, |
| "loss": 0.2138, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.4485863212762597, |
| "grad_norm": 0.17999186900204928, |
| "learning_rate": 8.963992419456728e-06, |
| "loss": 0.2127, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.4517453798767967, |
| "grad_norm": 0.17065420980005516, |
| "learning_rate": 9.027163613392293e-06, |
| "loss": 0.2138, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.45490443847733375, |
| "grad_norm": 0.1974703018692422, |
| "learning_rate": 9.090334807327859e-06, |
| "loss": 0.2127, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.4580634970778708, |
| "grad_norm": 0.20057492546176425, |
| "learning_rate": 9.153506001263425e-06, |
| "loss": 0.2141, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.4612225556784078, |
| "grad_norm": 0.17543621309019505, |
| "learning_rate": 9.21667719519899e-06, |
| "loss": 0.2109, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.46438161427894487, |
| "grad_norm": 0.1693436152342155, |
| "learning_rate": 9.279848389134556e-06, |
| "loss": 0.2138, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.4675406728794819, |
| "grad_norm": 0.18371499893258605, |
| "learning_rate": 9.34301958307012e-06, |
| "loss": 0.2121, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.470699731480019, |
| "grad_norm": 0.18302132069209362, |
| "learning_rate": 9.406190777005687e-06, |
| "loss": 0.2125, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.473858790080556, |
| "grad_norm": 0.16878973943654269, |
| "learning_rate": 9.469361970941253e-06, |
| "loss": 0.2096, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.47701784868109304, |
| "grad_norm": 0.15672142571721928, |
| "learning_rate": 9.532533164876817e-06, |
| "loss": 0.2157, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.4801769072816301, |
| "grad_norm": 0.17793569853299288, |
| "learning_rate": 9.595704358812382e-06, |
| "loss": 0.2116, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.4833359658821671, |
| "grad_norm": 0.1711221799529836, |
| "learning_rate": 9.658875552747946e-06, |
| "loss": 0.2086, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.48649502448270415, |
| "grad_norm": 0.15221905413795137, |
| "learning_rate": 9.722046746683513e-06, |
| "loss": 0.2079, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.4896540830832412, |
| "grad_norm": 0.1706269772815951, |
| "learning_rate": 9.785217940619079e-06, |
| "loss": 0.2124, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.4928131416837782, |
| "grad_norm": 0.17041271545684786, |
| "learning_rate": 9.848389134554643e-06, |
| "loss": 0.2105, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.49597220028431527, |
| "grad_norm": 0.1707130122866249, |
| "learning_rate": 9.91156032849021e-06, |
| "loss": 0.2094, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.4991312588848523, |
| "grad_norm": 0.1533585694871482, |
| "learning_rate": 9.974731522425774e-06, |
| "loss": 0.2125, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.5022903174853893, |
| "grad_norm": 0.1445021677961463, |
| "learning_rate": 9.99999562381833e-06, |
| "loss": 0.2104, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.5054493760859264, |
| "grad_norm": 0.14451068806922954, |
| "learning_rate": 9.999968880513634e-06, |
| "loss": 0.2115, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.5086084346864634, |
| "grad_norm": 0.1711007683006565, |
| "learning_rate": 9.99991782524616e-06, |
| "loss": 0.2119, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.5117674932870004, |
| "grad_norm": 0.14862996999525604, |
| "learning_rate": 9.999842458264166e-06, |
| "loss": 0.2091, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.5149265518875376, |
| "grad_norm": 0.16833296434447828, |
| "learning_rate": 9.999742779934113e-06, |
| "loss": 0.2089, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.5180856104880746, |
| "grad_norm": 0.16624207360408486, |
| "learning_rate": 9.999618790740677e-06, |
| "loss": 0.2076, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.5212446690886116, |
| "grad_norm": 0.18734837267014448, |
| "learning_rate": 9.99947049128675e-06, |
| "loss": 0.2093, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.5244037276891487, |
| "grad_norm": 0.17593769602265188, |
| "learning_rate": 9.999297882293429e-06, |
| "loss": 0.2104, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.5275627862896857, |
| "grad_norm": 0.1534253401122401, |
| "learning_rate": 9.999100964600006e-06, |
| "loss": 0.2094, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.5307218448902227, |
| "grad_norm": 0.1488174937468484, |
| "learning_rate": 9.998879739163982e-06, |
| "loss": 0.2087, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.5338809034907598, |
| "grad_norm": 0.13829240161020598, |
| "learning_rate": 9.998634207061047e-06, |
| "loss": 0.2083, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.5370399620912968, |
| "grad_norm": 0.15837901482606578, |
| "learning_rate": 9.998364369485083e-06, |
| "loss": 0.2065, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.5401990206918338, |
| "grad_norm": 0.14373686188939075, |
| "learning_rate": 9.998070227748153e-06, |
| "loss": 0.2077, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.5433580792923709, |
| "grad_norm": 0.13856399191761934, |
| "learning_rate": 9.9977517832805e-06, |
| "loss": 0.2074, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.5465171378929079, |
| "grad_norm": 0.16428912206666343, |
| "learning_rate": 9.997409037630533e-06, |
| "loss": 0.2072, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.5496761964934449, |
| "grad_norm": 0.14598794955578479, |
| "learning_rate": 9.997041992464828e-06, |
| "loss": 0.207, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.552835255093982, |
| "grad_norm": 0.1544681862373563, |
| "learning_rate": 9.996650649568116e-06, |
| "loss": 0.2067, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.555994313694519, |
| "grad_norm": 0.16063147300592975, |
| "learning_rate": 9.996235010843269e-06, |
| "loss": 0.2091, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.5591533722950561, |
| "grad_norm": 0.16009334025881428, |
| "learning_rate": 9.9957950783113e-06, |
| "loss": 0.2068, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.5623124308955931, |
| "grad_norm": 0.1473832672273453, |
| "learning_rate": 9.995330854111342e-06, |
| "loss": 0.2072, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.5654714894961301, |
| "grad_norm": 0.1758179524267874, |
| "learning_rate": 9.994842340500654e-06, |
| "loss": 0.2051, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.5686305480966672, |
| "grad_norm": 0.160062543193855, |
| "learning_rate": 9.994329539854597e-06, |
| "loss": 0.2023, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.5717896066972042, |
| "grad_norm": 0.14481223083195094, |
| "learning_rate": 9.993792454666622e-06, |
| "loss": 0.2049, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.5749486652977412, |
| "grad_norm": 0.17516638701269122, |
| "learning_rate": 9.993231087548263e-06, |
| "loss": 0.2056, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.5781077238982784, |
| "grad_norm": 0.15577929587795278, |
| "learning_rate": 9.992645441229128e-06, |
| "loss": 0.2053, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.5812667824988154, |
| "grad_norm": 0.16063695498724154, |
| "learning_rate": 9.992035518556873e-06, |
| "loss": 0.2032, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.5844258410993524, |
| "grad_norm": 0.13465469823943357, |
| "learning_rate": 9.991401322497202e-06, |
| "loss": 0.2078, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.5875848996998895, |
| "grad_norm": 0.14458781318924407, |
| "learning_rate": 9.990742856133844e-06, |
| "loss": 0.2075, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.5907439583004265, |
| "grad_norm": 0.15458252884662305, |
| "learning_rate": 9.990060122668543e-06, |
| "loss": 0.2058, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.5939030169009635, |
| "grad_norm": 0.13378310324370862, |
| "learning_rate": 9.989353125421034e-06, |
| "loss": 0.2077, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.5970620755015006, |
| "grad_norm": 0.14092827320728965, |
| "learning_rate": 9.98862186782904e-06, |
| "loss": 0.205, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.6002211341020376, |
| "grad_norm": 0.14528693075899177, |
| "learning_rate": 9.987866353448241e-06, |
| "loss": 0.2056, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.6033801927025746, |
| "grad_norm": 0.16835849378700468, |
| "learning_rate": 9.987086585952271e-06, |
| "loss": 0.202, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.6065392513031117, |
| "grad_norm": 0.14702108474072842, |
| "learning_rate": 9.986282569132688e-06, |
| "loss": 0.2046, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.6096983099036487, |
| "grad_norm": 0.15598954366138304, |
| "learning_rate": 9.98545430689896e-06, |
| "loss": 0.2037, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.6128573685041857, |
| "grad_norm": 0.15363339652231678, |
| "learning_rate": 9.984601803278451e-06, |
| "loss": 0.2065, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.6160164271047228, |
| "grad_norm": 0.15079303342445485, |
| "learning_rate": 9.983725062416392e-06, |
| "loss": 0.2046, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.6191754857052598, |
| "grad_norm": 0.13780206734265157, |
| "learning_rate": 9.98282408857587e-06, |
| "loss": 0.2054, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.6223345443057968, |
| "grad_norm": 0.1355920930705493, |
| "learning_rate": 9.981898886137795e-06, |
| "loss": 0.2039, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.6254936029063339, |
| "grad_norm": 0.14054778625440462, |
| "learning_rate": 9.980949459600899e-06, |
| "loss": 0.2045, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.6286526615068709, |
| "grad_norm": 0.16259862482128506, |
| "learning_rate": 9.979975813581694e-06, |
| "loss": 0.2033, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.6318117201074079, |
| "grad_norm": 0.16320641790440754, |
| "learning_rate": 9.978977952814456e-06, |
| "loss": 0.2053, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.634970778707945, |
| "grad_norm": 0.14504338218528204, |
| "learning_rate": 9.97795588215121e-06, |
| "loss": 0.2041, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.638129837308482, |
| "grad_norm": 0.14088062087038478, |
| "learning_rate": 9.97690960656169e-06, |
| "loss": 0.2034, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.6412888959090192, |
| "grad_norm": 0.16368648316215648, |
| "learning_rate": 9.975839131133335e-06, |
| "loss": 0.2004, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.6444479545095562, |
| "grad_norm": 0.14137074446596326, |
| "learning_rate": 9.974744461071246e-06, |
| "loss": 0.2039, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.6476070131100932, |
| "grad_norm": 0.16743385196077595, |
| "learning_rate": 9.973625601698176e-06, |
| "loss": 0.2024, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.6507660717106303, |
| "grad_norm": 0.1420212347341941, |
| "learning_rate": 9.972482558454488e-06, |
| "loss": 0.2006, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.6539251303111673, |
| "grad_norm": 0.1541147190019739, |
| "learning_rate": 9.971315336898144e-06, |
| "loss": 0.2031, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.6570841889117043, |
| "grad_norm": 0.14954970584415023, |
| "learning_rate": 9.970123942704667e-06, |
| "loss": 0.2022, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.6602432475122414, |
| "grad_norm": 0.13835942774621643, |
| "learning_rate": 9.968908381667122e-06, |
| "loss": 0.205, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.6634023061127784, |
| "grad_norm": 0.1444063211993615, |
| "learning_rate": 9.967668659696077e-06, |
| "loss": 0.2003, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.6665613647133154, |
| "grad_norm": 0.15181588903023388, |
| "learning_rate": 9.966404782819587e-06, |
| "loss": 0.2041, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.6697204233138525, |
| "grad_norm": 0.14453828048169265, |
| "learning_rate": 9.965116757183156e-06, |
| "loss": 0.2008, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.6728794819143895, |
| "grad_norm": 0.16433983585515474, |
| "learning_rate": 9.963804589049709e-06, |
| "loss": 0.2045, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.6760385405149265, |
| "grad_norm": 0.14280102041208004, |
| "learning_rate": 9.962468284799559e-06, |
| "loss": 0.2021, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.6791975991154636, |
| "grad_norm": 0.14204139100462726, |
| "learning_rate": 9.961107850930386e-06, |
| "loss": 0.201, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.6823566577160006, |
| "grad_norm": 0.1324076473779632, |
| "learning_rate": 9.959723294057195e-06, |
| "loss": 0.1991, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.6855157163165376, |
| "grad_norm": 0.14295162932415698, |
| "learning_rate": 9.958314620912283e-06, |
| "loss": 0.2025, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.6886747749170747, |
| "grad_norm": 0.15726554545849142, |
| "learning_rate": 9.956881838345221e-06, |
| "loss": 0.2033, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.6918338335176117, |
| "grad_norm": 0.16250545787100992, |
| "learning_rate": 9.955424953322797e-06, |
| "loss": 0.2015, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.6949928921181487, |
| "grad_norm": 0.1614266022365173, |
| "learning_rate": 9.953943972929003e-06, |
| "loss": 0.1996, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.6981519507186859, |
| "grad_norm": 0.15900062576977386, |
| "learning_rate": 9.952438904364996e-06, |
| "loss": 0.2026, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.7013110093192229, |
| "grad_norm": 0.13873539419869083, |
| "learning_rate": 9.950909754949052e-06, |
| "loss": 0.2035, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.7044700679197599, |
| "grad_norm": 0.16329226810226954, |
| "learning_rate": 9.949356532116546e-06, |
| "loss": 0.2017, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.707629126520297, |
| "grad_norm": 0.130433030569765, |
| "learning_rate": 9.947779243419899e-06, |
| "loss": 0.2017, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.710788185120834, |
| "grad_norm": 0.13992897215148528, |
| "learning_rate": 9.946177896528557e-06, |
| "loss": 0.2001, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.7139472437213711, |
| "grad_norm": 0.14070910045992718, |
| "learning_rate": 9.944552499228947e-06, |
| "loss": 0.2007, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.7171063023219081, |
| "grad_norm": 0.1469150772379642, |
| "learning_rate": 9.942903059424441e-06, |
| "loss": 0.2006, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.7202653609224451, |
| "grad_norm": 0.18224682305919618, |
| "learning_rate": 9.941229585135307e-06, |
| "loss": 0.2028, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.7234244195229822, |
| "grad_norm": 0.14288906489417755, |
| "learning_rate": 9.939532084498685e-06, |
| "loss": 0.1987, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.7265834781235192, |
| "grad_norm": 0.13437271496120856, |
| "learning_rate": 9.937810565768544e-06, |
| "loss": 0.1993, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.7297425367240562, |
| "grad_norm": 0.1446887097806904, |
| "learning_rate": 9.936065037315636e-06, |
| "loss": 0.2011, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.7329015953245933, |
| "grad_norm": 0.13251427513003186, |
| "learning_rate": 9.934295507627456e-06, |
| "loss": 0.2022, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.7360606539251303, |
| "grad_norm": 0.1452288699010562, |
| "learning_rate": 9.932501985308206e-06, |
| "loss": 0.2009, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.7392197125256673, |
| "grad_norm": 0.14728963865115374, |
| "learning_rate": 9.93068447907875e-06, |
| "loss": 0.1987, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.7423787711262044, |
| "grad_norm": 0.1517245487863863, |
| "learning_rate": 9.928842997776574e-06, |
| "loss": 0.2013, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.7455378297267414, |
| "grad_norm": 0.1614866572575232, |
| "learning_rate": 9.926977550355734e-06, |
| "loss": 0.1997, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.7486968883272784, |
| "grad_norm": 0.14031629423175507, |
| "learning_rate": 9.92508814588683e-06, |
| "loss": 0.199, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.7518559469278155, |
| "grad_norm": 0.14118485538481557, |
| "learning_rate": 9.92317479355694e-06, |
| "loss": 0.1976, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.7550150055283525, |
| "grad_norm": 0.13181048592845238, |
| "learning_rate": 9.921237502669595e-06, |
| "loss": 0.198, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.7581740641288895, |
| "grad_norm": 0.1345265368505879, |
| "learning_rate": 9.919276282644723e-06, |
| "loss": 0.201, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.7613331227294267, |
| "grad_norm": 0.13720890023292417, |
| "learning_rate": 9.917291143018604e-06, |
| "loss": 0.2009, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.7644921813299637, |
| "grad_norm": 0.13845111986552353, |
| "learning_rate": 9.915282093443825e-06, |
| "loss": 0.2008, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.7676512399305007, |
| "grad_norm": 0.14617335075904797, |
| "learning_rate": 9.913249143689234e-06, |
| "loss": 0.1991, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.7708102985310378, |
| "grad_norm": 0.1320877727948845, |
| "learning_rate": 9.911192303639896e-06, |
| "loss": 0.1999, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.7739693571315748, |
| "grad_norm": 0.13482883240500468, |
| "learning_rate": 9.909111583297035e-06, |
| "loss": 0.1997, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.7771284157321118, |
| "grad_norm": 0.14200097798675781, |
| "learning_rate": 9.907006992777991e-06, |
| "loss": 0.2008, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.7802874743326489, |
| "grad_norm": 0.13733098129824253, |
| "learning_rate": 9.904878542316177e-06, |
| "loss": 0.1988, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.7834465329331859, |
| "grad_norm": 0.14967333958732693, |
| "learning_rate": 9.902726242261015e-06, |
| "loss": 0.2, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.786605591533723, |
| "grad_norm": 0.18469961665919096, |
| "learning_rate": 9.9005501030779e-06, |
| "loss": 0.1998, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.78976465013426, |
| "grad_norm": 0.1291065536177641, |
| "learning_rate": 9.898350135348143e-06, |
| "loss": 0.1994, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.792923708734797, |
| "grad_norm": 0.1455654343221393, |
| "learning_rate": 9.896126349768913e-06, |
| "loss": 0.1961, |
| "step": 2510 |
| }, |
| { |
| "epoch": 0.7960827673353341, |
| "grad_norm": 0.1305825664747534, |
| "learning_rate": 9.893878757153197e-06, |
| "loss": 0.1997, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.7992418259358711, |
| "grad_norm": 0.1355469021976556, |
| "learning_rate": 9.891607368429741e-06, |
| "loss": 0.2009, |
| "step": 2530 |
| }, |
| { |
| "epoch": 0.8024008845364081, |
| "grad_norm": 0.1393422807545934, |
| "learning_rate": 9.889312194642999e-06, |
| "loss": 0.1996, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.8055599431369452, |
| "grad_norm": 0.14137833832556562, |
| "learning_rate": 9.886993246953075e-06, |
| "loss": 0.1984, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.8087190017374822, |
| "grad_norm": 0.1288518244334966, |
| "learning_rate": 9.884650536635674e-06, |
| "loss": 0.1998, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.8118780603380192, |
| "grad_norm": 0.12540981604153706, |
| "learning_rate": 9.882284075082042e-06, |
| "loss": 0.1953, |
| "step": 2570 |
| }, |
| { |
| "epoch": 0.8150371189385563, |
| "grad_norm": 0.1298044947287737, |
| "learning_rate": 9.879893873798918e-06, |
| "loss": 0.1998, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.8181961775390934, |
| "grad_norm": 0.13593942142698026, |
| "learning_rate": 9.877479944408469e-06, |
| "loss": 0.1994, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.8213552361396304, |
| "grad_norm": 0.14577596422732375, |
| "learning_rate": 9.875042298648241e-06, |
| "loss": 0.1968, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.8245142947401675, |
| "grad_norm": 0.13651649878117303, |
| "learning_rate": 9.872580948371101e-06, |
| "loss": 0.1959, |
| "step": 2610 |
| }, |
| { |
| "epoch": 0.8276733533407045, |
| "grad_norm": 0.13458568002303536, |
| "learning_rate": 9.870095905545172e-06, |
| "loss": 0.1975, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.8308324119412415, |
| "grad_norm": 0.13467497395688513, |
| "learning_rate": 9.867587182253783e-06, |
| "loss": 0.198, |
| "step": 2630 |
| }, |
| { |
| "epoch": 0.8339914705417786, |
| "grad_norm": 0.1334106181890542, |
| "learning_rate": 9.86505479069541e-06, |
| "loss": 0.1975, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.8371505291423156, |
| "grad_norm": 0.14726119516550862, |
| "learning_rate": 9.862498743183606e-06, |
| "loss": 0.1962, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.8403095877428526, |
| "grad_norm": 0.13818332053028007, |
| "learning_rate": 9.85991905214696e-06, |
| "loss": 0.1998, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.8434686463433897, |
| "grad_norm": 0.14412901343016873, |
| "learning_rate": 9.85731573012902e-06, |
| "loss": 0.2, |
| "step": 2670 |
| }, |
| { |
| "epoch": 0.8466277049439267, |
| "grad_norm": 0.1255413662933083, |
| "learning_rate": 9.854688789788236e-06, |
| "loss": 0.198, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.8497867635444637, |
| "grad_norm": 0.13187344960522424, |
| "learning_rate": 9.852038243897903e-06, |
| "loss": 0.1972, |
| "step": 2690 |
| }, |
| { |
| "epoch": 0.8529458221450008, |
| "grad_norm": 0.13247624619465903, |
| "learning_rate": 9.849364105346098e-06, |
| "loss": 0.1982, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.8561048807455378, |
| "grad_norm": 0.12352050553226067, |
| "learning_rate": 9.846666387135613e-06, |
| "loss": 0.1954, |
| "step": 2710 |
| }, |
| { |
| "epoch": 0.8592639393460749, |
| "grad_norm": 0.13384183862518867, |
| "learning_rate": 9.843945102383892e-06, |
| "loss": 0.197, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.8624229979466119, |
| "grad_norm": 0.12329786824129346, |
| "learning_rate": 9.841200264322974e-06, |
| "loss": 0.1977, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.8655820565471489, |
| "grad_norm": 0.13477039032719176, |
| "learning_rate": 9.838431886299421e-06, |
| "loss": 0.1961, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.868741115147686, |
| "grad_norm": 0.13828179616792935, |
| "learning_rate": 9.83563998177426e-06, |
| "loss": 0.1967, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.871900173748223, |
| "grad_norm": 0.14564045162827766, |
| "learning_rate": 9.83282456432291e-06, |
| "loss": 0.1965, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.87505923234876, |
| "grad_norm": 0.13685777011127429, |
| "learning_rate": 9.829985647635118e-06, |
| "loss": 0.1981, |
| "step": 2770 |
| }, |
| { |
| "epoch": 0.8782182909492972, |
| "grad_norm": 0.13617569439938054, |
| "learning_rate": 9.827123245514901e-06, |
| "loss": 0.1951, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.8813773495498342, |
| "grad_norm": 0.12868163128280088, |
| "learning_rate": 9.824237371880469e-06, |
| "loss": 0.195, |
| "step": 2790 |
| }, |
| { |
| "epoch": 0.8845364081503712, |
| "grad_norm": 0.13965590647075304, |
| "learning_rate": 9.821328040764157e-06, |
| "loss": 0.1984, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.8876954667509083, |
| "grad_norm": 0.1191526599411457, |
| "learning_rate": 9.818395266312363e-06, |
| "loss": 0.1925, |
| "step": 2810 |
| }, |
| { |
| "epoch": 0.8908545253514453, |
| "grad_norm": 0.1209195098683138, |
| "learning_rate": 9.81543906278548e-06, |
| "loss": 0.1995, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.8940135839519823, |
| "grad_norm": 0.12466523038010362, |
| "learning_rate": 9.812459444557815e-06, |
| "loss": 0.1937, |
| "step": 2830 |
| }, |
| { |
| "epoch": 0.8971726425525194, |
| "grad_norm": 0.12952393163312614, |
| "learning_rate": 9.809456426117533e-06, |
| "loss": 0.1932, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.9003317011530564, |
| "grad_norm": 0.13392155972728179, |
| "learning_rate": 9.806430022066582e-06, |
| "loss": 0.1978, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.9034907597535934, |
| "grad_norm": 0.13186718584966667, |
| "learning_rate": 9.803380247120616e-06, |
| "loss": 0.1953, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.9066498183541305, |
| "grad_norm": 0.1283857878998356, |
| "learning_rate": 9.800307116108931e-06, |
| "loss": 0.1962, |
| "step": 2870 |
| }, |
| { |
| "epoch": 0.9098088769546675, |
| "grad_norm": 0.12302487764368193, |
| "learning_rate": 9.797210643974388e-06, |
| "loss": 0.1954, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.9129679355552045, |
| "grad_norm": 0.1279449953769118, |
| "learning_rate": 9.794090845773346e-06, |
| "loss": 0.1936, |
| "step": 2890 |
| }, |
| { |
| "epoch": 0.9161269941557416, |
| "grad_norm": 0.13723185562370793, |
| "learning_rate": 9.79094773667558e-06, |
| "loss": 0.1948, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.9192860527562786, |
| "grad_norm": 0.1382694059956154, |
| "learning_rate": 9.787781331964217e-06, |
| "loss": 0.1961, |
| "step": 2910 |
| }, |
| { |
| "epoch": 0.9224451113568156, |
| "grad_norm": 0.13522487929855218, |
| "learning_rate": 9.784591647035654e-06, |
| "loss": 0.1944, |
| "step": 2920 |
| }, |
| { |
| "epoch": 0.9256041699573527, |
| "grad_norm": 0.13619232160862846, |
| "learning_rate": 9.781378697399492e-06, |
| "loss": 0.1939, |
| "step": 2930 |
| }, |
| { |
| "epoch": 0.9287632285578897, |
| "grad_norm": 0.13966605421891545, |
| "learning_rate": 9.778142498678447e-06, |
| "loss": 0.1936, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.9319222871584268, |
| "grad_norm": 0.13610146242659704, |
| "learning_rate": 9.774883066608288e-06, |
| "loss": 0.1955, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.9350813457589638, |
| "grad_norm": 0.1272898066516385, |
| "learning_rate": 9.771600417037747e-06, |
| "loss": 0.1951, |
| "step": 2960 |
| }, |
| { |
| "epoch": 0.9382404043595008, |
| "grad_norm": 0.12577223515891656, |
| "learning_rate": 9.76829456592846e-06, |
| "loss": 0.1941, |
| "step": 2970 |
| }, |
| { |
| "epoch": 0.941399462960038, |
| "grad_norm": 0.13229230817335338, |
| "learning_rate": 9.76496552935487e-06, |
| "loss": 0.1948, |
| "step": 2980 |
| }, |
| { |
| "epoch": 0.944558521560575, |
| "grad_norm": 0.13057771902599097, |
| "learning_rate": 9.76161332350416e-06, |
| "loss": 0.1945, |
| "step": 2990 |
| }, |
| { |
| "epoch": 0.947717580161112, |
| "grad_norm": 0.1375087406497119, |
| "learning_rate": 9.758237964676175e-06, |
| "loss": 0.1946, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.9508766387616491, |
| "grad_norm": 0.13144411031384784, |
| "learning_rate": 9.754839469283333e-06, |
| "loss": 0.1916, |
| "step": 3010 |
| }, |
| { |
| "epoch": 0.9540356973621861, |
| "grad_norm": 0.1307858984474674, |
| "learning_rate": 9.751417853850557e-06, |
| "loss": 0.1961, |
| "step": 3020 |
| }, |
| { |
| "epoch": 0.9571947559627231, |
| "grad_norm": 0.12400375737914372, |
| "learning_rate": 9.747973135015187e-06, |
| "loss": 0.1948, |
| "step": 3030 |
| }, |
| { |
| "epoch": 0.9603538145632602, |
| "grad_norm": 0.14800804219149807, |
| "learning_rate": 9.744505329526906e-06, |
| "loss": 0.1951, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.9635128731637972, |
| "grad_norm": 0.13528680108076863, |
| "learning_rate": 9.741014454247648e-06, |
| "loss": 0.1946, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.9666719317643342, |
| "grad_norm": 0.15276170947974638, |
| "learning_rate": 9.737500526151525e-06, |
| "loss": 0.1935, |
| "step": 3060 |
| }, |
| { |
| "epoch": 0.9698309903648713, |
| "grad_norm": 0.1280505905081845, |
| "learning_rate": 9.733963562324739e-06, |
| "loss": 0.193, |
| "step": 3070 |
| }, |
| { |
| "epoch": 0.9729900489654083, |
| "grad_norm": 0.13045657850862527, |
| "learning_rate": 9.730403579965508e-06, |
| "loss": 0.1953, |
| "step": 3080 |
| }, |
| { |
| "epoch": 0.9761491075659453, |
| "grad_norm": 0.12920108483379814, |
| "learning_rate": 9.726820596383968e-06, |
| "loss": 0.194, |
| "step": 3090 |
| }, |
| { |
| "epoch": 0.9793081661664824, |
| "grad_norm": 0.1275121663048079, |
| "learning_rate": 9.723214629002103e-06, |
| "loss": 0.1937, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.9824672247670194, |
| "grad_norm": 0.13153937715884076, |
| "learning_rate": 9.719585695353648e-06, |
| "loss": 0.1927, |
| "step": 3110 |
| }, |
| { |
| "epoch": 0.9856262833675564, |
| "grad_norm": 0.13540077808963083, |
| "learning_rate": 9.715933813084012e-06, |
| "loss": 0.1948, |
| "step": 3120 |
| }, |
| { |
| "epoch": 0.9887853419680935, |
| "grad_norm": 0.13634113133020404, |
| "learning_rate": 9.712258999950196e-06, |
| "loss": 0.192, |
| "step": 3130 |
| }, |
| { |
| "epoch": 0.9919444005686305, |
| "grad_norm": 0.11908229587933031, |
| "learning_rate": 9.70856127382069e-06, |
| "loss": 0.1937, |
| "step": 3140 |
| }, |
| { |
| "epoch": 0.9951034591691675, |
| "grad_norm": 0.11928897371871516, |
| "learning_rate": 9.704840652675405e-06, |
| "loss": 0.1939, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.9982625177697046, |
| "grad_norm": 0.12957379845580613, |
| "learning_rate": 9.701097154605572e-06, |
| "loss": 0.1934, |
| "step": 3160 |
| }, |
| { |
| "epoch": 1.0015795293002685, |
| "grad_norm": 0.12555682464472692, |
| "learning_rate": 9.697330797813665e-06, |
| "loss": 0.2078, |
| "step": 3170 |
| }, |
| { |
| "epoch": 1.0047385879008055, |
| "grad_norm": 0.12366696963941619, |
| "learning_rate": 9.693541600613297e-06, |
| "loss": 0.1833, |
| "step": 3180 |
| }, |
| { |
| "epoch": 1.0078976465013425, |
| "grad_norm": 0.12387115361971955, |
| "learning_rate": 9.689729581429154e-06, |
| "loss": 0.184, |
| "step": 3190 |
| }, |
| { |
| "epoch": 1.0110567051018797, |
| "grad_norm": 0.12394939667356165, |
| "learning_rate": 9.68589475879688e-06, |
| "loss": 0.182, |
| "step": 3200 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 15830, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 200, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.177750985487155e+16, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |