MMR1-32B-SFT / trainer_state.json
Sicong's picture
Add files using upload-large-folder tool
8b92ec0 verified
raw
history blame
56.6 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0110567051018797,
"eval_steps": 500,
"global_step": 3200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00315905860053704,
"grad_norm": 4.247729183572734,
"learning_rate": 5.685407454200885e-08,
"loss": 0.6057,
"step": 10
},
{
"epoch": 0.00631811720107408,
"grad_norm": 4.531449453000221,
"learning_rate": 1.2002526847757423e-07,
"loss": 0.6101,
"step": 20
},
{
"epoch": 0.00947717580161112,
"grad_norm": 4.104292927990717,
"learning_rate": 1.831964624131396e-07,
"loss": 0.5947,
"step": 30
},
{
"epoch": 0.01263623440214816,
"grad_norm": 3.1655981231206747,
"learning_rate": 2.46367656348705e-07,
"loss": 0.5609,
"step": 40
},
{
"epoch": 0.0157952930026852,
"grad_norm": 1.6081249047797948,
"learning_rate": 3.095388502842704e-07,
"loss": 0.5128,
"step": 50
},
{
"epoch": 0.01895435160322224,
"grad_norm": 1.0896969313030245,
"learning_rate": 3.727100442198358e-07,
"loss": 0.4618,
"step": 60
},
{
"epoch": 0.02211341020375928,
"grad_norm": 0.720186068205468,
"learning_rate": 4.3588123815540116e-07,
"loss": 0.437,
"step": 70
},
{
"epoch": 0.02527246880429632,
"grad_norm": 0.5712683718673494,
"learning_rate": 4.990524320909665e-07,
"loss": 0.4145,
"step": 80
},
{
"epoch": 0.02843152740483336,
"grad_norm": 0.41839125700043645,
"learning_rate": 5.62223626026532e-07,
"loss": 0.3887,
"step": 90
},
{
"epoch": 0.0315905860053704,
"grad_norm": 0.33539615628708846,
"learning_rate": 6.253948199620974e-07,
"loss": 0.3661,
"step": 100
},
{
"epoch": 0.03474964460590744,
"grad_norm": 0.25866894040378646,
"learning_rate": 6.885660138976627e-07,
"loss": 0.352,
"step": 110
},
{
"epoch": 0.03790870320644448,
"grad_norm": 0.24295153980385867,
"learning_rate": 7.517372078332281e-07,
"loss": 0.3457,
"step": 120
},
{
"epoch": 0.04106776180698152,
"grad_norm": 0.28237511864322024,
"learning_rate": 8.149084017687935e-07,
"loss": 0.3345,
"step": 130
},
{
"epoch": 0.04422682040751856,
"grad_norm": 0.21156218948831834,
"learning_rate": 8.780795957043589e-07,
"loss": 0.3281,
"step": 140
},
{
"epoch": 0.0473858790080556,
"grad_norm": 0.20112450248918143,
"learning_rate": 9.412507896399242e-07,
"loss": 0.3267,
"step": 150
},
{
"epoch": 0.05054493760859264,
"grad_norm": 0.19763198530161658,
"learning_rate": 1.0044219835754897e-06,
"loss": 0.3206,
"step": 160
},
{
"epoch": 0.05370399620912968,
"grad_norm": 0.21467283595984932,
"learning_rate": 1.067593177511055e-06,
"loss": 0.3136,
"step": 170
},
{
"epoch": 0.05686305480966672,
"grad_norm": 0.18660741459520747,
"learning_rate": 1.1307643714466204e-06,
"loss": 0.3087,
"step": 180
},
{
"epoch": 0.06002211341020376,
"grad_norm": 0.21253735556196968,
"learning_rate": 1.1939355653821858e-06,
"loss": 0.3049,
"step": 190
},
{
"epoch": 0.0631811720107408,
"grad_norm": 0.237093634634235,
"learning_rate": 1.2571067593177513e-06,
"loss": 0.3005,
"step": 200
},
{
"epoch": 0.06634023061127783,
"grad_norm": 0.2231876630708564,
"learning_rate": 1.3202779532533167e-06,
"loss": 0.2972,
"step": 210
},
{
"epoch": 0.06949928921181488,
"grad_norm": 0.2017070855177483,
"learning_rate": 1.383449147188882e-06,
"loss": 0.2935,
"step": 220
},
{
"epoch": 0.07265834781235192,
"grad_norm": 0.21789091247744216,
"learning_rate": 1.4466203411244472e-06,
"loss": 0.29,
"step": 230
},
{
"epoch": 0.07581740641288896,
"grad_norm": 0.24210256044037484,
"learning_rate": 1.509791535060013e-06,
"loss": 0.289,
"step": 240
},
{
"epoch": 0.07897646501342599,
"grad_norm": 0.21139319424520517,
"learning_rate": 1.572962728995578e-06,
"loss": 0.2895,
"step": 250
},
{
"epoch": 0.08213552361396304,
"grad_norm": 0.20081643445857786,
"learning_rate": 1.6361339229311434e-06,
"loss": 0.2848,
"step": 260
},
{
"epoch": 0.08529458221450008,
"grad_norm": 0.21867950206393189,
"learning_rate": 1.6993051168667088e-06,
"loss": 0.2842,
"step": 270
},
{
"epoch": 0.08845364081503712,
"grad_norm": 0.1996646570779461,
"learning_rate": 1.7624763108022743e-06,
"loss": 0.2785,
"step": 280
},
{
"epoch": 0.09161269941557416,
"grad_norm": 0.1991696950635834,
"learning_rate": 1.8256475047378397e-06,
"loss": 0.2747,
"step": 290
},
{
"epoch": 0.0947717580161112,
"grad_norm": 0.21382346420276455,
"learning_rate": 1.888818698673405e-06,
"loss": 0.2778,
"step": 300
},
{
"epoch": 0.09793081661664824,
"grad_norm": 0.22163335945192605,
"learning_rate": 1.9519898926089704e-06,
"loss": 0.2747,
"step": 310
},
{
"epoch": 0.10108987521718528,
"grad_norm": 0.2338176849443747,
"learning_rate": 2.0151610865445357e-06,
"loss": 0.2719,
"step": 320
},
{
"epoch": 0.10424893381772232,
"grad_norm": 0.18002603770625134,
"learning_rate": 2.078332280480101e-06,
"loss": 0.2713,
"step": 330
},
{
"epoch": 0.10740799241825937,
"grad_norm": 0.19849866049815784,
"learning_rate": 2.1415034744156664e-06,
"loss": 0.2671,
"step": 340
},
{
"epoch": 0.1105670510187964,
"grad_norm": 0.18726127817800872,
"learning_rate": 2.2046746683512322e-06,
"loss": 0.2679,
"step": 350
},
{
"epoch": 0.11372610961933344,
"grad_norm": 0.18164666420060355,
"learning_rate": 2.2678458622867976e-06,
"loss": 0.2696,
"step": 360
},
{
"epoch": 0.11688516821987048,
"grad_norm": 0.188419554987382,
"learning_rate": 2.331017056222363e-06,
"loss": 0.2657,
"step": 370
},
{
"epoch": 0.12004422682040752,
"grad_norm": 0.21408765943095384,
"learning_rate": 2.394188250157928e-06,
"loss": 0.2631,
"step": 380
},
{
"epoch": 0.12320328542094455,
"grad_norm": 0.1728133517187042,
"learning_rate": 2.4573594440934936e-06,
"loss": 0.2627,
"step": 390
},
{
"epoch": 0.1263623440214816,
"grad_norm": 0.20446817453058305,
"learning_rate": 2.520530638029059e-06,
"loss": 0.2633,
"step": 400
},
{
"epoch": 0.12952140262201864,
"grad_norm": 0.2867541071966156,
"learning_rate": 2.5837018319646247e-06,
"loss": 0.2606,
"step": 410
},
{
"epoch": 0.13268046122255567,
"grad_norm": 0.19615186721360886,
"learning_rate": 2.6468730259001897e-06,
"loss": 0.2589,
"step": 420
},
{
"epoch": 0.13583951982309272,
"grad_norm": 0.17250843271916932,
"learning_rate": 2.710044219835755e-06,
"loss": 0.2578,
"step": 430
},
{
"epoch": 0.13899857842362975,
"grad_norm": 0.21061094083012397,
"learning_rate": 2.7732154137713208e-06,
"loss": 0.256,
"step": 440
},
{
"epoch": 0.1421576370241668,
"grad_norm": 0.16796761706106636,
"learning_rate": 2.8363866077068857e-06,
"loss": 0.254,
"step": 450
},
{
"epoch": 0.14531669562470384,
"grad_norm": 0.20087509437202522,
"learning_rate": 2.8995578016424515e-06,
"loss": 0.2554,
"step": 460
},
{
"epoch": 0.14847575422524087,
"grad_norm": 0.20431208429215614,
"learning_rate": 2.9627289955780164e-06,
"loss": 0.2524,
"step": 470
},
{
"epoch": 0.15163481282577793,
"grad_norm": 0.20436652455880414,
"learning_rate": 3.025900189513582e-06,
"loss": 0.2557,
"step": 480
},
{
"epoch": 0.15479387142631496,
"grad_norm": 0.19923295760265683,
"learning_rate": 3.0890713834491475e-06,
"loss": 0.252,
"step": 490
},
{
"epoch": 0.15795293002685198,
"grad_norm": 0.1925405706241213,
"learning_rate": 3.1522425773847125e-06,
"loss": 0.2505,
"step": 500
},
{
"epoch": 0.16111198862738904,
"grad_norm": 0.20206673704458114,
"learning_rate": 3.2154137713202782e-06,
"loss": 0.2501,
"step": 510
},
{
"epoch": 0.16427104722792607,
"grad_norm": 0.24333700245600473,
"learning_rate": 3.278584965255844e-06,
"loss": 0.2464,
"step": 520
},
{
"epoch": 0.16743010582846313,
"grad_norm": 0.18205244482695793,
"learning_rate": 3.341756159191409e-06,
"loss": 0.2485,
"step": 530
},
{
"epoch": 0.17058916442900016,
"grad_norm": 0.18264545298219137,
"learning_rate": 3.4049273531269743e-06,
"loss": 0.2456,
"step": 540
},
{
"epoch": 0.17374822302953719,
"grad_norm": 0.17873652388985004,
"learning_rate": 3.46809854706254e-06,
"loss": 0.248,
"step": 550
},
{
"epoch": 0.17690728163007424,
"grad_norm": 0.17062239190578055,
"learning_rate": 3.531269740998105e-06,
"loss": 0.2449,
"step": 560
},
{
"epoch": 0.18006634023061127,
"grad_norm": 0.19143037375033525,
"learning_rate": 3.5944409349336708e-06,
"loss": 0.2464,
"step": 570
},
{
"epoch": 0.18322539883114833,
"grad_norm": 0.1861722136519532,
"learning_rate": 3.6576121288692357e-06,
"loss": 0.2489,
"step": 580
},
{
"epoch": 0.18638445743168536,
"grad_norm": 0.1680077751454427,
"learning_rate": 3.7207833228048014e-06,
"loss": 0.2433,
"step": 590
},
{
"epoch": 0.1895435160322224,
"grad_norm": 0.1827874982767101,
"learning_rate": 3.783954516740367e-06,
"loss": 0.2414,
"step": 600
},
{
"epoch": 0.19270257463275944,
"grad_norm": 0.17739754432609572,
"learning_rate": 3.847125710675932e-06,
"loss": 0.2439,
"step": 610
},
{
"epoch": 0.19586163323329647,
"grad_norm": 0.18166974193276042,
"learning_rate": 3.910296904611497e-06,
"loss": 0.2427,
"step": 620
},
{
"epoch": 0.19902069183383353,
"grad_norm": 0.23535305540149423,
"learning_rate": 3.973468098547063e-06,
"loss": 0.2397,
"step": 630
},
{
"epoch": 0.20217975043437056,
"grad_norm": 0.17950832267537836,
"learning_rate": 4.036639292482628e-06,
"loss": 0.2413,
"step": 640
},
{
"epoch": 0.2053388090349076,
"grad_norm": 0.1871436271310335,
"learning_rate": 4.099810486418194e-06,
"loss": 0.2392,
"step": 650
},
{
"epoch": 0.20849786763544464,
"grad_norm": 0.18940758631882895,
"learning_rate": 4.162981680353759e-06,
"loss": 0.239,
"step": 660
},
{
"epoch": 0.21165692623598167,
"grad_norm": 0.18679899786034626,
"learning_rate": 4.226152874289325e-06,
"loss": 0.2394,
"step": 670
},
{
"epoch": 0.21481598483651873,
"grad_norm": 0.2005980889421409,
"learning_rate": 4.28932406822489e-06,
"loss": 0.2369,
"step": 680
},
{
"epoch": 0.21797504343705576,
"grad_norm": 0.18654971719873092,
"learning_rate": 4.3524952621604545e-06,
"loss": 0.2372,
"step": 690
},
{
"epoch": 0.2211341020375928,
"grad_norm": 0.19896341390367112,
"learning_rate": 4.415666456096021e-06,
"loss": 0.2384,
"step": 700
},
{
"epoch": 0.22429316063812985,
"grad_norm": 0.1824518487303919,
"learning_rate": 4.478837650031586e-06,
"loss": 0.2322,
"step": 710
},
{
"epoch": 0.22745221923866688,
"grad_norm": 0.19030275242513275,
"learning_rate": 4.542008843967151e-06,
"loss": 0.2373,
"step": 720
},
{
"epoch": 0.2306112778392039,
"grad_norm": 0.18760298890701887,
"learning_rate": 4.605180037902717e-06,
"loss": 0.2345,
"step": 730
},
{
"epoch": 0.23377033643974096,
"grad_norm": 0.15845613302953646,
"learning_rate": 4.668351231838282e-06,
"loss": 0.2343,
"step": 740
},
{
"epoch": 0.236929395040278,
"grad_norm": 0.1710181463737659,
"learning_rate": 4.7315224257738475e-06,
"loss": 0.2318,
"step": 750
},
{
"epoch": 0.24008845364081505,
"grad_norm": 0.23880301915969937,
"learning_rate": 4.794693619709413e-06,
"loss": 0.2339,
"step": 760
},
{
"epoch": 0.24324751224135208,
"grad_norm": 0.1678386308387805,
"learning_rate": 4.857864813644978e-06,
"loss": 0.2306,
"step": 770
},
{
"epoch": 0.2464065708418891,
"grad_norm": 0.1671346172529239,
"learning_rate": 4.9210360075805435e-06,
"loss": 0.2333,
"step": 780
},
{
"epoch": 0.24956562944242616,
"grad_norm": 0.19944596405981949,
"learning_rate": 4.984207201516109e-06,
"loss": 0.2313,
"step": 790
},
{
"epoch": 0.2527246880429632,
"grad_norm": 0.18321786541799437,
"learning_rate": 5.047378395451674e-06,
"loss": 0.2321,
"step": 800
},
{
"epoch": 0.2558837466435002,
"grad_norm": 0.16781254221044775,
"learning_rate": 5.11054958938724e-06,
"loss": 0.2313,
"step": 810
},
{
"epoch": 0.2590428052440373,
"grad_norm": 0.17492747555934146,
"learning_rate": 5.173720783322806e-06,
"loss": 0.2312,
"step": 820
},
{
"epoch": 0.26220186384457433,
"grad_norm": 0.17965819326676355,
"learning_rate": 5.23689197725837e-06,
"loss": 0.2285,
"step": 830
},
{
"epoch": 0.26536092244511134,
"grad_norm": 0.16804749386309387,
"learning_rate": 5.3000631711939365e-06,
"loss": 0.2282,
"step": 840
},
{
"epoch": 0.2685199810456484,
"grad_norm": 0.1723473524041935,
"learning_rate": 5.363234365129502e-06,
"loss": 0.2296,
"step": 850
},
{
"epoch": 0.27167903964618545,
"grad_norm": 0.20773827836779976,
"learning_rate": 5.426405559065066e-06,
"loss": 0.2273,
"step": 860
},
{
"epoch": 0.27483809824672245,
"grad_norm": 0.1641196545454829,
"learning_rate": 5.489576753000632e-06,
"loss": 0.2301,
"step": 870
},
{
"epoch": 0.2779971568472595,
"grad_norm": 0.19323437206655858,
"learning_rate": 5.552747946936198e-06,
"loss": 0.2249,
"step": 880
},
{
"epoch": 0.28115621544779656,
"grad_norm": 0.18267311407058162,
"learning_rate": 5.615919140871763e-06,
"loss": 0.227,
"step": 890
},
{
"epoch": 0.2843152740483336,
"grad_norm": 0.19801209807316134,
"learning_rate": 5.679090334807328e-06,
"loss": 0.2273,
"step": 900
},
{
"epoch": 0.2874743326488706,
"grad_norm": 0.17413734456766244,
"learning_rate": 5.742261528742894e-06,
"loss": 0.2267,
"step": 910
},
{
"epoch": 0.2906333912494077,
"grad_norm": 0.21136423644297928,
"learning_rate": 5.805432722678459e-06,
"loss": 0.2263,
"step": 920
},
{
"epoch": 0.29379244984994474,
"grad_norm": 0.20233679461777548,
"learning_rate": 5.868603916614024e-06,
"loss": 0.2263,
"step": 930
},
{
"epoch": 0.29695150845048174,
"grad_norm": 0.19521629273254087,
"learning_rate": 5.93177511054959e-06,
"loss": 0.2269,
"step": 940
},
{
"epoch": 0.3001105670510188,
"grad_norm": 0.1696883052949813,
"learning_rate": 5.994946304485155e-06,
"loss": 0.2265,
"step": 950
},
{
"epoch": 0.30326962565155585,
"grad_norm": 0.18758477411326932,
"learning_rate": 6.058117498420721e-06,
"loss": 0.2247,
"step": 960
},
{
"epoch": 0.30642868425209285,
"grad_norm": 0.1897879092670083,
"learning_rate": 6.121288692356287e-06,
"loss": 0.2282,
"step": 970
},
{
"epoch": 0.3095877428526299,
"grad_norm": 0.18304573332433055,
"learning_rate": 6.184459886291851e-06,
"loss": 0.2223,
"step": 980
},
{
"epoch": 0.31274680145316697,
"grad_norm": 0.1782561802917822,
"learning_rate": 6.247631080227417e-06,
"loss": 0.224,
"step": 990
},
{
"epoch": 0.31590586005370397,
"grad_norm": 0.20741862373031386,
"learning_rate": 6.310802274162983e-06,
"loss": 0.2219,
"step": 1000
},
{
"epoch": 0.319064918654241,
"grad_norm": 0.1539256973132976,
"learning_rate": 6.373973468098547e-06,
"loss": 0.2207,
"step": 1010
},
{
"epoch": 0.3222239772547781,
"grad_norm": 0.16875500034138524,
"learning_rate": 6.437144662034113e-06,
"loss": 0.2225,
"step": 1020
},
{
"epoch": 0.32538303585531514,
"grad_norm": 0.17168781652376466,
"learning_rate": 6.500315855969679e-06,
"loss": 0.22,
"step": 1030
},
{
"epoch": 0.32854209445585214,
"grad_norm": 0.18011584935641983,
"learning_rate": 6.563487049905244e-06,
"loss": 0.2216,
"step": 1040
},
{
"epoch": 0.3317011530563892,
"grad_norm": 0.19414711454837585,
"learning_rate": 6.626658243840809e-06,
"loss": 0.2249,
"step": 1050
},
{
"epoch": 0.33486021165692625,
"grad_norm": 0.2103163638452648,
"learning_rate": 6.689829437776375e-06,
"loss": 0.2242,
"step": 1060
},
{
"epoch": 0.33801927025746326,
"grad_norm": 0.17555158548112104,
"learning_rate": 6.75300063171194e-06,
"loss": 0.2206,
"step": 1070
},
{
"epoch": 0.3411783288580003,
"grad_norm": 0.16163792223112092,
"learning_rate": 6.816171825647505e-06,
"loss": 0.221,
"step": 1080
},
{
"epoch": 0.34433738745853737,
"grad_norm": 0.17047037019662084,
"learning_rate": 6.87934301958307e-06,
"loss": 0.2222,
"step": 1090
},
{
"epoch": 0.34749644605907437,
"grad_norm": 0.18118959670707843,
"learning_rate": 6.942514213518636e-06,
"loss": 0.226,
"step": 1100
},
{
"epoch": 0.35065550465961143,
"grad_norm": 0.18596851555170507,
"learning_rate": 7.005685407454202e-06,
"loss": 0.2208,
"step": 1110
},
{
"epoch": 0.3538145632601485,
"grad_norm": 0.17540680145260182,
"learning_rate": 7.068856601389766e-06,
"loss": 0.2228,
"step": 1120
},
{
"epoch": 0.35697362186068554,
"grad_norm": 0.16362413537591447,
"learning_rate": 7.1320277953253324e-06,
"loss": 0.2199,
"step": 1130
},
{
"epoch": 0.36013268046122254,
"grad_norm": 0.16388409560788866,
"learning_rate": 7.195198989260898e-06,
"loss": 0.2183,
"step": 1140
},
{
"epoch": 0.3632917390617596,
"grad_norm": 0.16780920971825364,
"learning_rate": 7.258370183196462e-06,
"loss": 0.2223,
"step": 1150
},
{
"epoch": 0.36645079766229666,
"grad_norm": 0.16629698944531449,
"learning_rate": 7.3215413771320285e-06,
"loss": 0.2184,
"step": 1160
},
{
"epoch": 0.36960985626283366,
"grad_norm": 0.1760306249090938,
"learning_rate": 7.384712571067594e-06,
"loss": 0.2219,
"step": 1170
},
{
"epoch": 0.3727689148633707,
"grad_norm": 0.1790776173967007,
"learning_rate": 7.447883765003159e-06,
"loss": 0.2198,
"step": 1180
},
{
"epoch": 0.37592797346390777,
"grad_norm": 0.1857393797937426,
"learning_rate": 7.5110549589387245e-06,
"loss": 0.2183,
"step": 1190
},
{
"epoch": 0.3790870320644448,
"grad_norm": 0.18072716423697788,
"learning_rate": 7.57422615287429e-06,
"loss": 0.2198,
"step": 1200
},
{
"epoch": 0.38224609066498183,
"grad_norm": 0.1627749965589205,
"learning_rate": 7.637397346809855e-06,
"loss": 0.2204,
"step": 1210
},
{
"epoch": 0.3854051492655189,
"grad_norm": 0.23081430880637033,
"learning_rate": 7.700568540745421e-06,
"loss": 0.2179,
"step": 1220
},
{
"epoch": 0.3885642078660559,
"grad_norm": 0.14668683659878062,
"learning_rate": 7.763739734680986e-06,
"loss": 0.218,
"step": 1230
},
{
"epoch": 0.39172326646659295,
"grad_norm": 0.17314703270798587,
"learning_rate": 7.82691092861655e-06,
"loss": 0.2172,
"step": 1240
},
{
"epoch": 0.39488232506713,
"grad_norm": 0.1587830451358659,
"learning_rate": 7.890082122552117e-06,
"loss": 0.2183,
"step": 1250
},
{
"epoch": 0.39804138366766706,
"grad_norm": 0.1581230238900689,
"learning_rate": 7.953253316487683e-06,
"loss": 0.2157,
"step": 1260
},
{
"epoch": 0.40120044226820406,
"grad_norm": 0.15808321097279437,
"learning_rate": 8.016424510423247e-06,
"loss": 0.2152,
"step": 1270
},
{
"epoch": 0.4043595008687411,
"grad_norm": 0.17727435096583632,
"learning_rate": 8.079595704358814e-06,
"loss": 0.2169,
"step": 1280
},
{
"epoch": 0.4075185594692782,
"grad_norm": 0.16825167940141256,
"learning_rate": 8.142766898294378e-06,
"loss": 0.2143,
"step": 1290
},
{
"epoch": 0.4106776180698152,
"grad_norm": 0.1623055799684783,
"learning_rate": 8.205938092229944e-06,
"loss": 0.214,
"step": 1300
},
{
"epoch": 0.41383667667035223,
"grad_norm": 0.16866092597564897,
"learning_rate": 8.269109286165509e-06,
"loss": 0.2189,
"step": 1310
},
{
"epoch": 0.4169957352708893,
"grad_norm": 0.15469945071292018,
"learning_rate": 8.332280480101075e-06,
"loss": 0.2147,
"step": 1320
},
{
"epoch": 0.4201547938714263,
"grad_norm": 0.15496524048072485,
"learning_rate": 8.39545167403664e-06,
"loss": 0.2155,
"step": 1330
},
{
"epoch": 0.42331385247196335,
"grad_norm": 0.1607478129687131,
"learning_rate": 8.458622867972206e-06,
"loss": 0.2142,
"step": 1340
},
{
"epoch": 0.4264729110725004,
"grad_norm": 0.16297866728767108,
"learning_rate": 8.521794061907772e-06,
"loss": 0.2148,
"step": 1350
},
{
"epoch": 0.42963196967303746,
"grad_norm": 0.1549484893694436,
"learning_rate": 8.584965255843336e-06,
"loss": 0.2162,
"step": 1360
},
{
"epoch": 0.43279102827357446,
"grad_norm": 0.15790917490616427,
"learning_rate": 8.6481364497789e-06,
"loss": 0.2142,
"step": 1370
},
{
"epoch": 0.4359500868741115,
"grad_norm": 0.17471621371832,
"learning_rate": 8.711307643714467e-06,
"loss": 0.215,
"step": 1380
},
{
"epoch": 0.4391091454746486,
"grad_norm": 0.1704061630987402,
"learning_rate": 8.774478837650032e-06,
"loss": 0.2145,
"step": 1390
},
{
"epoch": 0.4422682040751856,
"grad_norm": 0.17423241802858616,
"learning_rate": 8.837650031585598e-06,
"loss": 0.2132,
"step": 1400
},
{
"epoch": 0.44542726267572263,
"grad_norm": 0.16758619433784536,
"learning_rate": 8.900821225521164e-06,
"loss": 0.2138,
"step": 1410
},
{
"epoch": 0.4485863212762597,
"grad_norm": 0.17999186900204928,
"learning_rate": 8.963992419456728e-06,
"loss": 0.2127,
"step": 1420
},
{
"epoch": 0.4517453798767967,
"grad_norm": 0.17065420980005516,
"learning_rate": 9.027163613392293e-06,
"loss": 0.2138,
"step": 1430
},
{
"epoch": 0.45490443847733375,
"grad_norm": 0.1974703018692422,
"learning_rate": 9.090334807327859e-06,
"loss": 0.2127,
"step": 1440
},
{
"epoch": 0.4580634970778708,
"grad_norm": 0.20057492546176425,
"learning_rate": 9.153506001263425e-06,
"loss": 0.2141,
"step": 1450
},
{
"epoch": 0.4612225556784078,
"grad_norm": 0.17543621309019505,
"learning_rate": 9.21667719519899e-06,
"loss": 0.2109,
"step": 1460
},
{
"epoch": 0.46438161427894487,
"grad_norm": 0.1693436152342155,
"learning_rate": 9.279848389134556e-06,
"loss": 0.2138,
"step": 1470
},
{
"epoch": 0.4675406728794819,
"grad_norm": 0.18371499893258605,
"learning_rate": 9.34301958307012e-06,
"loss": 0.2121,
"step": 1480
},
{
"epoch": 0.470699731480019,
"grad_norm": 0.18302132069209362,
"learning_rate": 9.406190777005687e-06,
"loss": 0.2125,
"step": 1490
},
{
"epoch": 0.473858790080556,
"grad_norm": 0.16878973943654269,
"learning_rate": 9.469361970941253e-06,
"loss": 0.2096,
"step": 1500
},
{
"epoch": 0.47701784868109304,
"grad_norm": 0.15672142571721928,
"learning_rate": 9.532533164876817e-06,
"loss": 0.2157,
"step": 1510
},
{
"epoch": 0.4801769072816301,
"grad_norm": 0.17793569853299288,
"learning_rate": 9.595704358812382e-06,
"loss": 0.2116,
"step": 1520
},
{
"epoch": 0.4833359658821671,
"grad_norm": 0.1711221799529836,
"learning_rate": 9.658875552747946e-06,
"loss": 0.2086,
"step": 1530
},
{
"epoch": 0.48649502448270415,
"grad_norm": 0.15221905413795137,
"learning_rate": 9.722046746683513e-06,
"loss": 0.2079,
"step": 1540
},
{
"epoch": 0.4896540830832412,
"grad_norm": 0.1706269772815951,
"learning_rate": 9.785217940619079e-06,
"loss": 0.2124,
"step": 1550
},
{
"epoch": 0.4928131416837782,
"grad_norm": 0.17041271545684786,
"learning_rate": 9.848389134554643e-06,
"loss": 0.2105,
"step": 1560
},
{
"epoch": 0.49597220028431527,
"grad_norm": 0.1707130122866249,
"learning_rate": 9.91156032849021e-06,
"loss": 0.2094,
"step": 1570
},
{
"epoch": 0.4991312588848523,
"grad_norm": 0.1533585694871482,
"learning_rate": 9.974731522425774e-06,
"loss": 0.2125,
"step": 1580
},
{
"epoch": 0.5022903174853893,
"grad_norm": 0.1445021677961463,
"learning_rate": 9.99999562381833e-06,
"loss": 0.2104,
"step": 1590
},
{
"epoch": 0.5054493760859264,
"grad_norm": 0.14451068806922954,
"learning_rate": 9.999968880513634e-06,
"loss": 0.2115,
"step": 1600
},
{
"epoch": 0.5086084346864634,
"grad_norm": 0.1711007683006565,
"learning_rate": 9.99991782524616e-06,
"loss": 0.2119,
"step": 1610
},
{
"epoch": 0.5117674932870004,
"grad_norm": 0.14862996999525604,
"learning_rate": 9.999842458264166e-06,
"loss": 0.2091,
"step": 1620
},
{
"epoch": 0.5149265518875376,
"grad_norm": 0.16833296434447828,
"learning_rate": 9.999742779934113e-06,
"loss": 0.2089,
"step": 1630
},
{
"epoch": 0.5180856104880746,
"grad_norm": 0.16624207360408486,
"learning_rate": 9.999618790740677e-06,
"loss": 0.2076,
"step": 1640
},
{
"epoch": 0.5212446690886116,
"grad_norm": 0.18734837267014448,
"learning_rate": 9.99947049128675e-06,
"loss": 0.2093,
"step": 1650
},
{
"epoch": 0.5244037276891487,
"grad_norm": 0.17593769602265188,
"learning_rate": 9.999297882293429e-06,
"loss": 0.2104,
"step": 1660
},
{
"epoch": 0.5275627862896857,
"grad_norm": 0.1534253401122401,
"learning_rate": 9.999100964600006e-06,
"loss": 0.2094,
"step": 1670
},
{
"epoch": 0.5307218448902227,
"grad_norm": 0.1488174937468484,
"learning_rate": 9.998879739163982e-06,
"loss": 0.2087,
"step": 1680
},
{
"epoch": 0.5338809034907598,
"grad_norm": 0.13829240161020598,
"learning_rate": 9.998634207061047e-06,
"loss": 0.2083,
"step": 1690
},
{
"epoch": 0.5370399620912968,
"grad_norm": 0.15837901482606578,
"learning_rate": 9.998364369485083e-06,
"loss": 0.2065,
"step": 1700
},
{
"epoch": 0.5401990206918338,
"grad_norm": 0.14373686188939075,
"learning_rate": 9.998070227748153e-06,
"loss": 0.2077,
"step": 1710
},
{
"epoch": 0.5433580792923709,
"grad_norm": 0.13856399191761934,
"learning_rate": 9.9977517832805e-06,
"loss": 0.2074,
"step": 1720
},
{
"epoch": 0.5465171378929079,
"grad_norm": 0.16428912206666343,
"learning_rate": 9.997409037630533e-06,
"loss": 0.2072,
"step": 1730
},
{
"epoch": 0.5496761964934449,
"grad_norm": 0.14598794955578479,
"learning_rate": 9.997041992464828e-06,
"loss": 0.207,
"step": 1740
},
{
"epoch": 0.552835255093982,
"grad_norm": 0.1544681862373563,
"learning_rate": 9.996650649568116e-06,
"loss": 0.2067,
"step": 1750
},
{
"epoch": 0.555994313694519,
"grad_norm": 0.16063147300592975,
"learning_rate": 9.996235010843269e-06,
"loss": 0.2091,
"step": 1760
},
{
"epoch": 0.5591533722950561,
"grad_norm": 0.16009334025881428,
"learning_rate": 9.9957950783113e-06,
"loss": 0.2068,
"step": 1770
},
{
"epoch": 0.5623124308955931,
"grad_norm": 0.1473832672273453,
"learning_rate": 9.995330854111342e-06,
"loss": 0.2072,
"step": 1780
},
{
"epoch": 0.5654714894961301,
"grad_norm": 0.1758179524267874,
"learning_rate": 9.994842340500654e-06,
"loss": 0.2051,
"step": 1790
},
{
"epoch": 0.5686305480966672,
"grad_norm": 0.160062543193855,
"learning_rate": 9.994329539854597e-06,
"loss": 0.2023,
"step": 1800
},
{
"epoch": 0.5717896066972042,
"grad_norm": 0.14481223083195094,
"learning_rate": 9.993792454666622e-06,
"loss": 0.2049,
"step": 1810
},
{
"epoch": 0.5749486652977412,
"grad_norm": 0.17516638701269122,
"learning_rate": 9.993231087548263e-06,
"loss": 0.2056,
"step": 1820
},
{
"epoch": 0.5781077238982784,
"grad_norm": 0.15577929587795278,
"learning_rate": 9.992645441229128e-06,
"loss": 0.2053,
"step": 1830
},
{
"epoch": 0.5812667824988154,
"grad_norm": 0.16063695498724154,
"learning_rate": 9.992035518556873e-06,
"loss": 0.2032,
"step": 1840
},
{
"epoch": 0.5844258410993524,
"grad_norm": 0.13465469823943357,
"learning_rate": 9.991401322497202e-06,
"loss": 0.2078,
"step": 1850
},
{
"epoch": 0.5875848996998895,
"grad_norm": 0.14458781318924407,
"learning_rate": 9.990742856133844e-06,
"loss": 0.2075,
"step": 1860
},
{
"epoch": 0.5907439583004265,
"grad_norm": 0.15458252884662305,
"learning_rate": 9.990060122668543e-06,
"loss": 0.2058,
"step": 1870
},
{
"epoch": 0.5939030169009635,
"grad_norm": 0.13378310324370862,
"learning_rate": 9.989353125421034e-06,
"loss": 0.2077,
"step": 1880
},
{
"epoch": 0.5970620755015006,
"grad_norm": 0.14092827320728965,
"learning_rate": 9.98862186782904e-06,
"loss": 0.205,
"step": 1890
},
{
"epoch": 0.6002211341020376,
"grad_norm": 0.14528693075899177,
"learning_rate": 9.987866353448241e-06,
"loss": 0.2056,
"step": 1900
},
{
"epoch": 0.6033801927025746,
"grad_norm": 0.16835849378700468,
"learning_rate": 9.987086585952271e-06,
"loss": 0.202,
"step": 1910
},
{
"epoch": 0.6065392513031117,
"grad_norm": 0.14702108474072842,
"learning_rate": 9.986282569132688e-06,
"loss": 0.2046,
"step": 1920
},
{
"epoch": 0.6096983099036487,
"grad_norm": 0.15598954366138304,
"learning_rate": 9.98545430689896e-06,
"loss": 0.2037,
"step": 1930
},
{
"epoch": 0.6128573685041857,
"grad_norm": 0.15363339652231678,
"learning_rate": 9.984601803278451e-06,
"loss": 0.2065,
"step": 1940
},
{
"epoch": 0.6160164271047228,
"grad_norm": 0.15079303342445485,
"learning_rate": 9.983725062416392e-06,
"loss": 0.2046,
"step": 1950
},
{
"epoch": 0.6191754857052598,
"grad_norm": 0.13780206734265157,
"learning_rate": 9.98282408857587e-06,
"loss": 0.2054,
"step": 1960
},
{
"epoch": 0.6223345443057968,
"grad_norm": 0.1355920930705493,
"learning_rate": 9.981898886137795e-06,
"loss": 0.2039,
"step": 1970
},
{
"epoch": 0.6254936029063339,
"grad_norm": 0.14054778625440462,
"learning_rate": 9.980949459600899e-06,
"loss": 0.2045,
"step": 1980
},
{
"epoch": 0.6286526615068709,
"grad_norm": 0.16259862482128506,
"learning_rate": 9.979975813581694e-06,
"loss": 0.2033,
"step": 1990
},
{
"epoch": 0.6318117201074079,
"grad_norm": 0.16320641790440754,
"learning_rate": 9.978977952814456e-06,
"loss": 0.2053,
"step": 2000
},
{
"epoch": 0.634970778707945,
"grad_norm": 0.14504338218528204,
"learning_rate": 9.97795588215121e-06,
"loss": 0.2041,
"step": 2010
},
{
"epoch": 0.638129837308482,
"grad_norm": 0.14088062087038478,
"learning_rate": 9.97690960656169e-06,
"loss": 0.2034,
"step": 2020
},
{
"epoch": 0.6412888959090192,
"grad_norm": 0.16368648316215648,
"learning_rate": 9.975839131133335e-06,
"loss": 0.2004,
"step": 2030
},
{
"epoch": 0.6444479545095562,
"grad_norm": 0.14137074446596326,
"learning_rate": 9.974744461071246e-06,
"loss": 0.2039,
"step": 2040
},
{
"epoch": 0.6476070131100932,
"grad_norm": 0.16743385196077595,
"learning_rate": 9.973625601698176e-06,
"loss": 0.2024,
"step": 2050
},
{
"epoch": 0.6507660717106303,
"grad_norm": 0.1420212347341941,
"learning_rate": 9.972482558454488e-06,
"loss": 0.2006,
"step": 2060
},
{
"epoch": 0.6539251303111673,
"grad_norm": 0.1541147190019739,
"learning_rate": 9.971315336898144e-06,
"loss": 0.2031,
"step": 2070
},
{
"epoch": 0.6570841889117043,
"grad_norm": 0.14954970584415023,
"learning_rate": 9.970123942704667e-06,
"loss": 0.2022,
"step": 2080
},
{
"epoch": 0.6602432475122414,
"grad_norm": 0.13835942774621643,
"learning_rate": 9.968908381667122e-06,
"loss": 0.205,
"step": 2090
},
{
"epoch": 0.6634023061127784,
"grad_norm": 0.1444063211993615,
"learning_rate": 9.967668659696077e-06,
"loss": 0.2003,
"step": 2100
},
{
"epoch": 0.6665613647133154,
"grad_norm": 0.15181588903023388,
"learning_rate": 9.966404782819587e-06,
"loss": 0.2041,
"step": 2110
},
{
"epoch": 0.6697204233138525,
"grad_norm": 0.14453828048169265,
"learning_rate": 9.965116757183156e-06,
"loss": 0.2008,
"step": 2120
},
{
"epoch": 0.6728794819143895,
"grad_norm": 0.16433983585515474,
"learning_rate": 9.963804589049709e-06,
"loss": 0.2045,
"step": 2130
},
{
"epoch": 0.6760385405149265,
"grad_norm": 0.14280102041208004,
"learning_rate": 9.962468284799559e-06,
"loss": 0.2021,
"step": 2140
},
{
"epoch": 0.6791975991154636,
"grad_norm": 0.14204139100462726,
"learning_rate": 9.961107850930386e-06,
"loss": 0.201,
"step": 2150
},
{
"epoch": 0.6823566577160006,
"grad_norm": 0.1324076473779632,
"learning_rate": 9.959723294057195e-06,
"loss": 0.1991,
"step": 2160
},
{
"epoch": 0.6855157163165376,
"grad_norm": 0.14295162932415698,
"learning_rate": 9.958314620912283e-06,
"loss": 0.2025,
"step": 2170
},
{
"epoch": 0.6886747749170747,
"grad_norm": 0.15726554545849142,
"learning_rate": 9.956881838345221e-06,
"loss": 0.2033,
"step": 2180
},
{
"epoch": 0.6918338335176117,
"grad_norm": 0.16250545787100992,
"learning_rate": 9.955424953322797e-06,
"loss": 0.2015,
"step": 2190
},
{
"epoch": 0.6949928921181487,
"grad_norm": 0.1614266022365173,
"learning_rate": 9.953943972929003e-06,
"loss": 0.1996,
"step": 2200
},
{
"epoch": 0.6981519507186859,
"grad_norm": 0.15900062576977386,
"learning_rate": 9.952438904364996e-06,
"loss": 0.2026,
"step": 2210
},
{
"epoch": 0.7013110093192229,
"grad_norm": 0.13873539419869083,
"learning_rate": 9.950909754949052e-06,
"loss": 0.2035,
"step": 2220
},
{
"epoch": 0.7044700679197599,
"grad_norm": 0.16329226810226954,
"learning_rate": 9.949356532116546e-06,
"loss": 0.2017,
"step": 2230
},
{
"epoch": 0.707629126520297,
"grad_norm": 0.130433030569765,
"learning_rate": 9.947779243419899e-06,
"loss": 0.2017,
"step": 2240
},
{
"epoch": 0.710788185120834,
"grad_norm": 0.13992897215148528,
"learning_rate": 9.946177896528557e-06,
"loss": 0.2001,
"step": 2250
},
{
"epoch": 0.7139472437213711,
"grad_norm": 0.14070910045992718,
"learning_rate": 9.944552499228947e-06,
"loss": 0.2007,
"step": 2260
},
{
"epoch": 0.7171063023219081,
"grad_norm": 0.1469150772379642,
"learning_rate": 9.942903059424441e-06,
"loss": 0.2006,
"step": 2270
},
{
"epoch": 0.7202653609224451,
"grad_norm": 0.18224682305919618,
"learning_rate": 9.941229585135307e-06,
"loss": 0.2028,
"step": 2280
},
{
"epoch": 0.7234244195229822,
"grad_norm": 0.14288906489417755,
"learning_rate": 9.939532084498685e-06,
"loss": 0.1987,
"step": 2290
},
{
"epoch": 0.7265834781235192,
"grad_norm": 0.13437271496120856,
"learning_rate": 9.937810565768544e-06,
"loss": 0.1993,
"step": 2300
},
{
"epoch": 0.7297425367240562,
"grad_norm": 0.1446887097806904,
"learning_rate": 9.936065037315636e-06,
"loss": 0.2011,
"step": 2310
},
{
"epoch": 0.7329015953245933,
"grad_norm": 0.13251427513003186,
"learning_rate": 9.934295507627456e-06,
"loss": 0.2022,
"step": 2320
},
{
"epoch": 0.7360606539251303,
"grad_norm": 0.1452288699010562,
"learning_rate": 9.932501985308206e-06,
"loss": 0.2009,
"step": 2330
},
{
"epoch": 0.7392197125256673,
"grad_norm": 0.14728963865115374,
"learning_rate": 9.93068447907875e-06,
"loss": 0.1987,
"step": 2340
},
{
"epoch": 0.7423787711262044,
"grad_norm": 0.1517245487863863,
"learning_rate": 9.928842997776574e-06,
"loss": 0.2013,
"step": 2350
},
{
"epoch": 0.7455378297267414,
"grad_norm": 0.1614866572575232,
"learning_rate": 9.926977550355734e-06,
"loss": 0.1997,
"step": 2360
},
{
"epoch": 0.7486968883272784,
"grad_norm": 0.14031629423175507,
"learning_rate": 9.92508814588683e-06,
"loss": 0.199,
"step": 2370
},
{
"epoch": 0.7518559469278155,
"grad_norm": 0.14118485538481557,
"learning_rate": 9.92317479355694e-06,
"loss": 0.1976,
"step": 2380
},
{
"epoch": 0.7550150055283525,
"grad_norm": 0.13181048592845238,
"learning_rate": 9.921237502669595e-06,
"loss": 0.198,
"step": 2390
},
{
"epoch": 0.7581740641288895,
"grad_norm": 0.1345265368505879,
"learning_rate": 9.919276282644723e-06,
"loss": 0.201,
"step": 2400
},
{
"epoch": 0.7613331227294267,
"grad_norm": 0.13720890023292417,
"learning_rate": 9.917291143018604e-06,
"loss": 0.2009,
"step": 2410
},
{
"epoch": 0.7644921813299637,
"grad_norm": 0.13845111986552353,
"learning_rate": 9.915282093443825e-06,
"loss": 0.2008,
"step": 2420
},
{
"epoch": 0.7676512399305007,
"grad_norm": 0.14617335075904797,
"learning_rate": 9.913249143689234e-06,
"loss": 0.1991,
"step": 2430
},
{
"epoch": 0.7708102985310378,
"grad_norm": 0.1320877727948845,
"learning_rate": 9.911192303639896e-06,
"loss": 0.1999,
"step": 2440
},
{
"epoch": 0.7739693571315748,
"grad_norm": 0.13482883240500468,
"learning_rate": 9.909111583297035e-06,
"loss": 0.1997,
"step": 2450
},
{
"epoch": 0.7771284157321118,
"grad_norm": 0.14200097798675781,
"learning_rate": 9.907006992777991e-06,
"loss": 0.2008,
"step": 2460
},
{
"epoch": 0.7802874743326489,
"grad_norm": 0.13733098129824253,
"learning_rate": 9.904878542316177e-06,
"loss": 0.1988,
"step": 2470
},
{
"epoch": 0.7834465329331859,
"grad_norm": 0.14967333958732693,
"learning_rate": 9.902726242261015e-06,
"loss": 0.2,
"step": 2480
},
{
"epoch": 0.786605591533723,
"grad_norm": 0.18469961665919096,
"learning_rate": 9.9005501030779e-06,
"loss": 0.1998,
"step": 2490
},
{
"epoch": 0.78976465013426,
"grad_norm": 0.1291065536177641,
"learning_rate": 9.898350135348143e-06,
"loss": 0.1994,
"step": 2500
},
{
"epoch": 0.792923708734797,
"grad_norm": 0.1455654343221393,
"learning_rate": 9.896126349768913e-06,
"loss": 0.1961,
"step": 2510
},
{
"epoch": 0.7960827673353341,
"grad_norm": 0.1305825664747534,
"learning_rate": 9.893878757153197e-06,
"loss": 0.1997,
"step": 2520
},
{
"epoch": 0.7992418259358711,
"grad_norm": 0.1355469021976556,
"learning_rate": 9.891607368429741e-06,
"loss": 0.2009,
"step": 2530
},
{
"epoch": 0.8024008845364081,
"grad_norm": 0.1393422807545934,
"learning_rate": 9.889312194642999e-06,
"loss": 0.1996,
"step": 2540
},
{
"epoch": 0.8055599431369452,
"grad_norm": 0.14137833832556562,
"learning_rate": 9.886993246953075e-06,
"loss": 0.1984,
"step": 2550
},
{
"epoch": 0.8087190017374822,
"grad_norm": 0.1288518244334966,
"learning_rate": 9.884650536635674e-06,
"loss": 0.1998,
"step": 2560
},
{
"epoch": 0.8118780603380192,
"grad_norm": 0.12540981604153706,
"learning_rate": 9.882284075082042e-06,
"loss": 0.1953,
"step": 2570
},
{
"epoch": 0.8150371189385563,
"grad_norm": 0.1298044947287737,
"learning_rate": 9.879893873798918e-06,
"loss": 0.1998,
"step": 2580
},
{
"epoch": 0.8181961775390934,
"grad_norm": 0.13593942142698026,
"learning_rate": 9.877479944408469e-06,
"loss": 0.1994,
"step": 2590
},
{
"epoch": 0.8213552361396304,
"grad_norm": 0.14577596422732375,
"learning_rate": 9.875042298648241e-06,
"loss": 0.1968,
"step": 2600
},
{
"epoch": 0.8245142947401675,
"grad_norm": 0.13651649878117303,
"learning_rate": 9.872580948371101e-06,
"loss": 0.1959,
"step": 2610
},
{
"epoch": 0.8276733533407045,
"grad_norm": 0.13458568002303536,
"learning_rate": 9.870095905545172e-06,
"loss": 0.1975,
"step": 2620
},
{
"epoch": 0.8308324119412415,
"grad_norm": 0.13467497395688513,
"learning_rate": 9.867587182253783e-06,
"loss": 0.198,
"step": 2630
},
{
"epoch": 0.8339914705417786,
"grad_norm": 0.1334106181890542,
"learning_rate": 9.86505479069541e-06,
"loss": 0.1975,
"step": 2640
},
{
"epoch": 0.8371505291423156,
"grad_norm": 0.14726119516550862,
"learning_rate": 9.862498743183606e-06,
"loss": 0.1962,
"step": 2650
},
{
"epoch": 0.8403095877428526,
"grad_norm": 0.13818332053028007,
"learning_rate": 9.85991905214696e-06,
"loss": 0.1998,
"step": 2660
},
{
"epoch": 0.8434686463433897,
"grad_norm": 0.14412901343016873,
"learning_rate": 9.85731573012902e-06,
"loss": 0.2,
"step": 2670
},
{
"epoch": 0.8466277049439267,
"grad_norm": 0.1255413662933083,
"learning_rate": 9.854688789788236e-06,
"loss": 0.198,
"step": 2680
},
{
"epoch": 0.8497867635444637,
"grad_norm": 0.13187344960522424,
"learning_rate": 9.852038243897903e-06,
"loss": 0.1972,
"step": 2690
},
{
"epoch": 0.8529458221450008,
"grad_norm": 0.13247624619465903,
"learning_rate": 9.849364105346098e-06,
"loss": 0.1982,
"step": 2700
},
{
"epoch": 0.8561048807455378,
"grad_norm": 0.12352050553226067,
"learning_rate": 9.846666387135613e-06,
"loss": 0.1954,
"step": 2710
},
{
"epoch": 0.8592639393460749,
"grad_norm": 0.13384183862518867,
"learning_rate": 9.843945102383892e-06,
"loss": 0.197,
"step": 2720
},
{
"epoch": 0.8624229979466119,
"grad_norm": 0.12329786824129346,
"learning_rate": 9.841200264322974e-06,
"loss": 0.1977,
"step": 2730
},
{
"epoch": 0.8655820565471489,
"grad_norm": 0.13477039032719176,
"learning_rate": 9.838431886299421e-06,
"loss": 0.1961,
"step": 2740
},
{
"epoch": 0.868741115147686,
"grad_norm": 0.13828179616792935,
"learning_rate": 9.83563998177426e-06,
"loss": 0.1967,
"step": 2750
},
{
"epoch": 0.871900173748223,
"grad_norm": 0.14564045162827766,
"learning_rate": 9.83282456432291e-06,
"loss": 0.1965,
"step": 2760
},
{
"epoch": 0.87505923234876,
"grad_norm": 0.13685777011127429,
"learning_rate": 9.829985647635118e-06,
"loss": 0.1981,
"step": 2770
},
{
"epoch": 0.8782182909492972,
"grad_norm": 0.13617569439938054,
"learning_rate": 9.827123245514901e-06,
"loss": 0.1951,
"step": 2780
},
{
"epoch": 0.8813773495498342,
"grad_norm": 0.12868163128280088,
"learning_rate": 9.824237371880469e-06,
"loss": 0.195,
"step": 2790
},
{
"epoch": 0.8845364081503712,
"grad_norm": 0.13965590647075304,
"learning_rate": 9.821328040764157e-06,
"loss": 0.1984,
"step": 2800
},
{
"epoch": 0.8876954667509083,
"grad_norm": 0.1191526599411457,
"learning_rate": 9.818395266312363e-06,
"loss": 0.1925,
"step": 2810
},
{
"epoch": 0.8908545253514453,
"grad_norm": 0.1209195098683138,
"learning_rate": 9.81543906278548e-06,
"loss": 0.1995,
"step": 2820
},
{
"epoch": 0.8940135839519823,
"grad_norm": 0.12466523038010362,
"learning_rate": 9.812459444557815e-06,
"loss": 0.1937,
"step": 2830
},
{
"epoch": 0.8971726425525194,
"grad_norm": 0.12952393163312614,
"learning_rate": 9.809456426117533e-06,
"loss": 0.1932,
"step": 2840
},
{
"epoch": 0.9003317011530564,
"grad_norm": 0.13392155972728179,
"learning_rate": 9.806430022066582e-06,
"loss": 0.1978,
"step": 2850
},
{
"epoch": 0.9034907597535934,
"grad_norm": 0.13186718584966667,
"learning_rate": 9.803380247120616e-06,
"loss": 0.1953,
"step": 2860
},
{
"epoch": 0.9066498183541305,
"grad_norm": 0.1283857878998356,
"learning_rate": 9.800307116108931e-06,
"loss": 0.1962,
"step": 2870
},
{
"epoch": 0.9098088769546675,
"grad_norm": 0.12302487764368193,
"learning_rate": 9.797210643974388e-06,
"loss": 0.1954,
"step": 2880
},
{
"epoch": 0.9129679355552045,
"grad_norm": 0.1279449953769118,
"learning_rate": 9.794090845773346e-06,
"loss": 0.1936,
"step": 2890
},
{
"epoch": 0.9161269941557416,
"grad_norm": 0.13723185562370793,
"learning_rate": 9.79094773667558e-06,
"loss": 0.1948,
"step": 2900
},
{
"epoch": 0.9192860527562786,
"grad_norm": 0.1382694059956154,
"learning_rate": 9.787781331964217e-06,
"loss": 0.1961,
"step": 2910
},
{
"epoch": 0.9224451113568156,
"grad_norm": 0.13522487929855218,
"learning_rate": 9.784591647035654e-06,
"loss": 0.1944,
"step": 2920
},
{
"epoch": 0.9256041699573527,
"grad_norm": 0.13619232160862846,
"learning_rate": 9.781378697399492e-06,
"loss": 0.1939,
"step": 2930
},
{
"epoch": 0.9287632285578897,
"grad_norm": 0.13966605421891545,
"learning_rate": 9.778142498678447e-06,
"loss": 0.1936,
"step": 2940
},
{
"epoch": 0.9319222871584268,
"grad_norm": 0.13610146242659704,
"learning_rate": 9.774883066608288e-06,
"loss": 0.1955,
"step": 2950
},
{
"epoch": 0.9350813457589638,
"grad_norm": 0.1272898066516385,
"learning_rate": 9.771600417037747e-06,
"loss": 0.1951,
"step": 2960
},
{
"epoch": 0.9382404043595008,
"grad_norm": 0.12577223515891656,
"learning_rate": 9.76829456592846e-06,
"loss": 0.1941,
"step": 2970
},
{
"epoch": 0.941399462960038,
"grad_norm": 0.13229230817335338,
"learning_rate": 9.76496552935487e-06,
"loss": 0.1948,
"step": 2980
},
{
"epoch": 0.944558521560575,
"grad_norm": 0.13057771902599097,
"learning_rate": 9.76161332350416e-06,
"loss": 0.1945,
"step": 2990
},
{
"epoch": 0.947717580161112,
"grad_norm": 0.1375087406497119,
"learning_rate": 9.758237964676175e-06,
"loss": 0.1946,
"step": 3000
},
{
"epoch": 0.9508766387616491,
"grad_norm": 0.13144411031384784,
"learning_rate": 9.754839469283333e-06,
"loss": 0.1916,
"step": 3010
},
{
"epoch": 0.9540356973621861,
"grad_norm": 0.1307858984474674,
"learning_rate": 9.751417853850557e-06,
"loss": 0.1961,
"step": 3020
},
{
"epoch": 0.9571947559627231,
"grad_norm": 0.12400375737914372,
"learning_rate": 9.747973135015187e-06,
"loss": 0.1948,
"step": 3030
},
{
"epoch": 0.9603538145632602,
"grad_norm": 0.14800804219149807,
"learning_rate": 9.744505329526906e-06,
"loss": 0.1951,
"step": 3040
},
{
"epoch": 0.9635128731637972,
"grad_norm": 0.13528680108076863,
"learning_rate": 9.741014454247648e-06,
"loss": 0.1946,
"step": 3050
},
{
"epoch": 0.9666719317643342,
"grad_norm": 0.15276170947974638,
"learning_rate": 9.737500526151525e-06,
"loss": 0.1935,
"step": 3060
},
{
"epoch": 0.9698309903648713,
"grad_norm": 0.1280505905081845,
"learning_rate": 9.733963562324739e-06,
"loss": 0.193,
"step": 3070
},
{
"epoch": 0.9729900489654083,
"grad_norm": 0.13045657850862527,
"learning_rate": 9.730403579965508e-06,
"loss": 0.1953,
"step": 3080
},
{
"epoch": 0.9761491075659453,
"grad_norm": 0.12920108483379814,
"learning_rate": 9.726820596383968e-06,
"loss": 0.194,
"step": 3090
},
{
"epoch": 0.9793081661664824,
"grad_norm": 0.1275121663048079,
"learning_rate": 9.723214629002103e-06,
"loss": 0.1937,
"step": 3100
},
{
"epoch": 0.9824672247670194,
"grad_norm": 0.13153937715884076,
"learning_rate": 9.719585695353648e-06,
"loss": 0.1927,
"step": 3110
},
{
"epoch": 0.9856262833675564,
"grad_norm": 0.13540077808963083,
"learning_rate": 9.715933813084012e-06,
"loss": 0.1948,
"step": 3120
},
{
"epoch": 0.9887853419680935,
"grad_norm": 0.13634113133020404,
"learning_rate": 9.712258999950196e-06,
"loss": 0.192,
"step": 3130
},
{
"epoch": 0.9919444005686305,
"grad_norm": 0.11908229587933031,
"learning_rate": 9.70856127382069e-06,
"loss": 0.1937,
"step": 3140
},
{
"epoch": 0.9951034591691675,
"grad_norm": 0.11928897371871516,
"learning_rate": 9.704840652675405e-06,
"loss": 0.1939,
"step": 3150
},
{
"epoch": 0.9982625177697046,
"grad_norm": 0.12957379845580613,
"learning_rate": 9.701097154605572e-06,
"loss": 0.1934,
"step": 3160
},
{
"epoch": 1.0015795293002685,
"grad_norm": 0.12555682464472692,
"learning_rate": 9.697330797813665e-06,
"loss": 0.2078,
"step": 3170
},
{
"epoch": 1.0047385879008055,
"grad_norm": 0.12366696963941619,
"learning_rate": 9.693541600613297e-06,
"loss": 0.1833,
"step": 3180
},
{
"epoch": 1.0078976465013425,
"grad_norm": 0.12387115361971955,
"learning_rate": 9.689729581429154e-06,
"loss": 0.184,
"step": 3190
},
{
"epoch": 1.0110567051018797,
"grad_norm": 0.12394939667356165,
"learning_rate": 9.68589475879688e-06,
"loss": 0.182,
"step": 3200
}
],
"logging_steps": 10,
"max_steps": 15830,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.177750985487155e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}